From ed35e2f2f0ded15df313ae6f8da21e85c8e1e493 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack3000@gmail.com>
Date: Tue, 21 Feb 2023 17:52:05 +0100
Subject: landlock: Clarify documentation for the LANDLOCK_ACCESS_FS_REFER
 right
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clarify the "refer" documentation by splitting up a big paragraph of
text.

- Call out specifically that the denial by default applies to ABI v1 as
  well.
- Turn the three additional constraints for link/rename operations
  into bullet points, to give it more structure.

Signed-off-by: Günther Noack <gnoack3000@gmail.com>
Link: https://lore.kernel.org/r/20230221165205.4231-1-gnoack3000@gmail.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h | 46 +++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index f3223f964691..81d09ef9aa50 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -130,21 +130,37 @@ struct landlock_path_beneath_attr {
  * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device.
  * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link.
  * - %LANDLOCK_ACCESS_FS_REFER: Link or rename a file from or to a different
- *   directory (i.e. reparent a file hierarchy).  This access right is
- *   available since the second version of the Landlock ABI.  This is also the
- *   only access right which is always considered handled by any ruleset in
- *   such a way that reparenting a file hierarchy is always denied by default.
- *   To avoid privilege escalation, it is not enough to add a rule with this
- *   access right.  When linking or renaming a file, the destination directory
- *   hierarchy must also always have the same or a superset of restrictions of
- *   the source hierarchy.  If it is not the case, or if the domain doesn't
- *   handle this access right, such actions are denied by default with errno
- *   set to ``EXDEV``.  Linking also requires a ``LANDLOCK_ACCESS_FS_MAKE_*``
- *   access right on the destination directory, and renaming also requires a
- *   ``LANDLOCK_ACCESS_FS_REMOVE_*`` access right on the source's (file or
- *   directory) parent.  Otherwise, such actions are denied with errno set to
- *   ``EACCES``.  The ``EACCES`` errno prevails over ``EXDEV`` to let user space
- *   efficiently deal with an unrecoverable error.
+ *   directory (i.e. reparent a file hierarchy).
+ *
+ *   This access right is available since the second version of the Landlock
+ *   ABI.
+ *
+ *   This is the only access right which is denied by default by any ruleset,
+ *   even if the right is not specified as handled at ruleset creation time.
+ *   The only way to make a ruleset grant this right is to explicitly allow it
+ *   for a specific directory by adding a matching rule to the ruleset.
+ *
+ *   In particular, when using the first Landlock ABI version, Landlock will
+ *   always deny attempts to reparent files between different directories.
+ *
+ *   In addition to the source and destination directories having the
+ *   %LANDLOCK_ACCESS_FS_REFER access right, the attempted link or rename
+ *   operation must meet the following constraints:
+ *
+ *   * The reparented file may not gain more access rights in the destination
+ *     directory than it previously had in the source directory.  If this is
+ *     attempted, the operation results in an ``EXDEV`` error.
+ *
+ *   * When linking or renaming, the ``LANDLOCK_ACCESS_FS_MAKE_*`` right for the
+ *     respective file type must be granted for the destination directory.
+ *     Otherwise, the operation results in an ``EACCES`` error.
+ *
+ *   * When renaming, the ``LANDLOCK_ACCESS_FS_REMOVE_*`` right for the
+ *     respective file type must be granted for the source directory.  Otherwise,
+ *     the operation results in an ``EACCES`` error.
+ *
+ *   If multiple requirements are not met, the ``EACCES`` error code takes
+ *   precedence over ``EXDEV``.
  *
  * .. warning::
  *
-- 
cgit v1.2.3


From 5d5de3a431d87ac51d43da8d796891d014975ab7 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 16 Feb 2023 10:48:21 +0800
Subject: bpf: Only allocate one bpf_mem_cache for bpf_cpumask_ma

The size of bpf_cpumask is fixed, so there is no need to allocate many
bpf_mem_caches for bpf_cpumask_ma, just one bpf_mem_cache is enough.
Also add comments for bpf_mem_alloc_init() in bpf_mem_alloc.h to prevent
future miuse.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230216024821.2202916-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h | 7 +++++++
 kernel/bpf/cpumask.c          | 6 +++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..a7104af61ab4 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -14,6 +14,13 @@ struct bpf_mem_alloc {
 	struct work_struct work;
 };
 
+/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
+ * Alloc and free are done with bpf_mem_cache_{alloc,free}().
+ *
+ * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
+ * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
+ * the returned object is given by the size argument of bpf_mem_alloc().
+ */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 52b981512a35..2b3fbbfebdc5 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -55,7 +55,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
 	/* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
 	BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
 
-	cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+	cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
 	if (!cpumask)
 		return NULL;
 
@@ -123,7 +123,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 
 	if (refcount_dec_and_test(&cpumask->usage)) {
 		migrate_disable();
-		bpf_mem_free(&bpf_cpumask_ma, cpumask);
+		bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
 		migrate_enable();
 	}
 }
@@ -468,7 +468,7 @@ static int __init cpumask_kfunc_init(void)
 		},
 	};
 
-	ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+	ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
 	return  ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
-- 
cgit v1.2.3


From d5f7638eb5fed0eb12e45a127764c4111b11c50e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 24 Oct 2022 05:34:14 -0700
Subject: Input: matrix_keypad - replace header inclusions by forward
 declarations

When the data structure is only referred by pointer, compiler may not need
to see the contents of the data type. Thus, we may replace header inclusions
by respective forward declarations.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220923184632.2157-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/matrix_keypad.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h
index 9476768c3b90..b8d8d69eba29 100644
--- a/include/linux/input/matrix_keypad.h
+++ b/include/linux/input/matrix_keypad.h
@@ -3,8 +3,9 @@
 #define _MATRIX_KEYPAD_H
 
 #include <linux/types.h>
-#include <linux/input.h>
-#include <linux/of.h>
+
+struct device;
+struct input_dev;
 
 #define MATRIX_MAX_ROWS		32
 #define MATRIX_MAX_COLS		32
-- 
cgit v1.2.3


From 7e0dac2807e6c4ae8c56941d74971fdb0763b4f9 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:45 -0800
Subject: bpf: Refactor process_dynptr_func

This change cleans up process_dynptr_func's flow to be more intuitive
and updates some comments with more context.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-3-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  3 ---
 kernel/bpf/verifier.c        | 62 ++++++++++++++++++++++----------------------
 2 files changed, 31 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cf1bb1cf4a7b..b26ff2a8f63b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			   enum bpf_arg_type arg_type);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size);
-struct bpf_call_arg_meta;
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
-			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta);
 
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5cb8b623f639..e0e00509846b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -959,39 +959,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				       int spi)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
+	int spi;
+
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return false;
 
-	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
-	 * will do check_mem_access to check and update stack bounds later, so
-	 * return true for that case.
+	spi = dynptr_get_spi(env, reg);
+
+	/* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+	 * error because this just means the stack state hasn't been updated yet.
+	 * We will do check_mem_access to check and update stack bounds later.
 	 */
-	if (spi < 0)
-		return spi == -ERANGE;
-	/* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
-	 * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
-	 * ensure dynptr objects at the slots we are touching are completely
-	 * destructed before we reinitialize them for a new one. For referenced
-	 * ones, destroy_if_dynptr_stack_slot returns an error early instead of
-	 * delaying it until the end where the user will get "Unreleased
+	if (spi < 0 && spi != -ERANGE)
+		return false;
+
+	/* We don't need to check if the stack slots are marked by previous
+	 * dynptr initializations because we allow overwriting existing unreferenced
+	 * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+	 * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+	 * touching are completely destructed before we reinitialize them for a new
+	 * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+	 * instead of delaying it until the end where the user will get "Unreleased
 	 * reference" error.
 	 */
 	return true;
 }
 
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				     int spi)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
-	int i;
+	int i, spi;
 
-	/* This already represents first slot of initialized bpf_dynptr */
+	/* This already represents first slot of initialized bpf_dynptr.
+	 *
+	 * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+	 * check_func_arg_reg_off's logic, so we don't need to check its
+	 * offset and alignment.
+	 */
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return true;
 
+	spi = dynptr_get_spi(env, reg);
 	if (spi < 0)
 		return false;
 	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
@@ -6215,11 +6225,10 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
  * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
  * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
  */
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
-			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+			       enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
-	int spi = 0;
 
 	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
 	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -6228,15 +6237,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
 		return -EFAULT;
 	}
-	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
-	 * check_func_arg_reg_off's logic. We only need to check offset
-	 * and its alignment for PTR_TO_STACK.
-	 */
-	if (reg->type == PTR_TO_STACK) {
-		spi = dynptr_get_spi(env, reg);
-		if (spi < 0 && spi != -ERANGE)
-			return spi;
-	}
 
 	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for
 	 *		 constructing a mutable bpf_dynptr object.
@@ -6254,7 +6254,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 	 *		 to.
 	 */
 	if (arg_type & MEM_UNINIT) {
-		if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+		if (!is_dynptr_reg_valid_uninit(env, reg)) {
 			verbose(env, "Dynptr has to be an uninitialized dynptr\n");
 			return -EINVAL;
 		}
@@ -6277,7 +6277,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 			return -EINVAL;
 		}
 
-		if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+		if (!is_dynptr_reg_valid_init(env, reg)) {
 			verbose(env,
 				"Expected an initialized dynptr as arg #%d\n",
 				regno);
-- 
cgit v1.2.3


From 8357b366cbb09b17c90e2cd758360a6bd2ea7507 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:47 -0800
Subject: bpf: Define no-ops for externally called bpf dynptr functions

Some bpf dynptr functions will be called from places where
if CONFIG_BPF_SYSCALL is not set, then the dynptr function is
undefined. For example, when skb type dynptrs are added in the
next commit, dynptr functions are called from net/core/filter.c

This patch defines no-op implementations of these dynptr functions
so that they do not break compilation by being an undefined reference.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-5-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 75 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520b238abd5a..296841a31749 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1124,6 +1124,33 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
 	return bpf_func(ctx, insnsi);
 }
 
+/* the implementation of the opaque uapi struct bpf_dynptr */
+struct bpf_dynptr_kern {
+	void *data;
+	/* Size represents the number of usable bytes of dynptr data.
+	 * If for example the offset is at 4 for a local dynptr whose data is
+	 * of type u64, the number of usable bytes is 4.
+	 *
+	 * The upper 8 bits are reserved. It is as follows:
+	 * Bits 0 - 23 = size
+	 * Bits 24 - 30 = dynptr type
+	 * Bit 31 = whether dynptr is read-only
+	 */
+	u32 size;
+	u32 offset;
+} __aligned(8);
+
+enum bpf_dynptr_type {
+	BPF_DYNPTR_TYPE_INVALID,
+	/* Points to memory that is local to the bpf program */
+	BPF_DYNPTR_TYPE_LOCAL,
+	/* Underlying data is a ringbuf record */
+	BPF_DYNPTR_TYPE_RINGBUF,
+};
+
+int bpf_dynptr_check_size(u32 size);
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
+
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
 int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
@@ -2266,6 +2293,11 @@ static inline bool has_current_bpf_ctx(void)
 }
 
 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+		     enum bpf_dynptr_type type, u32 offset, u32 size);
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -2495,6 +2527,19 @@ static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog)
 static inline void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 }
+
+static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+				   enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+}
+
+static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+}
+
+static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -2913,36 +2958,6 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data);
 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
 
-/* the implementation of the opaque uapi struct bpf_dynptr */
-struct bpf_dynptr_kern {
-	void *data;
-	/* Size represents the number of usable bytes of dynptr data.
-	 * If for example the offset is at 4 for a local dynptr whose data is
-	 * of type u64, the number of usable bytes is 4.
-	 *
-	 * The upper 8 bits are reserved. It is as follows:
-	 * Bits 0 - 23 = size
-	 * Bits 24 - 30 = dynptr type
-	 * Bit 31 = whether dynptr is read-only
-	 */
-	u32 size;
-	u32 offset;
-} __aligned(8);
-
-enum bpf_dynptr_type {
-	BPF_DYNPTR_TYPE_INVALID,
-	/* Points to memory that is local to the bpf program */
-	BPF_DYNPTR_TYPE_LOCAL,
-	/* Underlying data is a kernel-produced ringbuf record */
-	BPF_DYNPTR_TYPE_RINGBUF,
-};
-
-void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
-		     enum bpf_dynptr_type type, u32 offset, u32 size);
-void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
-int bpf_dynptr_check_size(u32 size);
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
-
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
 void bpf_cgroup_atype_put(int cgroup_atype);
-- 
cgit v1.2.3


From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:50 -0800
Subject: bpf: Add skb dynptrs

Add skb dynptrs, which are dynptrs whose underlying pointer points
to a skb. The dynptr acts on skb data. skb dynptrs have two main
benefits. One is that they allow operations on sizes that are not
statically known at compile-time (eg variable-sized accesses).
Another is that parsing the packet data through dynptrs (instead of
through direct access of skb->data and skb->data_end) can be more
ergonomic and less brittle (eg does not need manual if checking for
being within bounds of data_end).

For bpf prog types that don't support writes on skb data, the dynptr is
read-only (bpf_dynptr_write() will return an error)

For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write()
interfaces, reading and writing from/to data in the head as well as from/to
non-linear paged buffers is supported. Data slices through the
bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and
bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used.

For examples of how skb dynptrs can be used, please see the attached
selftests.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 14 +++++++-
 include/linux/filter.h         | 18 ++++++++++
 include/uapi/linux/bpf.h       | 13 ++++++--
 kernel/bpf/btf.c               | 18 ++++++++++
 kernel/bpf/helpers.c           | 76 ++++++++++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c          | 61 +++++++++++++++++++++++++++++++++
 net/core/filter.c              | 67 +++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 13 ++++++--
 8 files changed, 261 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 296841a31749..e7436d7615b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -607,11 +607,14 @@ enum bpf_type_flag {
 	 */
 	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to sk_buff */
+	DYNPTR_TYPE_SKB		= BIT(15 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
-#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
+#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1146,6 +1149,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_LOCAL,
 	/* Underlying data is a ringbuf record */
 	BPF_DYNPTR_TYPE_RINGBUF,
+	/* Underlying data is a sk_buff */
+	BPF_DYNPTR_TYPE_SKB,
 };
 
 int bpf_dynptr_check_size(u32 size);
@@ -2846,6 +2851,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 				struct bpf_insn *insn_buf,
 				struct bpf_prog *prog,
 				u32 *target_size);
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+			       struct bpf_dynptr_kern *ptr);
 #else
 static inline bool bpf_sock_common_is_valid_access(int off, int size,
 						   enum bpf_access_type type,
@@ -2867,6 +2874,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 {
 	return 0;
 }
+static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+					     struct bpf_dynptr_kern *ptr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #ifdef CONFIG_INET
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1727898f1641..de18e844d15e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1542,4 +1542,22 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index
 	return XDP_REDIRECT;
 }
 
+#ifdef CONFIG_NET
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags);
+#else /* CONFIG_NET */
+static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
+				       void *to, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
+					const void *from, u32 len, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET */
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..d0351d30e551 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5325,11 +5325,17 @@ union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  For *flags*, please see the flags accepted by
+ *		       **bpf_skb_store_bytes**\ ().
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
  * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5337,6 +5343,9 @@ union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 84cca8473873..ef2d8969ed1f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -207,6 +207,11 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_TRACING,
 	BTF_KFUNC_HOOK_SYSCALL,
 	BTF_KFUNC_HOOK_FMODRET,
+	BTF_KFUNC_HOOK_CGROUP_SKB,
+	BTF_KFUNC_HOOK_SCHED_ACT,
+	BTF_KFUNC_HOOK_SK_SKB,
+	BTF_KFUNC_HOOK_SOCKET_FILTER,
+	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7708,6 +7713,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_TRACING;
 	case BPF_PROG_TYPE_SYSCALL:
 		return BTF_KFUNC_HOOK_SYSCALL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return BTF_KFUNC_HOOK_CGROUP_SKB;
+	case BPF_PROG_TYPE_SCHED_ACT:
+		return BTF_KFUNC_HOOK_SCHED_ACT;
+	case BPF_PROG_TYPE_SK_SKB:
+		return BTF_KFUNC_HOOK_SK_SKB;
+	case BPF_PROG_TYPE_SOCKET_FILTER:
+		return BTF_KFUNC_HOOK_SOCKET_FILTER;
+	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+		return BTF_KFUNC_HOOK_LWT;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index abdcc52f90a6..e8e2414d1587 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1420,11 +1420,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
 	return ptr->size & DYNPTR_RDONLY_BIT;
 }
 
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+	ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
 {
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
 u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
@@ -1497,6 +1507,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
 	   u32, offset, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!src->data || flags)
@@ -1506,13 +1517,23 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
 	if (err)
 		return err;
 
-	/* Source and destination may possibly overlap, hence use memmove to
-	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
-	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
-	 */
-	memmove(dst, src->data + src->offset + offset, len);
+	type = bpf_dynptr_get_type(src);
 
-	return 0;
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		/* Source and destination may possibly overlap, hence use memmove to
+		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
+		 */
+		memmove(dst, src->data + src->offset + offset, len);
+		return 0;
+	case BPF_DYNPTR_TYPE_SKB:
+		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+	default:
+		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+		return -EFAULT;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_read_proto = {
@@ -1529,22 +1550,36 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
 BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
 	   u32, len, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
-	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
 	if (err)
 		return err;
 
-	/* Source and destination may possibly overlap, hence use memmove to
-	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
-	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
-	 */
-	memmove(dst->data + dst->offset + offset, src, len);
+	type = bpf_dynptr_get_type(dst);
 
-	return 0;
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		if (flags)
+			return -EINVAL;
+		/* Source and destination may possibly overlap, hence use memmove to
+		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
+		 */
+		memmove(dst->data + dst->offset + offset, src, len);
+		return 0;
+	case BPF_DYNPTR_TYPE_SKB:
+		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+					     flags);
+	default:
+		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+		return -EFAULT;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_write_proto = {
@@ -1560,6 +1595,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
 
 BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!ptr->data)
@@ -1572,7 +1608,19 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	if (bpf_dynptr_is_rdonly(ptr))
 		return 0;
 
-	return (unsigned long)(ptr->data + ptr->offset + offset);
+	type = bpf_dynptr_get_type(ptr);
+
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return (unsigned long)(ptr->data + ptr->offset + offset);
+	case BPF_DYNPTR_TYPE_SKB:
+		/* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+		return 0;
+	default:
+		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+		return 0;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_data_proto = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d052aa5800de..4f5fce16543b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -750,6 +750,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_LOCAL;
 	case DYNPTR_TYPE_RINGBUF:
 		return BPF_DYNPTR_TYPE_RINGBUF;
+	case DYNPTR_TYPE_SKB:
+		return BPF_DYNPTR_TYPE_SKB;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -6295,6 +6297,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 			case DYNPTR_TYPE_RINGBUF:
 				err_extra = "ringbuf";
 				break;
+			case DYNPTR_TYPE_SKB:
+				err_extra = "skb ";
+				break;
 			default:
 				err_extra = "<unknown>";
 				break;
@@ -6737,6 +6742,24 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
 	return state->stack[spi].spilled_ptr.ref_obj_id;
 }
 
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+					    struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi;
+
+	if (reg->type == CONST_PTR_TO_DYNPTR)
+		return reg->dynptr.type;
+
+	spi = __get_spi(reg->off);
+	if (spi < 0) {
+		verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+		return BPF_DYNPTR_TYPE_INVALID;
+	}
+
+	return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn,
@@ -8383,6 +8406,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 		break;
 	}
+	case BPF_FUNC_dynptr_write:
+	{
+		enum bpf_dynptr_type dynptr_type;
+		struct bpf_reg_state *reg;
+
+		reg = get_dynptr_arg_reg(env, fn, regs);
+		if (!reg)
+			return -EFAULT;
+
+		dynptr_type = dynptr_get_type(env, reg);
+		if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+			return -EFAULT;
+
+		if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+			/* this will trigger clear_all_pkt_pointers(), which will
+			 * invalidate all dynptr slices associated with the skb
+			 */
+			changes_data = true;
+
+		break;
+	}
 	case BPF_FUNC_user_ringbuf_drain:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_user_ringbuf_callback_state);
@@ -8898,6 +8942,7 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_remove,
 	KF_bpf_rbtree_add,
 	KF_bpf_rbtree_first,
+	KF_bpf_dynptr_from_skb,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8912,6 +8957,7 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8928,6 +8974,7 @@ BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9682,6 +9729,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				dynptr_arg_type |= MEM_UNINIT;
 
+			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+				dynptr_arg_type |= DYNPTR_TYPE_SKB;
+
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
 				return ret;
@@ -16356,6 +16406,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+		bool seen_direct_write = env->seen_direct_write;
+		bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+		if (is_rdonly)
+			insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly);
+
+		/* restore env->seen_direct_write to its original value, since
+		 * may_access_direct_pkt_data mutates it
+		 */
+		env->seen_direct_write = seen_direct_write;
 	}
 	return 0;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 1d6f165923bf..f3afa31a9b10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1721,6 +1721,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags)
+{
+	return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
 	   void *, to, u32, len)
 {
@@ -1751,6 +1757,11 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+{
+	return ____bpf_skb_load_bytes(skb, offset, to, len);
+}
+
 BPF_CALL_4(bpf_flow_dissector_load_bytes,
 	   const struct bpf_flow_dissector *, ctx, u32, offset,
 	   void *, to, u32, len)
@@ -11621,3 +11632,59 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 
 	return func;
 }
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
+				    struct bpf_dynptr_kern *ptr__uninit)
+{
+	if (flags) {
+		bpf_dynptr_set_null(ptr__uninit);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+	return 0;
+}
+__diag_pop();
+
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+			       struct bpf_dynptr_kern *ptr__uninit)
+{
+	int err;
+
+	err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
+	if (err)
+		return err;
+
+	bpf_dynptr_set_rdonly(ptr__uninit);
+
+	return 0;
+}
+
+BTF_SET8_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
+BTF_SET8_END(bpf_kfunc_check_set_skb)
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_skb,
+};
+
+static int __init bpf_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+}
+late_initcall(bpf_kfunc_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..d0351d30e551 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5325,11 +5325,17 @@ union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  For *flags*, please see the flags accepted by
+ *		       **bpf_skb_store_bytes**\ ().
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
  * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5337,6 +5343,9 @@ union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
-- 
cgit v1.2.3


From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:51 -0800
Subject: bpf: Add xdp dynptrs

Add xdp dynptrs, which are dynptrs whose underlying pointer points
to a xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main
benefits. One is that they allow operations on sizes that are not
statically known at compile-time (eg variable-sized accesses).
Another is that parsing the packet data through dynptrs (instead of
through direct access of xdp->data and xdp->data_end) can be more
ergonomic and less brittle (eg does not need manual if checking for
being within bounds of data_end).

For reads and writes on the dynptr, this includes reading/writing
from/to and across fragments. Data slices through the bpf_dynptr_data
API are not supported; instead bpf_dynptr_slice() and
bpf_dynptr_slice_rdwr() should be used.

For examples of how xdp dynptrs can be used, please see the attached
selftests.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 +++++++-
 include/linux/filter.h         | 14 ++++++++++++++
 include/uapi/linux/bpf.h       |  2 +-
 kernel/bpf/helpers.c           |  9 ++++++++-
 kernel/bpf/verifier.c          | 10 ++++++++++
 net/core/filter.c              | 37 +++++++++++++++++++++++++++++++++++--
 tools/include/uapi/linux/bpf.h |  2 +-
 7 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7436d7615b0..23ec684e660d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -610,11 +610,15 @@ enum bpf_type_flag {
 	/* DYNPTR points to sk_buff */
 	DYNPTR_TYPE_SKB		= BIT(15 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to xdp_buff */
+	DYNPTR_TYPE_XDP		= BIT(16 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
-#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
+#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
+				 | DYNPTR_TYPE_XDP)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1151,6 +1155,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_RINGBUF,
 	/* Underlying data is a sk_buff */
 	BPF_DYNPTR_TYPE_SKB,
+	/* Underlying data is a xdp_buff */
+	BPF_DYNPTR_TYPE_XDP,
 };
 
 int bpf_dynptr_check_size(u32 size);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index de18e844d15e..3f6992261ec5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1546,6 +1546,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index
 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
 			  u32 len, u64 flags);
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
 				       void *to, u32 len)
@@ -1558,6 +1560,18 @@ static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset,
+				       void *buf, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
+					void *buf, u32 len)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_NET */
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d0351d30e551..faa304c926cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5344,7 +5344,7 @@ union bpf_attr {
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
  *
- *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		skb and xdp type dynptrs may not use bpf_dynptr_data. They should
  *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e8e2414d1587..114a875a05b1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1530,6 +1530,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
 		return 0;
 	case BPF_DYNPTR_TYPE_SKB:
 		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+	case BPF_DYNPTR_TYPE_XDP:
+		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1576,6 +1578,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
 	case BPF_DYNPTR_TYPE_SKB:
 		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
 					     flags);
+	case BPF_DYNPTR_TYPE_XDP:
+		if (flags)
+			return -EINVAL;
+		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1615,7 +1621,8 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	case BPF_DYNPTR_TYPE_RINGBUF:
 		return (unsigned long)(ptr->data + ptr->offset + offset);
 	case BPF_DYNPTR_TYPE_SKB:
-		/* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+	case BPF_DYNPTR_TYPE_XDP:
+		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
 		return 0;
 	default:
 		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4f5fce16543b..5e42946e53ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -752,6 +752,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_RINGBUF;
 	case DYNPTR_TYPE_SKB:
 		return BPF_DYNPTR_TYPE_SKB;
+	case DYNPTR_TYPE_XDP:
+		return BPF_DYNPTR_TYPE_XDP;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -6300,6 +6302,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 			case DYNPTR_TYPE_SKB:
 				err_extra = "skb ";
 				break;
+			case DYNPTR_TYPE_XDP:
+				err_extra = "xdp ";
+				break;
 			default:
 				err_extra = "<unknown>";
 				break;
@@ -8943,6 +8948,7 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_add,
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
+	KF_bpf_dynptr_from_xdp,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8958,6 +8964,7 @@ BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8975,6 +8982,7 @@ BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9731,6 +9739,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
 				dynptr_arg_type |= DYNPTR_TYPE_SKB;
+			else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+				dynptr_arg_type |= DYNPTR_TYPE_XDP;
 
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f3afa31a9b10..c692046fa7f6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3839,7 +3839,7 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_1(bpf_xdp_get_buff_len, struct  xdp_buff*, xdp)
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
 {
 	return xdp_get_buff_len(xdp);
 }
@@ -3999,6 +3999,11 @@ static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+	return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
+}
+
 BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
 	   void *, buf, u32, len)
 {
@@ -4026,6 +4031,11 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+	return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
+}
+
 static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
@@ -11648,6 +11658,19 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
 
 	return 0;
 }
+
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
+				    struct bpf_dynptr_kern *ptr__uninit)
+{
+	if (flags) {
+		bpf_dynptr_set_null(ptr__uninit);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+
+	return 0;
+}
 __diag_pop();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11668,11 +11691,20 @@ BTF_SET8_START(bpf_kfunc_check_set_skb)
 BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
 BTF_SET8_END(bpf_kfunc_check_set_skb)
 
+BTF_SET8_START(bpf_kfunc_check_set_xdp)
+BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_SET8_END(bpf_kfunc_check_set_xdp)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_xdp,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11685,6 +11717,7 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 }
 late_initcall(bpf_kfunc_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d0351d30e551..faa304c926cf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5344,7 +5344,7 @@ union bpf_attr {
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
  *
- *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		skb and xdp type dynptrs may not use bpf_dynptr_data. They should
  *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
-- 
cgit v1.2.3


From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:52 -0800
Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr

Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
The user must pass in a buffer to store the contents of the data slice
if a direct pointer to the data cannot be obtained.

For skb and xdp type dynptrs, these two APIs are the only way to obtain
a data slice. However, for other types of dynptrs, there is no
difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.

For skb type dynptrs, the data is copied into the user provided buffer
if any of the data is not in the linear portion of the skb. For xdp type
dynptrs, the data is copied into the user provided buffer if the data is
between xdp frags.

If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
the skb will be uncloned (see bpf_unclone_prologue()).

Please note that any bpf_dynptr_write() automatically invalidates any prior
data slices of the skb dynptr. This is because the skb may be cloned or
may need to pull its paged buffer into the head. As such, any
bpf_dynptr_write() will automatically have its prior data slices
invalidated, even if the write is to data in the skb head of an uncloned
skb. Please note as well that any other helper calls that change the
underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
slices of the skb dynptr as well, for the same reasons.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h         |  14 +++++
 include/uapi/linux/bpf.h       |   5 ++
 kernel/bpf/helpers.c           | 138 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 127 +++++++++++++++++++++++++++++++++++--
 net/core/filter.c              |   6 +-
 tools/include/uapi/linux/bpf.h |   5 ++
 6 files changed, 288 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3f6992261ec5..efa5d4a1677e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1548,6 +1548,9 @@ int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
 			  u32 len, u64 flags);
 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
 				       void *to, u32 len)
@@ -1572,6 +1575,17 @@ static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+	return NULL;
+}
+
+static inline void *bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
+				     unsigned long len, bool flush)
+{
+	return NULL;
+}
 #endif /* CONFIG_NET */
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@ union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 114a875a05b1..648b29e78b84 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2193,6 +2193,142 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
 	return p;
 }
 
+/**
+ * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+				   void *buffer, u32 buffer__szk)
+{
+	enum bpf_dynptr_type type;
+	u32 len = buffer__szk;
+	int err;
+
+	if (!ptr->data)
+		return 0;
+
+	err = bpf_dynptr_check_off_len(ptr, offset, len);
+	if (err)
+		return 0;
+
+	type = bpf_dynptr_get_type(ptr);
+
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return ptr->data + ptr->offset + offset;
+	case BPF_DYNPTR_TYPE_SKB:
+		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+	case BPF_DYNPTR_TYPE_XDP:
+	{
+		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+		if (xdp_ptr)
+			return xdp_ptr;
+
+		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+		return buffer;
+	}
+	default:
+		WARN_ONCE(true, "unknown dynptr type %d\n", type);
+		return 0;
+	}
+}
+
+/**
+ * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ *	return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+					void *buffer, u32 buffer__szk)
+{
+	if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+		return 0;
+
+	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+	 *
+	 * For skb-type dynptrs, it is safe to write into the returned pointer
+	 * if the bpf program allows skb data writes. There are two possiblities
+	 * that may occur when calling bpf_dynptr_slice_rdwr:
+	 *
+	 * 1) The requested slice is in the head of the skb. In this case, the
+	 * returned pointer is directly to skb data, and if the skb is cloned, the
+	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+	 * The pointer can be directly written into.
+	 *
+	 * 2) Some portion of the requested slice is in the paged buffer area.
+	 * In this case, the requested data will be copied out into the buffer
+	 * and the returned pointer will be a pointer to the buffer. The skb
+	 * will not be pulled. To persist the write, the user will need to call
+	 * bpf_dynptr_write(), which will pull the skb and commit the write.
+	 *
+	 * Similarly for xdp programs, if the requested slice is not across xdp
+	 * fragments, then a direct pointer will be returned, otherwise the data
+	 * will be copied out into the buffer and the user will need to call
+	 * bpf_dynptr_write() to commit changes.
+	 */
+	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2262,6 +2398,8 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
 BTF_ID_FLAGS(func, bpf_rdonly_cast)
 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5e42946e53ab..a856896e835a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -759,6 +759,22 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 	}
 }
 
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+		return DYNPTR_TYPE_LOCAL;
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return DYNPTR_TYPE_RINGBUF;
+	case BPF_DYNPTR_TYPE_SKB:
+		return DYNPTR_TYPE_SKB;
+	case BPF_DYNPTR_TYPE_XDP:
+		return DYNPTR_TYPE_XDP;
+	default:
+		return 0;
+	}
+}
+
 static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 {
 	return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -1681,6 +1697,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
 	       reg->type == PTR_TO_PACKET_END;
 }
 
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+	return base_type(reg->type) == PTR_TO_MEM &&
+		(reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
 /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
 static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
 				    enum bpf_reg_type which)
@@ -7429,6 +7451,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
  * are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -7436,7 +7461,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	struct bpf_reg_state *reg;
 
 	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
-		if (reg_is_pkt_pointer_any(reg))
+		if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
 			mark_reg_invalid(env, reg);
 	}));
 }
@@ -8688,6 +8713,11 @@ struct bpf_kfunc_call_arg_meta {
 	struct {
 		struct btf_field *field;
 	} arg_rbtree_root;
+	struct {
+		enum bpf_dynptr_type type;
+		u32 id;
+	} initialized_dynptr;
+	u64 mem_size;
 };
 
 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
@@ -8761,6 +8791,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	return __kfunc_param_match_suffix(btf, arg, "__sz");
 }
 
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+					const struct btf_param *arg,
+					const struct bpf_reg_state *reg)
+{
+	const struct btf_type *t;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
 static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
 {
 	return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8949,6 +8992,8 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
+	KF_bpf_dynptr_slice,
+	KF_bpf_dynptr_slice_rdwr,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8965,6 +9010,8 @@ BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8983,6 +9030,8 @@ BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9062,7 +9111,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_CALLBACK;
 
-	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+	if (argno + 1 < nargs &&
+	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
 		arg_mem_size = true;
 
 	/* This is the catch all argument type of register types supported by
@@ -9745,6 +9797,18 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
 				return ret;
+
+			if (!(dynptr_arg_type & MEM_UNINIT)) {
+				int id = dynptr_id(env, reg);
+
+				if (id < 0) {
+					verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+					return id;
+				}
+				meta->initialized_dynptr.id = id;
+				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+			}
+
 			break;
 		}
 		case KF_ARG_PTR_TO_LIST_HEAD:
@@ -9840,14 +9904,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_MEM_SIZE:
-			ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+		{
+			struct bpf_reg_state *size_reg = &regs[regno + 1];
+			const struct btf_param *size_arg = &args[i + 1];
+
+			ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
 			if (ret < 0) {
 				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
 				return ret;
 			}
-			/* Skip next '__sz' argument */
+
+			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+				if (meta->arg_constant.found) {
+					verbose(env, "verifier internal error: only one constant argument permitted\n");
+					return -EFAULT;
+				}
+				if (!tnum_is_const(size_reg->var_off)) {
+					verbose(env, "R%d must be a known constant\n", regno + 1);
+					return -EINVAL;
+				}
+				meta->arg_constant.found = true;
+				meta->arg_constant.value = size_reg->var_off.value;
+			}
+
+			/* Skip next '__sz' or '__szk' argument */
 			i++;
 			break;
+		}
 		case KF_ARG_PTR_TO_CALLBACK:
 			meta->subprogno = reg->subprogno;
 			break;
@@ -10082,6 +10165,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
 				regs[BPF_REG_0].btf = desc_btf;
 				regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+			} else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+				   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+				enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+				mark_reg_known_zero(env, regs, BPF_REG_0);
+
+				if (!meta.arg_constant.found) {
+					verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+					return -EFAULT;
+				}
+
+				regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+				/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+				regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+				if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+					regs[BPF_REG_0].type |= MEM_RDONLY;
+				} else {
+					/* this will set env->seen_direct_write to true */
+					if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+						verbose(env, "the prog does not allow writes to packet data\n");
+						return -EINVAL;
+					}
+				}
+
+				if (!meta.initialized_dynptr.id) {
+					verbose(env, "verifier internal error: no dynptr id\n");
+					return -EFAULT;
+				}
+				regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+				/* we don't need to set BPF_REG_0's ref obj id
+				 * because packet slices are not refcounted (see
+				 * dynptr_type_refcounted)
+				 */
 			} else {
 				verbose(env, "kernel function %s unhandled dynamic return type\n",
 					meta.func_name);
diff --git a/net/core/filter.c b/net/core/filter.c
index c692046fa7f6..8f3124e06133 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3894,8 +3894,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
-			     void *buf, unsigned long len, bool flush)
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush)
 {
 	unsigned long ptr_len, ptr_off = 0;
 	skb_frag_t *next_frag, *end_frag;
@@ -3941,7 +3941,7 @@ static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
 	}
 }
 
-static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
 	u32 size = xdp->data_end - xdp->data;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@ union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return
-- 
cgit v1.2.3


From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 25 Feb 2023 16:40:09 +0100
Subject: bpf: Support kptrs in local storage maps

Enable support for kptrs in local storage maps by wiring up the freeing
of these kptrs from map value. Freeing of bpf_local_storage_map is only
delayed in case there are special fields, therefore bpf_selem_free_*
path can also only dereference smap safely in that case. This is
recorded using a bool utilizing a hole in bpF_local_storage_elem. It
could have been tagged in the pointer value smap using the lowest bit
(since alignment > 1), but since there was already a hole I went with
the simpler option. Only the map structure freeing is delayed using RCU
barriers, as the buckets aren't used when selem is being freed, so they
can be freed once all readers of the bucket lists can no longer access
it.

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 +++++
 kernel/bpf/bpf_local_storage.c    | 48 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/syscall.c              |  6 ++++-
 kernel/bpf/verifier.c             | 12 ++++++----
 4 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 6d37a40cd90e..0fe92986412b 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -74,6 +74,12 @@ struct bpf_local_storage_elem {
 	struct hlist_node snode;	/* Linked to bpf_local_storage */
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
+	bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
+			    * callbacks? bpf_local_storage_map_free only
+			    * executes rcu_barrier when there are special
+			    * fields, this field remembers that to ensure we
+			    * don't access already freed smap in sdata.
+			    */
 	/* 8 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 58da17ae5124..2bdd722fe293 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (selem) {
 		if (value)
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
+		/* No need to call check_and_init_map_value as memory is zero init */
 		return selem;
 	}
 
@@ -113,10 +114,25 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	/* The can_use_smap bool is set whenever we need to free additional
+	 * fields in selem data before freeing selem. bpf_local_storage_map_free
+	 * only executes rcu_barrier to wait for RCU callbacks when it has
+	 * special fields, hence we can only conditionally dereference smap, as
+	 * by this time the map might have already been freed without waiting
+	 * for our call_rcu callback if it did not have any special fields.
+	 */
+	if (selem->can_use_smap)
+		bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
+	kfree(selem);
+}
+
+static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
+{
+	/* Free directly if Tasks Trace RCU GP also implies RCU GP */
 	if (rcu_trace_implies_rcu_gp())
-		kfree(selem);
+		bpf_selem_free_rcu(rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -170,9 +186,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
 	if (use_trace_rcu)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
 
 	return free_local_storage;
 }
@@ -240,6 +256,11 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	/* If our data will have special fields, smap will wait for us to use
+	 * its record in bpf_selem_free_* RCU callbacks before freeing itself.
+	 */
+	selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
@@ -723,6 +744,25 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
+	/* Only delay freeing of smap, buckets are not needed anymore */
 	kvfree(smap->buckets);
+
+	/* When local storage has special fields, callbacks for
+	 * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
+	 * the map BTF record, we need to execute an RCU barrier to wait for
+	 * them as the record will be freed right after our map_free callback.
+	 */
+	if (!IS_ERR_OR_NULL(smap->map.record)) {
+		rcu_barrier_tasks_trace();
+		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
+		 * is true, because while call_rcu invocation is skipped in that
+		 * case in bpf_selem_free_tasks_trace_rcu (and all local storage
+		 * maps pass use_trace_rcu = true), there can be call_rcu
+		 * callbacks based on use_trace_rcu = false in the earlier while
+		 * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
+		 * called from owner's free path.
+		 */
+		rcu_barrier();
+	}
 	bpf_map_area_free(smap);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index da117a2a83b2..eb50025b03c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1063,7 +1063,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY &&
-				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
 					ret = -EOPNOTSUPP;
 					goto free_map_tab;
 				}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a856896e835a..bf580f246a01 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7222,22 +7222,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_MAP_TYPE_SK_STORAGE:
 		if (func_id != BPF_FUNC_sk_storage_get &&
-		    func_id != BPF_FUNC_sk_storage_delete)
+		    func_id != BPF_FUNC_sk_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_INODE_STORAGE:
 		if (func_id != BPF_FUNC_inode_storage_get &&
-		    func_id != BPF_FUNC_inode_storage_delete)
+		    func_id != BPF_FUNC_inode_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_TASK_STORAGE:
 		if (func_id != BPF_FUNC_task_storage_get &&
-		    func_id != BPF_FUNC_task_storage_delete)
+		    func_id != BPF_FUNC_task_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGRP_STORAGE:
 		if (func_id != BPF_FUNC_cgrp_storage_get &&
-		    func_id != BPF_FUNC_cgrp_storage_delete)
+		    func_id != BPF_FUNC_cgrp_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_BLOOM_FILTER:
-- 
cgit v1.2.3


From f71f8530494bb5ab43d3369ef0ce8373eb1ee077 Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@linux.intel.com>
Date: Thu, 2 Mar 2023 13:46:13 +0200
Subject: bpf: Add support for absolute value BPF timers

Add a new flag BPF_F_TIMER_ABS that can be passed to bpf_timer_start()
to start an absolute value timer instead of the default relative value.
This makes the timer expire at an exact point in time, instead of a time
with latencies induced by both the BPF and timer subsystems.

Suggested-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Link: https://lore.kernel.org/r/20230302114614.2985072-2-tero.kristo@linux.intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 15 +++++++++++++++
 kernel/bpf/helpers.c           | 11 +++++++++--
 tools/include/uapi/linux/bpf.h | 15 +++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c9699304aed2..976b194eb775 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4969,6 +4969,12 @@ union bpf_attr {
  *		different maps if key/value layout matches across maps.
  *		Every bpf_timer_set_callback() can have different callback_fn.
  *
+ *		*flags* can be one of:
+ *
+ *		**BPF_F_TIMER_ABS**
+ *			Start the timer in absolute expire value instead of the
+ *			default relative one.
+ *
  *	Return
  *		0 on success.
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -7097,4 +7103,13 @@ struct bpf_core_relo {
 	enum bpf_core_relo_kind kind;
 };
 
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ *       relative to current time.
+ */
+enum {
+	BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6fc0d6c44e4c..12f12e879bcf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1264,10 +1264,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 {
 	struct bpf_hrtimer *t;
 	int ret = 0;
+	enum hrtimer_mode mode;
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	if (flags)
+	if (flags > BPF_F_TIMER_ABS)
 		return -EINVAL;
 	__bpf_spin_lock_irqsave(&timer->lock);
 	t = timer->timer;
@@ -1275,7 +1276,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 		ret = -EINVAL;
 		goto out;
 	}
-	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+
+	if (flags & BPF_F_TIMER_ABS)
+		mode = HRTIMER_MODE_ABS_SOFT;
+	else
+		mode = HRTIMER_MODE_REL_SOFT;
+
+	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	return ret;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c9699304aed2..976b194eb775 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4969,6 +4969,12 @@ union bpf_attr {
  *		different maps if key/value layout matches across maps.
  *		Every bpf_timer_set_callback() can have different callback_fn.
  *
+ *		*flags* can be one of:
+ *
+ *		**BPF_F_TIMER_ABS**
+ *			Start the timer in absolute expire value instead of the
+ *			default relative one.
+ *
  *	Return
  *		0 on success.
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -7097,4 +7103,13 @@ struct bpf_core_relo {
 	enum bpf_core_relo_kind kind;
 };
 
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ *       relative to current time.
+ */
+enum {
+	BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 20c09d92faeefb8536f705d3a4629e0dc314c8a1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 2 Mar 2023 20:14:43 -0800
Subject: bpf: Introduce kptr_rcu.

The life time of certain kernel structures like 'struct cgroup' is protected by RCU.
Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
Derefrence of other kptr-s returns PTR_UNTRUSTED.

For example:
struct map_value {
   struct cgroup __kptr *cgrp;
};

SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
{
  struct cgroup *cg, *cg2;

  cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
  bpf_kptr_xchg(&v->cgrp, cg);

  cg2 = v->cgrp; // This is new feature introduced by this patch.
  // cg2 is PTR_MAYBE_NULL | MEM_RCU.
  // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero

  if (cg2)
    bpf_cgroup_ancestor(cg2, level); // safe to do.
}

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com
---
 Documentation/bpf/kfuncs.rst                       | 12 +++--
 include/linux/btf.h                                |  2 +-
 kernel/bpf/helpers.c                               |  6 ++-
 kernel/bpf/verifier.c                              | 55 ++++++++++++++++++----
 net/bpf/test_run.c                                 |  3 +-
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       |  2 +-
 tools/testing/selftests/bpf/progs/map_kptr_fail.c  |  4 +-
 tools/testing/selftests/bpf/verifier/calls.c       |  2 +-
 tools/testing/selftests/bpf/verifier/map_kptr.c    |  2 +-
 9 files changed, 65 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index b5d9b0d446bc..69eccf6f98ef 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -249,11 +249,13 @@ added later.
 2.4.8 KF_RCU flag
 -----------------
 
-The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
-When used together with KF_ACQUIRE, it indicates the kfunc should have a
-single argument which must be a trusted argument or a MEM_RCU pointer.
-The argument may have reference count of 0 and the kfunc must take this
-into consideration.
+The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
+KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
+that the objects are valid and there is no use-after-free. The pointers are not
+NULL, but the object's refcount could have reached zero. The kfuncs need to
+consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE
+pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely
+also be KF_RET_NULL.
 
 .. _KF_deprecated_flag:
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 49e0fe6d8274..556b3e2e7471 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -70,7 +70,7 @@
 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
-#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
+#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 12f12e879bcf..637ac4e92e75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2163,8 +2163,10 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
 	if (level > cgrp->level || level < 0)
 		return NULL;
 
+	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
 	ancestor = cgrp->ancestors[level];
-	cgroup_get(ancestor);
+	if (!cgroup_tryget(ancestor))
+		return NULL;
 	return ancestor;
 }
 
@@ -2382,7 +2384,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b834f3d2d81a..a095055d7ef4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4218,7 +4218,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg, u32 regno)
 {
 	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
-	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
+	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
 	const char *reg_name = "";
 
 	/* Only unreferenced case accepts untrusted pointers */
@@ -4285,6 +4285,34 @@ bad_type:
 	return -EINVAL;
 }
 
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+	if (!btf_is_kernel(btf))
+		return false;
+	return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+	const struct btf_field_kptr *kptr = &field->kptr;
+
+	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+}
+
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 				 int value_regno, int insn_idx,
 				 struct btf_field *kptr_field)
@@ -4319,7 +4347,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id,
+				rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
+				PTR_MAYBE_NULL | MEM_RCU :
+				PTR_MAYBE_NULL | PTR_UNTRUSTED);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -5163,10 +5194,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	 * An RCU-protected pointer can also be deemed trusted if we are in an
 	 * RCU read region. This case is handled below.
 	 */
-	if (nested_ptr_is_trusted(env, reg, off))
+	if (nested_ptr_is_trusted(env, reg, off)) {
 		flag |= PTR_TRUSTED;
-	else
+		/*
+		 * task->cgroups is trusted. It provides a stronger guarantee
+		 * than __rcu tag on 'cgroups' field in 'struct task_struct'.
+		 * Clear MEM_RCU in such case.
+		 */
+		flag &= ~MEM_RCU;
+	} else {
 		flag &= ~PTR_TRUSTED;
+	}
 
 	if (flag & MEM_RCU) {
 		/* Mark value register as MEM_RCU only if it is protected by
@@ -5175,11 +5213,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
 		 * it could be null in some cases.
 		 */
-		if (!env->cur_state->active_rcu_lock ||
-		    !(is_trusted_reg(reg) || is_rcu_reg(reg)))
-			flag &= ~MEM_RCU;
-		else
+		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
 			flag |= PTR_MAYBE_NULL;
+		else
+			flag &= ~MEM_RCU;
 	} else if (reg->type & MEM_RCU) {
 		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
 		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
@@ -9676,7 +9713,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EINVAL;
 		}
 
-		if (is_kfunc_trusted_args(meta) &&
+		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
 		    (register_is_null(reg) || type_may_be_null(reg->type))) {
 			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
 			return -EACCES;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6f3d654b3339..6a8b33a103a4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 
 __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
 {
+	/* p != NULL, but p->cnt could be 0 */
 }
 
 __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
@@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_SET8_END(test_sk_check_kfunc_ids)
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 4ad7fe24966d..b42291ed9586 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -205,7 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket")
+__failure __msg("expects refcounted")
 int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value *v;
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index e19e2a5f38cf..08f9ec18c345 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
+__failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_")
 int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
 {
 	struct map_value *v;
@@ -316,7 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R2 type=untrusted_ptr_ expected=ptr_")
+__failure __msg("R2 must be referenced")
 int reject_untrusted_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 289ed202ec66..9a326a800e5c 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -243,7 +243,7 @@
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "R1 must be referenced",
+	.errstr = "R1 must be",
 },
 {
 	"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
index 6914904344c0..d775ccb01989 100644
--- a/tools/testing/selftests/bpf/verifier/map_kptr.c
+++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
@@ -336,7 +336,7 @@
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.fixup_map_kptr = { 1 },
 	.result = REJECT,
-	.errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_",
+	.errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_",
 },
 {
 	"map_kptr: ref: reject off != 0",
-- 
cgit v1.2.3


From 6fcd486b3a0a628c41f12b3a7329a18a2c74b351 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 2 Mar 2023 20:14:46 -0800
Subject: bpf: Refactor RCU enforcement in the verifier.

bpf_rcu_read_lock/unlock() are only available in clang compiled kernels. Lack
of such key mechanism makes it impossible for sleepable bpf programs to use RCU
pointers.

Allow bpf_rcu_read_lock/unlock() in GCC compiled kernels (though GCC doesn't
support btf_type_tag yet) and allowlist certain field dereferences in important
data structures like tast_struct, cgroup, socket that are used by sleepable
programs either as RCU pointer or full trusted pointer (which is valid outside
of RCU CS). Use BTF_TYPE_SAFE_RCU and BTF_TYPE_SAFE_TRUSTED macros for such
tagging. They will be removed once GCC supports btf_type_tag.

With that refactor check_ptr_to_btf_access(). Make it strict in enforcing
PTR_TRUSTED and PTR_UNTRUSTED while deprecating old PTR_TO_BTF_ID without
modifier flags. There is a chance that this strict enforcement might break
existing programs (especially on GCC compiled kernels), but this cleanup has to
start sooner than later. Note PTR_TO_CTX access still yields old deprecated
PTR_TO_BTF_ID. Once it's converted to strict PTR_TRUSTED or PTR_UNTRUSTED the
kfuncs and helpers will be able to default to KF_TRUSTED_ARGS. KF_RCU will
remain as a weaker version of KF_TRUSTED_ARGS where obj refcnt could be 0.

Adjust rcu_read_lock selftest to run on gcc and clang compiled kernels.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230303041446.3630-7-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h                                |   2 +-
 include/linux/bpf_verifier.h                       |   1 -
 kernel/bpf/btf.c                                   |  16 +-
 kernel/bpf/cpumask.c                               |  40 ++---
 kernel/bpf/verifier.c                              | 178 ++++++++++++++-------
 .../selftests/bpf/prog_tests/cgrp_local_storage.c  |  14 +-
 .../selftests/bpf/prog_tests/rcu_read_lock.c       |  16 +-
 .../selftests/bpf/progs/cgrp_ls_sleepable.c        |   4 +-
 .../testing/selftests/bpf/progs/cpumask_failure.c  |   2 +-
 .../selftests/bpf/progs/nested_trust_failure.c     |   2 +-
 tools/testing/selftests/bpf/progs/rcu_read_lock.c  |   6 +-
 tools/testing/selftests/bpf/verifier/calls.c       |   2 +-
 12 files changed, 173 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 23ec684e660d..d3456804f7aa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2279,7 +2279,7 @@ struct bpf_core_ctx {
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off);
+				int off, const char *suffix);
 
 bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 			       const struct btf *reg_btf, u32 reg_id,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b26ff2a8f63b..18538bad2b8c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -537,7 +537,6 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
-	bool rcu_tag_supported;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c5e1d6955491..a8cb09e5973b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6163,6 +6163,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 	const char *tname, *mname, *tag_value;
 	u32 vlen, elem_id, mid;
 
+	*flag = 0;
 again:
 	tname = __btf_name_by_offset(btf, t->name_off);
 	if (!btf_type_is_struct(t)) {
@@ -6329,6 +6330,15 @@ error:
 		 * of this field or inside of this struct
 		 */
 		if (btf_type_is_struct(mtype)) {
+			if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
+			    btf_type_vlen(mtype) != 1)
+				/*
+				 * walking unions yields untrusted pointers
+				 * with exception of __bpf_md_ptr and other
+				 * unions with a single member
+				 */
+				*flag |= PTR_UNTRUSTED;
+
 			/* our field must be inside that union or struct */
 			t = mtype;
 
@@ -6373,7 +6383,7 @@ error:
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
-				*flag = tmp_flag;
+				*flag |= tmp_flag;
 				return WALK_PTR;
 			}
 		}
@@ -8357,7 +8367,7 @@ out:
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off)
+				int off, const char *suffix)
 {
 	struct btf *btf = reg->btf;
 	const struct btf_type *walk_type, *safe_type;
@@ -8374,7 +8384,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 
 	tname = btf_name_by_offset(btf, walk_type->name_off);
 
-	ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+	ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
 	if (ret < 0)
 		return false;
 
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2b3fbbfebdc5..b6587ec40f1b 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -427,26 +427,26 @@ BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
 BTF_SET8_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a095055d7ef4..c2adf3c24c64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5073,29 +5073,76 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
 	return 0;
 }
 
-#define BTF_TYPE_SAFE_NESTED(__type)  __PASTE(__type, __safe_fields)
+#define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted)
 
-BTF_TYPE_SAFE_NESTED(struct task_struct) {
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
 	const cpumask_t *cpus_ptr;
 	struct css_set __rcu *cgroups;
+	struct task_struct __rcu *real_parent;
+	struct task_struct *group_leader;
 };
 
-BTF_TYPE_SAFE_NESTED(struct css_set) {
+BTF_TYPE_SAFE_RCU(struct css_set) {
 	struct cgroup *dfl_cgrp;
 };
 
-static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
-				  struct bpf_reg_state *reg,
-				  int off)
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+	__bpf_md_ptr(struct seq_file *, seq);
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+	struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+	struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+	/* no negative dentry-s in places where bpf can see it */
+	struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+	struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+			struct bpf_reg_state *reg,
+			int off)
 {
-	/* If its parent is not trusted, it can't regain its trusted status. */
-	if (!is_trusted_reg(reg))
-		return false;
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
-	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
-	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct css_set));
+	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+}
 
-	return btf_nested_type_is_trusted(&env->log, reg, off);
+static bool type_is_trusted(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *reg,
+			    int off)
+{
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
+
+	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5181,49 +5228,58 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
-	/* If this is an untrusted pointer, all pointers formed by walking it
-	 * also inherit the untrusted flag.
-	 */
-	if (type_flag(reg->type) & PTR_UNTRUSTED)
-		flag |= PTR_UNTRUSTED;
+	if (ret != PTR_TO_BTF_ID) {
+		/* just mark; */
 
-	/* By default any pointer obtained from walking a trusted pointer is no
-	 * longer trusted, unless the field being accessed has explicitly been
-	 * marked as inheriting its parent's state of trust.
-	 *
-	 * An RCU-protected pointer can also be deemed trusted if we are in an
-	 * RCU read region. This case is handled below.
-	 */
-	if (nested_ptr_is_trusted(env, reg, off)) {
-		flag |= PTR_TRUSTED;
-		/*
-		 * task->cgroups is trusted. It provides a stronger guarantee
-		 * than __rcu tag on 'cgroups' field in 'struct task_struct'.
-		 * Clear MEM_RCU in such case.
+	} else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+		/* If this is an untrusted pointer, all pointers formed by walking it
+		 * also inherit the untrusted flag.
+		 */
+		flag = PTR_UNTRUSTED;
+
+	} else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+		/* By default any pointer obtained from walking a trusted pointer is no
+		 * longer trusted, unless the field being accessed has explicitly been
+		 * marked as inheriting its parent's state of trust (either full or RCU).
+		 * For example:
+		 * 'cgroups' pointer is untrusted if task->cgroups dereference
+		 * happened in a sleepable program outside of bpf_rcu_read_lock()
+		 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+		 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+		 *
+		 * A regular RCU-protected pointer with __rcu tag can also be deemed
+		 * trusted if we are in an RCU CS. Such pointer can be NULL.
 		 */
-		flag &= ~MEM_RCU;
+		if (type_is_trusted(env, reg, off)) {
+			flag |= PTR_TRUSTED;
+		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+			if (type_is_rcu(env, reg, off)) {
+				/* ignore __rcu tag and mark it MEM_RCU */
+				flag |= MEM_RCU;
+			} else if (flag & MEM_RCU) {
+				/* __rcu tagged pointers can be NULL */
+				flag |= PTR_MAYBE_NULL;
+			} else if (flag & (MEM_PERCPU | MEM_USER)) {
+				/* keep as-is */
+			} else {
+				/* walking unknown pointers yields untrusted pointer */
+				flag = PTR_UNTRUSTED;
+			}
+		} else {
+			/*
+			 * If not in RCU CS or MEM_RCU pointer can be NULL then
+			 * aggressively mark as untrusted otherwise such
+			 * pointers will be plain PTR_TO_BTF_ID without flags
+			 * and will be allowed to be passed into helpers for
+			 * compat reasons.
+			 */
+			flag = PTR_UNTRUSTED;
+		}
 	} else {
+		/* Old compat. Deprecated */
 		flag &= ~PTR_TRUSTED;
 	}
 
-	if (flag & MEM_RCU) {
-		/* Mark value register as MEM_RCU only if it is protected by
-		 * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
-		 * itself can already indicate trustedness inside the rcu
-		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
-		 * it could be null in some cases.
-		 */
-		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
-			flag |= PTR_MAYBE_NULL;
-		else
-			flag &= ~MEM_RCU;
-	} else if (reg->type & MEM_RCU) {
-		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
-		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
-		 */
-		flag |= PTR_UNTRUSTED;
-	}
-
 	if (atype == BPF_READ && value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
 
@@ -10049,10 +10105,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
-	if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
-		verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
-		return -EACCES;
-	}
 
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
@@ -14911,8 +14963,22 @@ static int do_check(struct bpf_verifier_env *env)
 				 * src_reg == stack|map in some other branch.
 				 * Reject it.
 				 */
-				verbose(env, "same insn cannot be used with different pointers\n");
-				return -EINVAL;
+				if (base_type(src_reg_type) == PTR_TO_BTF_ID &&
+				    base_type(*prev_src_type) == PTR_TO_BTF_ID) {
+					/*
+					 * Have to support a use case when one path through
+					 * the program yields TRUSTED pointer while another
+					 * is UNTRUSTED. Fallback to UNTRUSTED to generate
+					 * BPF_PROBE_MEM.
+					 */
+					*prev_src_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+				} else {
+					verbose(env,
+						"The same insn cannot be used with different pointers: %s",
+						reg_type_str(env, src_reg_type));
+					verbose(env, " != %s\n", reg_type_str(env, *prev_src_type));
+					return -EINVAL;
+				}
 			}
 
 		} else if (class == BPF_STX) {
@@ -17984,8 +18050,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 	env->bypass_spec_v1 = bpf_bypass_spec_v1();
 	env->bypass_spec_v4 = bpf_bypass_spec_v4();
 	env->bpf_capable = bpf_capable();
-	env->rcu_tag_supported = btf_vmlinux &&
-		btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
index 2cc759956e3b..63e776f4176e 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
@@ -193,7 +193,7 @@ out:
 	cgrp_ls_sleepable__destroy(skel);
 }
 
-static void test_no_rcu_lock(__u64 cgroup_id)
+static void test_yes_rcu_lock(__u64 cgroup_id)
 {
 	struct cgrp_ls_sleepable *skel;
 	int err;
@@ -204,7 +204,7 @@ static void test_no_rcu_lock(__u64 cgroup_id)
 
 	skel->bss->target_pid = syscall(SYS_gettid);
 
-	bpf_program__set_autoload(skel->progs.no_rcu_lock, true);
+	bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
 	err = cgrp_ls_sleepable__load(skel);
 	if (!ASSERT_OK(err, "skel_load"))
 		goto out;
@@ -220,7 +220,7 @@ out:
 	cgrp_ls_sleepable__destroy(skel);
 }
 
-static void test_rcu_lock(void)
+static void test_no_rcu_lock(void)
 {
 	struct cgrp_ls_sleepable *skel;
 	int err;
@@ -229,7 +229,7 @@ static void test_rcu_lock(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
+	bpf_program__set_autoload(skel->progs.no_rcu_lock, true);
 	err = cgrp_ls_sleepable__load(skel);
 	ASSERT_ERR(err, "skel_load");
 
@@ -256,10 +256,10 @@ void test_cgrp_local_storage(void)
 		test_negative();
 	if (test__start_subtest("cgroup_iter_sleepable"))
 		test_cgroup_iter_sleepable(cgroup_fd, cgroup_id);
+	if (test__start_subtest("yes_rcu_lock"))
+		test_yes_rcu_lock(cgroup_id);
 	if (test__start_subtest("no_rcu_lock"))
-		test_no_rcu_lock(cgroup_id);
-	if (test__start_subtest("rcu_lock"))
-		test_rcu_lock();
+		test_no_rcu_lock();
 
 	close(cgroup_fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
index 447d8560ecb6..3f1f58d3a729 100644
--- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
@@ -25,10 +25,10 @@ static void test_success(void)
 
 	bpf_program__set_autoload(skel->progs.get_cgroup_id, true);
 	bpf_program__set_autoload(skel->progs.task_succ, true);
-	bpf_program__set_autoload(skel->progs.no_lock, true);
 	bpf_program__set_autoload(skel->progs.two_regions, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_1, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_2, true);
+	bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true);
 	err = rcu_read_lock__load(skel);
 	if (!ASSERT_OK(err, "skel_load"))
 		goto out;
@@ -69,6 +69,7 @@ out:
 
 static const char * const inproper_region_tests[] = {
 	"miss_lock",
+	"no_lock",
 	"miss_unlock",
 	"non_sleepable_rcu_mismatch",
 	"inproper_sleepable_helper",
@@ -99,7 +100,6 @@ out:
 }
 
 static const char * const rcuptr_misuse_tests[] = {
-	"task_untrusted_non_rcuptr",
 	"task_untrusted_rcuptr",
 	"cross_rcu_region",
 };
@@ -128,17 +128,8 @@ out:
 
 void test_rcu_read_lock(void)
 {
-	struct btf *vmlinux_btf;
 	int cgroup_fd;
 
-	vmlinux_btf = btf__load_vmlinux_btf();
-	if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF"))
-		return;
-	if (btf__find_by_name_kind(vmlinux_btf, "rcu", BTF_KIND_TYPE_TAG) < 0) {
-		test__skip();
-		goto out;
-	}
-
 	cgroup_fd = test__join_cgroup("/rcu_read_lock");
 	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /rcu_read_lock"))
 		goto out;
@@ -153,6 +144,5 @@ void test_rcu_read_lock(void)
 	if (test__start_subtest("negative_tests_rcuptr_misuse"))
 		test_rcuptr_misuse();
 	close(cgroup_fd);
-out:
-	btf__free(vmlinux_btf);
+out:;
 }
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
index 2d11ed528b6f..7615dc23d301 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
@@ -49,7 +49,7 @@ int no_rcu_lock(void *ctx)
 	if (task->pid != target_pid)
 		return 0;
 
-	/* ptr_to_btf_id semantics. should work. */
+	/* task->cgroups is untrusted in sleepable prog outside of RCU CS */
 	cgrp = task->cgroups->dfl_cgrp;
 	ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0,
 				   BPF_LOCAL_STORAGE_GET_F_CREATE);
@@ -71,7 +71,7 @@ int yes_rcu_lock(void *ctx)
 
 	bpf_rcu_read_lock();
 	cgrp = task->cgroups->dfl_cgrp;
-	/* cgrp is untrusted and cannot pass to bpf_cgrp_storage_get() helper. */
+	/* cgrp is trusted under RCU CS */
 	ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (ptr)
 		cgroup_id = cgrp->kn->id;
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index 33e8e86dd090..c16f7563b84e 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -44,7 +44,7 @@ int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flag
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("bpf_cpumask_acquire args#0 expected pointer to STRUCT bpf_cpumask")
+__failure __msg("must be referenced")
 int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_flags)
 {
 	struct bpf_cpumask *cpumask;
diff --git a/tools/testing/selftests/bpf/progs/nested_trust_failure.c b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
index 14aff7676436..0d1aa6bbace4 100644
--- a/tools/testing/selftests/bpf/progs/nested_trust_failure.c
+++ b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
@@ -17,7 +17,7 @@ char _license[] SEC("license") = "GPL";
  */
 
 SEC("tp_btf/task_newtask")
-__failure __msg("R2 must be referenced or trusted")
+__failure __msg("R2 must be")
 int BPF_PROG(test_invalid_nested_user_cpus, struct task_struct *task, u64 clone_flags)
 {
 	bpf_cpumask_test_cpu(0, task->user_cpus_ptr);
diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
index 5cecbdbbb16e..7250bb76d18a 100644
--- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
@@ -81,7 +81,7 @@ int no_lock(void *ctx)
 {
 	struct task_struct *task, *real_parent;
 
-	/* no bpf_rcu_read_lock(), old code still works */
+	/* old style ptr_to_btf_id is not allowed in sleepable */
 	task = bpf_get_current_task_btf();
 	real_parent = task->real_parent;
 	(void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
@@ -286,13 +286,13 @@ out:
 }
 
 SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
-int task_untrusted_non_rcuptr(void *ctx)
+int task_trusted_non_rcuptr(void *ctx)
 {
 	struct task_struct *task, *group_leader;
 
 	task = bpf_get_current_task_btf();
 	bpf_rcu_read_lock();
-	/* the pointer group_leader marked as untrusted */
+	/* the pointer group_leader is explicitly marked as trusted */
 	group_leader = task->real_parent->group_leader;
 	(void)bpf_task_storage_get(&map_a, group_leader, 0, 0);
 	bpf_rcu_read_unlock();
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 9a326a800e5c..5702fc9761ef 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -181,7 +181,7 @@
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "negative offset ptr_ ptr R1 off=-4 disallowed",
+	.errstr = "ptr R1 off=-4 disallowed",
 },
 {
 	"calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset",
-- 
cgit v1.2.3


From e768e3c5aab44ee63f58649d4c8cbbb3270e5c06 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 3 Mar 2023 15:15:42 +0100
Subject: bpf: Use separate RCU callbacks for freeing selem

Martin suggested that instead of using a byte in the hole (which he has
a use for in his future patch) in bpf_local_storage_elem, we can
dispatch a different call_rcu callback based on whether we need to free
special fields in bpf_local_storage_elem data. The free path, described
in commit 9db44fdd8105 ("bpf: Support kptrs in local storage maps"),
only waits for call_rcu callbacks when there are special (kptrs, etc.)
fields in the map value, hence it is necessary that we only access
smap in this case.

Therefore, dispatch different RCU callbacks based on the BPF map has a
valid btf_record, which dereference and use smap's btf_record only when
it is valid.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230303141542.300068-1-memxor@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 ---
 kernel/bpf/bpf_local_storage.c    | 79 ++++++++++++++++++++++++---------------
 2 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 0fe92986412b..6d37a40cd90e 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -74,12 +74,6 @@ struct bpf_local_storage_elem {
 	struct hlist_node snode;	/* Linked to bpf_local_storage */
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
-	bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
-			    * callbacks? bpf_local_storage_map_free only
-			    * executes rcu_barrier when there are special
-			    * fields, this field remembers that to ensure we
-			    * don't access already freed smap in sdata.
-			    */
 	/* 8 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 2bdd722fe293..3d320393a12c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -109,30 +109,36 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 		kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_fields_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	/* The can_use_smap bool is set whenever we need to free additional
-	 * fields in selem data before freeing selem. bpf_local_storage_map_free
-	 * only executes rcu_barrier to wait for RCU callbacks when it has
-	 * special fields, hence we can only conditionally dereference smap, as
-	 * by this time the map might have already been freed without waiting
-	 * for our call_rcu callback if it did not have any special fields.
-	 */
-	if (selem->can_use_smap)
-		bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
+	/* protected by the rcu_barrier*() */
+	smap = rcu_dereference_protected(SDATA(selem)->smap, true);
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 	kfree(selem);
 }
 
-static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu)
 {
 	/* Free directly if Tasks Trace RCU GP also implies RCU GP */
 	if (rcu_trace_implies_rcu_gp())
-		bpf_selem_free_rcu(rcu);
+		bpf_selem_free_fields_rcu(rcu);
+	else
+		call_rcu(rcu, bpf_selem_free_fields_rcu);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(selem);
 	else
-		call_rcu(rcu, bpf_selem_free_rcu);
+		kfree_rcu(selem, rcu);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -145,6 +151,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
+	struct btf_record *rec;
 	void *owner;
 
 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
@@ -185,10 +192,26 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	if (use_trace_rcu)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	/* A different RCU callback is chosen whenever we need to free
+	 * additional fields in selem data before freeing selem.
+	 * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU
+	 * callbacks when it has special fields, hence we can only conditionally
+	 * dereference smap, as by this time the map might have already been
+	 * freed without waiting for our call_rcu callback if it did not have
+	 * any special fields.
+	 */
+	rec = smap->map.record;
+	if (use_trace_rcu) {
+		if (!IS_ERR_OR_NULL(rec))
+			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
+		else
+			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	} else {
+		if (!IS_ERR_OR_NULL(rec))
+			call_rcu(&selem->rcu, bpf_selem_free_fields_rcu);
+		else
+			kfree_rcu(selem, rcu);
+	}
 
 	return free_local_storage;
 }
@@ -256,11 +279,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-
-	/* If our data will have special fields, smap will wait for us to use
-	 * its record in bpf_selem_free_* RCU callbacks before freeing itself.
-	 */
-	selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
@@ -748,19 +766,20 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	kvfree(smap->buckets);
 
 	/* When local storage has special fields, callbacks for
-	 * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
-	 * the map BTF record, we need to execute an RCU barrier to wait for
-	 * them as the record will be freed right after our map_free callback.
+	 * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will
+	 * keep using the map BTF record, we need to execute an RCU barrier to
+	 * wait for them as the record will be freed right after our map_free
+	 * callback.
 	 */
 	if (!IS_ERR_OR_NULL(smap->map.record)) {
 		rcu_barrier_tasks_trace();
 		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
 		 * is true, because while call_rcu invocation is skipped in that
-		 * case in bpf_selem_free_tasks_trace_rcu (and all local storage
-		 * maps pass use_trace_rcu = true), there can be call_rcu
-		 * callbacks based on use_trace_rcu = false in the earlier while
-		 * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
-		 * called from owner's free path.
+		 * case in bpf_selem_free_fields_trace_rcu (and all local
+		 * storage maps pass use_trace_rcu = true), there can be
+		 * call_rcu callbacks based on use_trace_rcu = false in the
+		 * while ((selem = ...)) loop above or when owner's free path
+		 * calls bpf_local_storage_unlink_nolock.
 		 */
 		rcu_barrier();
 	}
-- 
cgit v1.2.3


From a6ff3c0021468721b96e84892a8cae24bde8d65f Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 1 Mar 2023 21:14:29 +0100
Subject: thermal/core: Add a thermal zone 'devdata' accessor

The thermal zone device structure is exposed to the different drivers
and obviously they access the internals while that should be
restricted to the core thermal code.

In order to self-encapsulate the thermal core code, we need to prevent
the drivers accessing directly the thermal zone structure and provide
accessor functions to deal with.

Provide an accessor to the 'devdata' structure and make use of it in
the different drivers.

No functional changes intended.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 6 ++++++
 include/linux/thermal.h        | 7 +++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 55679fd86505..8a0cdabccc7d 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1377,6 +1377,12 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, int n
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_register);
 
+void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
+{
+	return tzd->devdata;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_priv);
+
 /**
  * thermal_zone_device_unregister - removes the registered thermal zone device
  * @tz: the thermal zone device to remove
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 2bb4bf33f4f3..7dbb5712434c 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -365,6 +365,8 @@ thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int
 					void *, struct thermal_zone_device_ops *,
 					struct thermal_zone_params *, int, int);
 
+void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
+
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
 				     unsigned long, unsigned long,
@@ -436,6 +438,11 @@ static inline int thermal_zone_get_offset(
 		struct thermal_zone_device *tz)
 { return -ENODEV; }
 
+static inline void *thermal_zone_device_priv(struct thermal_zone_device *tz)
+{
+	return NULL;
+}
+
 static inline int thermal_zone_device_enable(struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-- 
cgit v1.2.3


From 072e35c98806100182c0a7263cf4cba09ce43463 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 1 Mar 2023 21:14:38 +0100
Subject: thermal/core: Add thermal_zone_device structure 'type' accessor

The thermal zone device structure is exposed via the exported
thermal.h header. This structure should stay private the thermal core
code. In order to encapsulate the structure, let's add an accessor to
get the 'type' of the thermal zone.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 6 ++++++
 include/linux/thermal.h        | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 8a0cdabccc7d..fa1c17529018 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1383,6 +1383,12 @@ void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_priv);
 
+const char *thermal_zone_device_type(struct thermal_zone_device *tzd)
+{
+	return tzd->type;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_type);
+
 /**
  * thermal_zone_device_unregister - removes the registered thermal zone device
  * @tz: the thermal zone device to remove
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 7dbb5712434c..21686e676b3d 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -366,6 +366,7 @@ thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int
 					struct thermal_zone_params *, int, int);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
+const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
@@ -443,6 +444,11 @@ static inline void *thermal_zone_device_priv(struct thermal_zone_device *tz)
 	return NULL;
 }
 
+static inline const char *thermal_zone_device_type(struct thermal_zone_device *tzd)
+{
+	return NULL;
+}
+
 static inline int thermal_zone_device_enable(struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-- 
cgit v1.2.3


From 3034f859b90d49ee661fad4d23593589d0d37779 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 1 Mar 2023 21:14:40 +0100
Subject: thermal: Add a thermal zone id accessor

In order to get the thermal zone id but without directly accessing the
thermal zone device structure, add an accessor.

Use the accessor in the hwmon_scmi and acpi_thermal.

No functional change intented.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/thermal.c         | 2 +-
 drivers/hwmon/scmi-hwmon.c     | 2 +-
 drivers/thermal/thermal_core.c | 6 ++++++
 include/linux/thermal.h        | 6 ++++++
 4 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 392b73b3e269..255efa73ed70 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -842,7 +842,7 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz)
 		goto acpi_bus_detach;
 
 	dev_info(&tz->device->dev, "registered as thermal_zone%d\n",
-		 tz->thermal_zone->id);
+		 thermal_zone_device_id(tz->thermal_zone));
 
 	return 0;
 
diff --git a/drivers/hwmon/scmi-hwmon.c b/drivers/hwmon/scmi-hwmon.c
index 046ac157749d..364199b332c0 100644
--- a/drivers/hwmon/scmi-hwmon.c
+++ b/drivers/hwmon/scmi-hwmon.c
@@ -220,7 +220,7 @@ static int scmi_thermal_sensor_register(struct device *dev,
 			sensor->name);
 	} else {
 		dev_dbg(dev, "Sensor '%s' attached to thermal zone ID:%d\n",
-			sensor->name, tzd->id);
+			sensor->name, thermal_zone_device_id(tzd));
 	}
 
 	return 0;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index fa1c17529018..5ae72f314683 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1389,6 +1389,12 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd)
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_type);
 
+int thermal_zone_device_id(struct thermal_zone_device *tzd)
+{
+	return tzd->id;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_id);
+
 /**
  * thermal_zone_device_unregister - removes the registered thermal zone device
  * @tz: the thermal zone device to remove
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 21686e676b3d..eb80cee4f64f 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -367,6 +367,7 @@ thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
 const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
+int thermal_zone_device_id(struct thermal_zone_device *tzd);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
@@ -449,6 +450,11 @@ static inline const char *thermal_zone_device_type(struct thermal_zone_device *t
 	return NULL;
 }
 
+static inline int thermal_zone_device_id(struct thermal_zone_device *tzd)
+{
+	return -ENODEV;
+}
+
 static inline int thermal_zone_device_enable(struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-- 
cgit v1.2.3


From f5030564938be112183ba3df0cdd6dea3f694c2e Mon Sep 17 00:00:00 2001
From: Lucas Tanure <lucas.tanure@collabora.com>
Date: Thu, 23 Feb 2023 08:43:23 +0000
Subject: ALSA: cs35l41: Add shared boost feature

Shared boost allows two amplifiers to share a single boost circuit by
communicating on the MDSYNC bus.
The passive amplifier does not control the boost and receives data from
the active amplifier.

Shared Boost is not supported in HDA Systems.
Based on David Rhodes shared boost patches.

Signed-off-by: Lucas Tanure <lucas.tanure@collabora.com>
Reviewed-by: David Rhodes <david.rhodes@cirrus.com>
Link: https://lore.kernel.org/r/20230223084324.9076-4-lucas.tanure@collabora.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l41.h        | 13 +++++++-
 sound/pci/hda/cs35l41_hda.c    |  6 ++--
 sound/soc/codecs/cs35l41-lib.c | 73 +++++++++++++++++++++++++++++++++++++++++-
 sound/soc/codecs/cs35l41.c     | 27 ++++++++++++++--
 sound/soc/codecs/cs35l41.h     |  1 +
 5 files changed, 113 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 9ac5918269a5..7239d943942c 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -11,6 +11,7 @@
 #define __CS35L41_H
 
 #include <linux/regmap.h>
+#include <linux/completion.h>
 #include <linux/firmware/cirrus/cs_dsp.h>
 
 #define CS35L41_FIRSTREG		0x00000000
@@ -677,6 +678,7 @@
 
 #define CS35L36_PUP_DONE_IRQ_UNMASK	0x5F
 #define CS35L36_PUP_DONE_IRQ_MASK	0xBF
+#define CS35L41_SYNC_EN_MASK		BIT(8)
 
 #define CS35L41_AMP_SHORT_ERR		0x80000000
 #define CS35L41_BST_SHORT_ERR		0x0100
@@ -686,6 +688,7 @@
 #define CS35L41_BST_DCM_UVP_ERR		0x80
 #define CS35L41_OTP_BOOT_DONE		0x02
 #define CS35L41_PLL_UNLOCK		0x10
+#define CS35L41_PLL_LOCK		BIT(1)
 #define CS35L41_OTP_BOOT_ERR		0x80000000
 
 #define CS35L41_AMP_SHORT_ERR_RLS	0x02
@@ -705,6 +708,8 @@
 #define CS35L41_INT1_MASK_DEFAULT	0x7FFCFE3F
 #define CS35L41_INT1_UNMASK_PUP		0xFEFFFFFF
 #define CS35L41_INT1_UNMASK_PDN		0xFF7FFFFF
+#define CS35L41_INT3_PLL_LOCK_SHIFT	1
+#define CS35L41_INT3_PLL_LOCK_MASK	BIT(CS35L41_INT3_PLL_LOCK_SHIFT)
 
 #define CS35L41_GPIO_DIR_MASK		0x80000000
 #define CS35L41_GPIO_DIR_SHIFT		31
@@ -742,6 +747,11 @@
 enum cs35l41_boost_type {
 	CS35L41_INT_BOOST,
 	CS35L41_EXT_BOOST,
+	CS35L41_SHD_BOOST_ACTV,
+	CS35L41_SHD_BOOST_PASS,
+
+	// Not present in Binding Documentation, so no system should use this value.
+	// This value is only used in CLSA0100 Laptop
 	CS35L41_EXT_BOOST_NO_VSPK_SWITCH,
 };
 
@@ -891,6 +901,7 @@ int cs35l41_exit_hibernate(struct device *dev, struct regmap *regmap);
 int cs35l41_init_boost(struct device *dev, struct regmap *regmap,
 		       struct cs35l41_hw_cfg *hw_cfg);
 bool cs35l41_safe_reset(struct regmap *regmap, enum cs35l41_boost_type b_type);
-int cs35l41_global_enable(struct regmap *regmap, enum cs35l41_boost_type b_type, int enable);
+int cs35l41_global_enable(struct regmap *regmap, enum cs35l41_boost_type b_type, int enable,
+			  struct completion *pll_lock);
 
 #endif /* __CS35L41_H */
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index 75020edd39e7..b5210abb5141 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -514,13 +514,13 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action)
 		break;
 	case HDA_GEN_PCM_ACT_PREPARE:
 		mutex_lock(&cs35l41->fw_mutex);
-		ret = cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 1);
+		ret = cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 1, NULL);
 		mutex_unlock(&cs35l41->fw_mutex);
 		break;
 	case HDA_GEN_PCM_ACT_CLEANUP:
 		mutex_lock(&cs35l41->fw_mutex);
 		regmap_multi_reg_write(reg, cs35l41_hda_mute, ARRAY_SIZE(cs35l41_hda_mute));
-		ret = cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 0);
+		ret = cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 0, NULL);
 		mutex_unlock(&cs35l41->fw_mutex);
 		break;
 	case HDA_GEN_PCM_ACT_CLOSE:
@@ -672,7 +672,7 @@ static int cs35l41_runtime_suspend(struct device *dev)
 	if (cs35l41->playback_started) {
 		regmap_multi_reg_write(cs35l41->regmap, cs35l41_hda_mute,
 				       ARRAY_SIZE(cs35l41_hda_mute));
-		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 0);
+		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 0, NULL);
 		regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2,
 				   CS35L41_AMP_EN_MASK, 0 << CS35L41_AMP_EN_SHIFT);
 		if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST)
diff --git a/sound/soc/codecs/cs35l41-lib.c b/sound/soc/codecs/cs35l41-lib.c
index 04be71435491..8538e2871c5f 100644
--- a/sound/soc/codecs/cs35l41-lib.c
+++ b/sound/soc/codecs/cs35l41-lib.c
@@ -1114,12 +1114,31 @@ static const struct reg_sequence cs35l41_reset_to_safe[] = {
 	{ 0x00000040,			0x00000033 },
 };
 
+static const struct reg_sequence cs35l41_actv_seq[] = {
+	/* SYNC_BST_CTL_RX_EN = 1; SYNC_BST_CTL_TX_EN = 1 */
+	{CS35L41_MDSYNC_EN,        0x00003000},
+	/* BST_CTL_SEL = MDSYNC */
+	{CS35L41_BSTCVRT_VCTRL2,    0x00000002},
+};
+
+static const struct reg_sequence cs35l41_pass_seq[] = {
+	/* SYNC_BST_CTL_RX_EN = 0; SYNC_BST_CTL_TX_EN = 1 */
+	{CS35L41_MDSYNC_EN,        0x00001000},
+	/* BST_EN = 0 */
+	{CS35L41_PWR_CTRL2,        0x00003300},
+	/* BST_CTL_SEL = MDSYNC */
+	{CS35L41_BSTCVRT_VCTRL2,    0x00000002},
+};
+
 int cs35l41_init_boost(struct device *dev, struct regmap *regmap,
 		       struct cs35l41_hw_cfg *hw_cfg)
 {
 	int ret;
 
 	switch (hw_cfg->bst_type) {
+	case CS35L41_SHD_BOOST_ACTV:
+		regmap_multi_reg_write(regmap, cs35l41_actv_seq, ARRAY_SIZE(cs35l41_actv_seq));
+		fallthrough;
 	case CS35L41_INT_BOOST:
 		ret = cs35l41_boost_config(dev, regmap, hw_cfg->bst_ind,
 					   hw_cfg->bst_cap, hw_cfg->bst_ipk);
@@ -1138,6 +1157,10 @@ int cs35l41_init_boost(struct device *dev, struct regmap *regmap,
 		ret = regmap_update_bits(regmap, CS35L41_PWR_CTRL2, CS35L41_BST_EN_MASK,
 					 CS35L41_BST_DIS_FET_OFF << CS35L41_BST_EN_SHIFT);
 		break;
+	case CS35L41_SHD_BOOST_PASS:
+		ret = regmap_multi_reg_write(regmap, cs35l41_pass_seq,
+					     ARRAY_SIZE(cs35l41_pass_seq));
+		break;
 	default:
 		dev_err(dev, "Boost type %d not supported\n", hw_cfg->bst_type);
 		ret = -EINVAL;
@@ -1165,11 +1188,59 @@ bool cs35l41_safe_reset(struct regmap *regmap, enum cs35l41_boost_type b_type)
 }
 EXPORT_SYMBOL_GPL(cs35l41_safe_reset);
 
-int cs35l41_global_enable(struct regmap *regmap, enum cs35l41_boost_type b_type, int enable)
+int cs35l41_global_enable(struct regmap *regmap, enum cs35l41_boost_type b_type, int enable,
+			  struct completion *pll_lock)
 {
 	int ret;
+	unsigned int gpio1_func, pad_control, pwr_ctrl1, pwr_ctrl3;
+	struct reg_sequence cs35l41_mdsync_down_seq[] = {
+		{CS35L41_PWR_CTRL3,		0},
+		{CS35L41_GPIO_PAD_CONTROL,	0},
+		{CS35L41_PWR_CTRL1,		0, 3000},
+	};
+	struct reg_sequence cs35l41_mdsync_up_seq[] = {
+		{CS35L41_PWR_CTRL3,	0},
+		{CS35L41_PWR_CTRL1,	0x00000000, 3000},
+		{CS35L41_PWR_CTRL1,	0x00000001, 3000},
+	};
 
 	switch (b_type) {
+	case CS35L41_SHD_BOOST_ACTV:
+	case CS35L41_SHD_BOOST_PASS:
+		regmap_read(regmap, CS35L41_PWR_CTRL3, &pwr_ctrl3);
+		regmap_read(regmap, CS35L41_GPIO_PAD_CONTROL, &pad_control);
+
+		pwr_ctrl3 &= ~CS35L41_SYNC_EN_MASK;
+		pwr_ctrl1 = enable << CS35L41_GLOBAL_EN_SHIFT;
+
+		gpio1_func = enable ? CS35L41_GPIO1_MDSYNC : CS35L41_GPIO1_HIZ;
+		gpio1_func <<= CS35L41_GPIO1_CTRL_SHIFT;
+
+		pad_control &= ~CS35L41_GPIO1_CTRL_MASK;
+		pad_control |= gpio1_func & CS35L41_GPIO1_CTRL_MASK;
+
+		cs35l41_mdsync_down_seq[0].def = pwr_ctrl3;
+		cs35l41_mdsync_down_seq[1].def = pad_control;
+		cs35l41_mdsync_down_seq[2].def = pwr_ctrl1;
+		ret = regmap_multi_reg_write(regmap, cs35l41_mdsync_down_seq,
+					     ARRAY_SIZE(cs35l41_mdsync_down_seq));
+		if (!enable)
+			break;
+
+		if (!pll_lock)
+			return -EINVAL;
+
+		ret = wait_for_completion_timeout(pll_lock, msecs_to_jiffies(1000));
+		if (ret == 0) {
+			ret = -ETIMEDOUT;
+		} else {
+			regmap_read(regmap, CS35L41_PWR_CTRL3, &pwr_ctrl3);
+			pwr_ctrl3 |= CS35L41_SYNC_EN_MASK;
+			cs35l41_mdsync_up_seq[0].def = pwr_ctrl3;
+			ret = regmap_multi_reg_write(regmap, cs35l41_mdsync_up_seq,
+						     ARRAY_SIZE(cs35l41_mdsync_up_seq));
+		}
+		break;
 	case CS35L41_INT_BOOST:
 		ret = regmap_update_bits(regmap, CS35L41_PWR_CTRL1, CS35L41_GLOBAL_EN_MASK,
 					 enable << CS35L41_GLOBAL_EN_SHIFT);
diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c
index c006364e5335..1624510d09c0 100644
--- a/sound/soc/codecs/cs35l41.c
+++ b/sound/soc/codecs/cs35l41.c
@@ -360,6 +360,7 @@ static void cs35l41_boost_enable(struct cs35l41_private *cs35l41, unsigned int e
 {
 	switch (cs35l41->hw_cfg.bst_type) {
 	case CS35L41_INT_BOOST:
+	case CS35L41_SHD_BOOST_ACTV:
 		enable = enable ? CS35L41_BST_EN_DEFAULT : CS35L41_BST_DIS_FET_OFF;
 		regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2, CS35L41_BST_EN_MASK,
 				enable << CS35L41_BST_EN_SHIFT);
@@ -455,6 +456,12 @@ static irqreturn_t cs35l41_irq(int irq, void *data)
 		ret = IRQ_HANDLED;
 	}
 
+	if (status[2] & CS35L41_PLL_LOCK) {
+		regmap_write(cs35l41->regmap, CS35L41_IRQ1_STATUS3, CS35L41_PLL_LOCK);
+		complete(&cs35l41->pll_lock);
+		ret = IRQ_HANDLED;
+	}
+
 done:
 	pm_runtime_mark_last_busy(cs35l41->dev);
 	pm_runtime_put_autosuspend(cs35l41->dev);
@@ -492,10 +499,12 @@ static int cs35l41_main_amp_event(struct snd_soc_dapm_widget *w,
 						cs35l41_pup_patch,
 						ARRAY_SIZE(cs35l41_pup_patch));
 
-		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 1);
+		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 1,
+				      &cs35l41->pll_lock);
 		break;
 	case SND_SOC_DAPM_POST_PMD:
-		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 0);
+		cs35l41_global_enable(cs35l41->regmap, cs35l41->hw_cfg.bst_type, 0,
+				      &cs35l41->pll_lock);
 
 		ret = regmap_read_poll_timeout(cs35l41->regmap, CS35L41_IRQ1_STATUS1,
 					       val, val &  CS35L41_PDN_DONE_MASK,
@@ -802,6 +811,10 @@ static const struct snd_pcm_hw_constraint_list cs35l41_constraints = {
 static int cs35l41_pcm_startup(struct snd_pcm_substream *substream,
 			       struct snd_soc_dai *dai)
 {
+	struct cs35l41_private *cs35l41 = snd_soc_component_get_drvdata(dai->component);
+
+	reinit_completion(&cs35l41->pll_lock);
+
 	if (substream->runtime)
 		return snd_pcm_hw_constraint_list(substream->runtime, 0,
 						  SNDRV_PCM_HW_PARAM_RATE,
@@ -1252,6 +1265,10 @@ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *
 	/* Set interrupt masks for critical errors */
 	regmap_write(cs35l41->regmap, CS35L41_IRQ1_MASK1,
 		     CS35L41_INT1_MASK_DEFAULT);
+	if (cs35l41->hw_cfg.bst_type == CS35L41_SHD_BOOST_PASS ||
+	    cs35l41->hw_cfg.bst_type == CS35L41_SHD_BOOST_ACTV)
+		regmap_update_bits(cs35l41->regmap, CS35L41_IRQ1_MASK3, CS35L41_INT3_PLL_LOCK_MASK,
+				   0 << CS35L41_INT3_PLL_LOCK_SHIFT);
 
 	ret = devm_request_threaded_irq(cs35l41->dev, cs35l41->irq, NULL, cs35l41_irq,
 					IRQF_ONESHOT | IRQF_SHARED | irq_pol,
@@ -1275,6 +1292,8 @@ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *
 	if (ret < 0)
 		goto err;
 
+	init_completion(&cs35l41->pll_lock);
+
 	pm_runtime_set_autosuspend_delay(cs35l41->dev, 3000);
 	pm_runtime_use_autosuspend(cs35l41->dev);
 	pm_runtime_mark_last_busy(cs35l41->dev);
@@ -1317,6 +1336,10 @@ void cs35l41_remove(struct cs35l41_private *cs35l41)
 	pm_runtime_disable(cs35l41->dev);
 
 	regmap_write(cs35l41->regmap, CS35L41_IRQ1_MASK1, 0xFFFFFFFF);
+	if (cs35l41->hw_cfg.bst_type == CS35L41_SHD_BOOST_PASS ||
+	    cs35l41->hw_cfg.bst_type == CS35L41_SHD_BOOST_ACTV)
+		regmap_update_bits(cs35l41->regmap, CS35L41_IRQ1_MASK3, CS35L41_INT3_PLL_LOCK_MASK,
+				   1 << CS35L41_INT3_PLL_LOCK_SHIFT);
 	kfree(cs35l41->dsp.system_name);
 	wm_adsp2_remove(&cs35l41->dsp);
 	cs35l41_safe_reset(cs35l41->regmap, cs35l41->hw_cfg.bst_type);
diff --git a/sound/soc/codecs/cs35l41.h b/sound/soc/codecs/cs35l41.h
index c85cbc1dd333..34d967d4372b 100644
--- a/sound/soc/codecs/cs35l41.h
+++ b/sound/soc/codecs/cs35l41.h
@@ -33,6 +33,7 @@ struct cs35l41_private {
 	int irq;
 	/* GPIO for /RST */
 	struct gpio_desc *reset_gpio;
+	struct completion pll_lock;
 };
 
 int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *hw_cfg);
-- 
cgit v1.2.3


From f8c760e8fc414bb02373c5ede62fdac53ca8ccb2 Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Fri, 17 Feb 2023 15:56:36 +0100
Subject: dt-bindings: soc: fsl: cpm_qe: Add TSA controller

Add support for the time slot assigner (TSA)
available in some PowerQUICC SoC such as MPC885
or MPC866.

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/r/20230217145645.1768659-2-herve.codina@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml      | 215 +++++++++++++++++++++
 include/dt-bindings/soc/cpm1-fsl,tsa.h             |  13 ++
 2 files changed, 228 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml
 create mode 100644 include/dt-bindings/soc/cpm1-fsl,tsa.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml
new file mode 100644
index 000000000000..332e902bcc21
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: PowerQUICC CPM Time-slot assigner (TSA) controller
+
+maintainers:
+  - Herve Codina <herve.codina@bootlin.com>
+
+description:
+  The TSA is the time-slot assigner that can be found on some PowerQUICC SoC.
+  Its purpose is to route some TDM time-slots to other internal serial
+  controllers.
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - fsl,mpc885-tsa
+          - fsl,mpc866-tsa
+      - const: fsl,cpm1-tsa
+
+  reg:
+    items:
+      - description: SI (Serial Interface) register base
+      - description: SI RAM base
+
+  reg-names:
+    items:
+      - const: si_regs
+      - const: si_ram
+
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
+
+  '#fsl,serial-cells':
+    $ref: /schemas/types.yaml#/definitions/uint32
+    const: 1
+    description:
+      TSA consumers that use a phandle to TSA need to pass the serial identifier
+      with this phandle (defined in dt-bindings/soc/fsl,tsa.h).
+      For instance "fsl,tsa-serial = <&tsa FSL_CPM_TSA_SCC4>;".
+
+patternProperties:
+  '^tdm@[0-1]$':
+    description:
+      The TDM managed by this controller
+    type: object
+
+    additionalProperties: false
+
+    properties:
+      reg:
+        minimum: 0
+        maximum: 1
+        description:
+          The TDM number for this TDM, 0 for TDMa and 1 for TDMb
+
+      fsl,common-rxtx-pins:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The hardware can use four dedicated pins for Tx clock, Tx sync, Rx
+          clock and Rx sync or use only two pins, Tx/Rx clock and Tx/Rx sync.
+          Without the 'fsl,common-rxtx-pins' property, the four pins are used.
+          With the 'fsl,common-rxtx-pins' property, two pins are used.
+
+      clocks:
+        minItems: 2
+        items:
+          - description: External clock connected to L1RSYNC pin
+          - description: External clock connected to L1RCLK pin
+          - description: External clock connected to L1TSYNC pin
+          - description: External clock connected to L1TCLK pin
+
+      clock-names:
+        minItems: 2
+        items:
+          - const: l1rsync
+          - const: l1rclk
+          - const: l1tsync
+          - const: l1tclk
+
+      fsl,rx-frame-sync-delay-bits:
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Receive frame sync delay in number of bits.
+          Indicates the delay between the Rx sync and the first bit of the Rx
+          frame. 0 for no bit delay. 1, 2 or 3 for 1, 2 or 3 bits delay.
+
+      fsl,tx-frame-sync-delay-bits:
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Transmit frame sync delay in number of bits.
+          Indicates the delay between the Tx sync and the first bit of the Tx
+          frame. 0 for no bit delay. 1, 2 or 3 for 1, 2 or 3 bits delay.
+
+      fsl,clock-falling-edge:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          Data is sent on falling edge of the clock (and received on the rising
+          edge). If 'clock-falling-edge' is not present, data is sent on the
+          rising edge (and received on the falling edge).
+
+      fsl,fsync-rising-edge:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          Frame sync pulses are sampled with the rising edge of the channel
+          clock. If 'fsync-rising-edge' is not present, pulses are sampled with
+          the falling edge.
+
+      fsl,double-speed-clock:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The channel clock is twice the data rate.
+
+    patternProperties:
+      '^fsl,[rt]x-ts-routes$':
+        $ref: /schemas/types.yaml#/definitions/uint32-matrix
+        description: |
+          A list of tuple that indicates the Tx or Rx time-slots routes.
+        items:
+          items:
+            - description:
+                The number of time-slots
+              minimum: 1
+              maximum: 64
+            - description: |
+                The source (Tx) or destination (Rx) serial interface
+                (dt-bindings/soc/cpm1-fsl,tsa.h defines these values)
+                 - 0: No destination
+                 - 1: SCC2
+                 - 2: SCC3
+                 - 3: SCC4
+                 - 4: SMC1
+                 - 5: SMC2
+              enum: [0, 1, 2, 3, 4, 5]
+        minItems: 1
+        maxItems: 64
+
+    allOf:
+      # If fsl,common-rxtx-pins is present, only 2 clocks are needed.
+      # Else, the 4 clocks must be present.
+      - if:
+          required:
+            - fsl,common-rxtx-pins
+        then:
+          properties:
+            clocks:
+              maxItems: 2
+            clock-names:
+              maxItems: 2
+        else:
+          properties:
+            clocks:
+              minItems: 4
+            clock-names:
+              minItems: 4
+
+    required:
+      - reg
+      - clocks
+      - clock-names
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - '#address-cells'
+  - '#size-cells'
+  - '#fsl,serial-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/soc/cpm1-fsl,tsa.h>
+
+    tsa@ae0 {
+        compatible = "fsl,mpc885-tsa", "fsl,cpm1-tsa";
+        reg = <0xae0 0x10>,
+              <0xc00 0x200>;
+        reg-names = "si_regs", "si_ram";
+
+        #address-cells = <1>;
+        #size-cells = <0>;
+        #fsl,serial-cells = <1>;
+
+        tdm@0 {
+            /* TDMa */
+            reg = <0>;
+
+            clocks = <&clk_l1rsynca>, <&clk_l1rclka>;
+            clock-names = "l1rsync", "l1rclk";
+
+            fsl,common-rxtx-pins;
+            fsl,fsync-rising-edge;
+
+            fsl,tx-ts-routes = <2 0>,             /* TS 0..1 */
+                           <24 FSL_CPM_TSA_SCC4>, /* TS 2..25 */
+                           <1 0>,                 /* TS 26 */
+                           <5 FSL_CPM_TSA_SCC3>;  /* TS 27..31 */
+
+            fsl,rx-ts-routes = <2 0>,             /* TS 0..1 */
+                           <24 FSL_CPM_TSA_SCC4>, /* 2..25 */
+                           <1 0>,                 /* TS 26 */
+                           <5 FSL_CPM_TSA_SCC3>;  /* TS 27..31 */
+        };
+    };
diff --git a/include/dt-bindings/soc/cpm1-fsl,tsa.h b/include/dt-bindings/soc/cpm1-fsl,tsa.h
new file mode 100644
index 000000000000..2cc44e867dbe
--- /dev/null
+++ b/include/dt-bindings/soc/cpm1-fsl,tsa.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
+
+#ifndef __DT_BINDINGS_SOC_FSL_TSA_H
+#define __DT_BINDINGS_SOC_FSL_TSA_H
+
+#define FSL_CPM_TSA_NU		0	/* Pseuso Cell Id for not used item */
+#define FSL_CPM_TSA_SCC2	1
+#define FSL_CPM_TSA_SCC3	2
+#define FSL_CPM_TSA_SCC4	3
+#define FSL_CPM_TSA_SMC1	4
+#define FSL_CPM_TSA_SMC2	5
+
+#endif
-- 
cgit v1.2.3


From 3178d58e0b9772d690456c0bdf8c9f5e191d45f1 Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Fri, 17 Feb 2023 15:56:41 +0100
Subject: soc: fsl: cpm1: Add support for QMC

The QMC (QUICC Multichannel Controller) emulates up to 64
channels within one serial controller using the same TDM
physical interface routed from the TSA.

It is available in some	PowerQUICC SoC such as the
MPC885 or MPC866.

It is also available on some Quicc Engine SoCs.
This current version support CPM1 SoCs only and some
enhancement are needed to support Quicc Engine SoCs.

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Acked-by: Li Yang <leoyang.li@nxp.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/r/20230217145645.1768659-7-herve.codina@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soc/fsl/qe/Kconfig  |   12 +
 drivers/soc/fsl/qe/Makefile |    1 +
 drivers/soc/fsl/qe/qmc.c    | 1533 +++++++++++++++++++++++++++++++++++++++++++
 include/soc/fsl/qe/qmc.h    |   71 ++
 4 files changed, 1617 insertions(+)
 create mode 100644 drivers/soc/fsl/qe/qmc.c
 create mode 100644 include/soc/fsl/qe/qmc.h

(limited to 'include')

diff --git a/drivers/soc/fsl/qe/Kconfig b/drivers/soc/fsl/qe/Kconfig
index b0088495c323..f90cfdf0c763 100644
--- a/drivers/soc/fsl/qe/Kconfig
+++ b/drivers/soc/fsl/qe/Kconfig
@@ -44,6 +44,18 @@ config CPM_TSA
 	  This option enables support for this
 	  controller
 
+config CPM_QMC
+	tristate "CPM QMC support"
+	depends on OF && HAS_IOMEM
+	depends on CPM1 || (SOC_FSL && COMPILE_TEST)
+	depends on CPM_TSA
+	help
+	  Freescale CPM QUICC Multichannel Controller
+	  (QMC)
+
+	  This option enables support for this
+	  controller
+
 config QE_TDM
 	bool
 	default y if FSL_UCC_HDLC
diff --git a/drivers/soc/fsl/qe/Makefile b/drivers/soc/fsl/qe/Makefile
index 45c961acc81b..ec8506e13113 100644
--- a/drivers/soc/fsl/qe/Makefile
+++ b/drivers/soc/fsl/qe/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_QUICC_ENGINE)+= qe.o qe_common.o qe_ic.o qe_io.o
 obj-$(CONFIG_CPM)	+= qe_common.o
 obj-$(CONFIG_CPM_TSA)	+= tsa.o
+obj-$(CONFIG_CPM_QMC)	+= qmc.o
 obj-$(CONFIG_UCC)	+= ucc.o
 obj-$(CONFIG_UCC_SLOW)	+= ucc_slow.o
 obj-$(CONFIG_UCC_FAST)	+= ucc_fast.o
diff --git a/drivers/soc/fsl/qe/qmc.c b/drivers/soc/fsl/qe/qmc.c
new file mode 100644
index 000000000000..cfa7207353e0
--- /dev/null
+++ b/drivers/soc/fsl/qe/qmc.c
@@ -0,0 +1,1533 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * QMC driver
+ *
+ * Copyright 2022 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#include <soc/fsl/qe/qmc.h>
+#include <linux/dma-mapping.h>
+#include <linux/hdlc.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <soc/fsl/cpm.h>
+#include <sysdev/fsl_soc.h>
+#include "tsa.h"
+
+/* SCC general mode register high (32 bits) */
+#define SCC_GSMRL	0x00
+#define SCC_GSMRL_ENR		(1 << 5)
+#define SCC_GSMRL_ENT		(1 << 4)
+#define SCC_GSMRL_MODE_QMC	(0x0A << 0)
+
+/* SCC general mode register low (32 bits) */
+#define SCC_GSMRH	0x04
+#define   SCC_GSMRH_CTSS	(1 << 7)
+#define   SCC_GSMRH_CDS		(1 << 8)
+#define   SCC_GSMRH_CTSP	(1 << 9)
+#define   SCC_GSMRH_CDP		(1 << 10)
+
+/* SCC event register (16 bits) */
+#define SCC_SCCE	0x10
+#define   SCC_SCCE_IQOV		(1 << 3)
+#define   SCC_SCCE_GINT		(1 << 2)
+#define   SCC_SCCE_GUN		(1 << 1)
+#define   SCC_SCCE_GOV		(1 << 0)
+
+/* SCC mask register (16 bits) */
+#define SCC_SCCM	0x14
+/* Multichannel base pointer (32 bits) */
+#define QMC_GBL_MCBASE		0x00
+/* Multichannel controller state (16 bits) */
+#define QMC_GBL_QMCSTATE	0x04
+/* Maximum receive buffer length (16 bits) */
+#define QMC_GBL_MRBLR		0x06
+/* Tx time-slot assignment table pointer (16 bits) */
+#define QMC_GBL_TX_S_PTR	0x08
+/* Rx pointer (16 bits) */
+#define QMC_GBL_RXPTR		0x0A
+/* Global receive frame threshold (16 bits) */
+#define QMC_GBL_GRFTHR		0x0C
+/* Global receive frame count (16 bits) */
+#define QMC_GBL_GRFCNT		0x0E
+/* Multichannel interrupt base address (32 bits) */
+#define QMC_GBL_INTBASE		0x10
+/* Multichannel interrupt pointer (32 bits) */
+#define QMC_GBL_INTPTR		0x14
+/* Rx time-slot assignment table pointer (16 bits) */
+#define QMC_GBL_RX_S_PTR	0x18
+/* Tx pointer (16 bits) */
+#define QMC_GBL_TXPTR		0x1A
+/* CRC constant (32 bits) */
+#define QMC_GBL_C_MASK32	0x1C
+/* Time slot assignment table Rx (32 x 16 bits) */
+#define QMC_GBL_TSATRX		0x20
+/* Time slot assignment table Tx (32 x 16 bits) */
+#define QMC_GBL_TSATTX		0x60
+/* CRC constant (16 bits) */
+#define QMC_GBL_C_MASK16	0xA0
+
+/* TSA entry (16bit entry in TSATRX and TSATTX) */
+#define QMC_TSA_VALID		(1 << 15)
+#define QMC_TSA_WRAP		(1 << 14)
+#define QMC_TSA_MASK		(0x303F)
+#define QMC_TSA_CHANNEL(x)	((x) << 6)
+
+/* Tx buffer descriptor base address (16 bits, offset from MCBASE) */
+#define QMC_SPE_TBASE	0x00
+
+/* Channel mode register (16 bits) */
+#define QMC_SPE_CHAMR	0x02
+#define   QMC_SPE_CHAMR_MODE_HDLC	(1 << 15)
+#define   QMC_SPE_CHAMR_MODE_TRANSP	((0 << 15) | (1 << 13))
+#define   QMC_SPE_CHAMR_ENT		(1 << 12)
+#define   QMC_SPE_CHAMR_POL		(1 << 8)
+#define   QMC_SPE_CHAMR_HDLC_IDLM	(1 << 13)
+#define   QMC_SPE_CHAMR_HDLC_CRC	(1 << 7)
+#define   QMC_SPE_CHAMR_HDLC_NOF	(0x0f << 0)
+#define   QMC_SPE_CHAMR_TRANSP_RD	(1 << 14)
+#define   QMC_SPE_CHAMR_TRANSP_SYNC	(1 << 10)
+
+/* Tx internal state (32 bits) */
+#define QMC_SPE_TSTATE	0x04
+/* Tx buffer descriptor pointer (16 bits) */
+#define QMC_SPE_TBPTR	0x0C
+/* Zero-insertion state (32 bits) */
+#define QMC_SPE_ZISTATE	0x14
+/* Channel’s interrupt mask flags (16 bits) */
+#define QMC_SPE_INTMSK	0x1C
+/* Rx buffer descriptor base address (16 bits, offset from MCBASE) */
+#define QMC_SPE_RBASE	0x20
+/* HDLC: Maximum frame length register (16 bits) */
+#define QMC_SPE_MFLR	0x22
+/* TRANSPARENT: Transparent maximum receive length (16 bits) */
+#define QMC_SPE_TMRBLR	0x22
+/* Rx internal state (32 bits) */
+#define QMC_SPE_RSTATE	0x24
+/* Rx buffer descriptor pointer (16 bits) */
+#define QMC_SPE_RBPTR	0x2C
+/* Packs 4 bytes to 1 long word before writing to buffer (32 bits) */
+#define QMC_SPE_RPACK	0x30
+/* Zero deletion state (32 bits) */
+#define QMC_SPE_ZDSTATE	0x34
+
+/* Transparent synchronization (16 bits) */
+#define QMC_SPE_TRNSYNC 0x3C
+#define   QMC_SPE_TRNSYNC_RX(x)	((x) << 8)
+#define   QMC_SPE_TRNSYNC_TX(x)	((x) << 0)
+
+/* Interrupt related registers bits */
+#define QMC_INT_V		(1 << 15)
+#define QMC_INT_W		(1 << 14)
+#define QMC_INT_NID		(1 << 13)
+#define QMC_INT_IDL		(1 << 12)
+#define QMC_INT_GET_CHANNEL(x)	(((x) & 0x0FC0) >> 6)
+#define QMC_INT_MRF		(1 << 5)
+#define QMC_INT_UN		(1 << 4)
+#define QMC_INT_RXF		(1 << 3)
+#define QMC_INT_BSY		(1 << 2)
+#define QMC_INT_TXB		(1 << 1)
+#define QMC_INT_RXB		(1 << 0)
+
+/* BD related registers bits */
+#define QMC_BD_RX_E	(1 << 15)
+#define QMC_BD_RX_W	(1 << 13)
+#define QMC_BD_RX_I	(1 << 12)
+#define QMC_BD_RX_L	(1 << 11)
+#define QMC_BD_RX_F	(1 << 10)
+#define QMC_BD_RX_CM	(1 << 9)
+#define QMC_BD_RX_UB	(1 << 7)
+#define QMC_BD_RX_LG	(1 << 5)
+#define QMC_BD_RX_NO	(1 << 4)
+#define QMC_BD_RX_AB	(1 << 3)
+#define QMC_BD_RX_CR	(1 << 2)
+
+#define QMC_BD_TX_R	(1 << 15)
+#define QMC_BD_TX_W	(1 << 13)
+#define QMC_BD_TX_I	(1 << 12)
+#define QMC_BD_TX_L	(1 << 11)
+#define QMC_BD_TX_TC	(1 << 10)
+#define QMC_BD_TX_CM	(1 << 9)
+#define QMC_BD_TX_UB	(1 << 7)
+#define QMC_BD_TX_PAD	(0x0f << 0)
+
+/* Numbers of BDs and interrupt items */
+#define QMC_NB_TXBDS	8
+#define QMC_NB_RXBDS	8
+#define QMC_NB_INTS	128
+
+struct qmc_xfer_desc {
+	union {
+		void (*tx_complete)(void *context);
+		void (*rx_complete)(void *context, size_t length);
+	};
+	void *context;
+};
+
+struct qmc_chan {
+	struct list_head list;
+	unsigned int id;
+	struct qmc *qmc;
+	void *__iomem s_param;
+	enum qmc_mode mode;
+	u64	tx_ts_mask;
+	u64	rx_ts_mask;
+	bool is_reverse_data;
+
+	spinlock_t	tx_lock;
+	cbd_t __iomem *txbds;
+	cbd_t __iomem *txbd_free;
+	cbd_t __iomem *txbd_done;
+	struct qmc_xfer_desc tx_desc[QMC_NB_TXBDS];
+	u64	nb_tx_underrun;
+	bool	is_tx_stopped;
+
+	spinlock_t	rx_lock;
+	cbd_t __iomem *rxbds;
+	cbd_t __iomem *rxbd_free;
+	cbd_t __iomem *rxbd_done;
+	struct qmc_xfer_desc rx_desc[QMC_NB_RXBDS];
+	u64	nb_rx_busy;
+	int	rx_pending;
+	bool	is_rx_halted;
+	bool	is_rx_stopped;
+};
+
+struct qmc {
+	struct device *dev;
+	struct tsa_serial *tsa_serial;
+	void *__iomem scc_regs;
+	void *__iomem scc_pram;
+	void *__iomem dpram;
+	u16 scc_pram_offset;
+	cbd_t __iomem *bd_table;
+	dma_addr_t bd_dma_addr;
+	size_t bd_size;
+	u16 __iomem *int_table;
+	u16 __iomem *int_curr;
+	dma_addr_t int_dma_addr;
+	size_t int_size;
+	struct list_head chan_head;
+	struct qmc_chan *chans[64];
+};
+
+static inline void qmc_write16(void *__iomem addr, u16 val)
+{
+	iowrite16be(val, addr);
+}
+
+static inline u16 qmc_read16(void *__iomem addr)
+{
+	return ioread16be(addr);
+}
+
+static inline void qmc_setbits16(void *__iomem addr, u16 set)
+{
+	qmc_write16(addr, qmc_read16(addr) | set);
+}
+
+static inline void qmc_clrbits16(void *__iomem addr, u16 clr)
+{
+	qmc_write16(addr, qmc_read16(addr) & ~clr);
+}
+
+static inline void qmc_write32(void *__iomem addr, u32 val)
+{
+	iowrite32be(val, addr);
+}
+
+static inline u32 qmc_read32(void *__iomem addr)
+{
+	return ioread32be(addr);
+}
+
+static inline void qmc_setbits32(void *__iomem addr, u32 set)
+{
+	qmc_write32(addr, qmc_read32(addr) | set);
+}
+
+
+int qmc_chan_get_info(struct qmc_chan *chan, struct qmc_chan_info *info)
+{
+	struct tsa_serial_info tsa_info;
+	int ret;
+
+	/* Retrieve info from the TSA related serial */
+	ret = tsa_serial_get_info(chan->qmc->tsa_serial, &tsa_info);
+	if (ret)
+		return ret;
+
+	info->mode = chan->mode;
+	info->rx_fs_rate = tsa_info.rx_fs_rate;
+	info->rx_bit_rate = tsa_info.rx_bit_rate;
+	info->nb_tx_ts = hweight64(chan->tx_ts_mask);
+	info->tx_fs_rate = tsa_info.tx_fs_rate;
+	info->tx_bit_rate = tsa_info.tx_bit_rate;
+	info->nb_rx_ts = hweight64(chan->rx_ts_mask);
+
+	return 0;
+}
+EXPORT_SYMBOL(qmc_chan_get_info);
+
+int qmc_chan_set_param(struct qmc_chan *chan, const struct qmc_chan_param *param)
+{
+	if (param->mode != chan->mode)
+		return -EINVAL;
+
+	switch (param->mode) {
+	case QMC_HDLC:
+		if ((param->hdlc.max_rx_buf_size % 4) ||
+		    (param->hdlc.max_rx_buf_size < 8))
+			return -EINVAL;
+
+		qmc_write16(chan->qmc->scc_pram + QMC_GBL_MRBLR,
+			    param->hdlc.max_rx_buf_size - 8);
+		qmc_write16(chan->s_param + QMC_SPE_MFLR,
+			    param->hdlc.max_rx_frame_size);
+		if (param->hdlc.is_crc32) {
+			qmc_setbits16(chan->s_param + QMC_SPE_CHAMR,
+				      QMC_SPE_CHAMR_HDLC_CRC);
+		} else {
+			qmc_clrbits16(chan->s_param + QMC_SPE_CHAMR,
+				      QMC_SPE_CHAMR_HDLC_CRC);
+		}
+		break;
+
+	case QMC_TRANSPARENT:
+		qmc_write16(chan->s_param + QMC_SPE_TMRBLR,
+			    param->transp.max_rx_buf_size);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(qmc_chan_set_param);
+
+int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length,
+			  void (*complete)(void *context), void *context)
+{
+	struct qmc_xfer_desc *xfer_desc;
+	unsigned long flags;
+	cbd_t *__iomem bd;
+	u16 ctrl;
+	int ret;
+
+	/*
+	 * R bit  UB bit
+	 *   0       0  : The BD is free
+	 *   1       1  : The BD is in used, waiting for transfer
+	 *   0       1  : The BD is in used, waiting for completion
+	 *   1       0  : Should not append
+	 */
+
+	spin_lock_irqsave(&chan->tx_lock, flags);
+	bd = chan->txbd_free;
+
+	ctrl = qmc_read16(&bd->cbd_sc);
+	if (ctrl & (QMC_BD_TX_R | QMC_BD_TX_UB)) {
+		/* We are full ... */
+		ret = -EBUSY;
+		goto end;
+	}
+
+	qmc_write16(&bd->cbd_datlen, length);
+	qmc_write32(&bd->cbd_bufaddr, addr);
+
+	xfer_desc = &chan->tx_desc[bd - chan->txbds];
+	xfer_desc->tx_complete = complete;
+	xfer_desc->context = context;
+
+	/* Activate the descriptor */
+	ctrl |= (QMC_BD_TX_R | QMC_BD_TX_UB);
+	wmb(); /* Be sure to flush the descriptor before control update */
+	qmc_write16(&bd->cbd_sc, ctrl);
+
+	if (!chan->is_tx_stopped)
+		qmc_setbits16(chan->s_param + QMC_SPE_CHAMR, QMC_SPE_CHAMR_POL);
+
+	if (ctrl & QMC_BD_TX_W)
+		chan->txbd_free = chan->txbds;
+	else
+		chan->txbd_free++;
+
+	ret = 0;
+
+end:
+	spin_unlock_irqrestore(&chan->tx_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(qmc_chan_write_submit);
+
+static void qmc_chan_write_done(struct qmc_chan *chan)
+{
+	struct qmc_xfer_desc *xfer_desc;
+	void (*complete)(void *context);
+	unsigned long flags;
+	void *context;
+	cbd_t *__iomem bd;
+	u16 ctrl;
+
+	/*
+	 * R bit  UB bit
+	 *   0       0  : The BD is free
+	 *   1       1  : The BD is in used, waiting for transfer
+	 *   0       1  : The BD is in used, waiting for completion
+	 *   1       0  : Should not append
+	 */
+
+	spin_lock_irqsave(&chan->tx_lock, flags);
+	bd = chan->txbd_done;
+
+	ctrl = qmc_read16(&bd->cbd_sc);
+	while (!(ctrl & QMC_BD_TX_R)) {
+		if (!(ctrl & QMC_BD_TX_UB))
+			goto end;
+
+		xfer_desc = &chan->tx_desc[bd - chan->txbds];
+		complete = xfer_desc->tx_complete;
+		context = xfer_desc->context;
+		xfer_desc->tx_complete = NULL;
+		xfer_desc->context = NULL;
+
+		qmc_write16(&bd->cbd_sc, ctrl & ~QMC_BD_TX_UB);
+
+		if (ctrl & QMC_BD_TX_W)
+			chan->txbd_done = chan->txbds;
+		else
+			chan->txbd_done++;
+
+		if (complete) {
+			spin_unlock_irqrestore(&chan->tx_lock, flags);
+			complete(context);
+			spin_lock_irqsave(&chan->tx_lock, flags);
+		}
+
+		bd = chan->txbd_done;
+		ctrl = qmc_read16(&bd->cbd_sc);
+	}
+
+end:
+	spin_unlock_irqrestore(&chan->tx_lock, flags);
+}
+
+int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length,
+			 void (*complete)(void *context, size_t length), void *context)
+{
+	struct qmc_xfer_desc *xfer_desc;
+	unsigned long flags;
+	cbd_t *__iomem bd;
+	u16 ctrl;
+	int ret;
+
+	/*
+	 * E bit  UB bit
+	 *   0       0  : The BD is free
+	 *   1       1  : The BD is in used, waiting for transfer
+	 *   0       1  : The BD is in used, waiting for completion
+	 *   1       0  : Should not append
+	 */
+
+	spin_lock_irqsave(&chan->rx_lock, flags);
+	bd = chan->rxbd_free;
+
+	ctrl = qmc_read16(&bd->cbd_sc);
+	if (ctrl & (QMC_BD_RX_E | QMC_BD_RX_UB)) {
+		/* We are full ... */
+		ret = -EBUSY;
+		goto end;
+	}
+
+	qmc_write16(&bd->cbd_datlen, 0); /* data length is updated by the QMC */
+	qmc_write32(&bd->cbd_bufaddr, addr);
+
+	xfer_desc = &chan->rx_desc[bd - chan->rxbds];
+	xfer_desc->rx_complete = complete;
+	xfer_desc->context = context;
+
+	/* Activate the descriptor */
+	ctrl |= (QMC_BD_RX_E | QMC_BD_RX_UB);
+	wmb(); /* Be sure to flush data before descriptor activation */
+	qmc_write16(&bd->cbd_sc, ctrl);
+
+	/* Restart receiver if needed */
+	if (chan->is_rx_halted && !chan->is_rx_stopped) {
+		/* Restart receiver */
+		if (chan->mode == QMC_TRANSPARENT)
+			qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x18000080);
+		else
+			qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x00000080);
+		qmc_write32(chan->s_param + QMC_SPE_RSTATE, 0x31000000);
+		chan->is_rx_halted = false;
+	}
+	chan->rx_pending++;
+
+	if (ctrl & QMC_BD_RX_W)
+		chan->rxbd_free = chan->rxbds;
+	else
+		chan->rxbd_free++;
+
+	ret = 0;
+end:
+	spin_unlock_irqrestore(&chan->rx_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(qmc_chan_read_submit);
+
+static void qmc_chan_read_done(struct qmc_chan *chan)
+{
+	void (*complete)(void *context, size_t size);
+	struct qmc_xfer_desc *xfer_desc;
+	unsigned long flags;
+	cbd_t *__iomem bd;
+	void *context;
+	u16 datalen;
+	u16 ctrl;
+
+	/*
+	 * E bit  UB bit
+	 *   0       0  : The BD is free
+	 *   1       1  : The BD is in used, waiting for transfer
+	 *   0       1  : The BD is in used, waiting for completion
+	 *   1       0  : Should not append
+	 */
+
+	spin_lock_irqsave(&chan->rx_lock, flags);
+	bd = chan->rxbd_done;
+
+	ctrl = qmc_read16(&bd->cbd_sc);
+	while (!(ctrl & QMC_BD_RX_E)) {
+		if (!(ctrl & QMC_BD_RX_UB))
+			goto end;
+
+		xfer_desc = &chan->rx_desc[bd - chan->rxbds];
+		complete = xfer_desc->rx_complete;
+		context = xfer_desc->context;
+		xfer_desc->rx_complete = NULL;
+		xfer_desc->context = NULL;
+
+		datalen = qmc_read16(&bd->cbd_datlen);
+		qmc_write16(&bd->cbd_sc, ctrl & ~QMC_BD_RX_UB);
+
+		if (ctrl & QMC_BD_RX_W)
+			chan->rxbd_done = chan->rxbds;
+		else
+			chan->rxbd_done++;
+
+		chan->rx_pending--;
+
+		if (complete) {
+			spin_unlock_irqrestore(&chan->rx_lock, flags);
+			complete(context, datalen);
+			spin_lock_irqsave(&chan->rx_lock, flags);
+		}
+
+		bd = chan->rxbd_done;
+		ctrl = qmc_read16(&bd->cbd_sc);
+	}
+
+end:
+	spin_unlock_irqrestore(&chan->rx_lock, flags);
+}
+
+static int qmc_chan_command(struct qmc_chan *chan, u8 qmc_opcode)
+{
+	return cpm_command(chan->id << 2, (qmc_opcode << 4) | 0x0E);
+}
+
+static int qmc_chan_stop_rx(struct qmc_chan *chan)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&chan->rx_lock, flags);
+
+	/* Send STOP RECEIVE command */
+	ret = qmc_chan_command(chan, 0x0);
+	if (ret) {
+		dev_err(chan->qmc->dev, "chan %u: Send STOP RECEIVE failed (%d)\n",
+			chan->id, ret);
+		goto end;
+	}
+
+	chan->is_rx_stopped = true;
+
+end:
+	spin_unlock_irqrestore(&chan->rx_lock, flags);
+	return ret;
+}
+
+static int qmc_chan_stop_tx(struct qmc_chan *chan)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&chan->tx_lock, flags);
+
+	/* Send STOP TRANSMIT command */
+	ret = qmc_chan_command(chan, 0x1);
+	if (ret) {
+		dev_err(chan->qmc->dev, "chan %u: Send STOP TRANSMIT failed (%d)\n",
+			chan->id, ret);
+		goto end;
+	}
+
+	chan->is_tx_stopped = true;
+
+end:
+	spin_unlock_irqrestore(&chan->tx_lock, flags);
+	return ret;
+}
+
+int qmc_chan_stop(struct qmc_chan *chan, int direction)
+{
+	int ret;
+
+	if (direction & QMC_CHAN_READ) {
+		ret = qmc_chan_stop_rx(chan);
+		if (ret)
+			return ret;
+	}
+
+	if (direction & QMC_CHAN_WRITE) {
+		ret = qmc_chan_stop_tx(chan);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(qmc_chan_stop);
+
+static void qmc_chan_start_rx(struct qmc_chan *chan)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->rx_lock, flags);
+
+	/* Restart the receiver */
+	if (chan->mode == QMC_TRANSPARENT)
+		qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x18000080);
+	else
+		qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x00000080);
+	qmc_write32(chan->s_param + QMC_SPE_RSTATE, 0x31000000);
+	chan->is_rx_halted = false;
+
+	chan->is_rx_stopped = false;
+
+	spin_unlock_irqrestore(&chan->rx_lock, flags);
+}
+
+static void qmc_chan_start_tx(struct qmc_chan *chan)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->tx_lock, flags);
+
+	/*
+	 * Enable channel transmitter as it could be disabled if
+	 * qmc_chan_reset() was called.
+	 */
+	qmc_setbits16(chan->s_param + QMC_SPE_CHAMR, QMC_SPE_CHAMR_ENT);
+
+	/* Set the POL bit in the channel mode register */
+	qmc_setbits16(chan->s_param + QMC_SPE_CHAMR, QMC_SPE_CHAMR_POL);
+
+	chan->is_tx_stopped = false;
+
+	spin_unlock_irqrestore(&chan->tx_lock, flags);
+}
+
+int qmc_chan_start(struct qmc_chan *chan, int direction)
+{
+	if (direction & QMC_CHAN_READ)
+		qmc_chan_start_rx(chan);
+
+	if (direction & QMC_CHAN_WRITE)
+		qmc_chan_start_tx(chan);
+
+	return 0;
+}
+EXPORT_SYMBOL(qmc_chan_start);
+
+static void qmc_chan_reset_rx(struct qmc_chan *chan)
+{
+	struct qmc_xfer_desc *xfer_desc;
+	unsigned long flags;
+	cbd_t *__iomem bd;
+	u16 ctrl;
+
+	spin_lock_irqsave(&chan->rx_lock, flags);
+	bd = chan->rxbds;
+	do {
+		ctrl = qmc_read16(&bd->cbd_sc);
+		qmc_write16(&bd->cbd_sc, ctrl & ~(QMC_BD_RX_UB | QMC_BD_RX_E));
+
+		xfer_desc = &chan->rx_desc[bd - chan->rxbds];
+		xfer_desc->rx_complete = NULL;
+		xfer_desc->context = NULL;
+
+		bd++;
+	} while (!(ctrl & QMC_BD_RX_W));
+
+	chan->rxbd_free = chan->rxbds;
+	chan->rxbd_done = chan->rxbds;
+	qmc_write16(chan->s_param + QMC_SPE_RBPTR,
+		    qmc_read16(chan->s_param + QMC_SPE_RBASE));
+
+	chan->rx_pending = 0;
+	chan->is_rx_stopped = false;
+
+	spin_unlock_irqrestore(&chan->rx_lock, flags);
+}
+
+static void qmc_chan_reset_tx(struct qmc_chan *chan)
+{
+	struct qmc_xfer_desc *xfer_desc;
+	unsigned long flags;
+	cbd_t *__iomem bd;
+	u16 ctrl;
+
+	spin_lock_irqsave(&chan->tx_lock, flags);
+
+	/* Disable transmitter. It will be re-enable on qmc_chan_start() */
+	qmc_clrbits16(chan->s_param + QMC_SPE_CHAMR, QMC_SPE_CHAMR_ENT);
+
+	bd = chan->txbds;
+	do {
+		ctrl = qmc_read16(&bd->cbd_sc);
+		qmc_write16(&bd->cbd_sc, ctrl & ~(QMC_BD_TX_UB | QMC_BD_TX_R));
+
+		xfer_desc = &chan->tx_desc[bd - chan->txbds];
+		xfer_desc->tx_complete = NULL;
+		xfer_desc->context = NULL;
+
+		bd++;
+	} while (!(ctrl & QMC_BD_TX_W));
+
+	chan->txbd_free = chan->txbds;
+	chan->txbd_done = chan->txbds;
+	qmc_write16(chan->s_param + QMC_SPE_TBPTR,
+		    qmc_read16(chan->s_param + QMC_SPE_TBASE));
+
+	/* Reset TSTATE and ZISTATE to their initial value */
+	qmc_write32(chan->s_param + QMC_SPE_TSTATE, 0x30000000);
+	qmc_write32(chan->s_param + QMC_SPE_ZISTATE, 0x00000100);
+
+	spin_unlock_irqrestore(&chan->tx_lock, flags);
+}
+
+int qmc_chan_reset(struct qmc_chan *chan, int direction)
+{
+	if (direction & QMC_CHAN_READ)
+		qmc_chan_reset_rx(chan);
+
+	if (direction & QMC_CHAN_WRITE)
+		qmc_chan_reset_tx(chan);
+
+	return 0;
+}
+EXPORT_SYMBOL(qmc_chan_reset);
+
+static int qmc_check_chans(struct qmc *qmc)
+{
+	struct tsa_serial_info info;
+	bool is_one_table = false;
+	struct qmc_chan *chan;
+	u64 tx_ts_mask = 0;
+	u64 rx_ts_mask = 0;
+	u64 tx_ts_assigned_mask;
+	u64 rx_ts_assigned_mask;
+	int ret;
+
+	/* Retrieve info from the TSA related serial */
+	ret = tsa_serial_get_info(qmc->tsa_serial, &info);
+	if (ret)
+		return ret;
+
+	/*
+	 * If more than 32 TS are assigned to this serial, one common table is
+	 * used for Tx and Rx and so masks must be equal for all channels.
+	 */
+	if ((info.nb_tx_ts > 32) || (info.nb_rx_ts > 32)) {
+		if (info.nb_tx_ts != info.nb_rx_ts) {
+			dev_err(qmc->dev, "Number of TSA Tx/Rx TS assigned are not equal\n");
+			return -EINVAL;
+		}
+		is_one_table = true;
+	}
+
+
+	tx_ts_assigned_mask = (((u64)1) << info.nb_tx_ts) - 1;
+	rx_ts_assigned_mask = (((u64)1) << info.nb_rx_ts) - 1;
+
+	list_for_each_entry(chan, &qmc->chan_head, list) {
+		if (chan->tx_ts_mask > tx_ts_assigned_mask) {
+			dev_err(qmc->dev, "chan %u uses TSA unassigned Tx TS\n", chan->id);
+			return -EINVAL;
+		}
+		if (tx_ts_mask & chan->tx_ts_mask) {
+			dev_err(qmc->dev, "chan %u uses an already used Tx TS\n", chan->id);
+			return -EINVAL;
+		}
+
+		if (chan->rx_ts_mask > rx_ts_assigned_mask) {
+			dev_err(qmc->dev, "chan %u uses TSA unassigned Rx TS\n", chan->id);
+			return -EINVAL;
+		}
+		if (rx_ts_mask & chan->rx_ts_mask) {
+			dev_err(qmc->dev, "chan %u uses an already used Rx TS\n", chan->id);
+			return -EINVAL;
+		}
+
+		if (is_one_table && (chan->tx_ts_mask != chan->rx_ts_mask)) {
+			dev_err(qmc->dev, "chan %u uses different Rx and Tx TS\n", chan->id);
+			return -EINVAL;
+		}
+
+		tx_ts_mask |= chan->tx_ts_mask;
+		rx_ts_mask |= chan->rx_ts_mask;
+	}
+
+	return 0;
+}
+
+static unsigned int qmc_nb_chans(struct qmc *qmc)
+{
+	unsigned int count = 0;
+	struct qmc_chan *chan;
+
+	list_for_each_entry(chan, &qmc->chan_head, list)
+		count++;
+
+	return count;
+}
+
+static int qmc_of_parse_chans(struct qmc *qmc, struct device_node *np)
+{
+	struct device_node *chan_np;
+	struct qmc_chan *chan;
+	const char *mode;
+	u32 chan_id;
+	u64 ts_mask;
+	int ret;
+
+	for_each_available_child_of_node(np, chan_np) {
+		ret = of_property_read_u32(chan_np, "reg", &chan_id);
+		if (ret) {
+			dev_err(qmc->dev, "%pOF: failed to read reg\n", chan_np);
+			of_node_put(chan_np);
+			return ret;
+		}
+		if (chan_id > 63) {
+			dev_err(qmc->dev, "%pOF: Invalid chan_id\n", chan_np);
+			of_node_put(chan_np);
+			return -EINVAL;
+		}
+
+		chan = devm_kzalloc(qmc->dev, sizeof(*chan), GFP_KERNEL);
+		if (!chan) {
+			of_node_put(chan_np);
+			return -ENOMEM;
+		}
+
+		chan->id = chan_id;
+		spin_lock_init(&chan->rx_lock);
+		spin_lock_init(&chan->tx_lock);
+
+		ret = of_property_read_u64(chan_np, "fsl,tx-ts-mask", &ts_mask);
+		if (ret) {
+			dev_err(qmc->dev, "%pOF: failed to read fsl,tx-ts-mask\n",
+				chan_np);
+			of_node_put(chan_np);
+			return ret;
+		}
+		chan->tx_ts_mask = ts_mask;
+
+		ret = of_property_read_u64(chan_np, "fsl,rx-ts-mask", &ts_mask);
+		if (ret) {
+			dev_err(qmc->dev, "%pOF: failed to read fsl,rx-ts-mask\n",
+				chan_np);
+			of_node_put(chan_np);
+			return ret;
+		}
+		chan->rx_ts_mask = ts_mask;
+
+		mode = "transparent";
+		ret = of_property_read_string(chan_np, "fsl,operational-mode", &mode);
+		if (ret && ret != -EINVAL) {
+			dev_err(qmc->dev, "%pOF: failed to read fsl,operational-mode\n",
+				chan_np);
+			of_node_put(chan_np);
+			return ret;
+		}
+		if (!strcmp(mode, "transparent")) {
+			chan->mode = QMC_TRANSPARENT;
+		} else if (!strcmp(mode, "hdlc")) {
+			chan->mode = QMC_HDLC;
+		} else {
+			dev_err(qmc->dev, "%pOF: Invalid fsl,operational-mode (%s)\n",
+				chan_np, mode);
+			of_node_put(chan_np);
+			return -EINVAL;
+		}
+
+		chan->is_reverse_data = of_property_read_bool(chan_np,
+							      "fsl,reverse-data");
+
+		list_add_tail(&chan->list, &qmc->chan_head);
+		qmc->chans[chan->id] = chan;
+	}
+
+	return qmc_check_chans(qmc);
+}
+
+static int qmc_setup_tsa_64rxtx(struct qmc *qmc, const struct tsa_serial_info *info)
+{
+	struct qmc_chan *chan;
+	unsigned int i;
+	u16 val;
+
+	/*
+	 * Use a common Tx/Rx 64 entries table.
+	 * Everything was previously checked, Tx and Rx related stuffs are
+	 * identical -> Used Rx related stuff to build the table
+	 */
+
+	/* Invalidate all entries */
+	for (i = 0; i < 64; i++)
+		qmc_write16(qmc->scc_pram + QMC_GBL_TSATRX + (i * 2), 0x0000);
+
+	/* Set entries based on Rx stuff*/
+	list_for_each_entry(chan, &qmc->chan_head, list) {
+		for (i = 0; i < info->nb_rx_ts; i++) {
+			if (!(chan->rx_ts_mask & (((u64)1) << i)))
+				continue;
+
+			val = QMC_TSA_VALID | QMC_TSA_MASK |
+			      QMC_TSA_CHANNEL(chan->id);
+			qmc_write16(qmc->scc_pram + QMC_GBL_TSATRX + (i * 2), val);
+		}
+	}
+
+	/* Set Wrap bit on last entry */
+	qmc_setbits16(qmc->scc_pram + QMC_GBL_TSATRX + ((info->nb_rx_ts - 1) * 2),
+		      QMC_TSA_WRAP);
+
+	/* Init pointers to the table */
+	val = qmc->scc_pram_offset + QMC_GBL_TSATRX;
+	qmc_write16(qmc->scc_pram + QMC_GBL_RX_S_PTR, val);
+	qmc_write16(qmc->scc_pram + QMC_GBL_RXPTR, val);
+	qmc_write16(qmc->scc_pram + QMC_GBL_TX_S_PTR, val);
+	qmc_write16(qmc->scc_pram + QMC_GBL_TXPTR, val);
+
+	return 0;
+}
+
+static int qmc_setup_tsa_32rx_32tx(struct qmc *qmc, const struct tsa_serial_info *info)
+{
+	struct qmc_chan *chan;
+	unsigned int i;
+	u16 val;
+
+	/*
+	 * Use a Tx 32 entries table and a Rx 32 entries table.
+	 * Everything was previously checked.
+	 */
+
+	/* Invalidate all entries */
+	for (i = 0; i < 32; i++) {
+		qmc_write16(qmc->scc_pram + QMC_GBL_TSATRX + (i * 2), 0x0000);
+		qmc_write16(qmc->scc_pram + QMC_GBL_TSATTX + (i * 2), 0x0000);
+	}
+
+	/* Set entries based on Rx and Tx stuff*/
+	list_for_each_entry(chan, &qmc->chan_head, list) {
+		/* Rx part */
+		for (i = 0; i < info->nb_rx_ts; i++) {
+			if (!(chan->rx_ts_mask & (((u64)1) << i)))
+				continue;
+
+			val = QMC_TSA_VALID | QMC_TSA_MASK |
+			      QMC_TSA_CHANNEL(chan->id);
+			qmc_write16(qmc->scc_pram + QMC_GBL_TSATRX + (i * 2), val);
+		}
+		/* Tx part */
+		for (i = 0; i < info->nb_tx_ts; i++) {
+			if (!(chan->tx_ts_mask & (((u64)1) << i)))
+				continue;
+
+			val = QMC_TSA_VALID | QMC_TSA_MASK |
+			      QMC_TSA_CHANNEL(chan->id);
+			qmc_write16(qmc->scc_pram + QMC_GBL_TSATTX + (i * 2), val);
+		}
+	}
+
+	/* Set Wrap bit on last entries */
+	qmc_setbits16(qmc->scc_pram + QMC_GBL_TSATRX + ((info->nb_rx_ts - 1) * 2),
+		      QMC_TSA_WRAP);
+	qmc_setbits16(qmc->scc_pram + QMC_GBL_TSATTX + ((info->nb_tx_ts - 1) * 2),
+		      QMC_TSA_WRAP);
+
+	/* Init Rx pointers ...*/
+	val = qmc->scc_pram_offset + QMC_GBL_TSATRX;
+	qmc_write16(qmc->scc_pram + QMC_GBL_RX_S_PTR, val);
+	qmc_write16(qmc->scc_pram + QMC_GBL_RXPTR, val);
+
+	/* ... and Tx pointers */
+	val = qmc->scc_pram_offset + QMC_GBL_TSATTX;
+	qmc_write16(qmc->scc_pram + QMC_GBL_TX_S_PTR, val);
+	qmc_write16(qmc->scc_pram + QMC_GBL_TXPTR, val);
+
+	return 0;
+}
+
+static int qmc_setup_tsa(struct qmc *qmc)
+{
+	struct tsa_serial_info info;
+	int ret;
+
+	/* Retrieve info from the TSA related serial */
+	ret = tsa_serial_get_info(qmc->tsa_serial, &info);
+	if (ret)
+		return ret;
+
+	/*
+	 * Setup one common 64 entries table or two 32 entries (one for Tx and
+	 * one for Tx) according to assigned TS numbers.
+	 */
+	return ((info.nb_tx_ts > 32) || (info.nb_rx_ts > 32)) ?
+		qmc_setup_tsa_64rxtx(qmc, &info) :
+		qmc_setup_tsa_32rx_32tx(qmc, &info);
+}
+
+static int qmc_setup_chan_trnsync(struct qmc *qmc, struct qmc_chan *chan)
+{
+	struct tsa_serial_info info;
+	u16 first_rx, last_tx;
+	u16 trnsync;
+	int ret;
+
+	/* Retrieve info from the TSA related serial */
+	ret = tsa_serial_get_info(chan->qmc->tsa_serial, &info);
+	if (ret)
+		return ret;
+
+	/* Find the first Rx TS allocated to the channel */
+	first_rx = chan->rx_ts_mask ? __ffs64(chan->rx_ts_mask) + 1 : 0;
+
+	/* Find the last Tx TS allocated to the channel */
+	last_tx = fls64(chan->tx_ts_mask);
+
+	trnsync = 0;
+	if (info.nb_rx_ts)
+		trnsync |= QMC_SPE_TRNSYNC_RX((first_rx % info.nb_rx_ts) * 2);
+	if (info.nb_tx_ts)
+		trnsync |= QMC_SPE_TRNSYNC_TX((last_tx % info.nb_tx_ts) * 2);
+
+	qmc_write16(chan->s_param + QMC_SPE_TRNSYNC, trnsync);
+
+	dev_dbg(qmc->dev, "chan %u: trnsync=0x%04x, rx %u/%u 0x%llx, tx %u/%u 0x%llx\n",
+		chan->id, trnsync,
+		first_rx, info.nb_rx_ts, chan->rx_ts_mask,
+		last_tx, info.nb_tx_ts, chan->tx_ts_mask);
+
+	return 0;
+}
+
+static int qmc_setup_chan(struct qmc *qmc, struct qmc_chan *chan)
+{
+	unsigned int i;
+	cbd_t __iomem *bd;
+	int ret;
+	u16 val;
+
+	chan->qmc = qmc;
+
+	/* Set channel specific parameter base address */
+	chan->s_param = qmc->dpram + (chan->id * 64);
+	/* 16 bd per channel (8 rx and 8 tx) */
+	chan->txbds = qmc->bd_table + (chan->id * (QMC_NB_TXBDS + QMC_NB_RXBDS));
+	chan->rxbds = qmc->bd_table + (chan->id * (QMC_NB_TXBDS + QMC_NB_RXBDS)) + QMC_NB_TXBDS;
+
+	chan->txbd_free = chan->txbds;
+	chan->txbd_done = chan->txbds;
+	chan->rxbd_free = chan->rxbds;
+	chan->rxbd_done = chan->rxbds;
+
+	/* TBASE and TBPTR*/
+	val = chan->id * (QMC_NB_TXBDS + QMC_NB_RXBDS) * sizeof(cbd_t);
+	qmc_write16(chan->s_param + QMC_SPE_TBASE, val);
+	qmc_write16(chan->s_param + QMC_SPE_TBPTR, val);
+
+	/* RBASE and RBPTR*/
+	val = ((chan->id * (QMC_NB_TXBDS + QMC_NB_RXBDS)) + QMC_NB_TXBDS) * sizeof(cbd_t);
+	qmc_write16(chan->s_param + QMC_SPE_RBASE, val);
+	qmc_write16(chan->s_param + QMC_SPE_RBPTR, val);
+	qmc_write32(chan->s_param + QMC_SPE_TSTATE, 0x30000000);
+	qmc_write32(chan->s_param + QMC_SPE_RSTATE, 0x31000000);
+	qmc_write32(chan->s_param + QMC_SPE_ZISTATE, 0x00000100);
+	if (chan->mode == QMC_TRANSPARENT) {
+		qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x18000080);
+		qmc_write16(chan->s_param + QMC_SPE_TMRBLR, 60);
+		val = QMC_SPE_CHAMR_MODE_TRANSP | QMC_SPE_CHAMR_TRANSP_SYNC;
+		if (chan->is_reverse_data)
+			val |= QMC_SPE_CHAMR_TRANSP_RD;
+		qmc_write16(chan->s_param + QMC_SPE_CHAMR, val);
+		ret = qmc_setup_chan_trnsync(qmc, chan);
+		if (ret)
+			return ret;
+	} else {
+		qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x00000080);
+		qmc_write16(chan->s_param + QMC_SPE_MFLR, 60);
+		qmc_write16(chan->s_param + QMC_SPE_CHAMR,
+			QMC_SPE_CHAMR_MODE_HDLC | QMC_SPE_CHAMR_HDLC_IDLM);
+	}
+
+	/* Do not enable interrupts now. They will be enabled later */
+	qmc_write16(chan->s_param + QMC_SPE_INTMSK, 0x0000);
+
+	/* Init Rx BDs and set Wrap bit on last descriptor */
+	BUILD_BUG_ON(QMC_NB_RXBDS == 0);
+	val = QMC_BD_RX_I;
+	for (i = 0; i < QMC_NB_RXBDS; i++) {
+		bd = chan->rxbds + i;
+		qmc_write16(&bd->cbd_sc, val);
+	}
+	bd = chan->rxbds + QMC_NB_RXBDS - 1;
+	qmc_write16(&bd->cbd_sc, val | QMC_BD_RX_W);
+
+	/* Init Tx BDs and set Wrap bit on last descriptor */
+	BUILD_BUG_ON(QMC_NB_TXBDS == 0);
+	val = QMC_BD_TX_I;
+	if (chan->mode == QMC_HDLC)
+		val |= QMC_BD_TX_L | QMC_BD_TX_TC;
+	for (i = 0; i < QMC_NB_TXBDS; i++) {
+		bd = chan->txbds + i;
+		qmc_write16(&bd->cbd_sc, val);
+	}
+	bd = chan->txbds + QMC_NB_TXBDS - 1;
+	qmc_write16(&bd->cbd_sc, val | QMC_BD_TX_W);
+
+	return 0;
+}
+
+static int qmc_setup_chans(struct qmc *qmc)
+{
+	struct qmc_chan *chan;
+	int ret;
+
+	list_for_each_entry(chan, &qmc->chan_head, list) {
+		ret = qmc_setup_chan(qmc, chan);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int qmc_finalize_chans(struct qmc *qmc)
+{
+	struct qmc_chan *chan;
+	int ret;
+
+	list_for_each_entry(chan, &qmc->chan_head, list) {
+		/* Unmask channel interrupts */
+		if (chan->mode == QMC_HDLC) {
+			qmc_write16(chan->s_param + QMC_SPE_INTMSK,
+				    QMC_INT_NID | QMC_INT_IDL | QMC_INT_MRF |
+				    QMC_INT_UN | QMC_INT_RXF | QMC_INT_BSY |
+				    QMC_INT_TXB | QMC_INT_RXB);
+		} else {
+			qmc_write16(chan->s_param + QMC_SPE_INTMSK,
+				    QMC_INT_UN | QMC_INT_BSY |
+				    QMC_INT_TXB | QMC_INT_RXB);
+		}
+
+		/* Forced stop the channel */
+		ret = qmc_chan_stop(chan, QMC_CHAN_ALL);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int qmc_setup_ints(struct qmc *qmc)
+{
+	unsigned int i;
+	u16 __iomem *last;
+
+	/* Raz all entries */
+	for (i = 0; i < (qmc->int_size / sizeof(u16)); i++)
+		qmc_write16(qmc->int_table + i, 0x0000);
+
+	/* Set Wrap bit on last entry */
+	if (qmc->int_size >= sizeof(u16)) {
+		last = qmc->int_table + (qmc->int_size / sizeof(u16)) - 1;
+		qmc_write16(last, QMC_INT_W);
+	}
+
+	return 0;
+}
+
+static void qmc_irq_gint(struct qmc *qmc)
+{
+	struct qmc_chan *chan;
+	unsigned int chan_id;
+	unsigned long flags;
+	u16 int_entry;
+
+	int_entry = qmc_read16(qmc->int_curr);
+	while (int_entry & QMC_INT_V) {
+		/* Clear all but the Wrap bit */
+		qmc_write16(qmc->int_curr, int_entry & QMC_INT_W);
+
+		chan_id = QMC_INT_GET_CHANNEL(int_entry);
+		chan = qmc->chans[chan_id];
+		if (!chan) {
+			dev_err(qmc->dev, "interrupt on invalid chan %u\n", chan_id);
+			goto int_next;
+		}
+
+		if (int_entry & QMC_INT_TXB)
+			qmc_chan_write_done(chan);
+
+		if (int_entry & QMC_INT_UN) {
+			dev_info(qmc->dev, "intr chan %u, 0x%04x (UN)\n", chan_id,
+				 int_entry);
+			chan->nb_tx_underrun++;
+		}
+
+		if (int_entry & QMC_INT_BSY) {
+			dev_info(qmc->dev, "intr chan %u, 0x%04x (BSY)\n", chan_id,
+				 int_entry);
+			chan->nb_rx_busy++;
+			/* Restart the receiver if needed */
+			spin_lock_irqsave(&chan->rx_lock, flags);
+			if (chan->rx_pending && !chan->is_rx_stopped) {
+				if (chan->mode == QMC_TRANSPARENT)
+					qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x18000080);
+				else
+					qmc_write32(chan->s_param + QMC_SPE_ZDSTATE, 0x00000080);
+				qmc_write32(chan->s_param + QMC_SPE_RSTATE, 0x31000000);
+				chan->is_rx_halted = false;
+			} else {
+				chan->is_rx_halted = true;
+			}
+			spin_unlock_irqrestore(&chan->rx_lock, flags);
+		}
+
+		if (int_entry & QMC_INT_RXB)
+			qmc_chan_read_done(chan);
+
+int_next:
+		if (int_entry & QMC_INT_W)
+			qmc->int_curr = qmc->int_table;
+		else
+			qmc->int_curr++;
+		int_entry = qmc_read16(qmc->int_curr);
+	}
+}
+
+static irqreturn_t qmc_irq_handler(int irq, void *priv)
+{
+	struct qmc *qmc = (struct qmc *)priv;
+	u16 scce;
+
+	scce = qmc_read16(qmc->scc_regs + SCC_SCCE);
+	qmc_write16(qmc->scc_regs + SCC_SCCE, scce);
+
+	if (unlikely(scce & SCC_SCCE_IQOV))
+		dev_info(qmc->dev, "IRQ queue overflow\n");
+
+	if (unlikely(scce & SCC_SCCE_GUN))
+		dev_err(qmc->dev, "Global transmitter underrun\n");
+
+	if (unlikely(scce & SCC_SCCE_GOV))
+		dev_err(qmc->dev, "Global receiver overrun\n");
+
+	/* normal interrupt */
+	if (likely(scce & SCC_SCCE_GINT))
+		qmc_irq_gint(qmc);
+
+	return IRQ_HANDLED;
+}
+
+static int qmc_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	unsigned int nb_chans;
+	struct resource *res;
+	struct qmc *qmc;
+	int irq;
+	int ret;
+
+	qmc = devm_kzalloc(&pdev->dev, sizeof(*qmc), GFP_KERNEL);
+	if (!qmc)
+		return -ENOMEM;
+
+	qmc->dev = &pdev->dev;
+	INIT_LIST_HEAD(&qmc->chan_head);
+
+	qmc->scc_regs = devm_platform_ioremap_resource_byname(pdev, "scc_regs");
+	if (IS_ERR(qmc->scc_regs))
+		return PTR_ERR(qmc->scc_regs);
+
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "scc_pram");
+	if (!res)
+		return -EINVAL;
+	qmc->scc_pram_offset = res->start - get_immrbase();
+	qmc->scc_pram = devm_ioremap_resource(qmc->dev, res);
+	if (IS_ERR(qmc->scc_pram))
+		return PTR_ERR(qmc->scc_pram);
+
+	qmc->dpram  = devm_platform_ioremap_resource_byname(pdev, "dpram");
+	if (IS_ERR(qmc->dpram))
+		return PTR_ERR(qmc->dpram);
+
+	qmc->tsa_serial = devm_tsa_serial_get_byphandle(qmc->dev, np, "fsl,tsa-serial");
+	if (IS_ERR(qmc->tsa_serial)) {
+		return dev_err_probe(qmc->dev, PTR_ERR(qmc->tsa_serial),
+				     "Failed to get TSA serial\n");
+	}
+
+	/* Connect the serial (SCC) to TSA */
+	ret = tsa_serial_connect(qmc->tsa_serial);
+	if (ret) {
+		dev_err(qmc->dev, "Failed to connect TSA serial\n");
+		return ret;
+	}
+
+	/* Parse channels informationss */
+	ret = qmc_of_parse_chans(qmc, np);
+	if (ret)
+		goto err_tsa_serial_disconnect;
+
+	nb_chans = qmc_nb_chans(qmc);
+
+	/* Init GMSR_H and GMSR_L registers */
+	qmc_write32(qmc->scc_regs + SCC_GSMRH,
+		    SCC_GSMRH_CDS | SCC_GSMRH_CTSS | SCC_GSMRH_CDP | SCC_GSMRH_CTSP);
+
+	/* enable QMC mode */
+	qmc_write32(qmc->scc_regs + SCC_GSMRL, SCC_GSMRL_MODE_QMC);
+
+	/*
+	 * Allocate the buffer descriptor table
+	 * 8 rx and 8 tx descriptors per channel
+	 */
+	qmc->bd_size = (nb_chans * (QMC_NB_TXBDS + QMC_NB_RXBDS)) * sizeof(cbd_t);
+	qmc->bd_table = dmam_alloc_coherent(qmc->dev, qmc->bd_size,
+		&qmc->bd_dma_addr, GFP_KERNEL);
+	if (!qmc->bd_table) {
+		dev_err(qmc->dev, "Failed to allocate bd table\n");
+		ret = -ENOMEM;
+		goto err_tsa_serial_disconnect;
+	}
+	memset(qmc->bd_table, 0, qmc->bd_size);
+
+	qmc_write32(qmc->scc_pram + QMC_GBL_MCBASE, qmc->bd_dma_addr);
+
+	/* Allocate the interrupt table */
+	qmc->int_size = QMC_NB_INTS * sizeof(u16);
+	qmc->int_table = dmam_alloc_coherent(qmc->dev, qmc->int_size,
+		&qmc->int_dma_addr, GFP_KERNEL);
+	if (!qmc->int_table) {
+		dev_err(qmc->dev, "Failed to allocate interrupt table\n");
+		ret = -ENOMEM;
+		goto err_tsa_serial_disconnect;
+	}
+	memset(qmc->int_table, 0, qmc->int_size);
+
+	qmc->int_curr = qmc->int_table;
+	qmc_write32(qmc->scc_pram + QMC_GBL_INTBASE, qmc->int_dma_addr);
+	qmc_write32(qmc->scc_pram + QMC_GBL_INTPTR, qmc->int_dma_addr);
+
+	/* Set MRBLR (valid for HDLC only) max MRU + max CRC */
+	qmc_write16(qmc->scc_pram + QMC_GBL_MRBLR, HDLC_MAX_MRU + 4);
+
+	qmc_write16(qmc->scc_pram + QMC_GBL_GRFTHR, 1);
+	qmc_write16(qmc->scc_pram + QMC_GBL_GRFCNT, 1);
+
+	qmc_write32(qmc->scc_pram + QMC_GBL_C_MASK32, 0xDEBB20E3);
+	qmc_write16(qmc->scc_pram + QMC_GBL_C_MASK16, 0xF0B8);
+
+	ret = qmc_setup_tsa(qmc);
+	if (ret)
+		goto err_tsa_serial_disconnect;
+
+	qmc_write16(qmc->scc_pram + QMC_GBL_QMCSTATE, 0x8000);
+
+	ret = qmc_setup_chans(qmc);
+	if (ret)
+		goto err_tsa_serial_disconnect;
+
+	/* Init interrupts table */
+	ret = qmc_setup_ints(qmc);
+	if (ret)
+		goto err_tsa_serial_disconnect;
+
+	/* Disable and clear interrupts,  set the irq handler */
+	qmc_write16(qmc->scc_regs + SCC_SCCM, 0x0000);
+	qmc_write16(qmc->scc_regs + SCC_SCCE, 0x000F);
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		goto err_tsa_serial_disconnect;
+	ret = devm_request_irq(qmc->dev, irq, qmc_irq_handler, 0, "qmc", qmc);
+	if (ret < 0)
+		goto err_tsa_serial_disconnect;
+
+	/* Enable interrupts */
+	qmc_write16(qmc->scc_regs + SCC_SCCM,
+		SCC_SCCE_IQOV | SCC_SCCE_GINT | SCC_SCCE_GUN | SCC_SCCE_GOV);
+
+	ret = qmc_finalize_chans(qmc);
+	if (ret < 0)
+		goto err_disable_intr;
+
+	/* Enable transmiter and receiver */
+	qmc_setbits32(qmc->scc_regs + SCC_GSMRL, SCC_GSMRL_ENR | SCC_GSMRL_ENT);
+
+	platform_set_drvdata(pdev, qmc);
+
+	return 0;
+
+err_disable_intr:
+	qmc_write16(qmc->scc_regs + SCC_SCCM, 0);
+
+err_tsa_serial_disconnect:
+	tsa_serial_disconnect(qmc->tsa_serial);
+	return ret;
+}
+
+static int qmc_remove(struct platform_device *pdev)
+{
+	struct qmc *qmc = platform_get_drvdata(pdev);
+
+	/* Disable transmiter and receiver */
+	qmc_setbits32(qmc->scc_regs + SCC_GSMRL, 0);
+
+	/* Disable interrupts */
+	qmc_write16(qmc->scc_regs + SCC_SCCM, 0);
+
+	/* Disconnect the serial from TSA */
+	tsa_serial_disconnect(qmc->tsa_serial);
+
+	return 0;
+}
+
+static const struct of_device_id qmc_id_table[] = {
+	{ .compatible = "fsl,cpm1-scc-qmc" },
+	{} /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, qmc_id_table);
+
+static struct platform_driver qmc_driver = {
+	.driver = {
+		.name = "fsl-qmc",
+		.of_match_table = of_match_ptr(qmc_id_table),
+	},
+	.probe = qmc_probe,
+	.remove = qmc_remove,
+};
+module_platform_driver(qmc_driver);
+
+struct qmc_chan *qmc_chan_get_byphandle(struct device_node *np, const char *phandle_name)
+{
+	struct of_phandle_args out_args;
+	struct platform_device *pdev;
+	struct qmc_chan *qmc_chan;
+	struct qmc *qmc;
+	int ret;
+
+	ret = of_parse_phandle_with_fixed_args(np, phandle_name, 1, 0,
+					       &out_args);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (!of_match_node(qmc_driver.driver.of_match_table, out_args.np)) {
+		of_node_put(out_args.np);
+		return ERR_PTR(-EINVAL);
+	}
+
+	pdev = of_find_device_by_node(out_args.np);
+	of_node_put(out_args.np);
+	if (!pdev)
+		return ERR_PTR(-ENODEV);
+
+	qmc = platform_get_drvdata(pdev);
+	if (!qmc) {
+		platform_device_put(pdev);
+		return ERR_PTR(-EPROBE_DEFER);
+	}
+
+	if (out_args.args_count != 1) {
+		platform_device_put(pdev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (out_args.args[0] >= ARRAY_SIZE(qmc->chans)) {
+		platform_device_put(pdev);
+		return ERR_PTR(-EINVAL);
+	}
+
+	qmc_chan = qmc->chans[out_args.args[0]];
+	if (!qmc_chan) {
+		platform_device_put(pdev);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return qmc_chan;
+}
+EXPORT_SYMBOL(qmc_chan_get_byphandle);
+
+void qmc_chan_put(struct qmc_chan *chan)
+{
+	put_device(chan->qmc->dev);
+}
+EXPORT_SYMBOL(qmc_chan_put);
+
+static void devm_qmc_chan_release(struct device *dev, void *res)
+{
+	struct qmc_chan **qmc_chan = res;
+
+	qmc_chan_put(*qmc_chan);
+}
+
+struct qmc_chan *devm_qmc_chan_get_byphandle(struct device *dev,
+					     struct device_node *np,
+					     const char *phandle_name)
+{
+	struct qmc_chan *qmc_chan;
+	struct qmc_chan **dr;
+
+	dr = devres_alloc(devm_qmc_chan_release, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return ERR_PTR(-ENOMEM);
+
+	qmc_chan = qmc_chan_get_byphandle(np, phandle_name);
+	if (!IS_ERR(qmc_chan)) {
+		*dr = qmc_chan;
+		devres_add(dev, dr);
+	} else {
+		devres_free(dr);
+	}
+
+	return qmc_chan;
+}
+EXPORT_SYMBOL(devm_qmc_chan_get_byphandle);
+
+MODULE_AUTHOR("Herve Codina <herve.codina@bootlin.com>");
+MODULE_DESCRIPTION("CPM QMC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/soc/fsl/qe/qmc.h b/include/soc/fsl/qe/qmc.h
new file mode 100644
index 000000000000..3c61a50d2ae2
--- /dev/null
+++ b/include/soc/fsl/qe/qmc.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * QMC management
+ *
+ * Copyright 2022 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+#ifndef __SOC_FSL_QMC_H__
+#define __SOC_FSL_QMC_H__
+
+#include <linux/types.h>
+
+struct device_node;
+struct device;
+struct qmc_chan;
+
+struct qmc_chan *qmc_chan_get_byphandle(struct device_node *np, const char *phandle_name);
+void qmc_chan_put(struct qmc_chan *chan);
+struct qmc_chan *devm_qmc_chan_get_byphandle(struct device *dev, struct device_node *np,
+					     const char *phandle_name);
+
+enum qmc_mode {
+	QMC_TRANSPARENT,
+	QMC_HDLC,
+};
+
+struct qmc_chan_info {
+	enum qmc_mode mode;
+	unsigned long rx_fs_rate;
+	unsigned long rx_bit_rate;
+	u8 nb_rx_ts;
+	unsigned long tx_fs_rate;
+	unsigned long tx_bit_rate;
+	u8 nb_tx_ts;
+};
+
+int qmc_chan_get_info(struct qmc_chan *chan, struct qmc_chan_info *info);
+
+struct qmc_chan_param {
+	enum qmc_mode mode;
+	union {
+		struct {
+			u16 max_rx_buf_size;
+			u16 max_rx_frame_size;
+			bool is_crc32;
+		} hdlc;
+		struct {
+			u16 max_rx_buf_size;
+		} transp;
+	};
+};
+
+int qmc_chan_set_param(struct qmc_chan *chan, const struct qmc_chan_param *param);
+
+int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length,
+			  void (*complete)(void *context), void *context);
+
+int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length,
+			 void (*complete)(void *context, size_t length),
+			 void *context);
+
+#define QMC_CHAN_READ  (1<<0)
+#define QMC_CHAN_WRITE (1<<1)
+#define QMC_CHAN_ALL   (QMC_CHAN_READ | QMC_CHAN_WRITE)
+
+int qmc_chan_start(struct qmc_chan *chan, int direction);
+int qmc_chan_stop(struct qmc_chan *chan, int direction);
+int qmc_chan_reset(struct qmc_chan *chan, int direction);
+
+#endif /* __SOC_FSL_QMC_H__ */
-- 
cgit v1.2.3


From 4d60cac951fd2dfbf261b83abb805748abc8b995 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <william.gray@linaro.org>
Date: Mon, 27 Feb 2023 11:45:22 -0500
Subject: regmap-irq: Add no_status support

Some devices lack status registers, yet expect to handle interrupts.
Introduce a no_status flag to indicate such a configuration, where
rather than read a status register to verify, all interrupts received
are assumed to be active.

Cc: Mark Brown <broonie@kernel.org>
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
Link: https://lore.kernel.org/r/bd501b4b5ff88da24d467f75e8c71b4e0e6f21e2.1677515341.git.william.gray@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 22 +++++++++++++++-------
 include/linux/regmap.h           |  2 ++
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 8c903b8c9714..f4d544ee7c30 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -433,7 +433,10 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 	 * possible in order to reduce the I/O overheads.
 	 */
 
-	if (chip->num_main_regs) {
+	if (chip->no_status) {
+		/* no status register so default to all active */
+		memset32(data->status_buf, GENMASK(31, 0), chip->num_regs);
+	} else if (chip->num_main_regs) {
 		unsigned int max_main_bits;
 		unsigned long size;
 
@@ -949,12 +952,17 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			continue;
 
 		/* Ack masked but set interrupts */
-		reg = d->get_irq_reg(d, d->chip->status_base, i);
-		ret = regmap_read(map, reg, &d->status_buf[i]);
-		if (ret != 0) {
-			dev_err(map->dev, "Failed to read IRQ status: %d\n",
-				ret);
-			goto err_alloc;
+		if (d->chip->no_status) {
+			/* no status register so default to all active */
+			d->status_buf[i] = GENMASK(31, 0);
+		} else {
+			reg = d->get_irq_reg(d, d->chip->status_base, i);
+			ret = regmap_read(map, reg, &d->status_buf[i]);
+			if (ret != 0) {
+				dev_err(map->dev, "Failed to read IRQ status: %d\n",
+					ret);
+				goto err_alloc;
+			}
 		}
 
 		if (chip->status_invert)
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4d10790adeb0..301286149643 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1567,6 +1567,7 @@ struct regmap_irq_chip_data;
  *		      the need for a @sub_reg_offsets table.
  * @status_invert: Inverted status register: cleared bits are active interrupts.
  * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
+ * @no_status: No status register: all interrupts assumed generated by device.
  *
  * @num_regs:    Number of registers in each control bank.
  * @irqs:        Descriptors for individual IRQs.  Interrupt numbers are
@@ -1631,6 +1632,7 @@ struct regmap_irq_chip {
 	unsigned int clear_on_unmask:1;
 	unsigned int not_fixed_stride:1;
 	unsigned int status_invert:1;
+	unsigned int no_status:1;
 
 	int num_regs;
 
-- 
cgit v1.2.3


From 9b400171a69d2487c3196cc3b6de60de3b08e1ee Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Feb 2023 17:33:34 +0200
Subject: regmap-irq: Place kernel doc of struct regmap_irq_chip in order

It seems that a couple of members got lost theirorder, put them back.
Besides that, split field descriptions into groups in the same way
as it's done in the structure definition.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230220153334.87049-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4d10790adeb0..1c777566fb7d 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1551,6 +1551,7 @@ struct regmap_irq_chip_data;
  * @use_ack:     Use @ack register even if it is zero.
  * @ack_invert:  Inverted ack register: cleared bits for ack.
  * @clear_ack:  Use this to set 1 and 0 or vice-versa to clear interrupts.
+ * @status_invert: Inverted status register: cleared bits are active interrupts.
  * @wake_invert: Inverted wake register: cleared bits are wake enabled.
  * @type_in_mask: Use the mask registers for controlling irq type. Use this if
  *		  the hardware provides separate bits for rising/falling edge
@@ -1560,18 +1561,19 @@ struct regmap_irq_chip_data;
  * @clear_on_unmask: For chips with interrupts cleared on read: read the status
  *                   registers before unmasking interrupts to clear any bits
  *                   set when they were masked.
+ * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
  * @not_fixed_stride: Used when chip peripherals are not laid out with fixed
  *		      stride. Must be used with sub_reg_offsets containing the
  *		      offsets to each peripheral. Deprecated; the same thing
  *		      can be accomplished with a @get_irq_reg callback, without
  *		      the need for a @sub_reg_offsets table.
- * @status_invert: Inverted status register: cleared bits are active interrupts.
- * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
  *
  * @num_regs:    Number of registers in each control bank.
+ *
  * @irqs:        Descriptors for individual IRQs.  Interrupt numbers are
  *               assigned based on the index in the array of the interrupt.
  * @num_irqs:    Number of descriptors.
+ *
  * @num_type_reg:    Number of type registers. Deprecated, use config registers
  *		     instead.
  * @num_virt_regs:   Number of non-standard irq configuration registers.
@@ -1579,6 +1581,7 @@ struct regmap_irq_chip_data;
  *		     instead.
  * @num_config_bases:	Number of config base registers.
  * @num_config_regs:	Number of config registers for each config base register.
+ *
  * @handle_pre_irq:  Driver specific callback to handle interrupt from device
  *		     before regmap_irq_handler process the interrupts.
  * @handle_post_irq: Driver specific callback to handle interrupt from device
@@ -1625,12 +1628,12 @@ struct regmap_irq_chip {
 	unsigned int use_ack:1;
 	unsigned int ack_invert:1;
 	unsigned int clear_ack:1;
+	unsigned int status_invert:1;
 	unsigned int wake_invert:1;
-	unsigned int runtime_pm:1;
 	unsigned int type_in_mask:1;
 	unsigned int clear_on_unmask:1;
+	unsigned int runtime_pm:1;
 	unsigned int not_fixed_stride:1;
-	unsigned int status_invert:1;
 
 	int num_regs;
 
-- 
cgit v1.2.3


From ae2ade4ba58167f165fbf3db19380f9b72c56db8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 1 Mar 2023 21:58:52 +0100
Subject: spi: Reorder fields in 'struct spi_message'

Group some variables based on their sizes to reduce hole and avoid padding.
On x86_64, this shrinks the size from 112 to 96 bytes.

This should have no real impact on memory allocation because 'struct
spi_message' is mostly used on stack, but it can save a few cycles
when the structure is initialized with spi_message_init() and co.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Link: https://lore.kernel.org/r/c112aad16eb47808e1ec10abd87b3d273c969a68.1677704283.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 4fa26b9a3572..bdb35a91b4bf 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1093,6 +1093,9 @@ struct spi_message {
 
 	unsigned		is_dma_mapped:1;
 
+	/* spi_prepare_message() was called for this message */
+	bool			prepared;
+
 	/* REVISIT:  we might want a flag affecting the behavior of the
 	 * last transfer ... allowing things like "read 16 bit length L"
 	 * immediately followed by "read L bytes".  Basically imposing
@@ -1105,11 +1108,11 @@ struct spi_message {
 	 */
 
 	/* Completion is reported through a callback */
+	int			status;
 	void			(*complete)(void *context);
 	void			*context;
 	unsigned		frame_length;
 	unsigned		actual_length;
-	int			status;
 
 	/* For optional use by whatever driver currently owns the
 	 * spi_message ...  between calls to spi_async and then later
@@ -1120,9 +1123,6 @@ struct spi_message {
 
 	/* List of spi_res reources when the spi message is processed */
 	struct list_head        resources;
-
-	/* spi_prepare_message() was called for this message */
-	bool			prepared;
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
-- 
cgit v1.2.3


From 0aaf08de8426f823bd0e36797445222e6392e374 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Jan 2023 19:22:22 -0500
Subject: __blockdev_direct_IO(): get rid of submit_io callback

always NULL...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/direct-io.c     | 9 ++-------
 fs/ocfs2/aops.c    | 2 +-
 include/linux/fs.h | 4 ++--
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index ab0d7ea89813..0b380bb8a81e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -86,7 +86,6 @@ struct dio_submit {
 	sector_t final_block_in_request;/* doesn't change */
 	int boundary;			/* prev block is at a boundary */
 	get_block_t *get_block;		/* block mapping function */
-	dio_submit_t *submit_io;	/* IO submition function */
 
 	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
@@ -431,10 +430,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 
 	dio->bio_disk = bio->bi_bdev->bd_disk;
 
-	if (sdio->submit_io)
-		sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
-	else
-		submit_bio(bio);
+	submit_bio(bio);
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
@@ -1098,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio)
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		struct block_device *bdev, struct iov_iter *iter,
 		get_block_t get_block, dio_iodone_t end_io,
-		dio_submit_t submit_io, int flags)
+		int flags)
 {
 	unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
 	unsigned blkbits = i_blkbits;
@@ -1215,7 +1211,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 
 	sdio.get_block = get_block;
 	dio->end_io = end_io;
-	sdio.submit_io = submit_io;
 	sdio.final_block_in_bio = -1;
 	sdio.next_block_for_io = -1;
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1d65f6ef00ca..50448ba5fda8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2448,7 +2448,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
 				    iter, get_block,
-				    ocfs2_dio_end_io, NULL, 0);
+				    ocfs2_dio_end_io, 0);
 }
 
 const struct address_space_operations ocfs2_aops = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..b07d13e546d3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2778,7 +2778,7 @@ enum {
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
 			     get_block_t get_block,
-			     dio_iodone_t end_io, dio_submit_t submit_io,
+			     dio_iodone_t end_io,
 			     int flags);
 
 static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
@@ -2787,7 +2787,7 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 					 get_block_t get_block)
 {
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-			get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
+			get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
 }
 #endif
 
-- 
cgit v1.2.3


From 97ab4c116aae4d3756a37c4d8458d49756e4cf89 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 26 Sep 2022 18:18:42 -0500
Subject: ASoC: uapi: Replace zero-length arrays with __DECLARE_FLEX_ARRAY()
 helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero-length arrays are deprecated and we are moving towards adopting
C99 flexible-array members, instead. So, replace zero-length arrays
declarations in anonymous union with the new __DECLARE_FLEX_ARRAY()
helper macro.

This helper allows for flexible-array members in unions.

Link: https://github.com/KSPP/linux/issues/193
Link: https://github.com/KSPP/linux/issues/227
Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 include/uapi/sound/asoc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 9f35bedafcff..10851bca7174 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -222,9 +222,9 @@ struct snd_soc_tplg_vendor_array {
 	__le32 type;	/* SND_SOC_TPLG_TUPLE_TYPE_ */
 	__le32 num_elems;	/* number of elements in array */
 	union {
-		struct snd_soc_tplg_vendor_uuid_elem uuid[0];
-		struct snd_soc_tplg_vendor_value_elem value[0];
-		struct snd_soc_tplg_vendor_string_elem string[0];
+		__DECLARE_FLEX_ARRAY(struct snd_soc_tplg_vendor_uuid_elem, uuid);
+		__DECLARE_FLEX_ARRAY(struct snd_soc_tplg_vendor_value_elem, value);
+		__DECLARE_FLEX_ARRAY(struct snd_soc_tplg_vendor_string_elem, string);
 	};
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From 721da5cee9d43901105f5b8bd33fcb9101b12fc3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 23 Feb 2023 08:33:26 +0100
Subject: driver core: remove CONFIG_SYSFS_DEPRECATED and
 CONFIG_SYSFS_DEPRECATED_V2

CONFIG_SYSFS_DEPRECATED was added in commit 88a22c985e35
("CONFIG_SYSFS_DEPRECATED") in 2006 to allow systems with older versions
of some tools (i.e. Fedora 3's version of udev) to boot properly.  Four
years later, in 2010, the option was attempted to be removed as most of
userspace should have been fixed up properly by then, but some kernel
developers clung to those old systems and refused to update, so we added
CONFIG_SYSFS_DEPRECATED_V2 in commit e52eec13cd6b ("SYSFS: Allow boot
time switching between deprecated and modern sysfs layout") to allow
them to continue to boot properly, and we allowed a boot time parameter
to be used to switch back to the old format if needed.

Over time, the logic that was covered under these config options was
slowly removed from individual driver subsystems successfully, removed,
and the only thing that is now left in the kernel are some changes in
the block layer's representation in sysfs where real directories are
used instead of symlinks like normal.

Because the original changes were done to userspace tools in 2006, and
all distros that use those tools are long end-of-life, and older
non-udev-based systems do not care about the block layer's sysfs
representation, it is time to finally remove this old logic and the
config entries from the kernel.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: linux-block@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20230223073326.2073220-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  9 ------
 block/genhd.c                                   | 19 +++++--------
 drivers/base/class.c                            |  2 +-
 drivers/base/core.c                             | 37 ------------------------
 include/linux/device.h                          |  6 ----
 init/Kconfig                                    | 38 -------------------------
 6 files changed, 8 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6221a1d057dd..207ca1d6e3d3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6130,15 +6130,6 @@
 			later by a loaded module cannot be set this way.
 			Example: sysctl.vm.swappiness=40
 
-	sysfs.deprecated=0|1 [KNL]
-			Enable/disable old style sysfs layout for old udev
-			on older distributions. When this option is enabled
-			very new udev will not work anymore. When this option
-			is disabled (or CONFIG_SYSFS_DEPRECATED not compiled)
-			in older udev will not work anymore.
-			Default depends on CONFIG_SYSFS_DEPRECATED_V2 set in
-			the kernel configuration.
-
 	sysrq_always_enabled
 			[KNL]
 			Ignore sysrq setting - this boot parameter will
diff --git a/block/genhd.c b/block/genhd.c
index 3ee5577e1586..e1e1230b1b9f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -466,12 +466,10 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 	if (ret)
 		goto out_device_del;
 
-	if (!sysfs_deprecated) {
-		ret = sysfs_create_link(block_depr, &ddev->kobj,
-					kobject_name(&ddev->kobj));
-		if (ret)
-			goto out_device_del;
-	}
+	ret = sysfs_create_link(block_depr, &ddev->kobj,
+				kobject_name(&ddev->kobj));
+	if (ret)
+		goto out_device_del;
 
 	/*
 	 * avoid probable deadlock caused by allocating memory with
@@ -554,8 +552,7 @@ out_put_holder_dir:
 out_del_integrity:
 	blk_integrity_del(disk);
 out_del_block_link:
-	if (!sysfs_deprecated)
-		sysfs_remove_link(block_depr, dev_name(ddev));
+	sysfs_remove_link(block_depr, dev_name(ddev));
 out_device_del:
 	device_del(ddev);
 out_free_ext_minor:
@@ -657,8 +654,7 @@ void del_gendisk(struct gendisk *disk)
 
 	part_stat_set_all(disk->part0, 0);
 	disk->part0->bd_stamp = 0;
-	if (!sysfs_deprecated)
-		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 	device_del(disk_to_dev(disk));
 
@@ -912,8 +908,7 @@ static int __init genhd_device_init(void)
 	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
 
 	/* create top-level block dir */
-	if (!sysfs_deprecated)
-		block_depr = kobject_create_and_add("block", NULL);
+	block_depr = kobject_create_and_add("block", NULL);
 	return 0;
 }
 
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 2373b3e210d8..fb8f2a1e1c19 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -180,7 +180,7 @@ int __class_register(struct class *cls, struct lock_class_key *key)
 
 #if defined(CONFIG_BLOCK)
 	/* let the block class directory show up in the root of sysfs */
-	if (!sysfs_deprecated || cls != &block_class)
+	if (cls != &block_class)
 		cp->subsys.kobj.kset = class_kset;
 #else
 	cp->subsys.kobj.kset = class_kset;
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 6878dfcbf0d6..ba11b51ce4ee 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -36,19 +36,6 @@
 #include "physical_location.h"
 #include "power/power.h"
 
-#ifdef CONFIG_SYSFS_DEPRECATED
-#ifdef CONFIG_SYSFS_DEPRECATED_V2
-long sysfs_deprecated = 1;
-#else
-long sysfs_deprecated = 0;
-#endif
-static int __init sysfs_deprecated_setup(char *arg)
-{
-	return kstrtol(arg, 10, &sysfs_deprecated);
-}
-early_param("sysfs.deprecated", sysfs_deprecated_setup);
-#endif
-
 /* Device links support. */
 static LIST_HEAD(deferred_sync);
 static unsigned int defer_sync_state_count = 1;
@@ -3137,15 +3124,6 @@ static struct kobject *get_device_parent(struct device *dev,
 		struct kobject *parent_kobj;
 		struct kobject *k;
 
-#ifdef CONFIG_BLOCK
-		/* block disks show up in /sys/block */
-		if (sysfs_deprecated && dev->class == &block_class) {
-			if (parent && parent->class == &block_class)
-				return &parent->kobj;
-			return &block_class.p->subsys.kobj;
-		}
-#endif
-
 		/*
 		 * If we have no parent, we live in "virtual".
 		 * Class-devices with a non class-device as parent, live
@@ -3324,12 +3302,6 @@ static int device_add_class_symlinks(struct device *dev)
 			goto out_subsys;
 	}
 
-#ifdef CONFIG_BLOCK
-	/* /sys/block has directories and does not need symlinks */
-	if (sysfs_deprecated && dev->class == &block_class)
-		return 0;
-#endif
-
 	/* link in the class directory pointing to the device */
 	error = sysfs_create_link(&dev->class->p->subsys.kobj,
 				  &dev->kobj, dev_name(dev));
@@ -3359,10 +3331,6 @@ static void device_remove_class_symlinks(struct device *dev)
 	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
 	sysfs_remove_link(&dev->kobj, "subsystem");
-#ifdef CONFIG_BLOCK
-	if (sysfs_deprecated && dev->class == &block_class)
-		return;
-#endif
 	sysfs_delete_link(&dev->class->p->subsys.kobj, &dev->kobj, dev_name(dev));
 }
 
@@ -4652,11 +4620,6 @@ int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
 	if (error)
 		goto out;
 
-#ifdef CONFIG_BLOCK
-	if (sysfs_deprecated && dev->class == &block_class)
-		goto out;
-#endif
-
 	/*
 	 * Change the owner of the symlink located in the class directory of
 	 * the device class associated with @dev which points to the actual
diff --git a/include/linux/device.h b/include/linux/device.h
index 1508e637bb26..19b6ba478fbf 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1092,10 +1092,4 @@ int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);
 #define MODULE_ALIAS_CHARDEV_MAJOR(major) \
 	MODULE_ALIAS("char-major-" __stringify(major) "-*")
 
-#ifdef CONFIG_SYSFS_DEPRECATED
-extern long sysfs_deprecated;
-#else
-#define sysfs_deprecated 0
-#endif
-
 #endif /* _DEVICE_H_ */
diff --git a/init/Kconfig b/init/Kconfig
index 1fb5f313d18f..4464ef98ea0b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1297,44 +1297,6 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
-config SYSFS_DEPRECATED
-	bool "Enable deprecated sysfs features to support old userspace tools"
-	depends on SYSFS
-	default n
-	help
-	  This option adds code that switches the layout of the "block" class
-	  devices, to not show up in /sys/class/block/, but only in
-	  /sys/block/.
-
-	  This switch is only active when the sysfs.deprecated=1 boot option is
-	  passed or the SYSFS_DEPRECATED_V2 option is set.
-
-	  This option allows new kernels to run on old distributions and tools,
-	  which might get confused by /sys/class/block/. Since 2007/2008 all
-	  major distributions and tools handle this just fine.
-
-	  Recent distributions and userspace tools after 2009/2010 depend on
-	  the existence of /sys/class/block/, and will not work with this
-	  option enabled.
-
-	  Only if you are using a new kernel on an old distribution, you might
-	  need to say Y here.
-
-config SYSFS_DEPRECATED_V2
-	bool "Enable deprecated sysfs features by default"
-	default n
-	depends on SYSFS
-	depends on SYSFS_DEPRECATED
-	help
-	  Enable deprecated sysfs by default.
-
-	  See the CONFIG_SYSFS_DEPRECATED option for more details about this
-	  option.
-
-	  Only if you are using a new kernel on an old distribution, you might
-	  need to say Y here. Even then, odds are you would not need it
-	  enabled, you can always pass the boot option if absolutely necessary.
-
 config RELAY
 	bool "Kernel->user space relay support (formerly relayfs)"
 	select IRQ_WORK
-- 
cgit v1.2.3


From f2620f166e2a4db08f016b7b30b904ab28c265e4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 1 Feb 2023 14:14:52 +0100
Subject: xattr: simplify listxattr helpers

The generic_listxattr() and simple_xattr_list() helpers list xattrs and
contain duplicated code. Add two helpers that both generic_listxattr()
and simple_xattr_list() can use.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/posix_acl.c            | 25 +++++++++++++
 fs/xattr.c                | 89 ++++++++++++++++++-----------------------------
 include/linux/posix_acl.h |  7 ++++
 include/linux/xattr.h     |  1 +
 4 files changed, 66 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5a76fb35923a..b350e8396adb 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -957,6 +957,31 @@ set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 }
 EXPORT_SYMBOL(set_posix_acl);
 
+int posix_acl_listxattr(struct inode *inode, char **buffer,
+			ssize_t *remaining_size)
+{
+	int err;
+
+	if (!IS_POSIXACL(inode))
+		return 0;
+
+	if (inode->i_acl) {
+		err = xattr_list_one(buffer, remaining_size,
+				     XATTR_NAME_POSIX_ACL_ACCESS);
+		if (err)
+			return err;
+	}
+
+	if (inode->i_default_acl) {
+		err = xattr_list_one(buffer, remaining_size,
+				     XATTR_NAME_POSIX_ACL_DEFAULT);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static bool
 posix_acl_xattr_list(struct dentry *dentry)
 {
diff --git a/fs/xattr.c b/fs/xattr.c
index 14a7eb3c8fa8..1a35235ecc73 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -949,6 +949,21 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 	return error;
 }
 
+int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
+{
+	size_t len;
+
+	len = strlen(name) + 1;
+	if (*buffer) {
+		if (*remaining_size < len)
+			return -ERANGE;
+		memcpy(*buffer, name, len);
+		*buffer += len;
+	}
+	*remaining_size -= len;
+	return 0;
+}
+
 /*
  * Combine the results of the list() operation from every xattr_handler in the
  * list.
@@ -957,33 +972,22 @@ ssize_t
 generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
 	const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
-	unsigned int size = 0;
-
-	if (!buffer) {
-		for_each_xattr_handler(handlers, handler) {
-			if (!handler->name ||
-			    (handler->list && !handler->list(dentry)))
-				continue;
-			size += strlen(handler->name) + 1;
-		}
-	} else {
-		char *buf = buffer;
-		size_t len;
-
-		for_each_xattr_handler(handlers, handler) {
-			if (!handler->name ||
-			    (handler->list && !handler->list(dentry)))
-				continue;
-			len = strlen(handler->name);
-			if (len + 1 > buffer_size)
-				return -ERANGE;
-			memcpy(buf, handler->name, len + 1);
-			buf += len + 1;
-			buffer_size -= len + 1;
-		}
-		size = buf - buffer;
+	ssize_t remaining_size = buffer_size;
+	int err = 0;
+
+	err = posix_acl_listxattr(d_inode(dentry), &buffer, &remaining_size);
+	if (err)
+		return err;
+
+	for_each_xattr_handler(handlers, handler) {
+		if (!handler->name || (handler->list && !handler->list(dentry)))
+			continue;
+		err = xattr_list_one(&buffer, &remaining_size, handler->name);
+		if (err)
+			return err;
 	}
-	return size;
+
+	return err ? err : buffer_size - remaining_size;
 }
 EXPORT_SYMBOL(generic_listxattr);
 
@@ -1245,20 +1249,6 @@ static bool xattr_is_trusted(const char *name)
 	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
-static int xattr_list_one(char **buffer, ssize_t *remaining_size,
-			  const char *name)
-{
-	size_t len = strlen(name) + 1;
-	if (*buffer) {
-		if (*remaining_size < len)
-			return -ERANGE;
-		memcpy(*buffer, name, len);
-		*buffer += len;
-	}
-	*remaining_size -= len;
-	return 0;
-}
-
 /**
  * simple_xattr_list - list all xattr objects
  * @inode: inode from which to get the xattrs
@@ -1287,22 +1277,9 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
 	ssize_t remaining_size = size;
 	int err = 0;
 
-#ifdef CONFIG_FS_POSIX_ACL
-	if (IS_POSIXACL(inode)) {
-		if (inode->i_acl) {
-			err = xattr_list_one(&buffer, &remaining_size,
-					     XATTR_NAME_POSIX_ACL_ACCESS);
-			if (err)
-				return err;
-		}
-		if (inode->i_default_acl) {
-			err = xattr_list_one(&buffer, &remaining_size,
-					     XATTR_NAME_POSIX_ACL_DEFAULT);
-			if (err)
-				return err;
-		}
-	}
-#endif
+	err = posix_acl_listxattr(inode, &buffer, &remaining_size);
+	if (err)
+		return err;
 
 	read_lock(&xattrs->lock);
 	for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 21cc29b8a9e8..0e65b3d634d9 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -106,6 +106,8 @@ struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
 			      struct dentry *dentry, const char *acl_name);
 int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 		   const char *acl_name);
+int posix_acl_listxattr(struct inode *inode, char **buffer,
+			ssize_t *remaining_size);
 #else
 static inline int posix_acl_chmod(struct mnt_idmap *idmap,
 				  struct dentry *dentry, umode_t mode)
@@ -153,6 +155,11 @@ static inline int vfs_remove_acl(struct mnt_idmap *idmap,
 {
 	return -EOPNOTSUPP;
 }
+static inline int posix_acl_listxattr(struct inode *inode, char **buffer,
+				      ssize_t *remaining_size)
+{
+	return 0;
+}
 #endif /* CONFIG_FS_POSIX_ACL */
 
 struct posix_acl *get_inode_acl(struct inode *inode, int type);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 6af72461397d..ce8fb0bd56c6 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -109,5 +109,6 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
 			  char *buffer, size_t size);
 void simple_xattr_add(struct simple_xattrs *xattrs,
 		      struct simple_xattr *new_xattr);
+int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name);
 
 #endif	/* _LINUX_XATTR_H */
-- 
cgit v1.2.3


From 2db8a948046cab3a2f707561592906a3d096972f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 1 Feb 2023 14:14:53 +0100
Subject: xattr: add listxattr helper

Add a tiny helper to determine whether an xattr handler given a specific
dentry supports listing the requested xattr. We will use this helper in
various filesystems in later commits.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/xattr.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index ce8fb0bd56c6..7adf768d4396 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -47,6 +47,22 @@ struct xattr_handler {
 		   size_t size, int flags);
 };
 
+/**
+ * xattr_handler_can_list - check whether xattr can be listed
+ * @handler: handler for this type of xattr
+ * @dentry: dentry whose inode xattr to list
+ *
+ * Determine whether the xattr associated with @dentry can be listed given
+ * @handler.
+ *
+ * Return: true if xattr can be listed, false if not.
+ */
+static inline bool xattr_handler_can_list(const struct xattr_handler *handler,
+					  struct dentry *dentry)
+{
+	return handler && (!handler->list || handler->list(dentry));
+}
+
 const char *xattr_full_name(const struct xattr_handler *, const char *);
 
 struct xattr {
-- 
cgit v1.2.3


From 831be973aa21d1cf8948bf4b1d4e73e6d5d028c0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 1 Feb 2023 14:14:54 +0100
Subject: xattr: remove unused argument

his helpers is really just used to check for user.* xattr support so
don't make it pointlessly generic.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/nfsd/nfs4xdr.c     |  3 +--
 fs/xattr.c            | 10 ++++------
 include/linux/xattr.h |  2 +-
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e12e5a4ad502..e30c0746305b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3444,8 +3444,7 @@ out_acl:
 		p = xdr_reserve_space(xdr, 4);
 		if (!p)
 			goto out_resource;
-		err = xattr_supported_namespace(d_inode(dentry),
-						XATTR_USER_PREFIX);
+		err = xattr_supports_user_prefix(d_inode(dentry));
 		*p++ = cpu_to_be32(err == 0);
 	}
 
diff --git a/fs/xattr.c b/fs/xattr.c
index 1a35235ecc73..95bb6b30ab13 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -160,11 +160,10 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
  * Look for any handler that deals with the specified namespace.
  */
 int
-xattr_supported_namespace(struct inode *inode, const char *prefix)
+xattr_supports_user_prefix(struct inode *inode)
 {
 	const struct xattr_handler **handlers = inode->i_sb->s_xattr;
 	const struct xattr_handler *handler;
-	size_t preflen;
 
 	if (!(inode->i_opflags & IOP_XATTR)) {
 		if (unlikely(is_bad_inode(inode)))
@@ -172,16 +171,15 @@ xattr_supported_namespace(struct inode *inode, const char *prefix)
 		return -EOPNOTSUPP;
 	}
 
-	preflen = strlen(prefix);
-
 	for_each_xattr_handler(handlers, handler) {
-		if (!strncmp(xattr_prefix(handler), prefix, preflen))
+		if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX,
+			     XATTR_USER_PREFIX_LEN))
 			return 0;
 	}
 
 	return -EOPNOTSUPP;
 }
-EXPORT_SYMBOL(xattr_supported_namespace);
+EXPORT_SYMBOL(xattr_supports_user_prefix);
 
 int
 __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 7adf768d4396..d591ef59aa98 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -94,7 +94,7 @@ int vfs_getxattr_alloc(struct mnt_idmap *idmap,
 		       struct dentry *dentry, const char *name,
 		       char **xattr_value, size_t size, gfp_t flags);
 
-int xattr_supported_namespace(struct inode *inode, const char *prefix);
+int xattr_supports_user_prefix(struct inode *inode);
 
 static inline const char *xattr_prefix(const struct xattr_handler *handler)
 {
-- 
cgit v1.2.3


From d549b741740e63e87e661754e2d1b336fdc51d50 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 1 Feb 2023 14:14:58 +0100
Subject: fs: rename generic posix acl handlers

Reflect in their naming and document that they are kept around for
legacy reasons and shouldn't be used anymore by new code.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/erofs/xattr.h                |  6 ++----
 fs/ext2/xattr.c                 |  4 ++--
 fs/ext4/xattr.c                 |  4 ++--
 fs/f2fs/xattr.c                 |  4 ++--
 fs/jffs2/xattr.c                |  4 ++--
 fs/ocfs2/xattr.c                | 12 +++++-------
 fs/posix_acl.c                  | 24 ++++++++++++++++++------
 include/linux/posix_acl_xattr.h |  5 +++--
 8 files changed, 36 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 08658e414c33..97185cb649b6 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -49,10 +49,8 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
 	static const struct xattr_handler *xattr_handler_map[] = {
 		[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
-		[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] =
-			&posix_acl_access_xattr_handler,
-		[EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
-			&posix_acl_default_xattr_handler,
+		[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+		[EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
 #endif
 		[EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
 #ifdef CONFIG_EROFS_FS_SECURITY
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 958976f809f5..b126af5f8b15 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,8 +101,8 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
-	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
-	[EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &nop_posix_acl_access,
+	[EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
 #endif
 	[EXT2_XATTR_INDEX_TRUSTED]	     = &ext2_xattr_trusted_handler,
 #ifdef CONFIG_EXT2_FS_SECURITY
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 09a9fade2053..839020b7a222 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -88,8 +88,8 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *);
 static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
-	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &nop_posix_acl_access,
+	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SECURITY
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index c8a265966a6c..213805d3592c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -192,8 +192,8 @@ const struct xattr_handler f2fs_xattr_security_handler = {
 static const struct xattr_handler *f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
-	[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
-	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+	[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
 #endif
 	[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
 #ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 1189a70d2007..aa4048a27f31 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -939,10 +939,10 @@ static const char *jffs2_xattr_prefix(int xprefix, struct dentry *dentry)
 #endif
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 	case JFFS2_XPREFIX_ACL_ACCESS:
-		ret = &posix_acl_access_xattr_handler;
+		ret = &nop_posix_acl_access;
 		break;
 	case JFFS2_XPREFIX_ACL_DEFAULT:
-		ret = &posix_acl_default_xattr_handler;
+		ret = &nop_posix_acl_default;
 		break;
 #endif
 	case JFFS2_XPREFIX_TRUSTED:
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 10dab2a7b2e2..83881bb4eace 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -95,13 +95,11 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = {
 };
 
 static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
-	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
-	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
-					= &posix_acl_access_xattr_handler,
-	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
-					= &posix_acl_default_xattr_handler,
-	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
-	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
+	[OCFS2_XATTR_INDEX_USER]		= &ocfs2_xattr_user_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]	= &nop_posix_acl_access,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &nop_posix_acl_default,
+	[OCFS2_XATTR_INDEX_TRUSTED]		= &ocfs2_xattr_trusted_handler,
+	[OCFS2_XATTR_INDEX_SECURITY]		= &ocfs2_xattr_security_handler,
 };
 
 struct ocfs2_xattr_info {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index b350e8396adb..e262c428f869 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -988,19 +988,31 @@ posix_acl_xattr_list(struct dentry *dentry)
 	return IS_POSIXACL(d_backing_inode(dentry));
 }
 
-const struct xattr_handler posix_acl_access_xattr_handler = {
+/*
+ * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs
+ *
+ * This is the legacy POSIX ACL access xattr handler. It is used by some
+ * filesystems to implement their ->listxattr() inode operation. New code
+ * should never use them.
+ */
+const struct xattr_handler nop_posix_acl_access = {
 	.name = XATTR_NAME_POSIX_ACL_ACCESS,
-	.flags = ACL_TYPE_ACCESS,
 	.list = posix_acl_xattr_list,
 };
-EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
+EXPORT_SYMBOL_GPL(nop_posix_acl_access);
 
-const struct xattr_handler posix_acl_default_xattr_handler = {
+/*
+ * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs
+ *
+ * This is the legacy POSIX ACL default xattr handler. It is used by some
+ * filesystems to implement their ->listxattr() inode operation. New code
+ * should never use them.
+ */
+const struct xattr_handler nop_posix_acl_default = {
 	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
-	.flags = ACL_TYPE_DEFAULT,
 	.list = posix_acl_xattr_list,
 };
-EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
+EXPORT_SYMBOL_GPL(nop_posix_acl_default);
 
 int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 		   struct posix_acl *acl, int type)
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 54cd7a14330d..e86f3b731da2 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -68,7 +68,8 @@ static inline int posix_acl_type(const char *name)
 	return -1;
 }
 
-extern const struct xattr_handler posix_acl_access_xattr_handler;
-extern const struct xattr_handler posix_acl_default_xattr_handler;
+/* These are legacy handlers. Don't use them for new code. */
+extern const struct xattr_handler nop_posix_acl_access;
+extern const struct xattr_handler nop_posix_acl_default;
 
 #endif	/* _POSIX_ACL_XATTR_H */
-- 
cgit v1.2.3


From 21d9526d13b5467b0a6532e5a8b0eb04c01ce326 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 3 Jul 2020 15:04:51 +0200
Subject: gpiolib: Make the legacy <linux/gpio.h> consumer-only

The legacy <linux/gpio.h> header was an all-inclusive header used
by drivers and consumers alike. After eliminating the last users
of the driver defines, we can drop the inclusion of the
<linux/gpio/driver.h> header.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/asm-generic/gpio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 22cb8c9efc1d..1c910d124423 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -8,7 +8,6 @@
 #ifdef CONFIG_GPIOLIB
 
 #include <linux/compiler.h>
-#include <linux/gpio/driver.h>
 #include <linux/gpio/consumer.h>
 
 /*
-- 
cgit v1.2.3


From 94d20f7d674d589e94aea77e2cbe37fc58541d04 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 7 Feb 2023 16:29:42 +0200
Subject: gpiolib: coldfire: remove custom asm/gpio.h

Now that coldfire is the only user of a custom asm/gpio.h, it seems
better to remove this as well, and have the same interface everywhere.

For the gpio_get_value()/gpio_set_value()/gpio_to_irq(), gpio_cansleep()
functions, the custom version is only a micro-optimization to inline the
function for constant GPIO numbers. However, in the coldfire defconfigs,
I was unable to find a single instance where this micro-optimization
was even used, and according to Geert the only user appears to be the
QSPI chip that is disabled everywhere.

The custom gpio_request_one() function is even less useful, as it is
guarded by an #ifdef that is never true.

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/m68k/Kconfig.cpu        |  1 -
 arch/m68k/include/asm/gpio.h | 95 --------------------------------------------
 drivers/gpio/Kconfig         |  8 ----
 include/linux/gpio.h         |  7 ----
 4 files changed, 111 deletions(-)
 delete mode 100644 arch/m68k/include/asm/gpio.h

(limited to 'include')

diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 9380f6e3bb66..96a0fb4f1af5 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -24,7 +24,6 @@ config M68KCLASSIC
 
 config COLDFIRE
 	bool "Coldfire CPU family support"
-	select ARCH_HAVE_CUSTOM_GPIO_H
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_CAS
 	select CPU_HAS_NO_MULDIV64
diff --git a/arch/m68k/include/asm/gpio.h b/arch/m68k/include/asm/gpio.h
deleted file mode 100644
index 5cfc0996ba94..000000000000
--- a/arch/m68k/include/asm/gpio.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Coldfire generic GPIO support
- *
- * (C) Copyright 2009, Steven King <sfking@fdwdc.com>
-*/
-
-#ifndef coldfire_gpio_h
-#define coldfire_gpio_h
-
-#include <linux/io.h>
-#include <asm/coldfire.h>
-#include <asm/mcfsim.h>
-#include <asm/mcfgpio.h>
-/*
- * The Generic GPIO functions
- *
- * If the gpio is a compile time constant and is one of the Coldfire gpios,
- * use the inline version, otherwise dispatch thru gpiolib.
- */
-
-static inline int gpio_get_value(unsigned gpio)
-{
-	if (__builtin_constant_p(gpio) && gpio < MCFGPIO_PIN_MAX)
-		return mcfgpio_read(__mcfgpio_ppdr(gpio)) & mcfgpio_bit(gpio);
-	else
-		return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-	if (__builtin_constant_p(gpio) && gpio < MCFGPIO_PIN_MAX) {
-		if (gpio < MCFGPIO_SCR_START) {
-			unsigned long flags;
-			MCFGPIO_PORTTYPE data;
-
-			local_irq_save(flags);
-			data = mcfgpio_read(__mcfgpio_podr(gpio));
-			if (value)
-				data |= mcfgpio_bit(gpio);
-			else
-				data &= ~mcfgpio_bit(gpio);
-			mcfgpio_write(data, __mcfgpio_podr(gpio));
-			local_irq_restore(flags);
-		} else {
-			if (value)
-				mcfgpio_write(mcfgpio_bit(gpio),
-						MCFGPIO_SETR_PORT(gpio));
-			else
-				mcfgpio_write(~mcfgpio_bit(gpio),
-						MCFGPIO_CLRR_PORT(gpio));
-		}
-	} else
-		__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_to_irq(unsigned gpio)
-{
-#if defined(MCFGPIO_IRQ_MIN)
-	if ((gpio >= MCFGPIO_IRQ_MIN) && (gpio < MCFGPIO_IRQ_MAX))
-#else
-	if (gpio < MCFGPIO_IRQ_MAX)
-#endif
-		return gpio + MCFGPIO_IRQ_VECBASE;
-	else
-		return __gpio_to_irq(gpio);
-}
-
-static inline int gpio_cansleep(unsigned gpio)
-{
-	return gpio < MCFGPIO_PIN_MAX ? 0 : __gpio_cansleep(gpio);
-}
-
-#ifndef CONFIG_GPIOLIB
-static inline int gpio_request_one(unsigned gpio, unsigned long flags, const char *label)
-{
-	int err;
-
-	err = gpio_request(gpio, label);
-	if (err)
-		return err;
-
-	if (flags & GPIOF_DIR_IN)
-		err = gpio_direction_input(gpio);
-	else
-		err = gpio_direction_output(gpio,
-			(flags & GPIOF_INIT_HIGH) ? 1 : 0);
-
-	if (err)
-		gpio_free(gpio);
-
-	return err;
-}
-#endif /* !CONFIG_GPIOLIB */
-#endif
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 13be729710f2..df82fb7eb0eb 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -3,14 +3,6 @@
 # GPIO infrastructure and drivers
 #
 
-config ARCH_HAVE_CUSTOM_GPIO_H
-	bool
-	help
-	  Selecting this config option from the architecture Kconfig allows
-	  the architecture to provide a custom asm/gpio.h implementation
-	  overriding the default implementations.  New uses of this are
-	  strongly discouraged.
-
 menuconfig GPIOLIB
 	bool "GPIO Support"
 	help
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 85beb236c925..2b75017b3aad 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -54,11 +54,6 @@ struct gpio {
 };
 
 #ifdef CONFIG_GPIOLIB
-
-#ifdef CONFIG_ARCH_HAVE_CUSTOM_GPIO_H
-#include <asm/gpio.h>
-#else
-
 #include <asm-generic/gpio.h>
 
 static inline int gpio_get_value(unsigned int gpio)
@@ -81,8 +76,6 @@ static inline int gpio_to_irq(unsigned int gpio)
 	return __gpio_to_irq(gpio);
 }
 
-#endif /* ! CONFIG_ARCH_HAVE_CUSTOM_GPIO_H */
-
 /* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */
 
 struct device;
-- 
cgit v1.2.3


From eccb7a00613c804ec7244676090bf6ee43a23da2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Feb 2023 17:37:22 +0200
Subject: gpiolib: remove asm-generic/gpio.h

The asm-generic/gpio.h file is now always included when
using gpiolib, so just move its contents into linux/gpio.h
with a few minor simplifications.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 MAINTAINERS                     |   1 -
 arch/m68k/include/asm/mcfgpio.h |   2 +-
 drivers/gpio/gpio-davinci.c     |   2 -
 drivers/pinctrl/core.c          |   1 -
 include/asm-generic/gpio.h      | 146 ----------------------------------------
 include/linux/gpio.h            |  94 +++++++++++++++++++++++---
 6 files changed, 85 insertions(+), 161 deletions(-)
 delete mode 100644 include/asm-generic/gpio.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d5bc223f305..ddf686c2e546 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8752,7 +8752,6 @@ F:	Documentation/admin-guide/gpio/
 F:	Documentation/devicetree/bindings/gpio/
 F:	Documentation/driver-api/gpio/
 F:	drivers/gpio/
-F:	include/asm-generic/gpio.h
 F:	include/dt-bindings/gpio/
 F:	include/linux/gpio.h
 F:	include/linux/gpio/
diff --git a/arch/m68k/include/asm/mcfgpio.h b/arch/m68k/include/asm/mcfgpio.h
index 27f32cc81da6..2cefe8445980 100644
--- a/arch/m68k/include/asm/mcfgpio.h
+++ b/arch/m68k/include/asm/mcfgpio.h
@@ -9,7 +9,7 @@
 #define mcfgpio_h
 
 #ifdef CONFIG_GPIOLIB
-#include <asm-generic/gpio.h>
+#include <linux/gpio.h>
 #else
 
 int __mcfgpio_get_value(unsigned gpio);
diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 26b1f7465e09..7fc83057990a 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -24,8 +24,6 @@
 #include <linux/spinlock.h>
 #include <linux/pm_runtime.h>
 
-#include <asm-generic/gpio.h>
-
 #define MAX_REGS_BANKS 5
 #define MAX_INT_PER_BANK 32
 
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index d6e6c751255f..401886c81344 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -30,7 +30,6 @@
 
 #ifdef CONFIG_GPIOLIB
 #include "../gpio/gpiolib.h"
-#include <asm-generic/gpio.h>
 #endif
 
 #include "core.h"
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
deleted file mode 100644
index 1c910d124423..000000000000
--- a/include/asm-generic/gpio.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_GENERIC_GPIO_H
-#define _ASM_GENERIC_GPIO_H
-
-#include <linux/types.h>
-#include <linux/errno.h>
-
-#ifdef CONFIG_GPIOLIB
-
-#include <linux/compiler.h>
-#include <linux/gpio/consumer.h>
-
-/*
- * Platforms may implement their GPIO interface with library code,
- * at a small performance cost for non-inlined operations and some
- * extra memory (for code and for per-GPIO table entries).
- */
-
-/*
- * At the end we want all GPIOs to be dynamically allocated from 0.
- * However, some legacy drivers still perform fixed allocation.
- * Until they are all fixed, leave 0-512 space for them.
- */
-#define GPIO_DYNAMIC_BASE	512
-
-struct device;
-struct gpio;
-struct seq_file;
-struct module;
-struct device_node;
-struct gpio_desc;
-
-/* Always use the library code for GPIO management calls,
- * or when sleeping may be involved.
- */
-extern int gpio_request(unsigned gpio, const char *label);
-extern void gpio_free(unsigned gpio);
-
-static inline int gpio_direction_input(unsigned gpio)
-{
-	return gpiod_direction_input(gpio_to_desc(gpio));
-}
-static inline int gpio_direction_output(unsigned gpio, int value)
-{
-	return gpiod_direction_output_raw(gpio_to_desc(gpio), value);
-}
-
-static inline int gpio_set_debounce(unsigned gpio, unsigned debounce)
-{
-	return gpiod_set_debounce(gpio_to_desc(gpio), debounce);
-}
-
-static inline int gpio_get_value_cansleep(unsigned gpio)
-{
-	return gpiod_get_raw_value_cansleep(gpio_to_desc(gpio));
-}
-static inline void gpio_set_value_cansleep(unsigned gpio, int value)
-{
-	return gpiod_set_raw_value_cansleep(gpio_to_desc(gpio), value);
-}
-
-
-/* A platform's <asm/gpio.h> code may want to inline the I/O calls when
- * the GPIO is constant and refers to some always-present controller,
- * giving direct access to chip registers and tight bitbanging loops.
- */
-static inline int __gpio_get_value(unsigned gpio)
-{
-	return gpiod_get_raw_value(gpio_to_desc(gpio));
-}
-static inline void __gpio_set_value(unsigned gpio, int value)
-{
-	return gpiod_set_raw_value(gpio_to_desc(gpio), value);
-}
-
-static inline int __gpio_cansleep(unsigned gpio)
-{
-	return gpiod_cansleep(gpio_to_desc(gpio));
-}
-
-static inline int __gpio_to_irq(unsigned gpio)
-{
-	return gpiod_to_irq(gpio_to_desc(gpio));
-}
-
-extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
-extern int gpio_request_array(const struct gpio *array, size_t num);
-extern void gpio_free_array(const struct gpio *array, size_t num);
-
-/*
- * A sysfs interface can be exported by individual drivers if they want,
- * but more typically is configured entirely from userspace.
- */
-static inline int gpio_export(unsigned gpio, bool direction_may_change)
-{
-	return gpiod_export(gpio_to_desc(gpio), direction_may_change);
-}
-
-static inline void gpio_unexport(unsigned gpio)
-{
-	gpiod_unexport(gpio_to_desc(gpio));
-}
-
-#else	/* !CONFIG_GPIOLIB */
-
-#include <linux/kernel.h>
-
-/* platforms that don't directly support access to GPIOs through I2C, SPI,
- * or other blocking infrastructure can use these wrappers.
- */
-
-static inline int gpio_cansleep(unsigned gpio)
-{
-	return 0;
-}
-
-static inline int gpio_get_value_cansleep(unsigned gpio)
-{
-	might_sleep();
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value_cansleep(unsigned gpio, int value)
-{
-	might_sleep();
-	__gpio_set_value(gpio, value);
-}
-
-#endif /* !CONFIG_GPIOLIB */
-
-/*
- * "valid" GPIO numbers are nonnegative and may be passed to
- * setup routines like gpio_request().  only some valid numbers
- * can successfully be requested and used.
- *
- * Invalid GPIO numbers are useful for indicating no-such-GPIO in
- * platform data and other tables.
- */
-
-static inline bool gpio_is_valid(int number)
-{
-	/* only non-negative numbers are valid */
-	return number >= 0;
-}
-
-#endif /* _ASM_GENERIC_GPIO_H */
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 2b75017b3aad..d5ce78e2bdd9 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -13,6 +13,7 @@
 #define __LINUX_GPIO_H
 
 #include <linux/errno.h>
+#include <linux/types.h>
 
 /* see Documentation/driver-api/gpio/legacy.rst */
 
@@ -54,26 +55,100 @@ struct gpio {
 };
 
 #ifdef CONFIG_GPIOLIB
-#include <asm-generic/gpio.h>
 
-static inline int gpio_get_value(unsigned int gpio)
+#include <linux/gpio/consumer.h>
+
+/*
+ * "valid" GPIO numbers are nonnegative and may be passed to
+ * setup routines like gpio_request().  Only some valid numbers
+ * can successfully be requested and used.
+ *
+ * Invalid GPIO numbers are useful for indicating no-such-GPIO in
+ * platform data and other tables.
+ */
+static inline bool gpio_is_valid(int number)
+{
+	/* only non-negative numbers are valid */
+	return number >= 0;
+}
+
+/*
+ * Platforms may implement their GPIO interface with library code,
+ * at a small performance cost for non-inlined operations and some
+ * extra memory (for code and for per-GPIO table entries).
+ */
+
+/*
+ * At the end we want all GPIOs to be dynamically allocated from 0.
+ * However, some legacy drivers still perform fixed allocation.
+ * Until they are all fixed, leave 0-512 space for them.
+ */
+#define GPIO_DYNAMIC_BASE	512
+
+/* Always use the library code for GPIO management calls,
+ * or when sleeping may be involved.
+ */
+int gpio_request(unsigned gpio, const char *label);
+void gpio_free(unsigned gpio);
+
+static inline int gpio_direction_input(unsigned gpio)
+{
+	return gpiod_direction_input(gpio_to_desc(gpio));
+}
+static inline int gpio_direction_output(unsigned gpio, int value)
+{
+	return gpiod_direction_output_raw(gpio_to_desc(gpio), value);
+}
+
+static inline int gpio_set_debounce(unsigned gpio, unsigned debounce)
+{
+	return gpiod_set_debounce(gpio_to_desc(gpio), debounce);
+}
+
+static inline int gpio_get_value_cansleep(unsigned gpio)
+{
+	return gpiod_get_raw_value_cansleep(gpio_to_desc(gpio));
+}
+static inline void gpio_set_value_cansleep(unsigned gpio, int value)
 {
-	return __gpio_get_value(gpio);
+	return gpiod_set_raw_value_cansleep(gpio_to_desc(gpio), value);
 }
 
-static inline void gpio_set_value(unsigned int gpio, int value)
+static inline int gpio_get_value(unsigned gpio)
+{
+	return gpiod_get_raw_value(gpio_to_desc(gpio));
+}
+static inline void gpio_set_value(unsigned gpio, int value)
 {
-	__gpio_set_value(gpio, value);
+	return gpiod_set_raw_value(gpio_to_desc(gpio), value);
 }
 
-static inline int gpio_cansleep(unsigned int gpio)
+static inline int gpio_cansleep(unsigned gpio)
 {
-	return __gpio_cansleep(gpio);
+	return gpiod_cansleep(gpio_to_desc(gpio));
 }
 
-static inline int gpio_to_irq(unsigned int gpio)
+static inline int gpio_to_irq(unsigned gpio)
 {
-	return __gpio_to_irq(gpio);
+	return gpiod_to_irq(gpio_to_desc(gpio));
+}
+
+int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+int gpio_request_array(const struct gpio *array, size_t num);
+void gpio_free_array(const struct gpio *array, size_t num);
+
+/*
+ * A sysfs interface can be exported by individual drivers if they want,
+ * but more typically is configured entirely from userspace.
+ */
+static inline int gpio_export(unsigned gpio, bool direction_may_change)
+{
+	return gpiod_export(gpio_to_desc(gpio), direction_may_change);
+}
+
+static inline void gpio_unexport(unsigned gpio)
+{
+	gpiod_unexport(gpio_to_desc(gpio));
 }
 
 /* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */
@@ -88,7 +163,6 @@ int devm_gpio_request_one(struct device *dev, unsigned gpio,
 
 #include <linux/bug.h>
 #include <linux/kernel.h>
-#include <linux/types.h>
 
 struct device;
 struct gpio_chip;
-- 
cgit v1.2.3


From 0e685c3e7158d35626d6d76b9f859eae806d87fa Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 7 Feb 2023 16:29:44 +0200
Subject: gpiolib: remove gpio_set_debounce()

gpio_set_debounce() only has a single user, which is trivially
converted to gpiod_set_debounce().

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/driver-api/gpio/legacy.rst                    |  2 --
 Documentation/translations/zh_CN/driver-api/gpio/legacy.rst |  1 -
 Documentation/translations/zh_TW/gpio.txt                   |  1 -
 drivers/input/touchscreen/ads7846.c                         |  5 +++--
 include/linux/gpio.h                                        | 10 ----------
 5 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/gpio/legacy.rst b/Documentation/driver-api/gpio/legacy.rst
index a0559d93efd1..e0306e78e34b 100644
--- a/Documentation/driver-api/gpio/legacy.rst
+++ b/Documentation/driver-api/gpio/legacy.rst
@@ -238,8 +238,6 @@ setup or driver probe/teardown code, so this is an easy constraint.)::
         ## 	gpio_free_array()
 
                 gpio_free()
-                gpio_set_debounce()
-
 
 
 Claiming and Releasing GPIOs
diff --git a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
index 74fa473bb504..dee2a0517c1c 100644
--- a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
+++ b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
@@ -219,7 +219,6 @@ GPIO 值的命令需要等待其信息排到队首才发送命令，再获得其
         ## 	gpio_free_array()
 
                 gpio_free()
-                gpio_set_debounce()
 
 
diff --git a/Documentation/translations/zh_TW/gpio.txt b/Documentation/translations/zh_TW/gpio.txt
index 1b986bbb0909..dc608358d90a 100644
--- a/Documentation/translations/zh_TW/gpio.txt
+++ b/Documentation/translations/zh_TW/gpio.txt
@@ -226,7 +226,6 @@ GPIO 值的命令需要等待其信息排到隊首才發送命令，再獲得其
 ## 	gpio_free_array()
 
 	gpio_free()
-	gpio_set_debounce()
 
 
diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 17f11bce8113..bb1058b1e7fd 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -27,6 +27,7 @@
 #include <linux/of.h>
 #include <linux/of_gpio.h>
 #include <linux/of_device.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/ads7846.h>
@@ -1012,8 +1013,8 @@ static int ads7846_setup_pendown(struct spi_device *spi,
 		ts->gpio_pendown = pdata->gpio_pendown;
 
 		if (pdata->gpio_pendown_debounce)
-			gpio_set_debounce(pdata->gpio_pendown,
-					  pdata->gpio_pendown_debounce);
+			gpiod_set_debounce(gpio_to_desc(ts->gpio_pendown),
+					   pdata->gpio_pendown_debounce);
 	} else {
 		dev_err(&spi->dev, "no get_pendown_state nor gpio_pendown?\n");
 		return -EINVAL;
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index d5ce78e2bdd9..fc56733e8514 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -100,11 +100,6 @@ static inline int gpio_direction_output(unsigned gpio, int value)
 	return gpiod_direction_output_raw(gpio_to_desc(gpio), value);
 }
 
-static inline int gpio_set_debounce(unsigned gpio, unsigned debounce)
-{
-	return gpiod_set_debounce(gpio_to_desc(gpio), debounce);
-}
-
 static inline int gpio_get_value_cansleep(unsigned gpio)
 {
 	return gpiod_get_raw_value_cansleep(gpio_to_desc(gpio));
@@ -214,11 +209,6 @@ static inline int gpio_direction_output(unsigned gpio, int value)
 	return -ENOSYS;
 }
 
-static inline int gpio_set_debounce(unsigned gpio, unsigned debounce)
-{
-	return -ENOSYS;
-}
-
 static inline int gpio_get_value(unsigned gpio)
 {
 	/* GPIO can never have been requested or set as {in,out}put */
-- 
cgit v1.2.3


From d74e316633e49f44756c23997fa071979a939405 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 7 Feb 2023 16:29:45 +0200
Subject: gpiolib: remove legacy gpio_export()

There are only a handful of users of gpio_export() and
related functions.

As these are just wrappers around the modern gpiod_export()
helper, remove the wrappers and open-code the gpio_to_desc
in all callers to shrink the legacy API.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/admin-guide/gpio/sysfs.rst           |  2 +-
 Documentation/driver-api/gpio/legacy.rst           | 21 -----------------
 .../translations/zh_CN/driver-api/gpio/legacy.rst  | 19 ---------------
 Documentation/translations/zh_TW/gpio.txt          | 18 ---------------
 arch/arm/mach-omap2/pdata-quirks.c                 |  9 ++++----
 arch/sh/boards/mach-ap325rxa/setup.c               |  7 +++---
 drivers/gpio/gpiolib-sysfs.c                       |  4 ++--
 drivers/media/pci/sta2x11/sta2x11_vip.c            | 10 +++++---
 drivers/net/ieee802154/ca8210.c                    |  3 ++-
 include/linux/gpio.h                               | 27 ----------------------
 10 files changed, 21 insertions(+), 99 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst
index ec09ffd983e7..35171d15f78d 100644
--- a/Documentation/admin-guide/gpio/sysfs.rst
+++ b/Documentation/admin-guide/gpio/sysfs.rst
@@ -145,7 +145,7 @@ requested using gpio_request()::
 	/* export the GPIO to userspace */
 	int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
 
-	/* reverse gpio_export() */
+	/* reverse gpiod_export() */
 	void gpiod_unexport(struct gpio_desc *desc);
 
 	/* create a sysfs link to an exported GPIO node */
diff --git a/Documentation/driver-api/gpio/legacy.rst b/Documentation/driver-api/gpio/legacy.rst
index e0306e78e34b..78372853c6d4 100644
--- a/Documentation/driver-api/gpio/legacy.rst
+++ b/Documentation/driver-api/gpio/legacy.rst
@@ -714,27 +714,6 @@ gpiochip nodes (possibly in conjunction with schematics) to determine
 the correct GPIO number to use for a given signal.
 
 
-Exporting from Kernel code
---------------------------
-Kernel code can explicitly manage exports of GPIOs which have already been
-requested using gpio_request()::
-
-	/* export the GPIO to userspace */
-	int gpio_export(unsigned gpio, bool direction_may_change);
-
-	/* reverse gpio_export() */
-	void gpio_unexport();
-
-After a kernel driver requests a GPIO, it may only be made available in
-the sysfs interface by gpio_export().  The driver can control whether the
-signal direction may change.  This helps drivers prevent userspace code
-from accidentally clobbering important system state.
-
-This explicit exporting can help with debugging (by making some kinds
-of experiments easier), or can provide an always-there interface that's
-suitable for documenting as part of a board support package.
-
-
 API Reference
 =============
 
diff --git a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
index dee2a0517c1c..84ce2322fdba 100644
--- a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
+++ b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
@@ -653,25 +653,6 @@ GPIO 控制器的路径类似 /sys/class/gpio/gpiochip42/ (对于从#42 GPIO
 确定给定信号所用的 GPIO 编号。
 
 
-从内核代码中导出
-----------------
-
-内核代码可以明确地管理那些已通过 gpio_request()申请的 GPIO 的导出::
-
-	/* 导出 GPIO 到用户空间 */
-	int gpio_export(unsigned gpio, bool direction_may_change);
-
-	/* gpio_export()的逆操作 */
-	void gpio_unexport();
-
-在一个内核驱动申请一个 GPIO 之后，它可以通过 gpio_export()使其在 sysfs
-接口中可见。该驱动可以控制信号方向是否可修改。这有助于防止用户空间代码无意间
-破坏重要的系统状态。
-
-这个明确的导出有助于(通过使某些实验更容易来)调试，也可以提供一个始终存在的接口，
-与文档配合作为板级支持包的一部分。
-
-
 API参考
 =======
 
diff --git a/Documentation/translations/zh_TW/gpio.txt b/Documentation/translations/zh_TW/gpio.txt
index dc608358d90a..62e560ffe628 100644
--- a/Documentation/translations/zh_TW/gpio.txt
+++ b/Documentation/translations/zh_TW/gpio.txt
@@ -614,21 +614,3 @@ GPIO 控制器的路徑類似 /sys/class/gpio/gpiochip42/ (對於從#42 GPIO
 固定的,例如在擴展卡上的 GPIO會根據所使用的主板或所在堆疊架構中其他的板子而
 有所不同。在這種情況下,你可能需要使用 gpiochip 節點(儘可能地結合電路圖)來
 確定給定信號所用的 GPIO 編號。
-
-
-從內核代碼中導出
--------------
-內核代碼可以明確地管理那些已通過 gpio_request()申請的 GPIO 的導出:
-
-	/* 導出 GPIO 到用戶空間 */
-	int gpio_export(unsigned gpio, bool direction_may_change);
-
-	/* gpio_export()的逆操作 */
-	void gpio_unexport();
-
-在一個內核驅動申請一個 GPIO 之後，它可以通過 gpio_export()使其在 sysfs
-接口中可見。該驅動可以控制信號方向是否可修改。這有助於防止用戶空間代碼無意間
-破壞重要的系統狀態。
-
-這個明確的導出有助於(通過使某些實驗更容易來)調試，也可以提供一個始終存在的接口，
-與文檔配合作爲板級支持包的一部分。
diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index baba73fd6f11..04208cc52784 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -6,6 +6,7 @@
  */
 #include <linux/clk.h>
 #include <linux/davinci_emac.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -108,7 +109,7 @@ static int omap3_sbc_t3730_twl_callback(struct device *dev,
 	if (res)
 		return res;
 
-	gpio_export(gpio, 0);
+	gpiod_export(gpio_to_desc(gpio), 0);
 
 	return 0;
 }
@@ -123,7 +124,7 @@ static void __init omap3_sbc_t3x_usb_hub_init(int gpio, char *hub_name)
 		return;
 	}
 
-	gpio_export(gpio, 0);
+	gpiod_export(gpio_to_desc(gpio), 0);
 
 	udelay(10);
 	gpio_set_value(gpio, 1);
@@ -200,8 +201,8 @@ static void __init omap3_sbc_t3517_wifi_init(void)
 		return;
 	}
 
-	gpio_export(cm_t3517_wlan_gpios[0].gpio, 0);
-	gpio_export(cm_t3517_wlan_gpios[1].gpio, 0);
+	gpiod_export(gpio_to_desc(cm_t3517_wlan_gpios[0].gpio), 0);
+	gpiod_export(gpio_to_desc(cm_t3517_wlan_gpios[1].gpio), 0);
 
 	msleep(100);
 	gpio_set_value(cm_t3517_wlan_gpios[1].gpio, 0);
diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c
index c77b5f00a66a..151792162152 100644
--- a/arch/sh/boards/mach-ap325rxa/setup.c
+++ b/arch/sh/boards/mach-ap325rxa/setup.c
@@ -18,6 +18,7 @@
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio/machine.h>
 #include <linux/i2c.h>
 #include <linux/init.h>
@@ -411,16 +412,16 @@ static int __init ap325rxa_devices_setup(void)
 	/* LD3 and LD4 LEDs */
 	gpio_request(GPIO_PTX5, NULL); /* RUN */
 	gpio_direction_output(GPIO_PTX5, 1);
-	gpio_export(GPIO_PTX5, 0);
+	gpiod_export(gpio_to_desc(GPIO_PTX5), 0);
 
 	gpio_request(GPIO_PTX4, NULL); /* INDICATOR */
 	gpio_direction_output(GPIO_PTX4, 0);
-	gpio_export(GPIO_PTX4, 0);
+	gpiod_export(gpio_to_desc(GPIO_PTX4), 0);
 
 	/* SW1 input */
 	gpio_request(GPIO_PTF7, NULL); /* MODE */
 	gpio_direction_input(GPIO_PTF7);
-	gpio_export(GPIO_PTF7, 0);
+	gpiod_export(gpio_to_desc(GPIO_PTF7), 0);
 
 	/* LCDC */
 	gpio_request(GPIO_FN_LCDD15, NULL);
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index cd27bf173dec..6e4267944f80 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -491,7 +491,7 @@ static ssize_t unexport_store(struct class *class,
 		goto done;
 
 	desc = gpio_to_desc(gpio);
-	/* reject bogus commands (gpio_unexport ignores them) */
+	/* reject bogus commands (gpiod_unexport() ignores them) */
 	if (!desc) {
 		pr_warn("%s: invalid GPIO %ld\n", __func__, gpio);
 		return -EINVAL;
@@ -790,7 +790,7 @@ static int __init gpiolib_sysfs_init(void)
 	 * early (e.g. before the class_register above was called).
 	 *
 	 * We run before arch_initcall() so chip->dev nodes can have
-	 * registered, and so arch_initcall() can always gpio_export().
+	 * registered, and so arch_initcall() can always gpiod_export().
 	 */
 	spin_lock_irqsave(&gpio_lock, flags);
 	list_for_each_entry(gdev, &gpio_devices, list) {
diff --git a/drivers/media/pci/sta2x11/sta2x11_vip.c b/drivers/media/pci/sta2x11/sta2x11_vip.c
index 8535e49a4c4f..e4cf9d63e926 100644
--- a/drivers/media/pci/sta2x11/sta2x11_vip.c
+++ b/drivers/media/pci/sta2x11/sta2x11_vip.c
@@ -18,6 +18,7 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio.h>
 #include <linux/i2c.h>
 #include <linux/delay.h>
@@ -889,6 +890,7 @@ static int sta2x11_vip_init_controls(struct sta2x11_vip *vip)
 static int vip_gpio_reserve(struct device *dev, int pin, int dir,
 			    const char *name)
 {
+	struct gpio_desc *desc = gpio_to_desc(pin);
 	int ret = -ENODEV;
 
 	if (!gpio_is_valid(pin))
@@ -900,7 +902,7 @@ static int vip_gpio_reserve(struct device *dev, int pin, int dir,
 		return ret;
 	}
 
-	ret = gpio_direction_output(pin, dir);
+	ret = gpiod_direction_output(desc, dir);
 	if (ret) {
 		dev_err(dev, "Failed to set direction for pin %d (%s)\n",
 			pin, name);
@@ -908,7 +910,7 @@ static int vip_gpio_reserve(struct device *dev, int pin, int dir,
 		return ret;
 	}
 
-	ret = gpio_export(pin, false);
+	ret = gpiod_export(desc, false);
 	if (ret) {
 		dev_err(dev, "Failed to export pin %d (%s)\n", pin, name);
 		gpio_free(pin);
@@ -928,8 +930,10 @@ static int vip_gpio_reserve(struct device *dev, int pin, int dir,
 static void vip_gpio_release(struct device *dev, int pin, const char *name)
 {
 	if (gpio_is_valid(pin)) {
+		struct gpio_desc *desc = gpio_to_desc(pin);
+
 		dev_dbg(dev, "releasing pin %d (%s)\n",	pin, name);
-		gpio_unexport(pin);
+		gpiod_unexport(desc);
 		gpio_free(pin);
 	}
 }
diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c
index e1a569b99e4a..5c0be6a3ec5e 100644
--- a/drivers/net/ieee802154/ca8210.c
+++ b/drivers/net/ieee802154/ca8210.c
@@ -51,6 +51,7 @@
 #include <linux/clk-provider.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
+#include <linux/gpio/consumer.h>
 #include <linux/gpio.h>
 #include <linux/ieee802154.h>
 #include <linux/io.h>
@@ -2853,7 +2854,7 @@ static int ca8210_interrupt_init(struct spi_device *spi)
 	);
 	if (ret) {
 		dev_crit(&spi->dev, "request_irq %d failed\n", pdata->irq_id);
-		gpio_unexport(pdata->gpio_irq);
+		gpiod_unexport(gpio_to_desc(pdata->gpio_irq));
 		gpio_free(pdata->gpio_irq);
 	}
 
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index fc56733e8514..a86953696e47 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -132,20 +132,6 @@ int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
 int gpio_request_array(const struct gpio *array, size_t num);
 void gpio_free_array(const struct gpio *array, size_t num);
 
-/*
- * A sysfs interface can be exported by individual drivers if they want,
- * but more typically is configured entirely from userspace.
- */
-static inline int gpio_export(unsigned gpio, bool direction_may_change)
-{
-	return gpiod_export(gpio_to_desc(gpio), direction_may_change);
-}
-
-static inline void gpio_unexport(unsigned gpio)
-{
-	gpiod_unexport(gpio_to_desc(gpio));
-}
-
 /* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */
 
 struct device;
@@ -242,19 +228,6 @@ static inline void gpio_set_value_cansleep(unsigned gpio, int value)
 	WARN_ON(1);
 }
 
-static inline int gpio_export(unsigned gpio, bool direction_may_change)
-{
-	/* GPIO can never have been requested or set as {in,out}put */
-	WARN_ON(1);
-	return -EINVAL;
-}
-
-static inline void gpio_unexport(unsigned gpio)
-{
-	/* GPIO can never have been exported */
-	WARN_ON(1);
-}
-
 static inline int gpio_to_irq(unsigned gpio)
 {
 	/* GPIO can never have been requested or set as input */
-- 
cgit v1.2.3


From a8e59744e16b9ae18fab773a0fd3b23cdf21ad75 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Feb 2023 17:41:47 +0200
Subject: gpiolib: split linux/gpio/driver.h out of linux/gpio.h

Almost all gpio drivers include linux/gpio/driver.h, and other
files should not rely on includes from this header.

Remove the indirect include from here and include the correct
headers directly from where they are used.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Lee Jones <lee@kernel.org>
---
 arch/arm/mach-omap1/irq.c                              | 1 +
 arch/arm/mach-orion5x/board-rd88f5182.c                | 1 +
 arch/arm/mach-sa1100/assabet.c                         | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c | 1 +
 include/linux/mfd/ucb1x00.h                            | 1 +
 5 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/arch/arm/mach-omap1/irq.c b/arch/arm/mach-omap1/irq.c
index 9ccc784fd614..bfc7ab010ae2 100644
--- a/arch/arm/mach-omap1/irq.c
+++ b/arch/arm/mach-omap1/irq.c
@@ -41,6 +41,7 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/irqdomain.h>
 
 #include <asm/irq.h>
 #include <asm/exception.h>
diff --git a/arch/arm/mach-orion5x/board-rd88f5182.c b/arch/arm/mach-orion5x/board-rd88f5182.c
index 596601367989..1c14e49a90a6 100644
--- a/arch/arm/mach-orion5x/board-rd88f5182.c
+++ b/arch/arm/mach-orion5x/board-rd88f5182.c
@@ -9,6 +9,7 @@
 #include <linux/gpio.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/pci.h>
 #include <linux/irq.h>
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index 2eba112f2ad8..d000c678b439 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <linux/gpio/driver.h>
 #include <linux/gpio/gpio-reg.h>
 #include <linux/gpio/machine.h>
 #include <linux/gpio_keys.h>
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c
index 9540a05247c2..89c8829528c2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <net/mac80211.h>
 #include <linux/bcma/bcma_driver_chipcommon.h>
+#include <linux/gpio.h>
 #include <linux/gpio/driver.h>
 #include <linux/gpio/machine.h>
 #include <linux/gpio/consumer.h>
diff --git a/include/linux/mfd/ucb1x00.h b/include/linux/mfd/ucb1x00.h
index 43bcf35afe27..ede237384723 100644
--- a/include/linux/mfd/ucb1x00.h
+++ b/include/linux/mfd/ucb1x00.h
@@ -10,6 +10,7 @@
 #include <linux/device.h>
 #include <linux/mfd/mcp.h>
 #include <linux/gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/mutex.h>
 
 #define UCB_IO_DATA	0x00
-- 
cgit v1.2.3


From a99cc66807d6c854a7f65f962766c530c91be149 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 7 Feb 2023 16:29:47 +0200
Subject: gpiolib: split of_mm_gpio_chip out of linux/of_gpio.h

This is a rarely used feature that has nothing to do with the
client-side of_gpio.h.

Split it out with a separate header file and Kconfig option
so it can be removed on its own timeline aside from removing
the of_gpio consumer interfaces.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/powerpc/platforms/44x/Kconfig         |  1 +
 arch/powerpc/platforms/4xx/gpio.c          |  2 +-
 arch/powerpc/platforms/8xx/Kconfig         |  1 +
 arch/powerpc/platforms/8xx/cpm1.c          |  2 +-
 arch/powerpc/platforms/Kconfig             |  2 ++
 arch/powerpc/sysdev/cpm_common.c           |  2 +-
 drivers/gpio/Kconfig                       | 11 +++++++++
 drivers/gpio/TODO                          | 15 ++++++++-----
 drivers/gpio/gpio-altera.c                 |  2 +-
 drivers/gpio/gpio-mm-lantiq.c              |  2 +-
 drivers/gpio/gpio-mpc5200.c                |  2 +-
 drivers/gpio/gpiolib-of.c                  |  3 +++
 drivers/soc/fsl/qe/gpio.c                  |  2 +-
 include/linux/gpio/legacy-of-mm-gpiochip.h | 36 ++++++++++++++++++++++++++++++
 include/linux/of_gpio.h                    | 21 -----------------
 15 files changed, 71 insertions(+), 33 deletions(-)
 create mode 100644 include/linux/gpio/legacy-of-mm-gpiochip.h

(limited to 'include')

diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 25b80cd558f8..1624ebf95497 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -230,6 +230,7 @@ config PPC4xx_GPIO
 	bool "PPC4xx GPIO support"
 	depends on 44x
 	select GPIOLIB
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  Enable gpiolib support for ppc440 based boards
 
diff --git a/arch/powerpc/platforms/4xx/gpio.c b/arch/powerpc/platforms/4xx/gpio.c
index 49ee8d365852..e5f2319e5cbe 100644
--- a/arch/powerpc/platforms/4xx/gpio.c
+++ b/arch/powerpc/platforms/4xx/gpio.c
@@ -14,7 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #include <linux/gpio/driver.h>
 #include <linux/types.h>
 #include <linux/slab.h>
diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig
index 60cc5b537a98..a14d9d8997a4 100644
--- a/arch/powerpc/platforms/8xx/Kconfig
+++ b/arch/powerpc/platforms/8xx/Kconfig
@@ -101,6 +101,7 @@ comment "Generic MPC8xx Options"
 config 8xx_GPIO
 	bool "GPIO API Support"
 	select GPIOLIB
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  Saying Y here will cause the ports on an MPC8xx processor to be used
 	  with the GPIO API.  If you say N here, the kernel needs less memory.
diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c
index bb38c8d8f8de..56ca14f77543 100644
--- a/arch/powerpc/platforms/8xx/cpm1.c
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -44,7 +44,7 @@
 #include <asm/fs_pd.h>
 
 #ifdef CONFIG_8xx_GPIO
-#include <linux/of_gpio.h>
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #endif
 
 #define CPM_MAP_SIZE    (0x4000)
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index d41dad227de8..8e4bbd19dec5 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -244,6 +244,7 @@ config QE_GPIO
 	bool "QE GPIO support"
 	depends on QUICC_ENGINE
 	select GPIOLIB
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  Say Y here if you're going to use hardware that connects to the
 	  QE GPIOs.
@@ -254,6 +255,7 @@ config CPM2
 	select CPM
 	select HAVE_PCI
 	select GPIOLIB
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  The CPM2 (Communications Processor Module) is a coprocessor on
 	  embedded CPUs made by Freescale.  Selecting this option means that
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index 7dc1960f8bdb..8234013a8772 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -31,7 +31,7 @@
 #include <mm/mmu_decl.h>
 
 #if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO)
-#include <linux/of_gpio.h>
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #endif
 
 static int __init cpm_init(void)
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index df82fb7eb0eb..2b72a7a7084a 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -39,6 +39,14 @@ config GPIOLIB_IRQCHIP
 	select IRQ_DOMAIN
 	bool
 
+config OF_GPIO_MM_GPIOCHIP
+	bool
+	help
+	  This adds support for the legacy 'struct of_mm_gpio_chip' interface
+	  from PowerPC. Existing drivers using this interface need to select
+	  this symbol, but new drivers should use the generic gpio-regmap
+	  infrastructure instead.
+
 config DEBUG_GPIO
 	bool "Debug GPIO calls"
 	depends on DEBUG_KERNEL
@@ -131,6 +139,7 @@ config GPIO_ALTERA
 	tristate "Altera GPIO"
 	depends on OF_GPIO
 	select GPIOLIB_IRQCHIP
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  Say Y or M here to build support for the Altera PIO device.
 
@@ -403,6 +412,7 @@ config GPIO_MENZ127
 config GPIO_MM_LANTIQ
 	bool "Lantiq Memory mapped GPIOs"
 	depends on LANTIQ && SOC_XWAY
+	select OF_GPIO_MM_GPIOCHIP
 	help
 	  This enables support for memory mapped GPIOs on the External Bus Unit
 	  (EBU) found on Lantiq SoCs. The GPIOs are output only as they are
@@ -411,6 +421,7 @@ config GPIO_MM_LANTIQ
 config GPIO_MPC5200
 	def_bool y
 	depends on PPC_MPC52xx
+	select OF_GPIO_MM_GPIOCHIP
 
 config GPIO_MPC8XXX
 	bool "MPC512x/MPC8xxx/QorIQ GPIO support"
diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO
index 68ada1066941..189c3abe7e79 100644
--- a/drivers/gpio/TODO
+++ b/drivers/gpio/TODO
@@ -59,11 +59,6 @@ the device tree back-end. It is legacy and should not be used in new code.
 
 Work items:
 
-- Get rid of struct of_mm_gpio_chip altogether: use the generic  MMIO
-  GPIO for all current users (see below). Delete struct of_mm_gpio_chip,
-  to_of_mm_gpio_chip(), of_mm_gpiochip_add_data(), of_mm_gpiochip_remove()
-  from the kernel.
-
 - Change all consumer drivers that #include <linux/of_gpio.h> to
   #include <linux/gpio/consumer.h> and stop doing custom parsing of the
   GPIO lines from the device tree. This can be tricky and often ivolves
@@ -81,6 +76,16 @@ Work items:
   uses <linux/gpio/consumer.h> or <linux/gpio/driver.h> instead.
 
 
+Get rid of <linux/gpio/legacy-of-mm-gpiochip.h>
+
+Work items:
+
+- Get rid of struct of_mm_gpio_chip altogether: use the generic  MMIO
+  GPIO for all current users (see below). Delete struct of_mm_gpio_chip,
+  to_of_mm_gpio_chip(), of_mm_gpiochip_add_data(), of_mm_gpiochip_remove(),
+  CONFIG_OF_GPIO_MM_GPIOCHIP from the kernel.
+
+
 Get rid of <linux/gpio.h>
 
 This legacy header is a one stop shop for anything GPIO is closely tied
diff --git a/drivers/gpio/gpio-altera.c b/drivers/gpio/gpio-altera.c
index b59fae993626..99e137f8097e 100644
--- a/drivers/gpio/gpio-altera.c
+++ b/drivers/gpio/gpio-altera.c
@@ -7,7 +7,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/gpio/driver.h>
-#include <linux/of_gpio.h> /* For of_mm_gpio_chip */
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #include <linux/platform_device.h>
 
 #define ALTERA_GPIO_MAX_NGPIO		32
diff --git a/drivers/gpio/gpio-mm-lantiq.c b/drivers/gpio/gpio-mm-lantiq.c
index 538e31fe8903..27ff84c5d162 100644
--- a/drivers/gpio/gpio-mm-lantiq.c
+++ b/drivers/gpio/gpio-mm-lantiq.c
@@ -10,8 +10,8 @@
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/gpio/driver.h>
+#include <linux/gpio/legacy-of-mm-gpiochip.h.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include <linux/io.h>
 #include <linux/slab.h>
 
diff --git a/drivers/gpio/gpio-mpc5200.c b/drivers/gpio/gpio-mpc5200.c
index 000494e0c533..3b0bfff8c778 100644
--- a/drivers/gpio/gpio-mpc5200.c
+++ b/drivers/gpio/gpio-mpc5200.c
@@ -8,7 +8,7 @@
 #include <linux/of.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/of_gpio.h>
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #include <linux/io.h>
 #include <linux/of_platform.h>
 #include <linux/module.h>
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index 266352b1a966..0f699af438b0 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -892,6 +892,8 @@ static int of_gpio_simple_xlate(struct gpio_chip *gc,
 	return gpiospec->args[0];
 }
 
+#if IS_ENABLED(CONFIG_OF_GPIO_MM_GPIOCHIP)
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 /**
  * of_mm_gpiochip_add_data - Add memory mapped GPIO chip (bank)
  * @np:		device node of the GPIO chip
@@ -964,6 +966,7 @@ void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc)
 	kfree(gc->label);
 }
 EXPORT_SYMBOL_GPL(of_mm_gpiochip_remove);
+#endif
 
 #ifdef CONFIG_PINCTRL
 static int of_gpiochip_add_pin_range(struct gpio_chip *chip)
diff --git a/drivers/soc/fsl/qe/gpio.c b/drivers/soc/fsl/qe/gpio.c
index 1c41eb49d5a7..3ef24ba0245b 100644
--- a/drivers/soc/fsl/qe/gpio.c
+++ b/drivers/soc/fsl/qe/gpio.c
@@ -13,7 +13,7 @@
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>	/* for of_mm_gpio_chip */
+#include <linux/gpio/legacy-of-mm-gpiochip.h>
 #include <linux/gpio/consumer.h>
 #include <linux/gpio/driver.h>
 #include <linux/slab.h>
diff --git a/include/linux/gpio/legacy-of-mm-gpiochip.h b/include/linux/gpio/legacy-of-mm-gpiochip.h
new file mode 100644
index 000000000000..2e2bd3b19cc3
--- /dev/null
+++ b/include/linux/gpio/legacy-of-mm-gpiochip.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * OF helpers for the old of_mm_gpio_chip, used on ppc32 and nios2,
+ * do not use in new code.
+ *
+ * Copyright (c) 2007-2008  MontaVista Software, Inc.
+ *
+ * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
+ */
+
+#ifndef __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H
+#define __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H
+
+#include <linux/gpio/driver.h>
+#include <linux/of.h>
+
+/*
+ * OF GPIO chip for memory mapped banks
+ */
+struct of_mm_gpio_chip {
+	struct gpio_chip gc;
+	void (*save_regs)(struct of_mm_gpio_chip *mm_gc);
+	void __iomem *regs;
+};
+
+static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
+{
+	return container_of(gc, struct of_mm_gpio_chip, gc);
+}
+
+extern int of_mm_gpiochip_add_data(struct device_node *np,
+				   struct of_mm_gpio_chip *mm_gc,
+				   void *data);
+extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc);
+
+#endif /* __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H */
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 5d58b3b0a97e..d0f66a5e1b2a 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -19,30 +19,9 @@ struct device_node;
 
 #ifdef CONFIG_OF_GPIO
 
-#include <linux/container_of.h>
-
-/*
- * OF GPIO chip for memory mapped banks
- */
-struct of_mm_gpio_chip {
-	struct gpio_chip gc;
-	void (*save_regs)(struct of_mm_gpio_chip *mm_gc);
-	void __iomem *regs;
-};
-
-static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
-{
-	return container_of(gc, struct of_mm_gpio_chip, gc);
-}
-
 extern int of_get_named_gpio(const struct device_node *np,
 			     const char *list_name, int index);
 
-extern int of_mm_gpiochip_add_data(struct device_node *np,
-				   struct of_mm_gpio_chip *mm_gc,
-				   void *data);
-extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc);
-
 #else /* CONFIG_OF_GPIO */
 
 #include <linux/errno.h>
-- 
cgit v1.2.3


From 6cfd84c4f4f61905088b642a3ef1038ee627cd98 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 7 Feb 2023 16:29:49 +0200
Subject: gpiolib: Drop unused forward declaration from driver.h

There is no struct device_node pointers anywhere in the header,
drop unused forward declaration.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index ccd8a512d854..262a84ce9bcb 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -16,7 +16,6 @@
 
 struct gpio_desc;
 struct of_phandle_args;
-struct device_node;
 struct seq_file;
 struct gpio_device;
 struct module;
-- 
cgit v1.2.3


From 91e5ae95a0af7d2db9e98e8f99436c02c304378b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 7 Feb 2023 16:29:50 +0200
Subject: gpiolib: Deduplicate forward declarations in consumer.h

The struct fwnode_handle pointer is used in both branches of ifdeffery,
no need to have a copy of the same in each of them, just make it global.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/consumer.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 59cb20cfac3d..a7eb8aa1e54c 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 
 struct device;
+struct fwnode_handle;
 struct gpio_desc;
 struct gpio_array;
 
@@ -171,9 +172,6 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name);
 struct gpio_desc *gpio_to_desc(unsigned gpio);
 int desc_to_gpio(const struct gpio_desc *desc);
 
-/* Child properties interface */
-struct fwnode_handle;
-
 struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode,
 					 const char *con_id, int index,
 					 enum gpiod_flags flags,
@@ -546,9 +544,6 @@ static inline int desc_to_gpio(const struct gpio_desc *desc)
 	return -EINVAL;
 }
 
-/* Child properties interface */
-struct fwnode_handle;
-
 static inline
 struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode,
 					 const char *con_id, int index,
-- 
cgit v1.2.3


From 5b1911976ccf83e892452afb0363500228b35d81 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 7 Feb 2023 16:29:51 +0200
Subject: gpiolib: Group forward declarations in consumer.h

For better maintenance group the forward declarations together.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/consumer.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index a7eb8aa1e54c..5432e5d5fbfb 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -7,6 +7,7 @@
 #include <linux/compiler_types.h>
 #include <linux/err.h>
 
+struct acpi_device;
 struct device;
 struct fwnode_handle;
 struct gpio_desc;
@@ -602,8 +603,6 @@ struct acpi_gpio_mapping {
 	unsigned int quirks;
 };
 
-struct acpi_device;
-
 #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI)
 
 int acpi_dev_add_driver_gpios(struct acpi_device *adev,
-- 
cgit v1.2.3


From 380c7ba3923c6e471aff0f951a6cf42e8dec2c79 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 8 Feb 2023 19:07:28 +0200
Subject: gpiolib: Clean up headers

There is a few things done:
- include only the headers we are direct user of
- when pointer is in use, provide a forward declaration
- add missing headers
- group generic headers and subsystem headers
- sort each group alphabetically

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/gpio/gpiolib-acpi.c   | 10 ++++++----
 drivers/gpio/gpiolib-acpi.h   |  1 -
 drivers/gpio/gpiolib-of.c     |  6 ++++--
 drivers/gpio/gpiolib-of.h     |  1 -
 drivers/gpio/gpiolib-swnode.c |  5 +++--
 drivers/gpio/gpiolib-sysfs.c  | 21 ++++++++++++++++-----
 drivers/gpio/gpiolib.c        |  9 ++++++---
 include/linux/gpio.h          | 10 ++++------
 include/linux/gpio/consumer.h | 14 ++++++++++----
 include/linux/gpio/driver.h   | 30 +++++++++++++++++++++++-------
 10 files changed, 72 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index d8a421ce26a8..0605399c84e7 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -7,17 +7,19 @@
  *          Mika Westerberg <mika.westerberg@linux.intel.com>
  */
 
+#include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/errno.h>
-#include <linux/gpio/consumer.h>
-#include <linux/gpio/driver.h>
-#include <linux/gpio/machine.h>
 #include <linux/export.h>
-#include <linux/acpi.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/mutex.h>
 #include <linux/pinctrl/pinctrl.h>
 
+#include <linux/gpio/consumer.h>
+#include <linux/gpio/driver.h>
+#include <linux/gpio/machine.h>
+
 #include "gpiolib.h"
 #include "gpiolib-acpi.h"
 
diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h
index 90fd6b04f24d..0fcd7e14d7f9 100644
--- a/drivers/gpio/gpiolib-acpi.h
+++ b/drivers/gpio/gpiolib-acpi.h
@@ -9,7 +9,6 @@
 #define GPIOLIB_ACPI_H
 
 #include <linux/err.h>
-#include <linux/errno.h>
 #include <linux/types.h>
 
 #include <linux/gpio/consumer.h>
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index 0f699af438b0..1436cdb5fa26 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -10,14 +10,16 @@
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/errno.h>
-#include <linux/module.h>
 #include <linux/io.h>
-#include <linux/gpio/consumer.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_gpio.h>
 #include <linux/pinctrl/pinctrl.h>
 #include <linux/slab.h>
+#include <linux/string.h>
+
+#include <linux/gpio/consumer.h>
 #include <linux/gpio/machine.h>
 
 #include "gpiolib.h"
diff --git a/drivers/gpio/gpiolib-of.h b/drivers/gpio/gpiolib-of.h
index e5bb065d82ef..6b3a5347c5d9 100644
--- a/drivers/gpio/gpiolib-of.h
+++ b/drivers/gpio/gpiolib-of.h
@@ -4,7 +4,6 @@
 #define GPIOLIB_OF_H
 
 #include <linux/err.h>
-#include <linux/errno.h>
 #include <linux/types.h>
 
 #include <linux/notifier.h>
diff --git a/drivers/gpio/gpiolib-swnode.c b/drivers/gpio/gpiolib-swnode.c
index dd9ccac214d1..b5a6eaf3729b 100644
--- a/drivers/gpio/gpiolib-swnode.c
+++ b/drivers/gpio/gpiolib-swnode.c
@@ -6,13 +6,14 @@
  */
 #include <linux/err.h>
 #include <linux/errno.h>
-#include <linux/gpio/consumer.h>
-#include <linux/gpio/driver.h>
 #include <linux/kernel.h>
 #include <linux/printk.h>
 #include <linux/property.h>
 #include <linux/string.h>
 
+#include <linux/gpio/consumer.h>
+#include <linux/gpio/driver.h>
+
 #include "gpiolib.h"
 #include "gpiolib-swnode.h"
 
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index 6e4267944f80..c1cbf71329f0 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -1,18 +1,29 @@
 // SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/device.h>
 #include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kdev_t.h>
+#include <linux/kstrtox.h>
+#include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/device.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
 #include <linux/sysfs.h>
+#include <linux/types.h>
+
 #include <linux/gpio/consumer.h>
 #include <linux/gpio/driver.h>
-#include <linux/interrupt.h>
-#include <linux/kdev_t.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
 
 #include "gpiolib.h"
 #include "gpiolib-sysfs.h"
 
+struct kernfs_node;
+
 #define GPIO_IRQF_TRIGGER_NONE		0
 #define GPIO_IRQF_TRIGGER_FALLING	BIT(0)
 #define GPIO_IRQF_TRIGGER_RISING	BIT(1)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 19bd23044b01..700195a1faae 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -6,22 +6,25 @@
 #include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/fs.h>
-#include <linux/gpio.h>
-#include <linux/gpio/driver.h>
-#include <linux/gpio/machine.h>
 #include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
+#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
+#include <linux/gpio/machine.h>
+
 #include <uapi/linux/gpio.h>
 
 #include "gpiolib-acpi.h"
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index a86953696e47..8528353e073b 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -12,9 +12,10 @@
 #ifndef __LINUX_GPIO_H
 #define __LINUX_GPIO_H
 
-#include <linux/errno.h>
 #include <linux/types.h>
 
+struct device;
+
 /* see Documentation/driver-api/gpio/legacy.rst */
 
 /* make these flag values available regardless of GPIO kconfig options */
@@ -134,19 +135,16 @@ void gpio_free_array(const struct gpio *array, size_t num);
 
 /* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */
 
-struct device;
-
 int devm_gpio_request(struct device *dev, unsigned gpio, const char *label);
 int devm_gpio_request_one(struct device *dev, unsigned gpio,
 			  unsigned long flags, const char *label);
 
 #else /* ! CONFIG_GPIOLIB */
 
-#include <linux/bug.h>
 #include <linux/kernel.h>
 
-struct device;
-struct gpio_chip;
+#include <asm/bug.h>
+#include <asm/errno.h>
 
 static inline bool gpio_is_valid(int number)
 {
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 5432e5d5fbfb..1c4385a00f88 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -3,15 +3,14 @@
 #define __LINUX_GPIO_CONSUMER_H
 
 #include <linux/bits.h>
-#include <linux/bug.h>
-#include <linux/compiler_types.h>
-#include <linux/err.h>
+#include <linux/types.h>
 
 struct acpi_device;
 struct device;
 struct fwnode_handle;
-struct gpio_desc;
+
 struct gpio_array;
+struct gpio_desc;
 
 /**
  * struct gpio_descs - Struct containing an array of descriptors that can be
@@ -185,8 +184,11 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
 
 #else /* CONFIG_GPIOLIB */
 
+#include <linux/err.h>
 #include <linux/kernel.h>
 
+#include <asm/bug.h>
+
 static inline int gpiod_count(struct device *dev, const char *con_id)
 {
 	return 0;
@@ -616,6 +618,8 @@ struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin, char
 
 #else  /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
+#include <linux/err.h>
+
 static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev,
 			      const struct acpi_gpio_mapping *gpios)
 {
@@ -647,6 +651,8 @@ void gpiod_unexport(struct gpio_desc *desc);
 
 #else  /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */
 
+#include <asm/errno.h>
+
 static inline int gpiod_export(struct gpio_desc *desc,
 			       bool direction_may_change)
 {
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 262a84ce9bcb..5c6db5533be6 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -2,27 +2,35 @@
 #ifndef __LINUX_GPIO_DRIVER_H
 #define __LINUX_GPIO_DRIVER_H
 
-#include <linux/device.h>
-#include <linux/irq.h>
+#include <linux/bits.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
+#include <linux/irqhandler.h>
 #include <linux/lockdep.h>
 #include <linux/pinctrl/pinconf-generic.h>
 #include <linux/pinctrl/pinctrl.h>
 #include <linux/property.h>
+#include <linux/spinlock_types.h>
 #include <linux/types.h>
 
+#ifdef CONFIG_GENERIC_MSI_IRQ
 #include <asm/msi.h>
+#endif
 
-struct gpio_desc;
+struct device;
+struct irq_chip;
+struct irq_data;
+struct module;
 struct of_phandle_args;
+struct pinctrl_dev;
 struct seq_file;
-struct gpio_device;
-struct module;
-enum gpiod_flags;
-enum gpio_lookup_flags;
 
 struct gpio_chip;
+struct gpio_desc;
+struct gpio_device;
+
+enum gpio_lookup_flags;
+enum gpiod_flags;
 
 union gpio_irq_fwspec {
 	struct irq_fwspec	fwspec;
@@ -679,6 +687,10 @@ bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc,
 int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 				struct irq_domain *domain);
 #else
+
+#include <asm/bug.h>
+#include <asm/errno.h>
+
 static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 					      struct irq_domain *domain)
 {
@@ -756,6 +768,10 @@ struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
 
 #else /* CONFIG_GPIOLIB */
 
+#include <linux/err.h>
+
+#include <asm/bug.h>
+
 static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-- 
cgit v1.2.3


From dd4f373ef94bd186e18a7366adfb7b98bc31b786 Mon Sep 17 00:00:00 2001
From: "Roy-CW.Yeh" <roy-cw.yeh@mediatek.com>
Date: Mon, 6 Feb 2023 17:11:07 +0800
Subject: soc: mediatek: mmsys: add config api for RSZ switching and DCM

Due to MT8195 HW design, some RSZs have additional settings that
need to be configured in MMSYS.

Signed-off-by: Roy-CW.Yeh <roy-cw.yeh@mediatek.com>
Signed-off-by: Moudy Ho <moudy.ho@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Change-Id: I41978bf14951221c88abbe70d8c24cb0770e11e3
Link: https://lore.kernel.org/r/20230206091109.1324-5-moudy.ho@mediatek.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mt8195-mmsys.h    | 13 ++++++++++
 drivers/soc/mediatek/mtk-mmsys.c       | 44 ++++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-mmsys.h |  6 +++++
 3 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mt8195-mmsys.h b/drivers/soc/mediatek/mt8195-mmsys.h
index a6652ae63431..9be2df2832a4 100644
--- a/drivers/soc/mediatek/mt8195-mmsys.h
+++ b/drivers/soc/mediatek/mt8195-mmsys.h
@@ -146,6 +146,19 @@
 #define MT8195_VDO1_MIXER_SOUT_SEL_IN				0xf68
 #define MT8195_MIXER_SOUT_SEL_IN_FROM_DISP_MIXER			0
 
+/* VPPSYS1 */
+#define MT8195_VPP1_HW_DCM_1ST_DIS0				0x150
+#define MT8195_VPP1_HW_DCM_1ST_DIS1				0x160
+#define MT8195_VPP1_HW_DCM_2ND_DIS0				0x1a0
+#define MT8195_VPP1_HW_DCM_2ND_DIS1				0x1b0
+#define MT8195_SVPP2_BUF_BF_RSZ_SWITCH				0xf48
+#define MT8195_SVPP3_BUF_BF_RSZ_SWITCH				0xf74
+
+/* VPPSYS1 HW DCM client*/
+#define MT8195_SVPP1_MDP_RSZ					BIT(25)
+#define MT8195_SVPP2_MDP_RSZ					BIT(4)
+#define MT8195_SVPP3_MDP_RSZ					BIT(5)
+
 static const struct mtk_mmsys_routes mmsys_mt8195_routing_table[] = {
 	{
 		DDP_COMPONENT_OVL0, DDP_COMPONENT_RDMA0,
diff --git a/drivers/soc/mediatek/mtk-mmsys.c b/drivers/soc/mediatek/mtk-mmsys.c
index eb4c7e57896c..fd2b37d451c8 100644
--- a/drivers/soc/mediatek/mtk-mmsys.c
+++ b/drivers/soc/mediatek/mtk-mmsys.c
@@ -242,6 +242,50 @@ void mtk_mmsys_ddp_dpi_fmt_config(struct device *dev, u32 val)
 }
 EXPORT_SYMBOL_GPL(mtk_mmsys_ddp_dpi_fmt_config);
 
+void mtk_mmsys_vpp_rsz_merge_config(struct device *dev, u32 id, bool enable,
+				    struct cmdq_pkt *cmdq_pkt)
+{
+	u32 reg;
+
+	switch (id) {
+	case 2:
+		reg = MT8195_SVPP2_BUF_BF_RSZ_SWITCH;
+		break;
+	case 3:
+		reg = MT8195_SVPP3_BUF_BF_RSZ_SWITCH;
+		break;
+	default:
+		dev_err(dev, "Invalid id %d\n", id);
+		return;
+	}
+
+	mtk_mmsys_update_bits(dev_get_drvdata(dev), reg, ~0, enable, cmdq_pkt);
+}
+EXPORT_SYMBOL_GPL(mtk_mmsys_vpp_rsz_merge_config);
+
+void mtk_mmsys_vpp_rsz_dcm_config(struct device *dev, bool enable,
+				  struct cmdq_pkt *cmdq_pkt)
+{
+	u32 client;
+
+	client = MT8195_SVPP1_MDP_RSZ;
+	mtk_mmsys_update_bits(dev_get_drvdata(dev),
+			      MT8195_VPP1_HW_DCM_1ST_DIS0, client,
+			      ((enable) ? client : 0), cmdq_pkt);
+	mtk_mmsys_update_bits(dev_get_drvdata(dev),
+			      MT8195_VPP1_HW_DCM_2ND_DIS0, client,
+			      ((enable) ? client : 0), cmdq_pkt);
+
+	client = MT8195_SVPP2_MDP_RSZ | MT8195_SVPP3_MDP_RSZ;
+	mtk_mmsys_update_bits(dev_get_drvdata(dev),
+			      MT8195_VPP1_HW_DCM_1ST_DIS1, client,
+			      ((enable) ? client : 0), cmdq_pkt);
+	mtk_mmsys_update_bits(dev_get_drvdata(dev),
+			      MT8195_VPP1_HW_DCM_2ND_DIS1, client,
+			      ((enable) ? client : 0), cmdq_pkt);
+}
+EXPORT_SYMBOL_GPL(mtk_mmsys_vpp_rsz_dcm_config);
+
 static int mtk_mmsys_reset_update(struct reset_controller_dev *rcdev, unsigned long id,
 				  bool assert)
 {
diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h
index dc2963a0a0f7..37544ea6286d 100644
--- a/include/linux/soc/mediatek/mtk-mmsys.h
+++ b/include/linux/soc/mediatek/mtk-mmsys.h
@@ -99,4 +99,10 @@ void mtk_mmsys_mixer_in_config(struct device *dev, int idx, bool alpha_sel, u16
 void mtk_mmsys_mixer_in_channel_swap(struct device *dev, int idx, bool channel_swap,
 				     struct cmdq_pkt *cmdq_pkt);
 
+void mtk_mmsys_vpp_rsz_merge_config(struct device *dev, u32 id, bool enable,
+				    struct cmdq_pkt *cmdq_pkt);
+
+void mtk_mmsys_vpp_rsz_dcm_config(struct device *dev, bool enable,
+				  struct cmdq_pkt *cmdq_pkt);
+
 #endif /* __MTK_MMSYS_H */
-- 
cgit v1.2.3


From 549053b69c54aecc6f44ddefe6c245a35f162cf4 Mon Sep 17 00:00:00 2001
From: "Roy-CW.Yeh" <roy-cw.yeh@mediatek.com>
Date: Mon, 6 Feb 2023 17:11:09 +0800
Subject: soc: mediatek: mutex: support MT8195 VPPSYS

Add MT8195 VPPSYS0 and VPPSYS1 mutex info to driver data

Signed-off-by: Roy-CW.Yeh <roy-cw.yeh@mediatek.com>
Signed-off-by: Moudy Ho <moudy.ho@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Change-Id: Ie371dc9dcf35ea308d9460acd60fb9c3d6475deb
Link: https://lore.kernel.org/r/20230206091109.1324-7-moudy.ho@mediatek.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mtk-mutex.c       | 102 +++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-mutex.h |  35 +++++++++++
 2 files changed, 137 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-mutex.c b/drivers/soc/mediatek/mtk-mutex.c
index a8d5fc3df5f0..7751527fc30d 100644
--- a/drivers/soc/mediatek/mtk-mutex.c
+++ b/drivers/soc/mediatek/mtk-mutex.c
@@ -164,6 +164,53 @@
 #define MT8195_MUTEX_MOD_DISP1_DPI1		26
 #define MT8195_MUTEX_MOD_DISP1_DP_INTF0		27
 
+/* VPPSYS0 */
+#define MT8195_MUTEX_MOD_MDP_RDMA0             0
+#define MT8195_MUTEX_MOD_MDP_FG0               1
+#define MT8195_MUTEX_MOD_MDP_STITCH0           2
+#define MT8195_MUTEX_MOD_MDP_HDR0              3
+#define MT8195_MUTEX_MOD_MDP_AAL0              4
+#define MT8195_MUTEX_MOD_MDP_RSZ0              5
+#define MT8195_MUTEX_MOD_MDP_TDSHP0            6
+#define MT8195_MUTEX_MOD_MDP_COLOR0            7
+#define MT8195_MUTEX_MOD_MDP_OVL0              8
+#define MT8195_MUTEX_MOD_MDP_PAD0              9
+#define MT8195_MUTEX_MOD_MDP_TCC0              10
+#define MT8195_MUTEX_MOD_MDP_WROT0             11
+
+/* VPPSYS1 */
+#define MT8195_MUTEX_MOD_MDP_TCC1              3
+#define MT8195_MUTEX_MOD_MDP_RDMA1             4
+#define MT8195_MUTEX_MOD_MDP_RDMA2             5
+#define MT8195_MUTEX_MOD_MDP_RDMA3             6
+#define MT8195_MUTEX_MOD_MDP_FG1               7
+#define MT8195_MUTEX_MOD_MDP_FG2               8
+#define MT8195_MUTEX_MOD_MDP_FG3               9
+#define MT8195_MUTEX_MOD_MDP_HDR1              10
+#define MT8195_MUTEX_MOD_MDP_HDR2              11
+#define MT8195_MUTEX_MOD_MDP_HDR3              12
+#define MT8195_MUTEX_MOD_MDP_AAL1              13
+#define MT8195_MUTEX_MOD_MDP_AAL2              14
+#define MT8195_MUTEX_MOD_MDP_AAL3              15
+#define MT8195_MUTEX_MOD_MDP_RSZ1              16
+#define MT8195_MUTEX_MOD_MDP_RSZ2              17
+#define MT8195_MUTEX_MOD_MDP_RSZ3              18
+#define MT8195_MUTEX_MOD_MDP_TDSHP1            19
+#define MT8195_MUTEX_MOD_MDP_TDSHP2            20
+#define MT8195_MUTEX_MOD_MDP_TDSHP3            21
+#define MT8195_MUTEX_MOD_MDP_MERGE2            22
+#define MT8195_MUTEX_MOD_MDP_MERGE3            23
+#define MT8195_MUTEX_MOD_MDP_COLOR1            24
+#define MT8195_MUTEX_MOD_MDP_COLOR2            25
+#define MT8195_MUTEX_MOD_MDP_COLOR3            26
+#define MT8195_MUTEX_MOD_MDP_OVL1              27
+#define MT8195_MUTEX_MOD_MDP_PAD1              28
+#define MT8195_MUTEX_MOD_MDP_PAD2              29
+#define MT8195_MUTEX_MOD_MDP_PAD3              30
+#define MT8195_MUTEX_MOD_MDP_WROT1             31
+#define MT8195_MUTEX_MOD_MDP_WROT2             32
+#define MT8195_MUTEX_MOD_MDP_WROT3             33
+
 #define MT8365_MUTEX_MOD_DISP_OVL0		7
 #define MT8365_MUTEX_MOD_DISP_OVL0_2L		8
 #define MT8365_MUTEX_MOD_DISP_RDMA0		9
@@ -444,6 +491,52 @@ static const unsigned int mt8195_mutex_mod[DDP_COMPONENT_ID_MAX] = {
 	[DDP_COMPONENT_DP_INTF1] = MT8195_MUTEX_MOD_DISP1_DP_INTF0,
 };
 
+static const unsigned int mt8195_mutex_table_mod[MUTEX_MOD_IDX_MAX] = {
+	[MUTEX_MOD_IDX_MDP_RDMA0] = MT8195_MUTEX_MOD_MDP_RDMA0,
+	[MUTEX_MOD_IDX_MDP_RDMA1] = MT8195_MUTEX_MOD_MDP_RDMA1,
+	[MUTEX_MOD_IDX_MDP_RDMA2] = MT8195_MUTEX_MOD_MDP_RDMA2,
+	[MUTEX_MOD_IDX_MDP_RDMA3] = MT8195_MUTEX_MOD_MDP_RDMA3,
+	[MUTEX_MOD_IDX_MDP_STITCH0] = MT8195_MUTEX_MOD_MDP_STITCH0,
+	[MUTEX_MOD_IDX_MDP_FG0] = MT8195_MUTEX_MOD_MDP_FG0,
+	[MUTEX_MOD_IDX_MDP_FG1] = MT8195_MUTEX_MOD_MDP_FG1,
+	[MUTEX_MOD_IDX_MDP_FG2] = MT8195_MUTEX_MOD_MDP_FG2,
+	[MUTEX_MOD_IDX_MDP_FG3] = MT8195_MUTEX_MOD_MDP_FG3,
+	[MUTEX_MOD_IDX_MDP_HDR0] = MT8195_MUTEX_MOD_MDP_HDR0,
+	[MUTEX_MOD_IDX_MDP_HDR1] = MT8195_MUTEX_MOD_MDP_HDR1,
+	[MUTEX_MOD_IDX_MDP_HDR2] = MT8195_MUTEX_MOD_MDP_HDR2,
+	[MUTEX_MOD_IDX_MDP_HDR3] = MT8195_MUTEX_MOD_MDP_HDR3,
+	[MUTEX_MOD_IDX_MDP_AAL0] = MT8195_MUTEX_MOD_MDP_AAL0,
+	[MUTEX_MOD_IDX_MDP_AAL1] = MT8195_MUTEX_MOD_MDP_AAL1,
+	[MUTEX_MOD_IDX_MDP_AAL2] = MT8195_MUTEX_MOD_MDP_AAL2,
+	[MUTEX_MOD_IDX_MDP_AAL3] = MT8195_MUTEX_MOD_MDP_AAL3,
+	[MUTEX_MOD_IDX_MDP_RSZ0] = MT8195_MUTEX_MOD_MDP_RSZ0,
+	[MUTEX_MOD_IDX_MDP_RSZ1] = MT8195_MUTEX_MOD_MDP_RSZ1,
+	[MUTEX_MOD_IDX_MDP_RSZ2] = MT8195_MUTEX_MOD_MDP_RSZ2,
+	[MUTEX_MOD_IDX_MDP_RSZ3] = MT8195_MUTEX_MOD_MDP_RSZ3,
+	[MUTEX_MOD_IDX_MDP_MERGE2] = MT8195_MUTEX_MOD_MDP_MERGE2,
+	[MUTEX_MOD_IDX_MDP_MERGE3] = MT8195_MUTEX_MOD_MDP_MERGE3,
+	[MUTEX_MOD_IDX_MDP_TDSHP0] = MT8195_MUTEX_MOD_MDP_TDSHP0,
+	[MUTEX_MOD_IDX_MDP_TDSHP1] = MT8195_MUTEX_MOD_MDP_TDSHP1,
+	[MUTEX_MOD_IDX_MDP_TDSHP2] = MT8195_MUTEX_MOD_MDP_TDSHP2,
+	[MUTEX_MOD_IDX_MDP_TDSHP3] = MT8195_MUTEX_MOD_MDP_TDSHP3,
+	[MUTEX_MOD_IDX_MDP_COLOR0] = MT8195_MUTEX_MOD_MDP_COLOR0,
+	[MUTEX_MOD_IDX_MDP_COLOR1] = MT8195_MUTEX_MOD_MDP_COLOR1,
+	[MUTEX_MOD_IDX_MDP_COLOR2] = MT8195_MUTEX_MOD_MDP_COLOR2,
+	[MUTEX_MOD_IDX_MDP_COLOR3] = MT8195_MUTEX_MOD_MDP_COLOR3,
+	[MUTEX_MOD_IDX_MDP_OVL0] = MT8195_MUTEX_MOD_MDP_OVL0,
+	[MUTEX_MOD_IDX_MDP_OVL1] = MT8195_MUTEX_MOD_MDP_OVL1,
+	[MUTEX_MOD_IDX_MDP_PAD0] = MT8195_MUTEX_MOD_MDP_PAD0,
+	[MUTEX_MOD_IDX_MDP_PAD1] = MT8195_MUTEX_MOD_MDP_PAD1,
+	[MUTEX_MOD_IDX_MDP_PAD2] = MT8195_MUTEX_MOD_MDP_PAD2,
+	[MUTEX_MOD_IDX_MDP_PAD3] = MT8195_MUTEX_MOD_MDP_PAD3,
+	[MUTEX_MOD_IDX_MDP_TCC0] = MT8195_MUTEX_MOD_MDP_TCC0,
+	[MUTEX_MOD_IDX_MDP_TCC1] = MT8195_MUTEX_MOD_MDP_TCC1,
+	[MUTEX_MOD_IDX_MDP_WROT0] = MT8195_MUTEX_MOD_MDP_WROT0,
+	[MUTEX_MOD_IDX_MDP_WROT1] = MT8195_MUTEX_MOD_MDP_WROT1,
+	[MUTEX_MOD_IDX_MDP_WROT2] = MT8195_MUTEX_MOD_MDP_WROT2,
+	[MUTEX_MOD_IDX_MDP_WROT3] = MT8195_MUTEX_MOD_MDP_WROT3,
+};
+
 static const unsigned int mt8365_mutex_mod[DDP_COMPONENT_ID_MAX] = {
 	[DDP_COMPONENT_AAL0] = MT8365_MUTEX_MOD_DISP_AAL,
 	[DDP_COMPONENT_CCORR] = MT8365_MUTEX_MOD_DISP_CCORR,
@@ -604,6 +697,13 @@ static const struct mtk_mutex_data mt8195_mutex_driver_data = {
 	.mutex_sof_reg = MT8183_MUTEX0_SOF0,
 };
 
+static const struct mtk_mutex_data mt8195_vpp_mutex_driver_data = {
+	.mutex_sof = mt8195_mutex_sof,
+	.mutex_mod_reg = MT8183_MUTEX0_MOD0,
+	.mutex_sof_reg = MT8183_MUTEX0_SOF0,
+	.mutex_table_mod = mt8195_mutex_table_mod,
+};
+
 static const struct mtk_mutex_data mt8365_mutex_driver_data = {
 	.mutex_mod = mt8365_mutex_mod,
 	.mutex_sof = mt8183_mutex_sof,
@@ -962,6 +1062,8 @@ static const struct of_device_id mutex_driver_dt_match[] = {
 	  .data = &mt8192_mutex_driver_data},
 	{ .compatible = "mediatek,mt8195-disp-mutex",
 	  .data = &mt8195_mutex_driver_data},
+	{ .compatible = "mediatek,mt8195-vpp-mutex",
+	  .data = &mt8195_vpp_mutex_driver_data},
 	{ .compatible = "mediatek,mt8365-disp-mutex",
 	  .data = &mt8365_mutex_driver_data},
 	{},
diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h
index b335c2837cd8..635218e3ac68 100644
--- a/include/linux/soc/mediatek/mtk-mutex.h
+++ b/include/linux/soc/mediatek/mtk-mutex.h
@@ -22,6 +22,41 @@ enum mtk_mutex_mod_index {
 	MUTEX_MOD_IDX_MDP_CCORR0,
 	MUTEX_MOD_IDX_MDP_HDR0,
 	MUTEX_MOD_IDX_MDP_COLOR0,
+	MUTEX_MOD_IDX_MDP_RDMA1,
+	MUTEX_MOD_IDX_MDP_RDMA2,
+	MUTEX_MOD_IDX_MDP_RDMA3,
+	MUTEX_MOD_IDX_MDP_STITCH0,
+	MUTEX_MOD_IDX_MDP_FG0,
+	MUTEX_MOD_IDX_MDP_FG1,
+	MUTEX_MOD_IDX_MDP_FG2,
+	MUTEX_MOD_IDX_MDP_FG3,
+	MUTEX_MOD_IDX_MDP_HDR1,
+	MUTEX_MOD_IDX_MDP_HDR2,
+	MUTEX_MOD_IDX_MDP_HDR3,
+	MUTEX_MOD_IDX_MDP_AAL1,
+	MUTEX_MOD_IDX_MDP_AAL2,
+	MUTEX_MOD_IDX_MDP_AAL3,
+	MUTEX_MOD_IDX_MDP_RSZ2,
+	MUTEX_MOD_IDX_MDP_RSZ3,
+	MUTEX_MOD_IDX_MDP_MERGE2,
+	MUTEX_MOD_IDX_MDP_MERGE3,
+	MUTEX_MOD_IDX_MDP_TDSHP1,
+	MUTEX_MOD_IDX_MDP_TDSHP2,
+	MUTEX_MOD_IDX_MDP_TDSHP3,
+	MUTEX_MOD_IDX_MDP_COLOR1,
+	MUTEX_MOD_IDX_MDP_COLOR2,
+	MUTEX_MOD_IDX_MDP_COLOR3,
+	MUTEX_MOD_IDX_MDP_OVL0,
+	MUTEX_MOD_IDX_MDP_OVL1,
+	MUTEX_MOD_IDX_MDP_PAD0,
+	MUTEX_MOD_IDX_MDP_PAD1,
+	MUTEX_MOD_IDX_MDP_PAD2,
+	MUTEX_MOD_IDX_MDP_PAD3,
+	MUTEX_MOD_IDX_MDP_TCC0,
+	MUTEX_MOD_IDX_MDP_TCC1,
+	MUTEX_MOD_IDX_MDP_WROT1,
+	MUTEX_MOD_IDX_MDP_WROT2,
+	MUTEX_MOD_IDX_MDP_WROT3,
 
 	MUTEX_MOD_IDX_MAX		/* ALWAYS keep at the end */
 };
-- 
cgit v1.2.3


From 521568cff7065069bfb02b3a9945bac637d2e324 Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Wed, 22 Feb 2023 22:21:28 -0600
Subject: dt-bindings: clock: exynos850: Add Exynos850 CMU_G3D

CMU_G3D generates Gondul GPU and bus clocks for BLK_G3D.
Add clock indices and binding documentation for CMU_G3D.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Link: https://lore.kernel.org/r/20230223042133.26551-2-semen.protsenko@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../bindings/clock/samsung,exynos850-clock.yaml      | 19 +++++++++++++++++++
 include/dt-bindings/clock/exynos850.h                | 20 +++++++++++++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml
index 141cf173f87d..8aa87b8c1b33 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml
@@ -37,6 +37,7 @@ properties:
       - samsung,exynos850-cmu-cmgp
       - samsung,exynos850-cmu-core
       - samsung,exynos850-cmu-dpu
+      - samsung,exynos850-cmu-g3d
       - samsung,exynos850-cmu-hsi
       - samsung,exynos850-cmu-is
       - samsung,exynos850-cmu-mfcmscl
@@ -169,6 +170,24 @@ allOf:
             - const: oscclk
             - const: dout_dpu
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: samsung,exynos850-cmu-g3d
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (26 MHz)
+            - description: G3D clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: dout_g3d_switch
+
   - if:
       properties:
         compatible:
diff --git a/include/dt-bindings/clock/exynos850.h b/include/dt-bindings/clock/exynos850.h
index 88d5289883d3..8bb62e43fd60 100644
--- a/include/dt-bindings/clock/exynos850.h
+++ b/include/dt-bindings/clock/exynos850.h
@@ -85,7 +85,10 @@
 #define CLK_DOUT_MFCMSCL_M2M		73
 #define CLK_DOUT_MFCMSCL_MCSC		74
 #define CLK_DOUT_MFCMSCL_JPEG		75
-#define TOP_NR_CLK			76
+#define CLK_MOUT_G3D_SWITCH		76
+#define CLK_GOUT_G3D_SWITCH		77
+#define CLK_DOUT_G3D_SWITCH		78
+#define TOP_NR_CLK			79
 
 /* CMU_APM */
 #define CLK_RCO_I3C_PMIC		1
@@ -195,6 +198,21 @@
 #define CLK_GOUT_SYSREG_CMGP_PCLK	15
 #define CMGP_NR_CLK			16
 
+/* CMU_G3D */
+#define CLK_FOUT_G3D_PLL		1
+#define CLK_MOUT_G3D_PLL		2
+#define CLK_MOUT_G3D_SWITCH_USER	3
+#define CLK_MOUT_G3D_BUSD		4
+#define CLK_DOUT_G3D_BUSP		5
+#define CLK_GOUT_G3D_CMU_G3D_PCLK	6
+#define CLK_GOUT_G3D_GPU_CLK		7
+#define CLK_GOUT_G3D_TZPC_PCLK		8
+#define CLK_GOUT_G3D_GRAY2BIN_CLK	9
+#define CLK_GOUT_G3D_BUSD_CLK		10
+#define CLK_GOUT_G3D_BUSP_CLK		11
+#define CLK_GOUT_G3D_SYSREG_PCLK	12
+#define G3D_NR_CLK			13
+
 /* CMU_HSI */
 #define CLK_MOUT_HSI_BUS_USER		1
 #define CLK_MOUT_HSI_MMC_CARD_USER	2
-- 
cgit v1.2.3


From 284f6dcb50ae7f25167617c89aa2d3f93323ee67 Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Wed, 22 Feb 2023 22:21:29 -0600
Subject: dt-bindings: clock: exynos850: Add AUD and HSI main gate clocks

Add main gate clocks for controlling AUD and HSI CMUs:
  - gout_aud_cmu_aud_pclk
  - gout_hsi_cmu_hsi_pclk

While at it, add missing PPMU (Performance Profiling Monitor Unit)
clocks for CMU_HSI.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Link: https://lore.kernel.org/r/20230223042133.26551-3-semen.protsenko@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/dt-bindings/clock/exynos850.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/exynos850.h b/include/dt-bindings/clock/exynos850.h
index 8bb62e43fd60..afacba338c91 100644
--- a/include/dt-bindings/clock/exynos850.h
+++ b/include/dt-bindings/clock/exynos850.h
@@ -178,7 +178,8 @@
 #define IOCLK_AUDIOCDCLK5		58
 #define IOCLK_AUDIOCDCLK6		59
 #define TICK_USB			60
-#define AUD_NR_CLK			61
+#define CLK_GOUT_AUD_CMU_AUD_PCLK	61
+#define AUD_NR_CLK			62
 
 /* CMU_CMGP */
 #define CLK_RCO_CMGP			1
@@ -227,7 +228,10 @@
 #define CLK_GOUT_MMC_CARD_ACLK		11
 #define CLK_GOUT_MMC_CARD_SDCLKIN	12
 #define CLK_GOUT_SYSREG_HSI_PCLK	13
-#define HSI_NR_CLK			14
+#define CLK_GOUT_HSI_PPMU_ACLK		14
+#define CLK_GOUT_HSI_PPMU_PCLK		15
+#define CLK_GOUT_HSI_CMU_HSI_PCLK	16
+#define HSI_NR_CLK			17
 
 /* CMU_IS */
 #define CLK_MOUT_IS_BUS_USER		1
-- 
cgit v1.2.3


From 1661372c912d1966e21e0cb5463559984df8249b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 7 Feb 2023 17:06:51 -0500
Subject: lsm: move the program execution hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 52 -----------------------------------
 security/security.c       | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 6e156d2acffc..784e19fd665b 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,58 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for program execution operations.
- *
- * @bprm_creds_for_exec:
- *	If the setup in prepare_exec_creds did not setup @bprm->cred->security
- *	properly for executing @bprm->file, update the LSM's portion of
- *	@bprm->cred->security to be what commit_creds needs to install for the
- *	new program.  This hook may also optionally check permissions
- *	(e.g. for transitions between security domains).
- *	The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to
- *	request libc enable secure mode.
- *	@bprm contains the linux_binprm structure.
- *	Return 0 if the hook is successful and permission is granted.
- * @bprm_creds_from_file:
- *	If @file is setpcap, suid, sgid or otherwise marked to change
- *	privilege upon exec, update @bprm->cred to reflect that change.
- *	This is called after finding the binary that will be executed.
- *	without an interpreter.  This ensures that the credentials will not
- *	be derived from a script that the binary will need to reopen, which
- *	when reopend may end up being a completely different file.  This
- *	hook may also optionally check permissions (e.g. for transitions
- *	between security domains).
- *	The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to
- *	request libc enable secure mode.
- *	The hook must add to @bprm->per_clear any personality flags that
- * 	should be cleared from current->personality.
- *	@bprm contains the linux_binprm structure.
- *	Return 0 if the hook is successful and permission is granted.
- * @bprm_check_security:
- *	This hook mediates the point when a search for a binary handler will
- *	begin.  It allows a check against the @bprm->cred->security value
- *	which was set in the preceding creds_for_exec call.  The argv list and
- *	envp list are reliably available in @bprm.  This hook may be called
- *	multiple times during a single execve.
- *	@bprm contains the linux_binprm structure.
- *	Return 0 if the hook is successful and permission is granted.
- * @bprm_committing_creds:
- *	Prepare to install the new security attributes of a process being
- *	transformed by an execve operation, based on the old credentials
- *	pointed to by @current->cred and the information set in @bprm->cred by
- *	the bprm_creds_for_exec hook.  @bprm points to the linux_binprm
- *	structure.  This hook is a good place to perform state changes on the
- *	process such as closing open file descriptors to which access will no
- *	longer be granted when the attributes are changed.  This is called
- *	immediately before commit_creds().
- * @bprm_committed_creds:
- *	Tidy up after the installation of the new security attributes of a
- *	process being transformed by an execve operation.  The new credentials
- *	have, by this point, been set to @current->cred.  @bprm points to the
- *	linux_binprm structure.  This hook is a good place to perform state
- *	changes on the process such as clearing out non-inheritable signal
- *	state.  This is called immediately after commit_creds().
- *
  * Security hooks for mount using fs_context.
  *	[See also Documentation/filesystems/mount_api.rst]
  *
diff --git a/security/security.c b/security/security.c
index cf6cc576736f..1e7dbaf9ad28 100644
--- a/security/security.c
+++ b/security/security.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
  * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
  * Copyright (C) 2016 Mellanox Technologies
+ * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
  */
 
 #define pr_fmt(fmt) "LSM: " fmt
@@ -880,16 +881,61 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
 
+/**
+ * security_bprm_creds_for_exec() - Prepare the credentials for exec()
+ * @bprm: binary program information
+ *
+ * If the setup in prepare_exec_creds did not setup @bprm->cred->security
+ * properly for executing @bprm->file, update the LSM's portion of
+ * @bprm->cred->security to be what commit_creds needs to install for the new
+ * program.  This hook may also optionally check permissions (e.g. for
+ * transitions between security domains).  The hook must set @bprm->secureexec
+ * to 1 if AT_SECURE should be set to request libc enable secure mode.  @bprm
+ * contains the linux_binprm structure.
+ *
+ * Return: Returns 0 if the hook is successful and permission is granted.
+ */
 int security_bprm_creds_for_exec(struct linux_binprm *bprm)
 {
 	return call_int_hook(bprm_creds_for_exec, 0, bprm);
 }
 
+/**
+ * security_bprm_creds_from_file() - Update linux_binprm creds based on file
+ * @bprm: binary program information
+ * @file: associated file
+ *
+ * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon
+ * exec, update @bprm->cred to reflect that change. This is called after
+ * finding the binary that will be executed without an interpreter.  This
+ * ensures that the credentials will not be derived from a script that the
+ * binary will need to reopen, which when reopend may end up being a completely
+ * different file.  This hook may also optionally check permissions (e.g. for
+ * transitions between security domains).  The hook must set @bprm->secureexec
+ * to 1 if AT_SECURE should be set to request libc enable secure mode.  The
+ * hook must add to @bprm->per_clear any personality flags that should be
+ * cleared from current->personality.  @bprm contains the linux_binprm
+ * structure.
+ *
+ * Return: Returns 0 if the hook is successful and permission is granted.
+ */
 int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
 {
 	return call_int_hook(bprm_creds_from_file, 0, bprm, file);
 }
 
+/**
+ * security_bprm_check() - Mediate binary handler search
+ * @bprm: binary program information
+ *
+ * This hook mediates the point when a search for a binary handler will begin.
+ * It allows a check against the @bprm->cred->security value which was set in
+ * the preceding creds_for_exec call.  The argv list and envp list are reliably
+ * available in @bprm.  This hook may be called multiple times during a single
+ * execve.  @bprm contains the linux_binprm structure.
+ *
+ * Return: Returns 0 if the hook is successful and permission is granted.
+ */
 int security_bprm_check(struct linux_binprm *bprm)
 {
 	int ret;
@@ -900,11 +946,34 @@ int security_bprm_check(struct linux_binprm *bprm)
 	return ima_bprm_check(bprm);
 }
 
+/**
+ * security_bprm_committing_creds() - Install creds for a process during exec()
+ * @bprm: binary program information
+ *
+ * Prepare to install the new security attributes of a process being
+ * transformed by an execve operation, based on the old credentials pointed to
+ * by @current->cred and the information set in @bprm->cred by the
+ * bprm_creds_for_exec hook.  @bprm points to the linux_binprm structure.  This
+ * hook is a good place to perform state changes on the process such as closing
+ * open file descriptors to which access will no longer be granted when the
+ * attributes are changed.  This is called immediately before commit_creds().
+ */
 void security_bprm_committing_creds(struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committing_creds, bprm);
 }
 
+/**
+ * security_bprm_committed_creds() - Tidy up after cred install during exec()
+ * @bprm: binary program information
+ *
+ * Tidy up after the installation of the new security attributes of a process
+ * being transformed by an execve operation.  The new credentials have, by this
+ * point, been set to @current->cred.  @bprm points to the linux_binprm
+ * structure.  This hook is a good place to perform state changes on the
+ * process such as clearing out non-inheritable signal state.  This is called
+ * immediately after commit_creds().
+ */
 void security_bprm_committed_creds(struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committed_creds, bprm);
-- 
cgit v1.2.3


From 36819f185590c1e0e47d095f04bf5da20650fe4d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 7 Feb 2023 17:33:31 -0500
Subject: lsm: move the fs_context hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 17 -----------------
 security/security.c       | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 784e19fd665b..a37f3a380918 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,23 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for mount using fs_context.
- *	[See also Documentation/filesystems/mount_api.rst]
- *
- * @fs_context_dup:
- *	Allocate and attach a security structure to sc->security.  This pointer
- *	is initialised to NULL by the caller.
- *	@fc indicates the new filesystem context.
- *	@src_fc indicates the original filesystem context.
- *	Return 0 on success or a negative error code on failure.
- * @fs_context_parse_param:
- *	Userspace provided a parameter to configure a superblock.  The LSM may
- *	reject it with an error and may use it for itself, in which case it
- *	should return 0; otherwise it should return -ENOPARAM to pass it on to
- *	the filesystem.
- *	@fc indicates the filesystem context.
- *	@param The parameter.
- *
  * Security hooks for filesystem operations.
  *
  * @sb_alloc_security:
diff --git a/security/security.c b/security/security.c
index 1e7dbaf9ad28..64c8805570b1 100644
--- a/security/security.c
+++ b/security/security.c
@@ -979,11 +979,34 @@ void security_bprm_committed_creds(struct linux_binprm *bprm)
 	call_void_hook(bprm_committed_creds, bprm);
 }
 
+/**
+ * security_fs_context_dup() - Duplicate a fs_context LSM blob
+ * @fc: destination filesystem context
+ * @src_fc: source filesystem context
+ *
+ * Allocate and attach a security structure to sc->security.  This pointer is
+ * initialised to NULL by the caller.  @fc indicates the new filesystem context.
+ * @src_fc indicates the original filesystem context.
+ *
+ * Return: Returns 0 on success or a negative error code on failure.
+ */
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
 {
 	return call_int_hook(fs_context_dup, 0, fc, src_fc);
 }
 
+/**
+ * security_fs_context_parse_param() - Configure a filesystem context
+ * @fc: filesystem context
+ * @param: filesystem parameter
+ *
+ * Userspace provided a parameter to configure a superblock.  The LSM can
+ * consume the parameter or return it to the caller for use elsewhere.
+ *
+ * Return: If the parameter is used by the LSM it should return 0, if it is
+ *         returned to the caller -ENOPARAM is returned, otherwise a negative
+ *         error code is returned.
+ */
 int security_fs_context_parse_param(struct fs_context *fc,
 				    struct fs_parameter *param)
 {
-- 
cgit v1.2.3


From 08526a902cc4bc57bb160e5b1114e67a56fc9280 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 7 Feb 2023 17:44:01 -0500
Subject: lsm: move the filesystem hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 106 ---------------------------
 security/security.c       | 181 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a37f3a380918..3cd6e7abe0df 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,112 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for filesystem operations.
- *
- * @sb_alloc_security:
- *	Allocate and attach a security structure to the sb->s_security field.
- *	The s_security field is initialized to NULL when the structure is
- *	allocated.
- *	@sb contains the super_block structure to be modified.
- *	Return 0 if operation was successful.
- * @sb_delete:
- *	Release objects tied to a superblock (e.g. inodes).
- *	@sb contains the super_block structure being released.
- * @sb_free_security:
- *	Deallocate and clear the sb->s_security field.
- *	@sb contains the super_block structure to be modified.
- * @sb_free_mnt_opts:
- * 	Free memory associated with @mnt_ops.
- * @sb_eat_lsm_opts:
- * 	Eat (scan @orig options) and save them in @mnt_opts.
- *	Return 0 on success, negative values on failure.
- * @sb_statfs:
- *	Check permission before obtaining filesystem statistics for the @mnt
- *	mountpoint.
- *	@dentry is a handle on the superblock for the filesystem.
- *	Return 0 if permission is granted.
- * @sb_mount:
- *	Check permission before an object specified by @dev_name is mounted on
- *	the mount point named by @nd.  For an ordinary mount, @dev_name
- *	identifies a device if the file system type requires a device.  For a
- *	remount (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a
- *	loopback/bind mount (@flags & MS_BIND), @dev_name identifies the
- *	pathname of the object being mounted.
- *	@dev_name contains the name for object being mounted.
- *	@path contains the path for mount point object.
- *	@type contains the filesystem type.
- *	@flags contains the mount flags.
- *	@data contains the filesystem-specific data.
- *	Return 0 if permission is granted.
- * @sb_mnt_opts_compat:
- *	Determine if the new mount options in @mnt_opts are allowed given
- *	the existing mounted filesystem at @sb.
- *	@sb superblock being compared.
- *	@mnt_opts new mount options.
- *	Return 0 if options are compatible.
- * @sb_remount:
- *	Extracts security system specific mount options and verifies no changes
- *	are being made to those options.
- *	@sb superblock being remounted.
- *	@data contains the filesystem-specific data.
- *	Return 0 if permission is granted.
- * @sb_kern_mount:
- *	Mount this @sb if allowed by permissions.
- *	Return 0 if permission is granted.
- * @sb_show_options:
- * 	Show (print on @m) mount options for this @sb.
- *	Return 0 on success, negative values on failure.
- * @sb_umount:
- *	Check permission before the @mnt file system is unmounted.
- *	@mnt contains the mounted file system.
- *	@flags contains the unmount flags, e.g. MNT_FORCE.
- *	Return 0 if permission is granted.
- * @sb_pivotroot:
- *	Check permission before pivoting the root filesystem.
- *	@old_path contains the path for the new location of the
- *	current root (put_old).
- *	@new_path contains the path for the new root (new_root).
- *	Return 0 if permission is granted.
- * @sb_set_mnt_opts:
- *	Set the security relevant mount options used for a superblock
- *	@sb the superblock to set security mount options for.
- *	@opts binary data structure containing all lsm mount data.
- *	Return 0 on success, error on failure.
- * @sb_clone_mnt_opts:
- *	Copy all security options from a given superblock to another
- *	@oldsb old superblock which contain information to clone.
- *	@newsb new superblock which needs filled in.
- *	Return 0 on success, error on failure.
- * @move_mount:
- *	Check permission before a mount is moved.
- *	@from_path indicates the mount that is going to be moved.
- *	@to_path indicates the mountpoint that will be mounted upon.
- *	Return 0 if permission is granted.
- * @dentry_init_security:
- *	Compute a context for a dentry as the inode is not yet available
- *	since NFSv4 has no label backed by an EA anyway.
- *	@dentry dentry to use in calculating the context.
- *	@mode mode used to determine resource type.
- *	@name name of the last path component used to create file.
- *	@xattr_name pointer to place the pointer to security xattr name.
- *		    Caller does not have to free the resulting pointer. Its
- *		    a pointer to static string.
- *	@ctx pointer to place the pointer to the resulting context in.
- *	@ctxlen point to place the length of the resulting context.
- *	Return 0 on success, negative values on failure.
- * @dentry_create_files_as:
- *	Compute a context for a dentry as the inode is not yet available
- *	and set that context in passed in creds so that new files are
- *	created using that context. Context is calculated using the
- *	passed in creds and not the creds of the caller.
- *	@dentry dentry to use in calculating the context.
- *	@mode mode used to determine resource type.
- *	@name name of the last path component used to create file.
- *	@old creds which should be used for context calculation.
- *	@new creds to modify.
- *	Return 0 on success, error on failure.
- *
- *
  * Security hooks for inode operations.
  *
  * @inode_alloc_security:
diff --git a/security/security.c b/security/security.c
index 64c8805570b1..f1ab64f390bf 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1025,6 +1025,16 @@ int security_fs_context_parse_param(struct fs_context *fc,
 	return rc;
 }
 
+/**
+ * security_sb_alloc() - Allocate a super_block LSM blob
+ * @sb: filesystem superblock
+ *
+ * Allocate and attach a security structure to the sb->s_security field.  The
+ * s_security field is initialized to NULL when the structure is allocated.
+ * @sb contains the super_block structure to be modified.
+ *
+ * Return: Returns 0 if operation was successful.
+ */
 int security_sb_alloc(struct super_block *sb)
 {
 	int rc = lsm_superblock_alloc(sb);
@@ -1037,11 +1047,25 @@ int security_sb_alloc(struct super_block *sb)
 	return rc;
 }
 
+/**
+ * security_sb_delete() - Release super_block LSM associated objects
+ * @sb: filesystem superblock
+ *
+ * Release objects tied to a superblock (e.g. inodes).  @sb contains the
+ * super_block structure being released.
+ */
 void security_sb_delete(struct super_block *sb)
 {
 	call_void_hook(sb_delete, sb);
 }
 
+/**
+ * security_sb_free() - Free a super_block LSM blob
+ * @sb: filesystem superblock
+ *
+ * Deallocate and clear the sb->s_security field.  @sb contains the super_block
+ * structure to be modified.
+ */
 void security_sb_free(struct super_block *sb)
 {
 	call_void_hook(sb_free_security, sb);
@@ -1049,6 +1073,12 @@ void security_sb_free(struct super_block *sb)
 	sb->s_security = NULL;
 }
 
+/**
+ * security_free_mnt_opts() - Free memory associated with mount options
+ * @mnt_ops: LSM processed mount options
+ *
+ * Free memory associated with @mnt_ops.
+ */
 void security_free_mnt_opts(void **mnt_opts)
 {
 	if (!*mnt_opts)
@@ -1058,12 +1088,31 @@ void security_free_mnt_opts(void **mnt_opts)
 }
 EXPORT_SYMBOL(security_free_mnt_opts);
 
+/**
+ * security_sb_eat_lsm_opts() - Consume LSM mount options
+ * @options: mount options
+ * @mnt_ops: LSM processed mount options
+ *
+ * Eat (scan @options) and save them in @mnt_opts.
+ *
+ * Return: Returns 0 on success, negative values on failure.
+ */
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
 {
 	return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts);
 }
 EXPORT_SYMBOL(security_sb_eat_lsm_opts);
 
+/**
+ * security_sb_mnt_opts_compat() - Check if new mount options are allowed
+ * @sb: filesystem superblock
+ * @mnt_opts: new mount options
+ *
+ * Determine if the new mount options in @mnt_opts are allowed given the
+ * existing mounted filesystem at @sb.  @sb superblock being compared.
+ *
+ * Return: Returns 0 if options are compatible.
+ */
 int security_sb_mnt_opts_compat(struct super_block *sb,
 				void *mnt_opts)
 {
@@ -1071,6 +1120,16 @@ int security_sb_mnt_opts_compat(struct super_block *sb,
 }
 EXPORT_SYMBOL(security_sb_mnt_opts_compat);
 
+/**
+ * security_sb_remount() - Verify no incompatible mount changes during remount
+ * @sb: filesystem superblock
+ * @mnt_opts: (re)mount options
+ *
+ * Extracts security system specific mount options and verifies no changes are
+ * being made to those options.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_remount(struct super_block *sb,
 			void *mnt_opts)
 {
@@ -1078,37 +1137,109 @@ int security_sb_remount(struct super_block *sb,
 }
 EXPORT_SYMBOL(security_sb_remount);
 
+/**
+ * security_sb_kern_mount() - Check if a kernel mount is allowed
+ * @sb: filesystem superblock
+ *
+ * Mount this @sb if allowed by permissions.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_kern_mount(struct super_block *sb)
 {
 	return call_int_hook(sb_kern_mount, 0, sb);
 }
 
+/**
+ * security_sb_show_options() - Output the mount options for a superblock
+ * @m: output file
+ * @sb: filesystem superblock
+ *
+ * Show (print on @m) mount options for this @sb.
+ *
+ * Return: Returns 0 on success, negative values on failure.
+ */
 int security_sb_show_options(struct seq_file *m, struct super_block *sb)
 {
 	return call_int_hook(sb_show_options, 0, m, sb);
 }
 
+/**
+ * security_sb_statfs() - Check if accessing fs stats is allowed
+ * @dentry: superblock handle
+ *
+ * Check permission before obtaining filesystem statistics for the @mnt
+ * mountpoint.  @dentry is a handle on the superblock for the filesystem.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_statfs(struct dentry *dentry)
 {
 	return call_int_hook(sb_statfs, 0, dentry);
 }
 
+/**
+ * security_sb_mount() - Check permission for mounting a filesystem
+ * @dev_name: filesystem backing device
+ * @path: mount point
+ * @type: filesystem type
+ * @flags: mount flags
+ * @data: filesystem specific data
+ *
+ * Check permission before an object specified by @dev_name is mounted on the
+ * mount point named by @nd.  For an ordinary mount, @dev_name identifies a
+ * device if the file system type requires a device.  For a remount
+ * (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a loopback/bind mount
+ * (@flags & MS_BIND), @dev_name identifies the	pathname of the object being
+ * mounted.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_mount(const char *dev_name, const struct path *path,
                        const char *type, unsigned long flags, void *data)
 {
 	return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data);
 }
 
+/**
+ * security_sb_umount() - Check permission for unmounting a filesystem
+ * @mnt: mounted filesystem
+ * @flags: unmount flags
+ *
+ * Check permission before the @mnt file system is unmounted.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_umount(struct vfsmount *mnt, int flags)
 {
 	return call_int_hook(sb_umount, 0, mnt, flags);
 }
 
+/**
+ * security_sb_pivotroot() - Check permissions for pivoting the rootfs
+ * @old_path: new location for current rootfs
+ * @new_path: location of the new rootfs
+ *
+ * Check permission before pivoting the root filesystem.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sb_pivotroot(const struct path *old_path, const struct path *new_path)
 {
 	return call_int_hook(sb_pivotroot, 0, old_path, new_path);
 }
 
+/**
+ * security_sb_set_mnt_opts() - Set the mount options for a filesystem
+ * @sb: filesystem superblock
+ * @mnt_opts: binary mount options
+ * @kern_flags: kernel flags (in)
+ * @set_kern_flags: kernel flags (out)
+ *
+ * Set the security relevant mount options used for a superblock.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_sb_set_mnt_opts(struct super_block *sb,
 				void *mnt_opts,
 				unsigned long kern_flags,
@@ -1120,6 +1251,17 @@ int security_sb_set_mnt_opts(struct super_block *sb,
 }
 EXPORT_SYMBOL(security_sb_set_mnt_opts);
 
+/**
+ * security_sb_clone_mnt_opts() - Duplicate superblock mount options
+ * @olddb: source superblock
+ * @newdb: destination superblock
+ * @kern_flags: kernel flags (in)
+ * @set_kern_flags: kernel flags (out)
+ *
+ * Copy all security options from a given superblock to another.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_sb_clone_mnt_opts(const struct super_block *oldsb,
 				struct super_block *newsb,
 				unsigned long kern_flags,
@@ -1130,6 +1272,15 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb,
 }
 EXPORT_SYMBOL(security_sb_clone_mnt_opts);
 
+/**
+ * security_move_mount() - Check permissions for moving a mount
+ * @from_path: source mount point
+ * @to_path: destination mount point
+ *
+ * Check permission before a mount is moved.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_move_mount(const struct path *from_path, const struct path *to_path)
 {
 	return call_int_hook(move_mount, 0, from_path, to_path);
@@ -1179,6 +1330,21 @@ void security_inode_free(struct inode *inode)
 				inode_free_by_rcu);
 }
 
+/**
+ * security_dentry_init_security() - Perform dentry initialization
+ * @dentry: the dentry to initialize
+ * @mode: mode used to determine resource type
+ * @name: name of the last path component
+ * @xattr_name: name of the security/LSM xattr
+ * @ctx: pointer to the resulting LSM context
+ * @ctxlen: length of @ctx
+ *
+ * Compute a context for a dentry as the inode is not yet available since NFSv4
+ * has no label backed by an EA anyway.  It is important to note that
+ * @xattr_name does not need to be free'd by the caller, it is a static string.
+ *
+ * Return: Returns 0 on success, negative values on failure.
+ */
 int security_dentry_init_security(struct dentry *dentry, int mode,
 				  const struct qstr *name,
 				  const char **xattr_name, void **ctx,
@@ -1200,6 +1366,21 @@ int security_dentry_init_security(struct dentry *dentry, int mode,
 }
 EXPORT_SYMBOL(security_dentry_init_security);
 
+/**
+ * security_dentry_create_files_as() - Perform dentry initialization
+ * @dentry: the dentry to initialize
+ * @mode: mode used to determine resource type
+ * @name: name of the last path component
+ * @old: creds to use for LSM context calculations
+ * @new: creds to modify
+ *
+ * Compute a context for a dentry as the inode is not yet available and set
+ * that context in passed in creds so that new files are created using that
+ * context. Context is calculated using the passed in creds and not the creds
+ * of the caller.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_dentry_create_files_as(struct dentry *dentry, int mode,
 				    struct qstr *name,
 				    const struct cred *old, struct cred *new)
-- 
cgit v1.2.3


From 916e32584dfaff068792db70c89936801b476ad0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 8 Feb 2023 16:31:55 -0500
Subject: lsm: move the inode hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 303 ---------------------------
 security/security.c       | 524 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 524 insertions(+), 303 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3cd6e7abe0df..3cdd58424796 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,309 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for inode operations.
- *
- * @inode_alloc_security:
- *	Allocate and attach a security structure to @inode->i_security.  The
- *	i_security field is initialized to NULL when the inode structure is
- *	allocated.
- *	@inode contains the inode structure.
- *	Return 0 if operation was successful.
- * @inode_free_security:
- *	@inode contains the inode structure.
- *	Deallocate the inode security structure and set @inode->i_security to
- *	NULL.
- * @inode_init_security:
- *	Obtain the security attribute name suffix and value to set on a newly
- *	created inode and set up the incore security field for the new inode.
- *	This hook is called by the fs code as part of the inode creation
- *	transaction and provides for atomic labeling of the inode, unlike
- *	the post_create/mkdir/... hooks called by the VFS.  The hook function
- *	is expected to allocate the name and value via kmalloc, with the caller
- *	being responsible for calling kfree after using them.
- *	If the security module does not use security attributes or does
- *	not wish to put a security attribute on this particular inode,
- *	then it should return -EOPNOTSUPP to skip this processing.
- *	@inode contains the inode structure of the newly created inode.
- *	@dir contains the inode structure of the parent directory.
- *	@qstr contains the last path component of the new object.
- *	@name will be set to the allocated name suffix (e.g. selinux).
- *	@value will be set to the allocated attribute value.
- *	@len will be set to the length of the value.
- *	Returns 0 if @name and @value have been successfully set,
- *	-EOPNOTSUPP if no security attribute is needed, or
- *	-ENOMEM on memory allocation failure.
- * @inode_init_security_anon:
- *      Set up the incore security field for the new anonymous inode
- *      and return whether the inode creation is permitted by the security
- *      module or not.
- *      @inode contains the inode structure.
- *      @name name of the anonymous inode class.
- *      @context_inode optional related inode.
- *	Returns 0 on success, -EACCES if the security module denies the
- *	creation of this inode, or another -errno upon other errors.
- * @inode_create:
- *	Check permission to create a regular file.
- *	@dir contains inode structure of the parent of the new file.
- *	@dentry contains the dentry structure for the file to be created.
- *	@mode contains the file mode of the file to be created.
- *	Return 0 if permission is granted.
- * @inode_link:
- *	Check permission before creating a new hard link to a file.
- *	@old_dentry contains the dentry structure for an existing
- *	link to the file.
- *	@dir contains the inode structure of the parent directory
- *	of the new link.
- *	@new_dentry contains the dentry structure for the new link.
- *	Return 0 if permission is granted.
- * @path_link:
- *	Check permission before creating a new hard link to a file.
- *	@old_dentry contains the dentry structure for an existing link
- *	to the file.
- *	@new_dir contains the path structure of the parent directory of
- *	the new link.
- *	@new_dentry contains the dentry structure for the new link.
- *	Return 0 if permission is granted.
- * @inode_unlink:
- *	Check the permission to remove a hard link to a file.
- *	@dir contains the inode structure of parent directory of the file.
- *	@dentry contains the dentry structure for file to be unlinked.
- *	Return 0 if permission is granted.
- * @path_unlink:
- *	Check the permission to remove a hard link to a file.
- *	@dir contains the path structure of parent directory of the file.
- *	@dentry contains the dentry structure for file to be unlinked.
- *	Return 0 if permission is granted.
- * @inode_symlink:
- *	Check the permission to create a symbolic link to a file.
- *	@dir contains the inode structure of parent directory of
- *	the symbolic link.
- *	@dentry contains the dentry structure of the symbolic link.
- *	@old_name contains the pathname of file.
- *	Return 0 if permission is granted.
- * @path_symlink:
- *	Check the permission to create a symbolic link to a file.
- *	@dir contains the path structure of parent directory of
- *	the symbolic link.
- *	@dentry contains the dentry structure of the symbolic link.
- *	@old_name contains the pathname of file.
- *	Return 0 if permission is granted.
- * @inode_mkdir:
- *	Check permissions to create a new directory in the existing directory
- *	associated with inode structure @dir.
- *	@dir contains the inode structure of parent of the directory
- *	to be created.
- *	@dentry contains the dentry structure of new directory.
- *	@mode contains the mode of new directory.
- *	Return 0 if permission is granted.
- * @path_mkdir:
- *	Check permissions to create a new directory in the existing directory
- *	associated with path structure @path.
- *	@dir contains the path structure of parent of the directory
- *	to be created.
- *	@dentry contains the dentry structure of new directory.
- *	@mode contains the mode of new directory.
- *	Return 0 if permission is granted.
- * @inode_rmdir:
- *	Check the permission to remove a directory.
- *	@dir contains the inode structure of parent of the directory
- *	to be removed.
- *	@dentry contains the dentry structure of directory to be removed.
- *	Return 0 if permission is granted.
- * @path_rmdir:
- *	Check the permission to remove a directory.
- *	@dir contains the path structure of parent of the directory to be
- *	removed.
- *	@dentry contains the dentry structure of directory to be removed.
- *	Return 0 if permission is granted.
- * @inode_mknod:
- *	Check permissions when creating a special file (or a socket or a fifo
- *	file created via the mknod system call).  Note that if mknod operation
- *	is being done for a regular file, then the create hook will be called
- *	and not this hook.
- *	@dir contains the inode structure of parent of the new file.
- *	@dentry contains the dentry structure of the new file.
- *	@mode contains the mode of the new file.
- *	@dev contains the device number.
- *	Return 0 if permission is granted.
- * @path_mknod:
- *	Check permissions when creating a file. Note that this hook is called
- *	even if mknod operation is being done for a regular file.
- *	@dir contains the path structure of parent of the new file.
- *	@dentry contains the dentry structure of the new file.
- *	@mode contains the mode of the new file.
- *	@dev contains the undecoded device number. Use new_decode_dev() to get
- *	the decoded device number.
- *	Return 0 if permission is granted.
- * @inode_rename:
- *	Check for permission to rename a file or directory.
- *	@old_dir contains the inode structure for parent of the old link.
- *	@old_dentry contains the dentry structure of the old link.
- *	@new_dir contains the inode structure for parent of the new link.
- *	@new_dentry contains the dentry structure of the new link.
- *	Return 0 if permission is granted.
- * @path_rename:
- *	Check for permission to rename a file or directory.
- *	@old_dir contains the path structure for parent of the old link.
- *	@old_dentry contains the dentry structure of the old link.
- *	@new_dir contains the path structure for parent of the new link.
- *	@new_dentry contains the dentry structure of the new link.
- *	@flags may contain rename options such as RENAME_EXCHANGE.
- *	Return 0 if permission is granted.
- * @path_chmod:
- *	Check for permission to change a mode of the file @path. The new
- *	mode is specified in @mode.
- *	@path contains the path structure of the file to change the mode.
- *	@mode contains the new DAC's permission, which is a bitmask of
- *	constants from <include/uapi/linux/stat.h>.
- *	Return 0 if permission is granted.
- * @path_chown:
- *	Check for permission to change owner/group of a file or directory.
- *	@path contains the path structure.
- *	@uid contains new owner's ID.
- *	@gid contains new group's ID.
- *	Return 0 if permission is granted.
- * @path_chroot:
- *	Check for permission to change root directory.
- *	@path contains the path structure.
- *	Return 0 if permission is granted.
- * @path_notify:
- *	Check permissions before setting a watch on events as defined by @mask,
- *	on an object at @path, whose type is defined by @obj_type.
- *	Return 0 if permission is granted.
- * @inode_readlink:
- *	Check the permission to read the symbolic link.
- *	@dentry contains the dentry structure for the file link.
- *	Return 0 if permission is granted.
- * @inode_follow_link:
- *	Check permission to follow a symbolic link when looking up a pathname.
- *	@dentry contains the dentry structure for the link.
- *	@inode contains the inode, which itself is not stable in RCU-walk.
- *	@rcu indicates whether we are in RCU-walk mode.
- *	Return 0 if permission is granted.
- * @inode_permission:
- *	Check permission before accessing an inode.  This hook is called by the
- *	existing Linux permission function, so a security module can use it to
- *	provide additional checking for existing Linux permission checks.
- *	Notice that this hook is called when a file is opened (as well as many
- *	other operations), whereas the file_security_ops permission hook is
- *	called when the actual read/write operations are performed.
- *	@inode contains the inode structure to check.
- *	@mask contains the permission mask.
- *	Return 0 if permission is granted.
- * @inode_setattr:
- *	Check permission before setting file attributes.  Note that the kernel
- *	call to notify_change is performed from several locations, whenever
- *	file attributes change (such as when a file is truncated, chown/chmod
- *	operations, transferring disk quotas, etc).
- *	@dentry contains the dentry structure for the file.
- *	@attr is the iattr structure containing the new file attributes.
- *	Return 0 if permission is granted.
- * @path_truncate:
- *	Check permission before truncating the file indicated by path.
- *	Note that truncation permissions may also be checked based on
- *	already opened files, using the @file_truncate hook.
- *	@path contains the path structure for the file.
- *	Return 0 if permission is granted.
- * @inode_getattr:
- *	Check permission before obtaining file attributes.
- *	@path contains the path structure for the file.
- *	Return 0 if permission is granted.
- * @inode_setxattr:
- *	Check permission before setting the extended attributes
- *	@value identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_post_setxattr:
- *	Update inode security field after successful setxattr operation.
- *	@value identified by @name for @dentry.
- * @inode_getxattr:
- *	Check permission before obtaining the extended attributes
- *	identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_listxattr:
- *	Check permission before obtaining the list of extended attribute
- *	names for @dentry.
- *	Return 0 if permission is granted.
- * @inode_removexattr:
- *	Check permission before removing the extended attribute
- *	identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_set_acl:
- *	Check permission before setting posix acls
- *	The posix acls in @kacl are identified by @acl_name.
- *	Return 0 if permission is granted.
- * @inode_get_acl:
- *	Check permission before getting osix acls
- *	The posix acls are identified by @acl_name.
- *	Return 0 if permission is granted.
- * @inode_remove_acl:
- *	Check permission before removing posix acls
- *	The posix acls are identified by @acl_name.
- *	Return 0 if permission is granted.
- * @inode_getsecurity:
- *	Retrieve a copy of the extended attribute representation of the
- *	security label associated with @name for @inode via @buffer.  Note that
- *	@name is the remainder of the attribute name after the security prefix
- *	has been removed. @alloc is used to specify if the call should return a
- *	value via the buffer or just the value length.
- *	Return size of buffer on success.
- * @inode_setsecurity:
- *	Set the security label associated with @name for @inode from the
- *	extended attribute value @value.  @size indicates the size of the
- *	@value in bytes.  @flags may be XATTR_CREATE, XATTR_REPLACE, or 0.
- *	Note that @name is the remainder of the attribute name after the
- *	security. prefix has been removed.
- *	Return 0 on success.
- * @inode_listsecurity:
- *	Copy the extended attribute names for the security labels
- *	associated with @inode into @buffer.  The maximum size of @buffer
- *	is specified by @buffer_size.  @buffer may be NULL to request
- *	the size of the buffer required.
- *	Returns number of bytes used/required on success.
- * @inode_need_killpriv:
- *	Called when an inode has been changed.
- *	@dentry is the dentry being changed.
- *	Return <0 on error to abort the inode change operation.
- *	Return 0 if inode_killpriv does not need to be called.
- *	Return >0 if inode_killpriv does need to be called.
- * @inode_killpriv:
- *	The setuid bit is being removed.  Remove similar security labels.
- *	Called with the dentry->d_inode->i_mutex held.
- *	@idmap: idmap of the mount.
- *	@dentry is the dentry being changed.
- *	Return 0 on success.  If error is returned, then the operation
- *	causing setuid bit removal is failed.
- * @inode_getsecid:
- *	Get the secid associated with the node.
- *	@inode contains a pointer to the inode.
- *	@secid contains a pointer to the location where result will be saved.
- *	In case of failure, @secid will be set to zero.
- * @inode_copy_up:
- *	A file is about to be copied up from lower layer to upper layer of
- *	overlay filesystem. Security module can prepare a set of new creds
- *	and modify as need be and return new creds. Caller will switch to
- *	new creds temporarily to create new file and release newly allocated
- *	creds.
- *	@src indicates the union dentry of file that is being copied up.
- *	@new pointer to pointer to return newly allocated creds.
- *	Returns 0 on success or a negative error code on error.
- * @inode_copy_up_xattr:
- *	Filter the xattrs being copied up when a unioned file is copied
- *	up from a lower layer to the union/overlay layer.
- *	@name indicates the name of the xattr.
- *	Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP if
- *	security module does not know about attribute or a negative error code
- *	to abort the copy up. Note that the caller is responsible for reading
- *	and writing the xattrs as this hook is merely a filter.
- * @d_instantiate:
- *	Fill in @inode security information for a @dentry if allowed.
- * @getprocattr:
- *	Read attribute @name for process @p and store it into @value if allowed.
- *	Return the length of @value on success, a negative value otherwise.
- * @setprocattr:
- *	Write (set) attribute @name to @value, size @size if allowed.
- *	Return written bytes on success, a negative value otherwise.
- *
  * Security hooks for kernfs node operations
  *
  * @kernfs_init_security:
diff --git a/security/security.c b/security/security.c
index f1ab64f390bf..bd35854f956a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1286,12 +1286,33 @@ int security_move_mount(const struct path *from_path, const struct path *to_path
 	return call_int_hook(move_mount, 0, from_path, to_path);
 }
 
+/**
+ * security_path_notify() - Check if setting a watch is allowed
+ * @path: file path
+ * @mask: event mask
+ * @obj_type: file path type
+ *
+ * Check permissions before setting a watch on events as defined by @mask, on
+ * an object at @path, whose type is defined by @obj_type.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_notify(const struct path *path, u64 mask,
 				unsigned int obj_type)
 {
 	return call_int_hook(path_notify, 0, path, mask, obj_type);
 }
 
+/**
+ * security_inode_alloc() - Allocate an inode LSM blob
+ * @inode: the inode
+ *
+ * Allocate and attach a security structure to @inode->i_security.  The
+ * i_security field is initialized to NULL when the inode structure is
+ * allocated.
+ *
+ * Return: Return 0 if operation was successful.
+ */
 int security_inode_alloc(struct inode *inode)
 {
 	int rc = lsm_inode_alloc(inode);
@@ -1312,6 +1333,12 @@ static void inode_free_by_rcu(struct rcu_head *head)
 	kmem_cache_free(lsm_inode_cache, head);
 }
 
+/**
+ * security_inode_free() - Free an inode's LSM blob
+ * @inode: the inode
+ *
+ * Deallocate the inode security structure and set @inode->i_security to NULL.
+ */
 void security_inode_free(struct inode *inode)
 {
 	integrity_inode_free(inode);
@@ -1390,6 +1417,27 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode,
 }
 EXPORT_SYMBOL(security_dentry_create_files_as);
 
+/**
+ * security_inode_init_security() - Initialize an inode's LSM context
+ * @inode: the inode
+ * @dir: parent directory
+ * @qstr: last component of the pathname
+ * @initxattrs: callback function to write xattrs
+ * @fs_data: filesystem specific data
+ *
+ * Obtain the security attribute name suffix and value to set on a newly
+ * created inode and set up the incore security field for the new inode.  This
+ * hook is called by the fs code as part of the inode creation transaction and
+ * provides for atomic labeling of the inode, unlike the post_create/mkdir/...
+ * hooks called by the VFS.  The hook function is expected to allocate the name
+ * and value via kmalloc, with the caller being responsible for calling kfree
+ * after using them.  If the security module does not use security attributes
+ * or does not wish to put a security attribute on this particular inode, then
+ * it should return -EOPNOTSUPP to skip this processing.
+ *
+ * Return: Returns 0 on success, -EOPNOTSUPP if no security attribute is
+ * needed, or -ENOMEM on memory allocation failure.
+ */
 int security_inode_init_security(struct inode *inode, struct inode *dir,
 				 const struct qstr *qstr,
 				 const initxattrs initxattrs, void *fs_data)
@@ -1425,6 +1473,18 @@ out:
 }
 EXPORT_SYMBOL(security_inode_init_security);
 
+/**
+ * security_inode_init_security_anon() - Initialize an anonymous inode
+ * @inode: the inode
+ * @name: the anonymous inode class
+ * @context_inode: an optional related inode
+ *
+ * Set up the incore security field for the new anonymous inode and return
+ * whether the inode creation is permitted by the security module or not.
+ *
+ * Return: Returns 0 on success, -EACCES if the security module denies the
+ * creation of this inode, or another -errno upon other errors.
+ */
 int security_inode_init_security_anon(struct inode *inode,
 				      const struct qstr *name,
 				      const struct inode *context_inode)
@@ -1445,6 +1505,18 @@ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
 EXPORT_SYMBOL(security_old_inode_init_security);
 
 #ifdef CONFIG_SECURITY_PATH
+/**
+ * security_path_mknod() - Check if creating a special file is allowed
+ * @dir: parent directory
+ * @dentry: new file
+ * @mode: new file mode
+ * @dev: device number
+ *
+ * Check permissions when creating a file. Note that this hook is called even
+ * if mknod operation is being done for a regular file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
 			unsigned int dev)
 {
@@ -1454,6 +1526,16 @@ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t m
 }
 EXPORT_SYMBOL(security_path_mknod);
 
+/**
+ * security_path_mkdir() - Check if creating a new directory is allowed
+ * @dir: parent directory
+ * @dentry: new directory
+ * @mode: new directory mode
+ *
+ * Check permissions to create a new directory in the existing directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
@@ -1462,6 +1544,15 @@ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t m
 }
 EXPORT_SYMBOL(security_path_mkdir);
 
+/**
+ * security_path_rmdir() - Check if removing a directory is allowed
+ * @dir: parent directory
+ * @dentry: directory to remove
+ *
+ * Check the permission to remove a directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_rmdir(const struct path *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
@@ -1469,6 +1560,15 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry)
 	return call_int_hook(path_rmdir, 0, dir, dentry);
 }
 
+/**
+ * security_path_unlink() - Check if removing a hard link is allowed
+ * @dir: parent directory
+ * @dentry: file
+ *
+ * Check the permission to remove a hard link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_unlink(const struct path *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
@@ -1477,6 +1577,16 @@ int security_path_unlink(const struct path *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL(security_path_unlink);
 
+/**
+ * security_path_symlink() - Check if creating a symbolic link is allowed
+ * @dir: parent directory
+ * @dentry: symbolic link
+ * @old_name: file pathname
+ *
+ * Check the permission to create a symbolic link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_symlink(const struct path *dir, struct dentry *dentry,
 			  const char *old_name)
 {
@@ -1485,6 +1595,16 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry,
 	return call_int_hook(path_symlink, 0, dir, dentry, old_name);
 }
 
+/**
+ * security_path_link - Check if creating a hard link is allowed
+ * @old_dentry: existing file
+ * @new_dir: new parent directory
+ * @new_dentry: new link
+ *
+ * Check permission before creating a new hard link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 		       struct dentry *new_dentry)
 {
@@ -1493,6 +1613,18 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 	return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry);
 }
 
+/**
+ * security_path_rename() - Check if renaming a file is allowed
+ * @old_dir: parent directory of the old file
+ * @old_dentry: the old file
+ * @new_dir: parent directory of the new file
+ * @new_dentry: the new file
+ * @flags: flags
+ *
+ * Check for permission to rename a file or directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
 			 const struct path *new_dir, struct dentry *new_dentry,
 			 unsigned int flags)
@@ -1506,6 +1638,16 @@ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
 }
 EXPORT_SYMBOL(security_path_rename);
 
+/**
+ * security_path_truncate() - Check if truncating a file is allowed
+ * @path: file
+ *
+ * Check permission before truncating the file indicated by path.  Note that
+ * truncation permissions may also be checked based on already opened files,
+ * using the security_file_truncate() hook.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_truncate(const struct path *path)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
@@ -1513,6 +1655,17 @@ int security_path_truncate(const struct path *path)
 	return call_int_hook(path_truncate, 0, path);
 }
 
+/**
+ * security_path_chmod() - Check if changing the file's mode is allowed
+ * @path: file
+ * @mode: new mode
+ *
+ * Check for permission to change a mode of the file @path. The new mode is
+ * specified in @mode which is a bitmask of constants from
+ * <include/uapi/linux/stat.h>.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_chmod(const struct path *path, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
@@ -1520,6 +1673,16 @@ int security_path_chmod(const struct path *path, umode_t mode)
 	return call_int_hook(path_chmod, 0, path, mode);
 }
 
+/**
+ * security_path_chown() - Check if changing the file's owner/group is allowed
+ * @path: file
+ * @uid: file owner
+ * @gid: file group
+ *
+ * Check for permission to change owner/group of a file or directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
@@ -1527,12 +1690,30 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
 
+/**
+ * security_path_chroot() - Check if changing the root directory is allowed
+ * @path: directory
+ *
+ * Check for permission to change root directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_path_chroot(const struct path *path)
 {
 	return call_int_hook(path_chroot, 0, path);
 }
 #endif
 
+/**
+ * security_inode_create() - Check if creating a file is allowed
+ * @dir: the parent directory
+ * @dentry: the file being created
+ * @mode: requested file mode
+ *
+ * Check permission to create a regular file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(dir)))
@@ -1541,6 +1722,16 @@ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode
 }
 EXPORT_SYMBOL_GPL(security_inode_create);
 
+/**
+ * security_inode_link() - Check if creating a hard link is allowed
+ * @old_dentry: existing file
+ * @dir: new parent directory
+ * @new_dentry: new link
+ *
+ * Check permission before creating a new hard link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_link(struct dentry *old_dentry, struct inode *dir,
 			 struct dentry *new_dentry)
 {
@@ -1549,6 +1740,15 @@ int security_inode_link(struct dentry *old_dentry, struct inode *dir,
 	return call_int_hook(inode_link, 0, old_dentry, dir, new_dentry);
 }
 
+/**
+ * security_inode_unlink() - Check if removing a hard link is allowed
+ * @dir: parent directory
+ * @dentry: file
+ *
+ * Check the permission to remove a hard link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
@@ -1556,6 +1756,16 @@ int security_inode_unlink(struct inode *dir, struct dentry *dentry)
 	return call_int_hook(inode_unlink, 0, dir, dentry);
 }
 
+/**
+ * security_inode_symlink() Check if creating a symbolic link is allowed
+ * @dir: parent directory
+ * @dentry: symbolic link
+ * @old_name: existing filename
+ *
+ * Check the permission to create a symbolic link to a file.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_symlink(struct inode *dir, struct dentry *dentry,
 			    const char *old_name)
 {
@@ -1564,6 +1774,17 @@ int security_inode_symlink(struct inode *dir, struct dentry *dentry,
 	return call_int_hook(inode_symlink, 0, dir, dentry, old_name);
 }
 
+/**
+ * security_inode_mkdir() - Check if creation a new director is allowed
+ * @dir: parent directory
+ * @dentry: new directory
+ * @mode: new directory mode
+ *
+ * Check permissions to create a new directory in the existing directory
+ * associated with inode structure @dir.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(dir)))
@@ -1572,6 +1793,15 @@ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 }
 EXPORT_SYMBOL_GPL(security_inode_mkdir);
 
+/**
+ * security_inode_rmdir() - Check if removing a directory is allowed
+ * @dir: parent directory
+ * @dentry: directory to be removed
+ *
+ * Check the permission to remove a directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
@@ -1579,6 +1809,20 @@ int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
 	return call_int_hook(inode_rmdir, 0, dir, dentry);
 }
 
+/**
+ * security_inode_mknod() - Check if creating a special file is allowed
+ * @dir: parent directory
+ * @dentry: new file
+ * @mode: new file mode
+ * @dev: device number
+ *
+ * Check permissions when creating a special file (or a socket or a fifo file
+ * created via the mknod system call).  Note that if mknod operation is being
+ * done for a regular file, then the create hook will be called and not this
+ * hook.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	if (unlikely(IS_PRIVATE(dir)))
@@ -1586,6 +1830,18 @@ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return call_int_hook(inode_mknod, 0, dir, dentry, mode, dev);
 }
 
+/**
+ * security_inode_rename() - Check if renaming a file is allowed
+ * @old_dir: parent directory of the old file
+ * @old_dentry: the old file
+ * @new_dir: parent directory of the new file
+ * @new_dentry: the new file
+ * @flags: flags
+ *
+ * Check for permission to rename a file or directory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags)
@@ -1605,6 +1861,14 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 					   new_dir, new_dentry);
 }
 
+/**
+ * security_inode_readlink() - Check if reading a symbolic link is allowed
+ * @dentry: link
+ *
+ * Check the permission to read the symbolic link.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_readlink(struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
@@ -1612,6 +1876,17 @@ int security_inode_readlink(struct dentry *dentry)
 	return call_int_hook(inode_readlink, 0, dentry);
 }
 
+/**
+ * security_inode_follow_link() - Check if following a symbolic link is allowed
+ * @dentry: link dentry
+ * @inode: link inode
+ * @rcu: true if in RCU-walk mode
+ *
+ * Check permission to follow a symbolic link when looking up a pathname.  If
+ * @rcu is true, @inode is not stable.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 			       bool rcu)
 {
@@ -1620,6 +1895,20 @@ int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 	return call_int_hook(inode_follow_link, 0, dentry, inode, rcu);
 }
 
+/**
+ * security_inode_permission() - Check if accessing an inode is allowed
+ * @inode: inode
+ * @mask: access mask
+ *
+ * Check permission before accessing an inode.  This hook is called by the
+ * existing Linux permission function, so a security module can use it to
+ * provide additional checking for existing Linux permission checks.  Notice
+ * that this hook is called when a file is opened (as well as many other
+ * operations), whereas the file_security_ops permission hook is called when
+ * the actual read/write operations are performed.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_permission(struct inode *inode, int mask)
 {
 	if (unlikely(IS_PRIVATE(inode)))
@@ -1627,6 +1916,19 @@ int security_inode_permission(struct inode *inode, int mask)
 	return call_int_hook(inode_permission, 0, inode, mask);
 }
 
+/**
+ * security_inode_setattr() - Check if setting file attributes is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @attr: new attributes
+ *
+ * Check permission before setting file attributes.  Note that the kernel call
+ * to notify_change is performed from several locations, whenever file
+ * attributes change (such as when a file is truncated, chown/chmod operations,
+ * transferring disk quotas, etc).
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_setattr(struct mnt_idmap *idmap,
 			   struct dentry *dentry, struct iattr *attr)
 {
@@ -1641,6 +1943,14 @@ int security_inode_setattr(struct mnt_idmap *idmap,
 }
 EXPORT_SYMBOL_GPL(security_inode_setattr);
 
+/**
+ * security_inode_getattr() - Check if getting file attributes is allowed
+ * @path: file
+ *
+ * Check permission before obtaining file attributes.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_getattr(const struct path *path)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
@@ -1648,6 +1958,18 @@ int security_inode_getattr(const struct path *path)
 	return call_int_hook(inode_getattr, 0, path);
 }
 
+/**
+ * security_inode_setxattr() - Check if setting file xattrs is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @name: xattr name
+ * @value: xattr value
+ * @flags: flags
+ *
+ * Check permission before setting the extended attributes.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_setxattr(struct mnt_idmap *idmap,
 			    struct dentry *dentry, const char *name,
 			    const void *value, size_t size, int flags)
@@ -1673,6 +1995,18 @@ int security_inode_setxattr(struct mnt_idmap *idmap,
 	return evm_inode_setxattr(idmap, dentry, name, value, size);
 }
 
+/**
+ * security_inode_set_acl() - Check if setting posix acls is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @acl_name: acl name
+ * @kacl: acl struct
+ *
+ * Check permission before setting posix acls, the posix acls in @kacl are
+ * identified by @acl_name.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_set_acl(struct mnt_idmap *idmap,
 			   struct dentry *dentry, const char *acl_name,
 			   struct posix_acl *kacl)
@@ -1691,6 +2025,17 @@ int security_inode_set_acl(struct mnt_idmap *idmap,
 	return evm_inode_set_acl(idmap, dentry, acl_name, kacl);
 }
 
+/**
+ * security_inode_get_acl() - Check if reading posix acls is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @acl_name: acl name
+ *
+ * Check permission before getting osix acls, the posix acls are identified by
+ * @acl_name.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_get_acl(struct mnt_idmap *idmap,
 			   struct dentry *dentry, const char *acl_name)
 {
@@ -1699,6 +2044,17 @@ int security_inode_get_acl(struct mnt_idmap *idmap,
 	return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name);
 }
 
+/**
+ * security_inode_remove_acl() - Check if removing a posix acl is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @acl_name: acl name
+ *
+ * Check permission before removing posix acls, the posix acls are identified
+ * by @acl_name.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_remove_acl(struct mnt_idmap *idmap,
 			      struct dentry *dentry, const char *acl_name)
 {
@@ -1715,6 +2071,16 @@ int security_inode_remove_acl(struct mnt_idmap *idmap,
 	return evm_inode_remove_acl(idmap, dentry, acl_name);
 }
 
+/**
+ * security_inode_post_setxattr() - Update the inode after a setxattr operation
+ * @dentry: file
+ * @name: xattr name
+ * @value: xattr value
+ * @size: xattr value size
+ * @flags: flags
+ *
+ * Update inode security field after successful setxattr operation.
+ */
 void security_inode_post_setxattr(struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags)
 {
@@ -1724,6 +2090,16 @@ void security_inode_post_setxattr(struct dentry *dentry, const char *name,
 	evm_inode_post_setxattr(dentry, name, value, size);
 }
 
+/**
+ * security_inode_getxattr() - Check if xattr access is allowed
+ * @dentry: file
+ * @name: xattr name
+ *
+ * Check permission before obtaining the extended attributes identified by
+ * @name for @dentry.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_getxattr(struct dentry *dentry, const char *name)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
@@ -1731,6 +2107,15 @@ int security_inode_getxattr(struct dentry *dentry, const char *name)
 	return call_int_hook(inode_getxattr, 0, dentry, name);
 }
 
+/**
+ * security_inode_listxattr() - Check if listing xattrs is allowed
+ * @dentry: file
+ *
+ * Check permission before obtaining the list of extended attribute names for
+ * @dentry.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_listxattr(struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
@@ -1738,6 +2123,17 @@ int security_inode_listxattr(struct dentry *dentry)
 	return call_int_hook(inode_listxattr, 0, dentry);
 }
 
+/**
+ * security_inode_removexattr() - Check if removing an xattr is allowed
+ * @idmap: idmap of the mount
+ * @dentry: file
+ * @name: xattr name
+ *
+ * Check permission before removing the extended attribute identified by @name
+ * for @dentry.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inode_removexattr(struct mnt_idmap *idmap,
 			       struct dentry *dentry, const char *name)
 {
@@ -1760,17 +2156,55 @@ int security_inode_removexattr(struct mnt_idmap *idmap,
 	return evm_inode_removexattr(idmap, dentry, name);
 }
 
+/**
+ * security_inode_need_killpriv() - Check if security_inode_killpriv() required
+ * @dentry: associated dentry
+ *
+ * Called when an inode has been changed to determine if
+ * security_inode_killpriv() should be called.
+ *
+ * Return: Return <0 on error to abort the inode change operation, return 0 if
+ *         security_inode_killpriv() does not need to be called, return >0 if
+ *         security_inode_killpriv() does need to be called.
+ */
 int security_inode_need_killpriv(struct dentry *dentry)
 {
 	return call_int_hook(inode_need_killpriv, 0, dentry);
 }
 
+/**
+ * security_inode_killpriv() - The setuid bit is removed, update LSM state
+ * @idmap: idmap of the mount
+ * @dentry: associated dentry
+ *
+ * The @dentry's setuid bit is being removed.  Remove similar security labels.
+ * Called with the dentry->d_inode->i_mutex held.
+ *
+ * Return: Return 0 on success.  If error is returned, then the operation
+ *         causing setuid bit removal is failed.
+ */
 int security_inode_killpriv(struct mnt_idmap *idmap,
 			    struct dentry *dentry)
 {
 	return call_int_hook(inode_killpriv, 0, idmap, dentry);
 }
 
+/**
+ * security_inode_getsecurity() - Get the xattr security label of an inode
+ * @idmap: idmap of the mount
+ * @inode: inode
+ * @name: xattr name
+ * @buffer: security label buffer
+ * @alloc: allocation flag
+ *
+ * Retrieve a copy of the extended attribute representation of the security
+ * label associated with @name for @inode via @buffer.  Note that @name is the
+ * remainder of the attribute name after the security prefix has been removed.
+ * @alloc is used to specify if the call should return a value via the buffer
+ * or just the value length.
+ *
+ * Return: Returns size of buffer on success.
+ */
 int security_inode_getsecurity(struct mnt_idmap *idmap,
 			       struct inode *inode, const char *name,
 			       void **buffer, bool alloc)
@@ -1791,6 +2225,21 @@ int security_inode_getsecurity(struct mnt_idmap *idmap,
 	return LSM_RET_DEFAULT(inode_getsecurity);
 }
 
+/**
+ * security_inode_setsecurity() - Set the xattr security label of an inode
+ * @inode: inode
+ * @name: xattr name
+ * @value: security label
+ * @size: length of security label
+ * @flags: flags
+ *
+ * Set the security label associated with @name for @inode from the extended
+ * attribute value @value.  @size indicates the size of the @value in bytes.
+ * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the
+ * remainder of the attribute name after the security. prefix has been removed.
+ *
+ * Return: Returns 0 on success.
+ */
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
 {
 	struct security_hook_list *hp;
@@ -1810,6 +2259,19 @@ int security_inode_setsecurity(struct inode *inode, const char *name, const void
 	return LSM_RET_DEFAULT(inode_setsecurity);
 }
 
+/**
+ * security_inode_listsecurity() - List the xattr security label names
+ * @inode: inode
+ * @buffer: buffer
+ * @buffer_size: size of buffer
+ *
+ * Copy the extended attribute names for the security labels associated with
+ * @inode into @buffer.  The maximum size of @buffer is specified by
+ * @buffer_size.  @buffer may be NULL to request the size of the buffer
+ * required.
+ *
+ * Return: Returns number of bytes used/required on success.
+ */
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
 {
 	if (unlikely(IS_PRIVATE(inode)))
@@ -1818,17 +2280,49 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer
 }
 EXPORT_SYMBOL(security_inode_listsecurity);
 
+/**
+ * security_inode_getsecid() - Get an inode's secid
+ * @inode: inode
+ * @secid: secid to return
+ *
+ * Get the secid associated with the node.  In case of failure, @secid will be
+ * set to zero.
+ */
 void security_inode_getsecid(struct inode *inode, u32 *secid)
 {
 	call_void_hook(inode_getsecid, inode, secid);
 }
 
+/**
+ * security_inode_copy_up() - Create new creds for an overlayfs copy-up op
+ * @src: union dentry of copy-up file
+ * @new: newly created creds
+ *
+ * A file is about to be copied up from lower layer to upper layer of overlay
+ * filesystem. Security module can prepare a set of new creds and modify as
+ * need be and return new creds. Caller will switch to new creds temporarily to
+ * create new file and release newly allocated creds.
+ *
+ * Return: Returns 0 on success or a negative error code on error.
+ */
 int security_inode_copy_up(struct dentry *src, struct cred **new)
 {
 	return call_int_hook(inode_copy_up, 0, src, new);
 }
 EXPORT_SYMBOL(security_inode_copy_up);
 
+/**
+ * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
+ * @name: xattr name
+ *
+ * Filter the xattrs being copied up when a unioned file is copied up from a
+ * lower layer to the union/overlay layer.   The caller is responsible for
+ * reading and writing the xattrs, this hook is merely a filter.
+ *
+ * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP
+ *         if the security module does not know about attribute, or a negative
+ *         error code to abort the copy up.
+ */
 int security_inode_copy_up_xattr(const char *name)
 {
 	struct security_hook_list *hp;
@@ -2405,6 +2899,13 @@ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 	return call_int_hook(sem_semop, 0, sma, sops, nsops, alter);
 }
 
+/**
+ * security_d_instantiate() - Populate an inode's LSM state based on a dentry
+ * @dentry: dentry
+ * @inode: inode
+ *
+ * Fill in @inode security information for a @dentry if allowed.
+ */
 void security_d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	if (unlikely(inode && IS_PRIVATE(inode)))
@@ -2413,6 +2914,17 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode)
 }
 EXPORT_SYMBOL(security_d_instantiate);
 
+/**
+ * security_getprocattr() - Read an attribute for a task
+ * @p: the task
+ * @lsm: LSM name
+ * @name: attribute name
+ * @value: attribute value
+ *
+ * Read attribute @name for task @p and store it into @value if allowed.
+ *
+ * Return: Returns the length of @value on success, a negative value otherwise.
+ */
 int security_getprocattr(struct task_struct *p, const char *lsm,
 			 const char *name, char **value)
 {
@@ -2426,6 +2938,18 @@ int security_getprocattr(struct task_struct *p, const char *lsm,
 	return LSM_RET_DEFAULT(getprocattr);
 }
 
+/**
+ * security_setprocattr() - Set an attribute for a task
+ * @lsm: LSM name
+ * @name: attribute name
+ * @value: attribute value
+ * @size: attribute value size
+ *
+ * Write (set) the current task's attribute @name to @value, size @size if
+ * allowed.
+ *
+ * Return: Returns bytes written on success, a negative value otherwise.
+ */
 int security_setprocattr(const char *lsm, const char *name, void *value,
 			 size_t size)
 {
-- 
cgit v1.2.3


From 9348944b775d9a12ee7ab698df5bad85409bdd48 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 10 Feb 2023 13:20:33 -0500
Subject: lsm: move the kernfs hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h |  9 ---------
 security/security.c       | 10 ++++++++++
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3cdd58424796..c953f6c356f2 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,15 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for kernfs node operations
- *
- * @kernfs_init_security:
- *	Initialize the security context of a newly created kernfs node based
- *	on its own and its parent's attributes.
- *	@kn_dir the parent kernfs node.
- *	@kn the new child kernfs node.
- *	Return 0 if permission is granted.
- *
  * Security hooks for file operations
  *
  * @file_permission:
diff --git a/security/security.c b/security/security.c
index bd35854f956a..b070a24157ec 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2344,6 +2344,16 @@ int security_inode_copy_up_xattr(const char *name)
 }
 EXPORT_SYMBOL(security_inode_copy_up_xattr);
 
+/**
+ * security_kernfs_init_security() - Init LSM context for a kernfs node
+ * @kn_dir: parent kernfs node
+ * @kn: the kernfs node to initialize
+ *
+ * Initialize the security context of a newly created kernfs node based on its
+ * own and its parent's attributes.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_kernfs_init_security(struct kernfs_node *kn_dir,
 				  struct kernfs_node *kn)
 {
-- 
cgit v1.2.3


From a0fd6480de489780cdf0940a38bb75b949976d6c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 10 Feb 2023 13:23:09 -0500
Subject: lsm: move the file hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 104 -------------------------------
 security/security.c       | 151 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index c953f6c356f2..7c58683b9288 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,110 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for file operations
- *
- * @file_permission:
- *	Check file permissions before accessing an open file.  This hook is
- *	called by various operations that read or write files.  A security
- *	module can use this hook to perform additional checking on these
- *	operations, e.g.  to revalidate permissions on use to support privilege
- *	bracketing or policy changes.  Notice that this hook is used when the
- *	actual read/write operations are performed, whereas the
- *	inode_security_ops hook is called when a file is opened (as well as
- *	many other operations).
- *	Caveat:  Although this hook can be used to revalidate permissions for
- *	various system call operations that read or write files, it does not
- *	address the revalidation of permissions for memory-mapped files.
- *	Security modules must handle this separately if they need such
- *	revalidation.
- *	@file contains the file structure being accessed.
- *	@mask contains the requested permissions.
- *	Return 0 if permission is granted.
- * @file_alloc_security:
- *	Allocate and attach a security structure to the file->f_security field.
- *	The security field is initialized to NULL when the structure is first
- *	created.
- *	@file contains the file structure to secure.
- *	Return 0 if the hook is successful and permission is granted.
- * @file_free_security:
- *	Deallocate and free any security structures stored in file->f_security.
- *	@file contains the file structure being modified.
- * @file_ioctl:
- *	@file contains the file structure.
- *	@cmd contains the operation to perform.
- *	@arg contains the operational arguments.
- *	Check permission for an ioctl operation on @file.  Note that @arg
- *	sometimes represents a user space pointer; in other cases, it may be a
- *	simple integer value.  When @arg represents a user space pointer, it
- *	should never be used by the security module.
- *	Return 0 if permission is granted.
- * @mmap_addr:
- *	Check permissions for a mmap operation at @addr.
- *	@addr contains virtual address that will be used for the operation.
- *	Return 0 if permission is granted.
- * @mmap_file:
- *	Check permissions for a mmap operation.  The @file may be NULL, e.g.
- *	if mapping anonymous memory.
- *	@file contains the file structure for file to map (may be NULL).
- *	@reqprot contains the protection requested by the application.
- *	@prot contains the protection that will be applied by the kernel.
- *	@flags contains the operational flags.
- *	Return 0 if permission is granted.
- * @file_mprotect:
- *	Check permissions before changing memory access permissions.
- *	@vma contains the memory region to modify.
- *	@reqprot contains the protection requested by the application.
- *	@prot contains the protection that will be applied by the kernel.
- *	Return 0 if permission is granted.
- * @file_lock:
- *	Check permission before performing file locking operations.
- *	Note the hook mediates both flock and fcntl style locks.
- *	@file contains the file structure.
- *	@cmd contains the posix-translated lock operation to perform
- *	(e.g. F_RDLCK, F_WRLCK).
- *	Return 0 if permission is granted.
- * @file_fcntl:
- *	Check permission before allowing the file operation specified by @cmd
- *	from being performed on the file @file.  Note that @arg sometimes
- *	represents a user space pointer; in other cases, it may be a simple
- *	integer value.  When @arg represents a user space pointer, it should
- *	never be used by the security module.
- *	@file contains the file structure.
- *	@cmd contains the operation to be performed.
- *	@arg contains the operational arguments.
- *	Return 0 if permission is granted.
- * @file_set_fowner:
- *	Save owner security information (typically from current->security) in
- *	file->f_security for later use by the send_sigiotask hook.
- *	@file contains the file structure to update.
- *	Return 0 on success.
- * @file_send_sigiotask:
- *	Check permission for the file owner @fown to send SIGIO or SIGURG to the
- *	process @tsk.  Note that this hook is sometimes called from interrupt.
- *	Note that the fown_struct, @fown, is never outside the context of a
- *	struct file, so the file structure (and associated security information)
- *	can always be obtained: container_of(fown, struct file, f_owner)
- *	@tsk contains the structure of task receiving signal.
- *	@fown contains the file owner information.
- *	@sig is the signal that will be sent.  When 0, kernel sends SIGIO.
- *	Return 0 if permission is granted.
- * @file_receive:
- *	This hook allows security modules to control the ability of a process
- *	to receive an open file descriptor via socket IPC.
- *	@file contains the file structure being received.
- *	Return 0 if permission is granted.
- * @file_truncate:
- *	Check permission before truncating a file, i.e. using ftruncate.
- *	Note that truncation permission may also be checked based on the path,
- *	using the @path_truncate hook.
- *	@file contains the file structure for the file.
- *	Return 0 if permission is granted.
- * @file_open:
- *	Save open-time permission checking state for later use upon
- *	file_permission, and recheck access if anything has changed
- *	since inode_permission.
- *	Return 0 if permission is granted.
- *
  * Security hooks for task operations.
  *
  * @task_alloc:
diff --git a/security/security.c b/security/security.c
index b070a24157ec..a527291c191d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2360,6 +2360,25 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir,
 	return call_int_hook(kernfs_init_security, 0, kn_dir, kn);
 }
 
+/**
+ * security_file_permission() - Check file permissions
+ * @file: file
+ * @mask: requested permissions
+ *
+ * Check file permissions before accessing an open file.  This hook is called
+ * by various operations that read or write files.  A security module can use
+ * this hook to perform additional checking on these operations, e.g. to
+ * revalidate permissions on use to support privilege bracketing or policy
+ * changes.  Notice that this hook is used when the actual read/write
+ * operations are performed, whereas the inode_security_ops hook is called when
+ * a file is opened (as well as many other operations).  Although this hook can
+ * be used to revalidate permissions for various system call operations that
+ * read or write files, it does not address the revalidation of permissions for
+ * memory-mapped files.  Security modules must handle this separately if they
+ * need such revalidation.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_permission(struct file *file, int mask)
 {
 	int ret;
@@ -2371,6 +2390,15 @@ int security_file_permission(struct file *file, int mask)
 	return fsnotify_perm(file, mask);
 }
 
+/**
+ * security_file_alloc() - Allocate and init a file's LSM blob
+ * @file: the file
+ *
+ * Allocate and attach a security structure to the file->f_security field.  The
+ * security field is initialized to NULL when the structure is first created.
+ *
+ * Return: Return 0 if the hook is successful and permission is granted.
+ */
 int security_file_alloc(struct file *file)
 {
 	int rc = lsm_file_alloc(file);
@@ -2383,6 +2411,12 @@ int security_file_alloc(struct file *file)
 	return rc;
 }
 
+/**
+ * security_file_free() - Free a file's LSM blob
+ * @file: the file
+ *
+ * Deallocate and free any security structures stored in file->f_security.
+ */
 void security_file_free(struct file *file)
 {
 	void *blob;
@@ -2396,6 +2430,19 @@ void security_file_free(struct file *file)
 	}
 }
 
+/**
+ * security_file_ioctl() - Check if an ioctl is allowed
+ * @file: associated file
+ * @cmd: ioctl cmd
+ * @arg: ioctl arguments
+ *
+ * Check permission for an ioctl operation on @file.  Note that @arg sometimes
+ * represents a user space pointer; in other cases, it may be a simple integer
+ * value.  When @arg represents a user space pointer, it should never be used
+ * by the security module.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	return call_int_hook(file_ioctl, 0, file, cmd, arg);
@@ -2435,6 +2482,17 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 	return prot;
 }
 
+/**
+ * security_mmap_file() - Check if mmap'ing a file is allowed
+ * @file: file
+ * @prot: protection applied by the kernel
+ * @flags: flags
+ *
+ * Check permissions for a mmap operation.  The @file may be NULL, e.g. if
+ * mapping anonymous memory.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_mmap_file(struct file *file, unsigned long prot,
 			unsigned long flags)
 {
@@ -2447,11 +2505,29 @@ int security_mmap_file(struct file *file, unsigned long prot,
 	return ima_file_mmap(file, prot, prot_adj, flags);
 }
 
+/**
+ * security_mmap_addr() - Check if mmap'ing an address is allowed
+ * @addr: address
+ *
+ * Check permissions for a mmap operation at @addr.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_mmap_addr(unsigned long addr)
 {
 	return call_int_hook(mmap_addr, 0, addr);
 }
 
+/**
+ * security_file_mprotect() - Check if changing memory protections is allowed
+ * @vma: memory region
+ * @reqprot: application requested protection
+ * @prog: protection applied by the kernel
+ *
+ * Check permissions before changing memory access permissions.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
 			    unsigned long prot)
 {
@@ -2463,32 +2539,97 @@ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
 	return ima_file_mprotect(vma, prot);
 }
 
+/**
+ * security_file_lock() - Check if a file lock is allowed
+ * @file: file
+ * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK)
+ *
+ * Check permission before performing file locking operations.  Note the hook
+ * mediates both flock and fcntl style locks.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_lock(struct file *file, unsigned int cmd)
 {
 	return call_int_hook(file_lock, 0, file, cmd);
 }
 
+/**
+ * security_file_fcntl() - Check if fcntl() op is allowed
+ * @file: file
+ * @cmd: fnctl command
+ * @arg: command argument
+ *
+ * Check permission before allowing the file operation specified by @cmd from
+ * being performed on the file @file.  Note that @arg sometimes represents a
+ * user space pointer; in other cases, it may be a simple integer value.  When
+ * @arg represents a user space pointer, it should never be used by the
+ * security module.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	return call_int_hook(file_fcntl, 0, file, cmd, arg);
 }
 
+/**
+ * security_file_set_fowner() - Set the file owner info in the LSM blob
+ * @file: the file
+ *
+ * Save owner security information (typically from current->security) in
+ * file->f_security for later use by the send_sigiotask hook.
+ *
+ * Return: Returns 0 on success.
+ */
 void security_file_set_fowner(struct file *file)
 {
 	call_void_hook(file_set_fowner, file);
 }
 
+/**
+ * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
+ * @tsk: target task
+ * @fown: signal sender
+ * @sig: signal to be sent, SIGIO is sent if 0
+ *
+ * Check permission for the file owner @fown to send SIGIO or SIGURG to the
+ * process @tsk.  Note that this hook is sometimes called from interrupt.  Note
+ * that the fown_struct, @fown, is never outside the context of a struct file,
+ * so the file structure (and associated security information) can always be
+ * obtained: container_of(fown, struct file, f_owner).
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_send_sigiotask(struct task_struct *tsk,
 				  struct fown_struct *fown, int sig)
 {
 	return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig);
 }
 
+/**
+ * security_file_receive() - Check is receiving a file via IPC is allowed
+ * @file: file being received
+ *
+ * This hook allows security modules to control the ability of a process to
+ * receive an open file descriptor via socket IPC.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_receive(struct file *file)
 {
 	return call_int_hook(file_receive, 0, file);
 }
 
+/**
+ * security_file_open() - Save open() time state for late use by the LSM
+ * @file:
+ *
+ * Save open-time permission checking state for later use upon file_permission,
+ * and recheck access if anything has changed since inode_permission.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_open(struct file *file)
 {
 	int ret;
@@ -2500,6 +2641,16 @@ int security_file_open(struct file *file)
 	return fsnotify_perm(file, MAY_OPEN);
 }
 
+/**
+ * security_file_truncate() - Check if truncating a file is allowed
+ * @file: file
+ *
+ * Check permission before truncating a file, i.e. using ftruncate.  Note that
+ * truncation permission may also be checked based on the path, using the
+ * @path_truncate hook.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_file_truncate(struct file *file)
 {
 	return call_int_hook(file_truncate, 0, file);
-- 
cgit v1.2.3


From 130c53bfee4bc70919d356bd8b30137ba4d665b1 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Sat, 11 Feb 2023 19:27:58 -0500
Subject: lsm: move the task hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 210 -----------------------------
 security/security.c       | 334 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 334 insertions(+), 210 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7c58683b9288..5578a13729bd 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,216 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for task operations.
- *
- * @task_alloc:
- *	@task task being allocated.
- *	@clone_flags contains the flags indicating what should be shared.
- *	Handle allocation of task-related resources.
- *	Returns a zero on success, negative values on failure.
- * @task_free:
- *	@task task about to be freed.
- *	Handle release of task-related resources. (Note that this can be called
- *	from interrupt context.)
- * @cred_alloc_blank:
- *	@cred points to the credentials.
- *	@gfp indicates the atomicity of any memory allocations.
- *	Only allocate sufficient memory and attach to @cred such that
- *	cred_transfer() will not get ENOMEM.
- *	Return 0 on success, negative values on failure.
- * @cred_free:
- *	@cred points to the credentials.
- *	Deallocate and clear the cred->security field in a set of credentials.
- * @cred_prepare:
- *	@new points to the new credentials.
- *	@old points to the original credentials.
- *	@gfp indicates the atomicity of any memory allocations.
- *	Prepare a new set of credentials by copying the data from the old set.
- *	Return 0 on success, negative values on failure.
- * @cred_transfer:
- *	@new points to the new credentials.
- *	@old points to the original credentials.
- *	Transfer data from original creds to new creds
- * @cred_getsecid:
- *	Retrieve the security identifier of the cred structure @c
- *	@c contains the credentials, secid will be placed into @secid.
- *	In case of failure, @secid will be set to zero.
- * @kernel_act_as:
- *	Set the credentials for a kernel service to act as (subjective context).
- *	@new points to the credentials to be modified.
- *	@secid specifies the security ID to be set.
- *	The current task must be the one that nominated @secid.
- *	Return 0 if successful.
- * @kernel_create_files_as:
- *	Set the file creation context in a set of credentials to be the same as
- *	the objective context of the specified inode.
- *	@new points to the credentials to be modified.
- *	@inode points to the inode to use as a reference.
- *	The current task must be the one that nominated @inode.
- *	Return 0 if successful.
- * @kernel_module_request:
- *	Ability to trigger the kernel to automatically upcall to userspace for
- *	userspace to load a kernel module with the given name.
- *	@kmod_name name of the module requested by the kernel.
- *	Return 0 if successful.
- * @kernel_load_data:
- *	Load data provided by userspace.
- *	@id kernel load data identifier.
- *	@contents if a subsequent @kernel_post_load_data will be called.
- *	Return 0 if permission is granted.
- * @kernel_post_load_data:
- *	Load data provided by a non-file source (usually userspace buffer).
- *	@buf pointer to buffer containing the data contents.
- *	@size length of the data contents.
- *	@id kernel load data identifier.
- *	@description a text description of what was loaded, @id-specific.
- *	Return 0 if permission is granted.
- *	This must be paired with a prior @kernel_load_data call that had
- *	@contents set to true.
- * @kernel_read_file:
- *	Read a file specified by userspace.
- *	@file contains the file structure pointing to the file being read
- *	by the kernel.
- *	@id kernel read file identifier.
- *	@contents if a subsequent @kernel_post_read_file will be called.
- *	Return 0 if permission is granted.
- * @kernel_post_read_file:
- *	Read a file specified by userspace.
- *	@file contains the file structure pointing to the file being read
- *	by the kernel.
- *	@buf pointer to buffer containing the file contents.
- *	@size length of the file contents.
- *	@id kernel read file identifier.
- *	This must be paired with a prior @kernel_read_file call that had
- *	@contents set to true.
- *	Return 0 if permission is granted.
- * @task_fix_setuid:
- *	Update the module's state after setting one or more of the user
- *	identity attributes of the current process.  The @flags parameter
- *	indicates which of the set*uid system calls invoked this hook.  If
- *	@new is the set of credentials that will be installed.  Modifications
- *	should be made to this rather than to @current->cred.
- *	@old is the set of credentials that are being replaced.
- *	@flags contains one of the LSM_SETID_* values.
- *	Return 0 on success.
- * @task_fix_setgid:
- *	Update the module's state after setting one or more of the group
- *	identity attributes of the current process.  The @flags parameter
- *	indicates which of the set*gid system calls invoked this hook.
- *	@new is the set of credentials that will be installed.  Modifications
- *	should be made to this rather than to @current->cred.
- *	@old is the set of credentials that are being replaced.
- *	@flags contains one of the LSM_SETID_* values.
- *	Return 0 on success.
- * @task_fix_setgroups:
- *	Update the module's state after setting the supplementary group
- *	identity attributes of the current process.
- *	@new is the set of credentials that will be installed.  Modifications
- *	should be made to this rather than to @current->cred.
- *	@old is the set of credentials that are being replaced.
- *	Return 0 on success.
- * @task_setpgid:
- *	Check permission before setting the process group identifier of the
- *	process @p to @pgid.
- *	@p contains the task_struct for process being modified.
- *	@pgid contains the new pgid.
- *	Return 0 if permission is granted.
- * @task_getpgid:
- *	Check permission before getting the process group identifier of the
- *	process @p.
- *	@p contains the task_struct for the process.
- *	Return 0 if permission is granted.
- * @task_getsid:
- *	Check permission before getting the session identifier of the process
- *	@p.
- *	@p contains the task_struct for the process.
- *	Return 0 if permission is granted.
- * @current_getsecid_subj:
- *	Retrieve the subjective security identifier of the current task and
- *	return it in @secid.
- *	In case of failure, @secid will be set to zero.
- * @task_getsecid_obj:
- *	Retrieve the objective security identifier of the task_struct in @p
- *	and return it in @secid.
- *	In case of failure, @secid will be set to zero.
- *
- * @task_setnice:
- *	Check permission before setting the nice value of @p to @nice.
- *	@p contains the task_struct of process.
- *	@nice contains the new nice value.
- *	Return 0 if permission is granted.
- * @task_setioprio:
- *	Check permission before setting the ioprio value of @p to @ioprio.
- *	@p contains the task_struct of process.
- *	@ioprio contains the new ioprio value.
- *	Return 0 if permission is granted.
- * @task_getioprio:
- *	Check permission before getting the ioprio value of @p.
- *	@p contains the task_struct of process.
- *	Return 0 if permission is granted.
- * @task_prlimit:
- *	Check permission before getting and/or setting the resource limits of
- *	another task.
- *	@cred points to the cred structure for the current task.
- *	@tcred points to the cred structure for the target task.
- *	@flags contains the LSM_PRLIMIT_* flag bits indicating whether the
- *	resource limits are being read, modified, or both.
- *	Return 0 if permission is granted.
- * @task_setrlimit:
- *	Check permission before setting the resource limits of process @p
- *	for @resource to @new_rlim.  The old resource limit values can
- *	be examined by dereferencing (p->signal->rlim + resource).
- *	@p points to the task_struct for the target task's group leader.
- *	@resource contains the resource whose limit is being set.
- *	@new_rlim contains the new limits for @resource.
- *	Return 0 if permission is granted.
- * @task_setscheduler:
- *	Check permission before setting scheduling policy and/or parameters of
- *	process @p.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_getscheduler:
- *	Check permission before obtaining scheduling information for process
- *	@p.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_movememory:
- *	Check permission before moving memory owned by process @p.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_kill:
- *	Check permission before sending signal @sig to @p.  @info can be NULL,
- *	the constant 1, or a pointer to a kernel_siginfo structure.  If @info is 1 or
- *	SI_FROMKERNEL(info) is true, then the signal should be viewed as coming
- *	from the kernel and should typically be permitted.
- *	SIGIO signals are handled separately by the send_sigiotask hook in
- *	file_security_ops.
- *	@p contains the task_struct for process.
- *	@info contains the signal information.
- *	@sig contains the signal value.
- *	@cred contains the cred of the process where the signal originated, or
- *	NULL if the current task is the originator.
- *	Return 0 if permission is granted.
- * @task_prctl:
- *	Check permission before performing a process control operation on the
- *	current process.
- *	@option contains the operation.
- *	@arg2 contains a argument.
- *	@arg3 contains a argument.
- *	@arg4 contains a argument.
- *	@arg5 contains a argument.
- *	Return -ENOSYS if no-one wanted to handle this op, any other value to
- *	cause prctl() to return immediately with that value.
- * @task_to_inode:
- *	Set the security attributes for an inode based on an associated task's
- *	security attributes, e.g. for /proc/pid inodes.
- *	@p contains the task_struct for the task.
- *	@inode contains the inode structure for the inode.
- * @userns_create:
- *	Check permission prior to creating a new user namespace.
- *	@cred points to prepared creds.
- *	Return 0 if successful, otherwise < 0 error code.
- *
  * Security hooks for Netlink messaging.
  *
  * @netlink_send:
diff --git a/security/security.c b/security/security.c
index a527291c191d..766595a76ab4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2656,6 +2656,15 @@ int security_file_truncate(struct file *file)
 	return call_int_hook(file_truncate, 0, file);
 }
 
+/**
+ * security_task_alloc() - Allocate a task's LSM blob
+ * @task: the task
+ * @clone_flags: flags indicating what is being shared
+ *
+ * Handle allocation of task-related resources.
+ *
+ * Return: Returns a zero on success, negative values on failure.
+ */
 int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
 {
 	int rc = lsm_task_alloc(task);
@@ -2668,6 +2677,13 @@ int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
 	return rc;
 }
 
+/**
+ * security_task_free() - Free a task's LSM blob and related resources
+ * @task: task
+ *
+ * Handle release of task-related resources.  Note that this can be called from
+ * interrupt context.
+ */
 void security_task_free(struct task_struct *task)
 {
 	call_void_hook(task_free, task);
@@ -2676,6 +2692,16 @@ void security_task_free(struct task_struct *task)
 	task->security = NULL;
 }
 
+/**
+ * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer
+ * @cred: credentials
+ * @gfp: gfp flags
+ *
+ * Only allocate sufficient memory and attach to @cred such that
+ * cred_transfer() will not get ENOMEM.
+ *
+ * Return: Returns 0 on success, negative values on failure.
+ */
 int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 {
 	int rc = lsm_cred_alloc(cred, gfp);
@@ -2689,6 +2715,12 @@ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 	return rc;
 }
 
+/**
+ * security_cred_free() - Free the cred's LSM blob and associated resources
+ * @cred: credentials
+ *
+ * Deallocate and clear the cred->security field in a set of credentials.
+ */
 void security_cred_free(struct cred *cred)
 {
 	/*
@@ -2704,6 +2736,16 @@ void security_cred_free(struct cred *cred)
 	cred->security = NULL;
 }
 
+/**
+ * security_prepare_creds() - Prepare a new set of credentials
+ * @new: new credentials
+ * @old: original credentials
+ * @gfp: gfp flags
+ *
+ * Prepare a new set of credentials by copying the data from the old set.
+ *
+ * Return: Returns 0 on success, negative values on failure.
+ */
 int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
 {
 	int rc = lsm_cred_alloc(new, gfp);
@@ -2717,11 +2759,26 @@ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
 	return rc;
 }
 
+/**
+ * security_transfer_creds() - Transfer creds
+ * @new: target credentials
+ * @old: original credentials
+ *
+ * Transfer data from original creds to new creds.
+ */
 void security_transfer_creds(struct cred *new, const struct cred *old)
 {
 	call_void_hook(cred_transfer, new, old);
 }
 
+/**
+ * security_cred_getsecid() - Get the secid from a set of credentials
+ * @c: credentials
+ * @secid: secid value
+ *
+ * Retrieve the security identifier of the cred structure @c.  In case of
+ * failure, @secid will be set to zero.
+ */
 void security_cred_getsecid(const struct cred *c, u32 *secid)
 {
 	*secid = 0;
@@ -2729,16 +2786,46 @@ void security_cred_getsecid(const struct cred *c, u32 *secid)
 }
 EXPORT_SYMBOL(security_cred_getsecid);
 
+/**
+ * security_kernel_act_as() - Set the kernel credentials to act as secid
+ * @new: credentials
+ * @secid: secid
+ *
+ * Set the credentials for a kernel service to act as (subjective context).
+ * The current task must be the one that nominated @secid.
+ *
+ * Return: Returns 0 if successful.
+ */
 int security_kernel_act_as(struct cred *new, u32 secid)
 {
 	return call_int_hook(kernel_act_as, 0, new, secid);
 }
 
+/**
+ * security_kernel_create_files_as() - Set file creation context using an inode
+ * @new: target credentials
+ * @inode: reference inode
+ *
+ * Set the file creation context in a set of credentials to be the same as the
+ * objective context of the specified inode.  The current task must be the one
+ * that nominated @inode.
+ *
+ * Return: Returns 0 if successful.
+ */
 int security_kernel_create_files_as(struct cred *new, struct inode *inode)
 {
 	return call_int_hook(kernel_create_files_as, 0, new, inode);
 }
 
+/**
+ * security_kernel_module_request() - Check is loading a module is allowed
+ * @kmod_name: module name
+ *
+ * Ability to trigger the kernel to automatically upcall to userspace for
+ * userspace to load a kernel module with the given name.
+ *
+ * Return: Returns 0 if successful.
+ */
 int security_kernel_module_request(char *kmod_name)
 {
 	int ret;
@@ -2749,6 +2836,16 @@ int security_kernel_module_request(char *kmod_name)
 	return integrity_kernel_module_request(kmod_name);
 }
 
+/**
+ * security_kernel_read_file() - Read a file specified by userspace
+ * @file: file
+ * @id: file identifier
+ * @contents: trust if security_kernel_post_read_file() will be called
+ *
+ * Read a file specified by userspace.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
 			      bool contents)
 {
@@ -2761,6 +2858,19 @@ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
 }
 EXPORT_SYMBOL_GPL(security_kernel_read_file);
 
+/**
+ * security_kernel_post_read_file() - Read a file specified by userspace
+ * @file: file
+ * @buf: file contents
+ * @size: size of file contents
+ * @id: file identifier
+ *
+ * Read a file specified by userspace.  This must be paired with a prior call
+ * to security_kernel_read_file() call that indicated this hook would also be
+ * called, see security_kernel_read_file() for more information.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
 				   enum kernel_read_file_id id)
 {
@@ -2773,6 +2883,15 @@ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
 }
 EXPORT_SYMBOL_GPL(security_kernel_post_read_file);
 
+/**
+ * security_kernel_load_data() - Load data provided by userspace
+ * @id: data identifier
+ * @contents: true if security_kernel_post_load_data() will be called
+ *
+ * Load data provided by userspace.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
 {
 	int ret;
@@ -2784,6 +2903,20 @@ int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
 }
 EXPORT_SYMBOL_GPL(security_kernel_load_data);
 
+/**
+ * security_kernel_post_load_data() - Load userspace data from a non-file source
+ * @buf: data
+ * @size: size of data
+ * @id: data identifier
+ * @description: text description of data, specific to the id value
+ *
+ * Load data provided by a non-file source (usually userspace buffer).  This
+ * must be paired with a prior security_kernel_load_data() call that indicated
+ * this hook would also be called, see security_kernel_load_data() for more
+ * information.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_kernel_post_load_data(char *buf, loff_t size,
 				   enum kernel_load_data_id id,
 				   char *description)
@@ -2798,38 +2931,112 @@ int security_kernel_post_load_data(char *buf, loff_t size,
 }
 EXPORT_SYMBOL_GPL(security_kernel_post_load_data);
 
+/**
+ * security_task_fix_setuid() - Update LSM with new user id attributes
+ * @new: updated credentials
+ * @old: credentials being replaced
+ * @flags: LSM_SETID_* flag values
+ *
+ * Update the module's state after setting one or more of the user identity
+ * attributes of the current process.  The @flags parameter indicates which of
+ * the set*uid system calls invoked this hook.  If @new is the set of
+ * credentials that will be installed.  Modifications should be made to this
+ * rather than to @current->cred.
+ *
+ * Return: Returns 0 on success.
+ */
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags)
 {
 	return call_int_hook(task_fix_setuid, 0, new, old, flags);
 }
 
+/**
+ * security_task_fix_setgid() - Update LSM with new group id attributes
+ * @new: updated credentials
+ * @old: credentials being replaced
+ * @flags: LSM_SETID_* flag value
+ *
+ * Update the module's state after setting one or more of the group identity
+ * attributes of the current process.  The @flags parameter indicates which of
+ * the set*gid system calls invoked this hook.  @new is the set of credentials
+ * that will be installed.  Modifications should be made to this rather than to
+ * @current->cred.
+ *
+ * Return: Returns 0 on success.
+ */
 int security_task_fix_setgid(struct cred *new, const struct cred *old,
 				 int flags)
 {
 	return call_int_hook(task_fix_setgid, 0, new, old, flags);
 }
 
+/**
+ * security_task_fix_setgroups() - Update LSM with new supplementary groups
+ * @new: updated credentials
+ * @old: credentials being replaced
+ *
+ * Update the module's state after setting the supplementary group identity
+ * attributes of the current process.  @new is the set of credentials that will
+ * be installed.  Modifications should be made to this rather than to
+ * @current->cred.
+ *
+ * Return: Returns 0 on success.
+ */
 int security_task_fix_setgroups(struct cred *new, const struct cred *old)
 {
 	return call_int_hook(task_fix_setgroups, 0, new, old);
 }
 
+/**
+ * security_task_setpgid() - Check if setting the pgid is allowed
+ * @p: task being modified
+ * @pgid: new pgid
+ *
+ * Check permission before setting the process group identifier of the process
+ * @p to @pgid.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_setpgid(struct task_struct *p, pid_t pgid)
 {
 	return call_int_hook(task_setpgid, 0, p, pgid);
 }
 
+/**
+ * security_task_getpgid() - Check if getting the pgid is allowed
+ * @p: task
+ *
+ * Check permission before getting the process group identifier of the process
+ * @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_getpgid(struct task_struct *p)
 {
 	return call_int_hook(task_getpgid, 0, p);
 }
 
+/**
+ * security_task_getsid() - Check if getting the session id is allowed
+ * @p: task
+ *
+ * Check permission before getting the session identifier of the process @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_getsid(struct task_struct *p)
 {
 	return call_int_hook(task_getsid, 0, p);
 }
 
+/**
+ * security_current_getsecid_subj() - Get the current task's subjective secid
+ * @secid: secid value
+ *
+ * Retrieve the subjective security identifier of the current task and return
+ * it in @secid.  In case of failure, @secid will be set to zero.
+ */
 void security_current_getsecid_subj(u32 *secid)
 {
 	*secid = 0;
@@ -2837,6 +3044,14 @@ void security_current_getsecid_subj(u32 *secid)
 }
 EXPORT_SYMBOL(security_current_getsecid_subj);
 
+/**
+ * security_task_getsecid_obj() - Get a task's objective secid
+ * @p: target task
+ * @secid: secid value
+ *
+ * Retrieve the objective security identifier of the task_struct in @p and
+ * return it in @secid. In case of failure, @secid will be set to zero.
+ */
 void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
 {
 	*secid = 0;
@@ -2844,54 +3059,157 @@ void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
 }
 EXPORT_SYMBOL(security_task_getsecid_obj);
 
+/**
+ * security_task_setnice() - Check if setting a task's nice value is allowed
+ * @p: target task
+ * @nice: nice value
+ *
+ * Check permission before setting the nice value of @p to @nice.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_setnice(struct task_struct *p, int nice)
 {
 	return call_int_hook(task_setnice, 0, p, nice);
 }
 
+/**
+ * security_task_setioprio() - Check if setting a task's ioprio is allowed
+ * @p: target task
+ * @ioprio: ioprio value
+ *
+ * Check permission before setting the ioprio value of @p to @ioprio.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_setioprio(struct task_struct *p, int ioprio)
 {
 	return call_int_hook(task_setioprio, 0, p, ioprio);
 }
 
+/**
+ * security_task_getioprio() - Check if getting a task's ioprio is allowed
+ * @p: task
+ *
+ * Check permission before getting the ioprio value of @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_getioprio(struct task_struct *p)
 {
 	return call_int_hook(task_getioprio, 0, p);
 }
 
+/**
+ * security_task_prlimit() - Check if get/setting resources limits is allowed
+ * @cred: current task credentials
+ * @tcred: target task credentials
+ * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both
+ *
+ * Check permission before getting and/or setting the resource limits of
+ * another task.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
 			  unsigned int flags)
 {
 	return call_int_hook(task_prlimit, 0, cred, tcred, flags);
 }
 
+/**
+ * security_task_setrlimit() - Check if setting a new rlimit value is allowed
+ * @p: target task's group leader
+ * @resource: resource whose limit is being set
+ * @new_rlim: new resource limit
+ *
+ * Check permission before setting the resource limits of process @p for
+ * @resource to @new_rlim.  The old resource limit values can be examined by
+ * dereferencing (p->signal->rlim + resource).
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_setrlimit(struct task_struct *p, unsigned int resource,
 		struct rlimit *new_rlim)
 {
 	return call_int_hook(task_setrlimit, 0, p, resource, new_rlim);
 }
 
+/**
+ * security_task_setscheduler() - Check if setting sched policy/param is allowed
+ * @p: target task
+ *
+ * Check permission before setting scheduling policy and/or parameters of
+ * process @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_setscheduler(struct task_struct *p)
 {
 	return call_int_hook(task_setscheduler, 0, p);
 }
 
+/**
+ * security_task_getscheduler() - Check if getting scheduling info is allowed
+ * @p: target task
+ *
+ * Check permission before obtaining scheduling information for process @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_getscheduler(struct task_struct *p)
 {
 	return call_int_hook(task_getscheduler, 0, p);
 }
 
+/**
+ * security_task_movememory() - Check if moving memory is allowed
+ * @p: task
+ *
+ * Check permission before moving memory owned by process @p.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_movememory(struct task_struct *p)
 {
 	return call_int_hook(task_movememory, 0, p);
 }
 
+/**
+ * security_task_kill() - Check if sending a signal is allowed
+ * @p: target process
+ * @info: signal information
+ * @sig: signal value
+ * @cred: credentials of the signal sender, NULL if @current
+ *
+ * Check permission before sending signal @sig to @p.  @info can be NULL, the
+ * constant 1, or a pointer to a kernel_siginfo structure.  If @info is 1 or
+ * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from
+ * the kernel and should typically be permitted.  SIGIO signals are handled
+ * separately by the send_sigiotask hook in file_security_ops.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
 			int sig, const struct cred *cred)
 {
 	return call_int_hook(task_kill, 0, p, info, sig, cred);
 }
 
+/**
+ * security_task_prctl() - Check if a prctl op is allowed
+ * @option: operation
+ * @arg2: argument
+ * @arg3: argument
+ * @arg4: argument
+ * @arg5: argument
+ *
+ * Check permission before performing a process control operation on the
+ * current process.
+ *
+ * Return: Return -ENOSYS if no-one wanted to handle this op, any other value
+ *         to cause prctl() to return immediately with that value.
+ */
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 unsigned long arg4, unsigned long arg5)
 {
@@ -2910,11 +3228,27 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	return rc;
 }
 
+/**
+ * security_task_to_inode() - Set the security attributes of a task's inode
+ * @p: task
+ * @inode: inode
+ *
+ * Set the security attributes for an inode based on an associated task's
+ * security attributes, e.g. for /proc/pid inodes.
+ */
 void security_task_to_inode(struct task_struct *p, struct inode *inode)
 {
 	call_void_hook(task_to_inode, p, inode);
 }
 
+/**
+ * security_create_user_ns() - Check if creating a new userns is allowed
+ * @cred: prepared creds
+ *
+ * Check permission prior to creating a new user namespace.
+ *
+ * Return: Returns 0 if successful, otherwise < 0 error code.
+ */
 int security_create_user_ns(const struct cred *cred)
 {
 	return call_int_hook(userns_create, 0, cred);
-- 
cgit v1.2.3


From 2bcf51bf2f03cc9fccd3316d361cdf9695fccd11 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Sun, 12 Feb 2023 15:06:59 -0500
Subject: lsm: move the netlink hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 13 -------------
 security/security.c       | 13 +++++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 5578a13729bd..ba2daec1bc35 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,19 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for Netlink messaging.
- *
- * @netlink_send:
- *	Save security information for a netlink message so that permission
- *	checking can be performed when the message is processed.  The security
- *	information can be saved using the eff_cap field of the
- *	netlink_skb_parms structure.  Also may be used to provide fine
- *	grained control over message transmission.
- *	@sk associated sock of task sending the message.
- *	@skb contains the sk_buff structure for the netlink message.
- *	Return 0 if the information was successfully saved and message
- *	is allowed to be transmitted.
- *
  * Security hooks for Unix domain networking.
  *
  * @unix_stream_connect:
diff --git a/security/security.c b/security/security.c
index 766595a76ab4..b5fe49ac564e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3458,6 +3458,19 @@ int security_setprocattr(const char *lsm, const char *name, void *value,
 	return LSM_RET_DEFAULT(setprocattr);
 }
 
+/**
+ * security_netlink_send() - Save info and check if netlink sending is allowed
+ * @sk: sending socket
+ * @skb: netlink message
+ *
+ * Save security information for a netlink message so that permission checking
+ * can be performed when the message is processed.  The security information
+ * can be saved using the eff_cap field of the netlink_skb_parms structure.
+ * Also may be used to provide fine grained control over message transmission.
+ *
+ * Return: Returns 0 if the information was successfully saved and message is
+ *         allowed to be transmitted.
+ */
 int security_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
 	return call_int_hook(netlink_send, 0, sk, skb);
-- 
cgit v1.2.3


From 2c2442fd46cd162c0b6a07b4560095f03bd3275f Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Sun, 12 Feb 2023 15:10:23 -0500
Subject: lsm: move the AF_UNIX hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 26 --------------------------
 security/security.c       | 42 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index ba2daec1bc35..1fc1e2aa7d01 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,32 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for Unix domain networking.
- *
- * @unix_stream_connect:
- *	Check permissions before establishing a Unix domain stream connection
- *	between @sock and @other.
- *	@sock contains the sock structure.
- *	@other contains the peer sock structure.
- *	@newsk contains the new sock structure.
- *	Return 0 if permission is granted.
- * @unix_may_send:
- *	Check permissions before connecting or sending datagrams from @sock to
- *	@other.
- *	@sock contains the socket structure.
- *	@other contains the peer socket structure.
- *	Return 0 if permission is granted.
- *
- * The @unix_stream_connect and @unix_may_send hooks were necessary because
- * Linux provides an alternative to the conventional file name space for Unix
- * domain sockets.  Whereas binding and connecting to sockets in the file name
- * space is mediated by the typical file permissions (and caught by the mknod
- * and permission hooks in inode_security_ops), binding and connecting to
- * sockets in the abstract name space is completely unmediated.  Sufficient
- * control of Unix domain sockets in the abstract name space isn't possible
- * using only the socket layer hooks, since we need to know the actual target
- * socket, which is not looked up until we are inside the af_unix code.
- *
  * Security hooks for socket operations.
  *
  * @socket_create:
diff --git a/security/security.c b/security/security.c
index b5fe49ac564e..9112531e6c2e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3555,13 +3555,53 @@ int security_watch_key(struct key *key)
 #endif
 
 #ifdef CONFIG_SECURITY_NETWORK
-
+/**
+ * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed
+ * @sock: originating sock
+ * @other: peer sock
+ * @newsk: new sock
+ *
+ * Check permissions before establishing a Unix domain stream connection
+ * between @sock and @other.
+ *
+ * The @unix_stream_connect and @unix_may_send hooks were necessary because
+ * Linux provides an alternative to the conventional file name space for Unix
+ * domain sockets.  Whereas binding and connecting to sockets in the file name
+ * space is mediated by the typical file permissions (and caught by the mknod
+ * and permission hooks in inode_security_ops), binding and connecting to
+ * sockets in the abstract name space is completely unmediated.  Sufficient
+ * control of Unix domain sockets in the abstract name space isn't possible
+ * using only the socket layer hooks, since we need to know the actual target
+ * socket, which is not looked up until we are inside the af_unix code.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
 {
 	return call_int_hook(unix_stream_connect, 0, sock, other, newsk);
 }
 EXPORT_SYMBOL(security_unix_stream_connect);
 
+/**
+ * security_unix_may_send() - Check if AF_UNIX socket can send datagrams
+ * @sock: originating sock
+ * @other: peer sock
+ *
+ * Check permissions before connecting or sending datagrams from @sock to
+ * @other.
+ *
+ * The @unix_stream_connect and @unix_may_send hooks were necessary because
+ * Linux provides an alternative to the conventional file name space for Unix
+ * domain sockets.  Whereas binding and connecting to sockets in the file name
+ * space is mediated by the typical file permissions (and caught by the mknod
+ * and permission hooks in inode_security_ops), binding and connecting to
+ * sockets in the abstract name space is completely unmediated.  Sufficient
+ * control of Unix domain sockets in the abstract name space isn't possible
+ * using only the socket layer hooks, since we need to know the actual target
+ * socket, which is not looked up until we are inside the af_unix code.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_unix_may_send(struct socket *sock,  struct socket *other)
 {
 	return call_int_hook(unix_may_send, 0, sock, other);
-- 
cgit v1.2.3


From 6b6bbe8c02a1bc7ef7f0aa490a3601a16cd145fe Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Sun, 12 Feb 2023 15:15:29 -0500
Subject: lsm: move the socket hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 191 ---------------------------
 security/security.c       | 322 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 322 insertions(+), 191 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 1fc1e2aa7d01..531d141083ed 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,197 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for socket operations.
- *
- * @socket_create:
- *	Check permissions prior to creating a new socket.
- *	@family contains the requested protocol family.
- *	@type contains the requested communications type.
- *	@protocol contains the requested protocol.
- *	@kern set to 1 if a kernel socket.
- *	Return 0 if permission is granted.
- * @socket_post_create:
- *	This hook allows a module to update or allocate a per-socket security
- *	structure. Note that the security field was not added directly to the
- *	socket structure, but rather, the socket security information is stored
- *	in the associated inode.  Typically, the inode alloc_security hook will
- *	allocate and attach security information to
- *	SOCK_INODE(sock)->i_security.  This hook may be used to update the
- *	SOCK_INODE(sock)->i_security field with additional information that
- *	wasn't available when the inode was allocated.
- *	@sock contains the newly created socket structure.
- *	@family contains the requested protocol family.
- *	@type contains the requested communications type.
- *	@protocol contains the requested protocol.
- *	@kern set to 1 if a kernel socket.
- *	Return 0 if permission is granted.
- * @socket_socketpair:
- *	Check permissions before creating a fresh pair of sockets.
- *	@socka contains the first socket structure.
- *	@sockb contains the second socket structure.
- *	Return 0 if permission is granted and the connection was established.
- * @socket_bind:
- *	Check permission before socket protocol layer bind operation is
- *	performed and the socket @sock is bound to the address specified in the
- *	@address parameter.
- *	@sock contains the socket structure.
- *	@address contains the address to bind to.
- *	@addrlen contains the length of address.
- *	Return 0 if permission is granted.
- * @socket_connect:
- *	Check permission before socket protocol layer connect operation
- *	attempts to connect socket @sock to a remote address, @address.
- *	@sock contains the socket structure.
- *	@address contains the address of remote endpoint.
- *	@addrlen contains the length of address.
- *	Return 0 if permission is granted.
- * @socket_listen:
- *	Check permission before socket protocol layer listen operation.
- *	@sock contains the socket structure.
- *	@backlog contains the maximum length for the pending connection queue.
- *	Return 0 if permission is granted.
- * @socket_accept:
- *	Check permission before accepting a new connection.  Note that the new
- *	socket, @newsock, has been created and some information copied to it,
- *	but the accept operation has not actually been performed.
- *	@sock contains the listening socket structure.
- *	@newsock contains the newly created server socket for connection.
- *	Return 0 if permission is granted.
- * @socket_sendmsg:
- *	Check permission before transmitting a message to another socket.
- *	@sock contains the socket structure.
- *	@msg contains the message to be transmitted.
- *	@size contains the size of message.
- *	Return 0 if permission is granted.
- * @socket_recvmsg:
- *	Check permission before receiving a message from a socket.
- *	@sock contains the socket structure.
- *	@msg contains the message structure.
- *	@size contains the size of message structure.
- *	@flags contains the operational flags.
- *	Return 0 if permission is granted.
- * @socket_getsockname:
- *	Check permission before the local address (name) of the socket object
- *	@sock is retrieved.
- *	@sock contains the socket structure.
- *	Return 0 if permission is granted.
- * @socket_getpeername:
- *	Check permission before the remote address (name) of a socket object
- *	@sock is retrieved.
- *	@sock contains the socket structure.
- *	Return 0 if permission is granted.
- * @socket_getsockopt:
- *	Check permissions before retrieving the options associated with socket
- *	@sock.
- *	@sock contains the socket structure.
- *	@level contains the protocol level to retrieve option from.
- *	@optname contains the name of option to retrieve.
- *	Return 0 if permission is granted.
- * @socket_setsockopt:
- *	Check permissions before setting the options associated with socket
- *	@sock.
- *	@sock contains the socket structure.
- *	@level contains the protocol level to set options for.
- *	@optname contains the name of the option to set.
- *	Return 0 if permission is granted.
- * @socket_shutdown:
- *	Checks permission before all or part of a connection on the socket
- *	@sock is shut down.
- *	@sock contains the socket structure.
- *	@how contains the flag indicating how future sends and receives
- *	are handled.
- *	Return 0 if permission is granted.
- * @socket_sock_rcv_skb:
- *	Check permissions on incoming network packets.  This hook is distinct
- *	from Netfilter's IP input hooks since it is the first time that the
- *	incoming sk_buff @skb has been associated with a particular socket, @sk.
- *	Must not sleep inside this hook because some callers hold spinlocks.
- *	@sk contains the sock (not socket) associated with the incoming sk_buff.
- *	@skb contains the incoming network data.
- *	Return 0 if permission is granted.
- * @socket_getpeersec_stream:
- *	This hook allows the security module to provide peer socket security
- *	state for unix or connected tcp sockets to userspace via getsockopt
- *	SO_GETPEERSEC.  For tcp sockets this can be meaningful if the
- *	socket is associated with an ipsec SA.
- *	@sock is the local socket.
- *	@optval memory where the security state is to be copied.
- *	@optlen memory where the module should copy the actual length
- *	of the security state.
- *	@len as input is the maximum length to copy to userspace provided
- *	by the caller.
- *	Return 0 if all is well, otherwise, typical getsockopt return
- *	values.
- * @socket_getpeersec_dgram:
- *	This hook allows the security module to provide peer socket security
- *	state for udp sockets on a per-packet basis to userspace via
- *	getsockopt SO_GETPEERSEC. The application must first have indicated
- *	the IP_PASSSEC option via getsockopt. It can then retrieve the
- *	security state returned by this hook for a packet via the SCM_SECURITY
- *	ancillary message type.
- *	@sock contains the peer socket. May be NULL.
- *	@skb is the sk_buff for the packet being queried. May be NULL.
- *	@secid pointer to store the secid of the packet.
- *	Return 0 on success, error on failure.
- * @sk_alloc_security:
- *	Allocate and attach a security structure to the sk->sk_security field,
- *	which is used to copy security attributes between local stream sockets.
- *	Return 0 on success, error on failure.
- * @sk_free_security:
- *	Deallocate security structure.
- * @sk_clone_security:
- *	Clone/copy security structure.
- * @sk_getsecid:
- *	Retrieve the LSM-specific secid for the sock to enable caching
- *	of network authorizations.
- * @sock_graft:
- *	Sets the socket's isec sid to the sock's sid.
- * @inet_conn_request:
- *	Sets the openreq's sid to socket's sid with MLS portion taken
- *	from peer sid.
- *	Return 0 if permission is granted.
- * @inet_csk_clone:
- *	Sets the new child socket's sid to the openreq sid.
- * @inet_conn_established:
- *	Sets the connection's peersid to the secmark on skb.
- * @secmark_relabel_packet:
- *	Check if the process should be allowed to relabel packets to
- *	the given secid.
- *	Return 0 if permission is granted.
- * @secmark_refcount_inc:
- *	Tells the LSM to increment the number of secmark labeling rules loaded.
- * @secmark_refcount_dec:
- *	Tells the LSM to decrement the number of secmark labeling rules loaded.
- * @req_classify_flow:
- *	Sets the flow's sid to the openreq sid.
- * @tun_dev_alloc_security:
- *	This hook allows a module to allocate a security structure for a TUN
- *	device.
- *	@security pointer to a security structure pointer.
- *	Returns a zero on success, negative values on failure.
- * @tun_dev_free_security:
- *	This hook allows a module to free the security structure for a TUN
- *	device.
- *	@security pointer to the TUN device's security structure.
- * @tun_dev_create:
- *	Check permissions prior to creating a new TUN device.
- *	Return 0 if permission is granted.
- * @tun_dev_attach_queue:
- *	Check permissions prior to attaching to a TUN device queue.
- *	@security pointer to the TUN device's security structure.
- *	Return 0 if permission is granted.
- * @tun_dev_attach:
- *	This hook can be used by the module to update any security state
- *	associated with the TUN device's sock structure.
- *	@sk contains the existing sock structure.
- *	@security pointer to the TUN device's security structure.
- *	Return 0 if permission is granted.
- * @tun_dev_open:
- *	This hook can be used by the module to update any security state
- *	associated with the TUN device's security structure.
- *	@security pointer to the TUN devices's security structure.
- *	Return 0 if permission is granted.
- *
  * Security hooks for SCTP
  *
  * @sctp_assoc_request:
diff --git a/security/security.c b/security/security.c
index 9112531e6c2e..1b2eca6437c4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3608,11 +3608,40 @@ int security_unix_may_send(struct socket *sock,  struct socket *other)
 }
 EXPORT_SYMBOL(security_unix_may_send);
 
+/**
+ * security_socket_create() - Check if creating a new socket is allowed
+ * @family: protocol family
+ * @type: communications type
+ * @protocol: requested protocol
+ * @kern: set to 1 if a kernel socket is requested
+ *
+ * Check permissions prior to creating a new socket.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_create(int family, int type, int protocol, int kern)
 {
 	return call_int_hook(socket_create, 0, family, type, protocol, kern);
 }
 
+/**
+ * security_socket_create() - Initialize a newly created socket
+ * @sock: socket
+ * @family: protocol family
+ * @type: communications type
+ * @protocol: requested protocol
+ * @kern: set to 1 if a kernel socket is requested
+ *
+ * This hook allows a module to update or allocate a per-socket security
+ * structure. Note that the security field was not added directly to the socket
+ * structure, but rather, the socket security information is stored in the
+ * associated inode.  Typically, the inode alloc_security hook will allocate
+ * and attach security information to SOCK_INODE(sock)->i_security.  This hook
+ * may be used to update the SOCK_INODE(sock)->i_security field with additional
+ * information that wasn't available when the inode was allocated.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_post_create(struct socket *sock, int family,
 				int type, int protocol, int kern)
 {
@@ -3620,74 +3649,223 @@ int security_socket_post_create(struct socket *sock, int family,
 						protocol, kern);
 }
 
+/**
+ * security_socket_socketpair() - Check if creating a socketpair is allowed
+ * @socka: first socket
+ * @sockb: second socket
+ *
+ * Check permissions before creating a fresh pair of sockets.
+ *
+ * Return: Returns 0 if permission is granted and the connection was
+ *         established.
+ */
 int security_socket_socketpair(struct socket *socka, struct socket *sockb)
 {
 	return call_int_hook(socket_socketpair, 0, socka, sockb);
 }
 EXPORT_SYMBOL(security_socket_socketpair);
 
+/**
+ * security_socket_bind() - Check if a socket bind operation is allowed
+ * @sock: socket
+ * @address: requested bind address
+ * @addrlen: length of address
+ *
+ * Check permission before socket protocol layer bind operation is performed
+ * and the socket @sock is bound to the address specified in the @address
+ * parameter.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
 {
 	return call_int_hook(socket_bind, 0, sock, address, addrlen);
 }
 
+/**
+ * security_socket_connect() - Check if a socket connect operation is allowed
+ * @sock: socket
+ * @address: address of remote connection point
+ * @addrlen: length of address
+ *
+ * Check permission before socket protocol layer connect operation attempts to
+ * connect socket @sock to a remote address, @address.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen)
 {
 	return call_int_hook(socket_connect, 0, sock, address, addrlen);
 }
 
+/**
+ * security_socket_listen() - Check if a socket is allowed to listen
+ * @sock: socket
+ * @backlog: connection queue size
+ *
+ * Check permission before socket protocol layer listen operation.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_listen(struct socket *sock, int backlog)
 {
 	return call_int_hook(socket_listen, 0, sock, backlog);
 }
 
+/**
+ * security_socket_accept() - Check if a socket is allowed to accept connections
+ * @sock: listening socket
+ * @newsock: newly creation connection socket
+ *
+ * Check permission before accepting a new connection.  Note that the new
+ * socket, @newsock, has been created and some information copied to it, but
+ * the accept operation has not actually been performed.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_accept(struct socket *sock, struct socket *newsock)
 {
 	return call_int_hook(socket_accept, 0, sock, newsock);
 }
 
+/**
+ * security_socket_sendmsg() - Check is sending a message is allowed
+ * @sock: sending socket
+ * @msg: message to send
+ * @size: size of message
+ *
+ * Check permission before transmitting a message to another socket.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
 {
 	return call_int_hook(socket_sendmsg, 0, sock, msg, size);
 }
 
+/**
+ * security_socket_recvmsg() - Check if receiving a message is allowed
+ * @sock: receiving socket
+ * @msg: message to receive
+ * @size: size of message
+ * @flags: operational flags
+ *
+ * Check permission before receiving a message from a socket.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
 			    int size, int flags)
 {
 	return call_int_hook(socket_recvmsg, 0, sock, msg, size, flags);
 }
 
+/**
+ * security_socket_getsockname() - Check if reading the socket addr is allowed
+ * @sock: socket
+ *
+ * Check permission before reading the local address (name) of the socket
+ * object.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_getsockname(struct socket *sock)
 {
 	return call_int_hook(socket_getsockname, 0, sock);
 }
 
+/**
+ * security_socket_getpeername() - Check if reading the peer's addr is allowed
+ * @sock: socket
+ *
+ * Check permission before the remote address (name) of a socket object.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_getpeername(struct socket *sock)
 {
 	return call_int_hook(socket_getpeername, 0, sock);
 }
 
+/**
+ * security_socket_getsockopt() - Check if reading a socket option is allowed
+ * @sock: socket
+ * @level: option's protocol level
+ * @optname: option name
+ *
+ * Check permissions before retrieving the options associated with socket
+ * @sock.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_getsockopt(struct socket *sock, int level, int optname)
 {
 	return call_int_hook(socket_getsockopt, 0, sock, level, optname);
 }
 
+/**
+ * security_socket_setsockopt() - Check if setting a socket option is allowed
+ * @sock: socket
+ * @level: option's protocol level
+ * @optname: option name
+ *
+ * Check permissions before setting the options associated with socket @sock.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_setsockopt(struct socket *sock, int level, int optname)
 {
 	return call_int_hook(socket_setsockopt, 0, sock, level, optname);
 }
 
+/**
+ * security_socket_shutdown() - Checks if shutting down the socket is allowed
+ * @sock: socket
+ * @how: flag indicating how sends and receives are handled
+ *
+ * Checks permission before all or part of a connection on the socket @sock is
+ * shut down.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_socket_shutdown(struct socket *sock, int how)
 {
 	return call_int_hook(socket_shutdown, 0, sock, how);
 }
 
+/**
+ * security_sock_rcv_skb() - Check if an incoming network packet is allowed
+ * @sk: destination sock
+ * @skb: incoming packet
+ *
+ * Check permissions on incoming network packets.  This hook is distinct from
+ * Netfilter's IP input hooks since it is the first time that the incoming
+ * sk_buff @skb has been associated with a particular socket, @sk.  Must not
+ * sleep inside this hook because some callers hold spinlocks.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	return call_int_hook(socket_sock_rcv_skb, 0, sk, skb);
 }
 EXPORT_SYMBOL(security_sock_rcv_skb);
 
+/**
+ * security_socket_getpeersec_stream() - Get the remote peer label
+ * @sock: socket
+ * @optval: destination buffer
+ * @optlen: size of peer label copied into the buffer
+ * @len: maximum size of the destination buffer
+ *
+ * This hook allows the security module to provide peer socket security state
+ * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC.
+ * For tcp sockets this can be meaningful if the socket is associated with an
+ * ipsec SA.
+ *
+ * Return: Returns 0 if all is well, otherwise, typical getsockopt return
+ *         values.
+ */
 int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
 				      sockptr_t optlen, unsigned int len)
 {
@@ -3695,6 +3873,20 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
 			     optval, optlen, len);
 }
 
+/**
+ * security_socket_getpeersec_dgram() - Get the remote peer label
+ * @sock: socket
+ * @skb: datagram packet
+ * @secid: remote peer label secid
+ *
+ * This hook allows the security module to provide peer socket security state
+ * for udp sockets on a per-packet basis to userspace via getsockopt
+ * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC
+ * option via getsockopt. It can then retrieve the security state returned by
+ * this hook for a packet via the SCM_SECURITY ancillary message type.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
 {
 	return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock,
@@ -3702,16 +3894,40 @@ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u
 }
 EXPORT_SYMBOL(security_socket_getpeersec_dgram);
 
+/**
+ * security_sk_alloc() - Allocate and initialize a sock's LSM blob
+ * @sk: sock
+ * @family: protocol family
+ * @priotity: gfp flags
+ *
+ * Allocate and attach a security structure to the sk->sk_security field, which
+ * is used to copy security attributes between local stream sockets.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
 {
 	return call_int_hook(sk_alloc_security, 0, sk, family, priority);
 }
 
+/**
+ * security_sk_free() - Free the sock's LSM blob
+ * @sk: sock
+ *
+ * Deallocate security structure.
+ */
 void security_sk_free(struct sock *sk)
 {
 	call_void_hook(sk_free_security, sk);
 }
 
+/**
+ * security_sk_clone() - Clone a sock's LSM state
+ * @sk: original sock
+ * @newsk: target sock
+ *
+ * Clone/copy security structure.
+ */
 void security_sk_clone(const struct sock *sk, struct sock *newsk)
 {
 	call_void_hook(sk_clone_security, sk, newsk);
@@ -3724,6 +3940,13 @@ void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic)
 }
 EXPORT_SYMBOL(security_sk_classify_flow);
 
+/**
+ * security_req_classify_flow() - Set a flow's secid based on request_sock
+ * @req: request_sock
+ * @flic: target flow
+ *
+ * Sets @flic's secid to @req's secid.
+ */
 void security_req_classify_flow(const struct request_sock *req,
 				struct flowi_common *flic)
 {
@@ -3731,12 +3954,30 @@ void security_req_classify_flow(const struct request_sock *req,
 }
 EXPORT_SYMBOL(security_req_classify_flow);
 
+/**
+ * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
+ * @sk: sock being grafted
+ * @sock: target socket
+ *
+ * Sets @sock's inode secid to @sk's secid and update @sk with any necessary
+ * LSM state from @sock.
+ */
 void security_sock_graft(struct sock *sk, struct socket *parent)
 {
 	call_void_hook(sock_graft, sk, parent);
 }
 EXPORT_SYMBOL(security_sock_graft);
 
+/**
+ * security_inet_conn_request() - Set request_sock state using incoming connect
+ * @sk: parent listening sock
+ * @skb: incoming connection
+ * @req: new request_sock
+ *
+ * Initialize the @req LSM state based on @sk and the incoming connect in @skb.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_inet_conn_request(const struct sock *sk,
 			struct sk_buff *skb, struct request_sock *req)
 {
@@ -3744,12 +3985,26 @@ int security_inet_conn_request(const struct sock *sk,
 }
 EXPORT_SYMBOL(security_inet_conn_request);
 
+/**
+ * security_inet_csk_clone() - Set new sock LSM state based on request_sock
+ * @newsk: new sock
+ * @req: connection request_sock
+ *
+ * Set that LSM state of @sock using the LSM state from @req.
+ */
 void security_inet_csk_clone(struct sock *newsk,
 			const struct request_sock *req)
 {
 	call_void_hook(inet_csk_clone, newsk, req);
 }
 
+/**
+ * security_inet_conn_established() - Update sock's LSM state with connection
+ * @sk: sock
+ * @skb: connection packet
+ *
+ * Update @sock's LSM state to represent a new connection from @skb.
+ */
 void security_inet_conn_established(struct sock *sk,
 			struct sk_buff *skb)
 {
@@ -3757,54 +4012,121 @@ void security_inet_conn_established(struct sock *sk,
 }
 EXPORT_SYMBOL(security_inet_conn_established);
 
+/**
+ * security_secmark_relabel_packet() - Check if setting a secmark is allowed
+ * @secid: new secmark value
+ *
+ * Check if the process should be allowed to relabel packets to @secid.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_secmark_relabel_packet(u32 secid)
 {
 	return call_int_hook(secmark_relabel_packet, 0, secid);
 }
 EXPORT_SYMBOL(security_secmark_relabel_packet);
 
+/**
+ * security_secmark_refcount_inc() - Increment the secmark labeling rule count
+ *
+ * Tells the LSM to increment the number of secmark labeling rules loaded.
+ */
 void security_secmark_refcount_inc(void)
 {
 	call_void_hook(secmark_refcount_inc);
 }
 EXPORT_SYMBOL(security_secmark_refcount_inc);
 
+/**
+ * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
+ *
+ * Tells the LSM to decrement the number of secmark labeling rules loaded.
+ */
 void security_secmark_refcount_dec(void)
 {
 	call_void_hook(secmark_refcount_dec);
 }
 EXPORT_SYMBOL(security_secmark_refcount_dec);
 
+/**
+ * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
+ * @security: pointer to the LSM blob
+ *
+ * This hook allows a module to allocate a security structure for a TUN	device,
+ * returning the pointer in @security.
+ *
+ * Return: Returns a zero on success, negative values on failure.
+ */
 int security_tun_dev_alloc_security(void **security)
 {
 	return call_int_hook(tun_dev_alloc_security, 0, security);
 }
 EXPORT_SYMBOL(security_tun_dev_alloc_security);
 
+/**
+ * security_tun_dev_free_security() - Free a TUN device LSM blob
+ * @security: LSM blob
+ *
+ * This hook allows a module to free the security structure for a TUN device.
+ */
 void security_tun_dev_free_security(void *security)
 {
 	call_void_hook(tun_dev_free_security, security);
 }
 EXPORT_SYMBOL(security_tun_dev_free_security);
 
+/**
+ * security_tun_dev_create() - Check if creating a TUN device is allowed
+ *
+ * Check permissions prior to creating a new TUN device.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_tun_dev_create(void)
 {
 	return call_int_hook(tun_dev_create, 0);
 }
 EXPORT_SYMBOL(security_tun_dev_create);
 
+/**
+ * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
+ * @security: TUN device LSM blob
+ *
+ * Check permissions prior to attaching to a TUN device queue.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_tun_dev_attach_queue(void *security)
 {
 	return call_int_hook(tun_dev_attach_queue, 0, security);
 }
 EXPORT_SYMBOL(security_tun_dev_attach_queue);
 
+/**
+ * security_tun_dev_attach() - Update TUN device LSM state on attach
+ * @sk: associated sock
+ * @security: TUN device LSM blob
+ *
+ * This hook can be used by the module to update any security state associated
+ * with the TUN device's sock structure.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_tun_dev_attach(struct sock *sk, void *security)
 {
 	return call_int_hook(tun_dev_attach, 0, sk, security);
 }
 EXPORT_SYMBOL(security_tun_dev_attach);
 
+/**
+ * security_tun_dev_open() - Update TUN device LSM state on open
+ * @security: TUN device LSM blob
+ *
+ * This hook can be used by the module to update any security state associated
+ * with the TUN device's security structure.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_tun_dev_open(void *security)
 {
 	return call_int_hook(tun_dev_open, 0, security);
-- 
cgit v1.2.3


From 4a49f592e931962ab3feb9fbb43bea719517670d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 15 Feb 2023 17:47:10 -0500
Subject: lsm: move the SCTP hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 33 ---------------------------------
 security/security.c       | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 531d141083ed..bb460e0b1ff2 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,39 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for SCTP
- *
- * @sctp_assoc_request:
- *	Passes the @asoc and @chunk->skb of the association INIT packet to
- *	the security module.
- *	@asoc pointer to sctp association structure.
- *	@skb pointer to skbuff of association packet.
- *	Return 0 on success, error on failure.
- * @sctp_bind_connect:
- *	Validiate permissions required for each address associated with sock
- *	@sk. Depending on @optname, the addresses will be treated as either
- *	for a connect or bind service. The @addrlen is calculated on each
- *	ipv4 and ipv6 address using sizeof(struct sockaddr_in) or
- *	sizeof(struct sockaddr_in6).
- *	@sk pointer to sock structure.
- *	@optname name of the option to validate.
- *	@address list containing one or more ipv4/ipv6 addresses.
- *	@addrlen total length of address(s).
- *	Return 0 on success, error on failure.
- * @sctp_sk_clone:
- *	Called whenever a new socket is created by accept(2) (i.e. a TCP
- *	style socket) or when a socket is 'peeled off' e.g userspace
- *	calls sctp_peeloff(3).
- *	@asoc pointer to current sctp association structure.
- *	@sk pointer to current sock structure.
- *	@newsk pointer to new sock structure.
- * @sctp_assoc_established:
- *	Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet
- *	to the security module.
- *	@asoc pointer to sctp association structure.
- *	@skb pointer to skbuff of association packet.
- *	Return 0 if permission is granted.
- *
  * Security hooks for Infiniband
  *
  * @ib_pkey_access:
diff --git a/security/security.c b/security/security.c
index 1b2eca6437c4..4f57c49bf561 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4133,12 +4133,35 @@ int security_tun_dev_open(void *security)
 }
 EXPORT_SYMBOL(security_tun_dev_open);
 
+/**
+ * security_sctp_assoc_request() - Update the LSM on a SCTP association req
+ * @asoc: SCTP association
+ * @skb: packet requesting the association
+ *
+ * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb)
 {
 	return call_int_hook(sctp_assoc_request, 0, asoc, skb);
 }
 EXPORT_SYMBOL(security_sctp_assoc_request);
 
+/**
+ * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
+ * @sk: socket
+ * @optname: SCTP option to validate
+ * @address: list of IP addresses to validate
+ * @addrlen: length of the address list
+ *
+ * Validiate permissions required for each address associated with sock	@sk.
+ * Depending on @optname, the addresses will be treated as either a connect or
+ * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using
+ * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen)
 {
@@ -4147,6 +4170,16 @@ int security_sctp_bind_connect(struct sock *sk, int optname,
 }
 EXPORT_SYMBOL(security_sctp_bind_connect);
 
+/**
+ * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
+ * @asoc: SCTP association
+ * @sk: original sock
+ * @newsk: target sock
+ *
+ * Called whenever a new socket is created by accept(2) (i.e. a TCP style
+ * socket) or when a socket is 'peeled off' e.g userspace calls
+ * sctp_peeloff(3).
+ */
 void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk)
 {
@@ -4154,6 +4187,16 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 }
 EXPORT_SYMBOL(security_sctp_sk_clone);
 
+/**
+ * security_sctp_assoc_established() - Update LSM state when assoc established
+ * @asoc: SCTP association
+ * @skb: packet establishing the association
+ *
+ * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the
+ * security module.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sctp_assoc_established(struct sctp_association *asoc,
 				    struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From ac318aed54987e6bd52dba90dd2ea3dca725b5c0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 15 Feb 2023 18:07:41 -0500
Subject: lsm: move the Infiniband hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 22 ----------------------
 security/security.c       | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index bb460e0b1ff2..9fc6417af980 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,28 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for Infiniband
- *
- * @ib_pkey_access:
- *	Check permission to access a pkey when modifing a QP.
- *	@subnet_prefix the subnet prefix of the port being used.
- *	@pkey the pkey to be accessed.
- *	@sec pointer to a security structure.
- *	Return 0 if permission is granted.
- * @ib_endport_manage_subnet:
- *	Check permissions to send and receive SMPs on a end port.
- *	@dev_name the IB device name (i.e. mlx4_0).
- *	@port_num the port number.
- *	@sec pointer to a security structure.
- *	Return 0 if permission is granted.
- * @ib_alloc_security:
- *	Allocate a security structure for Infiniband objects.
- *	@sec pointer to a security structure pointer.
- *	Returns 0 on success, non-zero on failure.
- * @ib_free_security:
- *	Deallocate an Infiniband security structure.
- *	@sec contains the security structure to be freed.
- *
  * Security hooks for XFRM operations.
  *
  * @xfrm_policy_alloc_security:
diff --git a/security/security.c b/security/security.c
index 4f57c49bf561..6e786e277a7f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4208,24 +4208,58 @@ EXPORT_SYMBOL(security_sctp_assoc_established);
 
 #ifdef CONFIG_SECURITY_INFINIBAND
 
+/**
+ * security_ib_pkey_access() - Check if access to an IB pkey is allowed
+ * @sec: LSM blob
+ * @subnet_prefix: subnet prefix of the port
+ * @pkey: IB pkey
+ *
+ * Check permission to access a pkey when modifing a QP.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
 {
 	return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey);
 }
 EXPORT_SYMBOL(security_ib_pkey_access);
 
+/**
+ * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
+ * @sec: LSM blob
+ * @dev_name: IB device name
+ * @port_num: port number
+ *
+ * Check permissions to send and receive SMPs on a end port.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num)
 {
 	return call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name, port_num);
 }
 EXPORT_SYMBOL(security_ib_endport_manage_subnet);
 
+/**
+ * security_ib_alloc_security() - Allocate an Infiniband LSM blob
+ * @sec: LSM blob
+ *
+ * Allocate a security structure for Infiniband objects.
+ *
+ * Return: Returns 0 on success, non-zero on failure.
+ */
 int security_ib_alloc_security(void **sec)
 {
 	return call_int_hook(ib_alloc_security, 0, sec);
 }
 EXPORT_SYMBOL(security_ib_alloc_security);
 
+/**
+ * security_ib_free_security() - Free an Infiniband LSM blob
+ * @sec: LSM blob
+ *
+ * Deallocate an Infiniband security structure.
+ */
 void security_ib_free_security(void *sec)
 {
 	call_void_hook(ib_free_security, sec);
-- 
cgit v1.2.3


From 742b99456e86aa3e9176fb1c7db200101876c614 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 15 Feb 2023 18:14:01 -0500
Subject: lsm: move the xfrm hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h |  73 --------------------------------
 security/security.c       | 103 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9fc6417af980..9c5254e4e9d1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,79 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks for XFRM operations.
- *
- * @xfrm_policy_alloc_security:
- *	@ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy
- *	Database used by the XFRM system.
- *	@sec_ctx contains the security context information being provided by
- *	the user-level policy update program (e.g., setkey).
- *	@gfp is to specify the context for the allocation.
- *	Allocate a security structure to the xp->security field; the security
- *	field is initialized to NULL when the xfrm_policy is allocated.
- *	Return 0 if operation was successful (memory to allocate, legal
- *	context).
- * @xfrm_policy_clone_security:
- *	@old_ctx contains an existing xfrm_sec_ctx.
- *	@new_ctxp contains a new xfrm_sec_ctx being cloned from old.
- *	Allocate a security structure in new_ctxp that contains the
- *	information from the old_ctx structure.
- *	Return 0 if operation was successful (memory to allocate).
- * @xfrm_policy_free_security:
- *	@ctx contains the xfrm_sec_ctx.
- *	Deallocate xp->security.
- * @xfrm_policy_delete_security:
- *	@ctx contains the xfrm_sec_ctx.
- *	Authorize deletion of xp->security.
- *	Return 0 if permission is granted.
- * @xfrm_state_alloc:
- *	@x contains the xfrm_state being added to the Security Association
- *	Database by the XFRM system.
- *	@sec_ctx contains the security context information being provided by
- *	the user-level SA generation program (e.g., setkey or racoon).
- *	Allocate a security structure to the x->security field; the security
- *	field is initialized to NULL when the xfrm_state is allocated. Set the
- *	context to correspond to sec_ctx. Return 0 if operation was successful
- *	(memory to allocate, legal context).
- * @xfrm_state_alloc_acquire:
- *	@x contains the xfrm_state being added to the Security Association
- *	Database by the XFRM system.
- *	@polsec contains the policy's security context.
- *	@secid contains the secid from which to take the mls portion of the
- *	context.
- *	Allocate a security structure to the x->security field; the security
- *	field is initialized to NULL when the xfrm_state is allocated. Set the
- *	context to correspond to secid. Return 0 if operation was successful
- *	(memory to allocate, legal context).
- * @xfrm_state_free_security:
- *	@x contains the xfrm_state.
- *	Deallocate x->security.
- * @xfrm_state_delete_security:
- *	@x contains the xfrm_state.
- *	Authorize deletion of x->security.
- *	Return 0 if permission is granted.
- * @xfrm_policy_lookup:
- *	@ctx contains the xfrm_sec_ctx for which the access control is being
- *	checked.
- *	@fl_secid contains the flow security label that is used to authorize
- *	access to the policy xp.
- *	@dir contains the direction of the flow (input or output).
- *	Check permission when a flow selects a xfrm_policy for processing
- *	XFRMs on a packet.  The hook is called when selecting either a
- *	per-socket policy or a generic xfrm policy.
- *	Return 0 if permission is granted, -ESRCH otherwise, or -errno
- *	on other errors.
- * @xfrm_state_pol_flow_match:
- *	@x contains the state to match.
- *	@xp contains the policy to check for a match.
- *	@flic contains the flowi_common struct to check for a match.
- *	Return 1 if there is a match.
- * @xfrm_decode_session:
- *	@skb points to skb to decode.
- *	@secid points to the flow key secid to set.
- *	@ckall says if all xfrms used should be checked for same secid.
- *	Return 0 if ckall is zero or all xfrms used have the same secid.
- *
  * Security hooks affecting all Key Management operations
  *
  * @key_alloc:
diff --git a/security/security.c b/security/security.c
index 6e786e277a7f..d3880096922e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4269,6 +4269,17 @@ EXPORT_SYMBOL(security_ib_free_security);
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 
+/**
+ * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
+ * @ctxp: xfrm security context being added to the SPD
+ * @sec_ctx: security label provided by userspace
+ * @gfp: gfp flags
+ *
+ * Allocate a security structure to the xp->security field; the security field
+ * is initialized to NULL when the xfrm_policy is allocated.
+ *
+ * Return:  Return 0 if operation was successful.
+ */
 int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
 			       struct xfrm_user_sec_ctx *sec_ctx,
 			       gfp_t gfp)
@@ -4277,23 +4288,58 @@ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
 }
 EXPORT_SYMBOL(security_xfrm_policy_alloc);
 
+/**
+ * security_xfrm_policy_clone() - Clone xfrm policy LSM state
+ * @old_ctx: xfrm security context
+ * @new_ctxp: target xfrm security context
+ *
+ * Allocate a security structure in new_ctxp that contains the information from
+ * the old_ctx structure.
+ *
+ * Return: Return 0 if operation was successful.
+ */
 int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
 			      struct xfrm_sec_ctx **new_ctxp)
 {
 	return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp);
 }
 
+/**
+ * security_xfrm_policy_free() - Free a xfrm security context
+ * @ctx: xfrm security context
+ *
+ * Free LSM resources associated with @ctx.
+ */
 void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
 {
 	call_void_hook(xfrm_policy_free_security, ctx);
 }
 EXPORT_SYMBOL(security_xfrm_policy_free);
 
+/**
+ * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
+ * @ctx: xfrm security context
+ *
+ * Authorize deletion of a SPD entry.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
 {
 	return call_int_hook(xfrm_policy_delete_security, 0, ctx);
 }
 
+/**
+ * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
+ * @x: xfrm state being added to the SAD
+ * @sec_ctx: security label provided by userspace
+ *
+ * Allocate a security structure to the @x->security field; the security field
+ * is initialized to NULL when the xfrm_state is allocated. Set the context to
+ * correspond to @sec_ctx.
+ *
+ * Return: Return 0 if operation was successful.
+ */
 int security_xfrm_state_alloc(struct xfrm_state *x,
 			      struct xfrm_user_sec_ctx *sec_ctx)
 {
@@ -4301,28 +4347,76 @@ int security_xfrm_state_alloc(struct xfrm_state *x,
 }
 EXPORT_SYMBOL(security_xfrm_state_alloc);
 
+/**
+ * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
+ * @x: xfrm state being added to the SAD
+ * @polsec: associated policy's security context
+ * @secid: secid from the flow
+ *
+ * Allocate a security structure to the x->security field; the security field
+ * is initialized to NULL when the xfrm_state is allocated.  Set the context to
+ * correspond to secid.
+ *
+ * Return: Returns 0 if operation was successful.
+ */
 int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
 				      struct xfrm_sec_ctx *polsec, u32 secid)
 {
 	return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid);
 }
 
+/**
+ * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
+ * @x: xfrm state
+ *
+ * Authorize deletion of x->security.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_xfrm_state_delete(struct xfrm_state *x)
 {
 	return call_int_hook(xfrm_state_delete_security, 0, x);
 }
 EXPORT_SYMBOL(security_xfrm_state_delete);
 
+/**
+ * security_xfrm_state_free() - Free a xfrm state
+ * @x: xfrm state
+ *
+ * Deallocate x->security.
+ */
 void security_xfrm_state_free(struct xfrm_state *x)
 {
 	call_void_hook(xfrm_state_free_security, x);
 }
 
+/**
+ * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
+ * @ctx: target xfrm security context
+ * @fl_secid: flow secid used to authorize access
+ *
+ * Check permission when a flow selects a xfrm_policy for processing XFRMs on a
+ * packet.  The hook is called when selecting either a per-socket policy or a
+ * generic xfrm policy.
+ *
+ * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
+ *         other errors.
+ */
 int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
 {
 	return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid);
 }
 
+/**
+ * security_xfrm_state_pol_flow_match() - Check for a xfrm match
+ * @x: xfrm state to match
+ * @xp xfrm policy to check for a match
+ * @flic: flow to check for a match.
+ *
+ * Check @xp and @flic for a match with @x.
+ *
+ * Return: Returns 1 if there is a match.
+ */
 int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
 				       struct xfrm_policy *xp,
 				       const struct flowi_common *flic)
@@ -4347,6 +4441,15 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
 	return rc;
 }
 
+/**
+ * security_xfrm_decode_session() - Determine the xfrm secid for a packet
+ * @skb: xfrm packet
+ * @secid: secid
+ *
+ * Decode the packet in @skb and return the security label in @secid.
+ *
+ * Return: Return 0 if all xfrms used have the same secid.
+ */
 int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
 {
 	return call_int_hook(xfrm_decode_session, 0, skb, secid, 1);
-- 
cgit v1.2.3


From ecc419a4453530c410c7031bc69ef34782b1c8ff Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 15 Feb 2023 21:46:31 -0500
Subject: lsm: move the key hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 32 --------------------------------
 security/security.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9c5254e4e9d1..2cfa56e3abc3 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,38 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks affecting all Key Management operations
- *
- * @key_alloc:
- *	Permit allocation of a key and assign security data. Note that key does
- *	not have a serial number assigned at this point.
- *	@key points to the key.
- *	@flags is the allocation flags.
- *	Return 0 if permission is granted, -ve error otherwise.
- * @key_free:
- *	Notification of destruction; free security data.
- *	@key points to the key.
- *	No return value.
- * @key_permission:
- *	See whether a specific operational right is granted to a process on a
- *	key.
- *	@key_ref refers to the key (key pointer + possession attribute bit).
- *	@cred points to the credentials to provide the context against which to
- *	evaluate the security data on the key.
- *	@perm describes the combination of permissions required of this key.
- *	Return 0 if permission is granted, -ve error otherwise.
- * @key_getsecurity:
- *	Get a textual representation of the security context attached to a key
- *	for the purposes of honouring KEYCTL_GETSECURITY.  This function
- *	allocates the storage for the NUL-terminated string and the caller
- *	should free it.
- *	@key points to the key to be queried.
- *	@_buffer points to a pointer that should be set to point to the
- *	resulting string (if no label or an error occurs).
- *	Return the length of the string (including terminating NUL) or -ve if
- *	an error.
- *	May also return 0 (and a NULL buffer pointer) if there is no label.
- *
  * Security hooks affecting all System V IPC operations.
  *
  * @ipc_permission:
diff --git a/security/security.c b/security/security.c
index d3880096922e..bde8d24af6ac 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4468,23 +4468,63 @@ EXPORT_SYMBOL(security_skb_classify_flow);
 
 #ifdef CONFIG_KEYS
 
+/**
+ * security_key_alloc() - Allocate and initialize a kernel key LSM blob
+ * @key: key
+ * @cred: credentials
+ * @flags: allocation flags
+ *
+ * Permit allocation of a key and assign security data. Note that key does not
+ * have a serial number assigned at this point.
+ *
+ * Return: Return 0 if permission is granted, -ve error otherwise.
+ */
 int security_key_alloc(struct key *key, const struct cred *cred,
 		       unsigned long flags)
 {
 	return call_int_hook(key_alloc, 0, key, cred, flags);
 }
 
+/**
+ * security_key_free() - Free a kernel key LSM blob
+ * @key: key
+ *
+ * Notification of destruction; free security data.
+ */
 void security_key_free(struct key *key)
 {
 	call_void_hook(key_free, key);
 }
 
+/**
+ * security_key_permission() - Check if a kernel key operation is allowed
+ * @key_ref: key reference
+ * @cred: credentials of actor requesting access
+ * @need_perm: requested permissions
+ *
+ * See whether a specific operational right is granted to a process on a key.
+ *
+ * Return: Return 0 if permission is granted, -ve error otherwise.
+ */
 int security_key_permission(key_ref_t key_ref, const struct cred *cred,
 			    enum key_need_perm need_perm)
 {
 	return call_int_hook(key_permission, 0, key_ref, cred, need_perm);
 }
 
+/**
+ * security_key_getsecurity() - Get the key's security label
+ * @key: key
+ * @buffer: security label buffer
+ *
+ * Get a textual representation of the security context attached to a key for
+ * the purposes of honouring KEYCTL_GETSECURITY.  This function allocates the
+ * storage for the NUL-terminated string and the caller should free it.
+ *
+ * Return: Returns the length of @buffer (including terminating NUL) or -ve if
+ *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
+ *         there is no security label assigned to the key.
+ */
 int security_key_getsecurity(struct key *key, char **_buffer)
 {
 	*_buffer = NULL;
-- 
cgit v1.2.3


From 43fad282187695e25c51425ee9f5f9f08fe5e15d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 11:53:49 -0500
Subject: lsm: move the sysv hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 141 ----------------------------------
 security/security.c       | 191 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 191 insertions(+), 141 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 2cfa56e3abc3..f6679fead627 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,147 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * Security hooks affecting all System V IPC operations.
- *
- * @ipc_permission:
- *	Check permissions for access to IPC
- *	@ipcp contains the kernel IPC permission structure.
- *	@flag contains the desired (requested) permission set.
- *	Return 0 if permission is granted.
- * @ipc_getsecid:
- *	Get the secid associated with the ipc object.
- *	@ipcp contains the kernel IPC permission structure.
- *	@secid contains a pointer to the location where result will be saved.
- *	In case of failure, @secid will be set to zero.
- *
- * Security hooks for individual messages held in System V IPC message queues
- *
- * @msg_msg_alloc_security:
- *	Allocate and attach a security structure to the msg->security field.
- *	The security field is initialized to NULL when the structure is first
- *	created.
- *	@msg contains the message structure to be modified.
- *	Return 0 if operation was successful and permission is granted.
- * @msg_msg_free_security:
- *	Deallocate the security structure for this message.
- *	@msg contains the message structure to be modified.
- *
- * Security hooks for System V IPC Message Queues
- *
- * @msg_queue_alloc_security:
- *	Allocate and attach a security structure to the
- *	@perm->security field. The security field is initialized to
- *	NULL when the structure is first created.
- *	@perm contains the IPC permissions of the message queue.
- *	Return 0 if operation was successful and permission is granted.
- * @msg_queue_free_security:
- *	Deallocate security field @perm->security for the message queue.
- *	@perm contains the IPC permissions of the message queue.
- * @msg_queue_associate:
- *	Check permission when a message queue is requested through the
- *	msgget system call. This hook is only called when returning the
- *	message queue identifier for an existing message queue, not when a
- *	new message queue is created.
- *	@perm contains the IPC permissions of the message queue.
- *	@msqflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @msg_queue_msgctl:
- *	Check permission when a message control operation specified by @cmd
- *	is to be performed on the message queue with permissions @perm.
- *	The @perm may be NULL, e.g. for IPC_INFO or MSG_INFO.
- *	@perm contains the IPC permissions of the msg queue. May be NULL.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @msg_queue_msgsnd:
- *	Check permission before a message, @msg, is enqueued on the message
- *	queue with permissions @perm.
- *	@perm contains the IPC permissions of the message queue.
- *	@msg contains the message to be enqueued.
- *	@msqflg contains operational flags.
- *	Return 0 if permission is granted.
- * @msg_queue_msgrcv:
- *	Check permission before a message, @msg, is removed from the message
- *	queue. The @target task structure contains a pointer to the
- *	process that will be receiving the message (not equal to the current
- *	process when inline receives are being performed).
- *	@perm contains the IPC permissions of the message queue.
- *	@msg contains the message destination.
- *	@target contains the task structure for recipient process.
- *	@type contains the type of message requested.
- *	@mode contains the operational flags.
- *	Return 0 if permission is granted.
- *
- * Security hooks for System V Shared Memory Segments
- *
- * @shm_alloc_security:
- *	Allocate and attach a security structure to the @perm->security
- *	field. The security field is initialized to NULL when the structure is
- *	first created.
- *	@perm contains the IPC permissions of the shared memory structure.
- *	Return 0 if operation was successful and permission is granted.
- * @shm_free_security:
- *	Deallocate the security structure @perm->security for the memory segment.
- *	@perm contains the IPC permissions of the shared memory structure.
- * @shm_associate:
- *	Check permission when a shared memory region is requested through the
- *	shmget system call. This hook is only called when returning the shared
- *	memory region identifier for an existing region, not when a new shared
- *	memory region is created.
- *	@perm contains the IPC permissions of the shared memory structure.
- *	@shmflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @shm_shmctl:
- *	Check permission when a shared memory control operation specified by
- *	@cmd is to be performed on the shared memory region with permissions @perm.
- *	The @perm may be NULL, e.g. for IPC_INFO or SHM_INFO.
- *	@perm contains the IPC permissions of the shared memory structure.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @shm_shmat:
- *	Check permissions prior to allowing the shmat system call to attach the
- *	shared memory segment with permissions @perm to the data segment of the
- *	calling process. The attaching address is specified by @shmaddr.
- *	@perm contains the IPC permissions of the shared memory structure.
- *	@shmaddr contains the address to attach memory region to.
- *	@shmflg contains the operational flags.
- *	Return 0 if permission is granted.
- *
- * Security hooks for System V Semaphores
- *
- * @sem_alloc_security:
- *	Allocate and attach a security structure to the @perm->security
- *	field. The security field is initialized to NULL when the structure is
- *	first created.
- *	@perm contains the IPC permissions of the semaphore.
- *	Return 0 if operation was successful and permission is granted.
- * @sem_free_security:
- *	Deallocate security structure @perm->security for the semaphore.
- *	@perm contains the IPC permissions of the semaphore.
- * @sem_associate:
- *	Check permission when a semaphore is requested through the semget
- *	system call. This hook is only called when returning the semaphore
- *	identifier for an existing semaphore, not when a new one must be
- *	created.
- *	@perm contains the IPC permissions of the semaphore.
- *	@semflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @sem_semctl:
- *	Check permission when a semaphore operation specified by @cmd is to be
- *	performed on the semaphore. The @perm may be NULL, e.g. for
- *	IPC_INFO or SEM_INFO.
- *	@perm contains the IPC permissions of the semaphore. May be NULL.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @sem_semop:
- *	Check permissions before performing operations on members of the
- *	semaphore set. If the @alter flag is nonzero, the semaphore set
- *	may be modified.
- *	@perm contains the IPC permissions of the semaphore.
- *	@sops contains the operations to perform.
- *	@nsops contains the number of operations to perform.
- *	@alter contains the flag indicating whether changes are to be made.
- *	Return 0 if permission is granted.
- *
  * @binder_set_context_mgr:
  *	Check whether @mgr is allowed to be the binder context manager.
  *	@mgr contains the struct cred for the current binder process.
diff --git a/security/security.c b/security/security.c
index bde8d24af6ac..83443984b11a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3254,17 +3254,43 @@ int security_create_user_ns(const struct cred *cred)
 	return call_int_hook(userns_create, 0, cred);
 }
 
+/**
+ * security_ipc_permission() - Check if sysv ipc access is allowed
+ * @ipcp: ipc permission structure
+ * @flags: requested permissions
+ *
+ * Check permissions for access to IPC.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 {
 	return call_int_hook(ipc_permission, 0, ipcp, flag);
 }
 
+/**
+ * security_ipc_getsecid() - Get the sysv ipc object's secid
+ * @ipcp: ipc permission structure
+ * @secid: secid pointer
+ *
+ * Get the secid associated with the ipc object.  In case of failure, @secid
+ * will be set to zero.
+ */
 void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
 {
 	*secid = 0;
 	call_void_hook(ipc_getsecid, ipcp, secid);
 }
 
+/**
+ * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob
+ * @msg: message structure
+ *
+ * Allocate and attach a security structure to the msg->security field.  The
+ * security field is initialized to NULL when the structure is first created.
+ *
+ * Return: Return 0 if operation was successful and permission is granted.
+ */
 int security_msg_msg_alloc(struct msg_msg *msg)
 {
 	int rc = lsm_msg_msg_alloc(msg);
@@ -3277,6 +3303,12 @@ int security_msg_msg_alloc(struct msg_msg *msg)
 	return rc;
 }
 
+/**
+ * security_msg_msg_free() - Free a sysv ipc message LSM blob
+ * @msg: message structure
+ *
+ * Deallocate the security structure for this message.
+ */
 void security_msg_msg_free(struct msg_msg *msg)
 {
 	call_void_hook(msg_msg_free_security, msg);
@@ -3284,6 +3316,15 @@ void security_msg_msg_free(struct msg_msg *msg)
 	msg->security = NULL;
 }
 
+/**
+ * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob
+ * @msq: sysv ipc permission structure
+ *
+ * Allocate and attach a security structure to @msg. The security field is
+ * initialized to NULL when the structure is first created.
+ *
+ * Return: Returns 0 if operation was successful and permission is granted.
+ */
 int security_msg_queue_alloc(struct kern_ipc_perm *msq)
 {
 	int rc = lsm_ipc_alloc(msq);
@@ -3296,6 +3337,12 @@ int security_msg_queue_alloc(struct kern_ipc_perm *msq)
 	return rc;
 }
 
+/**
+ * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob
+ * @msq: sysv ipc permission structure
+ *
+ * Deallocate security field @perm->security for the message queue.
+ */
 void security_msg_queue_free(struct kern_ipc_perm *msq)
 {
 	call_void_hook(msg_queue_free_security, msq);
@@ -3303,28 +3350,84 @@ void security_msg_queue_free(struct kern_ipc_perm *msq)
 	msq->security = NULL;
 }
 
+/**
+ * security_msg_queue_associate() - Check if a msg queue operation is allowed
+ * @msq: sysv ipc permission structure
+ * @msqflg: operation flags
+ *
+ * Check permission when a message queue is requested through the msgget system
+ * call. This hook is only called when returning the message queue identifier
+ * for an existing message queue, not when a new message queue is created.
+ *
+ * Return: Return 0 if permission is granted.
+ */
 int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
 {
 	return call_int_hook(msg_queue_associate, 0, msq, msqflg);
 }
 
+/**
+ * security_msg_queue_msgctl() - Check if a msg queue operation is allowed
+ * @msq: sysv ipc permission structure
+ * @cmd: operation
+ *
+ * Check permission when a message control operation specified by @cmd is to be
+ * performed on the message queue with permissions.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
 {
 	return call_int_hook(msg_queue_msgctl, 0, msq, cmd);
 }
 
+/**
+ * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed
+ * @msq: sysv ipc permission structure
+ * @msg: message
+ * @msqflg: operation flags
+ *
+ * Check permission before a message, @msg, is enqueued on the message queue
+ * with permissions specified in @msq.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
 			       struct msg_msg *msg, int msqflg)
 {
 	return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg);
 }
 
+/**
+ * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed
+ * @msq: sysv ipc permission structure
+ * @msg: message
+ * @target: target task
+ * @type: type of message requested
+ * @mode: operation flags
+ *
+ * Check permission before a message, @msg, is removed from the message	queue.
+ * The @target task structure contains a pointer to the process that will be
+ * receiving the message (not equal to the current process when inline receives
+ * are being performed).
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
 			       struct task_struct *target, long type, int mode)
 {
 	return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode);
 }
 
+/**
+ * security_shm_alloc() - Allocate a sysv shm LSM blob
+ * @shp: sysv ipc permission structure
+ *
+ * Allocate and attach a security structure to the @shp security field.  The
+ * security field is initialized to NULL when the structure is first created.
+ *
+ * Return: Returns 0 if operation was successful and permission is granted.
+ */
 int security_shm_alloc(struct kern_ipc_perm *shp)
 {
 	int rc = lsm_ipc_alloc(shp);
@@ -3337,6 +3440,12 @@ int security_shm_alloc(struct kern_ipc_perm *shp)
 	return rc;
 }
 
+/**
+ * security_shm_free() - Free a sysv shm LSM blob
+ * @shp: sysv ipc permission structure
+ *
+ * Deallocate the security structure @perm->security for the memory segment.
+ */
 void security_shm_free(struct kern_ipc_perm *shp)
 {
 	call_void_hook(shm_free_security, shp);
@@ -3344,21 +3453,64 @@ void security_shm_free(struct kern_ipc_perm *shp)
 	shp->security = NULL;
 }
 
+/**
+ * security_shm_associate() - Check if a sysv shm operation is allowed
+ * @shp: sysv ipc permission structure
+ * @shmflg: operation flags
+ *
+ * Check permission when a shared memory region is requested through the shmget
+ * system call. This hook is only called when returning the shared memory
+ * region identifier for an existing region, not when a new shared memory
+ * region is created.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
 {
 	return call_int_hook(shm_associate, 0, shp, shmflg);
 }
 
+/**
+ * security_shm_shmctl() - Check if a sysv shm operation is allowed
+ * @shp: sysv ipc permission structure
+ * @cmd: operation
+ *
+ * Check permission when a shared memory control operation specified by @cmd is
+ * to be performed on the shared memory region with permissions in @shp.
+ *
+ * Return: Return 0 if permission is granted.
+ */
 int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
 {
 	return call_int_hook(shm_shmctl, 0, shp, cmd);
 }
 
+/**
+ * security_shm_shmat() - Check if a sysv shm attach operation is allowed
+ * @shp: sysv ipc permission structure
+ * @shmaddr: address of memory region to attach
+ * @shmflg: operation flags
+ *
+ * Check permissions prior to allowing the shmat system call to attach the
+ * shared memory segment with permissions @shp to the data segment of the
+ * calling process. The attaching address is specified by @shmaddr.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg)
 {
 	return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg);
 }
 
+/**
+ * security_sem_alloc() - Allocate a sysv semaphore LSM blob
+ * @sma: sysv ipc permission structure
+ *
+ * Allocate and attach a security structure to the @sma security field. The
+ * security field is initialized to NULL when the structure is first created.
+ *
+ * Return: Returns 0 if operation was successful and permission is granted.
+ */
 int security_sem_alloc(struct kern_ipc_perm *sma)
 {
 	int rc = lsm_ipc_alloc(sma);
@@ -3371,6 +3523,12 @@ int security_sem_alloc(struct kern_ipc_perm *sma)
 	return rc;
 }
 
+/**
+ * security_sem_free() - Free a sysv semaphore LSM blob
+ * @sma: sysv ipc permission structure
+ *
+ * Deallocate security structure @sma->security for the semaphore.
+ */
 void security_sem_free(struct kern_ipc_perm *sma)
 {
 	call_void_hook(sem_free_security, sma);
@@ -3378,16 +3536,49 @@ void security_sem_free(struct kern_ipc_perm *sma)
 	sma->security = NULL;
 }
 
+/**
+ * security_sem_associate() - Check if a sysv semaphore operation is allowed
+ * @sma: sysv ipc permission structure
+ * @semflg: operation flags
+ *
+ * Check permission when a semaphore is requested through the semget system
+ * call. This hook is only called when returning the semaphore identifier for
+ * an existing semaphore, not when a new one must be created.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
 {
 	return call_int_hook(sem_associate, 0, sma, semflg);
 }
 
+/**
+ * security_sem_ctl() - Check if a sysv semaphore operation is allowed
+ * @sma: sysv ipc permission structure
+ * @cmd: operation
+ *
+ * Check permission when a semaphore operation specified by @cmd is to be
+ * performed on the semaphore.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
 {
 	return call_int_hook(sem_semctl, 0, sma, cmd);
 }
 
+/**
+ * security_sem_semop() - Check if a sysv semaphore operation is allowed
+ * @sma: sysv ipc permission structure
+ * @sops: operations to perform
+ * @nsops: number of operations
+ * @alter: flag indicating changes will be made
+ *
+ * Check permissions before performing operations on members of the semaphore
+ * set. If the @alter flag is nonzero, the semaphore set may be modified.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter)
 {
-- 
cgit v1.2.3


From 1427ddbe5cc1a3d4ac068cea78638baf95f16c18 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 16:39:08 -0500
Subject: lsm: move the binder hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 22 ----------------------
 security/security.c       | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index f6679fead627..0a5b3b46fc2b 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -32,28 +32,6 @@
 /**
  * union security_list_options - Linux Security Module hook function list
  *
- * @binder_set_context_mgr:
- *	Check whether @mgr is allowed to be the binder context manager.
- *	@mgr contains the struct cred for the current binder process.
- *	Return 0 if permission is granted.
- * @binder_transaction:
- *	Check whether @from is allowed to invoke a binder transaction call
- *	to @to.
- *	@from contains the struct cred for the sending process.
- *	@to contains the struct cred for the receiving process.
- *	Return 0 if permission is granted.
- * @binder_transfer_binder:
- *	Check whether @from is allowed to transfer a binder reference to @to.
- *	@from contains the struct cred for the sending process.
- *	@to contains the struct cred for the receiving process.
- *	Return 0 if permission is granted.
- * @binder_transfer_file:
- *	Check whether @from is allowed to transfer @file to @to.
- *	@from contains the struct cred for the sending process.
- *	@file contains the struct file being transferred.
- *	@to contains the struct cred for the receiving process.
- *	Return 0 if permission is granted.
- *
  * @ptrace_access_check:
  *	Check permission before allowing the current process to trace the
  *	@child process.
diff --git a/security/security.c b/security/security.c
index 83443984b11a..b21154ed152f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -779,23 +779,59 @@ static int lsm_superblock_alloc(struct super_block *sb)
 
 /* Security operations */
 
+/**
+ * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok
+ * @mgr: task credentials of current binder process
+ *
+ * Check whether @mgr is allowed to be the binder context manager.
+ *
+ * Return: Return 0 if permission is granted.
+ */
 int security_binder_set_context_mgr(const struct cred *mgr)
 {
 	return call_int_hook(binder_set_context_mgr, 0, mgr);
 }
 
+/**
+ * security_binder_transaction() - Check if a binder transaction is allowed
+ * @from: sending process
+ * @to: receiving process
+ *
+ * Check whether @from is allowed to invoke a binder transaction call to @to.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_binder_transaction(const struct cred *from,
 				const struct cred *to)
 {
 	return call_int_hook(binder_transaction, 0, from, to);
 }
 
+/**
+ * security_binder_transfer_binder() - Check if a binder transfer is allowed
+ * @from: sending process
+ * @to: receiving process
+ *
+ * Check whether @from is allowed to transfer a binder reference to @to.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_binder_transfer_binder(const struct cred *from,
 				    const struct cred *to)
 {
 	return call_int_hook(binder_transfer_binder, 0, from, to);
 }
 
+/**
+ * security_binder_transfer_file() - Check if a binder file xfer is allowed
+ * @from: sending process
+ * @to: receiving process
+ * @file: file being transferred
+ *
+ * Check whether @from is allowed to transfer @file to @to.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_binder_transfer_file(const struct cred *from,
 				  const struct cred *to, struct file *file)
 {
-- 
cgit v1.2.3


From b14faf9c94a66ef398c2c3fa6e141814f04e274e Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 17:00:01 -0500
Subject: lsm: move the audit hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 32 --------------------------------
 security/security.c       | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0a5b3b46fc2b..e36387f88083 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -135,38 +135,6 @@
  *	@secdata contains the security context.
  *	@seclen contains the length of the security context.
  *
- * Security hooks for Audit
- *
- * @audit_rule_init:
- *	Allocate and initialize an LSM audit rule structure.
- *	@field contains the required Audit action.
- *	Fields flags are defined in <include/linux/audit.h>
- *	@op contains the operator the rule uses.
- *	@rulestr contains the context where the rule will be applied to.
- *	@lsmrule contains a pointer to receive the result.
- *	Return 0 if @lsmrule has been successfully set,
- *	-EINVAL in case of an invalid rule.
- *
- * @audit_rule_known:
- *	Specifies whether given @krule contains any fields related to
- *	current LSM.
- *	@krule contains the audit rule of interest.
- *	Return 1 in case of relation found, 0 otherwise.
- *
- * @audit_rule_match:
- *	Determine if given @secid matches a rule previously approved
- *	by @audit_rule_known.
- *	@secid contains the security id in question.
- *	@field contains the field which relates to current LSM.
- *	@op contains the operator that will be used for matching.
- *	@lrule points to the audit rule that will be checked against.
- *	Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure.
- *
- * @audit_rule_free:
- *	Deallocate the LSM audit rule structure previously allocated by
- *	audit_rule_init.
- *	@lsmrule contains the allocated rule.
- *
  * @inode_invalidate_secctx:
  *	Notify the security module that it must revalidate the security context
  *	of an inode.
diff --git a/security/security.c b/security/security.c
index b21154ed152f..80c05c28b7ff 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4762,21 +4762,62 @@ int security_key_getsecurity(struct key *key, char **_buffer)
 
 #ifdef CONFIG_AUDIT
 
+/**
+ * security_audit_rule_init() - Allocate and init an LSM audit rule struct
+ * @field: audit action
+ * @op: rule operator
+ * @rulestr: rule context
+ * @lsmrule: receive buffer for audit rule struct
+ *
+ * Allocate and initialize an LSM audit rule structure.
+ *
+ * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
+ *         an invalid rule.
+ */
 int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule)
 {
 	return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule);
 }
 
+/**
+ * security_audit_rule_known() - Check if an audit rule contains LSM fields
+ * @krule: audit rule
+ *
+ * Specifies whether given @krule contains any fields related to the current
+ * LSM.
+ *
+ * Return: Returns 1 in case of relation found, 0 otherwise.
+ */
 int security_audit_rule_known(struct audit_krule *krule)
 {
 	return call_int_hook(audit_rule_known, 0, krule);
 }
 
+/**
+ * security_audit_rule_free() - Free an LSM audit rule struct
+ * @lsmrule: audit rule struct
+ *
+ * Deallocate the LSM audit rule structure previously allocated by
+ * audit_rule_init().
+ */
 void security_audit_rule_free(void *lsmrule)
 {
 	call_void_hook(audit_rule_free, lsmrule);
 }
 
+/**
+ * security_audit_rule_match() - Check if a label matches an audit rule
+ * @secid: security label
+ * @field: LSM audit field
+ * @op: matching operator
+ * @lsmrule: audit rule
+ *
+ * Determine if given @secid matches a rule previously approved by
+ * security_audit_rule_known().
+ *
+ * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
+ *         failure.
+ */
 int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
 {
 	return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
-- 
cgit v1.2.3


From 55e853201a9e0383c9f6d5d800155e334685cd7e Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 17:13:40 -0500
Subject: lsm: move the bpf hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 36 --------------------------
 security/security.c       | 65 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index e36387f88083..601d1ecdae0f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -190,42 +190,6 @@
  *	@key: The key to watch.
  *	Return 0 if permission is granted.
  *
- * Security hooks for using the eBPF maps and programs functionalities through
- * eBPF syscalls.
- *
- * @bpf:
- *	Do a initial check for all bpf syscalls after the attribute is copied
- *	into the kernel. The actual security module can implement their own
- *	rules to check the specific cmd they need.
- *	Return 0 if permission is granted.
- *
- * @bpf_map:
- *	Do a check when the kernel generate and return a file descriptor for
- *	eBPF maps.
- *	@map: bpf map that we want to access.
- *	@mask: the access flags.
- *	Return 0 if permission is granted.
- *
- * @bpf_prog:
- *	Do a check when the kernel generate and return a file descriptor for
- *	eBPF programs.
- *	@prog: bpf prog that userspace want to use.
- *	Return 0 if permission is granted.
- *
- * @bpf_map_alloc_security:
- *	Initialize the security field inside bpf map.
- *	Return 0 on success, error on failure.
- *
- * @bpf_map_free_security:
- *	Clean up the security information stored inside bpf map.
- *
- * @bpf_prog_alloc_security:
- *	Initialize the security field inside bpf program.
- *	Return 0 on success, error on failure.
- *
- * @bpf_prog_free_security:
- *	Clean up the security information stored inside bpf prog.
- *
  * @locked_down:
  *	Determine whether a kernel feature that potentially enables arbitrary
  *	code execution in kernel space should be permitted.
diff --git a/security/security.c b/security/security.c
index 80c05c28b7ff..4037af6b5196 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4825,30 +4825,95 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
 #endif /* CONFIG_AUDIT */
 
 #ifdef CONFIG_BPF_SYSCALL
+/**
+ * security_bpf() - Check if the bpf syscall operation is allowed
+ * @cmd: command
+ * @attr: bpf attribute
+ * @size: size
+ *
+ * Do a initial check for all bpf syscalls after the attribute is copied into
+ * the kernel. The actual security module can implement their own rules to
+ * check the specific cmd they need.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 {
 	return call_int_hook(bpf, 0, cmd, attr, size);
 }
+
+/**
+ * security_bpf_map() - Check if access to a bpf map is allowed
+ * @map: bpf map
+ * @fmode: mode
+ *
+ * Do a check when the kernel generates and returns a file descriptor for eBPF
+ * maps.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_bpf_map(struct bpf_map *map, fmode_t fmode)
 {
 	return call_int_hook(bpf_map, 0, map, fmode);
 }
+
+/**
+ * security_bpf_prog() - Check if access to a bpf program is allowed
+ * @prog: bpf program
+ *
+ * Do a check when the kernel generates and returns a file descriptor for eBPF
+ * programs.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_bpf_prog(struct bpf_prog *prog)
 {
 	return call_int_hook(bpf_prog, 0, prog);
 }
+
+/**
+ * security_bpf_map_alloc() - Allocate a bpf map LSM blob
+ * @map: bpf map
+ *
+ * Initialize the security field inside bpf map.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_bpf_map_alloc(struct bpf_map *map)
 {
 	return call_int_hook(bpf_map_alloc_security, 0, map);
 }
+
+/**
+ * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
+ * @aux: bpf program aux info struct
+ *
+ * Initialize the security field inside bpf program.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	return call_int_hook(bpf_prog_alloc_security, 0, aux);
 }
+
+/**
+ * security_bpf_map_free() - Free a bpf map's LSM blob
+ * @map: bpf map
+ *
+ * Clean up the security information stored inside bpf map.
+ */
 void security_bpf_map_free(struct bpf_map *map)
 {
 	call_void_hook(bpf_map_free_security, map);
 }
+
+/**
+ * security_bpf_prog_free() - Free a bpf program's LSM blob
+ * @aux: bpf program aux info struct
+ *
+ * Clean up the security information stored inside bpf prog.
+ */
 void security_bpf_prog_free(struct bpf_prog_aux *aux)
 {
 	call_void_hook(bpf_prog_free_security, aux);
-- 
cgit v1.2.3


From 452b670c7222c7bf11b45b13f5a736f96d0be1e3 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 17:22:36 -0500
Subject: lsm: move the perf hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 17 -----------------
 security/security.c       | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 601d1ecdae0f..3d8d430e271a 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -196,23 +196,6 @@
  *	@what: kernel feature being accessed.
  *	Return 0 if permission is granted.
  *
- * Security hooks for perf events
- *
- * @perf_event_open:
- *	Check whether the @type of perf_event_open syscall is allowed.
- *	Return 0 if permission is granted.
- * @perf_event_alloc:
- *	Allocate and save perf_event security info.
- *	Return 0 on success, error on failure.
- * @perf_event_free:
- *	Release (free) perf_event security info.
- * @perf_event_read:
- *	Read perf_event security info if allowed.
- *	Return 0 if permission is granted.
- * @perf_event_write:
- *	Write perf_event security info if allowed.
- *	Return 0 if permission is granted.
- *
  * Security hooks for io_uring
  *
  * @uring_override_creds:
diff --git a/security/security.c b/security/security.c
index 4037af6b5196..47506ae1b187 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4927,26 +4927,65 @@ int security_locked_down(enum lockdown_reason what)
 EXPORT_SYMBOL(security_locked_down);
 
 #ifdef CONFIG_PERF_EVENTS
+/**
+ * security_perf_event_open() - Check if a perf event open is allowed
+ * @attr: perf event attribute
+ * @type: type of event
+ *
+ * Check whether the @type of perf_event_open syscall is allowed.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_perf_event_open(struct perf_event_attr *attr, int type)
 {
 	return call_int_hook(perf_event_open, 0, attr, type);
 }
 
+/**
+ * security_perf_event_alloc() - Allocate a perf event LSM blob
+ * @event: perf event
+ *
+ * Allocate and save perf_event security info.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_perf_event_alloc(struct perf_event *event)
 {
 	return call_int_hook(perf_event_alloc, 0, event);
 }
 
+/**
+ * security_perf_event_free() - Free a perf event LSM blob
+ * @event: perf event
+ *
+ * Release (free) perf_event security info.
+ */
 void security_perf_event_free(struct perf_event *event)
 {
 	call_void_hook(perf_event_free, event);
 }
 
+/**
+ * security_perf_event_read() - Check if reading a perf event label is allowed
+ * @event: perf event
+ *
+ * Read perf_event security info if allowed.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_perf_event_read(struct perf_event *event)
 {
 	return call_int_hook(perf_event_read, 0, event);
 }
 
+/**
+ * security_perf_event_write() - Check if writing a perf event label is allowed
+ * @event: perf event
+ *
+ * Write perf_event security info if allowed.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_perf_event_write(struct perf_event *event)
 {
 	return call_int_hook(perf_event_write, 0, event);
-- 
cgit v1.2.3


From 1cd2aca64a5dc4edb65539dc26f24e162ab0e11c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 17:28:31 -0500
Subject: lsm: move the io_uring hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 17 -----------------
 security/security.c       | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3d8d430e271a..8e006df1db56 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -196,23 +196,6 @@
  *	@what: kernel feature being accessed.
  *	Return 0 if permission is granted.
  *
- * Security hooks for io_uring
- *
- * @uring_override_creds:
- *	Check if the current task, executing an io_uring operation, is allowed
- *	to override it's credentials with @new.
- *	@new: the new creds to use.
- *	Return 0 if permission is granted.
- *
- * @uring_sqpoll:
- *	Check whether the current task is allowed to spawn a io_uring polling
- *	thread (IORING_SETUP_SQPOLL).
- *	Return 0 if permission is granted.
- *
- * @uring_cmd:
- *	Check whether the file_operations uring_cmd is allowed to run.
- *	Return 0 if permission is granted.
- *
  */
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
diff --git a/security/security.c b/security/security.c
index 47506ae1b187..af1db3fa7cd0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4993,15 +4993,41 @@ int security_perf_event_write(struct perf_event *event)
 #endif /* CONFIG_PERF_EVENTS */
 
 #ifdef CONFIG_IO_URING
+/**
+ * security_uring_override_creds() - Check if overriding creds is allowed
+ * @new: new credentials
+ *
+ * Check if the current task, executing an io_uring operation, is allowed to
+ * override it's credentials with @new.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_uring_override_creds(const struct cred *new)
 {
 	return call_int_hook(uring_override_creds, 0, new);
 }
 
+/**
+ * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed
+ *
+ * Check whether the current task is allowed to spawn a io_uring polling thread
+ * (IORING_SETUP_SQPOLL).
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_uring_sqpoll(void)
 {
 	return call_int_hook(uring_sqpoll, 0);
 }
+
+/**
+ * security_uring_cmd() - Check if a io_uring passthrough command is allowed
+ * @ioucmd: command
+ *
+ * Check whether the file_operations uring_cmd is allowed to run.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_uring_cmd(struct io_uring_cmd *ioucmd)
 {
 	return call_int_hook(uring_cmd, 0, ioucmd);
-- 
cgit v1.2.3


From e261301c851aee401cfc63179ca4d3facd2f098b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Feb 2023 17:34:14 -0500
Subject: lsm: move the remaining LSM hook comments to security/security.c

This patch relocates the LSM hook function comments to the function
definitions, in keeping with the current kernel conventions.  This
should make the hook descriptions more easily discoverable and easier
to maintain.

While formatting changes have been done to better fit the kernel-doc
style, content changes have been kept to a minimum and limited to
text which was obviously incorrect and/or outdated.  It is expected
the future patches will improve the quality of the function header
comments.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 168 ---------------------------------
 security/security.c       | 231 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+), 168 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 8e006df1db56..ddbbe89a7a48 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -29,174 +29,6 @@
 #include <linux/init.h>
 #include <linux/rculist.h>
 
-/**
- * union security_list_options - Linux Security Module hook function list
- *
- * @ptrace_access_check:
- *	Check permission before allowing the current process to trace the
- *	@child process.
- *	Security modules may also want to perform a process tracing check
- *	during an execve in the set_security or apply_creds hooks of
- *	tracing check during an execve in the bprm_set_creds hook of
- *	binprm_security_ops if the process is being traced and its security
- *	attributes would be changed by the execve.
- *	@child contains the task_struct structure for the target process.
- *	@mode contains the PTRACE_MODE flags indicating the form of access.
- *	Return 0 if permission is granted.
- * @ptrace_traceme:
- *	Check that the @parent process has sufficient permission to trace the
- *	current process before allowing the current process to present itself
- *	to the @parent process for tracing.
- *	@parent contains the task_struct structure for debugger process.
- *	Return 0 if permission is granted.
- * @capget:
- *	Get the @effective, @inheritable, and @permitted capability sets for
- *	the @target process.  The hook may also perform permission checking to
- *	determine if the current process is allowed to see the capability sets
- *	of the @target process.
- *	@target contains the task_struct structure for target process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 if the capability sets were successfully obtained.
- * @capset:
- *	Set the @effective, @inheritable, and @permitted capability sets for
- *	the current process.
- *	@new contains the new credentials structure for target process.
- *	@old contains the current credentials structure for target process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 and update @new if permission is granted.
- * @capable:
- *	Check whether the @tsk process has the @cap capability in the indicated
- *	credentials.
- *	@cred contains the credentials to use.
- *	@ns contains the user namespace we want the capability in.
- *	@cap contains the capability <include/linux/capability.h>.
- *	@opts contains options for the capable check <include/linux/security.h>.
- *	Return 0 if the capability is granted for @tsk.
- * @quotactl:
- *	Check whether the quotactl syscall is allowed for this @sb.
- *	Return 0 if permission is granted.
- * @quota_on:
- *	Check whether QUOTAON is allowed for this @dentry.
- *	Return 0 if permission is granted.
- * @syslog:
- *	Check permission before accessing the kernel message ring or changing
- *	logging to the console.
- *	See the syslog(2) manual page for an explanation of the @type values.
- *	@type contains the SYSLOG_ACTION_* constant from
- *	<include/linux/syslog.h>.
- *	Return 0 if permission is granted.
- * @settime:
- *	Check permission to change the system time.
- *	struct timespec64 is defined in <include/linux/time64.h> and timezone
- *	is defined in <include/linux/time.h>
- *	@ts contains new time.
- *	@tz contains new timezone.
- *	Return 0 if permission is granted.
- * @vm_enough_memory:
- *	Check permissions for allocating a new virtual mapping.
- *	@mm contains the mm struct it is being added to.
- *	@pages contains the number of pages.
- *	Return 0 if permission is granted by the LSM infrastructure to the
- *	caller. If all LSMs return a positive value, __vm_enough_memory() will
- *	be called with cap_sys_admin set. If at least one LSM returns 0 or
- *	negative, __vm_enough_memory() will be called with cap_sys_admin
- *	cleared.
- *
- * @ismaclabel:
- *	Check if the extended attribute specified by @name
- *	represents a MAC label. Returns 1 if name is a MAC
- *	attribute otherwise returns 0.
- *	@name full extended attribute name to check against
- *	LSM as a MAC label.
- *
- * @secid_to_secctx:
- *	Convert secid to security context.  If secdata is NULL the length of
- *	the result will be returned in seclen, but no secdata will be returned.
- *	This does mean that the length could change between calls to check the
- *	length and the next call which actually allocates and returns the
- *	secdata.
- *	@secid contains the security ID.
- *	@secdata contains the pointer that stores the converted security
- *	context.
- *	@seclen pointer which contains the length of the data.
- *	Return 0 on success, error on failure.
- * @secctx_to_secid:
- *	Convert security context to secid.
- *	@secid contains the pointer to the generated security ID.
- *	@secdata contains the security context.
- *	Return 0 on success, error on failure.
- *
- * @release_secctx:
- *	Release the security context.
- *	@secdata contains the security context.
- *	@seclen contains the length of the security context.
- *
- * @inode_invalidate_secctx:
- *	Notify the security module that it must revalidate the security context
- *	of an inode.
- *
- * @inode_notifysecctx:
- *	Notify the security module of what the security context of an inode
- *	should be.  Initializes the incore security context managed by the
- *	security module for this inode.  Example usage:  NFS client invokes
- *	this hook to initialize the security context in its incore inode to the
- *	value provided by the server for the file when the server returned the
- *	file's attributes to the client.
- *	Must be called with inode->i_mutex locked.
- *	@inode we wish to set the security context of.
- *	@ctx contains the string which we wish to set in the inode.
- *	@ctxlen contains the length of @ctx.
- *	Return 0 on success, error on failure.
- *
- * @inode_setsecctx:
- *	Change the security context of an inode.  Updates the
- *	incore security context managed by the security module and invokes the
- *	fs code as needed (via __vfs_setxattr_noperm) to update any backing
- *	xattrs that represent the context.  Example usage:  NFS server invokes
- *	this hook to change the security context in its incore inode and on the
- *	backing filesystem to a value provided by the client on a SETATTR
- *	operation.
- *	Must be called with inode->i_mutex locked.
- *	@dentry contains the inode we wish to set the security context of.
- *	@ctx contains the string which we wish to set in the inode.
- *	@ctxlen contains the length of @ctx.
- *	Return 0 on success, error on failure.
- *
- * @inode_getsecctx:
- *	On success, returns 0 and fills out @ctx and @ctxlen with the security
- *	context for the given @inode.
- *	@inode we wish to get the security context of.
- *	@ctx is a pointer in which to place the allocated security context.
- *	@ctxlen points to the place to put the length of @ctx.
- *	Return 0 on success, error on failure.
- *
- * Security hooks for the general notification queue:
- *
- * @post_notification:
- *	Check to see if a watch notification can be posted to a particular
- *	queue.
- *	@w_cred: The credentials of the whoever set the watch.
- *	@cred: The event-triggerer's credentials.
- *	@n: The notification being posted.
- *	Return 0 if permission is granted.
- *
- * @watch_key:
- *	Check to see if a process is allowed to watch for event notifications
- *	from a key or keyring.
- *	@key: The key to watch.
- *	Return 0 if permission is granted.
- *
- * @locked_down:
- *	Determine whether a kernel feature that potentially enables arbitrary
- *	code execution in kernel space should be permitted.
- *	@what: kernel feature being accessed.
- *	Return 0 if permission is granted.
- *
- */
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
 	#include "lsm_hook_defs.h"
diff --git a/security/security.c b/security/security.c
index af1db3fa7cd0..190a701fe444 100644
--- a/security/security.c
+++ b/security/security.c
@@ -838,16 +838,54 @@ int security_binder_transfer_file(const struct cred *from,
 	return call_int_hook(binder_transfer_file, 0, from, to, file);
 }
 
+/**
+ * security_ptrace_access_check() - Check if tracing is allowed
+ * @child: target process
+ * @mode: PTRACE_MODE flags
+ *
+ * Check permission before allowing the current process to trace the @child
+ * process.  Security modules may also want to perform a process tracing check
+ * during an execve in the set_security or apply_creds hooks of tracing check
+ * during an execve in the bprm_set_creds hook of binprm_security_ops if the
+ * process is being traced and its security attributes would be changed by the
+ * execve.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
 {
 	return call_int_hook(ptrace_access_check, 0, child, mode);
 }
 
+/**
+ * security_ptrace_traceme() - Check if tracing is allowed
+ * @parent: tracing process
+ *
+ * Check that the @parent process has sufficient permission to trace the
+ * current process before allowing the current process to present itself to the
+ * @parent process for tracing.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_ptrace_traceme(struct task_struct *parent)
 {
 	return call_int_hook(ptrace_traceme, 0, parent);
 }
 
+/**
+ * security_capget() - Get the capability sets for a process
+ * @target: target process
+ * @effective: effective capability set
+ * @inheritable: inheritable capability set
+ * @permitted: permitted capability set
+ *
+ * Get the @effective, @inheritable, and @permitted capability sets for the
+ * @target process.  The hook may also perform permission checking to determine
+ * if the current process is allowed to see the capability sets of the @target
+ * process.
+ *
+ * Return: Returns 0 if the capability sets were successfully obtained.
+ */
 int security_capget(struct task_struct *target,
 		     kernel_cap_t *effective,
 		     kernel_cap_t *inheritable,
@@ -857,6 +895,19 @@ int security_capget(struct task_struct *target,
 				effective, inheritable, permitted);
 }
 
+/**
+ * security_capset() - Set the capability sets for a process
+ * @new: new credentials for the target process
+ * @old: current credentials of the target process
+ * @effective: effective capability set
+ * @inheritable: inheritable capability set
+ * @permitted: permitted capability set
+ *
+ * Set the @effective, @inheritable, and @permitted capability sets for the
+ * current process.
+ *
+ * Return: Returns 0 and update @new if permission is granted.
+ */
 int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
@@ -866,6 +917,19 @@ int security_capset(struct cred *new, const struct cred *old,
 				effective, inheritable, permitted);
 }
 
+/**
+ * security_capable() - Check if a process has the necessary capability
+ * @cred: credentials to examine
+ * @ns: user namespace
+ * @cap: capability requested
+ * @opts: capability check options
+ *
+ * Check whether the @tsk process has the @cap capability in the indicated
+ * credentials.  @cap contains the capability <include/linux/capability.h>.
+ * @opts contains options for the capable check <include/linux/security.h>.
+ *
+ * Return: Returns 0 if the capability is granted.
+ */
 int security_capable(const struct cred *cred,
 		     struct user_namespace *ns,
 		     int cap,
@@ -874,26 +938,78 @@ int security_capable(const struct cred *cred,
 	return call_int_hook(capable, 0, cred, ns, cap, opts);
 }
 
+/**
+ * security_quotactl() - Check if a quotactl() syscall is allowed for this fs
+ * @cmds: commands
+ * @type: type
+ * @id: id
+ * @sb: filesystem
+ *
+ * Check whether the quotactl syscall is allowed for this @sb.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_quotactl(int cmds, int type, int id, struct super_block *sb)
 {
 	return call_int_hook(quotactl, 0, cmds, type, id, sb);
 }
 
+/**
+ * security_quota_on() - Check if QUOTAON is allowed for a dentry
+ * @dentry: dentry
+ *
+ * Check whether QUOTAON is allowed for @dentry.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_quota_on(struct dentry *dentry)
 {
 	return call_int_hook(quota_on, 0, dentry);
 }
 
+/**
+ * security_syslog() - Check if accessing the kernel message ring is allowed
+ * @type: SYSLOG_ACTION_* type
+ *
+ * Check permission before accessing the kernel message ring or changing
+ * logging to the console.  See the syslog(2) manual page for an explanation of
+ * the @type values.
+ *
+ * Return: Return 0 if permission is granted.
+ */
 int security_syslog(int type)
 {
 	return call_int_hook(syslog, 0, type);
 }
 
+/**
+ * security_settime64() - Check if changing the system time is allowed
+ * @ts: new time
+ * @tz: timezone
+ *
+ * Check permission to change the system time, struct timespec64 is defined in
+ * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
 {
 	return call_int_hook(settime, 0, ts, tz);
 }
 
+/**
+ * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed
+ * @mm: mm struct
+ * @pages: number of pages
+ *
+ * Check permissions for allocating a new virtual mapping.  If all LSMs return
+ * a positive value, __vm_enough_memory() will be called with cap_sys_admin
+ * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be
+ * called with cap_sys_admin cleared.
+ *
+ * Return: Returns 0 if permission is granted by the LSM infrastructure to the
+ *         caller.
+ */
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
 	struct security_hook_list *hp;
@@ -3703,12 +3819,33 @@ int security_netlink_send(struct sock *sk, struct sk_buff *skb)
 	return call_int_hook(netlink_send, 0, sk, skb);
 }
 
+/**
+ * security_ismaclabel() - Check is the named attribute is a MAC label
+ * @name: full extended attribute name
+ *
+ * Check if the extended attribute specified by @name represents a MAC label.
+ *
+ * Return: Returns 1 if name is a MAC attribute otherwise returns 0.
+ */
 int security_ismaclabel(const char *name)
 {
 	return call_int_hook(ismaclabel, 0, name);
 }
 EXPORT_SYMBOL(security_ismaclabel);
 
+/**
+ * security_secid_to_secctx() - Convert a secid to a secctx
+ * @secid: secid
+ * @secdata: secctx
+ * @seclen: secctx length
+ *
+ * Convert secid to security context.  If @secdata is NULL the length of the
+ * result will be returned in @seclen, but no @secdata will be returned.  This
+ * does mean that the length could change between calls to check the length and
+ * the next call which actually allocates and returns the @secdata.
+ *
+ * Return: Return 0 on success, error on failure.
+ */
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
 	struct security_hook_list *hp;
@@ -3728,6 +3865,16 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 }
 EXPORT_SYMBOL(security_secid_to_secctx);
 
+/**
+ * security_secctx_to_secid() - Convert a secctx to a secid
+ * @secdata: secctx
+ * @seclen: length of secctx
+ * @secid: secid
+ *
+ * Convert security context to secid.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 {
 	*secid = 0;
@@ -3735,30 +3882,86 @@ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 }
 EXPORT_SYMBOL(security_secctx_to_secid);
 
+/**
+ * security_release_secctx() - Free a secctx buffer
+ * @secdata: secctx
+ * @seclen: length of secctx
+ *
+ * Release the security context.
+ */
 void security_release_secctx(char *secdata, u32 seclen)
 {
 	call_void_hook(release_secctx, secdata, seclen);
 }
 EXPORT_SYMBOL(security_release_secctx);
 
+/**
+ * security_inode_invalidate_secctx() - Invalidate an inode's security label
+ * @inode: inode
+ *
+ * Notify the security module that it must revalidate the security context of
+ * an inode.
+ */
 void security_inode_invalidate_secctx(struct inode *inode)
 {
 	call_void_hook(inode_invalidate_secctx, inode);
 }
 EXPORT_SYMBOL(security_inode_invalidate_secctx);
 
+/**
+ * security_inode_notifysecctx() - Nofify the LSM of an inode's security label
+ * @inode: inode
+ * @ctx: secctx
+ * @ctxlen: length of secctx
+ *
+ * Notify the security module of what the security context of an inode should
+ * be.  Initializes the incore security context managed by the security module
+ * for this inode.  Example usage: NFS client invokes this hook to initialize
+ * the security context in its incore inode to the value provided by the server
+ * for the file when the server returned the file's attributes to the client.
+ * Must be called with inode->i_mutex locked.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
 {
 	return call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen);
 }
 EXPORT_SYMBOL(security_inode_notifysecctx);
 
+/**
+ * security_inode_setsecctx() - Change the security label of an inode
+ * @dentry: inode
+ * @ctx: secctx
+ * @ctxlen: length of secctx
+ *
+ * Change the security context of an inode.  Updates the incore security
+ * context managed by the security module and invokes the fs code as needed
+ * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the
+ * context.  Example usage: NFS server invokes this hook to change the security
+ * context in its incore inode and on the backing filesystem to a value
+ * provided by the client on a SETATTR operation.  Must be called with
+ * inode->i_mutex locked.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 {
 	return call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen);
 }
 EXPORT_SYMBOL(security_inode_setsecctx);
 
+/**
+ * security_inode_getsecctx() - Get the security label of an inode
+ * @inode: inode
+ * @ctx: secctx
+ * @ctxlen: length of secctx
+ *
+ * On success, returns 0 and fills out @ctx and @ctxlen with the security
+ * context for the given @inode.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 {
 	return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen);
@@ -3766,6 +3969,16 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 EXPORT_SYMBOL(security_inode_getsecctx);
 
 #ifdef CONFIG_WATCH_QUEUE
+/**
+ * security_post_notification() - Check if a watch notification can be posted
+ * @w_cred: credentials of the task that set the watch
+ * @cred: credentials of the task which triggered the watch
+ * @n: the notification
+ *
+ * Check to see if a watch notification can be posted to a particular queue.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_post_notification(const struct cred *w_cred,
 			       const struct cred *cred,
 			       struct watch_notification *n)
@@ -3775,6 +3988,15 @@ int security_post_notification(const struct cred *w_cred,
 #endif /* CONFIG_WATCH_QUEUE */
 
 #ifdef CONFIG_KEY_NOTIFICATIONS
+/**
+ * security_watch_key() - Check if a task is allowed to watch for key events
+ * @key: the key to watch
+ *
+ * Check to see if a process is allowed to watch for event notifications from
+ * a key or keyring.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_watch_key(struct key *key)
 {
 	return call_int_hook(watch_key, 0, key);
@@ -4920,6 +5142,15 @@ void security_bpf_prog_free(struct bpf_prog_aux *aux)
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+/**
+ * security_locked_down() - Check if a kernel feature is allowed
+ * @what: requested kernel feature
+ *
+ * Determine whether a kernel feature that potentially enables arbitrary code
+ * execution in kernel space should be permitted.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
 int security_locked_down(enum lockdown_reason what)
 {
 	return call_int_hook(locked_down, 0, what);
-- 
cgit v1.2.3


From 01c7a5978993209f47ecfd95dcbd52f6c6672384 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 6 Mar 2023 15:48:11 -0500
Subject: fs: dlm: remove deprecated code parts

This patch removes code parts which was declared deprecated by
commit 6b0afc0cc3e9 ("fs: dlm: don't use deprecated timeout features by
default"). This contains the following dlm functionality:

- start a cancel of a dlm request did not complete after certain timeout:
  The current way how dlm cancellation works and interfering with other
  dlm requests triggered by the user can end in an overlapping and
  returning in -EBUSY. The most user don't handle this case and are
  unaware that DLM can return such errno in such situation. Due the
  timeout the user are mostly unaware when this happens.
- start a netlink warning messages for user space if dlm requests did
  not complete after certain timeout:
  This feature was never being built in the only known dlm user space side.
  As we are to remove the timeout cancellation feature we can directly
  remove this feature as well.

There might be the possibility to bring the timeout cancellation feature
back. However the current way of handling the -EBUSY case which is only
a software limitation and not a hardware limitation should be changed.
We minimize the current code base in DLM cancellation feature to not have
to deal with those existing features while solving the DLM cancellation
feature in general.

UAPI define DLM_LSFL_TIMEWARN is commented as deprecated and reserved
value. We should avoid at first to give it a new meaning but let
possible users still compile by keeping this define. In far future we
can give this flag a new meaning. The same for the DLM_LKF_TIMEOUT lock
request flag.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/Kconfig                    |   9 --
 fs/dlm/Makefile                   |   1 -
 fs/dlm/config.c                   |  21 ----
 fs/dlm/config.h                   |   3 -
 fs/dlm/dlm_internal.h             |  29 ------
 fs/dlm/lock.c                     | 195 --------------------------------------
 fs/dlm/lock.h                     |  17 ----
 fs/dlm/lockspace.c                |  23 -----
 fs/dlm/main.c                     |   9 +-
 fs/dlm/netlink.c                  | 139 ---------------------------
 fs/dlm/recoverd.c                 |   2 -
 fs/dlm/user.c                     |  22 -----
 include/linux/dlm.h               |   3 -
 include/uapi/linux/dlm.h          |   1 +
 include/uapi/linux/dlm_netlink.h  |  60 ------------
 include/uapi/linux/dlmconstants.h |   5 +-
 16 files changed, 6 insertions(+), 533 deletions(-)
 delete mode 100644 fs/dlm/netlink.c
 delete mode 100644 include/uapi/linux/dlm_netlink.h

(limited to 'include')

diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index b3b86dbdc187..f82a4952769d 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -8,15 +8,6 @@ menuconfig DLM
 	A general purpose distributed lock manager for kernel or userspace
 	applications.
 
-config DLM_DEPRECATED_API
-	bool "DLM deprecated API"
-	depends on DLM
-	help
-	Enables deprecated DLM timeout features that will be removed in
-        later Linux kernel releases.
-
-	If you are unsure, say N.
-
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 71dab733cf9a..5a471af1d1fe 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -17,6 +17,5 @@ dlm-y :=			ast.o \
 				requestqueue.o \
 				user.o \
 				util.o 
-dlm-$(CONFIG_DLM_DEPRECATED_API) +=	netlink.o
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 20b60709eccf..d31319d08581 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -75,9 +75,6 @@ struct dlm_cluster {
 	unsigned int cl_log_info;
 	unsigned int cl_protocol;
 	unsigned int cl_mark;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	unsigned int cl_timewarn_cs;
-#endif
 	unsigned int cl_new_rsb_count;
 	unsigned int cl_recover_callbacks;
 	char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -103,9 +100,6 @@ enum {
 	CLUSTER_ATTR_LOG_INFO,
 	CLUSTER_ATTR_PROTOCOL,
 	CLUSTER_ATTR_MARK,
-#ifdef CONFIG_DLM_DEPRECATED_API
-	CLUSTER_ATTR_TIMEWARN_CS,
-#endif
 	CLUSTER_ATTR_NEW_RSB_COUNT,
 	CLUSTER_ATTR_RECOVER_CALLBACKS,
 	CLUSTER_ATTR_CLUSTER_NAME,
@@ -226,9 +220,6 @@ CLUSTER_ATTR(log_debug, NULL);
 CLUSTER_ATTR(log_info, NULL);
 CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
 CLUSTER_ATTR(mark, NULL);
-#ifdef CONFIG_DLM_DEPRECATED_API
-CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-#endif
 CLUSTER_ATTR(new_rsb_count, NULL);
 CLUSTER_ATTR(recover_callbacks, NULL);
 
@@ -243,9 +234,6 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
 	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
 	[CLUSTER_ATTR_MARK] = &cluster_attr_mark,
-#ifdef CONFIG_DLM_DEPRECATED_API
-	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
-#endif
 	[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
 	[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
 	[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -436,9 +424,6 @@ static struct config_group *make_cluster(struct config_group *g,
 	cl->cl_log_debug = dlm_config.ci_log_debug;
 	cl->cl_log_info = dlm_config.ci_log_info;
 	cl->cl_protocol = dlm_config.ci_protocol;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
-#endif
 	cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
 	cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
 	memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -959,9 +944,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_LOG_INFO           1
 #define DEFAULT_PROTOCOL           DLM_PROTO_TCP
 #define DEFAULT_MARK               0
-#ifdef CONFIG_DLM_DEPRECATED_API
-#define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
-#endif
 #define DEFAULT_NEW_RSB_COUNT    128
 #define DEFAULT_RECOVER_CALLBACKS  0
 #define DEFAULT_CLUSTER_NAME      ""
@@ -977,9 +959,6 @@ struct dlm_config_info dlm_config = {
 	.ci_log_info = DEFAULT_LOG_INFO,
 	.ci_protocol = DEFAULT_PROTOCOL,
 	.ci_mark = DEFAULT_MARK,
-#ifdef CONFIG_DLM_DEPRECATED_API
-	.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
-#endif
 	.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
 	.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
 	.ci_cluster_name = DEFAULT_CLUSTER_NAME
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 55c5f2c13ebd..4c91fcca0fd4 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -37,9 +37,6 @@ struct dlm_config_info {
 	int ci_log_info;
 	int ci_protocol;
 	int ci_mark;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	int ci_timewarn_cs;
-#endif
 	int ci_new_rsb_count;
 	int ci_recover_callbacks;
 	char ci_cluster_name[DLM_LOCKSPACE_LEN];
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 9bf70962bc49..66863dc15b8f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -145,9 +145,6 @@ struct dlm_args {
 	void			(*bastfn) (void *astparam, int mode);
 	int			mode;
 	struct dlm_lksb		*lksb;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	unsigned long		timeout;
-#endif
 };
 
 
@@ -205,10 +202,6 @@ struct dlm_args {
 #define DLM_IFL_OVERLAP_UNLOCK  0x00080000
 #define DLM_IFL_OVERLAP_CANCEL  0x00100000
 #define DLM_IFL_ENDOFLIFE	0x00200000
-#ifdef CONFIG_DLM_DEPRECATED_API
-#define DLM_IFL_WATCH_TIMEWARN	0x00400000
-#define DLM_IFL_TIMEOUT_CANCEL	0x00800000
-#endif
 #define DLM_IFL_DEADLOCK_CANCEL	0x01000000
 #define DLM_IFL_STUB_MS		0x02000000 /* magic number for m_flags */
 
@@ -266,11 +259,6 @@ struct dlm_lkb {
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	ktime_t			lkb_timestamp;
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	struct list_head	lkb_time_list;
-	unsigned long		lkb_timeout_cs;
-#endif
-
 	spinlock_t		lkb_cb_lock;
 	struct work_struct	lkb_cb_work;
 	struct list_head	lkb_cb_list; /* for ls_cb_delay or proc->asts */
@@ -586,11 +574,6 @@ struct dlm_ls {
 	struct mutex		ls_orphans_mutex;
 	struct list_head	ls_orphans;
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	struct mutex		ls_timeout_mutex;
-	struct list_head	ls_timeout;
-#endif
-
 	spinlock_t		ls_new_rsb_spin;
 	int			ls_new_rsb_count;
 	struct list_head	ls_new_rsb;	/* new rsb structs */
@@ -704,9 +687,6 @@ struct dlm_ls {
 #define LSFL_RCOM_READY		5
 #define LSFL_RCOM_WAIT		6
 #define LSFL_UEVENT_WAIT	7
-#ifdef CONFIG_DLM_DEPRECATED_API
-#define LSFL_TIMEWARN		8
-#endif
 #define LSFL_CB_DELAY		9
 #define LSFL_NODIR		10
 
@@ -759,15 +739,6 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
 	return test_bit(LSFL_NODIR, &ls->ls_flags);
 }
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-int dlm_netlink_init(void);
-void dlm_netlink_exit(void);
-void dlm_timeout_warn(struct dlm_lkb *lkb);
-#else
-static inline int dlm_netlink_init(void) { return 0; }
-static inline void dlm_netlink_exit(void) { };
-static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { };
-#endif
 int dlm_plock_init(void);
 void dlm_plock_exit(void);
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e1adfa5aed05..9a7679f16eee 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -89,7 +89,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 				    struct dlm_message *ms);
 static int receive_extralen(struct dlm_message *ms);
 static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
-static void del_timeout(struct dlm_lkb *lkb);
 static void toss_rsb(struct kref *kref);
 
 /*
@@ -292,19 +291,8 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	if (is_master_copy(lkb))
 		return;
 
-	del_timeout(lkb);
-
 	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	/* if the operation was a cancel, then return -DLM_ECANCEL, if a
-	   timeout caused the cancel then return -ETIMEDOUT */
-	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
-		lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
-		rv = -ETIMEDOUT;
-	}
-#endif
-
 	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
 		lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
 		rv = -EDEADLK;
@@ -1215,9 +1203,6 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
 	kref_init(&lkb->lkb_ref);
 	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
 	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
-#ifdef CONFIG_DLM_DEPRECATED_API
-	INIT_LIST_HEAD(&lkb->lkb_time_list);
-#endif
 	INIT_LIST_HEAD(&lkb->lkb_cb_list);
 	INIT_LIST_HEAD(&lkb->lkb_callbacks);
 	spin_lock_init(&lkb->lkb_cb_lock);
@@ -1735,133 +1720,6 @@ void dlm_scan_rsbs(struct dlm_ls *ls)
 	}
 }
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-static void add_timeout(struct dlm_lkb *lkb)
-{
-	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
-
-	if (is_master_copy(lkb))
-		return;
-
-	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
-	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
-		lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
-		goto add_it;
-	}
-	if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
-		goto add_it;
-	return;
-
- add_it:
-	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
-	mutex_lock(&ls->ls_timeout_mutex);
-	hold_lkb(lkb);
-	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
-	mutex_unlock(&ls->ls_timeout_mutex);
-}
-
-static void del_timeout(struct dlm_lkb *lkb)
-{
-	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
-
-	mutex_lock(&ls->ls_timeout_mutex);
-	if (!list_empty(&lkb->lkb_time_list)) {
-		list_del_init(&lkb->lkb_time_list);
-		unhold_lkb(lkb);
-	}
-	mutex_unlock(&ls->ls_timeout_mutex);
-}
-
-/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
-   lkb_lksb_timeout without lock_rsb?  Note: we can't lock timeout_mutex
-   and then lock rsb because of lock ordering in add_timeout.  We may need
-   to specify some special timeout-related bits in the lkb that are just to
-   be accessed under the timeout_mutex. */
-
-void dlm_scan_timeout(struct dlm_ls *ls)
-{
-	struct dlm_rsb *r;
-	struct dlm_lkb *lkb = NULL, *iter;
-	int do_cancel, do_warn;
-	s64 wait_us;
-
-	for (;;) {
-		if (dlm_locking_stopped(ls))
-			break;
-
-		do_cancel = 0;
-		do_warn = 0;
-		mutex_lock(&ls->ls_timeout_mutex);
-		list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
-
-			wait_us = ktime_to_us(ktime_sub(ktime_get(),
-							iter->lkb_timestamp));
-
-			if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
-			    wait_us >= (iter->lkb_timeout_cs * 10000))
-				do_cancel = 1;
-
-			if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
-			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
-				do_warn = 1;
-
-			if (!do_cancel && !do_warn)
-				continue;
-			hold_lkb(iter);
-			lkb = iter;
-			break;
-		}
-		mutex_unlock(&ls->ls_timeout_mutex);
-
-		if (!lkb)
-			break;
-
-		r = lkb->lkb_resource;
-		hold_rsb(r);
-		lock_rsb(r);
-
-		if (do_warn) {
-			/* clear flag so we only warn once */
-			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
-			if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
-				del_timeout(lkb);
-			dlm_timeout_warn(lkb);
-		}
-
-		if (do_cancel) {
-			log_debug(ls, "timeout cancel %x node %d %s",
-				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
-			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
-			lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
-			del_timeout(lkb);
-			_cancel_lock(r, lkb);
-		}
-
-		unlock_rsb(r);
-		unhold_rsb(r);
-		dlm_put_lkb(lkb);
-	}
-}
-
-/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
-   dlm_recoverd before checking/setting ls_recover_begin. */
-
-void dlm_adjust_timeouts(struct dlm_ls *ls)
-{
-	struct dlm_lkb *lkb;
-	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
-
-	ls->ls_recover_begin = 0;
-	mutex_lock(&ls->ls_timeout_mutex);
-	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
-		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
-	mutex_unlock(&ls->ls_timeout_mutex);
-}
-#else
-static void add_timeout(struct dlm_lkb *lkb) { }
-static void del_timeout(struct dlm_lkb *lkb) { }
-#endif
-
 /* lkb is master or local copy */
 
 static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2723,20 +2581,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
 	}
 }
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
-			 int namelen, unsigned long timeout_cs,
-			 void (*ast) (void *astparam),
-			 void *astparam,
-			 void (*bast) (void *astparam, int mode),
-			 struct dlm_args *args)
-#else
 static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
 			 int namelen, void (*ast)(void *astparam),
 			 void *astparam,
 			 void (*bast)(void *astparam, int mode),
 			 struct dlm_args *args)
-#endif
 {
 	int rv = -EINVAL;
 
@@ -2789,9 +2638,6 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
 	args->astfn = ast;
 	args->astparam = astparam;
 	args->bastfn = bast;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	args->timeout = timeout_cs;
-#endif
 	args->mode = mode;
 	args->lksb = lksb;
 	rv = 0;
@@ -2847,9 +2693,6 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_lksb = args->lksb;
 	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
 	lkb->lkb_ownpid = (int) current->pid;
-#ifdef CONFIG_DLM_DEPRECATED_API
-	lkb->lkb_timeout_cs = args->timeout;
-#endif
 	rv = 0;
  out:
 	switch (rv) {
@@ -2934,9 +2777,6 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 		if (is_overlap(lkb))
 			goto out;
 
-		/* don't let scand try to do a cancel */
-		del_timeout(lkb);
-
 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
 			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
 			rv = -EBUSY;
@@ -2975,9 +2815,6 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 		if (is_overlap_unlock(lkb))
 			goto out;
 
-		/* don't let scand try to do a cancel */
-		del_timeout(lkb);
-
 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
 			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
 			rv = -EBUSY;
@@ -3045,7 +2882,6 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
-		add_timeout(lkb);
 		goto out;
 	}
 
@@ -3114,7 +2950,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		add_timeout(lkb);
 		goto out;
 	}
 
@@ -3401,13 +3236,8 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 
 	trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
-			      astarg, bast, &args);
-#else
 	error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
 			      &args);
-#endif
 	if (error)
 		goto out_put;
 
@@ -4454,7 +4284,6 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 			munge_altmode(lkb, ms);
 		if (result) {
 			add_lkb(r, lkb, DLM_LKSTS_WAITING);
-			add_timeout(lkb);
 		} else {
 			grant_lock_pc(r, lkb, ms);
 			queue_cast(r, lkb, 0);
@@ -4541,7 +4370,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 			munge_demoted(lkb);
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		add_timeout(lkb);
 		break;
 
 	case 0:
@@ -5708,14 +5536,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
 	return 0;
 }
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
-		     int mode, uint32_t flags, void *name, unsigned int namelen,
-		     unsigned long timeout_cs)
-#else
 int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		     int mode, uint32_t flags, void *name, unsigned int namelen)
-#endif
 {
 	struct dlm_lkb *lkb;
 	struct dlm_args args;
@@ -5740,13 +5562,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 			goto out_put;
 		}
 	}
-#ifdef CONFIG_DLM_DEPRECATED_API
-	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
-			      fake_astfn, ua, fake_bastfn, &args);
-#else
 	error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
 			      fake_bastfn, &args);
-#endif
 	if (error) {
 		kfree(ua->lksb.sb_lvbptr);
 		ua->lksb.sb_lvbptr = NULL;
@@ -5788,14 +5605,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	return error;
 }
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
-		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
-		     unsigned long timeout_cs)
-#else
 int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
-#endif
 {
 	struct dlm_lkb *lkb;
 	struct dlm_args args;
@@ -5832,13 +5643,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	ua->bastaddr = ua_tmp->bastaddr;
 	ua->user_lksb = ua_tmp->user_lksb;
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
-			      fake_astfn, ua, fake_bastfn, &args);
-#else
 	error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
 			      fake_bastfn, &args);
-#endif
 	if (error)
 		goto out_put;
 
@@ -6155,7 +5961,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 		lkb = del_proc_lock(ls, proc);
 		if (!lkb)
 			break;
-		del_timeout(lkb);
 		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
 			orphan_proc_lock(ls, lkb);
 		else
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 40c76b5544da..aa5ad44d902b 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -25,14 +25,6 @@ void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-void dlm_scan_timeout(struct dlm_ls *ls);
-void dlm_adjust_timeouts(struct dlm_ls *ls);
-#else
-static inline void dlm_scan_timeout(struct dlm_ls *ls) { }
-static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
-#endif
-
 int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
 		      unsigned int flags, int *r_nodeid, int *result);
 
@@ -47,19 +39,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls);
 int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
 int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
-	uint32_t flags, void *name, unsigned int namelen,
-	unsigned long timeout_cs);
-int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
-	int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
-	unsigned long timeout_cs);
-#else
 int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
 	uint32_t flags, void *name, unsigned int namelen);
 int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
-#endif
 int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	int mode, uint32_t flags, void *name, unsigned int namelen,
 	uint32_t *lkid);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 7325acbd1af7..ed785130fb60 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -273,7 +273,6 @@ static int dlm_scand(void *data)
 			if (dlm_lock_recovery_try(ls)) {
 				ls->ls_scan_time = jiffies;
 				dlm_scan_rsbs(ls);
-				dlm_scan_timeout(ls);
 				dlm_unlock_recovery(ls);
 			} else {
 				ls->ls_scan_time += HZ;
@@ -488,28 +487,10 @@ static int new_lockspace(const char *name, const char *cluster,
 		ls->ls_ops_arg = ops_arg;
 	}
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	if (flags & DLM_LSFL_TIMEWARN) {
-		pr_warn_once("===============================================================\n"
-			     "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n"
-			     "         will be removed in v6.2!\n"
-			     "         Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n"
-			     "===============================================================\n");
-
-		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
-	}
-
-	/* ls_exflags are forced to match among nodes, and we don't
-	 * need to require all nodes to have some flags set
-	 */
-	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
-				    DLM_LSFL_NEWEXCL));
-#else
 	/* ls_exflags are forced to match among nodes, and we don't
 	 * need to require all nodes to have some flags set
 	 */
 	ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
-#endif
 
 	size = READ_ONCE(dlm_config.ci_rsbtbl_size);
 	ls->ls_rsbtbl_size = size;
@@ -537,10 +518,6 @@ static int new_lockspace(const char *name, const char *cluster,
 	mutex_init(&ls->ls_waiters_mutex);
 	INIT_LIST_HEAD(&ls->ls_orphans);
 	mutex_init(&ls->ls_orphans_mutex);
-#ifdef CONFIG_DLM_DEPRECATED_API
-	INIT_LIST_HEAD(&ls->ls_timeout);
-	mutex_init(&ls->ls_timeout_mutex);
-#endif
 
 	INIT_LIST_HEAD(&ls->ls_new_rsb);
 	spin_lock_init(&ls->ls_new_rsb_spin);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index a77338be3237..93755f83a30d 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -46,20 +46,14 @@ static int __init init_dlm(void)
 	if (error)
 		goto out_debug;
 
-	error = dlm_netlink_init();
-	if (error)
-		goto out_user;
-
 	error = dlm_plock_init();
 	if (error)
-		goto out_netlink;
+		goto out_user;
 
 	printk("DLM installed\n");
 
 	return 0;
 
- out_netlink:
-	dlm_netlink_exit();
  out_user:
 	dlm_user_exit();
  out_debug:
@@ -77,7 +71,6 @@ static int __init init_dlm(void)
 static void __exit exit_dlm(void)
 {
 	dlm_plock_exit();
-	dlm_netlink_exit();
 	dlm_user_exit();
 	dlm_config_exit();
 	dlm_memory_exit();
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
deleted file mode 100644
index 4de4b8651c6c..000000000000
--- a/fs/dlm/netlink.c
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
- */
-
-#include <net/genetlink.h>
-#include <linux/dlm.h>
-#include <linux/dlm_netlink.h>
-#include <linux/gfp.h>
-
-#include "dlm_internal.h"
-
-static uint32_t dlm_nl_seqnum;
-static uint32_t listener_nlportid;
-
-static struct genl_family family;
-
-static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
-{
-	struct sk_buff *skb;
-	void *data;
-
-	skb = genlmsg_new(size, GFP_NOFS);
-	if (!skb)
-		return -ENOMEM;
-
-	/* add the message headers */
-	data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd);
-	if (!data) {
-		nlmsg_free(skb);
-		return -EINVAL;
-	}
-
-	*skbp = skb;
-	return 0;
-}
-
-static struct dlm_lock_data *mk_data(struct sk_buff *skb)
-{
-	struct nlattr *ret;
-
-	ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data));
-	if (!ret)
-		return NULL;
-	return nla_data(ret);
-}
-
-static int send_data(struct sk_buff *skb)
-{
-	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	void *data = genlmsg_data(genlhdr);
-
-	genlmsg_end(skb, data);
-
-	return genlmsg_unicast(&init_net, skb, listener_nlportid);
-}
-
-static int user_cmd(struct sk_buff *skb, struct genl_info *info)
-{
-	listener_nlportid = info->snd_portid;
-	printk("user_cmd nlpid %u\n", listener_nlportid);
-	return 0;
-}
-
-static const struct genl_small_ops dlm_nl_ops[] = {
-	{
-		.cmd	= DLM_CMD_HELLO,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
-		.doit	= user_cmd,
-	},
-};
-
-static struct genl_family family __ro_after_init = {
-	.name		= DLM_GENL_NAME,
-	.version	= DLM_GENL_VERSION,
-	.small_ops	= dlm_nl_ops,
-	.n_small_ops	= ARRAY_SIZE(dlm_nl_ops),
-	.resv_start_op	= DLM_CMD_HELLO + 1,
-	.module		= THIS_MODULE,
-};
-
-int __init dlm_netlink_init(void)
-{
-	return genl_register_family(&family);
-}
-
-void dlm_netlink_exit(void)
-{
-	genl_unregister_family(&family);
-}
-
-static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
-{
-	struct dlm_rsb *r = lkb->lkb_resource;
-
-	memset(data, 0, sizeof(struct dlm_lock_data));
-
-	data->version = DLM_LOCK_DATA_VERSION;
-	data->nodeid = lkb->lkb_nodeid;
-	data->ownpid = lkb->lkb_ownpid;
-	data->id = lkb->lkb_id;
-	data->remid = lkb->lkb_remid;
-	data->status = lkb->lkb_status;
-	data->grmode = lkb->lkb_grmode;
-	data->rqmode = lkb->lkb_rqmode;
-	if (lkb->lkb_ua)
-		data->xid = lkb->lkb_ua->xid;
-	if (r) {
-		data->lockspace_id = r->res_ls->ls_global_id;
-		data->resource_namelen = r->res_length;
-		memcpy(data->resource_name, r->res_name, r->res_length);
-	}
-}
-
-void dlm_timeout_warn(struct dlm_lkb *lkb)
-{
-	struct sk_buff *send_skb;
-	struct dlm_lock_data *data;
-	size_t size;
-	int rv;
-
-	size = nla_total_size(sizeof(struct dlm_lock_data)) +
-	       nla_total_size(0); /* why this? */
-
-	rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size);
-	if (rv < 0)
-		return;
-
-	data = mk_data(send_skb);
-	if (!data) {
-		nlmsg_free(send_skb);
-		return;
-	}
-
-	fill_data(data, lkb);
-
-	send_data(send_skb);
-}
-
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index e15eb511b04b..19da816cfb09 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -214,8 +214,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	dlm_clear_members_gone(ls);
 
-	dlm_adjust_timeouts(ls);
-
 	dlm_callback_resume(ls);
 
 	error = enable_locking(ls, rv->seq);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 688a480879e4..0951ca5754e2 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -259,14 +259,6 @@ static int device_user_lock(struct dlm_user_proc *proc,
 		goto out;
 	}
 
-#ifdef CONFIG_DLM_DEPRECATED_API
-	if (params->timeout)
-		pr_warn_once("========================================================\n"
-			     "WARNING: the lkb timeout feature is being deprecated and\n"
-			     "         will be removed in v6.2!\n"
-			     "========================================================\n");
-#endif
-
 	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
 	if (!ua)
 		goto out;
@@ -279,16 +271,9 @@ static int device_user_lock(struct dlm_user_proc *proc,
 	ua->xid = params->xid;
 
 	if (params->flags & DLM_LKF_CONVERT) {
-#ifdef CONFIG_DLM_DEPRECATED_API
-		error = dlm_user_convert(ls, ua,
-				         params->mode, params->flags,
-				         params->lkid, params->lvb,
-					 (unsigned long) params->timeout);
-#else
 		error = dlm_user_convert(ls, ua,
 					 params->mode, params->flags,
 					 params->lkid, params->lvb);
-#endif
 	} else if (params->flags & DLM_LKF_ORPHAN) {
 		error = dlm_user_adopt_orphan(ls, ua,
 					 params->mode, params->flags,
@@ -297,16 +282,9 @@ static int device_user_lock(struct dlm_user_proc *proc,
 		if (!error)
 			error = lkid;
 	} else {
-#ifdef CONFIG_DLM_DEPRECATED_API
-		error = dlm_user_request(ls, ua,
-					 params->mode, params->flags,
-					 params->name, params->namelen,
-					 (unsigned long) params->timeout);
-#else
 		error = dlm_user_request(ls, ua,
 					 params->mode, params->flags,
 					 params->name, params->namelen);
-#endif
 		if (!error)
 			error = ua->lksb.sb_lkid;
 	}
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index c6bc2b5ee7e6..c58c4f790c04 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -53,9 +53,6 @@ struct dlm_lockspace_ops {
  *   The dlm should not use a resource directory, but statically assign
  *   resource mastery to nodes based on the name hash that is otherwise
  *   used to select the directory node.  Must be the same on all nodes.
- * DLM_LSFL_TIMEWARN
- *   The dlm should emit netlink messages if locks have been waiting
- *   for a configurable amount of time.  (Unused.)
  * DLM_LSFL_NEWEXCL
  *   dlm_new_lockspace() should return -EEXIST if the lockspace exists.
  *
diff --git a/include/uapi/linux/dlm.h b/include/uapi/linux/dlm.h
index 1923f4f3b05e..e7e905fb0bb2 100644
--- a/include/uapi/linux/dlm.h
+++ b/include/uapi/linux/dlm.h
@@ -68,6 +68,7 @@ struct dlm_lksb {
 
 /* dlm_new_lockspace() flags */
 
+/* DLM_LSFL_TIMEWARN is deprecated and reserved. DO NOT USE! */
 #define DLM_LSFL_TIMEWARN	0x00000002
 #define DLM_LSFL_NEWEXCL     	0x00000008
 
diff --git a/include/uapi/linux/dlm_netlink.h b/include/uapi/linux/dlm_netlink.h
deleted file mode 100644
index 5dc3a67d353d..000000000000
--- a/include/uapi/linux/dlm_netlink.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- */
-
-#ifndef _DLM_NETLINK_H
-#define _DLM_NETLINK_H
-
-#include <linux/types.h>
-#include <linux/dlmconstants.h>
-
-enum {
-	DLM_STATUS_WAITING = 1,
-	DLM_STATUS_GRANTED = 2,
-	DLM_STATUS_CONVERT = 3,
-};
-
-#define DLM_LOCK_DATA_VERSION 1
-
-struct dlm_lock_data {
-	__u16 version;
-	__u32 lockspace_id;
-	int nodeid;
-	int ownpid;
-	__u32 id;
-	__u32 remid;
-	__u64 xid;
-	__s8 status;
-	__s8 grmode;
-	__s8 rqmode;
-	unsigned long timestamp;
-	int resource_namelen;
-	char resource_name[DLM_RESNAME_MAXLEN];
-};
-
-enum {
-	DLM_CMD_UNSPEC = 0,
-	DLM_CMD_HELLO,		/* user->kernel */
-	DLM_CMD_TIMEOUT,	/* kernel->user */
-	__DLM_CMD_MAX,
-};
-
-#define DLM_CMD_MAX (__DLM_CMD_MAX - 1)
-
-enum {
-	DLM_TYPE_UNSPEC = 0,
-	DLM_TYPE_LOCK,
-	__DLM_TYPE_MAX,
-};
-
-#define DLM_TYPE_MAX (__DLM_TYPE_MAX - 1)
-
-#define DLM_GENL_VERSION 0x1
-#define DLM_GENL_NAME "DLM"
-
-#endif /* _DLM_NETLINK_H */
diff --git a/include/uapi/linux/dlmconstants.h b/include/uapi/linux/dlmconstants.h
index a8ae47c32a37..6ca77a6388bc 100644
--- a/include/uapi/linux/dlmconstants.h
+++ b/include/uapi/linux/dlmconstants.h
@@ -87,7 +87,6 @@
  * DLM_LKF_NODLCKWT
  *
  * Do not cancel the lock if it gets into conversion deadlock.
- * Exclude this lock from being monitored due to DLM_LSFL_TIMEWARN.
  *
  * DLM_LKF_NODLCKBLK
  *
@@ -132,6 +131,10 @@
  * Unlock the lock even if it is converting or waiting or has sublocks.
  * Only really for use by the userland device.c code.
  *
+ * DLM_LKF_TIMEOUT
+ *
+ * This value is deprecated and reserved. DO NOT USE!
+ *
  */
 
 #define DLM_LKF_NOQUEUE		0x00000001
-- 
cgit v1.2.3


From a7e7ffacad7b9bbfc188cf20e9e5b04badf0a424 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 6 Mar 2023 15:48:12 -0500
Subject: fs: dlm: rename stub to local message flag

This patch renames DLM_IFL_STUB_MS to DLM_IFL_LOCAL_MS flag. The
DLM_IFL_STUB_MS flag is somewhat misnamed, it means the dlm message is
used for local message transfer only. It is used by recovery to resolve
lock states if a node got fenced.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h      |   8 ++--
 fs/dlm/lock.c              | 106 ++++++++++++++++++++++-----------------------
 fs/dlm/lockspace.c         |   4 +-
 include/trace/events/dlm.h |   2 +-
 4 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 66863dc15b8f..5e886b1f3519 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -203,7 +203,7 @@ struct dlm_args {
 #define DLM_IFL_OVERLAP_CANCEL  0x00100000
 #define DLM_IFL_ENDOFLIFE	0x00200000
 #define DLM_IFL_DEADLOCK_CANCEL	0x01000000
-#define DLM_IFL_STUB_MS		0x02000000 /* magic number for m_flags */
+#define DLM_IFL_LOCAL_MS	0x02000000 /* magic number for m_flags */
 
 #define DLM_IFL_CB_PENDING_BIT	0
 
@@ -593,9 +593,9 @@ struct dlm_ls {
 	int			ls_slots_size;
 	struct dlm_slot		*ls_slots;
 
-	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
-	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
-	struct dlm_message	ls_stub_ms;	/* for faking a reply */
+	struct dlm_rsb		ls_local_rsb;	/* for returning errors */
+	struct dlm_lkb		ls_local_lkb;	/* for returning errors */
+	struct dlm_message	ls_local_ms;	/* for faking a reply */
 
 	struct dentry		*ls_debug_rsb_dentry; /* debugfs */
 	struct dentry		*ls_debug_waiters_dentry; /* debugfs */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9a7679f16eee..b680e1ef0b6c 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1558,7 +1558,7 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 	return error;
 }
 
-/* Handles situations where we might be processing a "fake" or "stub" reply in
+/* Handles situations where we might be processing a "fake" or "local" reply in
    which we can't try to take waiters_mutex again. */
 
 static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
@@ -1566,10 +1566,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error;
 
-	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
+	if (ms->m_flags != cpu_to_le32(DLM_IFL_LOCAL_MS))
 		mutex_lock(&ls->ls_waiters_mutex);
 	error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
-	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
+	if (ms->m_flags != cpu_to_le32(DLM_IFL_LOCAL_MS))
 		mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
@@ -3486,10 +3486,10 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	/* down conversions go without a reply from the master */
 	if (!error && down_conversion(lkb)) {
 		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
-		r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
-		r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
-		r->res_ls->ls_stub_ms.m_result = 0;
-		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
+		r->res_ls->ls_local_ms.m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
+		r->res_ls->ls_local_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+		r->res_ls->ls_local_ms.m_result = 0;
+		__receive_convert_reply(r, lkb, &r->res_ls->ls_local_ms);
 	}
 
 	return error;
@@ -3648,7 +3648,7 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
 			     int ret_nodeid, int rv)
 {
-	struct dlm_rsb *r = &ls->ls_stub_rsb;
+	struct dlm_rsb *r = &ls->ls_local_rsb;
 	struct dlm_message *ms;
 	struct dlm_mhandle *mh;
 	int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
@@ -3681,7 +3681,7 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
-	if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS))
+	if (ms->m_flags == cpu_to_le32(DLM_IFL_LOCAL_MS))
 		return;
 
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
@@ -3768,12 +3768,12 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	return 0;
 }
 
-/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+/* We fill in the local-lkb fields with the info that send_xxxx_reply()
    uses to send a reply and that the remote end uses to process the reply. */
 
-static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms)
 {
-	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+	struct dlm_lkb *lkb = &ls->ls_local_lkb;
 	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
 	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
 }
@@ -3906,8 +3906,8 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 			  le32_to_cpu(ms->m_lkid), from_nodeid, error);
 	}
 
-	setup_stub_lkb(ls, ms);
-	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	setup_local_lkb(ls, ms);
+	send_request_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
 	return error;
 }
 
@@ -3962,8 +3962,8 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 	return 0;
 
  fail:
-	setup_stub_lkb(ls, ms);
-	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	setup_local_lkb(ls, ms);
+	send_convert_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
 	return error;
 }
 
@@ -4014,8 +4014,8 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 	return 0;
 
  fail:
-	setup_stub_lkb(ls, ms);
-	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	setup_local_lkb(ls, ms);
+	send_unlock_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
 	return error;
 }
 
@@ -4050,8 +4050,8 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 	return 0;
 
  fail:
-	setup_stub_lkb(ls, ms);
-	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+	setup_local_lkb(ls, ms);
+	send_cancel_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error);
 	return error;
 }
 
@@ -4403,7 +4403,7 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	if (error)
 		goto out;
 
-	/* stub reply can happen with waiters_mutex held */
+	/* local reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
 		goto out;
@@ -4440,7 +4440,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	if (error)
 		goto out;
 
-	/* stub reply can happen with waiters_mutex held */
+	/* local reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
 		goto out;
@@ -4490,7 +4490,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	if (error)
 		goto out;
 
-	/* stub reply can happen with waiters_mutex held */
+	/* local reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
 		goto out;
@@ -4834,16 +4834,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
 }
 
 static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
-				   struct dlm_message *ms_stub)
+				   struct dlm_message *ms_local)
 {
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
-		memset(ms_stub, 0, sizeof(struct dlm_message));
-		ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
-		ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
-		ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
-		ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-		_receive_convert_reply(lkb, ms_stub);
+		memset(ms_local, 0, sizeof(struct dlm_message));
+		ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
+		ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+		ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
+		ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+		_receive_convert_reply(lkb, ms_local);
 
 		/* Same special case as in receive_rcom_lock_args() */
 		lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4882,12 +4882,12 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
-	struct dlm_message *ms_stub;
-	int wait_type, stub_unlock_result, stub_cancel_result;
+	struct dlm_message *ms_local;
+	int wait_type, local_unlock_result, local_cancel_result;
 	int dir_nodeid;
 
-	ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL);
-	if (!ms_stub)
+	ms_local = kmalloc(sizeof(*ms_local), GFP_KERNEL);
+	if (!ms_local)
 		return;
 
 	mutex_lock(&ls->ls_waiters_mutex);
@@ -4923,8 +4923,8 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			continue;
 
 		wait_type = lkb->lkb_wait_type;
-		stub_unlock_result = -DLM_EUNLOCK;
-		stub_cancel_result = -DLM_ECANCEL;
+		local_unlock_result = -DLM_EUNLOCK;
+		local_cancel_result = -DLM_ECANCEL;
 
 		/* Main reply may have been received leaving a zero wait_type,
 		   but a reply for the overlapping op may not have been
@@ -4935,17 +4935,17 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			if (is_overlap_cancel(lkb)) {
 				wait_type = DLM_MSG_CANCEL;
 				if (lkb->lkb_grmode == DLM_LOCK_IV)
-					stub_cancel_result = 0;
+					local_cancel_result = 0;
 			}
 			if (is_overlap_unlock(lkb)) {
 				wait_type = DLM_MSG_UNLOCK;
 				if (lkb->lkb_grmode == DLM_LOCK_IV)
-					stub_unlock_result = -ENOENT;
+					local_unlock_result = -ENOENT;
 			}
 
 			log_debug(ls, "rwpre overlap %x %x %d %d %d",
 				  lkb->lkb_id, lkb->lkb_flags, wait_type,
-				  stub_cancel_result, stub_unlock_result);
+				  local_cancel_result, local_unlock_result);
 		}
 
 		switch (wait_type) {
@@ -4955,28 +4955,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			break;
 
 		case DLM_MSG_CONVERT:
-			recover_convert_waiter(ls, lkb, ms_stub);
+			recover_convert_waiter(ls, lkb, ms_local);
 			break;
 
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
-			memset(ms_stub, 0, sizeof(struct dlm_message));
-			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
-			ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
-			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result));
-			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-			_receive_unlock_reply(lkb, ms_stub);
+			memset(ms_local, 0, sizeof(struct dlm_message));
+			ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
+			ms_local->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
+			ms_local->m_result = cpu_to_le32(to_dlm_errno(local_unlock_result));
+			ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+			_receive_unlock_reply(lkb, ms_local);
 			dlm_put_lkb(lkb);
 			break;
 
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
-			memset(ms_stub, 0, sizeof(struct dlm_message));
-			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
-			ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
-			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result));
-			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-			_receive_cancel_reply(lkb, ms_stub);
+			memset(ms_local, 0, sizeof(struct dlm_message));
+			ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
+			ms_local->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
+			ms_local->m_result = cpu_to_le32(to_dlm_errno(local_cancel_result));
+			ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+			_receive_cancel_reply(lkb, ms_local);
 			dlm_put_lkb(lkb);
 			break;
 
@@ -4987,7 +4987,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		schedule();
 	}
 	mutex_unlock(&ls->ls_waiters_mutex);
-	kfree(ms_stub);
+	kfree(ms_local);
 }
 
 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index ed785130fb60..de9bb775236d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -529,8 +529,8 @@ static int new_lockspace(const char *name, const char *cluster,
 	ls->ls_total_weight = 0;
 	ls->ls_node_array = NULL;
 
-	memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
-	ls->ls_stub_rsb.res_ls = ls;
+	memset(&ls->ls_local_rsb, 0, sizeof(struct dlm_rsb));
+	ls->ls_local_rsb.res_ls = ls;
 
 	ls->ls_debug_rsb_dentry = NULL;
 	ls->ls_debug_waiters_dentry = NULL;
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 37eb79e29b28..2d3ff2f1a4a6 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -54,7 +54,7 @@
 	{ DLM_IFL_OVERLAP_CANCEL, "OVERLAP_CANCEL" },		\
 	{ DLM_IFL_ENDOFLIFE,	"ENDOFLIFE" },			\
 	{ DLM_IFL_DEADLOCK_CANCEL, "DEADLOCK_CANCEL" },		\
-	{ DLM_IFL_STUB_MS,	"STUB_MS" },			\
+	{ DLM_IFL_LOCAL_MS,	"LOCAL_MS" },			\
 	{ DLM_IFL_USER,		"USER" },			\
 	{ DLM_IFL_ORPHAN,	"ORPHAN" })
 
-- 
cgit v1.2.3


From 9f48eead5ea4c55692dd5628699a0d5715416615 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 6 Mar 2023 15:48:13 -0500
Subject: fs: dlm: remove DLM_IFL_LOCAL_MS flag

The DLM_IFL_LOCAL_MS flag is an internal non shared flag but used in
m_flags of dlm messages. It is not shared because it is only used for
local messaging. Instead using DLM_IFL_LOCAL_MS in dlm messages we pass a
parameter around to signal local messaging or not. This patch is adding
the local parameter to signal local messaging.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h      |  1 -
 fs/dlm/lock.c              | 65 +++++++++++++++++++++++-----------------------
 include/trace/events/dlm.h |  1 -
 3 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5e886b1f3519..3bdb2f4e132e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -203,7 +203,6 @@ struct dlm_args {
 #define DLM_IFL_OVERLAP_CANCEL  0x00100000
 #define DLM_IFL_ENDOFLIFE	0x00200000
 #define DLM_IFL_DEADLOCK_CANCEL	0x01000000
-#define DLM_IFL_LOCAL_MS	0x02000000 /* magic number for m_flags */
 
 #define DLM_IFL_CB_PENDING_BIT	0
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b680e1ef0b6c..74dd297e0857 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -86,7 +86,7 @@ static int send_remove(struct dlm_rsb *r);
 static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
 static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
-				    struct dlm_message *ms);
+				    struct dlm_message *ms, bool local);
 static int receive_extralen(struct dlm_message *ms);
 static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 static void toss_rsb(struct kref *kref);
@@ -1561,15 +1561,16 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 /* Handles situations where we might be processing a "fake" or "local" reply in
    which we can't try to take waiters_mutex again. */
 
-static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
+static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms,
+				  bool local)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error;
 
-	if (ms->m_flags != cpu_to_le32(DLM_IFL_LOCAL_MS))
+	if (!local)
 		mutex_lock(&ls->ls_waiters_mutex);
 	error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
-	if (ms->m_flags != cpu_to_le32(DLM_IFL_LOCAL_MS))
+	if (!local)
 		mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
@@ -3486,10 +3487,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	/* down conversions go without a reply from the master */
 	if (!error && down_conversion(lkb)) {
 		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
-		r->res_ls->ls_local_ms.m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
 		r->res_ls->ls_local_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
 		r->res_ls->ls_local_ms.m_result = 0;
-		__receive_convert_reply(r, lkb, &r->res_ls->ls_local_ms);
+		__receive_convert_reply(r, lkb, &r->res_ls->ls_local_ms, true);
 	}
 
 	return error;
@@ -3679,9 +3679,10 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 			  (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
 }
 
-static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
+				bool local)
 {
-	if (ms->m_flags == cpu_to_le32(DLM_IFL_LOCAL_MS))
+	if (local)
 		return;
 
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
@@ -4074,7 +4075,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
 	if (error)
 		goto out;
 
-	receive_flags_reply(lkb, ms);
+	receive_flags_reply(lkb, ms, false);
 	if (is_altmode(lkb))
 		munge_altmode(lkb, ms);
 	grant_lock_pc(r, lkb, ms);
@@ -4278,7 +4279,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	case -EINPROGRESS:
 	case 0:
 		/* request was queued or granted on remote master */
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, false);
 		lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
 		if (is_altmode(lkb))
 			munge_altmode(lkb, ms);
@@ -4348,7 +4349,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 }
 
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
-				    struct dlm_message *ms)
+				    struct dlm_message *ms, bool local)
 {
 	/* this is the value returned from do_convert() on the master */
 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
@@ -4358,14 +4359,14 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 		break;
 
 	case -EDEADLK:
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, local);
 		revert_lock_pc(r, lkb);
 		queue_cast(r, lkb, -EDEADLK);
 		break;
 
 	case -EINPROGRESS:
 		/* convert was queued on remote master */
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, local);
 		if (is_demoted(lkb))
 			munge_demoted(lkb);
 		del_lkb(r, lkb);
@@ -4374,7 +4375,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 
 	case 0:
 		/* convert was granted on remote master */
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, local);
 		if (is_demoted(lkb))
 			munge_demoted(lkb);
 		grant_lock_pc(r, lkb, ms);
@@ -4391,7 +4392,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	}
 }
 
-static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
+				   bool local)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
 	int error;
@@ -4404,11 +4406,11 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 		goto out;
 
 	/* local reply can happen with waiters_mutex held */
-	error = remove_from_waiters_ms(lkb, ms);
+	error = remove_from_waiters_ms(lkb, ms, local);
 	if (error)
 		goto out;
 
-	__receive_convert_reply(r, lkb, ms);
+	__receive_convert_reply(r, lkb, ms, local);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -4423,12 +4425,13 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	if (error)
 		return error;
 
-	_receive_convert_reply(lkb, ms);
+	_receive_convert_reply(lkb, ms, false);
 	dlm_put_lkb(lkb);
 	return 0;
 }
 
-static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
+				  bool local)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
 	int error;
@@ -4441,7 +4444,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 		goto out;
 
 	/* local reply can happen with waiters_mutex held */
-	error = remove_from_waiters_ms(lkb, ms);
+	error = remove_from_waiters_ms(lkb, ms, local);
 	if (error)
 		goto out;
 
@@ -4449,7 +4452,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
 	case -DLM_EUNLOCK:
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, local);
 		remove_lock_pc(r, lkb);
 		queue_cast(r, lkb, -DLM_EUNLOCK);
 		break;
@@ -4473,12 +4476,13 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	if (error)
 		return error;
 
-	_receive_unlock_reply(lkb, ms);
+	_receive_unlock_reply(lkb, ms, false);
 	dlm_put_lkb(lkb);
 	return 0;
 }
 
-static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
+				  bool local)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
 	int error;
@@ -4491,7 +4495,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 		goto out;
 
 	/* local reply can happen with waiters_mutex held */
-	error = remove_from_waiters_ms(lkb, ms);
+	error = remove_from_waiters_ms(lkb, ms, local);
 	if (error)
 		goto out;
 
@@ -4499,7 +4503,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
 	case -DLM_ECANCEL:
-		receive_flags_reply(lkb, ms);
+		receive_flags_reply(lkb, ms, local);
 		revert_lock_pc(r, lkb);
 		queue_cast(r, lkb, -DLM_ECANCEL);
 		break;
@@ -4524,7 +4528,7 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	if (error)
 		return error;
 
-	_receive_cancel_reply(lkb, ms);
+	_receive_cancel_reply(lkb, ms, false);
 	dlm_put_lkb(lkb);
 	return 0;
 }
@@ -4839,11 +4843,10 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
 		memset(ms_local, 0, sizeof(struct dlm_message));
-		ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
 		ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
 		ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
 		ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-		_receive_convert_reply(lkb, ms_local);
+		_receive_convert_reply(lkb, ms_local, true);
 
 		/* Same special case as in receive_rcom_lock_args() */
 		lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4961,22 +4964,20 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
 			memset(ms_local, 0, sizeof(struct dlm_message));
-			ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
 			ms_local->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
 			ms_local->m_result = cpu_to_le32(to_dlm_errno(local_unlock_result));
 			ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-			_receive_unlock_reply(lkb, ms_local);
+			_receive_unlock_reply(lkb, ms_local, true);
 			dlm_put_lkb(lkb);
 			break;
 
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
 			memset(ms_local, 0, sizeof(struct dlm_message));
-			ms_local->m_flags = cpu_to_le32(DLM_IFL_LOCAL_MS);
 			ms_local->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
 			ms_local->m_result = cpu_to_le32(to_dlm_errno(local_cancel_result));
 			ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
-			_receive_cancel_reply(lkb, ms_local);
+			_receive_cancel_reply(lkb, ms_local, true);
 			dlm_put_lkb(lkb);
 			break;
 
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 2d3ff2f1a4a6..cd00b2d34e33 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -54,7 +54,6 @@
 	{ DLM_IFL_OVERLAP_CANCEL, "OVERLAP_CANCEL" },		\
 	{ DLM_IFL_ENDOFLIFE,	"ENDOFLIFE" },			\
 	{ DLM_IFL_DEADLOCK_CANCEL, "DEADLOCK_CANCEL" },		\
-	{ DLM_IFL_LOCAL_MS,	"LOCAL_MS" },			\
 	{ DLM_IFL_USER,		"USER" },			\
 	{ DLM_IFL_ORPHAN,	"ORPHAN" })
 
-- 
cgit v1.2.3


From 8c11ba64ce577a993701616e335319a0afbbd49a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 6 Mar 2023 15:48:14 -0500
Subject: fs: dlm: store lkb distributed flags into own value

This patch stores lkb distributed flags value in an separate value
instead of sharing internal and distributed flags in lkb->lkb_flags value.
This has the advantage to not mask/write back flag values in
receive_flags() functionality. The dlm debug_fs does not provide the
distributed flags anymore, those can be added in future.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c               |  2 +-
 fs/dlm/debug_fs.c          |  4 ++--
 fs/dlm/dlm_internal.h      | 15 +++++----------
 fs/dlm/lock.c              | 28 +++++++++++++---------------
 fs/dlm/memory.c            |  2 +-
 fs/dlm/rcom.c              |  2 +-
 fs/dlm/user.c              |  6 ++++--
 include/trace/events/dlm.h | 11 ++---------
 8 files changed, 29 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 39805aea3336..28b7b5288bd4 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -139,7 +139,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int rv;
 
-	if (lkb->lkb_flags & DLM_IFL_USER) {
+	if (lkb->lkb_dflags & DLM_DFL_USER) {
 		dlm_user_add_ast(lkb, flags, mode, status, sbflags);
 		return;
 	}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8a0e1b1f74ad..65cfb58eaa6b 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -170,7 +170,7 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	u64 xid = 0;
 	u64 us;
 
-	if (lkb->lkb_flags & DLM_IFL_USER) {
+	if (lkb->lkb_dflags & DLM_DFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
@@ -230,7 +230,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 {
 	u64 xid = 0;
 
-	if (lkb->lkb_flags & DLM_IFL_USER) {
+	if (lkb->lkb_dflags & DLM_DFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 3bdb2f4e132e..748bb483da1c 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -206,16 +206,10 @@ struct dlm_args {
 
 #define DLM_IFL_CB_PENDING_BIT	0
 
-/* least significant 2 bytes are message changed, they are full transmitted
- * but at receive side only the 2 bytes LSB will be set.
- *
- * Even wireshark dlm dissector does only evaluate the lower bytes and note
- * that they may not be used on transceiver side, we assume the higher bytes
- * are for internal use or reserved so long they are not parsed on receiver
- * side.
- */
-#define DLM_IFL_USER		0x00000001
-#define DLM_IFL_ORPHAN		0x00000002
+/* lkb_dflags */
+
+#define DLM_DFL_USER		0x00000001
+#define DLM_DFL_ORPHAN		0x00000002
 
 #define DLM_CB_CAST		0x00000001
 #define DLM_CB_BAST		0x00000002
@@ -240,6 +234,7 @@ struct dlm_lkb {
 	uint32_t		lkb_exflags;	/* external flags from caller */
 	uint32_t		lkb_sbflags;	/* lksb flags */
 	uint32_t		lkb_flags;	/* internal flags */
+	uint32_t		lkb_dflags;	/* distributed flags */
 	unsigned long		lkb_iflags;	/* internal flags */
 	uint32_t		lkb_lvbseq;	/* lvb sequence number */
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 74dd297e0857..21c7abd7ef77 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3675,8 +3675,7 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
-	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
-			  (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
+	lkb->lkb_dflags = le32_to_cpu(ms->m_flags);
 }
 
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
@@ -3686,8 +3685,7 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
 		return;
 
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
-	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
-			 (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
+	lkb->lkb_dflags = le32_to_cpu(ms->m_flags);
 }
 
 static int receive_extralen(struct dlm_message *ms)
@@ -3788,8 +3786,8 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
 	int error = 0;
 
 	/* currently mixing of user/kernel locks are not supported */
-	if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) &&
-	    ~lkb->lkb_flags & DLM_IFL_USER) {
+	if (ms->m_flags & cpu_to_le32(DLM_DFL_USER) &&
+	    ~lkb->lkb_dflags & DLM_DFL_USER) {
 		log_error(lkb->lkb_resource->res_ls,
 			  "got user dlm message for a kernel lock");
 		error = -EINVAL;
@@ -5347,7 +5345,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
 	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
 	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
-	lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF;
+	lkb->lkb_dflags = le32_to_cpu(rl->rl_flags);
 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
 	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
 	lkb->lkb_rqmode = rl->rl_rqmode;
@@ -5573,9 +5571,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	}
 
 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
-	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   When DLM_DFL_USER is set, the dlm knows that this is a userspace
 	   lock and that lkb_astparam is the dlm_user_args structure. */
-	lkb->lkb_flags |= DLM_IFL_USER;
+	lkb->lkb_dflags |= DLM_DFL_USER;
 	error = request_lock(ls, lkb, name, namelen, &args);
 
 	switch (error) {
@@ -5690,7 +5688,7 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 		lkb = iter;
 		list_del_init(&iter->lkb_ownqueue);
-		iter->lkb_flags &= ~DLM_IFL_ORPHAN;
+		iter->lkb_dflags &= ~DLM_DFL_ORPHAN;
 		*lkid = iter->lkb_id;
 		break;
 	}
@@ -5934,7 +5932,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
 	list_del_init(&lkb->lkb_ownqueue);
 
 	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
-		lkb->lkb_flags |= DLM_IFL_ORPHAN;
+		lkb->lkb_dflags |= DLM_DFL_ORPHAN;
 	else
 		lkb->lkb_flags |= DLM_IFL_DEAD;
  out:
@@ -6085,7 +6083,7 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
 
 /* debug functionality */
 int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
-		      int lkb_nodeid, unsigned int lkb_flags, int lkb_status)
+		      int lkb_nodeid, unsigned int lkb_dflags, int lkb_status)
 {
 	struct dlm_lksb *lksb;
 	struct dlm_lkb *lkb;
@@ -6093,7 +6091,7 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
 	int error;
 
 	/* we currently can't set a valid user lock */
-	if (lkb_flags & DLM_IFL_USER)
+	if (lkb_dflags & DLM_DFL_USER)
 		return -EOPNOTSUPP;
 
 	lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
@@ -6106,11 +6104,11 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
 		return error;
 	}
 
-	lkb->lkb_flags = lkb_flags;
+	lkb->lkb_dflags = lkb_dflags;
 	lkb->lkb_nodeid = lkb_nodeid;
 	lkb->lkb_lksb = lksb;
 	/* user specific pointer, just don't have it NULL for kernel locks */
-	if (~lkb_flags & DLM_IFL_USER)
+	if (~lkb_dflags & DLM_DFL_USER)
 		lkb->lkb_astparam = (void *)0xDEADBEEF;
 
 	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index cdbaa452fc05..c1a832e8a72b 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -118,7 +118,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 
 void dlm_free_lkb(struct dlm_lkb *lkb)
 {
-	if (lkb->lkb_flags & DLM_IFL_USER) {
+	if (lkb->lkb_dflags & DLM_DFL_USER) {
 		struct dlm_user_args *ua;
 		ua = lkb->lkb_ua;
 		if (ua) {
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index b76d52e2f6bd..005ee85218c5 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -415,7 +415,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid);
 	rl->rl_lkid = cpu_to_le32(lkb->lkb_id);
 	rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags);
-	rl->rl_flags = cpu_to_le32(lkb->lkb_flags);
+	rl->rl_flags = cpu_to_le32(lkb->lkb_dflags);
 	rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
 	rl->rl_rqmode = lkb->lkb_rqmode;
 	rl->rl_grmode = lkb->lkb_grmode;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 0951ca5754e2..dd4b9c8f226c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -183,7 +183,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 	struct dlm_user_proc *proc;
 	int rv;
 
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
+	if (lkb->lkb_dflags & DLM_DFL_ORPHAN ||
+	    lkb->lkb_flags & DLM_IFL_DEAD)
 		return;
 
 	ls = lkb->lkb_resource->res_ls;
@@ -195,7 +196,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 	   for cases where a completion ast is received for an operation that
 	   began before clear_proc_locks did its cancel/unlock. */
 
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
+	if (lkb->lkb_dflags & DLM_DFL_ORPHAN ||
+	    lkb->lkb_flags & DLM_IFL_DEAD)
 		goto out;
 
 	DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb););
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index cd00b2d34e33..6fa4798e1939 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -47,15 +47,8 @@
 	{ DLM_SBF_ALTMODE,	"ALTMODE" })
 
 #define show_lkb_flags(flags) __print_flags(flags, "|",		\
-	{ DLM_IFL_MSTCPY,	"MSTCPY" },			\
-	{ DLM_IFL_RESEND,	"RESEND" },			\
-	{ DLM_IFL_DEAD,		"DEAD" },			\
-	{ DLM_IFL_OVERLAP_UNLOCK, "OVERLAP_UNLOCK" },		\
-	{ DLM_IFL_OVERLAP_CANCEL, "OVERLAP_CANCEL" },		\
-	{ DLM_IFL_ENDOFLIFE,	"ENDOFLIFE" },			\
-	{ DLM_IFL_DEADLOCK_CANCEL, "DEADLOCK_CANCEL" },		\
-	{ DLM_IFL_USER,		"USER" },			\
-	{ DLM_IFL_ORPHAN,	"ORPHAN" })
+	{ DLM_DFL_USER,		"USER" },			\
+	{ DLM_DFL_ORPHAN,	"ORPHAN" })
 
 #define show_header_cmd(cmd) __print_symbolic(cmd,		\
 	{ DLM_MSG,		"MSG"},				\
-- 
cgit v1.2.3


From 8a39dcd9c32dd3f0d3af745c72834d58a43ed90a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 6 Mar 2023 15:48:15 -0500
Subject: fs: dlm: change dflags to use atomic bits

Currently manipulating lkb_dflags assumes to held the rsb lock assigned
to the lkb. This is held by dlm message processing after certain
time to lookup the right rsb from the received lkb message id. For user
space locks flags, which is currently the only use case for lkb_dflags,
flags are also being set during dlm character device handling without
holding the rsb lock. To minimize the risk that bit operations are
getting corrupted we switch to atomic bit operations. This patch will
also introduce helpers to snapshot atomic bit values in an non atomic
way. There might be still issues with the flag handling e.g. running in
case of manipulating bit ops and snapshot them at the same time, but this
patch minimize them and will start to use atomic bit operations.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c               |  2 +-
 fs/dlm/debug_fs.c          |  4 ++--
 fs/dlm/dlm_internal.h      | 46 +++++++++++++++++++++++++++++++++++++++++++---
 fs/dlm/lock.c              | 26 +++++++++++++-------------
 fs/dlm/memory.c            |  2 +-
 fs/dlm/rcom.c              |  2 +-
 fs/dlm/user.c              |  4 ++--
 include/trace/events/dlm.h |  4 ++--
 8 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 28b7b5288bd4..700ff2e0515a 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -139,7 +139,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int rv;
 
-	if (lkb->lkb_dflags & DLM_DFL_USER) {
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
 		dlm_user_add_ast(lkb, flags, mode, status, sbflags);
 		return;
 	}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 65cfb58eaa6b..370a0c81a0b3 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -170,7 +170,7 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	u64 xid = 0;
 	u64 us;
 
-	if (lkb->lkb_dflags & DLM_DFL_USER) {
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
@@ -230,7 +230,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 {
 	u64 xid = 0;
 
-	if (lkb->lkb_dflags & DLM_DFL_USER) {
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 748bb483da1c..cd4f89217ee1 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -208,8 +208,10 @@ struct dlm_args {
 
 /* lkb_dflags */
 
-#define DLM_DFL_USER		0x00000001
-#define DLM_DFL_ORPHAN		0x00000002
+#define DLM_DFL_USER_BIT	0
+#define __DLM_DFL_MIN_BIT	DLM_DFL_USER_BIT
+#define DLM_DFL_ORPHAN_BIT	1
+#define __DLM_DFL_MAX_BIT	DLM_DFL_ORPHAN_BIT
 
 #define DLM_CB_CAST		0x00000001
 #define DLM_CB_BAST		0x00000002
@@ -234,7 +236,7 @@ struct dlm_lkb {
 	uint32_t		lkb_exflags;	/* external flags from caller */
 	uint32_t		lkb_sbflags;	/* lksb flags */
 	uint32_t		lkb_flags;	/* internal flags */
-	uint32_t		lkb_dflags;	/* distributed flags */
+	unsigned long		lkb_dflags;	/* distributed flags */
 	unsigned long		lkb_iflags;	/* internal flags */
 	uint32_t		lkb_lvbseq;	/* lvb sequence number */
 
@@ -733,6 +735,44 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
 	return test_bit(LSFL_NODIR, &ls->ls_flags);
 }
 
+/* takes a snapshot from dlm atomic flags */
+static inline uint32_t dlm_flags_val(const unsigned long *addr,
+				     uint32_t min, uint32_t max)
+{
+	uint32_t bit = min, val = 0;
+
+	for_each_set_bit_from(bit, addr, max + 1) {
+		val |= BIT(bit);
+	}
+
+	return val;
+}
+
+static inline uint32_t dlm_dflags_val(const struct dlm_lkb *lkb)
+{
+	return dlm_flags_val(&lkb->lkb_dflags, __DLM_DFL_MIN_BIT,
+			     __DLM_DFL_MAX_BIT);
+}
+
+static inline void dlm_set_flags_val(unsigned long *addr, uint32_t val,
+				     uint32_t min, uint32_t max)
+{
+	uint32_t bit;
+
+	for (bit = min; bit < (max + 1); bit++) {
+		if (val & BIT(bit))
+			set_bit(bit, addr);
+		else
+			clear_bit(bit, addr);
+	}
+}
+
+static inline void dlm_set_dflags_val(struct dlm_lkb *lkb, uint32_t val)
+{
+	dlm_set_flags_val(&lkb->lkb_dflags, val, __DLM_DFL_MIN_BIT,
+			  __DLM_DFL_MAX_BIT);
+}
+
 int dlm_plock_init(void);
 void dlm_plock_exit(void);
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 21c7abd7ef77..70737352d595 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3410,7 +3410,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	ms->m_remid    = cpu_to_le32(lkb->lkb_remid);
 	ms->m_exflags  = cpu_to_le32(lkb->lkb_exflags);
 	ms->m_sbflags  = cpu_to_le32(lkb->lkb_sbflags);
-	ms->m_flags    = cpu_to_le32(lkb->lkb_flags);
+	ms->m_flags    = cpu_to_le32(dlm_dflags_val(lkb));
 	ms->m_lvbseq   = cpu_to_le32(lkb->lkb_lvbseq);
 	ms->m_status   = cpu_to_le32(lkb->lkb_status);
 	ms->m_grmode   = cpu_to_le32(lkb->lkb_grmode);
@@ -3675,7 +3675,7 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
-	lkb->lkb_dflags = le32_to_cpu(ms->m_flags);
+	dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
 }
 
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
@@ -3685,7 +3685,7 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
 		return;
 
 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
-	lkb->lkb_dflags = le32_to_cpu(ms->m_flags);
+	dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
 }
 
 static int receive_extralen(struct dlm_message *ms)
@@ -3786,8 +3786,8 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
 	int error = 0;
 
 	/* currently mixing of user/kernel locks are not supported */
-	if (ms->m_flags & cpu_to_le32(DLM_DFL_USER) &&
-	    ~lkb->lkb_dflags & DLM_DFL_USER) {
+	if (ms->m_flags & cpu_to_le32(BIT(DLM_DFL_USER_BIT)) &&
+	    !test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
 		log_error(lkb->lkb_resource->res_ls,
 			  "got user dlm message for a kernel lock");
 		error = -EINVAL;
@@ -5345,7 +5345,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
 	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
 	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
-	lkb->lkb_dflags = le32_to_cpu(rl->rl_flags);
+	dlm_set_dflags_val(lkb, le32_to_cpu(rl->rl_flags));
 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
 	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
 	lkb->lkb_rqmode = rl->rl_rqmode;
@@ -5571,9 +5571,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	}
 
 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
-	   When DLM_DFL_USER is set, the dlm knows that this is a userspace
+	   When DLM_DFL_USER_BIT is set, the dlm knows that this is a userspace
 	   lock and that lkb_astparam is the dlm_user_args structure. */
-	lkb->lkb_dflags |= DLM_DFL_USER;
+	set_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags);
 	error = request_lock(ls, lkb, name, namelen, &args);
 
 	switch (error) {
@@ -5688,7 +5688,7 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 		lkb = iter;
 		list_del_init(&iter->lkb_ownqueue);
-		iter->lkb_dflags &= ~DLM_DFL_ORPHAN;
+		clear_bit(DLM_DFL_ORPHAN_BIT, &iter->lkb_dflags);
 		*lkid = iter->lkb_id;
 		break;
 	}
@@ -5932,7 +5932,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
 	list_del_init(&lkb->lkb_ownqueue);
 
 	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
-		lkb->lkb_dflags |= DLM_DFL_ORPHAN;
+		set_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags);
 	else
 		lkb->lkb_flags |= DLM_IFL_DEAD;
  out:
@@ -6091,7 +6091,7 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
 	int error;
 
 	/* we currently can't set a valid user lock */
-	if (lkb_dflags & DLM_DFL_USER)
+	if (lkb_dflags & BIT(DLM_DFL_USER_BIT))
 		return -EOPNOTSUPP;
 
 	lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
@@ -6104,11 +6104,11 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
 		return error;
 	}
 
-	lkb->lkb_dflags = lkb_dflags;
+	dlm_set_dflags_val(lkb, lkb_dflags);
 	lkb->lkb_nodeid = lkb_nodeid;
 	lkb->lkb_lksb = lksb;
 	/* user specific pointer, just don't have it NULL for kernel locks */
-	if (~lkb_dflags & DLM_DFL_USER)
+	if (~lkb_dflags & BIT(DLM_DFL_USER_BIT))
 		lkb->lkb_astparam = (void *)0xDEADBEEF;
 
 	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1a832e8a72b..64f212a066cf 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -118,7 +118,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 
 void dlm_free_lkb(struct dlm_lkb *lkb)
 {
-	if (lkb->lkb_dflags & DLM_DFL_USER) {
+	if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
 		struct dlm_user_args *ua;
 		ua = lkb->lkb_ua;
 		if (ua) {
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 005ee85218c5..f4afdf892f78 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -415,7 +415,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid);
 	rl->rl_lkid = cpu_to_le32(lkb->lkb_id);
 	rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags);
-	rl->rl_flags = cpu_to_le32(lkb->lkb_dflags);
+	rl->rl_flags = cpu_to_le32(dlm_dflags_val(lkb));
 	rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
 	rl->rl_rqmode = lkb->lkb_rqmode;
 	rl->rl_grmode = lkb->lkb_grmode;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index dd4b9c8f226c..233c88e46710 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -183,7 +183,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 	struct dlm_user_proc *proc;
 	int rv;
 
-	if (lkb->lkb_dflags & DLM_DFL_ORPHAN ||
+	if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) ||
 	    lkb->lkb_flags & DLM_IFL_DEAD)
 		return;
 
@@ -196,7 +196,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 	   for cases where a completion ast is received for an operation that
 	   began before clear_proc_locks did its cancel/unlock. */
 
-	if (lkb->lkb_dflags & DLM_DFL_ORPHAN ||
+	if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) ||
 	    lkb->lkb_flags & DLM_IFL_DEAD)
 		goto out;
 
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 6fa4798e1939..2b09574e1243 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -47,8 +47,8 @@
 	{ DLM_SBF_ALTMODE,	"ALTMODE" })
 
 #define show_lkb_flags(flags) __print_flags(flags, "|",		\
-	{ DLM_DFL_USER,		"USER" },			\
-	{ DLM_DFL_ORPHAN,	"ORPHAN" })
+	{ BIT(DLM_DFL_USER_BIT), "USER" },			\
+	{ BIT(DLM_DFL_ORPHAN_BIT), "ORPHAN" })
 
 #define show_header_cmd(cmd) __print_symbolic(cmd,		\
 	{ DLM_MSG,		"MSG"},				\
-- 
cgit v1.2.3


From 1038b6978b438d954e2e827d4d553bed94cd83e0 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 2 Mar 2023 08:58:30 +0800
Subject: dt-bindings: clock: ast2600: Add top-level I3C clock

The ast2600 hardware has a top-level clock for all i3c controller
peripherals (then gated to each individual controller), so add a
top-level i3c clock line to control this.

This is a partial cherry-pick and rework of ed44b8cdfdb and 1a35eb926d7
from Aspeed's own tree, originally by Dylan Hung
<dylan_hung@aspeedtech.com>.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://lore.kernel.org/r/20230302005834.13171-3-jk@codeconstruct.com.au
Tested-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/ast2600-clock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/ast2600-clock.h b/include/dt-bindings/clock/ast2600-clock.h
index d8b0db2f7a7d..dd1581bfdf58 100644
--- a/include/dt-bindings/clock/ast2600-clock.h
+++ b/include/dt-bindings/clock/ast2600-clock.h
@@ -87,6 +87,7 @@
 #define ASPEED_CLK_MAC2RCLK		68
 #define ASPEED_CLK_MAC3RCLK		69
 #define ASPEED_CLK_MAC4RCLK		70
+#define ASPEED_CLK_I3C			71
 
 /* Only list resets here that are not part of a gate */
 #define ASPEED_RESET_ADC		55
-- 
cgit v1.2.3


From 1f15e0486b6e29d7b327d3150215ddec31fca679 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 2 Mar 2023 08:58:32 +0800
Subject: dt-bindings: clock: ast2600: remove IC36 & I3C7 clock definitions

The current ast2600 clock definitions include entries for i3c6 and i3c7
devices, which don't exist: there are no clock control lines documented
for these, and only i3c devices 0 through 5 are present.

So, remove the definitions for I3C6 and I3C7. Although this is a
potential ABI-breaking change, there are no in-tree users of these, and
any references would be broken anyway, as the hardware doesn't exist.

This is a partial cherry-pick and rework of ed44b8cdfdb and 1a35eb926d7
from Aspeed's own tree, originally by Dylan Hung
<dylan_hung@aspeedtech.com>.

Reviewed-by: Joel Stanley <joel@jms.id.au>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://lore.kernel.org/r/20230302005834.13171-5-jk@codeconstruct.com.au
Tested-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/ast2600-clock.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/ast2600-clock.h b/include/dt-bindings/clock/ast2600-clock.h
index dd1581bfdf58..b4d69103d722 100644
--- a/include/dt-bindings/clock/ast2600-clock.h
+++ b/include/dt-bindings/clock/ast2600-clock.h
@@ -57,8 +57,6 @@
 #define ASPEED_CLK_GATE_I3C3CLK		40
 #define ASPEED_CLK_GATE_I3C4CLK		41
 #define ASPEED_CLK_GATE_I3C5CLK		42
-#define ASPEED_CLK_GATE_I3C6CLK		43
-#define ASPEED_CLK_GATE_I3C7CLK		44
 
 #define ASPEED_CLK_GATE_FSICLK		45
 
-- 
cgit v1.2.3


From ced8a02b3452291171e9988a3c862c56e1628a1d Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Thu, 2 Mar 2023 08:58:34 +0800
Subject: dt-bindings: clock: ast2600: Expand comment on reset definitions

The current "not part of a gate" is a little ambiguous. Expand this a
little to clarify the reference to the paired clock + reset control.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://lore.kernel.org/r/20230302005834.13171-7-jk@codeconstruct.com.au
Reviewed-by: Joel Stanley <joel@jms.id.au>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/ast2600-clock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/ast2600-clock.h b/include/dt-bindings/clock/ast2600-clock.h
index b4d69103d722..e149eee61588 100644
--- a/include/dt-bindings/clock/ast2600-clock.h
+++ b/include/dt-bindings/clock/ast2600-clock.h
@@ -87,7 +87,7 @@
 #define ASPEED_CLK_MAC4RCLK		70
 #define ASPEED_CLK_I3C			71
 
-/* Only list resets here that are not part of a gate */
+/* Only list resets here that are not part of a clock gate + reset pair */
 #define ASPEED_RESET_ADC		55
 #define ASPEED_RESET_JTAG_MASTER2	54
 #define ASPEED_RESET_I3C_DMA		39
-- 
cgit v1.2.3


From 42bf4597b7da8d9e3bc6039260e5437902af925a Mon Sep 17 00:00:00 2001
From: Md Sadre Alam <quic_mdalam@quicinc.com>
Date: Mon, 6 Mar 2023 20:18:10 +0530
Subject: mtd: rawnand: Fix spelling mistake waifunc() -> waitfunc()

There is a spelling mistake in a chip->legacy.waifunc(). Fix it.

Signed-off-by: Md Sadre Alam <quic_mdalam@quicinc.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230306144810.22078-1-quic_mdalam@quicinc.com
---
 include/linux/mtd/rawnand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index f8d4be9c587a..5159d692f9ce 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1075,7 +1075,7 @@ static inline void nand_op_trace(const char *prefix,
  * @exec_op:	 controller specific method to execute NAND operations.
  *		 This method replaces chip->legacy.cmdfunc(),
  *		 chip->legacy.{read,write}_{buf,byte,word}(),
- *		 chip->legacy.dev_ready() and chip->legacy.waifunc().
+ *		 chip->legacy.dev_ready() and chip->legacy.waitfunc().
  * @setup_interface: setup the data interface and timing. If chipnr is set to
  *		     %NAND_DATA_IFACE_CHECK_ONLY this means the configuration
  *		     should not be applied but only checked.
-- 
cgit v1.2.3


From d509c55cda22096a1836e35b03f66e1ef411c0c2 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 1 Mar 2023 12:09:13 +0200
Subject: wifi: nl80211: Update the documentation of
 NL80211_SCAN_FLAG_COLOCATED_6GHZ

Add a detailed description of NL80211_SCAN_FLAG_COLOCATED_6GHZ
flag.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.487ab04feb39.I5129fd61841332474693046241586f057b134c3c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f14621a954e1..c22eeb18b996 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6510,8 +6510,14 @@ enum nl80211_timeout_reason {
  * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with
  *	%NL80211_ATTR_SCAN_FREQ_KHZ. This also means
  *	%NL80211_ATTR_SCAN_FREQUENCIES will not be included.
- * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by
- *	2.4/5 GHz APs
+ * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for collocated APs reported by
+ *	2.4/5 GHz APs. When the flag is set, the scan logic will use the
+ *	information from the RNR element found in beacons/probe responses
+ *	received on the 2.4/5 GHz channels to actively scan only the 6GHz
+ *	channels on which APs are expected to be found. Note that when not set,
+ *	the scan logic would scan all 6GHz channels, but since transmission of
+ *	probe requests on non PSC channels is limited, it is highly likely that
+ *	these channels would passively be scanned.
  */
 enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
-- 
cgit v1.2.3


From 6ff9efcfc2dc256480b252321818e0111b9399a2 Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:20 +0200
Subject: wifi: wireless: cleanup unused function parameters

In the past ftype was used for deciding about 6G DUP beacon, but the
logic was removed and ftype is not needed anymore.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.98d4761b809b.I255f5ecd77cb24fcf2f1641bb5833ea2d121296e@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 +---
 net/wireless/scan.c    | 21 +++++----------------
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f115b2550309..03b911abd772 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6814,13 +6814,11 @@ enum cfg80211_bss_frame_type {
  * @ie: IEs
  * @ielen: length of IEs
  * @band: enum nl80211_band of the channel
- * @ftype: frame type
  *
  * Returns the channel number, or -1 if none could be determined.
  */
 int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
-				    enum nl80211_band band,
-				    enum cfg80211_bss_frame_type ftype);
+				    enum nl80211_band band);
 
 /**
  * cfg80211_inform_bss_data - inform cfg80211 of a new BSS
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index c23709d852bc..a1382255fab3 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1810,8 +1810,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 }
 
 int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
-				    enum nl80211_band band,
-				    enum cfg80211_bss_frame_type ftype)
+				    enum nl80211_band band)
 {
 	const struct element *tmp;
 
@@ -1868,15 +1867,14 @@ EXPORT_SYMBOL(cfg80211_get_ies_channel_number);
 static struct ieee80211_channel *
 cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
 			 struct ieee80211_channel *channel,
-			 enum nl80211_bss_scan_width scan_width,
-			 enum cfg80211_bss_frame_type ftype)
+			 enum nl80211_bss_scan_width scan_width)
 {
 	u32 freq;
 	int channel_number;
 	struct ieee80211_channel *alt_channel;
 
 	channel_number = cfg80211_get_ies_channel_number(ie, ielen,
-							 channel->band, ftype);
+							 channel->band);
 
 	if (channel_number < 0) {
 		/* No channel information in frame payload */
@@ -1954,7 +1952,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 		return NULL;
 
 	channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan,
-					   data->scan_width, ftype);
+					   data->scan_width);
 	if (!channel)
 		return NULL;
 
@@ -2388,7 +2386,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 	size_t ielen, min_hdr_len = offsetof(struct ieee80211_mgmt,
 					     u.probe_resp.variable);
 	int bss_type;
-	enum cfg80211_bss_frame_type ftype;
 
 	BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
 			offsetof(struct ieee80211_mgmt, u.beacon.variable));
@@ -2425,16 +2422,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 			variable = ext->u.s1g_beacon.variable;
 	}
 
-	if (ieee80211_is_beacon(mgmt->frame_control))
-		ftype = CFG80211_BSS_FTYPE_BEACON;
-	else if (ieee80211_is_probe_resp(mgmt->frame_control))
-		ftype = CFG80211_BSS_FTYPE_PRESP;
-	else
-		ftype = CFG80211_BSS_FTYPE_UNKNOWN;
-
 	channel = cfg80211_get_bss_channel(wiphy, variable,
-					   ielen, data->chan, data->scan_width,
-					   ftype);
+					   ielen, data->chan, data->scan_width);
 	if (!channel)
 		return NULL;
 
-- 
cgit v1.2.3


From cbbaf2bb829b6c4ef911d4a725fc9b1fadc1e43f Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 1 Mar 2023 12:09:21 +0200
Subject: wifi: nl80211: add a command to enable/disable HW timestamping

Add a command to enable and disable HW timestamping of TM and FTM
frames. HW timestamping can be enabled for a specific mac address
or for all addresses.

The low level driver will indicate how many peers HW timestamping
can be enabled concurrently, and this information will be passed
to userspace.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.05678d7b1c17.Iccc08869ea8156f1c71a3111a47f86dd56234bd0@changeid
[switch to needing netdev UP, minor edits]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 27 +++++++++++++++++++++++++++
 include/uapi/linux/nl80211.h | 22 ++++++++++++++++++++++
 net/wireless/nl80211.c       | 37 +++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      | 17 +++++++++++++++++
 net/wireless/trace.h         | 25 +++++++++++++++++++++++++
 5 files changed, 128 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 03b911abd772..f0da61c6ec4b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -827,6 +827,18 @@ struct cfg80211_fils_aad {
 	const u8 *anonce;
 };
 
+/**
+ * struct cfg80211_set_hw_timestamp - enable/disable HW timestamping
+ * @macaddr: peer MAC address. NULL to enable/disable HW timestamping for all
+ *	addresses.
+ * @enable: if set, enable HW timestamping for the specified MAC address.
+ *	Otherwise disable HW timestamping for the specified MAC address.
+ */
+struct cfg80211_set_hw_timestamp {
+	const u8 *macaddr;
+	bool enable;
+};
+
 /**
  * cfg80211_get_chandef_type - return old channel type from chandef
  * @chandef: the channel definition
@@ -4330,6 +4342,8 @@ struct mgmt_frame_regs {
  * @add_link_station: Add a link to a station.
  * @mod_link_station: Modify a link of a station.
  * @del_link_station: Remove a link of a station.
+ *
+ * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -4683,6 +4697,8 @@ struct cfg80211_ops {
 				    struct link_station_parameters *params);
 	int	(*del_link_station)(struct wiphy *wiphy, struct net_device *dev,
 				    struct link_station_del_parameters *params);
+	int	(*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev,
+				    struct cfg80211_set_hw_timestamp *hwts);
 };
 
 /*
@@ -5139,6 +5155,8 @@ struct wiphy_iftype_akm_suites {
 	int n_akm_suites;
 };
 
+#define CFG80211_HW_TIMESTAMP_ALL_PEERS	0xffff
+
 /**
  * struct wiphy - wireless hardware description
  * @mtx: mutex for the data (structures) of this device
@@ -5348,6 +5366,13 @@ struct wiphy_iftype_akm_suites {
  *	NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with
  *	legacy userspace and maximum allowed value is
  *	CFG80211_MAX_NUM_AKM_SUITES.
+ *
+ * @hw_timestamp_max_peers: maximum number of peers that the driver supports
+ *	enabling HW timestamping for concurrently. Setting this field to a
+ *	non-zero value indicates that the driver supports HW timestamping.
+ *	A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver
+ *	supports enabling HW timestamping for all peers (i.e. no need to
+ *	specify a mac address).
  */
 struct wiphy {
 	struct mutex mtx;
@@ -5496,6 +5521,8 @@ struct wiphy {
 	u8 ema_max_profile_periodicity;
 	u16 max_num_akm_suites;
 
+	u16 hw_timestamp_max_peers;
+
 	char priv[] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index c22eeb18b996..c8520c150f9c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1299,6 +1299,16 @@
  * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station
  * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station
  *
+ * @NL80211_CMD_SET_HW_TIMESTAMP: Enable/disable HW timestamping of Timing
+ *	measurement and Fine timing measurement frames. If %NL80211_ATTR_MAC
+ *	is included, enable/disable HW timestamping only for frames to/from the
+ *	specified MAC address. Otherwise enable/disable HW timestamping for
+ *	all TM/FTM frames (including ones that were enabled with specific MAC
+ *	address). If %NL80211_ATTR_HW_TIMESTAMP_ENABLED is not included, disable
+ *	HW timestamping.
+ *	The number of peers that HW timestamping can be enabled for concurrently
+ *	is indicated by %NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1550,6 +1560,8 @@ enum nl80211_commands {
 	NL80211_CMD_MODIFY_LINK_STA,
 	NL80211_CMD_REMOVE_LINK_STA,
 
+	NL80211_CMD_SET_HW_TIMESTAMP,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2775,6 +2787,13 @@ enum nl80211_commands {
  *	indicates that the sub-channel is punctured. Higher 16 bits are
  *	reserved.
  *
+ * @NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS: Maximum number of peers that HW
+ *	timestamping can be enabled for concurrently (u16), a wiphy attribute.
+ *	A value of 0xffff indicates setting for all peers (i.e. not specifying
+ *	an address with %NL80211_CMD_SET_HW_TIMESTAMP) is supported.
+ * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should
+ *	be enabled or not (flag attribute).
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3306,6 +3325,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PUNCT_BITMAP,
 
+	NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
+	NL80211_ATTR_HW_TIMESTAMP_ENABLED,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 112b4bb009c8..ab0497efdd37 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -806,6 +806,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
 	[NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_RANGE(NLA_U8, 0, 0xffff),
+
+	[NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 },
+	[NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -2964,6 +2967,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)
 			nla_put_flag(msg, NL80211_ATTR_MLO_SUPPORT);
 
+		if (rdev->wiphy.hw_timestamp_max_peers &&
+		    nla_put_u16(msg, NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
+				rdev->wiphy.hw_timestamp_max_peers))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -16162,6 +16170,29 @@ nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
 	return ret;
 }
 
+static int nl80211_set_hw_timestamp(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_set_hw_timestamp hwts = {};
+
+	if (!rdev->wiphy.hw_timestamp_max_peers)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_MAC] &&
+	    rdev->wiphy.hw_timestamp_max_peers != CFG80211_HW_TIMESTAMP_ALL_PEERS)
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_MAC])
+		hwts.macaddr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	hwts.enable =
+		nla_get_flag(info->attrs[NL80211_ATTR_HW_TIMESTAMP_ENABLED]);
+
+	return rdev_set_hw_timestamp(rdev, dev, &hwts);
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -17336,6 +17367,12 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
 					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
+	{
+		.cmd = NL80211_CMD_SET_HW_TIMESTAMP,
+		.doit = nl80211_set_hw_timestamp,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 13b209a8db28..2e497cf26ef2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1494,4 +1494,21 @@ rdev_del_link_station(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct cfg80211_set_hw_timestamp *hwts)
+{
+	struct wiphy *wiphy = &rdev->wiphy;
+	int ret;
+
+	if (!rdev->ops->set_hw_timestamp)
+		return -EOPNOTSUPP;
+
+	trace_rdev_set_hw_timestamp(wiphy, dev, hwts);
+	ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts);
+	trace_rdev_return_int(wiphy, ret);
+
+	return ret;
+}
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ca7474eec723..f3fcfc4fcce5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3918,6 +3918,31 @@ TRACE_EVENT(rdev_del_link_station,
 		  __entry->link_id)
 );
 
+TRACE_EVENT(rdev_set_hw_timestamp,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_set_hw_timestamp *hwts),
+
+	TP_ARGS(wiphy, netdev, hwts),
+
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		MAC_ENTRY(macaddr)
+		__field(bool, enable)
+	),
+
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		MAC_ASSIGN(macaddr, hwts->macaddr);
+		__entry->enable = hwts->enable;
+	),
+
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr,
+		  __entry->enable)
+);
+
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From 81202305f7c282c356c337dded8472d884acd94b Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 1 Mar 2023 12:09:22 +0200
Subject: wifi: mac80211: add support for set_hw_timestamp command

Support the set_hw_timestamp callback for enabling and disabling HW
timestamping if the low level driver supports it.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.700ded7badde.Ib2f7c228256ce313a04d3d9f9ecc6c7b9aa602bb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 ++++++
 net/mac80211/cfg.c     | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 219fd15893b0..6946c9d95aec 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4230,6 +4230,9 @@ struct ieee80211_prep_tx_info {
  *	Note that a sta can also be inserted or removed with valid links,
  *	i.e. passed to @sta_add/@sta_state with sta->valid_links not zero.
  *	In fact, cannot change from having valid_links and not having them.
+ * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is
+ *	not restored at HW reset by mac80211 so drivers need to take care of
+ *	that.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4589,6 +4592,9 @@ struct ieee80211_ops {
 				struct ieee80211_vif *vif,
 				struct ieee80211_sta *sta,
 				u16 old_links, u16 new_links);
+	int (*set_hw_timestamp)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif,
+				struct cfg80211_set_hw_timestamp *hwts);
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 8eb342300868..7e90f4a81962 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4904,6 +4904,22 @@ ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev,
 	return ret;
 }
 
+static int ieee80211_set_hw_timestamp(struct wiphy *wiphy,
+				      struct net_device *dev,
+				      struct cfg80211_set_hw_timestamp *hwts)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+
+	if (!local->ops->set_hw_timestamp)
+		return -EOPNOTSUPP;
+
+	if (!check_sdata_in_driver(sdata))
+		return -EIO;
+
+	return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts);
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -5014,4 +5030,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.add_link_station = ieee80211_add_link_station,
 	.mod_link_station = ieee80211_mod_link_station,
 	.del_link_station = ieee80211_del_link_station,
+	.set_hw_timestamp = ieee80211_set_hw_timestamp,
 };
-- 
cgit v1.2.3


From 4c532321bf90288dae6b07a3f52279bfde842a80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:23 +0200
Subject: wifi: cfg80211/mac80211: report link ID on control port RX

For control port RX, report the link ID for MLO.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.fe06dfc3791b.Iddcab94789cafe336417be406072ce8a6312fc2d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  5 +++--
 net/mac80211/rx.c      |  2 +-
 net/wireless/nl80211.c | 15 ++++++++++-----
 net/wireless/trace.h   | 11 +++++++----
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f0da61c6ec4b..7cebba1c4135 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8126,6 +8126,7 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
  *	responsible for any cleanup.  The caller must also ensure that
  *	skb->protocol is set appropriately.
  * @unencrypted: Whether the frame was received unencrypted
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
  *
  * This function is used to inform userspace about a received control port
  * frame.  It should only be used if userspace indicated it wants to receive
@@ -8136,8 +8137,8 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
  *
  * Return: %true if the frame was passed to userspace
  */
-bool cfg80211_rx_control_port(struct net_device *dev,
-			      struct sk_buff *skb, bool unencrypted);
+bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
+			      bool unencrypted, int link_id);
 
 /**
  * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 095bcd2552bb..f63ed6b91d9b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2591,7 +2591,7 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
 		struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 		bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED);
 
-		cfg80211_rx_control_port(dev, skb, noencrypt);
+		cfg80211_rx_control_port(dev, skb, noencrypt, rx->link_id);
 		dev_kfree_skb(skb);
 	} else {
 		struct ethhdr *ehdr = (void *)skb_mac_header(skb);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ab0497efdd37..85f714e1af87 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18754,7 +18754,9 @@ EXPORT_SYMBOL(cfg80211_mgmt_tx_status_ext);
 
 static int __nl80211_rx_control_port(struct net_device *dev,
 				     struct sk_buff *skb,
-				     bool unencrypted, gfp_t gfp)
+				     bool unencrypted,
+				     int link_id,
+				     gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -18786,6 +18788,8 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 			      NL80211_ATTR_PAD) ||
 	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
 	    nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
+	    (link_id >= 0 &&
+	     nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) ||
 	    (unencrypted && nla_put_flag(msg,
 					 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
 		goto nla_put_failure;
@@ -18804,13 +18808,14 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 	return -ENOBUFS;
 }
 
-bool cfg80211_rx_control_port(struct net_device *dev,
-			      struct sk_buff *skb, bool unencrypted)
+bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
+			      bool unencrypted, int link_id)
 {
 	int ret;
 
-	trace_cfg80211_rx_control_port(dev, skb, unencrypted);
-	ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC);
+	trace_cfg80211_rx_control_port(dev, skb, unencrypted, link_id);
+	ret = __nl80211_rx_control_port(dev, skb, unencrypted, link_id,
+					GFP_ATOMIC);
 	trace_cfg80211_return_bool(ret == 0);
 	return ret == 0;
 }
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f3fcfc4fcce5..716a1fa70069 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3165,14 +3165,15 @@ TRACE_EVENT(cfg80211_control_port_tx_status,
 
 TRACE_EVENT(cfg80211_rx_control_port,
 	TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
-		 bool unencrypted),
-	TP_ARGS(netdev, skb, unencrypted),
+		 bool unencrypted, int link_id),
+	TP_ARGS(netdev, skb, unencrypted, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(int, len)
 		MAC_ENTRY(from)
 		__field(u16, proto)
 		__field(bool, unencrypted)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
@@ -3180,10 +3181,12 @@ TRACE_EVENT(cfg80211_rx_control_port,
 		MAC_ASSIGN(from, eth_hdr(skb)->h_source);
 		__entry->proto = be16_to_cpu(skb->protocol);
 		__entry->unencrypted = unencrypted;
+		__entry->link_id = link_id;
 	),
-	TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s",
+	TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s, link: %d",
 		  NETDEV_PR_ARG, __entry->len, __entry->from,
-		  __entry->proto, BOOL_TO_STR(__entry->unencrypted))
+		  __entry->proto, BOOL_TO_STR(__entry->unencrypted),
+		  __entry->link_id)
 );
 
 TRACE_EVENT(cfg80211_cqm_rssi_notify,
-- 
cgit v1.2.3


From e1f113cc67870375eae0c7b84c2a40cc6388d903 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:25 +0200
Subject: wifi: mac80211: add pointer from bss_conf to vif

While often not needed, this considerably simplifies going from a link
specific bss_config to the vif. This helps with e.g. creating link
specific debugfs entries inside drivers.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.46f701a10ed5.I20390b2a8165ff222d66585915689206ea93222b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 3 +++
 net/mac80211/link.c    | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6946c9d95aec..2a1874d994e0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -534,6 +534,7 @@ struct ieee80211_fils_discovery {
  * This structure keeps information about a BSS (and an association
  * to that BSS) that can change during the lifetime of the BSS.
  *
+ * @vif: reference to owning VIF
  * @addr: (link) address used locally
  * @link_id: link ID, or 0 for non-MLO
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
@@ -675,6 +676,8 @@ struct ieee80211_fils_discovery {
  *	bandwidth
  */
 struct ieee80211_bss_conf {
+	struct ieee80211_vif *vif;
+
 	const u8 *bssid;
 	unsigned int link_id;
 	u8 addr[ETH_ALEN] __aligned(2);
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 8c8869cc1fb4..e6f9fce1dadb 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -34,6 +34,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 	link->link_id = link_id;
 	link->conf = link_conf;
 	link_conf->link_id = link_id;
+	link_conf->vif = &sdata->vif;
 
 	INIT_WORK(&link->csa_finalize_work,
 		  ieee80211_csa_finalize_work);
-- 
cgit v1.2.3


From 170cd6a66d9a164180eb4dc72d50afa6ce1ce566 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:27 +0200
Subject: wifi: mac80211: add netdev per-link debugfs data and driver hook

This adds the infrastructure to have netdev specific per-link data both
for mac80211 and the driver in debugfs. For the driver, a new callback
is added which is only used if MLO is supported.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.fb4c947e4df8.I69b3516ddf4c8a7501b395f652d6063444ecad63@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h        |  10 ++
 net/mac80211/debugfs_netdev.c | 222 ++++++++++++++++++++++++++++++++----------
 net/mac80211/debugfs_netdev.h |  16 +++
 net/mac80211/driver-ops.c     |  25 ++++-
 net/mac80211/driver-ops.h     |  16 +++
 net/mac80211/ieee80211_i.h    |   4 +
 net/mac80211/link.c           |   4 +
 7 files changed, 242 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 2a1874d994e0..5df9eb828a58 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3844,6 +3844,12 @@ struct ieee80211_prep_tx_info {
  *	the station. See @sta_pre_rcu_remove if needed.
  *	This callback can sleep.
  *
+ * @link_add_debugfs: Drivers can use this callback to add debugfs files
+ *	when a link is added to a mac80211 vif. This callback should be within
+ *	a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep.
+ *	For non-MLO the callback will be called once for the default bss_conf
+ *	with the vif's directory rather than a separate subdirectory.
+ *
  * @sta_add_debugfs: Drivers can use this callback to add debugfs files
  *	when a station is added to mac80211's station list. This callback
  *	should be within a CONFIG_MAC80211_DEBUGFS conditional. This
@@ -4325,6 +4331,10 @@ struct ieee80211_ops {
 	int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			  struct ieee80211_sta *sta);
 #ifdef CONFIG_MAC80211_DEBUGFS
+	void (*link_add_debugfs)(struct ieee80211_hw *hw,
+				 struct ieee80211_vif *vif,
+				 struct ieee80211_bss_conf *link_conf,
+				 struct dentry *dir);
 	void (*sta_add_debugfs)(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct ieee80211_sta *sta,
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 187bb22c0bbd..b0cef37eb394 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -23,16 +23,16 @@
 #include "driver-ops.h"
 
 static ssize_t ieee80211_if_read(
-	struct ieee80211_sub_if_data *sdata,
+	void *data,
 	char __user *userbuf,
 	size_t count, loff_t *ppos,
-	ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
+	ssize_t (*format)(const void *, char *, int))
 {
 	char buf[200];
 	ssize_t ret = -EINVAL;
 
 	read_lock(&dev_base_lock);
-	ret = (*format)(sdata, buf, sizeof(buf));
+	ret = (*format)(data, buf, sizeof(buf));
 	read_unlock(&dev_base_lock);
 
 	if (ret >= 0)
@@ -42,10 +42,10 @@ static ssize_t ieee80211_if_read(
 }
 
 static ssize_t ieee80211_if_write(
-	struct ieee80211_sub_if_data *sdata,
+	void *data,
 	const char __user *userbuf,
 	size_t count, loff_t *ppos,
-	ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int))
+	ssize_t (*write)(void *, const char *, int))
 {
 	char buf[64];
 	ssize_t ret;
@@ -58,64 +58,64 @@ static ssize_t ieee80211_if_write(
 	buf[count] = '\0';
 
 	rtnl_lock();
-	ret = (*write)(sdata, buf, count);
+	ret = (*write)(data, buf, count);
 	rtnl_unlock();
 
 	return ret;
 }
 
-#define IEEE80211_IF_FMT(name, field, format_string)			\
+#define IEEE80211_IF_FMT(name, type, field, format_string)		\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	const type *data, char *buf,					\
 	int buflen)							\
 {									\
-	return scnprintf(buf, buflen, format_string, sdata->field);	\
+	return scnprintf(buf, buflen, format_string, data->field);	\
 }
-#define IEEE80211_IF_FMT_DEC(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%d\n")
-#define IEEE80211_IF_FMT_HEX(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%#x\n")
-#define IEEE80211_IF_FMT_LHEX(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%#lx\n")
+#define IEEE80211_IF_FMT_DEC(name, type, field)				\
+		IEEE80211_IF_FMT(name, type, field, "%d\n")
+#define IEEE80211_IF_FMT_HEX(name, type, field)				\
+		IEEE80211_IF_FMT(name, type, field, "%#x\n")
+#define IEEE80211_IF_FMT_LHEX(name, type, field)			\
+		IEEE80211_IF_FMT(name, type, field, "%#lx\n")
 
-#define IEEE80211_IF_FMT_HEXARRAY(name, field)				\
+#define IEEE80211_IF_FMT_HEXARRAY(name, type, field)			\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
 	char *p = buf;							\
 	int i;								\
-	for (i = 0; i < sizeof(sdata->field); i++) {			\
+	for (i = 0; i < sizeof(data->field); i++) {			\
 		p += scnprintf(p, buflen + buf - p, "%.2x ",		\
-				 sdata->field[i]);			\
+				 data->field[i]);			\
 	}								\
 	p += scnprintf(p, buflen + buf - p, "\n");			\
 	return p - buf;							\
 }
 
-#define IEEE80211_IF_FMT_ATOMIC(name, field)				\
+#define IEEE80211_IF_FMT_ATOMIC(name, type, field)			\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
-	return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\
+	return scnprintf(buf, buflen, "%d\n", atomic_read(&data->field));\
 }
 
-#define IEEE80211_IF_FMT_MAC(name, field)				\
+#define IEEE80211_IF_FMT_MAC(name, type, field)				\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	const type *data, char *buf,					\
 	int buflen)							\
 {									\
-	return scnprintf(buf, buflen, "%pM\n", sdata->field);		\
+	return scnprintf(buf, buflen, "%pM\n", data->field);		\
 }
 
-#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field)			\
+#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, type, field)		\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
 	return scnprintf(buf, buflen, "%d\n",				\
-			 jiffies_to_msecs(sdata->field));		\
+			 jiffies_to_msecs(data->field));		\
 }
 
 #define _IEEE80211_IF_FILE_OPS(name, _read, _write)			\
@@ -126,43 +126,67 @@ static const struct file_operations name##_ops = {			\
 	.llseek = generic_file_llseek,					\
 }
 
-#define _IEEE80211_IF_FILE_R_FN(name)					\
+#define _IEEE80211_IF_FILE_R_FN(name, type)				\
 static ssize_t ieee80211_if_read_##name(struct file *file,		\
 					char __user *userbuf,		\
 					size_t count, loff_t *ppos)	\
 {									\
+	ssize_t (*fn)(const void *, char *, int) = (void *)		\
+		((ssize_t (*)(const type, char *, int))			\
+		 ieee80211_if_fmt_##name);				\
 	return ieee80211_if_read(file->private_data,			\
-				 userbuf, count, ppos,			\
-				 ieee80211_if_fmt_##name);		\
+				 userbuf, count, ppos, fn);		\
 }
 
-#define _IEEE80211_IF_FILE_W_FN(name)					\
+#define _IEEE80211_IF_FILE_W_FN(name, type)				\
 static ssize_t ieee80211_if_write_##name(struct file *file,		\
 					 const char __user *userbuf,	\
 					 size_t count, loff_t *ppos)	\
 {									\
+	ssize_t (*fn)(void *, const char *, int) = (void *)		\
+		((ssize_t (*)(type, const char *, int))			\
+		 ieee80211_if_parse_##name);				\
 	return ieee80211_if_write(file->private_data, userbuf, count,	\
-				  ppos, ieee80211_if_parse_##name);	\
+				  ppos, fn);				\
 }
 
 #define IEEE80211_IF_FILE_R(name)					\
-	_IEEE80211_IF_FILE_R_FN(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, NULL)
 
 #define IEEE80211_IF_FILE_W(name)					\
-	_IEEE80211_IF_FILE_W_FN(name)					\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, NULL, ieee80211_if_write_##name)
 
 #define IEEE80211_IF_FILE_RW(name)					\
-	_IEEE80211_IF_FILE_R_FN(name)					\
-	_IEEE80211_IF_FILE_W_FN(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_sub_if_data *)	\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name,		\
 			       ieee80211_if_write_##name)
 
 #define IEEE80211_IF_FILE(name, field, format)				\
-	IEEE80211_IF_FMT_##format(name, field)				\
+	IEEE80211_IF_FMT_##format(name, struct ieee80211_sub_if_data, field) \
 	IEEE80211_IF_FILE_R(name)
 
+/* Same but with a link_ prefix in the ops variable name and different type */
+#define IEEE80211_IF_LINK_FILE_R(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name, NULL)
+
+#define IEEE80211_IF_LINK_FILE_W(name)					\
+	_IEEE80211_IF_FILE_W_FN(name)					\
+	_IEEE80211_IF_FILE_OPS(link_##name, NULL, ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE_RW(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name,	\
+			       ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE(name, field, format)				\
+	IEEE80211_IF_FMT_##format(name, struct ieee80211_link_data, field) \
+	IEEE80211_IF_LINK_FILE_R(name)
+
 /* common attributes */
 IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ],
 		  HEX);
@@ -207,9 +231,9 @@ IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz);
 
 IEEE80211_IF_FILE(flags, flags, HEX);
 IEEE80211_IF_FILE(state, state, LHEX);
-IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC);
-IEEE80211_IF_FILE(ap_power_level, deflink.ap_power_level, DEC);
-IEEE80211_IF_FILE(user_power_level, deflink.user_power_level, DEC);
+IEEE80211_IF_LINK_FILE(txpower, conf->txpower, DEC);
+IEEE80211_IF_LINK_FILE(ap_power_level, ap_power_level, DEC);
+IEEE80211_IF_LINK_FILE(user_power_level, user_power_level, DEC);
 
 static ssize_t
 ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata,
@@ -236,9 +260,10 @@ IEEE80211_IF_FILE(bssid, deflink.u.mgd.bssid, MAC);
 IEEE80211_IF_FILE(aid, vif.cfg.aid, DEC);
 IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
 
-static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_set_smps(struct ieee80211_link_data *link,
 			      enum ieee80211_smps_mode smps_mode)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	int err;
 
@@ -256,7 +281,7 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
 		return -EOPNOTSUPP;
 
 	sdata_lock(sdata);
-	err = __ieee80211_request_smps_mgd(sdata, &sdata->deflink, smps_mode);
+	err = __ieee80211_request_smps_mgd(link->sdata, link, smps_mode);
 	sdata_unlock(sdata);
 
 	return err;
@@ -269,24 +294,24 @@ static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
 	[IEEE80211_SMPS_DYNAMIC] = "dynamic",
 };
 
-static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_link_data *link,
 				     char *buf, int buflen)
 {
-	if (sdata->vif.type == NL80211_IFTYPE_STATION)
+	if (link->sdata->vif.type == NL80211_IFTYPE_STATION)
 		return snprintf(buf, buflen, "request: %s\nused: %s\n",
-				smps_modes[sdata->deflink.u.mgd.req_smps],
-				smps_modes[sdata->deflink.smps_mode]);
+				smps_modes[link->u.mgd.req_smps],
+				smps_modes[link->smps_mode]);
 	return -EINVAL;
 }
 
-static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_parse_smps(struct ieee80211_link_data *link,
 				       const char *buf, int buflen)
 {
 	enum ieee80211_smps_mode mode;
 
 	for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) {
 		if (strncmp(buf, smps_modes[mode], buflen) == 0) {
-			int err = ieee80211_set_smps(sdata, mode);
+			int err = ieee80211_set_smps(link, mode);
 			if (!err)
 				return buflen;
 			return err;
@@ -295,7 +320,7 @@ static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
 
 	return -EINVAL;
 }
-IEEE80211_IF_FILE_RW(smps);
+IEEE80211_IF_LINK_FILE_RW(smps);
 
 static ssize_t ieee80211_if_parse_tkip_mic_test(
 	struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
@@ -595,6 +620,8 @@ static ssize_t ieee80211_if_parse_active_links(struct ieee80211_sub_if_data *sda
 }
 IEEE80211_IF_FILE_RW(active_links);
 
+IEEE80211_IF_LINK_FILE(addr, conf->addr, MAC);
+
 #ifdef CONFIG_MAC80211_MESH
 IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC);
 
@@ -685,7 +712,6 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_ADD(bssid);
 	DEBUGFS_ADD(aid);
 	DEBUGFS_ADD(beacon_timeout);
-	DEBUGFS_ADD_MODE(smps, 0600);
 	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
 	DEBUGFS_ADD_MODE(beacon_loss, 0200);
 	DEBUGFS_ADD_MODE(uapsd_queues, 0600);
@@ -788,9 +814,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 
 	DEBUGFS_ADD(flags);
 	DEBUGFS_ADD(state);
-	DEBUGFS_ADD(txpower);
-	DEBUGFS_ADD(user_power_level);
-	DEBUGFS_ADD(ap_power_level);
 
 	if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
 		add_common_files(sdata);
@@ -820,6 +843,31 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 	}
 }
 
+#undef DEBUGFS_ADD_MODE
+#undef DEBUGFS_ADD
+
+#define DEBUGFS_ADD_MODE(dentry, name, mode) \
+	debugfs_create_file(#name, mode, dentry, \
+			    link, &link_##name##_ops)
+
+#define DEBUGFS_ADD(dentry, name) DEBUGFS_ADD_MODE(dentry, name, 0400)
+
+static void add_link_files(struct ieee80211_link_data *link,
+			   struct dentry *dentry)
+{
+	DEBUGFS_ADD(dentry, txpower);
+	DEBUGFS_ADD(dentry, user_power_level);
+	DEBUGFS_ADD(dentry, ap_power_level);
+
+	switch (link->sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		DEBUGFS_ADD_MODE(dentry, smps, 0600);
+		break;
+	default:
+		break;
+	}
+}
+
 void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
 {
 	char buf[10+IFNAMSIZ];
@@ -830,6 +878,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
 	sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
 							sdata->vif.debugfs_dir);
 	add_files(sdata);
+
+	if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+		add_link_files(&sdata->deflink, sdata->vif.debugfs_dir);
 }
 
 void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
@@ -855,3 +906,66 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
 	sprintf(buf, "netdev:%s", sdata->name);
 	debugfs_rename(dir->d_parent, dir, dir->d_parent, buf);
 }
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{
+	char link_dir_name[10];
+
+	if (WARN_ON(!link->sdata->vif.debugfs_dir))
+		return;
+
+	/* For now, this should not be called for non-MLO capable drivers */
+	if (WARN_ON(!(link->sdata->local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)))
+		return;
+
+	snprintf(link_dir_name, sizeof(link_dir_name),
+		 "link-%d", link->link_id);
+
+	link->debugfs_dir =
+		debugfs_create_dir(link_dir_name,
+				   link->sdata->vif.debugfs_dir);
+
+	DEBUGFS_ADD(link->debugfs_dir, addr);
+	add_link_files(link, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{
+	if (!link->sdata->vif.debugfs_dir || !link->debugfs_dir) {
+		link->debugfs_dir = NULL;
+		return;
+	}
+
+	if (link->debugfs_dir == link->sdata->vif.debugfs_dir) {
+		WARN_ON(link != &link->sdata->deflink);
+		link->debugfs_dir = NULL;
+		return;
+	}
+
+	debugfs_remove_recursive(link->debugfs_dir);
+	link->debugfs_dir = NULL;
+}
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{
+	if (WARN_ON(!link->debugfs_dir))
+		return;
+
+	drv_link_add_debugfs(link->sdata->local, link->sdata,
+			     link->conf, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{
+	if (!link || !link->debugfs_dir)
+		return;
+
+	if (WARN_ON(link->debugfs_dir == link->sdata->vif.debugfs_dir))
+		return;
+
+	/* Recreate the directory excluding the driver data */
+	debugfs_remove_recursive(link->debugfs_dir);
+	link->debugfs_dir = NULL;
+
+	ieee80211_link_debugfs_add(link);
+}
diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h
index a7e9d8d518f9..99e688dcabd6 100644
--- a/net/mac80211/debugfs_netdev.h
+++ b/net/mac80211/debugfs_netdev.h
@@ -10,6 +10,12 @@
 void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata);
 void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata);
 void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata);
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link);
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link);
 #else
 static inline void ieee80211_debugfs_add_netdev(
 	struct ieee80211_sub_if_data *sdata)
@@ -20,6 +26,16 @@ static inline void ieee80211_debugfs_remove_netdev(
 static inline void ieee80211_debugfs_rename_netdev(
 	struct ieee80211_sub_if_data *sdata)
 {}
+
+static inline void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{}
+
+static inline void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{}
 #endif
 
 #endif /* __IEEE80211_DEBUGFS_NETDEV_H */
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index cfb09e4aed4d..30cd0c905a24 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -8,6 +8,7 @@
 #include "trace.h"
 #include "driver-ops.h"
 #include "debugfs_sta.h"
+#include "debugfs_netdev.h"
 
 int drv_start(struct ieee80211_local *local)
 {
@@ -477,6 +478,10 @@ int drv_change_vif_links(struct ieee80211_local *local,
 			 u16 old_links, u16 new_links,
 			 struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS])
 {
+	struct ieee80211_link_data *link;
+	unsigned long links_to_add;
+	unsigned long links_to_rem;
+	unsigned int link_id;
 	int ret = -EOPNOTSUPP;
 
 	might_sleep();
@@ -487,13 +492,31 @@ int drv_change_vif_links(struct ieee80211_local *local,
 	if (old_links == new_links)
 		return 0;
 
+	links_to_add = ~old_links & new_links;
+	links_to_rem = old_links & ~new_links;
+
+	for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) {
+		link = rcu_access_pointer(sdata->link[link_id]);
+
+		ieee80211_link_debugfs_drv_remove(link);
+	}
+
 	trace_drv_change_vif_links(local, sdata, old_links, new_links);
 	if (local->ops->change_vif_links)
 		ret = local->ops->change_vif_links(&local->hw, &sdata->vif,
 						   old_links, new_links, old);
 	trace_drv_return_int(local, ret);
 
-	return ret;
+	if (ret)
+		return ret;
+
+	for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) {
+		link = rcu_access_pointer(sdata->link[link_id]);
+
+		ieee80211_link_debugfs_drv_add(link);
+	}
+
+	return 0;
 }
 
 int drv_change_sta_links(struct ieee80211_local *local,
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 5d13a3dfd366..a68d606e6987 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -465,6 +465,22 @@ static inline void drv_sta_remove(struct ieee80211_local *local,
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
+static inline void drv_link_add_debugfs(struct ieee80211_local *local,
+					struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_bss_conf *link_conf,
+					struct dentry *dir)
+{
+	might_sleep();
+
+	sdata = get_bss_sdata(sdata);
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	if (local->ops->link_add_debugfs)
+		local->ops->link_add_debugfs(&local->hw, &sdata->vif,
+					     link_conf, dir);
+}
+
 static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
 				       struct ieee80211_sub_if_data *sdata,
 				       struct ieee80211_sta *sta,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ecc232eb1ee8..3d4edc25a69e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -999,6 +999,10 @@ struct ieee80211_link_data {
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 
 	struct ieee80211_bss_conf *conf;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct dentry *debugfs_dir;
+#endif
 };
 
 struct ieee80211_sub_if_data {
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index e6f9fce1dadb..e82db88a47f8 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -10,6 +10,7 @@
 #include "ieee80211_i.h"
 #include "driver-ops.h"
 #include "key.h"
+#include "debugfs_netdev.h"
 
 void ieee80211_link_setup(struct ieee80211_link_data *link)
 {
@@ -61,6 +62,8 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 		default:
 			WARN_ON(1);
 		}
+
+		ieee80211_link_debugfs_add(link);
 	}
 }
 
@@ -94,6 +97,7 @@ static void ieee80211_tear_down_links(struct ieee80211_sub_if_data *sdata,
 		if (WARN_ON(!link))
 			continue;
 		ieee80211_remove_link_keys(link, &keys);
+		ieee80211_link_debugfs_remove(link);
 		ieee80211_link_stop(link);
 	}
 
-- 
cgit v1.2.3


From 18cbf7c089ba70fefe1b4c01af28753cabfbf38f Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:34 +0200
Subject: wifi: radiotap: Add EHT radiotap definitions

This is based on https://www.radiotap.org/fields/EHT.html and
https://www.radiotap.org/fields/U-SIG.html new EHT TLV
definition for 11be standard.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.254b19fffe41.I4ce78e2c558da6e5a708a8d68d61b5d7b3eb3746@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 187 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 185 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 598f53d2a3a0..0fc2667a9a5d 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2017		Intel Deutschland GmbH
- * Copyright (c) 2018-2019, 2021 Intel Corporation
+ * Copyright (c) 2018-2019, 2021-2022 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -82,11 +82,14 @@ enum ieee80211_radiotap_presence {
 	IEEE80211_RADIOTAP_HE_MU = 24,
 	IEEE80211_RADIOTAP_ZERO_LEN_PSDU = 26,
 	IEEE80211_RADIOTAP_LSIG = 27,
+	IEEE80211_RADIOTAP_TLV = 28,
 
 	/* valid in every it_present bitmap, even vendor namespaces */
 	IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29,
 	IEEE80211_RADIOTAP_VENDOR_NAMESPACE = 30,
-	IEEE80211_RADIOTAP_EXT = 31
+	IEEE80211_RADIOTAP_EXT = 31,
+	IEEE80211_RADIOTAP_EHT_USIG = 33,
+	IEEE80211_RADIOTAP_EHT = 34,
 };
 
 /* for IEEE80211_RADIOTAP_FLAGS */
@@ -360,6 +363,186 @@ enum ieee80211_radiotap_zero_len_psdu_type {
 	IEEE80211_RADIOTAP_ZERO_LEN_PSDU_VENDOR			= 0xff,
 };
 
+struct ieee80211_radiotap_tlv {
+	__le16 type;
+	__le16 len;
+	u8 data[];
+} __packed;
+
+/* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
+ * see www.radiotap.org/fields/U-SIG.html for details
+ */
+struct ieee80211_radiotap_eht_usig {
+	__le32 common;
+	__le32 value;
+	__le32 mask;
+} __packed;
+
+/* ieee80211_radiotap_eht - content of EHT tlv (type 34)
+ * see www.radiotap.org/fields/EHT.html for details
+ */
+struct ieee80211_radiotap_eht {
+	__le32 known;
+	__le32 data[9];
+	__le32 user_info[];
+} __packed;
+
+/* Known field for EHT TLV
+ * The ending defines for what the field applies as following
+ * O - OFDMA (including TB), M - MU-MIMO, S - EHT sounding.
+ */
+enum ieee80211_radiotap_eht_known {
+	IEEE80211_RADIOTAP_EHT_KNOWN_SPATIAL_REUSE		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_KNOWN_GI				= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_KNOWN_EHT_LTF			= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_KNOWN_LDPC_EXTRA_SYM_OM		= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PRE_PADD_FACOR_OM		= 0x00000040,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PE_DISAMBIGUITY_OM		= 0x00000080,
+	IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_O		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_S		= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_KNOWN_CRC1			= 0x00002000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_TAIL1			= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_CRC2_O			= 0x00008000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_TAIL2_O			= 0x00010000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_NSS_S			= 0x00020000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_BEAMFORMED_S		= 0x00040000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_NR_NON_OFDMA_USERS_M	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_CRC_M	= 0x00100000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_TAIL_M	= 0x00200000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_SIZE_OM		= 0x00400000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_INDEX_OM		= 0x00800000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_ALLOC_TB_FMT		= 0x01000000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PRIMARY_80			= 0x02000000,
+};
+
+enum ieee80211_radiotap_eht_data {
+	/* Data 0 */
+	IEEE80211_RADIOTAP_EHT_DATA0_SPATIAL_REUSE		= 0x00000078,
+	IEEE80211_RADIOTAP_EHT_DATA0_GI				= 0x00000180,
+	IEEE80211_RADIOTAP_EHT_DATA0_LTF			= 0x00000600,
+	IEEE80211_RADIOTAP_EHT_DATA0_EHT_LTF			= 0x00003800,
+	IEEE80211_RADIOTAP_EHT_DATA0_LDPC_EXTRA_SYM_OM		= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_DATA0_PRE_PADD_FACOR_OM		= 0x00018000,
+	IEEE80211_RADIOTAP_EHT_DATA0_PE_DISAMBIGUITY_OM		= 0x00020000,
+	IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_S		= 0x000c0000,
+	IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_O		= 0x003c0000,
+	IEEE80211_RADIOTAP_EHT_DATA0_CRC1_O			= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_DATA0_TAIL1_O			= 0xfc000000,
+	/* Data 1 */
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_SIZE			= 0x0000001f,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_INDEX			= 0x00001fe0,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1		= 0x003fe000,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1_KNOWN	= 0x00400000,
+	IEEE80211_RADIOTAP_EHT_DATA1_PRIMARY_80			= 0xc0000000,
+	/* Data 2 */
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2_KNOWN	= 0x20000000,
+	/* Data 3 */
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2_KNOWN	= 0x20000000,
+	/* Data 4 */
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3_KNOWN	= 0x20000000,
+	/* Data 5 */
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5_KNOWN	= 0x20000000,
+	/* Data 6 */
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6_KNOWN	= 0x20000000,
+	/* Data 7 */
+	IEEE80211_RADIOTAP_EHT_DATA7_CRC2_O			= 0x0000000f,
+	IEEE80211_RADIOTAP_EHT_DATA7_TAIL_2_O			= 0x000003f0,
+	IEEE80211_RADIOTAP_EHT_DATA7_NSS_S			= 0x0000f000,
+	IEEE80211_RADIOTAP_EHT_DATA7_BEAMFORMED_S		= 0x00010000,
+	IEEE80211_RADIOTAP_EHT_DATA7_NUM_OF_NON_OFDMA_USERS	= 0x000e0000,
+	IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_CRC	= 0x00f00000,
+	IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_TAIL	= 0x3f000000,
+	/* Data 8 */
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_PS_160	= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B0		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B7_B1	= 0x000001fc,
+};
+
+enum ieee80211_radiotap_eht_user_info {
+	IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID_KNOWN		= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_MCS_KNOWN		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_CODING_KNOWN		= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_KNOWN_O		= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_KNOWN_O	= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_KNOWN_M	= 0x00000040,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_DATA_FOR_USER		= 0x00000080,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID			= 0x0007ff00,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_CODING			= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_MCS			= 0x00f00000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_O			= 0x0f000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_O		= 0x20000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_M	= 0x3f000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_RESEVED_c0000000	= 0xc0000000,
+};
+
+enum ieee80211_radiotap_eht_usig_common {
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER_KNOWN	= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_KNOWN		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL_KNOWN		= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR_KNOWN	= 0x00000008,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP_KNOWN		= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BAD_USIG_CRC		= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER		= 0x00007000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW			= 0x00038000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL		= 0x00040000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR		= 0x01f80000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP			= 0xfe000000,
+};
+
+enum ieee80211_radiotap_eht_usig_mu {
+	/* MU-USIG-1 */
+	IEEE80211_RADIOTAP_EHT_USIG1_MU_B20_B24_DISREGARD	= 0x0000001f,
+	IEEE80211_RADIOTAP_EHT_USIG1_MU_B25_VALIDATE		= 0x00000020,
+	/* MU-USIG-2 */
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B0_B1_PPDU_TYPE		= 0x000000c0,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B2_VALIDATE		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B3_B7_PUNCTURED_INFO	= 0x00003e00,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B8_VALIDATE		= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B9_B10_SIG_MCS		= 0x00018000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B11_B15_EHT_SIG_SYMBOLS = 0x003e0000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B16_B19_CRC		= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B20_B25_TAIL		= 0xfc000000,
+};
+
+enum ieee80211_radiotap_eht_usig_tb {
+	/* TB-USIG-1 */
+	IEEE80211_RADIOTAP_EHT_USIG1_TB_B20_B25_DISREGARD	= 0x0000001f,
+
+	/* TB-USIG-2 */
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B0_B1_PPDU_TYPE		= 0x000000c0,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B2_VALIDATE		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B3_B6_SPATIAL_REUSE_1	= 0x00001e00,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B7_B10_SPATIAL_REUSE_2	= 0x0001e000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B11_B15_DISREGARD	= 0x003e0000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B16_B19_CRC		= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B20_B25_TAIL		= 0xfc000000,
+};
+
 /**
  * ieee80211_get_radiotap_len - get radiotap header length
  */
-- 
cgit v1.2.3


From 9179dff82598ab8b4e88dcc93c9e26a2594efd1a Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:35 +0200
Subject: wifi: mac80211: add support for driver adding radiotap TLVs

The new TLV format enables adding TLVs after the fixed
fields in radiotap, as part of the radiotap header.
Support this and move vendor data to the TLV format,
allowing a reuse of the RX_FLAG_RADIOTAP_VENDOR_DATA as
the new RX_FLAG_RADIOTAP_TLV_AT_END flag.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.b18fd5da8477.I576400ec40a7b35ef97a3b09a99b3a49e9174786@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 27 ++++-----
 drivers/net/wireless/mac80211_hwsim.c         | 45 ++++++++-------
 include/net/ieee80211_radiotap.h              | 20 +++++++
 include/net/mac80211.h                        | 44 +++-----------
 net/mac80211/rx.c                             | 82 +++++++++------------------
 5 files changed, 91 insertions(+), 127 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 549dbe0be223..d1769464d75b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -209,33 +209,34 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm,
 					    struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
-	struct ieee80211_vendor_radiotap *radiotap;
-	const int size = sizeof(*radiotap) + sizeof(__le16);
+	struct ieee80211_radiotap_vendor_tlv *radiotap;
+	const u16 vendor_data_len = sizeof(mvm->cur_aid);
+	const u16 padding = ALIGN(vendor_data_len, 4) - vendor_data_len;
 
 	if (!mvm->cur_aid)
 		return;
 
-	/* ensure alignment */
-	BUILD_BUG_ON((size + 2) % 4);
+	radiotap = skb_put(skb, sizeof(*radiotap) + vendor_data_len + padding);
+	radiotap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
+	radiotap->len = cpu_to_le16(sizeof(*radiotap) -
+				    sizeof(struct ieee80211_radiotap_tlv) +
+				    vendor_data_len);
 
-	radiotap = skb_put(skb, size + 2);
-	radiotap->align = 1;
 	/* Intel OUI */
 	radiotap->oui[0] = 0xf6;
 	radiotap->oui[1] = 0x54;
 	radiotap->oui[2] = 0x25;
 	/* radiotap sniffer config sub-namespace */
-	radiotap->subns = 1;
-	radiotap->present = 0x1;
-	radiotap->len = size - sizeof(*radiotap);
-	radiotap->pad = 2;
-
+	radiotap->oui_subtype = 1;
+	radiotap->vendor_type = 0;
+	/* clear reserved field */
+	radiotap->reserved = 0;
 	/* fill the data now */
 	memcpy(radiotap->data, &mvm->cur_aid, sizeof(mvm->cur_aid));
 	/* and clear the padding */
-	memset(radiotap->data + sizeof(__le16), 0, radiotap->pad);
+	memset(radiotap->data + vendor_data_len, 0, padding);
 
-	rx_status->flag |= RX_FLAG_RADIOTAP_VENDOR_DATA;
+	rx_status->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 }
 
 /* iwl_mvm_pass_packet_to_mac80211 - passes the packet for mac80211 */
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index e9b9340a97dc..152617034d19 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1534,37 +1534,38 @@ static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb)
 	 * the values accordingly.
 	 */
 #ifdef HWSIM_RADIOTAP_OUI
-	struct ieee80211_vendor_radiotap *rtap;
+	struct ieee80211_radiotap_vendor_tlv *rtap;
+	static const char vendor_data[8] = "ABCDEFGH";
+
+	// Make sure no padding is needed
+	BUILD_BUG_ON(sizeof(vendor_data) % 4);
+	/* this is last radiotap info before the mac header, so
+	 * skb_reset_mac_header for mac8022 to know the end of
+	 * the radiotap TLV/beginning of the 802.11 header
+	 */
+	skb_reset_mac_header(skb);
 
 	/*
 	 * Note that this code requires the headroom in the SKB
 	 * that was allocated earlier.
 	 */
-	rtap = skb_push(skb, sizeof(*rtap) + 8 + 4);
+	rtap = skb_push(skb, sizeof(*rtap) + sizeof(vendor_data));
+
+	rtap->len = cpu_to_le16(sizeof(*rtap) -
+				sizeof(struct ieee80211_radiotap_tlv) +
+				sizeof(vendor_data));
+	rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
+
 	rtap->oui[0] = HWSIM_RADIOTAP_OUI[0];
 	rtap->oui[1] = HWSIM_RADIOTAP_OUI[1];
 	rtap->oui[2] = HWSIM_RADIOTAP_OUI[2];
-	rtap->subns = 127;
+	rtap->oui_subtype = 127;
+	/* clear reserved field */
+	rtap->reserved = 0;
+	rtap->vendor_type = 0;
+	memcpy(rtap->data, vendor_data, sizeof(vendor_data));
 
-	/*
-	 * Radiotap vendor namespaces can (and should) also be
-	 * split into fields by using the standard radiotap
-	 * presence bitmap mechanism. Use just BIT(0) here for
-	 * the presence bitmap.
-	 */
-	rtap->present = BIT(0);
-	/* We have 8 bytes of (dummy) data */
-	rtap->len = 8;
-	/* For testing, also require it to be aligned */
-	rtap->align = 8;
-	/* And also test that padding works, 4 bytes */
-	rtap->pad = 4;
-	/* push the data */
-	memcpy(rtap->data, "ABCDEFGH", 8);
-	/* make sure to clear padding, mac80211 doesn't */
-	memset(rtap->data + 8, 0, 4);
-
-	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_VENDOR_DATA;
+	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 #endif
 }
 
diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 0fc2667a9a5d..95436686d3fe 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -369,6 +369,26 @@ struct ieee80211_radiotap_tlv {
 	u8 data[];
 } __packed;
 
+/**
+ * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
+ * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
+ * @len: length of data
+ * @oui: radiotap vendor namespace OUI
+ * @oui_subtype: radiotap vendor sub namespace
+ * @vendor_type: radiotap vendor type
+ * @reserved: should always be set to zero (to avoid leaking memory)
+ * @data: the actual vendor namespace data
+ */
+struct ieee80211_radiotap_vendor_tlv {
+	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
+	__le16 len;
+	u8 oui[3];
+	u8 oui_subtype;
+	__le16 vendor_type;
+	__le16 reserved;
+	u8 data[];
+} __packed;
+
 /* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
  * see www.radiotap.org/fields/U-SIG.html for details
  */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5df9eb828a58..3a43ce5fd4ec 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1375,9 +1375,12 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  *	subframes share the same sequence number. Reported subframes can be
  *	either regular MSDU or singly A-MSDUs. Subframes must not be
  *	interleaved with other frames.
- * @RX_FLAG_RADIOTAP_VENDOR_DATA: This frame contains vendor-specific
- *	radiotap data in the skb->data (before the frame) as described by
- *	the &struct ieee80211_vendor_radiotap.
+ * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the
+ *	skb->data (before the 802.11 header).
+ *	If used, the SKB's mac_header pointer must be set to point
+ *	to the 802.11 header after the TLVs, and any padding added after TLV
+ *	data to align to 4 must be cleared by the driver putting the TLVs
+ *	in the skb.
  * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before.
  *	This is used for AMSDU subframes which can have the same PN as
  *	the first subframe.
@@ -1429,7 +1432,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_ONLY_MONITOR		= BIT(17),
 	RX_FLAG_SKIP_MONITOR		= BIT(18),
 	RX_FLAG_AMSDU_MORE		= BIT(19),
-	RX_FLAG_RADIOTAP_VENDOR_DATA	= BIT(20),
+	RX_FLAG_RADIOTAP_TLV_AT_END	= BIT(20),
 	RX_FLAG_MIC_STRIPPED		= BIT(21),
 	RX_FLAG_ALLOW_SAME_PN		= BIT(22),
 	RX_FLAG_ICV_STRIPPED		= BIT(23),
@@ -1569,39 +1572,6 @@ ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status)
 	       (rx_status->freq_offset ? 500 : 0);
 }
 
-/**
- * struct ieee80211_vendor_radiotap - vendor radiotap data information
- * @present: presence bitmap for this vendor namespace
- *	(this could be extended in the future if any vendor needs more
- *	 bits, the radiotap spec does allow for that)
- * @align: radiotap vendor namespace alignment. This defines the needed
- *	alignment for the @data field below, not for the vendor namespace
- *	description itself (which has a fixed 2-byte alignment)
- *	Must be a power of two, and be set to at least 1!
- * @oui: radiotap vendor namespace OUI
- * @subns: radiotap vendor sub namespace
- * @len: radiotap vendor sub namespace skip length, if alignment is done
- *	then that's added to this, i.e. this is only the length of the
- *	@data field.
- * @pad: number of bytes of padding after the @data, this exists so that
- *	the skb data alignment can be preserved even if the data has odd
- *	length
- * @data: the actual vendor namespace data
- *
- * This struct, including the vendor data, goes into the skb->data before
- * the 802.11 header. It's split up in mac80211 using the align/oui/subns
- * data.
- */
-struct ieee80211_vendor_radiotap {
-	u32 present;
-	u8 align;
-	u8 oui[3];
-	u8 subns;
-	u8 pad;
-	u16 len;
-	u8 data[];
-} __packed;
-
 /**
  * enum ieee80211_conf_flags - configuration flags
  *
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f63ed6b91d9b..0255c5745e1c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -55,7 +55,7 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb,
 	/* After pulling radiotap header, clear all flags that indicate
 	 * info in skb->data.
 	 */
-	status->flag &= ~(RX_FLAG_RADIOTAP_VENDOR_DATA |
+	status->flag &= ~(RX_FLAG_RADIOTAP_TLV_AT_END |
 			  RX_FLAG_RADIOTAP_LSIG |
 			  RX_FLAG_RADIOTAP_HE_MU |
 			  RX_FLAG_RADIOTAP_HE);
@@ -126,9 +126,6 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 	/* allocate extra bitmaps */
 	if (status->chains)
 		len += 4 * hweight8(status->chains);
-	/* vendor presence bitmap */
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)
-		len += 4;
 
 	if (ieee80211_have_rx_timestamp(status)) {
 		len = ALIGN(len, 8);
@@ -190,34 +187,28 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 		len += 2 * hweight8(status->chains);
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		struct ieee80211_vendor_radiotap *rtap;
-		int vendor_data_offset = 0;
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
+		int tlv_offset = 0;
 
 		/*
 		 * The position to look at depends on the existence (or non-
 		 * existence) of other elements, so take that into account...
 		 */
 		if (status->flag & RX_FLAG_RADIOTAP_HE)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_he);
 		if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_he_mu);
 		if (status->flag & RX_FLAG_RADIOTAP_LSIG)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_lsig);
 
-		rtap = (void *)&skb->data[vendor_data_offset];
+		/* ensure 4 byte alignment for TLV */
+		len = ALIGN(len, 4);
 
-		/* alignment for fixed 6-byte vendor data header */
-		len = ALIGN(len, 2);
-		/* vendor data header */
-		len += 6;
-		if (WARN_ON(rtap->align == 0))
-			rtap->align = 1;
-		len = ALIGN(len, rtap->align);
-		len += rtap->len + rtap->pad;
+		/* TLVs until the mac header */
+		len += skb_mac_header(skb) - &skb->data[tlv_offset];
 	}
 
 	return len;
@@ -313,9 +304,9 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	u32 it_present_val;
 	u16 rx_flags = 0;
 	u16 channel_flags = 0;
+	u32 tlvs_len = 0;
 	int mpdulen, chain;
 	unsigned long chains = status->chains;
-	struct ieee80211_vendor_radiotap rtap = {};
 	struct ieee80211_radiotap_he he = {};
 	struct ieee80211_radiotap_he_mu he_mu = {};
 	struct ieee80211_radiotap_lsig lsig = {};
@@ -336,18 +327,17 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		skb_pull(skb, sizeof(lsig));
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
-		/* rtap.len and rtap.pad are undone immediately */
-		skb_pull(skb, sizeof(rtap) + rtap.len + rtap.pad);
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
+		/* data is pointer at tlv all other info was pulled off */
+		tlvs_len = skb_mac_header(skb) - skb->data;
 	}
 
 	mpdulen = skb->len;
 	if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)))
 		mpdulen += FCS_LEN;
 
-	rthdr = skb_push(skb, rtap_len);
-	memset(rthdr, 0, rtap_len - rtap.len - rtap.pad);
+	rthdr = skb_push(skb, rtap_len - tlvs_len);
+	memset(rthdr, 0, rtap_len - tlvs_len);
 	it_present = &rthdr->it_present;
 
 	/* radiotap header, set always present flags */
@@ -369,13 +359,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 				 BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		it_present_val |= BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE) |
-				  BIT(IEEE80211_RADIOTAP_EXT);
-		put_unaligned_le32(it_present_val, it_present);
-		it_present++;
-		it_present_val = rtap.present;
-	}
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
+		it_present_val |= BIT(IEEE80211_RADIOTAP_TLV);
 
 	put_unaligned_le32(it_present_val, it_present);
 
@@ -706,22 +691,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		*pos++ = status->chain_signal[chain];
 		*pos++ = chain;
 	}
-
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		/* ensure 2 byte alignment for the vendor field as required */
-		if ((pos - (u8 *)rthdr) & 1)
-			*pos++ = 0;
-		*pos++ = rtap.oui[0];
-		*pos++ = rtap.oui[1];
-		*pos++ = rtap.oui[2];
-		*pos++ = rtap.subns;
-		put_unaligned_le16(rtap.len, pos);
-		pos += 2;
-		/* align the actual payload as requested */
-		while ((pos - (u8 *)rthdr) & (rtap.align - 1))
-			*pos++ = 0;
-		/* data (and possible padding) already follows */
-	}
 }
 
 static struct sk_buff *
@@ -797,6 +766,13 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	bool only_monitor = false;
 	unsigned int min_head_len;
 
+	if (WARN_ON_ONCE(status->flag & RX_FLAG_RADIOTAP_TLV_AT_END &&
+			 !skb_mac_header_was_set(origskb))) {
+		/* with this skb no way to know where frame payload starts */
+		dev_kfree_skb(origskb);
+		return NULL;
+	}
+
 	if (status->flag & RX_FLAG_RADIOTAP_HE)
 		rtap_space += sizeof(struct ieee80211_radiotap_he);
 
@@ -806,12 +782,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	if (status->flag & RX_FLAG_RADIOTAP_LSIG)
 		rtap_space += sizeof(struct ieee80211_radiotap_lsig);
 
-	if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
-		struct ieee80211_vendor_radiotap *rtap =
-			(void *)(origskb->data + rtap_space);
-
-		rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
-	}
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
+		rtap_space += skb_mac_header(origskb) - &origskb->data[rtap_space];
 
 	min_head_len = rtap_space;
 
-- 
cgit v1.2.3


From 5383bfff5261422f843de72a4089da9b152291b0 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:50:05 +0800
Subject: wifi: mac80211: introduce ieee80211_refresh_tx_agg_session_timer()

This allows low level drivers to refresh the tx agg session timer, based on
querying stats from the firmware usually. Especially for some mt76 devices
support .net_fill_forward_path would bypass mac80211, which leads to tx BA
session timeout clients that set a timeout in their AddBA response to our
request, even if our request is without a timeout.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/7c3f72eac1c34921cd84a462e60d71e125862152.1676616450.git.ryder.lee@mediatek.com
[slightly clarify commit message, add note about RCU]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 14 ++++++++++++++
 net/mac80211/agg-tx.c  | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 3a43ce5fd4ec..722b99e54ad2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5973,6 +5973,20 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
 				  struct delayed_work *dwork,
 				  unsigned long delay);
 
+/**
+ * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer.
+ * @sta: the station for which to start a BA session
+ * @tid: the TID to BA on.
+ *
+ * This function allows low level driver to refresh tx agg session timer
+ * to maintain BA session, the session level will still be managed by the
+ * mac80211.
+ *
+ * Note: must be called in an RCU critical section.
+ */
+void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta,
+					    u16 tid);
+
 /**
  * ieee80211_start_tx_ba_session - Start a tx Block Ack session.
  * @sta: the station for which to start a BA session
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index f9514bacbd4a..3b651e7f5a73 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -554,6 +554,23 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	ieee80211_send_addba_with_timeout(sta, tid_tx);
 }
 
+void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *pubsta,
+					    u16 tid)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct tid_ampdu_tx *tid_tx;
+
+	if (WARN_ON_ONCE(tid >= IEEE80211_NUM_TIDS))
+		return;
+
+	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+	if (!tid_tx)
+		return;
+
+	tid_tx->last_tx = jiffies;
+}
+EXPORT_SYMBOL(ieee80211_refresh_tx_agg_session_timer);
+
 /*
  * After accepting the AddBA Response we activated a timer,
  * resetting it after each frame that we send.
-- 
cgit v1.2.3


From f4d1181e4759c9c6c97c86cda2cf2d1ddb6a74d2 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:48:59 +0800
Subject: wifi: mac80211: add EHT MU-MIMO related flags in ieee80211_bss_conf

Similar to VHT/HE. This is utilized to pass MU-MIMO configurations
from user space (i.e. hostapd) to driver.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/8d9966c4c1e77cb1ade77d42bdc49905609192e9.1676628065.git.ryder.lee@mediatek.com
[move into combined if statement, reset on !eht]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  9 +++++++++
 net/mac80211/cfg.c     | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 722b99e54ad2..410cd3daaa59 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -674,6 +674,12 @@ struct ieee80211_fils_discovery {
  * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission
  *	(non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU
  *	bandwidth
+ * @eht_su_beamformer: in AP-mode, does this BSS enable operation as an EHT SU
+ *	beamformer
+ * @eht_su_beamformee: in AP-mode, does this BSS enable operation as an EHT SU
+ *	beamformee
+ * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU
+ *	beamformer
  */
 struct ieee80211_bss_conf {
 	struct ieee80211_vif *vif;
@@ -761,6 +767,9 @@ struct ieee80211_bss_conf {
 	bool he_su_beamformee;
 	bool he_mu_beamformer;
 	bool he_full_ul_mumimo;
+	bool eht_su_beamformer;
+	bool eht_su_beamformee;
+	bool eht_mu_beamformer;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 2c8fa19c2f24..9789008626a5 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1299,6 +1299,22 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	if (params->eht_cap) {
 		link_conf->eht_puncturing = params->punct_bitmap;
 		changed |= BSS_CHANGED_EHT_PUNCTURING;
+
+		link_conf->eht_su_beamformer =
+			params->eht_cap->fixed.phy_cap_info[0] &
+				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER;
+		link_conf->eht_su_beamformee =
+			params->eht_cap->fixed.phy_cap_info[0] &
+				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE;
+		link_conf->eht_mu_beamformer =
+			params->eht_cap->fixed.phy_cap_info[7] &
+				(IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
+				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
+				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ);
+	} else {
+		link_conf->eht_su_beamformer = false;
+		link_conf->eht_su_beamformee = false;
+		link_conf->eht_mu_beamformer = false;
 	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP &&
-- 
cgit v1.2.3


From 2ad7dd9425408bf0ca524102808059c05bad169e Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:49:25 +0800
Subject: wifi: mac80211: add LDPC related flags in ieee80211_bss_conf

This is utilized to pass LDPC configurations from user space
(i.e. hostapd) to driver.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/1de696aaa34efd77a926eb657b8c0fda05aaa177.1676628065.git.ryder.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 ++++++
 net/mac80211/cfg.c     | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 410cd3daaa59..f12edca660ba 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -657,6 +657,9 @@ struct ieee80211_fils_discovery {
  *	write-protected by sdata_lock and local->mtx so holding either is fine
  *	for read access.
  * @color_change_color: the bss color that will be used after the change.
+ * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability.
+ * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability.
+ * @he_ldpc: in AP mode, indicates interface has HE LDPC capability.
  * @vht_su_beamformer: in AP mode, does this BSS support operation as an VHT SU
  *	beamformer
  * @vht_su_beamformee: in AP mode, does this BSS support operation as an VHT SU
@@ -759,6 +762,9 @@ struct ieee80211_bss_conf {
 	bool color_change_active;
 	u8 color_change_color;
 
+	bool ht_ldpc;
+	bool vht_ldpc;
+	bool he_ldpc;
 	bool vht_su_beamformer;
 	bool vht_su_beamformee;
 	bool vht_mu_beamformer;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 9789008626a5..760ad934f9e1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1252,7 +1252,15 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	prev_beacon_int = link_conf->beacon_int;
 	link_conf->beacon_int = params->beacon_interval;
 
+	if (params->ht_cap)
+		link_conf->ht_ldpc =
+			params->ht_cap->cap_info &
+				cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING);
+
 	if (params->vht_cap) {
+		link_conf->vht_ldpc =
+			params->vht_cap->vht_cap_info &
+				cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC);
 		link_conf->vht_su_beamformer =
 			params->vht_cap->vht_cap_info &
 				cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE);
@@ -1282,6 +1290,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	if (params->he_cap) {
+		link_conf->he_ldpc =
+			params->he_cap->phy_cap_info[1] &
+				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD;
 		link_conf->he_su_beamformer =
 			params->he_cap->phy_cap_info[3] &
 				IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER;
-- 
cgit v1.2.3


From 6933486133ecf71bbe273d7ac72cfc4a51286af3 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Thu, 12 Jan 2023 06:54:13 +0530
Subject: wifi: nl80211: Add support for randomizing TA of auth and deauth
 frames

Add support to use a random local address in authentication and
deauthentication frames sent to unassociated peer when the driver
supports.

The driver needs to configure receive behavior to accept frames with
random transmit address specified in TX path authentication frames
during the time of the frame exchange is pending and such frames need to
be acknowledged similarly to frames sent to the local permanent address
when this random address functionality is used.

This capability allows use of randomized transmit address for PASN
authentication frames to improve privacy of WLAN clients.

Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Link: https://lore.kernel.org/r/20230112012415.167556-2-quic_vjakkam@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  5 ++++
 net/wireless/mlme.c          | 55 +++++++++++++++++++++++++++++---------------
 2 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index c8520c150f9c..9a0ac0363f1f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6348,6 +6348,10 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_SECURE_NAN: Device supports NAN Pairing which enables
  *	authentication, data encryption and message integrity.
  *
+ * @NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA: Device supports randomized TA
+ *	in authentication and deauthentication frames sent to unassociated peer
+ *	using @NL80211_CMD_FRAME.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6418,6 +6422,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE,
 	NL80211_EXT_FEATURE_PUNCT,
 	NL80211_EXT_FEATURE_SECURE_NAN,
+	NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 81d3f40d6235..ac059cefbeb3 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -673,6 +673,39 @@ static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr)
 	return ether_addr_equal(addr, wdev_address(wdev));
 }
 
+static bool cfg80211_allowed_random_address(struct wireless_dev *wdev,
+					    const struct ieee80211_mgmt *mgmt)
+{
+	if (ieee80211_is_auth(mgmt->frame_control) ||
+	    ieee80211_is_deauth(mgmt->frame_control)) {
+		/* Allow random TA to be used with authentication and
+		 * deauthentication frames if the driver has indicated support.
+		 */
+		if (wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA))
+			return true;
+	} else if (ieee80211_is_action(mgmt->frame_control) &&
+		   mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) {
+		/* Allow random TA to be used with Public Action frames if the
+		 * driver has indicated support.
+		 */
+		if (!wdev->connected &&
+		    wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
+			return true;
+
+		if (wdev->connected &&
+		    wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
+			return true;
+	}
+
+	return false;
+}
+
 int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			  struct wireless_dev *wdev,
 			  struct cfg80211_mgmt_tx_params *params, u64 *cookie)
@@ -774,25 +807,9 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			return err;
 	}
 
-	if (!cfg80211_allowed_address(wdev, mgmt->sa)) {
-		/* Allow random TA to be used with Public Action frames if the
-		 * driver has indicated support for this. Otherwise, only allow
-		 * the local address to be used.
-		 */
-		if (!ieee80211_is_action(mgmt->frame_control) ||
-		    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
-			return -EINVAL;
-		if (!wdev->connected &&
-		    !wiphy_ext_feature_isset(
-			    &rdev->wiphy,
-			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
-			return -EINVAL;
-		if (wdev->connected &&
-		    !wiphy_ext_feature_isset(
-			    &rdev->wiphy,
-			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
-			return -EINVAL;
-	}
+	if (!cfg80211_allowed_address(wdev, mgmt->sa) &&
+	    !cfg80211_allowed_random_address(wdev, mgmt))
+		return -EINVAL;
 
 	/* Transmit the management frame as requested by user space */
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
-- 
cgit v1.2.3


From 11f45690b3f6c6a2b5c57dbb036df3f838f7c016 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Tue, 7 Mar 2023 14:35:54 +0200
Subject: ASoC: SOF: ipc4: Add macro to set the core_id in create_pipe message

The create pipeline message can carry the target code_id which is set to
0 at the moment.

Add macros to set the core_id in the message extension.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://lore.kernel.org/r/20230307123556.31328-2-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/ipc4/header.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h
index d31349bf011d..49ff1558a171 100644
--- a/include/sound/sof/ipc4/header.h
+++ b/include/sound/sof/ipc4/header.h
@@ -176,6 +176,10 @@ enum sof_ipc4_pipeline_state {
 #define SOF_IPC4_GLB_PIPE_EXT_LP_MASK		BIT(0)
 #define SOF_IPC4_GLB_PIPE_EXT_LP(x)		((x) << SOF_IPC4_GLB_PIPE_EXT_LP_SHIFT)
 
+#define SOF_IPC4_GLB_PIPE_EXT_CORE_ID_SHIFT	20
+#define SOF_IPC4_GLB_PIPE_EXT_CORE_ID_MASK	GENMASK(23, 20)
+#define SOF_IPC4_GLB_PIPE_EXT_CORE_ID(x)	((x) << SOF_IPC4_GLB_PIPE_EXT_CORE_ID_SHIFT)
+
 /* pipeline set state ipc msg */
 #define SOF_IPC4_GLB_PIPE_STATE_ID_SHIFT		16
 #define SOF_IPC4_GLB_PIPE_STATE_ID_MASK		GENMASK(23, 16)
-- 
cgit v1.2.3


From 2d5bcdcda8799cf21f9ab84598c946dd320207a2 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 7 Mar 2023 08:14:06 -0700
Subject: bpf: Increase size of BTF_ID_LIST without CONFIG_DEBUG_INFO_BTF again

After commit 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr"),
clang builds without CONFIG_DEBUG_INFO_BTF warn:

  kernel/bpf/verifier.c:10298:24: warning: array index 16 is past the end of the array (that has type 'u32[16]' (aka 'unsigned int[16]')) [-Warray-bounds]
                                     meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
                                                     ^                  ~~~~~~~~~~~~~~~~~~~~~~~~
  kernel/bpf/verifier.c:9150:1: note: array 'special_kfunc_list' declared here
  BTF_ID_LIST(special_kfunc_list)
  ^
  include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST'
  #define BTF_ID_LIST(name) static u32 __maybe_unused name[16];
                            ^
  1 warning generated.

A warning of this nature was previously addressed by
commit beb3d47d1d3d ("bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set")
but there have been new kfuncs added since then.

Quadruple the size of the CONFIG_DEBUG_INFO_BTF=n definition so that
this problem is unlikely to show up for some time.

Link: https://github.com/ClangBuiltLinux/linux/issues/1810
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20230307-bpf-kfuncs-warray-bounds-v1-1-00ad3191f3a6@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 3a4f7cd882ca..00950cc03bff 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -204,7 +204,7 @@ extern struct btf_id_set8 name;
 
 #else
 
-#define BTF_ID_LIST(name) static u32 __maybe_unused name[16];
+#define BTF_ID_LIST(name) static u32 __maybe_unused name[64];
 #define BTF_ID(prefix, name)
 #define BTF_ID_FLAGS(prefix, name, ...)
 #define BTF_ID_UNUSED
-- 
cgit v1.2.3


From 5da094ac80cd4f6d4114ad220468f602f86d9980 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Fri, 24 Feb 2023 17:31:47 +0530
Subject: bus: mhi: host: Remove mhi_poll() API

mhi_poll() API is not used within the MHI stack and also not by any client
drivers in mainline. So let's remove it until any consumer is available.

Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/host/main.c | 15 ---------------
 include/linux/mhi.h         |  7 -------
 2 files changed, 22 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/host/main.c b/drivers/bus/mhi/host/main.c
index df0fbfee7b78..4fa0969472fc 100644
--- a/drivers/bus/mhi/host/main.c
+++ b/drivers/bus/mhi/host/main.c
@@ -1679,18 +1679,3 @@ void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev)
 	}
 }
 EXPORT_SYMBOL_GPL(mhi_unprepare_from_transfer);
-
-int mhi_poll(struct mhi_device *mhi_dev, u32 budget)
-{
-	struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
-	struct mhi_chan *mhi_chan = mhi_dev->dl_chan;
-	struct mhi_event *mhi_event = &mhi_cntrl->mhi_event[mhi_chan->er_index];
-	int ret;
-
-	spin_lock_bh(&mhi_event->lock);
-	ret = mhi_event->process_event(mhi_cntrl, mhi_event, budget);
-	spin_unlock_bh(&mhi_event->lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(mhi_poll);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index a5441ad33c74..f6de4b6ecfc7 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -765,13 +765,6 @@ int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev);
  */
 void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev);
 
-/**
- * mhi_poll - Poll for any available data in DL direction
- * @mhi_dev: Device associated with the channels
- * @budget: # of events to process
- */
-int mhi_poll(struct mhi_device *mhi_dev, u32 budget);
-
 /**
  * mhi_queue_dma - Send or receive DMA mapped buffers from client device
  *                 over MHI channel
-- 
cgit v1.2.3


From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:45:58 +0000
Subject: bpf: add new map ops ->map_mem_usage

Add a new map ops ->map_mem_usage to print the memory usage of a
bpf map.

This is a preparation for the followup change.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c | 15 +++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3456804f7aa..9059520bbb5e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -161,6 +161,8 @@ struct bpf_map_ops {
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
+	u64 (*map_mem_usage)(const struct bpf_map *map);
+
 	/* BTF id of struct allocated by map_alloc */
 	int *map_btf_id;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ede5f987484f..7c96e68859ec 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -771,16 +771,15 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
 }
 
 #ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
 {
 	unsigned long size;
 
-	size = round_up(map->key_size + bpf_map_value_size(map), 8);
+	if (map->ops->map_mem_usage)
+		return map->ops->map_mem_usage(map);
 
+	size = round_up(map->key_size + bpf_map_value_size(map), 8);
 	return round_up(map->max_entries * size, PAGE_SIZE);
 }
 
@@ -803,7 +802,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "max_entries:\t%u\n"
 		   "map_flags:\t%#x\n"
 		   "map_extra:\t%#llx\n"
-		   "memlock:\t%lu\n"
+		   "memlock:\t%llu\n"
 		   "map_id:\t%u\n"
 		   "frozen:\t%u\n",
 		   map->map_type,
@@ -812,7 +811,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->max_entries,
 		   map->map_flags,
 		   (unsigned long long)map->map_extra,
-		   bpf_map_memory_footprint(map),
+		   bpf_map_memory_usage(map),
 		   map->id,
 		   READ_ONCE(map->frozen));
 	if (type) {
-- 
cgit v1.2.3


From 7490b7f1c02ef825ef98f7230662049d4a464a21 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:11 +0000
Subject: bpf, net: bpf_local_storage memory usage

A new helper is introduced into bpf_local_storage map to calculate the
memory usage. This helper is also used by other maps like
bpf_cgrp_storage, bpf_inode_storage, bpf_task_storage and etc.

Note that currently the dynamically allocated storage elements are not
counted in the usage, since it will take extra runtime overhead in the
elements update or delete path. So let's put it aside now, and implement
it in the future when someone really need it.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-15-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  1 +
 kernel/bpf/bpf_cgrp_storage.c     |  1 +
 kernel/bpf/bpf_inode_storage.c    |  1 +
 kernel/bpf/bpf_local_storage.c    | 10 ++++++++++
 kernel/bpf/bpf_task_storage.c     |  1 +
 net/core/bpf_sk_storage.c         |  1 +
 6 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 6d37a40cd90e..d934248b8e81 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -164,5 +164,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);
 
 void bpf_local_storage_free_rcu(struct rcu_head *rcu);
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 6cdf6d9ed91d..9ae07aedaf23 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -221,6 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {
 	.map_update_elem = bpf_cgrp_storage_update_elem,
 	.map_delete_elem = bpf_cgrp_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = cgroup_storage_ptr,
 };
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 05f4c66c9089..43e2619c8167 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -223,6 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
 	.map_update_elem = bpf_fd_inode_storage_update_elem,
 	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = inode_storage_ptr,
 };
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 3d320393a12c..d3ba3f2db640 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -685,6 +685,16 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 	return free_storage;
 }
 
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+	u64 usage = sizeof(*smap);
+
+	/* The dynamically callocated selems are not counted currently. */
+	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+	return usage;
+}
+
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1e486055a523..20f942229f3c 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -335,6 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = {
 	.map_update_elem = bpf_pid_task_storage_update_elem,
 	.map_delete_elem = bpf_pid_task_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = task_storage_ptr,
 };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index bb378c33f542..7a36353dbc22 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -324,6 +324,7 @@ const struct bpf_map_ops sk_storage_map_ops = {
 	.map_local_storage_charge = bpf_sk_storage_charge,
 	.map_local_storage_uncharge = bpf_sk_storage_uncharge,
 	.map_owner_storage_ptr = bpf_sk_storage_ptr,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 };
 
 const struct bpf_func_proto bpf_sk_storage_get_proto = {
-- 
cgit v1.2.3


From b4fd0d672bca001632d7291b5b162b08e065b815 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:13 +0000
Subject: bpf, net: xskmap memory usage

A new helper is introduced to calculate xskmap memory usage.

The xfsmap memory usage can be dynamically changed when we add or remove
a xsk_map_node. Hence we need to track the count of xsk_map_node to get
its memory usage.

The result as follows,
- before
10: xskmap  name count_map  flags 0x0
        key 4B  value 4B  max_entries 65536  memlock 524288B

- after
10: xskmap  name count_map  flags 0x0 <<< no elements case
        key 4B  value 4B  max_entries 65536  memlock 524608B

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-17-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h |  1 +
 net/xdp/xskmap.c       | 13 +++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3057e1a4a11c..e96a1151ec75 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -38,6 +38,7 @@ struct xdp_umem {
 struct xsk_map {
 	struct bpf_map map;
 	spinlock_t lock; /* Synchronize map updates */
+	atomic_t count;
 	struct xdp_sock __rcu *xsk_map[];
 };
 
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 771d0fa90ef5..0c38d7175922 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -24,6 +24,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_inc(&map->map);
+	atomic_inc(&map->count);
 
 	node->map = map;
 	node->map_entry = map_entry;
@@ -32,8 +33,11 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
 
 static void xsk_map_node_free(struct xsk_map_node *node)
 {
+	struct xsk_map *map = node->map;
+
 	bpf_map_put(&node->map->map);
 	kfree(node);
+	atomic_dec(&map->count);
 }
 
 static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
@@ -85,6 +89,14 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	return &m->map;
 }
 
+static u64 xsk_map_mem_usage(const struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	return struct_size(m, xsk_map, map->max_entries) +
+		   (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node);
+}
+
 static void xsk_map_free(struct bpf_map *map)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
@@ -267,6 +279,7 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_mem_usage = xsk_map_mem_usage,
 	.map_btf_id = &xsk_map_btf_ids[0],
 	.map_redirect = xsk_map_redirect,
 };
-- 
cgit v1.2.3


From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:14 +0000
Subject: bpf: offload map memory usage

A new helper is introduced to calculate offload map memory usage. But
currently the memory dynamically allocated in netdev dev_ops, like
nsim_map_update_elem, is not counted. Let's just put it aside now.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 6 ++++++
 kernel/bpf/offload.c | 6 ++++++
 kernel/bpf/syscall.c | 1 +
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9059520bbb5e..6792a7940e1e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2624,6 +2624,7 @@ static inline bool bpf_map_is_offloaded(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map);
 int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 			      const union bpf_attr *kattr,
 			      union bpf_attr __user *uattr);
@@ -2695,6 +2696,11 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
 
+static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+	return 0;
+}
+
 static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 					    const union bpf_attr *kattr,
 					    union bpf_attr __user *uattr)
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0c85e06f7ea7..d9c9f45e3529 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map)
 	bpf_map_area_free(offmap);
 }
 
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+	/* The memory dynamically allocated in netdev dev_ops is not counted */
+	return sizeof(struct bpf_offloaded_map);
+}
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
 {
 	struct bpf_offloaded_map *offmap = map_to_offmap(map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7c96e68859ec..053409d951d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -105,6 +105,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {
 	.map_alloc = bpf_map_offload_map_alloc,
 	.map_free = bpf_map_offload_map_free,
 	.map_check_btf = map_check_no_btf,
+	.map_mem_usage = bpf_map_offload_map_mem_usage,
 };
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
-- 
cgit v1.2.3


From d70e2ecbc726a0ee794b72365730a8dbc9866080 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 26 Feb 2023 17:17:00 -0800
Subject: instrumented.h: Fix all kernel-doc format warnings

Fix 26 kernel-doc notation warnings by converting the function
documentation to kernel-doc format.

Warning example:

instrumented.h:155: warning: Function parameter or member 'to' not described in 'instrument_copy_from_user_after'
instrumented.h:155: warning: Function parameter or member 'from' not described in 'instrument_copy_from_user_after'
instrumented.h:155: warning: Function parameter or member 'n' not described in 'instrument_copy_from_user_after'
instrumented.h:155: warning: Function parameter or member 'left' not described in 'instrument_copy_from_user_after'

Fixes: 36e4d4dd4fc4 ("include/linux: Add instrumented.h infrastructure")
Fixes: 00047c2e6d7c ("instrumented.h: Introduce read-write instrumentation hooks")
Fixes: 33b75c1d884e ("instrumented.h: allow instrumenting both sides of copy_from_user()")
Fixes: 888f84a6da4d ("x86: asm: instrument usercopy in get_user() and put_user()")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/instrumented.h | 63 ++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h
index 501fa8486749..1b608e00290a 100644
--- a/include/linux/instrumented.h
+++ b/include/linux/instrumented.h
@@ -15,12 +15,11 @@
 
 /**
  * instrument_read - instrument regular read access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument a regular read access. The instrumentation should be inserted
  * before the actual read happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_read(const volatile void *v, size_t size)
 {
@@ -30,12 +29,11 @@ static __always_inline void instrument_read(const volatile void *v, size_t size)
 
 /**
  * instrument_write - instrument regular write access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument a regular write access. The instrumentation should be inserted
  * before the actual write happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_write(const volatile void *v, size_t size)
 {
@@ -45,12 +43,11 @@ static __always_inline void instrument_write(const volatile void *v, size_t size
 
 /**
  * instrument_read_write - instrument regular read-write access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument a regular write access. The instrumentation should be inserted
  * before the actual write happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_read_write(const volatile void *v, size_t size)
 {
@@ -60,12 +57,11 @@ static __always_inline void instrument_read_write(const volatile void *v, size_t
 
 /**
  * instrument_atomic_read - instrument atomic read access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument an atomic read access. The instrumentation should be inserted
  * before the actual read happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
 {
@@ -75,12 +71,11 @@ static __always_inline void instrument_atomic_read(const volatile void *v, size_
 
 /**
  * instrument_atomic_write - instrument atomic write access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument an atomic write access. The instrumentation should be inserted
  * before the actual write happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
 {
@@ -90,12 +85,11 @@ static __always_inline void instrument_atomic_write(const volatile void *v, size
 
 /**
  * instrument_atomic_read_write - instrument atomic read-write access
+ * @v: address of access
+ * @size: size of access
  *
  * Instrument an atomic read-write access. The instrumentation should be
  * inserted before the actual write happens.
- *
- * @ptr address of access
- * @size size of access
  */
 static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
 {
@@ -105,13 +99,12 @@ static __always_inline void instrument_atomic_read_write(const volatile void *v,
 
 /**
  * instrument_copy_to_user - instrument reads of copy_to_user
+ * @to: destination address
+ * @from: source address
+ * @n: number of bytes to copy
  *
  * Instrument reads from kernel memory, that are due to copy_to_user (and
  * variants). The instrumentation must be inserted before the accesses.
- *
- * @to destination address
- * @from source address
- * @n number of bytes to copy
  */
 static __always_inline void
 instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
@@ -123,13 +116,12 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
 
 /**
  * instrument_copy_from_user_before - add instrumentation before copy_from_user
+ * @to: destination address
+ * @from: source address
+ * @n: number of bytes to copy
  *
  * Instrument writes to kernel memory, that are due to copy_from_user (and
  * variants). The instrumentation should be inserted before the accesses.
- *
- * @to destination address
- * @from source address
- * @n number of bytes to copy
  */
 static __always_inline void
 instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n)
@@ -140,14 +132,13 @@ instrument_copy_from_user_before(const void *to, const void __user *from, unsign
 
 /**
  * instrument_copy_from_user_after - add instrumentation after copy_from_user
+ * @to: destination address
+ * @from: source address
+ * @n: number of bytes to copy
+ * @left: number of bytes not copied (as returned by copy_from_user)
  *
  * Instrument writes to kernel memory, that are due to copy_from_user (and
  * variants). The instrumentation should be inserted after the accesses.
- *
- * @to destination address
- * @from source address
- * @n number of bytes to copy
- * @left number of bytes not copied (as returned by copy_from_user)
  */
 static __always_inline void
 instrument_copy_from_user_after(const void *to, const void __user *from,
@@ -158,12 +149,11 @@ instrument_copy_from_user_after(const void *to, const void __user *from,
 
 /**
  * instrument_get_user() - add instrumentation to get_user()-like macros
+ * @to: destination variable, may not be address-taken
  *
  * get_user() and friends are fragile, so it may depend on the implementation
  * whether the instrumentation happens before or after the data is copied from
  * the userspace.
- *
- * @to destination variable, may not be address-taken
  */
 #define instrument_get_user(to)				\
 ({							\
@@ -175,14 +165,13 @@ instrument_copy_from_user_after(const void *to, const void __user *from,
 
 /**
  * instrument_put_user() - add instrumentation to put_user()-like macros
+ * @from: source address
+ * @ptr: userspace pointer to copy to
+ * @size: number of bytes to copy
  *
  * put_user() and friends are fragile, so it may depend on the implementation
  * whether the instrumentation happens before or after the data is copied from
  * the userspace.
- *
- * @from source address
- * @ptr userspace pointer to copy to
- * @size number of bytes to copy
  */
 #define instrument_put_user(from, ptr, size)			\
 ({								\
-- 
cgit v1.2.3


From 11a2638d120b9d998916efb6fc55c6422e469ffa Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Sun, 5 Mar 2023 14:16:19 +0200
Subject: wifi: radiotap: separate vendor TLV into header/content

To be able to use a general function later for any kind of
TLV, separate the vendor TLV header/content in the structs.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230305124407.8ac5195bb3e6.I19ad99c1ad3108453aede64bddf6ef1a7c4a0b74@changeid
[separate from the original combined patch]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 16 ++++++++--------
 drivers/net/wireless/mac80211_hwsim.c         | 14 +++++++-------
 include/net/ieee80211_radiotap.h              | 20 ++++++++++++++------
 3 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 2db4f68becff..71a6555f90d9 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -223,18 +223,18 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm,
 				    vendor_data_len);
 
 	/* Intel OUI */
-	radiotap->oui[0] = 0xf6;
-	radiotap->oui[1] = 0x54;
-	radiotap->oui[2] = 0x25;
+	radiotap->content.oui[0] = 0xf6;
+	radiotap->content.oui[1] = 0x54;
+	radiotap->content.oui[2] = 0x25;
 	/* radiotap sniffer config sub-namespace */
-	radiotap->oui_subtype = 1;
-	radiotap->vendor_type = 0;
+	radiotap->content.oui_subtype = 1;
+	radiotap->content.vendor_type = 0;
 	/* clear reserved field */
-	radiotap->reserved = 0;
+	radiotap->content.reserved = 0;
 	/* fill the data now */
-	memcpy(radiotap->data, &mvm->cur_aid, sizeof(mvm->cur_aid));
+	memcpy(radiotap->content.data, &mvm->cur_aid, sizeof(mvm->cur_aid));
 	/* and clear the padding */
-	memset(radiotap->data + vendor_data_len, 0, padding);
+	memset(radiotap->content.data + vendor_data_len, 0, padding);
 
 	rx_status->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 }
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 152617034d19..f4bdc243ea0d 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1556,14 +1556,14 @@ static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb)
 				sizeof(vendor_data));
 	rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
 
-	rtap->oui[0] = HWSIM_RADIOTAP_OUI[0];
-	rtap->oui[1] = HWSIM_RADIOTAP_OUI[1];
-	rtap->oui[2] = HWSIM_RADIOTAP_OUI[2];
-	rtap->oui_subtype = 127;
+	rtap->content.oui[0] = HWSIM_RADIOTAP_OUI[0];
+	rtap->content.oui[1] = HWSIM_RADIOTAP_OUI[1];
+	rtap->content.oui[2] = HWSIM_RADIOTAP_OUI[2];
+	rtap->content.oui_subtype = 127;
 	/* clear reserved field */
-	rtap->reserved = 0;
-	rtap->vendor_type = 0;
-	memcpy(rtap->data, vendor_data, sizeof(vendor_data));
+	rtap->content.reserved = 0;
+	rtap->content.vendor_type = 0;
+	memcpy(rtap->content.data, vendor_data, sizeof(vendor_data));
 
 	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 #endif
diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 95436686d3fe..f980a72f2ce6 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -370,18 +370,14 @@ struct ieee80211_radiotap_tlv {
 } __packed;
 
 /**
- * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
- * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
- * @len: length of data
+ * struct ieee80211_radiotap_vendor_content - radiotap vendor data content
  * @oui: radiotap vendor namespace OUI
  * @oui_subtype: radiotap vendor sub namespace
  * @vendor_type: radiotap vendor type
  * @reserved: should always be set to zero (to avoid leaking memory)
  * @data: the actual vendor namespace data
  */
-struct ieee80211_radiotap_vendor_tlv {
-	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
-	__le16 len;
+struct ieee80211_radiotap_vendor_content {
 	u8 oui[3];
 	u8 oui_subtype;
 	__le16 vendor_type;
@@ -389,6 +385,18 @@ struct ieee80211_radiotap_vendor_tlv {
 	u8 data[];
 } __packed;
 
+/**
+ * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
+ * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
+ * @len: length of data
+ * @content: vendor content see @ieee80211_radiotap_vendor_content
+ */
+struct ieee80211_radiotap_vendor_tlv {
+	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
+	__le16 len;
+	struct ieee80211_radiotap_vendor_content content;
+};
+
 /* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
  * see www.radiotap.org/fields/U-SIG.html for details
  */
-- 
cgit v1.2.3


From 0194b64578e905dc8f112e641a71c306bd58ddde Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 6 Mar 2023 22:51:35 +0100
Subject: net: phy: improve phy_read_poll_timeout

cond sometimes is (val & MASK) what may result in a false positive
if val is a negative errno. We shouldn't evaluate cond if val < 0.
This has no functional impact here, but it's not nice.
Therefore switch order of the checks.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/6d8274ac-4344-23b4-d9a3-cad4c39517d4@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 36bf0bbc8efa..fefd5091bc24 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1130,16 +1130,15 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum)
 #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \
 				timeout_us, sleep_before_read) \
 ({ \
-	int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \
+	int __ret = read_poll_timeout(phy_read, val, val < 0 || (cond), \
 		sleep_us, timeout_us, sleep_before_read, phydev, regnum); \
-	if (val <  0) \
+	if (val < 0) \
 		__ret = val; \
 	if (__ret) \
 		phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \
 	__ret; \
 })
 
-
 /**
  * __phy_read - convenience function for reading a given PHY register
  * @phydev: the phy_device struct
-- 
cgit v1.2.3


From 40bbae583ec38ea31e728bf42a4ea72bded22ab6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Mar 2023 20:43:13 +0000
Subject: net: remove enum skb_free_reason

enum skb_drop_reason is more generic, we can adopt it instead.

Provide dev_kfree_skb_irq_reason() and dev_kfree_skb_any_reason().

This means drivers can use more precise drop reasons if they want to.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
Link: https://lore.kernel.org/r/20230306204313.10492-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 18 +++++++-----------
 net/core/dev.c            | 20 +++++++++-----------
 2 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6a14b7b11766..ee483071cf59 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
+#include <net/dropreason.h>
 
 struct netpoll_info;
 struct device;
@@ -3804,13 +3805,8 @@ static inline unsigned int get_netdev_rx_queue_index(
 
 int netif_get_num_default_rss_queues(void);
 
-enum skb_free_reason {
-	SKB_REASON_CONSUMED,
-	SKB_REASON_DROPPED,
-};
-
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason);
 
 /*
  * It is not allowed to call kfree_skb() or consume_skb() from hardware
@@ -3833,22 +3829,22 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
  */
 static inline void dev_kfree_skb_irq(struct sk_buff *skb)
 {
-	__dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
+	dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 }
 
 static inline void dev_consume_skb_irq(struct sk_buff *skb)
 {
-	__dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
+	dev_kfree_skb_irq_reason(skb, SKB_CONSUMED);
 }
 
 static inline void dev_kfree_skb_any(struct sk_buff *skb)
 {
-	__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
+	dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 }
 
 static inline void dev_consume_skb_any(struct sk_buff *skb)
 {
-	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
+	dev_kfree_skb_any_reason(skb, SKB_CONSUMED);
 }
 
 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
diff --git a/net/core/dev.c b/net/core/dev.c
index 253584777101..c7853192563d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3075,7 +3075,7 @@ void __netif_schedule(struct Qdisc *q)
 EXPORT_SYMBOL(__netif_schedule);
 
 struct dev_kfree_skb_cb {
-	enum skb_free_reason reason;
+	enum skb_drop_reason reason;
 };
 
 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
@@ -3108,7 +3108,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 }
 EXPORT_SYMBOL(netif_tx_wake_queue);
 
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	unsigned long flags;
 
@@ -3128,18 +3128,16 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL(__dev_kfree_skb_irq);
+EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
 
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	if (in_hardirq() || irqs_disabled())
-		__dev_kfree_skb_irq(skb, reason);
-	else if (unlikely(reason == SKB_REASON_DROPPED))
-		kfree_skb(skb);
+		dev_kfree_skb_irq_reason(skb, reason);
 	else
-		consume_skb(skb);
+		kfree_skb_reason(skb, reason);
 }
-EXPORT_SYMBOL(__dev_kfree_skb_any);
+EXPORT_SYMBOL(dev_kfree_skb_any_reason);
 
 
 /**
@@ -5020,11 +5018,11 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(refcount_read(&skb->users));
-			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
 				trace_consume_skb(skb, net_tx_action);
 			else
 				trace_kfree_skb(skb, net_tx_action,
-						SKB_DROP_REASON_NOT_SPECIFIED);
+						get_kfree_skb_cb(skb)->reason);
 
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
-- 
cgit v1.2.3


From 4386b921857793440ebd4db3d6b70639149c7074 Mon Sep 17 00:00:00 2001
From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Date: Fri, 24 Feb 2023 10:52:51 +0100
Subject: netfilter: bridge: introduce broute meta statement

nftables equivalent for ebtables -t broute.

Implement broute meta statement to set br_netfilter_broute flag
in skb to force a packet to be routed instead of being bridged.

Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 +
 net/bridge/netfilter/nft_meta_bridge.c   | 71 ++++++++++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ff677f3a6cad..9c6f02c26054 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -931,6 +931,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_TIME_HOUR: hour of day (in seconds)
  * @NFT_META_SDIF: slave device interface index
  * @NFT_META_SDIFNAME: slave device interface name
+ * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -969,6 +970,7 @@ enum nft_meta_keys {
 	NFT_META_TIME_HOUR,
 	NFT_META_SDIF,
 	NFT_META_SDIFNAME,
+	NFT_META_BRI_BROUTE,
 	__NFT_META_IIFTYPE,
 };
 
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index c3ecd77e25cb..bd4d1b4d745f 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -8,6 +8,9 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nft_meta.h>
 #include <linux/if_bridge.h>
+#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+
+#include "../br_private.h"
 
 static const struct net_device *
 nft_meta_get_bridge(const struct net_device *dev)
@@ -102,6 +105,50 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = {
 	.reduce		= nft_meta_get_reduce,
 };
 
+static void nft_meta_bridge_set_eval(const struct nft_expr *expr,
+				     struct nft_regs *regs,
+				     const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *meta = nft_expr_priv(expr);
+	u32 *sreg = &regs->data[meta->sreg];
+	struct sk_buff *skb = pkt->skb;
+	u8 value8;
+
+	switch (meta->key) {
+	case NFT_META_BRI_BROUTE:
+		value8 = nft_reg_load8(sreg);
+		BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8;
+		break;
+	default:
+		nft_meta_set_eval(expr, regs, pkt);
+	}
+}
+
+static int nft_meta_bridge_set_init(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr,
+				    const struct nlattr * const tb[])
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int len;
+	int err;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (priv->key) {
+	case NFT_META_BRI_BROUTE:
+		len = sizeof(u8);
+		break;
+	default:
+		return nft_meta_set_init(ctx, expr, tb);
+	}
+
+	priv->len = len;
+	err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
 static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
 				       const struct nft_expr *expr)
 {
@@ -120,15 +167,33 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
 	return false;
 }
 
+static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
+					const struct nft_expr *expr,
+					const struct nft_data **data)
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	switch (priv->key) {
+	case NFT_META_BRI_BROUTE:
+		hooks = 1 << NF_BR_PRE_ROUTING;
+		break;
+	default:
+		return nft_meta_set_validate(ctx, expr, data);
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
 static const struct nft_expr_ops nft_meta_bridge_set_ops = {
 	.type		= &nft_meta_bridge_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
-	.eval		= nft_meta_set_eval,
-	.init		= nft_meta_set_init,
+	.eval		= nft_meta_bridge_set_eval,
+	.init		= nft_meta_bridge_set_init,
 	.destroy	= nft_meta_set_destroy,
 	.dump		= nft_meta_set_dump,
 	.reduce		= nft_meta_bridge_set_reduce,
-	.validate	= nft_meta_set_validate,
+	.validate	= nft_meta_bridge_set_validate,
 };
 
 static const struct nft_expr_ops *
-- 
cgit v1.2.3


From 10369080454d87ee5b2db211ce947cb3118f0e13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 Mar 2023 14:59:59 +0000
Subject: net: reclaim skb->scm_io_uring bit

Commit 0091bfc81741 ("io_uring/af_unix: defer registered
files gc to io_uring release") added one bit to struct sk_buff.

This structure is critical for networking, and we try very hard
to not add bloat on it, unless absolutely required.

For instance, we can use a specific destructor as a wrapper
around unix_destruct_scm(), to identify skbs that unix_gc()
has to special case.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 include/net/af_unix.h  | 1 +
 io_uring/rsrc.c        | 3 +--
 net/unix/garbage.c     | 2 +-
 net/unix/scm.c         | 6 ++++++
 5 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff7ad331fb82..fe661011644b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -810,7 +810,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
- *	@scm_io_uring: SKB holds io_uring registered files
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required
@@ -989,7 +988,6 @@ struct sk_buff {
 #endif
 	__u8			slow_gro:1;
 	__u8			csum_not_inet:1;
-	__u8			scm_io_uring:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 480fa579787e..45ebde587138 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -11,6 +11,7 @@
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_destruct_scm(struct sk_buff *skb);
+void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a59fc02de598..27ceda3b50cf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -867,8 +867,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
 
 		UNIXCB(skb).fp = fpl;
 		skb->sk = sk;
-		skb->scm_io_uring = 1;
-		skb->destructor = unix_destruct_scm;
+		skb->destructor = io_uring_destruct_scm;
 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 	}
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index dc2763540393..2405f0f9af31 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -305,7 +305,7 @@ void unix_gc(void)
 	 * release.path eventually putting registered files.
 	 */
 	skb_queue_walk_safe(&hitlist, skb, next_skb) {
-		if (skb->scm_io_uring) {
+		if (skb->destructor == io_uring_destruct_scm) {
 			__skb_unlink(skb, &hitlist);
 			skb_queue_tail(&skb->sk->sk_receive_queue, skb);
 		}
diff --git a/net/unix/scm.c b/net/unix/scm.c
index aa27a02478dc..f9152881d77f 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -152,3 +152,9 @@ void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 EXPORT_SYMBOL(unix_destruct_scm);
+
+void io_uring_destruct_scm(struct sk_buff *skb)
+{
+	unix_destruct_scm(skb);
+}
+EXPORT_SYMBOL(io_uring_destruct_scm);
-- 
cgit v1.2.3


From 28e144cf5f72ce1c304571bc448e37c27495903a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:31:30 -0500
Subject: netfilter: move br_nf_check_hbh_len to utils

Rename br_nf_check_hbh_len() to nf_ip6_check_hbh_len() and move it
to netfilter utils, so that it can be used by other modules, like
ovs and tc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter_ipv6.h |  2 ++
 net/bridge/br_netfilter_ipv6.c | 55 +-----------------------------------------
 net/netfilter/utils.c          | 52 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 48314ade1506..7834c0be2831 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			unsigned int dataoff, u_int8_t protocol);
 
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen);
+
 int ipv6_netfilter_init(void);
 void ipv6_netfilter_fini(void);
 
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index a0d6dfb3e255..550039dfc31a 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -40,59 +40,6 @@
 #include <linux/sysctl.h>
 #endif
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
- * anyway
- */
-static int br_nf_check_hbh_len(struct sk_buff *skb, u32 *plen)
-{
-	int len, off = sizeof(struct ipv6hdr);
-	unsigned char *nh;
-
-	if (!pskb_may_pull(skb, off + 8))
-		return -1;
-	nh = (unsigned char *)(ipv6_hdr(skb) + 1);
-	len = (nh[1] + 1) << 3;
-
-	if (!pskb_may_pull(skb, off + len))
-		return -1;
-	nh = skb_network_header(skb);
-
-	off += 2;
-	len -= 2;
-	while (len > 0) {
-		int optlen;
-
-		if (nh[off] == IPV6_TLV_PAD1) {
-			off++;
-			len--;
-			continue;
-		}
-		if (len < 2)
-			return -1;
-		optlen = nh[off + 1] + 2;
-		if (optlen > len)
-			return -1;
-
-		if (nh[off] == IPV6_TLV_JUMBO) {
-			u32 pkt_len;
-
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				return -1;
-			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				return -1;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				return -1;
-			*plen = pkt_len;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-
-	return len ? -1 : 0;
-}
-
 int br_validate_ipv6(struct net *net, struct sk_buff *skb)
 {
 	const struct ipv6hdr *hdr;
@@ -112,7 +59,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
 		goto inhdr_error;
 
 	pkt_len = ntohs(hdr->payload_len);
-	if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb, &pkt_len))
+	if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
 		goto drop;
 
 	if (pkt_len + ip6h_len > skb->len) {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 2182d361e273..acef4155f0da 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -215,3 +215,55 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
 	}
 	return ret;
 }
+
+/* Only get and check the lengths, not do any hop-by-hop stuff. */
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
+{
+	int len, off = sizeof(struct ipv6hdr);
+	unsigned char *nh;
+
+	if (!pskb_may_pull(skb, off + 8))
+		return -ENOMEM;
+	nh = (unsigned char *)(ipv6_hdr(skb) + 1);
+	len = (nh[1] + 1) << 3;
+
+	if (!pskb_may_pull(skb, off + len))
+		return -ENOMEM;
+	nh = skb_network_header(skb);
+
+	off += 2;
+	len -= 2;
+	while (len > 0) {
+		int optlen;
+
+		if (nh[off] == IPV6_TLV_PAD1) {
+			off++;
+			len--;
+			continue;
+		}
+		if (len < 2)
+			return -EBADMSG;
+		optlen = nh[off + 1] + 2;
+		if (optlen > len)
+			return -EBADMSG;
+
+		if (nh[off] == IPV6_TLV_JUMBO) {
+			u32 pkt_len;
+
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				return -EBADMSG;
+			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				return -EBADMSG;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				return -EBADMSG;
+			*plen = pkt_len;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+
+	return len ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
-- 
cgit v1.2.3


From 071c44e4278156f18a6a56958617223b6bffa6ab Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 13 Feb 2023 23:05:58 -0800
Subject: sched/idle: Mark arch_cpu_idle_dead() __noreturn

Before commit 076cbf5d2163 ("x86/xen: don't let xen_pv_play_dead()
return"), in Xen, when a previously offlined CPU was brought back
online, it unexpectedly resumed execution where it left off in the
middle of the idle loop.

There were some hacks to make that work, but the behavior was surprising
as do_idle() doesn't expect an offlined CPU to return from the dead (in
arch_cpu_idle_dead()).

Now that Xen has been fixed, and the arch-specific implementations of
arch_cpu_idle_dead() also don't return, give it a __noreturn attribute.

This will cause the compiler to complain if an arch-specific
implementation might return.  It also improves code generation for both
caller and callee.

Also fixes the following warning:

  vmlinux.o: warning: objtool: do_idle+0x25f: unreachable instruction

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/60d527353da8c99d4cf13b6473131d46719ed16d.1676358308.git.jpoimboe@kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/alpha/kernel/process.c     | 2 +-
 arch/arm/kernel/smp.c           | 2 +-
 arch/arm64/kernel/process.c     | 2 +-
 arch/csky/kernel/smp.c          | 2 +-
 arch/ia64/kernel/process.c      | 2 +-
 arch/loongarch/kernel/process.c | 2 +-
 arch/mips/kernel/process.c      | 2 +-
 arch/parisc/kernel/process.c    | 2 +-
 arch/powerpc/kernel/smp.c       | 2 +-
 arch/riscv/kernel/cpu-hotplug.c | 2 +-
 arch/s390/kernel/idle.c         | 2 +-
 arch/sh/kernel/idle.c           | 2 +-
 arch/sparc/kernel/process_64.c  | 2 +-
 arch/x86/kernel/process.c       | 2 +-
 arch/xtensa/kernel/smp.c        | 2 +-
 include/linux/cpu.h             | 2 +-
 kernel/sched/idle.c             | 2 +-
 tools/objtool/check.c           | 1 +
 18 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index a82fefa31bdb..582d96548385 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -60,7 +60,7 @@ void arch_cpu_idle(void)
 	wtint(0);
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	wtint(INT_MAX);
 	BUG();
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 441ea5cff390..d6be4507d22d 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -320,7 +320,7 @@ void __cpu_die(unsigned int cpu)
  * of the other hotplug-cpu capable cores, so presumably coming
  * out of idle fixes this.
  */
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	unsigned int cpu = smp_processor_id();
 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 71d59b5abede..089ced6d6bd6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -69,7 +69,7 @@ void (*pm_power_off)(void);
 EXPORT_SYMBOL_GPL(pm_power_off);
 
 #ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
        cpu_die();
 }
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 0ec20efaf5fd..9c7a20b73ac6 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -300,7 +300,7 @@ void __cpu_die(unsigned int cpu)
 	pr_notice("CPU%u: shutdown\n", cpu);
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	idle_task_exit();
 
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 78f5794b2dde..9a5cd9fad3a9 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -225,7 +225,7 @@ static inline void __noreturn play_dead(void)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	play_dead();
 }
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index fa2443c7afb2..b71e17c1cc0c 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -62,7 +62,7 @@ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
 
 #ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	play_dead();
 }
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 093dbbd6b843..a3225912c862 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -40,7 +40,7 @@
 #include <asm/stacktrace.h>
 
 #ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	play_dead();
 }
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index c064719b49b0..97c6f875bd0e 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL(running_on_qemu);
 /*
  * Called from the idle thread for the CPU which has been shutdown.
  */
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 #ifdef CONFIG_HOTPLUG_CPU
 	idle_task_exit();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6b90f10a6c81..f62e5e651bcd 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1752,7 +1752,7 @@ void __cpu_die(unsigned int cpu)
 		smp_ops->cpu_die(cpu);
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	/*
 	 * Disable on the down path. This will be re-enabled by
diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c
index f7a832e3a1d1..59b80211c25f 100644
--- a/arch/riscv/kernel/cpu-hotplug.c
+++ b/arch/riscv/kernel/cpu-hotplug.c
@@ -71,7 +71,7 @@ void __cpu_die(unsigned int cpu)
 /*
  * Called from the idle thread for the CPU which has been shutdown.
  */
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	idle_task_exit();
 
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 38e267c7bff7..e7239aaf428b 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -88,7 +88,7 @@ void arch_cpu_idle_exit(void)
 {
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	cpu_die();
 }
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 114f0c4abeac..d662503b0665 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -30,7 +30,7 @@ void default_idle(void)
 	clear_bl_bit();
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	play_dead();
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 91c2b8124527..b51d8fb0ecdc 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -95,7 +95,7 @@ void arch_cpu_idle(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	sched_preempt_enable_no_resched();
 	cpu_play_dead();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f1ec36caf1d8..3e30147a537e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -727,7 +727,7 @@ void arch_cpu_idle_enter(void)
 	local_touch_nmi();
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	play_dead();
 }
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 7bad78495536..054bd64eab19 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -322,7 +322,7 @@ void __cpu_die(unsigned int cpu)
 	pr_err("CPU%u: unable to kill\n", cpu);
 }
 
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
 {
 	cpu_die();
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f83e4519c5f0..8582a7142623 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -182,7 +182,7 @@ void arch_cpu_idle(void);
 void arch_cpu_idle_prepare(void);
 void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
-void arch_cpu_idle_dead(void);
+void __noreturn arch_cpu_idle_dead(void);
 
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 56e152f06d0f..342f58a329f5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -75,7 +75,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
 void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { while (1); }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
 void __weak arch_cpu_idle(void)
 {
 	cpu_idle_force_poll = 1;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f937be1afe65..37c36dc1d868 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"__reiserfs_panic",
 		"__stack_chk_fail",
 		"__ubsan_handle_builtin_unreachable",
+		"arch_cpu_idle_dead",
 		"cpu_bringup_and_idle",
 		"cpu_startup_entry",
 		"do_exit",
-- 
cgit v1.2.3


From b3816cf8138b6814375ec65673bcba91d368ee9f Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 8 Mar 2023 13:28:18 -0500
Subject: lsm: fix a badly named parameter in security_get_getsecurity()

There is no good reason for why the "_buffer" parameter needs an
underscore, get rid of it.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  2 +-
 security/security.c           | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 094b76dc7164..6bb55e61e8e8 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -381,7 +381,7 @@ LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred,
 LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key)
 LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred,
 	 enum key_need_perm need_perm)
-LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **_buffer)
+LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer)
 #endif /* CONFIG_KEYS */
 
 #ifdef CONFIG_AUDIT
diff --git a/security/security.c b/security/security.c
index 4c39cfe313bf..e6c275fff001 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4981,20 +4981,20 @@ int security_key_permission(key_ref_t key_ref, const struct cred *cred,
 /**
  * security_key_getsecurity() - Get the key's security label
  * @key: key
- * @_buffer: security label buffer
+ * @buffer: security label buffer
  *
  * Get a textual representation of the security context attached to a key for
  * the purposes of honouring KEYCTL_GETSECURITY.  This function allocates the
  * storage for the NUL-terminated string and the caller should free it.
  *
- * Return: Returns the length of @_buffer (including terminating NUL) or -ve if
+ * Return: Returns the length of @buffer (including terminating NUL) or -ve if
  *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
  *         there is no security label assigned to the key.
  */
-int security_key_getsecurity(struct key *key, char **_buffer)
+int security_key_getsecurity(struct key *key, char **buffer)
 {
-	*_buffer = NULL;
-	return call_int_hook(key_getsecurity, 0, key, _buffer);
+	*buffer = NULL;
+	return call_int_hook(key_getsecurity, 0, key, buffer);
 }
 #endif	/* CONFIG_KEYS */
 
-- 
cgit v1.2.3


From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add ability to register kfuncs that implement BPF open-coded iterator
contract and enforce naming and function proto convention. Enforcement
happens at the time of kfunc registration and significantly simplifies
the rest of iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistenly
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have arbitrary extra
  number of arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
  type and should have exactly one argument: `struct bpf_iter_<type> *`
  (const/volatile/restrict and typedefs are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
  should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
  a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allows to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   2 +
 include/linux/btf.h          |   4 ++
 kernel/bpf/btf.c             | 112 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };
 
+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a8cb09e5973b..71758cd15b07 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+				 const struct btf_type *func, u32 func_flags)
+{
+	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+	const char *name, *sfx, *iter_name;
+	const struct btf_param *arg;
+	const struct btf_type *t;
+	char exp_name[128];
+	u32 nr_args;
+
+	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+	if (!flags || (flags & (flags - 1)))
+		return -EINVAL;
+
+	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+	nr_args = btf_type_vlen(func);
+	if (nr_args < 1)
+		return -EINVAL;
+
+	arg = &btf_params(func)[0];
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!t || !btf_type_is_ptr(t))
+		return -EINVAL;
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+	if (!t || !__btf_type_is_struct(t))
+		return -EINVAL;
+
+	name = btf_name_by_offset(btf, t->name_off);
+	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+		return -EINVAL;
+
+	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+	 * fit nicely in stack slots
+	 */
+	if (t->size == 0 || (t->size % 8))
+		return -EINVAL;
+
+	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+	 * naming pattern
+	 */
+	iter_name = name + sizeof(ITER_PREFIX) - 1;
+	if (flags & KF_ITER_NEW)
+		sfx = "new";
+	else if (flags & KF_ITER_NEXT)
+		sfx = "next";
+	else /* (flags & KF_ITER_DESTROY) */
+		sfx = "destroy";
+
+	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+	if (strcmp(func_name, exp_name))
+		return -EINVAL;
+
+	/* only iter constructor should have extra arguments */
+	if (!(flags & KF_ITER_NEW) && nr_args != 1)
+		return -EINVAL;
+
+	if (flags & KF_ITER_NEXT) {
+		/* bpf_iter_<type>_next() should return pointer */
+		t = btf_type_skip_modifiers(btf, func->type, NULL);
+		if (!t || !btf_type_is_ptr(t))
+			return -EINVAL;
+	}
+
+	if (flags & KF_ITER_DESTROY) {
+		/* bpf_iter_<type>_destroy() should return void */
+		t = btf_type_by_id(btf, func->type);
+		if (!t || !btf_type_is_void(t))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+	const struct btf_type *func;
+	const char *func_name;
+	int err;
+
+	/* any kfunc should be FUNC -> FUNC_PROTO */
+	func = btf_type_by_id(btf, func_id);
+	if (!func || !btf_type_is_func(func))
+		return -EINVAL;
+
+	/* sanity check kfunc name */
+	func_name = btf_name_by_offset(btf, func->name_off);
+	if (!func_name || !func_name[0])
+		return -EINVAL;
+
+	func = btf_type_by_id(btf, func->type);
+	if (!func || !btf_type_is_func_proto(func))
+		return -EINVAL;
+
+	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Kernel Function (kfunc) BTF ID set registration API */
 
 static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 				       const struct btf_kfunc_id_set *kset)
 {
 	struct btf *btf;
-	int ret;
+	int ret, i;
 
 	btf = btf_get_module_btf(kset->owner);
 	if (!btf) {
@@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
+	for (i = 0; i < kset->set->cnt; i++) {
+		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+					     kset->set->pairs[i].flags);
+		if (ret)
+			goto err_out;
+	}
+
 	ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
 	btf_put(btf);
 	return ret;
 }
-- 
cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach verifier about the concept of the open-coded (or inline) iterators.

This patch adds generic iterator loop verification logic, new STACK_ITER
stack slot type to contain iterator state, and necessary kfunc plumbing
for iterator's constructor, destructor and next methods. Next patch
implements first specific iterator (numbers iterator for implementing
for() loop logic). Such split allows to have more focused commits for
verifier logic and separate commit that we could point later to
demonstrating  what does it take to add a new kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct bpf_iter_<type>
state is supposed to live on BPF program stack, so there will be no way
to change its size later on without breaking backwards compatibility, so
choose wisely! But given this struct is specific to a given <type> of
iterator, this allows a lot of flexibility: simple iterators could be
fine with just one stack slot (8 bytes), like numbers iterator in the
next patch, while some other more complicated iterators might need way
more to keep their iterator state. Either way, such design allows to
avoid runtime memory allocations, which otherwise would be necessary if
we fixed on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way BPF verifier logic is implemented, there are no artificial
restrictions on a number of active iterators, it should work correctly
using multiple active iterators at the same time. This also means you
can have multiple nested iteration loops. struct bpf_iter_<type>
reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using
number iterator implemented in next patch. Here's the simplest possible
loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is
exclusive and is not returned. This matches similar APIs (e.g., slices
in Go or Rust) that implement a range of elements, where end index is
non-inclusive.

In the above example, we see a trio of function:
  - constructor, bpf_iter_num_new(), which initializes iterator state
  (struct bpf_iter_num it) on the stack. If any of the input arguments
  are invalid, constructor should make sure to still initialize it such
  that subsequent bpf_iter_num_next() calls will return NULL. I.e., on
  error, return error and construct empty iterator.
  - next method, bpf_iter_num_next(), which accepts pointer to iterator
  state and produces an element. Next method should always return
  a pointer. The contract between BPF verifier is that next method will
  always eventually return NULL when elements are exhausted. Once NULL is
  returned, subsequent next calls should keep returning NULL. In the
  case of numbers iterator, bpf_iter_num_next() returns a pointer to an int
  (storage for this integer is inside the iterator state itself),
  which can be dereferenced after corresponding NULL check.
  - once done with the iterator, it's mandated that user cleans up its
  state with the call to destructor, bpf_iter_num_destroy() in this
  case. Destructor frees up any resources and marks stack space used by
  struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
applicable constructor/destructor/next are callable. I.e., verifier
ensures you can't pass number iterator state into, say, cgroup
iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one
of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h
in selftests, which allows to utilize iterator "trio" nicely without
having to code the above somewhat tedious loop explicitly every time.
This is enforced at kfunc registration point by one of the previous
patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add STACK_ITER stack slot type,
reserve necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of necessary extra state
in the "main" slot, which is marked with non-zero ref_obj_id. Other
slots are also marked as STACK_ITER, but have zero ref_obj_id. This is
simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*, which
simplifies implementation without sacrificing usability. So no need for
extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new
constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive comments:
in process_iter_next_call() and iter_active_depths_differ() explaining
some important and sometimes subtle aspects. Please refer to them for
details.

But from 10,000-foot point of view, next methods are the points of
forking a verification state, which are conceptually similar to what
verifier is doing when validating conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (new element is returned). NULL is
simulated first and is supposed to reach exit without looping. After
that non-NULL case is validated and it either reaches exit (for trivial
examples with no real loop), or reaches another `call bpf_iter_<type>_next`
instruction with the state equivalent to already (partially) validated
one. State equivalency at that point means we technically are going to
be looping forever without "breaking out" out of established "state
envelope" (i.e., subsequent iterations don't add any new knowledge or
constraints to the verifier state, so running 1, 2, 10, or a million of
them doesn't matter). But taking into account the contract stating that
iterator next method *has to* return NULL eventually, we can conclude
that loop body is safe and will eventually terminate. Given we validated
logic outside of the loop (NULL case), and concluded that loop body is
safe (though potentially looping many times), verifier can claim safety
of the overall program logic.

The rest of the patch is necessary plumbing for state tracking, marking,
validation, and necessary further kfunc plumbing to allow implementing
iterator constructor, destructor, and next methods.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  23 ++
 kernel/bpf/verifier.c        | 595 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 610 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {
 
 #define ITER_PREFIX "bpf_iter_"
 
+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;
 
+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d40fba6a1c0..45a082284464 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -302,6 +302,10 @@ struct bpf_kfunc_call_arg_meta {
 		enum bpf_dynptr_type type;
 		u32 id;
 	} initialized_dynptr;
+	struct {
+		u8 spi;
+		u8 frameno;
+	} iter;
 	u64 mem_size;
 };
 
@@ -668,6 +672,7 @@ static char slot_type_char[] = {
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -742,6 +747,11 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re
 	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
 }
 
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
 static const char *kernel_type_name(const struct btf* btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -766,6 +776,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 	}
 }
 
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
 static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
 {
 	env->scratched_regs |= 1U << regno;
@@ -1118,6 +1152,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 	}
 }
 
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_reg_state *reg, int insn_idx,
+				 struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j, id;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	id = acquire_reference_state(env, insn_idx);
+	if (id < 0)
+		return id;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		__mark_reg_known_zero(st);
+		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		st->live |= REG_LIVE_WRITTEN;
+		st->ref_obj_id = i == 0 ? id : 0;
+		st->iter.btf = btf;
+		st->iter.btf_id = btf_id;
+		st->iter.state = BPF_ITER_STATE_ACTIVE;
+		st->iter.depth = 0;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_ITER;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		if (i == 0)
+			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+		__mark_reg_not_init(env, st);
+
+		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+		st->live |= REG_LIVE_WRITTEN;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_INVALID;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+				     struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+	 * will do check_mem_access to check and update stack bounds later, so
+	 * return true for that case.
+	 */
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi == -ERANGE)
+		return true;
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] == STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		/* only main (first) slot has ref_obj_id set */
+		if (i == 0 && !st->ref_obj_id)
+			return false;
+		if (i != 0 && st->ref_obj_id)
+			return false;
+		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+			return false;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] != STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+/* Check if given stack slot is "special":
+ *   - spilled register state (STACK_SPILL);
+ *   - dynptr state (STACK_DYNPTR);
+ *   - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+	switch (type) {
+	case STACK_SPILL:
+	case STACK_DYNPTR:
+	case STACK_ITER:
+		return true;
+	case STACK_INVALID:
+	case STACK_MISC:
+	case STACK_ZERO:
+		return false;
+	default:
+		WARN_ONCE(1, "unknown stack slot type %d\n", type);
+		return true;
+	}
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -1267,6 +1452,19 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (reg->ref_obj_id)
 				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
 			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		default:
@@ -2710,6 +2908,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
 			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
 }
 
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			  int spi, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int err, i;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+		if (err)
+			return err;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
 /* This function is supposed to be used by the following 32-bit optimization
  * code only. It returns TRUE if the source or destination register operates
  * on 64-bit, otherwise return FALSE.
@@ -3691,8 +3908,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
-		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
-		if (is_spilled_reg(&state->stack[spi]))
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+		if (is_stack_slot_special(&state->stack[spi]))
 			for (i = 0; i < BPF_REG_SIZE; i++)
 				scrub_spilled_slot(&state->stack[spi].slot_type[i]);
 
@@ -6506,6 +6723,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	return err;
 }
 
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+	struct bpf_func_state *state = func(env, reg);
+
+	return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+	/* btf_check_iter_kfuncs() guarantees that first argument of any iter
+	 * kfunc is iter state pointer
+	 */
+	return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+			    struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	const struct btf_type *t;
+	const struct btf_param *arg;
+	int spi, err, i, nr_slots;
+	u32 btf_id;
+
+	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+	arg = &btf_params(meta->func_proto)[0];
+	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */
+	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
+	nr_slots = t->size / BPF_REG_SIZE;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0 && spi != -ERANGE)
+		return spi;
+
+	meta->iter.spi = spi;
+	meta->iter.frameno = reg->frameno;
+
+	if (is_iter_new_kfunc(meta)) {
+		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
+		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+			err = check_mem_access(env, insn_idx, regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		if (err)
+			return err;
+	} else {
+		/* iter_next() or iter_destroy() expect initialized iter state*/
+		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+			verbose(env, "expected an initialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		err = mark_iter_read(env, reg, spi, nr_slots);
+		if (err)
+			return err;
+
+		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+		if (is_iter_destroy_kfunc(meta)) {
+			err = unmark_stack_slots_iter(env, reg, nr_slots);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+				  struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+	struct bpf_reg_state *cur_iter, *queued_iter;
+	int iter_frameno = meta->iter.frameno;
+	int iter_spi = meta->iter.spi;
+
+	BTF_TYPE_EMIT(struct bpf_iter);
+
+	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+			cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+		return -EFAULT;
+	}
+
+	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* branch out active iter state */
+		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+		if (!queued_st)
+			return -ENOMEM;
+
+		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+		queued_iter->iter.depth++;
+
+		queued_fr = queued_st->frame[queued_st->curframe];
+		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+	}
+
+	/* switch to DRAINED state, but keep the depth unchanged */
+	/* mark current iter state as drained and assume returned NULL */
+	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+	return 0;
+}
+
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
 	return type == ARG_CONST_SIZE ||
@@ -9099,6 +9513,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */
 	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
+	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
 	KF_ARG_PTR_TO_LIST_NODE,
 	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */
@@ -9220,6 +9635,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
+	if (is_kfunc_arg_iter(meta, argno))
+		return KF_ARG_PTR_TO_ITER;
+
 	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_LIST_HEAD;
 
@@ -9848,6 +10266,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
+		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
 		case KF_ARG_PTR_TO_RB_ROOT:
@@ -9944,6 +10363,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			break;
 		}
+		case KF_ARG_PTR_TO_ITER:
+			ret = process_iter_arg(env, regno, insn_idx, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		case KF_ARG_PTR_TO_LIST_HEAD:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -10148,6 +10572,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	desc_btf = meta.btf;
 	insn_aux = &env->insn_aux_data[insn_idx];
 
+	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
 		return -EACCES;
@@ -10436,6 +10862,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			mark_btf_func_reg_size(env, regno, t->size);
 	}
 
+	if (is_iter_next_kfunc(&meta)) {
+		err = process_iter_next_call(env, insn_idx, &meta);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -13548,6 +13980,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			struct bpf_kfunc_call_arg_meta meta;
+
+			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+			if (ret == 0 && is_iter_next_kfunc(&meta))
+				mark_prune_point(env, t);
+		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
 	case BPF_JA:
@@ -14301,6 +14740,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	 * didn't use them
 	 */
 	for (i = 0; i < old->allocated_stack; i++) {
+		struct bpf_reg_state *old_reg, *cur_reg;
+
 		spi = i / BPF_REG_SIZE;
 
 		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -14357,9 +14798,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 				return false;
 			break;
 		case STACK_DYNPTR:
-		{
-			const struct bpf_reg_state *old_reg, *cur_reg;
-
 			old_reg = &old->stack[spi].spilled_ptr;
 			cur_reg = &cur->stack[spi].spilled_ptr;
 			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -14367,7 +14805,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
 				return false;
 			break;
-		}
+		case STACK_ITER:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			/* iter.depth is not compared between states as it
+			 * doesn't matter for correctness and would otherwise
+			 * prevent convergence; we maintain it only to prevent
+			 * infinite loop check triggering, see
+			 * iter_active_depths_differ()
+			 */
+			if (old_reg->iter.btf != cur_reg->iter.btf ||
+			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+			    old_reg->iter.state != cur_reg->iter.state ||
+			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
@@ -14626,6 +15079,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
 	return true;
 }
 
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ *   0: again:                          ; set up iter_next() call args
+ *   1:   r1 = &it                      ; <CHECKPOINT HERE>
+ *   2:   call bpf_iter_num_next        ; this is iter_next() call
+ *   3:   if r0 == 0 goto done
+ *   4:   ... something useful here ...
+ *   5:   goto again                    ; another iteration
+ *   6: done:
+ *   7:   r1 = &it
+ *   8:   call bpf_iter_num_destroy     ; clean up iter state
+ *   9:   exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ *   struct bpf_iter_num it;
+ *   int *p, x;
+ *
+ *   bpf_iter_num_new(&it, 0, 10);
+ *   while ((p = bpf_iter_num_next(&t))) {
+ *       x = p;
+ *       while (x--) {} // <<-- infinite loop here
+ *   }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+	struct bpf_reg_state *slot, *cur_slot;
+	struct bpf_func_state *state;
+	int i, fr;
+
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_ITER)
+				continue;
+
+			slot = &state->stack[i].spilled_ptr;
+			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+				continue;
+
+			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+			if (cur_slot->iter.depth != slot->iter.depth)
+				return true;
+		}
+	}
+	return false;
+}
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
@@ -14673,8 +15212,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				 * Since the verifier still needs to catch infinite loops
 				 * inside async callbacks.
 				 */
-			} else if (states_maybe_looping(&sl->state, cur) &&
-				   states_equal(env, &sl->state, cur)) {
+				goto skip_inf_loop_check;
+			}
+			/* BPF open-coded iterators loop detection is special.
+			 * states_maybe_looping() logic is too simplistic in detecting
+			 * states that *might* be equivalent, because it doesn't know
+			 * about ID remapping, so don't even perform it.
+			 * See process_iter_next_call() and iter_active_depths_differ()
+			 * for overview of the logic. When current and one of parent
+			 * states are detected as equivalent, it's a good thing: we prove
+			 * convergence and can stop simulating further iterations.
+			 * It's safe to assume that iterator loop will finish, taking into
+			 * account iter_next() contract of eventually returning
+			 * sticky NULL result.
+			 */
+			if (is_iter_next_insn(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur)) {
+					struct bpf_func_state *cur_frame;
+					struct bpf_reg_state *iter_state, *iter_reg;
+					int spi;
+
+					cur_frame = cur->frame[cur->curframe];
+					/* btf_check_iter_kfuncs() enforces that
+					 * iter state pointer is always the first arg
+					 */
+					iter_reg = &cur_frame->regs[BPF_REG_1];
+					/* current state is valid due to states_equal(),
+					 * so we can assume valid iter and reg state,
+					 * no need for extra (re-)validations
+					 */
+					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+						goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
+			/* attempt to detect infinite loop to avoid unnecessary doomed work */
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur) &&
+			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				return -EINVAL;
@@ -14691,6 +15268,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * This threshold shouldn't be too high either, since states
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
+skip_inf_loop_check:
 			if (!env->test_state_freq &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
@@ -14698,6 +15276,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			goto miss;
 		}
 		if (states_equal(env, &sl->state, cur)) {
+hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.
-- 
cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

It's public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next() which will keep returning read-only pointer to int
    until the range is exhausted, at which point NULL will be returned.
    If bpf_iter_num_next() is kept calling after this, NULL will be
    persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known and enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known or verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 +++--
 include/uapi/linux/bpf.h       |  8 +++++
 kernel/bpf/bpf_iter.c          | 70 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c           |  3 ++
 tools/include/uapi/linux/bpf.h |  8 +++++
 5 files changed, 95 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+struct bpf_iter_num_kern {
+	int cur; /* current value, inclusive */
+	int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+	BTF_TYPE_EMIT(struct btf_iter_num);
+
+	/* start == end is legit, it's an empty range and we'll just get NULL
+	 * on first (and any subsequent) bpf_iter_num_next() call
+	 */
+	if (start > end) {
+		s->cur = s->end = 0;
+		return -EINVAL;
+	}
+
+	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+		s->cur = s->end = 0;
+		return -E2BIG;
+	}
+
+	/* user will call bpf_iter_num_next() first,
+	 * which will set s->cur to exactly start value;
+	 * underflow shouldn't matter
+	 */
+	s->cur = start - 1;
+	s->end = end;
+
+	return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	/* check failed initialization or if we are done (same behavior);
+	 * need to be careful about overflow, so convert to s64 for checks,
+	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
+	 * s->cur + 1 >= s->end
+	 */
+	if ((s64)(s->cur + 1) >= s->end) {
+		s->cur = s->end = 0;
+		return NULL;
+	}
+
+	s->cur++;
+
+	return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 637ac4e92e75..f9b7eeedce08 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 6486a57f05d3d40aa8698908de16feebe52e9f13 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 28 Feb 2023 15:11:06 +0300
Subject: livepatch: fix ELF typos

ELF is acronym.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/Y/3vWjQ/SBA5a0i5@p183
---
 Documentation/livepatch/module-elf-format.rst | 20 ++++++++++----------
 include/linux/module.h                        |  6 +++---
 kernel/module/livepatch.c                     | 10 +++++-----
 3 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/livepatch/module-elf-format.rst b/Documentation/livepatch/module-elf-format.rst
index d48f530c0881..a03ed02ec57e 100644
--- a/Documentation/livepatch/module-elf-format.rst
+++ b/Documentation/livepatch/module-elf-format.rst
@@ -1,8 +1,8 @@
 ===========================
-Livepatch module Elf format
+Livepatch module ELF format
 ===========================
 
-This document outlines the Elf format requirements that livepatch modules must follow.
+This document outlines the ELF format requirements that livepatch modules must follow.
 
 
 .. Table of Contents
@@ -20,17 +20,17 @@ code. So, instead of duplicating code and re-implementing what the module
 loader can already do, livepatch leverages existing code in the module
 loader to perform the all the arch-specific relocation work. Specifically,
 livepatch reuses the apply_relocate_add() function in the module loader to
-write relocations. The patch module Elf format described in this document
+write relocations. The patch module ELF format described in this document
 enables livepatch to be able to do this. The hope is that this will make
 livepatch more easily portable to other architectures and reduce the amount
 of arch-specific code required to port livepatch to a particular
 architecture.
 
 Since apply_relocate_add() requires access to a module's section header
-table, symbol table, and relocation section indices, Elf information is
+table, symbol table, and relocation section indices, ELF information is
 preserved for livepatch modules (see section 5). Livepatch manages its own
 relocation sections and symbols, which are described in this document. The
-Elf constants used to mark livepatch symbols and relocation sections were
+ELF constants used to mark livepatch symbols and relocation sections were
 selected from OS-specific ranges according to the definitions from glibc.
 
 Why does livepatch need to write its own relocations?
@@ -43,7 +43,7 @@ reject the livepatch module. Furthermore, we cannot apply relocations that
 affect modules not yet loaded at patch module load time (e.g. a patch to a
 driver that is not loaded). Formerly, livepatch solved this problem by
 embedding special "dynrela" (dynamic rela) sections in the resulting patch
-module Elf output. Using these dynrela sections, livepatch could resolve
+module ELF output. Using these dynrela sections, livepatch could resolve
 symbols while taking into account its scope and what module the symbol
 belongs to, and then manually apply the dynamic relocations. However this
 approach required livepatch to supply arch-specific code in order to write
@@ -80,7 +80,7 @@ Example:
 3. Livepatch relocation sections
 ================================
 
-A livepatch module manages its own Elf relocation sections to apply
+A livepatch module manages its own ELF relocation sections to apply
 relocations to modules as well as to the kernel (vmlinux) at the
 appropriate time. For example, if a patch module patches a driver that is
 not currently loaded, livepatch will apply the corresponding livepatch
@@ -95,7 +95,7 @@ also possible for a livepatch module to have no livepatch relocation
 sections, as in the case of the sample livepatch module (see
 samples/livepatch).
 
-Since Elf information is preserved for livepatch modules (see Section 5), a
+Since ELF information is preserved for livepatch modules (see Section 5), a
 livepatch relocation section can be applied simply by passing in the
 appropriate section index to apply_relocate_add(), which then uses it to
 access the relocation section and apply the relocations.
@@ -291,12 +291,12 @@ Examples:
   Note that the 'Ndx' (Section index) for these symbols is SHN_LIVEPATCH (0xff20).
   "OS" means OS-specific.
 
-5. Symbol table and Elf section access
+5. Symbol table and ELF section access
 ======================================
 A livepatch module's symbol table is accessible through module->symtab.
 
 Since apply_relocate_add() requires access to a module's section headers,
-symbol table, and relocation section indices, Elf information is preserved for
+symbol table, and relocation section indices, ELF information is preserved for
 livepatch modules and is made accessible by the module loader through
 module->klp_info, which is a :c:type:`klp_modinfo` struct. When a livepatch module
 loads, this struct is filled in by the module loader.
diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..e65a71b04490 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -353,9 +353,9 @@ struct mod_kallsyms {
 
 #ifdef CONFIG_LIVEPATCH
 /**
- * struct klp_modinfo - Elf information preserved from the livepatch module
+ * struct klp_modinfo - ELF information preserved from the livepatch module
  *
- * @hdr: Elf header
+ * @hdr: ELF header
  * @sechdrs: Section header table
  * @secstrings: String table for the section headers
  * @symndx: The symbol table section index
@@ -523,7 +523,7 @@ struct module {
 	bool klp; /* Is this a livepatch module? */
 	bool klp_alive;
 
-	/* Elf information */
+	/* ELF information */
 	struct klp_modinfo *klp_info;
 #endif
 
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
index 486d4ff92719..a89f01e1d6b7 100644
--- a/kernel/module/livepatch.c
+++ b/kernel/module/livepatch.c
@@ -11,7 +11,7 @@
 #include "internal.h"
 
 /*
- * Persist Elf information about a module. Copy the Elf header,
+ * Persist ELF information about a module. Copy the ELF header,
  * section header table, section string table, and symtab section
  * index from info to mod->klp_info.
  */
@@ -25,11 +25,11 @@ int copy_module_elf(struct module *mod, struct load_info *info)
 	if (!mod->klp_info)
 		return -ENOMEM;
 
-	/* Elf header */
+	/* ELF header */
 	size = sizeof(mod->klp_info->hdr);
 	memcpy(&mod->klp_info->hdr, info->hdr, size);
 
-	/* Elf section header table */
+	/* ELF section header table */
 	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
 	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
 	if (!mod->klp_info->sechdrs) {
@@ -37,7 +37,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
 		goto free_info;
 	}
 
-	/* Elf section name string table */
+	/* ELF section name string table */
 	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
 	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
 	if (!mod->klp_info->secstrings) {
@@ -45,7 +45,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
 		goto free_sechdrs;
 	}
 
-	/* Elf symbol section index */
+	/* ELF symbol section index */
 	symndx = info->index.sym;
 	mod->klp_info->symndx = symndx;
 
-- 
cgit v1.2.3


From 4821a076eb602a6238528e9ebafeac853c833415 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:23:26 -0500
Subject: sctp: add fair capacity stream scheduler

As it says in rfc8260#section-3.5 about the fair capacity scheduler:

   A fair capacity distribution between the streams is used.  This
   scheduler considers the lengths of the messages of each stream and
   schedules them in a specific way to maintain an equal capacity for
   all streams.  The details are implementation dependent.  interleaving
   user messages allows for a better realization of the fair capacity
   usage.

This patch adds Fair Capacity Scheduler based on the foundations added
by commit 5bbbbe32a431 ("sctp: introduce stream scheduler foundations"):

A fc_list and a fc_length are added into struct sctp_stream_out_ext and
a fc_list is added into struct sctp_stream. In .enqueue, when there are
chunks enqueued into a stream, this stream will be linked into stream->
fc_list by its fc_list ordered by its fc_length. In .dequeue, it always
picks up the 1st skb from stream->fc_list. In .dequeue_done, fc_length
is increased by chunk's len and update its location in stream->fc_list
according to the its new fc_length.

Note that when the new fc_length overflows in .dequeue_done, instead of
resetting all fc_lengths to 0, we only reduced them by U32_MAX / 4 to
avoid a moment of imbalance in the scheduling, as Marcelo suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sctp/stream_sched.h |   1 +
 include/net/sctp/structs.h      |   7 ++
 include/uapi/linux/sctp.h       |   3 +-
 net/sctp/Makefile               |   3 +-
 net/sctp/stream_sched.c         |   1 +
 net/sctp/stream_sched_fc.c      | 183 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 net/sctp/stream_sched_fc.c

(limited to 'include')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index fa00dc20a0d7..913170710adb 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -58,5 +58,6 @@ void sctp_sched_ops_register(enum sctp_sched_type sched,
 			     struct sctp_sched_ops *sched_ops);
 void sctp_sched_ops_prio_init(void);
 void sctp_sched_ops_rr_init(void);
+void sctp_sched_ops_fc_init(void);
 
 #endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e1f6e7fc2b11..2f1c9f50b352 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1429,6 +1429,10 @@ struct sctp_stream_out_ext {
 		struct {
 			struct list_head rr_list;
 		};
+		struct {
+			struct list_head fc_list;
+			__u32 fc_length;
+		};
 	};
 };
 
@@ -1475,6 +1479,9 @@ struct sctp_stream {
 			/* The next stream in line */
 			struct sctp_stream_out_ext *rr_next;
 		};
+		struct {
+			struct list_head fc_list;
+		};
 	};
 	struct sctp_stream_interleave *si;
 };
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index ed7d4ecbf53d..6814c5a1c4bc 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1211,7 +1211,8 @@ enum sctp_sched_type {
 	SCTP_SS_DEFAULT = SCTP_SS_FCFS,
 	SCTP_SS_PRIO,
 	SCTP_SS_RR,
-	SCTP_SS_MAX = SCTP_SS_RR
+	SCTP_SS_FC,
+	SCTP_SS_MAX = SCTP_SS_FC
 };
 
 /* Probe Interval socket option */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index e845e4588535..0448398408d8 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -13,7 +13,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
-	  stream_sched_rr.o stream_interleave.o
+	  stream_sched_rr.o stream_sched_fc.o \
+	  stream_interleave.o
 
 sctp_diag-y := diag.o
 
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 330067002deb..1ebd14ef8daa 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -124,6 +124,7 @@ void sctp_sched_ops_init(void)
 	sctp_sched_ops_fcfs_init();
 	sctp_sched_ops_prio_init();
 	sctp_sched_ops_rr_init();
+	sctp_sched_ops_fc_init();
 }
 
 static void sctp_sched_free_sched(struct sctp_stream *stream)
diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c
new file mode 100644
index 000000000000..b336c2f5486b
--- /dev/null
+++ b/net/sctp/stream_sched_fc.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2022
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Fair Capacity handling
+ * RFC 8260 section 3.5
+ */
+static void sctp_sched_fc_unsched_all(struct sctp_stream *stream);
+
+static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid,
+			     __u16 weight, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid,
+			     __u16 *value)
+{
+	return 0;
+}
+
+static int sctp_sched_fc_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->fc_list);
+
+	return 0;
+}
+
+static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid,
+				  gfp_t gfp)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	INIT_LIST_HEAD(&soute->fc_list);
+	soute->fc_length = 0;
+
+	return 0;
+}
+
+static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid)
+{
+}
+
+static void sctp_sched_fc_sched(struct sctp_stream *stream,
+				struct sctp_stream_out_ext *soute)
+{
+	struct sctp_stream_out_ext *pos;
+
+	if (!list_empty(&soute->fc_list))
+		return;
+
+	list_for_each_entry(pos, &stream->fc_list, fc_list)
+		if (pos->fc_length >= soute->fc_length)
+			break;
+	list_add_tail(&soute->fc_list, &pos->fc_list);
+}
+
+static void sctp_sched_fc_enqueue(struct sctp_outq *q,
+				  struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext);
+}
+
+static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		return NULL;
+
+	/* Find which chunk is next */
+	if (stream->out_curr)
+		soute = stream->out_curr->ext;
+	else
+		soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list);
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+
+	sctp_sched_dequeue_common(q, ch);
+	return ch;
+}
+
+static void sctp_sched_fc_dequeue_done(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute, *pos;
+	__u16 sid, i;
+
+	sid = sctp_chunk_stream_no(ch);
+	soute = SCTP_SO(stream, sid)->ext;
+	/* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */
+	if (soute->fc_length > U32_MAX - ch->skb->len) {
+		for (i = 0; i < stream->outcnt; i++) {
+			pos = SCTP_SO(stream, i)->ext;
+			if (!pos)
+				continue;
+			if (pos->fc_length <= (U32_MAX >> 2)) {
+				pos->fc_length = 0;
+				continue;
+			}
+			pos->fc_length -= (U32_MAX >> 2);
+		}
+	}
+	soute->fc_length += ch->skb->len;
+
+	if (list_empty(&soute->outq)) {
+		list_del_init(&soute->fc_list);
+		return;
+	}
+
+	pos = soute;
+	list_for_each_entry_continue(pos, &stream->fc_list, fc_list)
+		if (pos->fc_length >= soute->fc_length)
+			break;
+	list_move_tail(&soute->fc_list, &pos->fc_list);
+}
+
+static void sctp_sched_fc_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid = sctp_chunk_stream_no(ch);
+
+		if (SCTP_SO(stream, sid)->ext)
+			sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext);
+	}
+}
+
+static void sctp_sched_fc_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_out_ext *soute, *tmp;
+
+	list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list)
+		list_del_init(&soute->fc_list);
+}
+
+static struct sctp_sched_ops sctp_sched_fc = {
+	.set = sctp_sched_fc_set,
+	.get = sctp_sched_fc_get,
+	.init = sctp_sched_fc_init,
+	.init_sid = sctp_sched_fc_init_sid,
+	.free_sid = sctp_sched_fc_free_sid,
+	.enqueue = sctp_sched_fc_enqueue,
+	.dequeue = sctp_sched_fc_dequeue,
+	.dequeue_done = sctp_sched_fc_dequeue_done,
+	.sched_all = sctp_sched_fc_sched_all,
+	.unsched_all = sctp_sched_fc_unsched_all,
+};
+
+void sctp_sched_ops_fc_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc);
+}
-- 
cgit v1.2.3


From 42d452e7709fdb4d42376d2a97369e22cc80a5d2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:23:27 -0500
Subject: sctp: add weighted fair queueing stream scheduler

As it says in rfc8260#section-3.6 about the weighted fair queueing
scheduler:

   A Weighted Fair Queueing scheduler between the streams is used.  The
   weight is configurable per outgoing SCTP stream.  This scheduler
   considers the lengths of the messages of each stream and schedules
   them in a specific way to use the capacity according to the given
   weights.  If the weight of stream S1 is n times the weight of stream
   S2, the scheduler should assign to stream S1 n times the capacity it
   assigns to stream S2.  The details are implementation dependent.
   Interleaving user messages allows for a better realization of the
   capacity usage according to the given weights.

This patch adds Weighted Fair Queueing Scheduler actually based on
the code of Fair Capacity Scheduler by adding fc_weight into struct
sctp_stream_out_ext and taking it into account when sorting stream->
fc_list in sctp_sched_fc_sched() and sctp_sched_fc_dequeue_done().

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sctp/stream_sched.h |  1 +
 include/net/sctp/structs.h      |  1 +
 include/uapi/linux/sctp.h       |  3 ++-
 net/sctp/stream_sched.c         |  1 +
 net/sctp/stream_sched_fc.c      | 50 +++++++++++++++++++++++++++++++++++++----
 5 files changed, 51 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index 913170710adb..572d73fdcd5e 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -59,5 +59,6 @@ void sctp_sched_ops_register(enum sctp_sched_type sched,
 void sctp_sched_ops_prio_init(void);
 void sctp_sched_ops_rr_init(void);
 void sctp_sched_ops_fc_init(void);
+void sctp_sched_ops_wfq_init(void);
 
 #endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f1c9f50b352..a0933efd93c3 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1432,6 +1432,7 @@ struct sctp_stream_out_ext {
 		struct {
 			struct list_head fc_list;
 			__u32 fc_length;
+			__u16 fc_weight;
 		};
 	};
 };
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6814c5a1c4bc..b7d91d4cf0db 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1212,7 +1212,8 @@ enum sctp_sched_type {
 	SCTP_SS_PRIO,
 	SCTP_SS_RR,
 	SCTP_SS_FC,
-	SCTP_SS_MAX = SCTP_SS_FC
+	SCTP_SS_WFQ,
+	SCTP_SS_MAX = SCTP_SS_WFQ
 };
 
 /* Probe Interval socket option */
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 1ebd14ef8daa..e843760e9aaa 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -125,6 +125,7 @@ void sctp_sched_ops_init(void)
 	sctp_sched_ops_prio_init();
 	sctp_sched_ops_rr_init();
 	sctp_sched_ops_fc_init();
+	sctp_sched_ops_wfq_init();
 }
 
 static void sctp_sched_free_sched(struct sctp_stream *stream)
diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c
index b336c2f5486b..4bd18a497a6d 100644
--- a/net/sctp/stream_sched_fc.c
+++ b/net/sctp/stream_sched_fc.c
@@ -19,11 +19,32 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/stream_sched.h>
 
-/* Fair Capacity handling
- * RFC 8260 section 3.5
+/* Fair Capacity and Weighted Fair Queueing handling
+ * RFC 8260 section 3.5 and 3.6
  */
 static void sctp_sched_fc_unsched_all(struct sctp_stream *stream);
 
+static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid,
+			      __u16 weight, gfp_t gfp)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	if (!weight)
+		return -EINVAL;
+
+	soute->fc_weight = weight;
+	return 0;
+}
+
+static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid,
+			      __u16 *value)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	*value = soute->fc_weight;
+	return 0;
+}
+
 static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid,
 			     __u16 weight, gfp_t gfp)
 {
@@ -50,6 +71,7 @@ static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid,
 
 	INIT_LIST_HEAD(&soute->fc_list);
 	soute->fc_length = 0;
+	soute->fc_weight = 1;
 
 	return 0;
 }
@@ -67,7 +89,8 @@ static void sctp_sched_fc_sched(struct sctp_stream *stream,
 		return;
 
 	list_for_each_entry(pos, &stream->fc_list, fc_list)
-		if (pos->fc_length >= soute->fc_length)
+		if ((__u64)pos->fc_length * soute->fc_weight >=
+		    (__u64)soute->fc_length * pos->fc_weight)
 			break;
 	list_add_tail(&soute->fc_list, &pos->fc_list);
 }
@@ -137,7 +160,8 @@ static void sctp_sched_fc_dequeue_done(struct sctp_outq *q,
 
 	pos = soute;
 	list_for_each_entry_continue(pos, &stream->fc_list, fc_list)
-		if (pos->fc_length >= soute->fc_length)
+		if ((__u64)pos->fc_length * soute->fc_weight >=
+		    (__u64)soute->fc_length * pos->fc_weight)
 			break;
 	list_move_tail(&soute->fc_list, &pos->fc_list);
 }
@@ -181,3 +205,21 @@ void sctp_sched_ops_fc_init(void)
 {
 	sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc);
 }
+
+static struct sctp_sched_ops sctp_sched_wfq = {
+	.set = sctp_sched_wfq_set,
+	.get = sctp_sched_wfq_get,
+	.init = sctp_sched_fc_init,
+	.init_sid = sctp_sched_fc_init_sid,
+	.free_sid = sctp_sched_fc_free_sid,
+	.enqueue = sctp_sched_fc_enqueue,
+	.dequeue = sctp_sched_fc_dequeue,
+	.dequeue_done = sctp_sched_fc_dequeue_done,
+	.sched_all = sctp_sched_fc_sched_all,
+	.unsched_all = sctp_sched_fc_unsched_all,
+};
+
+void sctp_sched_ops_wfq_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq);
+}
-- 
cgit v1.2.3


From 85bce38a7e1f392f5051e67fd5fcfbffe71e9b81 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 17 Feb 2023 21:22:57 +0100
Subject: serial: 8250: Reorder fields in 'struct plat_serial8250_port'

Group some variables based on their sizes to reduce hole and avoid padding.
On x86_64, this shrinks the size of 'struct plat_serial8250_port'
from 144 to 128 bytes.

It saves a few bytes of memory.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/f3cb1efe1454e0615840fd331ee335bc441589a9.1676665358.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_8250.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 19376bee9667..741ed4807a9c 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -18,16 +18,16 @@ struct plat_serial8250_port {
 	unsigned long	iobase;		/* io base address */
 	void __iomem	*membase;	/* ioremap cookie or NULL */
 	resource_size_t	mapbase;	/* resource base */
+	unsigned int	uartclk;	/* UART clock rate */
 	unsigned int	irq;		/* interrupt number */
 	unsigned long	irqflags;	/* request_irq flags */
-	unsigned int	uartclk;	/* UART clock rate */
 	void            *private_data;
 	unsigned char	regshift;	/* register shift */
 	unsigned char	iotype;		/* UPIO_* */
 	unsigned char	hub6;
 	unsigned char	has_sysrq;	/* supports magic SysRq */
-	upf_t		flags;		/* UPF_* flags */
 	unsigned int	type;		/* If UPF_FIXED_TYPE */
+	upf_t		flags;		/* UPF_* flags */
 	unsigned int	(*serial_in)(struct uart_port *, int);
 	void		(*serial_out)(struct uart_port *, int, int);
 	void		(*set_termios)(struct uart_port *,
-- 
cgit v1.2.3


From 9b12f050c76f090cc6d0aebe0ef76fed79ec3f15 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@kernel.org>
Date: Wed, 22 Feb 2023 10:23:02 +0100
Subject: char: pcmcia: remove all the drivers

These char PCMCIA drivers are buggy[1] and receive only minimal care. It
was concluded[2], that we should try to remove most pcmcia drivers
completely. Let's start with these char broken one.

Note that I also removed a UAPI header: include/uapi/linux/cm4000_cs.h.
I found only coccinelle tests mentioning some ioctl constants from that
file. But they are not actually used. Anyway, should someone complain,
we may reintroduce the header (or its parts).

[1] https://lore.kernel.org/all/f41c2765-80e0-48bc-b1e4-8cfd3230fd4a@www.fastmail.com/
[2] https://lore.kernel.org/all/c5b39544-a4fb-4796-a046-0b9be9853787@app.fastmail.com/

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: "Hyunwoo Kim" <imv4bel@gmail.com>
Cc: Harald Welte <laforge@gnumonks.org>
Cc: Lubomir Rintel <lkundrak@v3.sk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Dominik Brodowski <linux@dominikbrodowski.net>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20230222092302.6348-2-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/process/magic-number.rst             |    1 -
 .../translations/it_IT/process/magic-number.rst    |    1 -
 .../translations/sp_SP/process/magic-number.rst    |    1 -
 .../translations/zh_CN/process/magic-number.rst    |    1 -
 .../translations/zh_TW/process/magic-number.rst    |    1 -
 Documentation/userspace-api/ioctl/ioctl-number.rst |    1 -
 MAINTAINERS                                        |   17 -
 arch/powerpc/configs/ppc6xx_defconfig              |    2 -
 drivers/char/Kconfig                               |    2 -
 drivers/char/Makefile                              |    1 -
 drivers/char/pcmcia/Kconfig                        |   59 -
 drivers/char/pcmcia/Makefile                       |   11 -
 drivers/char/pcmcia/cm4000_cs.c                    | 1912 ---------
 drivers/char/pcmcia/cm4040_cs.c                    |  684 ----
 drivers/char/pcmcia/cm4040_cs.h                    |   48 -
 drivers/char/pcmcia/scr24x_cs.c                    |  359 --
 drivers/char/pcmcia/synclink_cs.c                  | 4290 --------------------
 include/linux/cm4000_cs.h                          |   11 -
 include/uapi/linux/cm4000_cs.h                     |   64 -
 19 files changed, 7466 deletions(-)
 delete mode 100644 drivers/char/pcmcia/Kconfig
 delete mode 100644 drivers/char/pcmcia/Makefile
 delete mode 100644 drivers/char/pcmcia/cm4000_cs.c
 delete mode 100644 drivers/char/pcmcia/cm4040_cs.c
 delete mode 100644 drivers/char/pcmcia/cm4040_cs.h
 delete mode 100644 drivers/char/pcmcia/scr24x_cs.c
 delete mode 100644 drivers/char/pcmcia/synclink_cs.c
 delete mode 100644 include/linux/cm4000_cs.h
 delete mode 100644 include/uapi/linux/cm4000_cs.h

(limited to 'include')

diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index 64b5948fc1d4..7029c3c084ee 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -72,7 +72,6 @@ PG_MAGIC              'P'              pg_{read,write}_hdr      ``include/linux/
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 SLIP_MAGIC            0x5302           slip                     ``drivers/net/slip.h``
-MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
 BAYCOM_MAGIC          0x19730510       baycom_state             ``drivers/net/baycom_epp.c``
 HDLCDRV_MAGIC         0x5ac6e778       hdlcdrv_state            ``include/linux/hdlcdrv.h``
 KV_MAGIC              0x5f4b565f       kernel_vars_s            ``arch/mips/include/asm/sn/klkernvars.h``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 02eb7eb2448e..ae92ab633c16 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -78,7 +78,6 @@ PG_MAGIC              'P'              pg_{read,write}_hdr      ``include/linux/
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 SLIP_MAGIC            0x5302           slip                     ``drivers/net/slip.h``
-MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
 BAYCOM_MAGIC          0x19730510       baycom_state             ``drivers/net/baycom_epp.c``
 HDLCDRV_MAGIC         0x5ac6e778       hdlcdrv_state            ``include/linux/hdlcdrv.h``
 KV_MAGIC              0x5f4b565f       kernel_vars_s            ``arch/mips/include/asm/sn/klkernvars.h``
diff --git a/Documentation/translations/sp_SP/process/magic-number.rst b/Documentation/translations/sp_SP/process/magic-number.rst
index 2b62cec34e8e..7c7dfb4ba80b 100644
--- a/Documentation/translations/sp_SP/process/magic-number.rst
+++ b/Documentation/translations/sp_SP/process/magic-number.rst
@@ -77,7 +77,6 @@ PG_MAGIC              'P'              pg_{read,write}_hdr      ``include/linux/
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 SLIP_MAGIC            0x5302           slip                     ``drivers/net/slip.h``
-MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
 BAYCOM_MAGIC          0x19730510       baycom_state             ``drivers/net/baycom_epp.c``
 HDLCDRV_MAGIC         0x5ac6e778       hdlcdrv_state            ``include/linux/hdlcdrv.h``
 KV_MAGIC              0x5f4b565f       kernel_vars_s            ``arch/mips/include/asm/sn/klkernvars.h``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index 0617ce125e12..37ed33034134 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -61,7 +61,6 @@ PG_MAGIC              'P'              pg_{read,write}_hdr      ``include/linux/
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 SLIP_MAGIC            0x5302           slip                     ``drivers/net/slip.h``
-MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
 BAYCOM_MAGIC          0x19730510       baycom_state             ``drivers/net/baycom_epp.c``
 HDLCDRV_MAGIC         0x5ac6e778       hdlcdrv_state            ``include/linux/hdlcdrv.h``
 KV_MAGIC              0x5f4b565f       kernel_vars_s            ``arch/mips/include/asm/sn/klkernvars.h``
diff --git a/Documentation/translations/zh_TW/process/magic-number.rst b/Documentation/translations/zh_TW/process/magic-number.rst
index f3f7082e17c6..1d48e1bbf5f2 100644
--- a/Documentation/translations/zh_TW/process/magic-number.rst
+++ b/Documentation/translations/zh_TW/process/magic-number.rst
@@ -64,7 +64,6 @@ PG_MAGIC              'P'              pg_{read,write}_hdr      ``include/linux/
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 SLIP_MAGIC            0x5302           slip                     ``drivers/net/slip.h``
-MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
 BAYCOM_MAGIC          0x19730510       baycom_state             ``drivers/net/baycom_epp.c``
 HDLCDRV_MAGIC         0x5ac6e778       hdlcdrv_state            ``include/linux/hdlcdrv.h``
 KV_MAGIC              0x5f4b565f       kernel_vars_s            ``arch/mips/include/asm/sn/klkernvars.h``
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 0a1882e296ae..176e8fc3f31b 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -222,7 +222,6 @@ Code  Seq#    Include File                                           Comments
 'b'   00-FF                                                          conflict! bit3 vme host bridge
                                                                      <mailto:natalia@nikhefk.nikhef.nl>
 'b'   00-0F  linux/dma-buf.h                                         conflict!
-'c'   all    linux/cm4000_cs.h                                       conflict!
 'c'   00-7F  linux/comstats.h                                        conflict!
 'c'   00-7F  linux/coda.h                                            conflict!
 'c'   00-1F  linux/chio.h                                            conflict!
diff --git a/MAINTAINERS b/MAINTAINERS
index 8d5bc223f305..64068a601e59 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15322,18 +15322,6 @@ S:	Maintained
 F:	Documentation/filesystems/omfs.rst
 F:	fs/omfs/
 
-OMNIKEY CARDMAN 4000 DRIVER
-M:	Harald Welte <laforge@gnumonks.org>
-S:	Maintained
-F:	drivers/char/pcmcia/cm4000_cs.c
-F:	include/linux/cm4000_cs.h
-F:	include/uapi/linux/cm4000_cs.h
-
-OMNIKEY CARDMAN 4040 DRIVER
-M:	Harald Welte <laforge@gnumonks.org>
-S:	Maintained
-F:	drivers/char/pcmcia/cm4040_cs.*
-
 OMNIVISION OG01A1B SENSOR DRIVER
 M:	Shawn Tu <shawnx.tu@intel.com>
 L:	linux-media@vger.kernel.org
@@ -18609,11 +18597,6 @@ F:	include/linux/wait.h
 F:	include/uapi/linux/sched.h
 F:	kernel/sched/
 
-SCR24X CHIP CARD INTERFACE DRIVER
-M:	Lubomir Rintel <lkundrak@v3.sk>
-S:	Supported
-F:	drivers/char/pcmcia/scr24x_cs.c
-
 SCSI RDMA PROTOCOL (SRP) INITIATOR
 M:	Bart Van Assche <bvanassche@acm.org>
 L:	linux-rdma@vger.kernel.org
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 110258277959..d8729b94400e 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -614,8 +614,6 @@ CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_NVRAM=y
 CONFIG_DTLK=m
-CONFIG_CARDMAN_4000=m
-CONFIG_CARDMAN_4040=m
 CONFIG_IPWIRELESS=m
 CONFIG_I2C_CHARDEV=m
 CONFIG_I2C_HYDRA=m
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 30fe9848dac1..801d6c83f896 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -247,8 +247,6 @@ config SONYPI
 	  To compile this driver as a module, choose M here: the
 	  module will be called sonypi.
 
-source "drivers/char/pcmcia/Kconfig"
-
 config MWAVE
 	tristate "ACP Modem (Mwave) support"
 	depends on X86 && TTY
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 1b35d1724565..c5f532e412f1 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -35,7 +35,6 @@ obj-$(CONFIG_TELCLOCK)		+= tlclk.o
 
 obj-$(CONFIG_MWAVE)		+= mwave/
 obj-y				+= agp/
-obj-$(CONFIG_PCMCIA)		+= pcmcia/
 
 obj-$(CONFIG_HANGCHECK_TIMER)	+= hangcheck-timer.o
 obj-$(CONFIG_TCG_TPM)		+= tpm/
diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
deleted file mode 100644
index 26724990074c..000000000000
--- a/drivers/char/pcmcia/Kconfig
+++ /dev/null
@@ -1,59 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# PCMCIA character device configuration
-#
-
-menu "PCMCIA character devices"
-	depends on PCMCIA!=n
-
-config SYNCLINK_CS
-	tristate "SyncLink PC Card support"
-	depends on PCMCIA && TTY
-	help
-	  Enable support for the SyncLink PC Card serial adapter, running
-	  asynchronous and HDLC communications up to 512Kbps. The port is
-	  selectable for RS-232, V.35, RS-449, RS-530, and X.21
-
-	  This driver may be built as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want).
-	  The module will be called synclink_cs.  If you want to do that, say M
-	  here.
-
-config CARDMAN_4000
-	tristate "Omnikey Cardman 4000 support"
-	depends on PCMCIA
-	select BITREVERSE
-	help
-	  Enable support for the Omnikey Cardman 4000 PCMCIA Smartcard
-	  reader.
-
-	  This kernel driver requires additional userspace support, either
-	  by the vendor-provided PC/SC ifd_handler (http://www.omnikey.com/),
-	  or via the cm4000 backend of OpenCT (http://www.opensc-project.org/opensc).
-
-config CARDMAN_4040
-	tristate "Omnikey CardMan 4040 support"
-	depends on PCMCIA
-	help
-	  Enable support for the Omnikey CardMan 4040 PCMCIA Smartcard
-	  reader.
-
-	  This card is basically a USB CCID device connected to a FIFO
-	  in I/O space.  To use the kernel driver, you will need either the
-	  PC/SC ifdhandler provided from the Omnikey homepage
-	  (http://www.omnikey.com/), or a current development version of OpenCT
-	  (http://www.opensc-project.org/opensc).
-
-config SCR24X
-	tristate "SCR24x Chip Card Interface support"
-	depends on PCMCIA
-	help
-	  Enable support for the SCR24x PCMCIA Chip Card Interface.
-
-	  To compile this driver as a module, choose M here.
-	  The module will be called scr24x_cs..
-
-	  If unsure say N.
-
-endmenu
-
diff --git a/drivers/char/pcmcia/Makefile b/drivers/char/pcmcia/Makefile
deleted file mode 100644
index 024eed1c4ca5..000000000000
--- a/drivers/char/pcmcia/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# drivers/char/pcmcia/Makefile
-#
-# Makefile for the Linux PCMCIA char device drivers.
-#
-
-obj-$(CONFIG_SYNCLINK_CS) += synclink_cs.o
-obj-$(CONFIG_CARDMAN_4000) += cm4000_cs.o
-obj-$(CONFIG_CARDMAN_4040) += cm4040_cs.o
-obj-$(CONFIG_SCR24X) += scr24x_cs.o
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
deleted file mode 100644
index e656f42a28ac..000000000000
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ /dev/null
@@ -1,1912 +0,0 @@
- /*
-  * A driver for the PCMCIA Smartcard Reader "Omnikey CardMan Mobile 4000"
-  *
-  * cm4000_cs.c support.linux@omnikey.com
-  *
-  * Tue Oct 23 11:32:43 GMT 2001 herp - cleaned up header files
-  * Sun Jan 20 10:11:15 MET 2002 herp - added modversion header files
-  * Thu Nov 14 16:34:11 GMT 2002 mh   - added PPS functionality
-  * Tue Nov 19 16:36:27 GMT 2002 mh   - added SUSPEND/RESUME functionailty
-  * Wed Jul 28 12:55:01 CEST 2004 mh  - kernel 2.6 adjustments
-  *
-  * current version: 2.4.0gm4
-  *
-  * (C) 2000,2001,2002,2003,2004 Omnikey AG
-  *
-  * (C) 2005-2006 Harald Welte <laforge@gnumonks.org>
-  * 	- Adhere to Kernel process/coding-style.rst
-  * 	- Port to 2.6.13 "new" style PCMCIA
-  * 	- Check for copy_{from,to}_user return values
-  * 	- Use nonseekable_open()
-  * 	- add class interface for udev device creation
-  *
-  * All rights reserved. Licensed under dual BSD/GPL license.
-  */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/delay.h>
-#include <linux/bitrev.h>
-#include <linux/mutex.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/cisreg.h>
-#include <pcmcia/ciscode.h>
-#include <pcmcia/ds.h>
-
-#include <linux/cm4000_cs.h>
-
-/* #define ATR_CSUM */
-
-#define reader_to_dev(x)	(&x->p_dev->dev)
-
-/* n (debug level) is ignored */
-/* additional debug output may be enabled by re-compiling with
- * CM4000_DEBUG set */
-/* #define CM4000_DEBUG */
-#define DEBUGP(n, rdr, x, args...) do { 		\
-		dev_dbg(reader_to_dev(rdr), "%s:" x, 	\
-			   __func__ , ## args);		\
-	} while (0)
-
-static DEFINE_MUTEX(cmm_mutex);
-
-#define	T_1SEC		(HZ)
-#define	T_10MSEC	msecs_to_jiffies(10)
-#define	T_20MSEC	msecs_to_jiffies(20)
-#define	T_40MSEC	msecs_to_jiffies(40)
-#define	T_50MSEC	msecs_to_jiffies(50)
-#define	T_100MSEC	msecs_to_jiffies(100)
-#define	T_500MSEC	msecs_to_jiffies(500)
-
-static void cm4000_release(struct pcmcia_device *link);
-
-static int major;		/* major number we get from the kernel */
-
-/* note: the first state has to have number 0 always */
-
-#define	M_FETCH_ATR	0
-#define	M_TIMEOUT_WAIT	1
-#define	M_READ_ATR_LEN	2
-#define	M_READ_ATR	3
-#define	M_ATR_PRESENT	4
-#define	M_BAD_CARD	5
-#define M_CARDOFF	6
-
-#define	LOCK_IO			0
-#define	LOCK_MONITOR		1
-
-#define IS_AUTOPPS_ACT		 6
-#define	IS_PROCBYTE_PRESENT	 7
-#define	IS_INVREV		 8
-#define IS_ANY_T0		 9
-#define	IS_ANY_T1		10
-#define	IS_ATR_PRESENT		11
-#define	IS_ATR_VALID		12
-#define	IS_CMM_ABSENT		13
-#define	IS_BAD_LENGTH		14
-#define	IS_BAD_CSUM		15
-#define	IS_BAD_CARD		16
-
-#define REG_FLAGS0(x)		(x + 0)
-#define REG_FLAGS1(x)		(x + 1)
-#define REG_NUM_BYTES(x)	(x + 2)
-#define REG_BUF_ADDR(x)		(x + 3)
-#define REG_BUF_DATA(x)		(x + 4)
-#define REG_NUM_SEND(x)		(x + 5)
-#define REG_BAUDRATE(x)		(x + 6)
-#define REG_STOPBITS(x)		(x + 7)
-
-struct cm4000_dev {
-	struct pcmcia_device *p_dev;
-
-	unsigned char atr[MAX_ATR];
-	unsigned char rbuf[512];
-	unsigned char sbuf[512];
-
-	wait_queue_head_t devq;		/* when removing cardman must not be
-					   zeroed! */
-
-	wait_queue_head_t ioq;		/* if IO is locked, wait on this Q */
-	wait_queue_head_t atrq;		/* wait for ATR valid */
-	wait_queue_head_t readq;	/* used by write to wake blk.read */
-
-	/* warning: do not move this struct group.
-	 * initialising to zero depends on it - see ZERO_DEV below.  */
-	struct_group(init,
-	unsigned char atr_csum;
-	unsigned char atr_len_retry;
-	unsigned short atr_len;
-	unsigned short rlen;	/* bytes avail. after write */
-	unsigned short rpos;	/* latest read pos. write zeroes */
-	unsigned char procbyte;	/* T=0 procedure byte */
-	unsigned char mstate;	/* state of card monitor */
-	unsigned char cwarn;	/* slow down warning */
-	unsigned char flags0;	/* cardman IO-flags 0 */
-	unsigned char flags1;	/* cardman IO-flags 1 */
-	unsigned int mdelay;	/* variable monitor speeds, in jiffies */
-
-	unsigned int baudv;	/* baud value for speed */
-	unsigned char ta1;
-	unsigned char proto;	/* T=0, T=1, ... */
-	unsigned long flags;	/* lock+flags (MONITOR,IO,ATR) * for concurrent
-				   access */
-
-	unsigned char pts[4];
-
-	struct timer_list timer;	/* used to keep monitor running */
-	int monitor_running;
-	);
-};
-
-#define	ZERO_DEV(dev)	memset(&((dev)->init), 0, sizeof((dev)->init))
-
-static struct pcmcia_device *dev_table[CM4000_MAX_DEV];
-static struct class *cmm_class;
-
-/* This table doesn't use spaces after the comma between fields and thus
- * violates process/coding-style.rst.  However, I don't really think wrapping it around will
- * make it any clearer to read -HW */
-static unsigned char fi_di_table[10][14] = {
-/*FI     00   01   02   03   04   05   06   07   08   09   10   11   12   13 */
-/*DI */
-/* 0 */ {0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11},
-/* 1 */ {0x01,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x91,0x11,0x11,0x11,0x11},
-/* 2 */ {0x02,0x12,0x22,0x32,0x11,0x11,0x11,0x11,0x11,0x92,0xA2,0xB2,0x11,0x11},
-/* 3 */ {0x03,0x13,0x23,0x33,0x43,0x53,0x63,0x11,0x11,0x93,0xA3,0xB3,0xC3,0xD3},
-/* 4 */ {0x04,0x14,0x24,0x34,0x44,0x54,0x64,0x11,0x11,0x94,0xA4,0xB4,0xC4,0xD4},
-/* 5 */ {0x00,0x15,0x25,0x35,0x45,0x55,0x65,0x11,0x11,0x95,0xA5,0xB5,0xC5,0xD5},
-/* 6 */ {0x06,0x16,0x26,0x36,0x46,0x56,0x66,0x11,0x11,0x96,0xA6,0xB6,0xC6,0xD6},
-/* 7 */ {0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11},
-/* 8 */ {0x08,0x11,0x28,0x38,0x48,0x58,0x68,0x11,0x11,0x98,0xA8,0xB8,0xC8,0xD8},
-/* 9 */ {0x09,0x19,0x29,0x39,0x49,0x59,0x69,0x11,0x11,0x99,0xA9,0xB9,0xC9,0xD9}
-};
-
-#ifndef CM4000_DEBUG
-#define	xoutb	outb
-#define	xinb	inb
-#else
-static inline void xoutb(unsigned char val, unsigned short port)
-{
-	pr_debug("outb(val=%.2x,port=%.4x)\n", val, port);
-	outb(val, port);
-}
-static inline unsigned char xinb(unsigned short port)
-{
-	unsigned char val;
-
-	val = inb(port);
-	pr_debug("%.2x=inb(%.4x)\n", val, port);
-
-	return val;
-}
-#endif
-
-static inline unsigned char invert_revert(unsigned char ch)
-{
-	return bitrev8(~ch);
-}
-
-static void str_invert_revert(unsigned char *b, int len)
-{
-	int i;
-
-	for (i = 0; i < len; i++)
-		b[i] = invert_revert(b[i]);
-}
-
-#define	ATRLENCK(dev,pos) \
-	if (pos>=dev->atr_len || pos>=MAX_ATR) \
-		goto return_0;
-
-static unsigned int calc_baudv(unsigned char fidi)
-{
-	unsigned int wcrcf, wbrcf, fi_rfu, di_rfu;
-
-	fi_rfu = 372;
-	di_rfu = 1;
-
-	/* FI */
-	switch ((fidi >> 4) & 0x0F) {
-	case 0x00:
-		wcrcf = 372;
-		break;
-	case 0x01:
-		wcrcf = 372;
-		break;
-	case 0x02:
-		wcrcf = 558;
-		break;
-	case 0x03:
-		wcrcf = 744;
-		break;
-	case 0x04:
-		wcrcf = 1116;
-		break;
-	case 0x05:
-		wcrcf = 1488;
-		break;
-	case 0x06:
-		wcrcf = 1860;
-		break;
-	case 0x07:
-		wcrcf = fi_rfu;
-		break;
-	case 0x08:
-		wcrcf = fi_rfu;
-		break;
-	case 0x09:
-		wcrcf = 512;
-		break;
-	case 0x0A:
-		wcrcf = 768;
-		break;
-	case 0x0B:
-		wcrcf = 1024;
-		break;
-	case 0x0C:
-		wcrcf = 1536;
-		break;
-	case 0x0D:
-		wcrcf = 2048;
-		break;
-	default:
-		wcrcf = fi_rfu;
-		break;
-	}
-
-	/* DI */
-	switch (fidi & 0x0F) {
-	case 0x00:
-		wbrcf = di_rfu;
-		break;
-	case 0x01:
-		wbrcf = 1;
-		break;
-	case 0x02:
-		wbrcf = 2;
-		break;
-	case 0x03:
-		wbrcf = 4;
-		break;
-	case 0x04:
-		wbrcf = 8;
-		break;
-	case 0x05:
-		wbrcf = 16;
-		break;
-	case 0x06:
-		wbrcf = 32;
-		break;
-	case 0x07:
-		wbrcf = di_rfu;
-		break;
-	case 0x08:
-		wbrcf = 12;
-		break;
-	case 0x09:
-		wbrcf = 20;
-		break;
-	default:
-		wbrcf = di_rfu;
-		break;
-	}
-
-	return (wcrcf / wbrcf);
-}
-
-static unsigned short io_read_num_rec_bytes(unsigned int iobase,
-					    unsigned short *s)
-{
-	unsigned short tmp;
-
-	tmp = *s = 0;
-	do {
-		*s = tmp;
-		tmp = inb(REG_NUM_BYTES(iobase)) |
-				(inb(REG_FLAGS0(iobase)) & 4 ? 0x100 : 0);
-	} while (tmp != *s);
-
-	return *s;
-}
-
-static int parse_atr(struct cm4000_dev *dev)
-{
-	unsigned char any_t1, any_t0;
-	unsigned char ch, ifno;
-	int ix, done;
-
-	DEBUGP(3, dev, "-> parse_atr: dev->atr_len = %i\n", dev->atr_len);
-
-	if (dev->atr_len < 3) {
-		DEBUGP(5, dev, "parse_atr: atr_len < 3\n");
-		return 0;
-	}
-
-	if (dev->atr[0] == 0x3f)
-		set_bit(IS_INVREV, &dev->flags);
-	else
-		clear_bit(IS_INVREV, &dev->flags);
-	ix = 1;
-	ifno = 1;
-	ch = dev->atr[1];
-	dev->proto = 0;		/* XXX PROTO */
-	any_t1 = any_t0 = done = 0;
-	dev->ta1 = 0x11;	/* defaults to 9600 baud */
-	do {
-		if (ifno == 1 && (ch & 0x10)) {
-			/* read first interface byte and TA1 is present */
-			dev->ta1 = dev->atr[2];
-			DEBUGP(5, dev, "Card says FiDi is 0x%.2x\n", dev->ta1);
-			ifno++;
-		} else if ((ifno == 2) && (ch & 0x10)) { /* TA(2) */
-			dev->ta1 = 0x11;
-			ifno++;
-		}
-
-		DEBUGP(5, dev, "Yi=%.2x\n", ch & 0xf0);
-		ix += ((ch & 0x10) >> 4)	/* no of int.face chars */
-		    +((ch & 0x20) >> 5)
-		    + ((ch & 0x40) >> 6)
-		    + ((ch & 0x80) >> 7);
-		/* ATRLENCK(dev,ix); */
-		if (ch & 0x80) {	/* TDi */
-			ch = dev->atr[ix];
-			if ((ch & 0x0f)) {
-				any_t1 = 1;
-				DEBUGP(5, dev, "card is capable of T=1\n");
-			} else {
-				any_t0 = 1;
-				DEBUGP(5, dev, "card is capable of T=0\n");
-			}
-		} else
-			done = 1;
-	} while (!done);
-
-	DEBUGP(5, dev, "ix=%d noHist=%d any_t1=%d\n",
-	      ix, dev->atr[1] & 15, any_t1);
-	if (ix + 1 + (dev->atr[1] & 0x0f) + any_t1 != dev->atr_len) {
-		DEBUGP(5, dev, "length error\n");
-		return 0;
-	}
-	if (any_t0)
-		set_bit(IS_ANY_T0, &dev->flags);
-
-	if (any_t1) {		/* compute csum */
-		dev->atr_csum = 0;
-#ifdef ATR_CSUM
-		for (i = 1; i < dev->atr_len; i++)
-			dev->atr_csum ^= dev->atr[i];
-		if (dev->atr_csum) {
-			set_bit(IS_BAD_CSUM, &dev->flags);
-			DEBUGP(5, dev, "bad checksum\n");
-			goto return_0;
-		}
-#endif
-		if (any_t0 == 0)
-			dev->proto = 1;	/* XXX PROTO */
-		set_bit(IS_ANY_T1, &dev->flags);
-	}
-
-	return 1;
-}
-
-struct card_fixup {
-	char atr[12];
-	u_int8_t atr_len;
-	u_int8_t stopbits;
-};
-
-static struct card_fixup card_fixups[] = {
-	{	/* ACOS */
-		.atr = { 0x3b, 0xb3, 0x11, 0x00, 0x00, 0x41, 0x01 },
-		.atr_len = 7,
-		.stopbits = 0x03,
-	},
-	{	/* Motorola */
-		.atr = {0x3b, 0x76, 0x13, 0x00, 0x00, 0x80, 0x62, 0x07,
-			0x41, 0x81, 0x81 },
-		.atr_len = 11,
-		.stopbits = 0x04,
-	},
-};
-
-static void set_cardparameter(struct cm4000_dev *dev)
-{
-	int i;
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-	u_int8_t stopbits = 0x02; /* ISO default */
-
-	DEBUGP(3, dev, "-> set_cardparameter\n");
-
-	dev->flags1 = dev->flags1 | (((dev->baudv - 1) & 0x0100) >> 8);
-	xoutb(dev->flags1, REG_FLAGS1(iobase));
-	DEBUGP(5, dev, "flags1 = 0x%02x\n", dev->flags1);
-
-	/* set baudrate */
-	xoutb((unsigned char)((dev->baudv - 1) & 0xFF), REG_BAUDRATE(iobase));
-
-	DEBUGP(5, dev, "baudv = %i -> write 0x%02x\n", dev->baudv,
-	      ((dev->baudv - 1) & 0xFF));
-
-	/* set stopbits */
-	for (i = 0; i < ARRAY_SIZE(card_fixups); i++) {
-		if (!memcmp(dev->atr, card_fixups[i].atr,
-			    card_fixups[i].atr_len))
-			stopbits = card_fixups[i].stopbits;
-	}
-	xoutb(stopbits, REG_STOPBITS(iobase));
-
-	DEBUGP(3, dev, "<- set_cardparameter\n");
-}
-
-static int set_protocol(struct cm4000_dev *dev, struct ptsreq *ptsreq)
-{
-
-	unsigned long tmp, i;
-	unsigned short num_bytes_read;
-	unsigned char pts_reply[4];
-	ssize_t rc;
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-
-	rc = 0;
-
-	DEBUGP(3, dev, "-> set_protocol\n");
-	DEBUGP(5, dev, "ptsreq->Protocol = 0x%.8x, ptsreq->Flags=0x%.8x, "
-		 "ptsreq->pts1=0x%.2x, ptsreq->pts2=0x%.2x, "
-		 "ptsreq->pts3=0x%.2x\n", (unsigned int)ptsreq->protocol,
-		 (unsigned int)ptsreq->flags, ptsreq->pts1, ptsreq->pts2,
-		 ptsreq->pts3);
-
-	/* Fill PTS structure */
-	dev->pts[0] = 0xff;
-	dev->pts[1] = 0x00;
-	tmp = ptsreq->protocol;
-	while ((tmp = (tmp >> 1)) > 0)
-		dev->pts[1]++;
-	dev->proto = dev->pts[1];	/* Set new protocol */
-	dev->pts[1] = (0x01 << 4) | (dev->pts[1]);
-
-	/* Correct Fi/Di according to CM4000 Fi/Di table */
-	DEBUGP(5, dev, "Ta(1) from ATR is 0x%.2x\n", dev->ta1);
-	/* set Fi/Di according to ATR TA(1) */
-	dev->pts[2] = fi_di_table[dev->ta1 & 0x0F][(dev->ta1 >> 4) & 0x0F];
-
-	/* Calculate PCK character */
-	dev->pts[3] = dev->pts[0] ^ dev->pts[1] ^ dev->pts[2];
-
-	DEBUGP(5, dev, "pts0=%.2x, pts1=%.2x, pts2=%.2x, pts3=%.2x\n",
-	       dev->pts[0], dev->pts[1], dev->pts[2], dev->pts[3]);
-
-	/* check card convention */
-	if (test_bit(IS_INVREV, &dev->flags))
-		str_invert_revert(dev->pts, 4);
-
-	/* reset SM */
-	xoutb(0x80, REG_FLAGS0(iobase));
-
-	/* Enable access to the message buffer */
-	DEBUGP(5, dev, "Enable access to the messages buffer\n");
-	dev->flags1 = 0x20	/* T_Active */
-	    | (test_bit(IS_INVREV, &dev->flags) ? 0x02 : 0x00) /* inv parity */
-	    | ((dev->baudv >> 8) & 0x01);	/* MSB-baud */
-	xoutb(dev->flags1, REG_FLAGS1(iobase));
-
-	DEBUGP(5, dev, "Enable message buffer -> flags1 = 0x%.2x\n",
-	       dev->flags1);
-
-	/* write challenge to the buffer */
-	DEBUGP(5, dev, "Write challenge to buffer: ");
-	for (i = 0; i < 4; i++) {
-		xoutb(i, REG_BUF_ADDR(iobase));
-		xoutb(dev->pts[i], REG_BUF_DATA(iobase));	/* buf data */
-#ifdef CM4000_DEBUG
-		pr_debug("0x%.2x ", dev->pts[i]);
-	}
-	pr_debug("\n");
-#else
-	}
-#endif
-
-	/* set number of bytes to write */
-	DEBUGP(5, dev, "Set number of bytes to write\n");
-	xoutb(0x04, REG_NUM_SEND(iobase));
-
-	/* Trigger CARDMAN CONTROLLER */
-	xoutb(0x50, REG_FLAGS0(iobase));
-
-	/* Monitor progress */
-	/* wait for xmit done */
-	DEBUGP(5, dev, "Waiting for NumRecBytes getting valid\n");
-
-	for (i = 0; i < 100; i++) {
-		if (inb(REG_FLAGS0(iobase)) & 0x08) {
-			DEBUGP(5, dev, "NumRecBytes is valid\n");
-			break;
-		}
-		/* can not sleep as this is in atomic context */
-		mdelay(10);
-	}
-	if (i == 100) {
-		DEBUGP(5, dev, "Timeout waiting for NumRecBytes getting "
-		       "valid\n");
-		rc = -EIO;
-		goto exit_setprotocol;
-	}
-
-	DEBUGP(5, dev, "Reading NumRecBytes\n");
-	for (i = 0; i < 100; i++) {
-		io_read_num_rec_bytes(iobase, &num_bytes_read);
-		if (num_bytes_read >= 4) {
-			DEBUGP(2, dev, "NumRecBytes = %i\n", num_bytes_read);
-			if (num_bytes_read > 4) {
-				rc = -EIO;
-				goto exit_setprotocol;
-			}
-			break;
-		}
-		/* can not sleep as this is in atomic context */
-		mdelay(10);
-	}
-
-	/* check whether it is a short PTS reply? */
-	if (num_bytes_read == 3)
-		i = 0;
-
-	if (i == 100) {
-		DEBUGP(5, dev, "Timeout reading num_bytes_read\n");
-		rc = -EIO;
-		goto exit_setprotocol;
-	}
-
-	DEBUGP(5, dev, "Reset the CARDMAN CONTROLLER\n");
-	xoutb(0x80, REG_FLAGS0(iobase));
-
-	/* Read PPS reply */
-	DEBUGP(5, dev, "Read PPS reply\n");
-	for (i = 0; i < num_bytes_read; i++) {
-		xoutb(i, REG_BUF_ADDR(iobase));
-		pts_reply[i] = inb(REG_BUF_DATA(iobase));
-	}
-
-#ifdef CM4000_DEBUG
-	DEBUGP(2, dev, "PTSreply: ");
-	for (i = 0; i < num_bytes_read; i++) {
-		pr_debug("0x%.2x ", pts_reply[i]);
-	}
-	pr_debug("\n");
-#endif	/* CM4000_DEBUG */
-
-	DEBUGP(5, dev, "Clear Tactive in Flags1\n");
-	xoutb(0x20, REG_FLAGS1(iobase));
-
-	/* Compare ptsreq and ptsreply */
-	if ((dev->pts[0] == pts_reply[0]) &&
-	    (dev->pts[1] == pts_reply[1]) &&
-	    (dev->pts[2] == pts_reply[2]) && (dev->pts[3] == pts_reply[3])) {
-		/* setcardparameter according to PPS */
-		dev->baudv = calc_baudv(dev->pts[2]);
-		set_cardparameter(dev);
-	} else if ((dev->pts[0] == pts_reply[0]) &&
-		   ((dev->pts[1] & 0xef) == pts_reply[1]) &&
-		   ((pts_reply[0] ^ pts_reply[1]) == pts_reply[2])) {
-		/* short PTS reply, set card parameter to default values */
-		dev->baudv = calc_baudv(0x11);
-		set_cardparameter(dev);
-	} else
-		rc = -EIO;
-
-exit_setprotocol:
-	DEBUGP(3, dev, "<- set_protocol\n");
-	return rc;
-}
-
-static int io_detect_cm4000(unsigned int iobase, struct cm4000_dev *dev)
-{
-
-	/* note: statemachine is assumed to be reset */
-	if (inb(REG_FLAGS0(iobase)) & 8) {
-		clear_bit(IS_ATR_VALID, &dev->flags);
-		set_bit(IS_CMM_ABSENT, &dev->flags);
-		return 0;	/* detect CMM = 1 -> failure */
-	}
-	/* xoutb(0x40, REG_FLAGS1(iobase)); detectCMM */
-	xoutb(dev->flags1 | 0x40, REG_FLAGS1(iobase));
-	if ((inb(REG_FLAGS0(iobase)) & 8) == 0) {
-		clear_bit(IS_ATR_VALID, &dev->flags);
-		set_bit(IS_CMM_ABSENT, &dev->flags);
-		return 0;	/* detect CMM=0 -> failure */
-	}
-	/* clear detectCMM again by restoring original flags1 */
-	xoutb(dev->flags1, REG_FLAGS1(iobase));
-	return 1;
-}
-
-static void terminate_monitor(struct cm4000_dev *dev)
-{
-
-	/* tell the monitor to stop and wait until
-	 * it terminates.
-	 */
-	DEBUGP(3, dev, "-> terminate_monitor\n");
-	wait_event_interruptible(dev->devq,
-				 test_and_set_bit(LOCK_MONITOR,
-						  (void *)&dev->flags));
-
-	/* now, LOCK_MONITOR has been set.
-	 * allow a last cycle in the monitor.
-	 * the monitor will indicate that it has
-	 * finished by clearing this bit.
-	 */
-	DEBUGP(5, dev, "Now allow last cycle of monitor!\n");
-	while (test_bit(LOCK_MONITOR, (void *)&dev->flags))
-		msleep(25);
-
-	DEBUGP(5, dev, "Delete timer\n");
-	del_timer_sync(&dev->timer);
-#ifdef CM4000_DEBUG
-	dev->monitor_running = 0;
-#endif
-
-	DEBUGP(3, dev, "<- terminate_monitor\n");
-}
-
-/*
- * monitor the card every 50msec. as a side-effect, retrieve the
- * atr once a card is inserted. another side-effect of retrieving the
- * atr is that the card will be powered on, so there is no need to
- * power on the card explicitly from the application: the driver
- * is already doing that for you.
- */
-
-static void monitor_card(struct timer_list *t)
-{
-	struct cm4000_dev *dev = from_timer(dev, t, timer);
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-	unsigned short s;
-	struct ptsreq ptsreq;
-	int i, atrc;
-
-	DEBUGP(7, dev, "->  monitor_card\n");
-
-	/* if someone has set the lock for us: we're done! */
-	if (test_and_set_bit(LOCK_MONITOR, &dev->flags)) {
-		DEBUGP(4, dev, "About to stop monitor\n");
-		/* no */
-		dev->rlen =
-		    dev->rpos =
-		    dev->atr_csum = dev->atr_len_retry = dev->cwarn = 0;
-		dev->mstate = M_FETCH_ATR;
-		clear_bit(LOCK_MONITOR, &dev->flags);
-		/* close et al. are sleeping on devq, so wake it */
-		wake_up_interruptible(&dev->devq);
-		DEBUGP(2, dev, "<- monitor_card (we are done now)\n");
-		return;
-	}
-
-	/* try to lock io: if it is already locked, just add another timer */
-	if (test_and_set_bit(LOCK_IO, (void *)&dev->flags)) {
-		DEBUGP(4, dev, "Couldn't get IO lock\n");
-		goto return_with_timer;
-	}
-
-	/* is a card/a reader inserted at all ? */
-	dev->flags0 = xinb(REG_FLAGS0(iobase));
-	DEBUGP(7, dev, "dev->flags0 = 0x%2x\n", dev->flags0);
-	DEBUGP(7, dev, "smartcard present: %s\n",
-	       dev->flags0 & 1 ? "yes" : "no");
-	DEBUGP(7, dev, "cardman present: %s\n",
-	       dev->flags0 == 0xff ? "no" : "yes");
-
-	if ((dev->flags0 & 1) == 0	/* no smartcard inserted */
-	    || dev->flags0 == 0xff) {	/* no cardman inserted */
-		/* no */
-		dev->rlen =
-		    dev->rpos =
-		    dev->atr_csum = dev->atr_len_retry = dev->cwarn = 0;
-		dev->mstate = M_FETCH_ATR;
-
-		dev->flags &= 0x000000ff; /* only keep IO and MONITOR locks */
-
-		if (dev->flags0 == 0xff) {
-			DEBUGP(4, dev, "set IS_CMM_ABSENT bit\n");
-			set_bit(IS_CMM_ABSENT, &dev->flags);
-		} else if (test_bit(IS_CMM_ABSENT, &dev->flags)) {
-			DEBUGP(4, dev, "clear IS_CMM_ABSENT bit "
-			       "(card is removed)\n");
-			clear_bit(IS_CMM_ABSENT, &dev->flags);
-		}
-
-		goto release_io;
-	} else if ((dev->flags0 & 1) && test_bit(IS_CMM_ABSENT, &dev->flags)) {
-		/* cardman and card present but cardman was absent before
-		 * (after suspend with inserted card) */
-		DEBUGP(4, dev, "clear IS_CMM_ABSENT bit (card is inserted)\n");
-		clear_bit(IS_CMM_ABSENT, &dev->flags);
-	}
-
-	if (test_bit(IS_ATR_VALID, &dev->flags) == 1) {
-		DEBUGP(7, dev, "believe ATR is already valid (do nothing)\n");
-		goto release_io;
-	}
-
-	switch (dev->mstate) {
-	case M_CARDOFF: {
-		unsigned char flags0;
-
-		DEBUGP(4, dev, "M_CARDOFF\n");
-		flags0 = inb(REG_FLAGS0(iobase));
-		if (flags0 & 0x02) {
-			/* wait until Flags0 indicate power is off */
-			dev->mdelay = T_10MSEC;
-		} else {
-			/* Flags0 indicate power off and no card inserted now;
-			 * Reset CARDMAN CONTROLLER */
-			xoutb(0x80, REG_FLAGS0(iobase));
-
-			/* prepare for fetching ATR again: after card off ATR
-			 * is read again automatically */
-			dev->rlen =
-			    dev->rpos =
-			    dev->atr_csum =
-			    dev->atr_len_retry = dev->cwarn = 0;
-			dev->mstate = M_FETCH_ATR;
-
-			/* minimal gap between CARDOFF and read ATR is 50msec */
-			dev->mdelay = T_50MSEC;
-		}
-		break;
-	}
-	case M_FETCH_ATR:
-		DEBUGP(4, dev, "M_FETCH_ATR\n");
-		xoutb(0x80, REG_FLAGS0(iobase));
-		DEBUGP(4, dev, "Reset BAUDV to 9600\n");
-		dev->baudv = 0x173;	/* 9600 */
-		xoutb(0x02, REG_STOPBITS(iobase));	/* stopbits=2 */
-		xoutb(0x73, REG_BAUDRATE(iobase));	/* baud value */
-		xoutb(0x21, REG_FLAGS1(iobase));	/* T_Active=1, baud
-							   value */
-		/* warm start vs. power on: */
-		xoutb(dev->flags0 & 2 ? 0x46 : 0x44, REG_FLAGS0(iobase));
-		dev->mdelay = T_40MSEC;
-		dev->mstate = M_TIMEOUT_WAIT;
-		break;
-	case M_TIMEOUT_WAIT:
-		DEBUGP(4, dev, "M_TIMEOUT_WAIT\n");
-		/* numRecBytes */
-		io_read_num_rec_bytes(iobase, &dev->atr_len);
-		dev->mdelay = T_10MSEC;
-		dev->mstate = M_READ_ATR_LEN;
-		break;
-	case M_READ_ATR_LEN:
-		DEBUGP(4, dev, "M_READ_ATR_LEN\n");
-		/* infinite loop possible, since there is no timeout */
-
-#define	MAX_ATR_LEN_RETRY	100
-
-		if (dev->atr_len == io_read_num_rec_bytes(iobase, &s)) {
-			if (dev->atr_len_retry++ >= MAX_ATR_LEN_RETRY) {					/* + XX msec */
-				dev->mdelay = T_10MSEC;
-				dev->mstate = M_READ_ATR;
-			}
-		} else {
-			dev->atr_len = s;
-			dev->atr_len_retry = 0;	/* set new timeout */
-		}
-
-		DEBUGP(4, dev, "Current ATR_LEN = %i\n", dev->atr_len);
-		break;
-	case M_READ_ATR:
-		DEBUGP(4, dev, "M_READ_ATR\n");
-		xoutb(0x80, REG_FLAGS0(iobase));	/* reset SM */
-		for (i = 0; i < dev->atr_len; i++) {
-			xoutb(i, REG_BUF_ADDR(iobase));
-			dev->atr[i] = inb(REG_BUF_DATA(iobase));
-		}
-		/* Deactivate T_Active flags */
-		DEBUGP(4, dev, "Deactivate T_Active flags\n");
-		dev->flags1 = 0x01;
-		xoutb(dev->flags1, REG_FLAGS1(iobase));
-
-		/* atr is present (which doesn't mean it's valid) */
-		set_bit(IS_ATR_PRESENT, &dev->flags);
-		if (dev->atr[0] == 0x03)
-			str_invert_revert(dev->atr, dev->atr_len);
-		atrc = parse_atr(dev);
-		if (atrc == 0) {	/* atr invalid */
-			dev->mdelay = 0;
-			dev->mstate = M_BAD_CARD;
-		} else {
-			dev->mdelay = T_50MSEC;
-			dev->mstate = M_ATR_PRESENT;
-			set_bit(IS_ATR_VALID, &dev->flags);
-		}
-
-		if (test_bit(IS_ATR_VALID, &dev->flags) == 1) {
-			DEBUGP(4, dev, "monitor_card: ATR valid\n");
- 			/* if ta1 == 0x11, no PPS necessary (default values) */
-			/* do not do PPS with multi protocol cards */
-			if ((test_bit(IS_AUTOPPS_ACT, &dev->flags) == 0) &&
-			    (dev->ta1 != 0x11) &&
-			    !(test_bit(IS_ANY_T0, &dev->flags) &&
-			    test_bit(IS_ANY_T1, &dev->flags))) {
-				DEBUGP(4, dev, "Perform AUTOPPS\n");
-				set_bit(IS_AUTOPPS_ACT, &dev->flags);
-				ptsreq.protocol = (0x01 << dev->proto);
-				ptsreq.flags = 0x01;
-				ptsreq.pts1 = 0x00;
-				ptsreq.pts2 = 0x00;
-				ptsreq.pts3 = 0x00;
-				if (set_protocol(dev, &ptsreq) == 0) {
-					DEBUGP(4, dev, "AUTOPPS ret SUCC\n");
-					clear_bit(IS_AUTOPPS_ACT, &dev->flags);
-					wake_up_interruptible(&dev->atrq);
-				} else {
-					DEBUGP(4, dev, "AUTOPPS failed: "
-					       "repower using defaults\n");
-					/* prepare for repowering  */
-					clear_bit(IS_ATR_PRESENT, &dev->flags);
-					clear_bit(IS_ATR_VALID, &dev->flags);
-					dev->rlen =
-					    dev->rpos =
-					    dev->atr_csum =
-					    dev->atr_len_retry = dev->cwarn = 0;
-					dev->mstate = M_FETCH_ATR;
-
-					dev->mdelay = T_50MSEC;
-				}
-			} else {
-				/* for cards which use slightly different
-				 * params (extra guard time) */
-				set_cardparameter(dev);
-				if (test_bit(IS_AUTOPPS_ACT, &dev->flags) == 1)
-					DEBUGP(4, dev, "AUTOPPS already active "
-					       "2nd try:use default values\n");
-				if (dev->ta1 == 0x11)
-					DEBUGP(4, dev, "No AUTOPPS necessary "
-					       "TA(1)==0x11\n");
-				if (test_bit(IS_ANY_T0, &dev->flags)
-				    && test_bit(IS_ANY_T1, &dev->flags))
-					DEBUGP(4, dev, "Do NOT perform AUTOPPS "
-					       "with multiprotocol cards\n");
-				clear_bit(IS_AUTOPPS_ACT, &dev->flags);
-				wake_up_interruptible(&dev->atrq);
-			}
-		} else {
-			DEBUGP(4, dev, "ATR invalid\n");
-			wake_up_interruptible(&dev->atrq);
-		}
-		break;
-	case M_BAD_CARD:
-		DEBUGP(4, dev, "M_BAD_CARD\n");
-		/* slow down warning, but prompt immediately after insertion */
-		if (dev->cwarn == 0 || dev->cwarn == 10) {
-			set_bit(IS_BAD_CARD, &dev->flags);
-			dev_warn(&dev->p_dev->dev, MODULE_NAME ": ");
-			if (test_bit(IS_BAD_CSUM, &dev->flags)) {
-				DEBUGP(4, dev, "ATR checksum (0x%.2x, should "
-				       "be zero) failed\n", dev->atr_csum);
-			}
-#ifdef CM4000_DEBUG
-			else if (test_bit(IS_BAD_LENGTH, &dev->flags)) {
-				DEBUGP(4, dev, "ATR length error\n");
-			} else {
-				DEBUGP(4, dev, "card damaged or wrong way "
-					"inserted\n");
-			}
-#endif
-			dev->cwarn = 0;
-			wake_up_interruptible(&dev->atrq);	/* wake open */
-		}
-		dev->cwarn++;
-		dev->mdelay = T_100MSEC;
-		dev->mstate = M_FETCH_ATR;
-		break;
-	default:
-		DEBUGP(7, dev, "Unknown action\n");
-		break;		/* nothing */
-	}
-
-release_io:
-	DEBUGP(7, dev, "release_io\n");
-	clear_bit(LOCK_IO, &dev->flags);
-	wake_up_interruptible(&dev->ioq);	/* whoever needs IO */
-
-return_with_timer:
-	DEBUGP(7, dev, "<- monitor_card (returns with timer)\n");
-	mod_timer(&dev->timer, jiffies + dev->mdelay);
-	clear_bit(LOCK_MONITOR, &dev->flags);
-}
-
-/* Interface to userland (file_operations) */
-
-static ssize_t cmm_read(struct file *filp, __user char *buf, size_t count,
-			loff_t *ppos)
-{
-	struct cm4000_dev *dev = filp->private_data;
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-	ssize_t rc;
-	int i, j, k;
-
-	DEBUGP(2, dev, "-> cmm_read(%s,%d)\n", current->comm, current->pid);
-
-	if (count == 0)		/* according to manpage */
-		return 0;
-
-	if (!pcmcia_dev_present(dev->p_dev) || /* device removed */
-	    test_bit(IS_CMM_ABSENT, &dev->flags))
-		return -ENODEV;
-
-	if (test_bit(IS_BAD_CSUM, &dev->flags))
-		return -EIO;
-
-	/* also see the note about this in cmm_write */
-	if (wait_event_interruptible
-	    (dev->atrq,
-	     ((filp->f_flags & O_NONBLOCK)
-	      || (test_bit(IS_ATR_PRESENT, (void *)&dev->flags) != 0)))) {
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-	if (test_bit(IS_ATR_VALID, &dev->flags) == 0)
-		return -EIO;
-
-	/* this one implements blocking IO */
-	if (wait_event_interruptible
-	    (dev->readq,
-	     ((filp->f_flags & O_NONBLOCK) || (dev->rpos < dev->rlen)))) {
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-	/* lock io */
-	if (wait_event_interruptible
-	    (dev->ioq,
-	     ((filp->f_flags & O_NONBLOCK)
-	      || (test_and_set_bit(LOCK_IO, (void *)&dev->flags) == 0)))) {
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-	rc = 0;
-	dev->flags0 = inb(REG_FLAGS0(iobase));
-	if ((dev->flags0 & 1) == 0	/* no smartcard inserted */
-	    || dev->flags0 == 0xff) {	/* no cardman inserted */
-		clear_bit(IS_ATR_VALID, &dev->flags);
-		if (dev->flags0 & 1) {
-			set_bit(IS_CMM_ABSENT, &dev->flags);
-			rc = -ENODEV;
-		} else {
-			rc = -EIO;
-		}
-		goto release_io;
-	}
-
-	DEBUGP(4, dev, "begin read answer\n");
-	j = min(count, (size_t)(dev->rlen - dev->rpos));
-	k = dev->rpos;
-	if (k + j > 255)
-		j = 256 - k;
-	DEBUGP(4, dev, "read1 j=%d\n", j);
-	for (i = 0; i < j; i++) {
-		xoutb(k++, REG_BUF_ADDR(iobase));
-		dev->rbuf[i] = xinb(REG_BUF_DATA(iobase));
-	}
-	j = min(count, (size_t)(dev->rlen - dev->rpos));
-	if (k + j > 255) {
-		DEBUGP(4, dev, "read2 j=%d\n", j);
-		dev->flags1 |= 0x10;	/* MSB buf addr set */
-		xoutb(dev->flags1, REG_FLAGS1(iobase));
-		for (; i < j; i++) {
-			xoutb(k++, REG_BUF_ADDR(iobase));
-			dev->rbuf[i] = xinb(REG_BUF_DATA(iobase));
-		}
-	}
-
-	if (dev->proto == 0 && count > dev->rlen - dev->rpos && i) {
-		DEBUGP(4, dev, "T=0 and count > buffer\n");
-		dev->rbuf[i] = dev->rbuf[i - 1];
-		dev->rbuf[i - 1] = dev->procbyte;
-		j++;
-	}
-	count = j;
-
-	dev->rpos = dev->rlen + 1;
-
-	/* Clear T1Active */
-	DEBUGP(4, dev, "Clear T1Active\n");
-	dev->flags1 &= 0xdf;
-	xoutb(dev->flags1, REG_FLAGS1(iobase));
-
-	xoutb(0, REG_FLAGS1(iobase));	/* clear detectCMM */
-	/* last check before exit */
-	if (!io_detect_cm4000(iobase, dev)) {
-		rc = -ENODEV;
-		goto release_io;
-	}
-
-	if (test_bit(IS_INVREV, &dev->flags) && count > 0)
-		str_invert_revert(dev->rbuf, count);
-
-	if (copy_to_user(buf, dev->rbuf, count))
-		rc = -EFAULT;
-
-release_io:
-	clear_bit(LOCK_IO, &dev->flags);
-	wake_up_interruptible(&dev->ioq);
-
-	DEBUGP(2, dev, "<- cmm_read returns: rc = %zi\n",
-	       (rc < 0 ? rc : count));
-	return rc < 0 ? rc : count;
-}
-
-static ssize_t cmm_write(struct file *filp, const char __user *buf,
-			 size_t count, loff_t *ppos)
-{
-	struct cm4000_dev *dev = filp->private_data;
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-	unsigned short s;
-	unsigned char infolen;
-	unsigned char sendT0;
-	unsigned short nsend;
-	unsigned short nr;
-	ssize_t rc;
-	int i;
-
-	DEBUGP(2, dev, "-> cmm_write(%s,%d)\n", current->comm, current->pid);
-
-	if (count == 0)		/* according to manpage */
-		return 0;
-
-	if (dev->proto == 0 && count < 4) {
-		/* T0 must have at least 4 bytes */
-		DEBUGP(4, dev, "T0 short write\n");
-		return -EIO;
-	}
-
-	nr = count & 0x1ff;	/* max bytes to write */
-
-	sendT0 = dev->proto ? 0 : nr > 5 ? 0x08 : 0;
-
-	if (!pcmcia_dev_present(dev->p_dev) || /* device removed */
-	    test_bit(IS_CMM_ABSENT, &dev->flags))
-		return -ENODEV;
-
-	if (test_bit(IS_BAD_CSUM, &dev->flags)) {
-		DEBUGP(4, dev, "bad csum\n");
-		return -EIO;
-	}
-
-	/*
-	 * wait for atr to become valid.
-	 * note: it is important to lock this code. if we dont, the monitor
-	 * could be run between test_bit and the call to sleep on the
-	 * atr-queue.  if *then* the monitor detects atr valid, it will wake up
-	 * any process on the atr-queue, *but* since we have been interrupted,
-	 * we do not yet sleep on this queue. this would result in a missed
-	 * wake_up and the calling process would sleep forever (until
-	 * interrupted).  also, do *not* restore_flags before sleep_on, because
-	 * this could result in the same situation!
-	 */
-	if (wait_event_interruptible
-	    (dev->atrq,
-	     ((filp->f_flags & O_NONBLOCK)
-	      || (test_bit(IS_ATR_PRESENT, (void *)&dev->flags) != 0)))) {
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-	if (test_bit(IS_ATR_VALID, &dev->flags) == 0) {	/* invalid atr */
-		DEBUGP(4, dev, "invalid ATR\n");
-		return -EIO;
-	}
-
-	/* lock io */
-	if (wait_event_interruptible
-	    (dev->ioq,
-	     ((filp->f_flags & O_NONBLOCK)
-	      || (test_and_set_bit(LOCK_IO, (void *)&dev->flags) == 0)))) {
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-	if (copy_from_user(dev->sbuf, buf, ((count > 512) ? 512 : count)))
-		return -EFAULT;
-
-	rc = 0;
-	dev->flags0 = inb(REG_FLAGS0(iobase));
-	if ((dev->flags0 & 1) == 0	/* no smartcard inserted */
-	    || dev->flags0 == 0xff) {	/* no cardman inserted */
-		clear_bit(IS_ATR_VALID, &dev->flags);
-		if (dev->flags0 & 1) {
-			set_bit(IS_CMM_ABSENT, &dev->flags);
-			rc = -ENODEV;
-		} else {
-			DEBUGP(4, dev, "IO error\n");
-			rc = -EIO;
-		}
-		goto release_io;
-	}
-
-	xoutb(0x80, REG_FLAGS0(iobase));	/* reset SM  */
-
-	if (!io_detect_cm4000(iobase, dev)) {
-		rc = -ENODEV;
-		goto release_io;
-	}
-
-	/* reflect T=0 send/read mode in flags1 */
-	dev->flags1 |= (sendT0);
-
-	set_cardparameter(dev);
-
-	/* dummy read, reset flag procedure received */
-	inb(REG_FLAGS1(iobase));
-
-	dev->flags1 = 0x20	/* T_Active */
-	    | (sendT0)
-	    | (test_bit(IS_INVREV, &dev->flags) ? 2 : 0)/* inverse parity  */
-	    | (((dev->baudv - 1) & 0x0100) >> 8);	/* MSB-Baud */
-	DEBUGP(1, dev, "set dev->flags1 = 0x%.2x\n", dev->flags1);
-	xoutb(dev->flags1, REG_FLAGS1(iobase));
-
-	/* xmit data */
-	DEBUGP(4, dev, "Xmit data\n");
-	for (i = 0; i < nr; i++) {
-		if (i >= 256) {
-			dev->flags1 = 0x20	/* T_Active */
-			    | (sendT0)	/* SendT0 */
-				/* inverse parity: */
-			    | (test_bit(IS_INVREV, &dev->flags) ? 2 : 0)
-			    | (((dev->baudv - 1) & 0x0100) >> 8) /* MSB-Baud */
-			    | 0x10;	/* set address high */
-			DEBUGP(4, dev, "dev->flags = 0x%.2x - set address "
-			       "high\n", dev->flags1);
-			xoutb(dev->flags1, REG_FLAGS1(iobase));
-		}
-		if (test_bit(IS_INVREV, &dev->flags)) {
-			DEBUGP(4, dev, "Apply inverse convention for 0x%.2x "
-				"-> 0x%.2x\n", (unsigned char)dev->sbuf[i],
-			      invert_revert(dev->sbuf[i]));
-			xoutb(i, REG_BUF_ADDR(iobase));
-			xoutb(invert_revert(dev->sbuf[i]),
-			      REG_BUF_DATA(iobase));
-		} else {
-			xoutb(i, REG_BUF_ADDR(iobase));
-			xoutb(dev->sbuf[i], REG_BUF_DATA(iobase));
-		}
-	}
-	DEBUGP(4, dev, "Xmit done\n");
-
-	if (dev->proto == 0) {
-		/* T=0 proto: 0 byte reply  */
-		if (nr == 4) {
-			DEBUGP(4, dev, "T=0 assumes 0 byte reply\n");
-			xoutb(i, REG_BUF_ADDR(iobase));
-			if (test_bit(IS_INVREV, &dev->flags))
-				xoutb(0xff, REG_BUF_DATA(iobase));
-			else
-				xoutb(0x00, REG_BUF_DATA(iobase));
-		}
-
-		/* numSendBytes */
-		if (sendT0)
-			nsend = nr;
-		else {
-			if (nr == 4)
-				nsend = 5;
-			else {
-				nsend = 5 + (unsigned char)dev->sbuf[4];
-				if (dev->sbuf[4] == 0)
-					nsend += 0x100;
-			}
-		}
-	} else
-		nsend = nr;
-
-	/* T0: output procedure byte */
-	if (test_bit(IS_INVREV, &dev->flags)) {
-		DEBUGP(4, dev, "T=0 set Procedure byte (inverse-reverse) "
-		       "0x%.2x\n", invert_revert(dev->sbuf[1]));
-		xoutb(invert_revert(dev->sbuf[1]), REG_NUM_BYTES(iobase));
-	} else {
-		DEBUGP(4, dev, "T=0 set Procedure byte 0x%.2x\n", dev->sbuf[1]);
-		xoutb(dev->sbuf[1], REG_NUM_BYTES(iobase));
-	}
-
-	DEBUGP(1, dev, "set NumSendBytes = 0x%.2x\n",
-	       (unsigned char)(nsend & 0xff));
-	xoutb((unsigned char)(nsend & 0xff), REG_NUM_SEND(iobase));
-
-	DEBUGP(1, dev, "Trigger CARDMAN CONTROLLER (0x%.2x)\n",
-	       0x40	/* SM_Active */
-	      | (dev->flags0 & 2 ? 0 : 4)	/* power on if needed */
-	      |(dev->proto ? 0x10 : 0x08)	/* T=1/T=0 */
-	      |(nsend & 0x100) >> 8 /* MSB numSendBytes */ );
-	xoutb(0x40		/* SM_Active */
-	      | (dev->flags0 & 2 ? 0 : 4)	/* power on if needed */
-	      |(dev->proto ? 0x10 : 0x08)	/* T=1/T=0 */
-	      |(nsend & 0x100) >> 8,	/* MSB numSendBytes */
-	      REG_FLAGS0(iobase));
-
-	/* wait for xmit done */
-	if (dev->proto == 1) {
-		DEBUGP(4, dev, "Wait for xmit done\n");
-		for (i = 0; i < 1000; i++) {
-			if (inb(REG_FLAGS0(iobase)) & 0x08)
-				break;
-			msleep_interruptible(10);
-		}
-		if (i == 1000) {
-			DEBUGP(4, dev, "timeout waiting for xmit done\n");
-			rc = -EIO;
-			goto release_io;
-		}
-	}
-
-	/* T=1: wait for infoLen */
-
-	infolen = 0;
-	if (dev->proto) {
-		/* wait until infoLen is valid */
-		for (i = 0; i < 6000; i++) {	/* max waiting time of 1 min */
-			io_read_num_rec_bytes(iobase, &s);
-			if (s >= 3) {
-				infolen = inb(REG_FLAGS1(iobase));
-				DEBUGP(4, dev, "infolen=%d\n", infolen);
-				break;
-			}
-			msleep_interruptible(10);
-		}
-		if (i == 6000) {
-			DEBUGP(4, dev, "timeout waiting for infoLen\n");
-			rc = -EIO;
-			goto release_io;
-		}
-	} else
-		clear_bit(IS_PROCBYTE_PRESENT, &dev->flags);
-
-	/* numRecBytes | bit9 of numRecytes */
-	io_read_num_rec_bytes(iobase, &dev->rlen);
-	for (i = 0; i < 600; i++) {	/* max waiting time of 2 sec */
-		if (dev->proto) {
-			if (dev->rlen >= infolen + 4)
-				break;
-		}
-		msleep_interruptible(10);
-		/* numRecBytes | bit9 of numRecytes */
-		io_read_num_rec_bytes(iobase, &s);
-		if (s > dev->rlen) {
-			DEBUGP(1, dev, "NumRecBytes inc (reset timeout)\n");
-			i = 0;	/* reset timeout */
-			dev->rlen = s;
-		}
-		/* T=0: we are done when numRecBytes doesn't
-		 *      increment any more and NoProcedureByte
-		 *      is set and numRecBytes == bytes sent + 6
-		 *      (header bytes + data + 1 for sw2)
-		 *      except when the card replies an error
-		 *      which means, no data will be sent back.
-		 */
-		else if (dev->proto == 0) {
-			if ((inb(REG_BUF_ADDR(iobase)) & 0x80)) {
-				/* no procedure byte received since last read */
-				DEBUGP(1, dev, "NoProcedure byte set\n");
-				/* i=0; */
-			} else {
-				/* procedure byte received since last read */
-				DEBUGP(1, dev, "NoProcedure byte unset "
-					"(reset timeout)\n");
-				dev->procbyte = inb(REG_FLAGS1(iobase));
-				DEBUGP(1, dev, "Read procedure byte 0x%.2x\n",
-				      dev->procbyte);
-				i = 0;	/* resettimeout */
-			}
-			if (inb(REG_FLAGS0(iobase)) & 0x08) {
-				DEBUGP(1, dev, "T0Done flag (read reply)\n");
-				break;
-			}
-		}
-		if (dev->proto)
-			infolen = inb(REG_FLAGS1(iobase));
-	}
-	if (i == 600) {
-		DEBUGP(1, dev, "timeout waiting for numRecBytes\n");
-		rc = -EIO;
-		goto release_io;
-	} else {
-		if (dev->proto == 0) {
-			DEBUGP(1, dev, "Wait for T0Done bit to be  set\n");
-			for (i = 0; i < 1000; i++) {
-				if (inb(REG_FLAGS0(iobase)) & 0x08)
-					break;
-				msleep_interruptible(10);
-			}
-			if (i == 1000) {
-				DEBUGP(1, dev, "timeout waiting for T0Done\n");
-				rc = -EIO;
-				goto release_io;
-			}
-
-			dev->procbyte = inb(REG_FLAGS1(iobase));
-			DEBUGP(4, dev, "Read procedure byte 0x%.2x\n",
-			      dev->procbyte);
-
-			io_read_num_rec_bytes(iobase, &dev->rlen);
-			DEBUGP(4, dev, "Read NumRecBytes = %i\n", dev->rlen);
-
-		}
-	}
-	/* T=1: read offset=zero, T=0: read offset=after challenge */
-	dev->rpos = dev->proto ? 0 : nr == 4 ? 5 : nr > dev->rlen ? 5 : nr;
-	DEBUGP(4, dev, "dev->rlen = %i,  dev->rpos = %i, nr = %i\n",
-	      dev->rlen, dev->rpos, nr);
-
-release_io:
-	DEBUGP(4, dev, "Reset SM\n");
-	xoutb(0x80, REG_FLAGS0(iobase));	/* reset SM */
-
-	if (rc < 0) {
-		DEBUGP(4, dev, "Write failed but clear T_Active\n");
-		dev->flags1 &= 0xdf;
-		xoutb(dev->flags1, REG_FLAGS1(iobase));
-	}
-
-	clear_bit(LOCK_IO, &dev->flags);
-	wake_up_interruptible(&dev->ioq);
-	wake_up_interruptible(&dev->readq);	/* tell read we have data */
-
-	/* ITSEC E2: clear write buffer */
-	memset((char *)dev->sbuf, 0, 512);
-
-	/* return error or actually written bytes */
-	DEBUGP(2, dev, "<- cmm_write\n");
-	return rc < 0 ? rc : nr;
-}
-
-static void start_monitor(struct cm4000_dev *dev)
-{
-	DEBUGP(3, dev, "-> start_monitor\n");
-	if (!dev->monitor_running) {
-		DEBUGP(5, dev, "create, init and add timer\n");
-		timer_setup(&dev->timer, monitor_card, 0);
-		dev->monitor_running = 1;
-		mod_timer(&dev->timer, jiffies);
-	} else
-		DEBUGP(5, dev, "monitor already running\n");
-	DEBUGP(3, dev, "<- start_monitor\n");
-}
-
-static void stop_monitor(struct cm4000_dev *dev)
-{
-	DEBUGP(3, dev, "-> stop_monitor\n");
-	if (dev->monitor_running) {
-		DEBUGP(5, dev, "stopping monitor\n");
-		terminate_monitor(dev);
-		/* reset monitor SM */
-		clear_bit(IS_ATR_VALID, &dev->flags);
-		clear_bit(IS_ATR_PRESENT, &dev->flags);
-	} else
-		DEBUGP(5, dev, "monitor already stopped\n");
-	DEBUGP(3, dev, "<- stop_monitor\n");
-}
-
-static long cmm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct cm4000_dev *dev = filp->private_data;
-	unsigned int iobase = dev->p_dev->resource[0]->start;
-	struct inode *inode = file_inode(filp);
-	struct pcmcia_device *link;
-	int rc;
-	void __user *argp = (void __user *)arg;
-#ifdef CM4000_DEBUG
-	char *ioctl_names[CM_IOC_MAXNR + 1] = {
-		[_IOC_NR(CM_IOCGSTATUS)] "CM_IOCGSTATUS",
-		[_IOC_NR(CM_IOCGATR)] "CM_IOCGATR",
-		[_IOC_NR(CM_IOCARDOFF)] "CM_IOCARDOFF",
-		[_IOC_NR(CM_IOCSPTS)] "CM_IOCSPTS",
-		[_IOC_NR(CM_IOSDBGLVL)] "CM4000_DBGLVL",
-	};
-	DEBUGP(3, dev, "cmm_ioctl(device=%d.%d) %s\n", imajor(inode),
-	       iminor(inode), ioctl_names[_IOC_NR(cmd)]);
-#endif
-
-	mutex_lock(&cmm_mutex);
-	rc = -ENODEV;
-	link = dev_table[iminor(inode)];
-	if (!pcmcia_dev_present(link)) {
-		DEBUGP(4, dev, "DEV_OK false\n");
-		goto out;
-	}
-
-	if (test_bit(IS_CMM_ABSENT, &dev->flags)) {
-		DEBUGP(4, dev, "CMM_ABSENT flag set\n");
-		goto out;
-	}
-	rc = -EINVAL;
-
-	if (_IOC_TYPE(cmd) != CM_IOC_MAGIC) {
-		DEBUGP(4, dev, "ioctype mismatch\n");
-		goto out;
-	}
-	if (_IOC_NR(cmd) > CM_IOC_MAXNR) {
-		DEBUGP(4, dev, "iocnr mismatch\n");
-		goto out;
-	}
-	rc = 0;
-
-	switch (cmd) {
-	case CM_IOCGSTATUS:
-		DEBUGP(4, dev, " ... in CM_IOCGSTATUS\n");
-		{
-			int status;
-
-			/* clear other bits, but leave inserted & powered as
-			 * they are */
-			status = dev->flags0 & 3;
-			if (test_bit(IS_ATR_PRESENT, &dev->flags))
-				status |= CM_ATR_PRESENT;
-			if (test_bit(IS_ATR_VALID, &dev->flags))
-				status |= CM_ATR_VALID;
-			if (test_bit(IS_CMM_ABSENT, &dev->flags))
-				status |= CM_NO_READER;
-			if (test_bit(IS_BAD_CARD, &dev->flags))
-				status |= CM_BAD_CARD;
-			if (copy_to_user(argp, &status, sizeof(int)))
-				rc = -EFAULT;
-		}
-		break;
-	case CM_IOCGATR:
-		DEBUGP(4, dev, "... in CM_IOCGATR\n");
-		{
-			struct atreq __user *atreq = argp;
-			int tmp;
-			/* allow nonblocking io and being interrupted */
-			if (wait_event_interruptible
-			    (dev->atrq,
-			     ((filp->f_flags & O_NONBLOCK)
-			      || (test_bit(IS_ATR_PRESENT, (void *)&dev->flags)
-				  != 0)))) {
-				if (filp->f_flags & O_NONBLOCK)
-					rc = -EAGAIN;
-				else
-					rc = -ERESTARTSYS;
-				break;
-			}
-
-			rc = -EFAULT;
-			if (test_bit(IS_ATR_VALID, &dev->flags) == 0) {
-				tmp = -1;
-				if (copy_to_user(&(atreq->atr_len), &tmp,
-						 sizeof(int)))
-					break;
-			} else {
-				if (copy_to_user(atreq->atr, dev->atr,
-						 dev->atr_len))
-					break;
-
-				tmp = dev->atr_len;
-				if (copy_to_user(&(atreq->atr_len), &tmp, sizeof(int)))
-					break;
-			}
-			rc = 0;
-			break;
-		}
-	case CM_IOCARDOFF:
-
-#ifdef CM4000_DEBUG
-		DEBUGP(4, dev, "... in CM_IOCARDOFF\n");
-		if (dev->flags0 & 0x01) {
-			DEBUGP(4, dev, "    Card inserted\n");
-		} else {
-			DEBUGP(2, dev, "    No card inserted\n");
-		}
-		if (dev->flags0 & 0x02) {
-			DEBUGP(4, dev, "    Card powered\n");
-		} else {
-			DEBUGP(2, dev, "    Card not powered\n");
-		}
-#endif
-
-		/* is a card inserted and powered? */
-		if ((dev->flags0 & 0x01) && (dev->flags0 & 0x02)) {
-
-			/* get IO lock */
-			if (wait_event_interruptible
-			    (dev->ioq,
-			     ((filp->f_flags & O_NONBLOCK)
-			      || (test_and_set_bit(LOCK_IO, (void *)&dev->flags)
-				  == 0)))) {
-				if (filp->f_flags & O_NONBLOCK)
-					rc = -EAGAIN;
-				else
-					rc = -ERESTARTSYS;
-				break;
-			}
-			/* Set Flags0 = 0x42 */
-			DEBUGP(4, dev, "Set Flags0=0x42 \n");
-			xoutb(0x42, REG_FLAGS0(iobase));
-			clear_bit(IS_ATR_PRESENT, &dev->flags);
-			clear_bit(IS_ATR_VALID, &dev->flags);
-			dev->mstate = M_CARDOFF;
-			clear_bit(LOCK_IO, &dev->flags);
-			if (wait_event_interruptible
-			    (dev->atrq,
-			     ((filp->f_flags & O_NONBLOCK)
-			      || (test_bit(IS_ATR_VALID, (void *)&dev->flags) !=
-				  0)))) {
-				if (filp->f_flags & O_NONBLOCK)
-					rc = -EAGAIN;
-				else
-					rc = -ERESTARTSYS;
-				break;
-			}
-		}
-		/* release lock */
-		clear_bit(LOCK_IO, &dev->flags);
-		wake_up_interruptible(&dev->ioq);
-
-		rc = 0;
-		break;
-	case CM_IOCSPTS:
-		{
-			struct ptsreq krnptsreq;
-
-			if (copy_from_user(&krnptsreq, argp,
-					   sizeof(struct ptsreq))) {
-				rc = -EFAULT;
-				break;
-			}
-
-			rc = 0;
-			DEBUGP(4, dev, "... in CM_IOCSPTS\n");
-			/* wait for ATR to get valid */
-			if (wait_event_interruptible
-			    (dev->atrq,
-			     ((filp->f_flags & O_NONBLOCK)
-			      || (test_bit(IS_ATR_PRESENT, (void *)&dev->flags)
-				  != 0)))) {
-				if (filp->f_flags & O_NONBLOCK)
-					rc = -EAGAIN;
-				else
-					rc = -ERESTARTSYS;
-				break;
-			}
-			/* get IO lock */
-			if (wait_event_interruptible
-			    (dev->ioq,
-			     ((filp->f_flags & O_NONBLOCK)
-			      || (test_and_set_bit(LOCK_IO, (void *)&dev->flags)
-				  == 0)))) {
-				if (filp->f_flags & O_NONBLOCK)
-					rc = -EAGAIN;
-				else
-					rc = -ERESTARTSYS;
-				break;
-			}
-
-			if ((rc = set_protocol(dev, &krnptsreq)) != 0) {
-				/* auto power_on again */
-				dev->mstate = M_FETCH_ATR;
-				clear_bit(IS_ATR_VALID, &dev->flags);
-			}
-			/* release lock */
-			clear_bit(LOCK_IO, &dev->flags);
-			wake_up_interruptible(&dev->ioq);
-
-		}
-		break;
-#ifdef CM4000_DEBUG
-	case CM_IOSDBGLVL:
-		rc = -ENOTTY;
-		break;
-#endif
-	default:
-		DEBUGP(4, dev, "... in default (unknown IOCTL code)\n");
-		rc = -ENOTTY;
-	}
-out:
-	mutex_unlock(&cmm_mutex);
-	return rc;
-}
-
-static int cmm_open(struct inode *inode, struct file *filp)
-{
-	struct cm4000_dev *dev;
-	struct pcmcia_device *link;
-	int minor = iminor(inode);
-	int ret;
-
-	if (minor >= CM4000_MAX_DEV)
-		return -ENODEV;
-
-	mutex_lock(&cmm_mutex);
-	link = dev_table[minor];
-	if (link == NULL || !pcmcia_dev_present(link)) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (link->open) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	dev = link->priv;
-	filp->private_data = dev;
-
-	DEBUGP(2, dev, "-> cmm_open(device=%d.%d process=%s,%d)\n",
-	      imajor(inode), minor, current->comm, current->pid);
-
-	/* init device variables, they may be "polluted" after close
-	 * or, the device may never have been closed (i.e. open failed)
-	 */
-
-	ZERO_DEV(dev);
-
-	/* opening will always block since the
-	 * monitor will be started by open, which
-	 * means we have to wait for ATR becoming
-	 * valid = block until valid (or card
-	 * inserted)
-	 */
-	if (filp->f_flags & O_NONBLOCK) {
-		ret = -EAGAIN;
-		goto out;
-	}
-
-	dev->mdelay = T_50MSEC;
-
-	/* start monitoring the cardstatus */
-	start_monitor(dev);
-
-	link->open = 1;		/* only one open per device */
-
-	DEBUGP(2, dev, "<- cmm_open\n");
-	ret = stream_open(inode, filp);
-out:
-	mutex_unlock(&cmm_mutex);
-	return ret;
-}
-
-static int cmm_close(struct inode *inode, struct file *filp)
-{
-	struct cm4000_dev *dev;
-	struct pcmcia_device *link;
-	int minor = iminor(inode);
-
-	if (minor >= CM4000_MAX_DEV)
-		return -ENODEV;
-
-	link = dev_table[minor];
-	if (link == NULL)
-		return -ENODEV;
-
-	dev = link->priv;
-
-	DEBUGP(2, dev, "-> cmm_close(maj/min=%d.%d)\n",
-	       imajor(inode), minor);
-
-	stop_monitor(dev);
-
-	ZERO_DEV(dev);
-
-	link->open = 0;		/* only one open per device */
-	wake_up(&dev->devq);	/* socket removed? */
-
-	DEBUGP(2, dev, "cmm_close\n");
-	return 0;
-}
-
-static void cmm_cm4000_release(struct pcmcia_device * link)
-{
-	struct cm4000_dev *dev = link->priv;
-
-	/* dont terminate the monitor, rather rely on
-	 * close doing that for us.
-	 */
-	DEBUGP(3, dev, "-> cmm_cm4000_release\n");
-	while (link->open) {
-		printk(KERN_INFO MODULE_NAME ": delaying release until "
-		       "process has terminated\n");
-		/* note: don't interrupt us:
-		 * close the applications which own
-		 * the devices _first_ !
-		 */
-		wait_event(dev->devq, (link->open == 0));
-	}
-	/* dev->devq=NULL;	this cannot be zeroed earlier */
-	DEBUGP(3, dev, "<- cmm_cm4000_release\n");
-	return;
-}
-
-/*==== Interface to PCMCIA Layer =======================================*/
-
-static int cm4000_config_check(struct pcmcia_device *p_dev, void *priv_data)
-{
-	return pcmcia_request_io(p_dev);
-}
-
-static int cm4000_config(struct pcmcia_device * link, int devno)
-{
-	link->config_flags |= CONF_AUTO_SET_IO;
-
-	/* read the config-tuples */
-	if (pcmcia_loop_config(link, cm4000_config_check, NULL))
-		goto cs_release;
-
-	if (pcmcia_enable_device(link))
-		goto cs_release;
-
-	return 0;
-
-cs_release:
-	cm4000_release(link);
-	return -ENODEV;
-}
-
-static int cm4000_suspend(struct pcmcia_device *link)
-{
-	struct cm4000_dev *dev;
-
-	dev = link->priv;
-	stop_monitor(dev);
-
-	return 0;
-}
-
-static int cm4000_resume(struct pcmcia_device *link)
-{
-	struct cm4000_dev *dev;
-
-	dev = link->priv;
-	if (link->open)
-		start_monitor(dev);
-
-	return 0;
-}
-
-static void cm4000_release(struct pcmcia_device *link)
-{
-	cmm_cm4000_release(link);	/* delay release until device closed */
-	pcmcia_disable_device(link);
-}
-
-static int cm4000_probe(struct pcmcia_device *link)
-{
-	struct cm4000_dev *dev;
-	int i, ret;
-
-	for (i = 0; i < CM4000_MAX_DEV; i++)
-		if (dev_table[i] == NULL)
-			break;
-
-	if (i == CM4000_MAX_DEV) {
-		printk(KERN_NOTICE MODULE_NAME ": all devices in use\n");
-		return -ENODEV;
-	}
-
-	/* create a new cm4000_cs device */
-	dev = kzalloc(sizeof(struct cm4000_dev), GFP_KERNEL);
-	if (dev == NULL)
-		return -ENOMEM;
-
-	dev->p_dev = link;
-	link->priv = dev;
-	dev_table[i] = link;
-
-	init_waitqueue_head(&dev->devq);
-	init_waitqueue_head(&dev->ioq);
-	init_waitqueue_head(&dev->atrq);
-	init_waitqueue_head(&dev->readq);
-
-	ret = cm4000_config(link, i);
-	if (ret) {
-		dev_table[i] = NULL;
-		kfree(dev);
-		return ret;
-	}
-
-	device_create(cmm_class, NULL, MKDEV(major, i), NULL, "cmm%d", i);
-
-	return 0;
-}
-
-static void cm4000_detach(struct pcmcia_device *link)
-{
-	struct cm4000_dev *dev = link->priv;
-	int devno;
-
-	/* find device */
-	for (devno = 0; devno < CM4000_MAX_DEV; devno++)
-		if (dev_table[devno] == link)
-			break;
-	if (devno == CM4000_MAX_DEV)
-		return;
-
-	stop_monitor(dev);
-
-	cm4000_release(link);
-
-	dev_table[devno] = NULL;
-	kfree(dev);
-
-	device_destroy(cmm_class, MKDEV(major, devno));
-
-	return;
-}
-
-static const struct file_operations cm4000_fops = {
-	.owner	= THIS_MODULE,
-	.read	= cmm_read,
-	.write	= cmm_write,
-	.unlocked_ioctl	= cmm_ioctl,
-	.open	= cmm_open,
-	.release= cmm_close,
-	.llseek = no_llseek,
-};
-
-static const struct pcmcia_device_id cm4000_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x0223, 0x0002),
-	PCMCIA_DEVICE_PROD_ID12("CardMan", "4000", 0x2FB368CA, 0xA2BD8C39),
-	PCMCIA_DEVICE_NULL,
-};
-MODULE_DEVICE_TABLE(pcmcia, cm4000_ids);
-
-static struct pcmcia_driver cm4000_driver = {
-	.owner	  = THIS_MODULE,
-	.name	  = "cm4000_cs",
-	.probe    = cm4000_probe,
-	.remove   = cm4000_detach,
-	.suspend  = cm4000_suspend,
-	.resume   = cm4000_resume,
-	.id_table = cm4000_ids,
-};
-
-static int __init cmm_init(void)
-{
-	int rc;
-
-	cmm_class = class_create(THIS_MODULE, "cardman_4000");
-	if (IS_ERR(cmm_class))
-		return PTR_ERR(cmm_class);
-
-	major = register_chrdev(0, DEVICE_NAME, &cm4000_fops);
-	if (major < 0) {
-		printk(KERN_WARNING MODULE_NAME
-			": could not get major number\n");
-		class_destroy(cmm_class);
-		return major;
-	}
-
-	rc = pcmcia_register_driver(&cm4000_driver);
-	if (rc < 0) {
-		unregister_chrdev(major, DEVICE_NAME);
-		class_destroy(cmm_class);
-		return rc;
-	}
-
-	return 0;
-}
-
-static void __exit cmm_exit(void)
-{
-	pcmcia_unregister_driver(&cm4000_driver);
-	unregister_chrdev(major, DEVICE_NAME);
-	class_destroy(cmm_class);
-};
-
-module_init(cmm_init);
-module_exit(cmm_exit);
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
deleted file mode 100644
index 827711911da4..000000000000
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ /dev/null
@@ -1,684 +0,0 @@
-/*
- * A driver for the Omnikey PCMCIA smartcard reader CardMan 4040
- *
- * (c) 2000-2004 Omnikey AG (http://www.omnikey.com/)
- *
- * (C) 2005-2006 Harald Welte <laforge@gnumonks.org>
- * 	- add support for poll()
- * 	- driver cleanup
- * 	- add waitqueues
- * 	- adhere to linux kernel coding style and policies
- * 	- support 2.6.13 "new style" pcmcia interface
- * 	- add class interface for udev device creation
- *
- * The device basically is a USB CCID compliant device that has been
- * attached to an I/O-Mapped FIFO.
- *
- * All rights reserved, Dual BSD/GPL Licensed.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/delay.h>
-#include <linux/poll.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/cisreg.h>
-#include <pcmcia/ciscode.h>
-#include <pcmcia/ds.h>
-
-#include "cm4040_cs.h"
-
-
-#define reader_to_dev(x)	(&x->p_dev->dev)
-
-/* n (debug level) is ignored */
-/* additional debug output may be enabled by re-compiling with
- * CM4040_DEBUG set */
-/* #define CM4040_DEBUG */
-#define DEBUGP(n, rdr, x, args...) do { 		\
-		dev_dbg(reader_to_dev(rdr), "%s:" x, 	\
-			   __func__ , ## args);		\
-	} while (0)
-
-static DEFINE_MUTEX(cm4040_mutex);
-
-#define	CCID_DRIVER_BULK_DEFAULT_TIMEOUT  	(150*HZ)
-#define	CCID_DRIVER_ASYNC_POWERUP_TIMEOUT 	(35*HZ)
-#define	CCID_DRIVER_MINIMUM_TIMEOUT 		(3*HZ)
-#define READ_WRITE_BUFFER_SIZE 512
-#define POLL_LOOP_COUNT				1000
-
-/* how often to poll for fifo status change */
-#define POLL_PERIOD 				msecs_to_jiffies(10)
-
-static void reader_release(struct pcmcia_device *link);
-
-static int major;
-static struct class *cmx_class;
-
-#define		BS_READABLE	0x01
-#define		BS_WRITABLE	0x02
-
-struct reader_dev {
-	struct pcmcia_device	*p_dev;
-	wait_queue_head_t	devq;
-	wait_queue_head_t	poll_wait;
-	wait_queue_head_t	read_wait;
-	wait_queue_head_t	write_wait;
-	unsigned long 	  	buffer_status;
-	unsigned long     	timeout;
-	unsigned char     	s_buf[READ_WRITE_BUFFER_SIZE];
-	unsigned char     	r_buf[READ_WRITE_BUFFER_SIZE];
-	struct timer_list 	poll_timer;
-};
-
-static struct pcmcia_device *dev_table[CM_MAX_DEV];
-
-#ifndef CM4040_DEBUG
-#define	xoutb	outb
-#define	xinb	inb
-#else
-static inline void xoutb(unsigned char val, unsigned short port)
-{
-	pr_debug("outb(val=%.2x,port=%.4x)\n", val, port);
-	outb(val, port);
-}
-
-static inline unsigned char xinb(unsigned short port)
-{
-	unsigned char val;
-
-	val = inb(port);
-	pr_debug("%.2x=inb(%.4x)\n", val, port);
-	return val;
-}
-#endif
-
-/* poll the device fifo status register.  not to be confused with
- * the poll syscall. */
-static void cm4040_do_poll(struct timer_list *t)
-{
-	struct reader_dev *dev = from_timer(dev, t, poll_timer);
-	unsigned int obs = xinb(dev->p_dev->resource[0]->start
-				+ REG_OFFSET_BUFFER_STATUS);
-
-	if ((obs & BSR_BULK_IN_FULL)) {
-		set_bit(BS_READABLE, &dev->buffer_status);
-		DEBUGP(4, dev, "waking up read_wait\n");
-		wake_up_interruptible(&dev->read_wait);
-	} else
-		clear_bit(BS_READABLE, &dev->buffer_status);
-
-	if (!(obs & BSR_BULK_OUT_FULL)) {
-		set_bit(BS_WRITABLE, &dev->buffer_status);
-		DEBUGP(4, dev, "waking up write_wait\n");
-		wake_up_interruptible(&dev->write_wait);
-	} else
-		clear_bit(BS_WRITABLE, &dev->buffer_status);
-
-	if (dev->buffer_status)
-		wake_up_interruptible(&dev->poll_wait);
-
-	mod_timer(&dev->poll_timer, jiffies + POLL_PERIOD);
-}
-
-static void cm4040_stop_poll(struct reader_dev *dev)
-{
-	del_timer_sync(&dev->poll_timer);
-}
-
-static int wait_for_bulk_out_ready(struct reader_dev *dev)
-{
-	int i, rc;
-	int iobase = dev->p_dev->resource[0]->start;
-
-	for (i = 0; i < POLL_LOOP_COUNT; i++) {
-		if ((xinb(iobase + REG_OFFSET_BUFFER_STATUS)
-		    & BSR_BULK_OUT_FULL) == 0) {
-			DEBUGP(4, dev, "BulkOut empty (i=%d)\n", i);
-			return 1;
-		}
-	}
-
-	DEBUGP(4, dev, "wait_event_interruptible_timeout(timeout=%ld\n",
-		dev->timeout);
-	rc = wait_event_interruptible_timeout(dev->write_wait,
-					      test_and_clear_bit(BS_WRITABLE,
-						       &dev->buffer_status),
-					      dev->timeout);
-
-	if (rc > 0)
-		DEBUGP(4, dev, "woke up: BulkOut empty\n");
-	else if (rc == 0)
-		DEBUGP(4, dev, "woke up: BulkOut full, returning 0 :(\n");
-	else if (rc < 0)
-		DEBUGP(4, dev, "woke up: signal arrived\n");
-
-	return rc;
-}
-
-/* Write to Sync Control Register */
-static int write_sync_reg(unsigned char val, struct reader_dev *dev)
-{
-	int iobase = dev->p_dev->resource[0]->start;
-	int rc;
-
-	rc = wait_for_bulk_out_ready(dev);
-	if (rc <= 0)
-		return rc;
-
-	xoutb(val, iobase + REG_OFFSET_SYNC_CONTROL);
-	rc = wait_for_bulk_out_ready(dev);
-	if (rc <= 0)
-		return rc;
-
-	return 1;
-}
-
-static int wait_for_bulk_in_ready(struct reader_dev *dev)
-{
-	int i, rc;
-	int iobase = dev->p_dev->resource[0]->start;
-
-	for (i = 0; i < POLL_LOOP_COUNT; i++) {
-		if ((xinb(iobase + REG_OFFSET_BUFFER_STATUS)
-		    & BSR_BULK_IN_FULL) == BSR_BULK_IN_FULL) {
-			DEBUGP(3, dev, "BulkIn full (i=%d)\n", i);
-			return 1;
-		}
-	}
-
-	DEBUGP(4, dev, "wait_event_interruptible_timeout(timeout=%ld\n",
-		dev->timeout);
-	rc = wait_event_interruptible_timeout(dev->read_wait,
-					      test_and_clear_bit(BS_READABLE,
-						 	&dev->buffer_status),
-					      dev->timeout);
-	if (rc > 0)
-		DEBUGP(4, dev, "woke up: BulkIn full\n");
-	else if (rc == 0)
-		DEBUGP(4, dev, "woke up: BulkIn not full, returning 0 :(\n");
-	else if (rc < 0)
-		DEBUGP(4, dev, "woke up: signal arrived\n");
-
-	return rc;
-}
-
-static ssize_t cm4040_read(struct file *filp, char __user *buf,
-			size_t count, loff_t *ppos)
-{
-	struct reader_dev *dev = filp->private_data;
-	int iobase = dev->p_dev->resource[0]->start;
-	size_t bytes_to_read;
-	unsigned long i;
-	size_t min_bytes_to_read;
-	int rc;
-
-	DEBUGP(2, dev, "-> cm4040_read(%s,%d)\n", current->comm, current->pid);
-
-	if (count == 0)
-		return 0;
-
-	if (count < 10)
-		return -EFAULT;
-
-	if (filp->f_flags & O_NONBLOCK) {
-		DEBUGP(4, dev, "filep->f_flags O_NONBLOCK set\n");
-		DEBUGP(2, dev, "<- cm4040_read (failure)\n");
-		return -EAGAIN;
-	}
-
-	if (!pcmcia_dev_present(dev->p_dev))
-		return -ENODEV;
-
-	for (i = 0; i < 5; i++) {
-		rc = wait_for_bulk_in_ready(dev);
-		if (rc <= 0) {
-			DEBUGP(5, dev, "wait_for_bulk_in_ready rc=%.2x\n", rc);
-			DEBUGP(2, dev, "<- cm4040_read (failed)\n");
-			if (rc == -ERESTARTSYS)
-				return rc;
-			return -EIO;
-		}
-	  	dev->r_buf[i] = xinb(iobase + REG_OFFSET_BULK_IN);
-#ifdef CM4040_DEBUG
-		pr_debug("%lu:%2x ", i, dev->r_buf[i]);
-	}
-	pr_debug("\n");
-#else
-	}
-#endif
-
-	bytes_to_read = 5 + le32_to_cpu(*(__le32 *)&dev->r_buf[1]);
-
-	DEBUGP(6, dev, "BytesToRead=%zu\n", bytes_to_read);
-
-	min_bytes_to_read = min(count, bytes_to_read + 5);
-	min_bytes_to_read = min_t(size_t, min_bytes_to_read, READ_WRITE_BUFFER_SIZE);
-
-	DEBUGP(6, dev, "Min=%zu\n", min_bytes_to_read);
-
-	for (i = 0; i < (min_bytes_to_read-5); i++) {
-		rc = wait_for_bulk_in_ready(dev);
-		if (rc <= 0) {
-			DEBUGP(5, dev, "wait_for_bulk_in_ready rc=%.2x\n", rc);
-			DEBUGP(2, dev, "<- cm4040_read (failed)\n");
-			if (rc == -ERESTARTSYS)
-				return rc;
-			return -EIO;
-		}
-		dev->r_buf[i+5] = xinb(iobase + REG_OFFSET_BULK_IN);
-#ifdef CM4040_DEBUG
-		pr_debug("%lu:%2x ", i, dev->r_buf[i]);
-	}
-	pr_debug("\n");
-#else
-	}
-#endif
-
-	*ppos = min_bytes_to_read;
-	if (copy_to_user(buf, dev->r_buf, min_bytes_to_read))
-		return -EFAULT;
-
-	rc = wait_for_bulk_in_ready(dev);
-	if (rc <= 0) {
-		DEBUGP(5, dev, "wait_for_bulk_in_ready rc=%.2x\n", rc);
-		DEBUGP(2, dev, "<- cm4040_read (failed)\n");
-		if (rc == -ERESTARTSYS)
-			return rc;
-		return -EIO;
-	}
-
-	rc = write_sync_reg(SCR_READER_TO_HOST_DONE, dev);
-	if (rc <= 0) {
-		DEBUGP(5, dev, "write_sync_reg c=%.2x\n", rc);
-		DEBUGP(2, dev, "<- cm4040_read (failed)\n");
-		if (rc == -ERESTARTSYS)
-			return rc;
-		else
-			return -EIO;
-	}
-
-	xinb(iobase + REG_OFFSET_BULK_IN);
-
-	DEBUGP(2, dev, "<- cm4040_read (successfully)\n");
-	return min_bytes_to_read;
-}
-
-static ssize_t cm4040_write(struct file *filp, const char __user *buf,
-			 size_t count, loff_t *ppos)
-{
-	struct reader_dev *dev = filp->private_data;
-	int iobase = dev->p_dev->resource[0]->start;
-	ssize_t rc;
-	int i;
-	unsigned int bytes_to_write;
-
-	DEBUGP(2, dev, "-> cm4040_write(%s,%d)\n", current->comm, current->pid);
-
-	if (count == 0) {
-		DEBUGP(2, dev, "<- cm4040_write empty read (successfully)\n");
-		return 0;
-	}
-
-	if ((count < 5) || (count > READ_WRITE_BUFFER_SIZE)) {
-		DEBUGP(2, dev, "<- cm4040_write buffersize=%zd < 5\n", count);
-		return -EIO;
-	}
-
-	if (filp->f_flags & O_NONBLOCK) {
-		DEBUGP(4, dev, "filep->f_flags O_NONBLOCK set\n");
-		DEBUGP(4, dev, "<- cm4040_write (failure)\n");
-		return -EAGAIN;
-	}
-
-	if (!pcmcia_dev_present(dev->p_dev))
-		return -ENODEV;
-
-	bytes_to_write = count;
-	if (copy_from_user(dev->s_buf, buf, bytes_to_write))
-		return -EFAULT;
-
-	switch (dev->s_buf[0]) {
-		case CMD_PC_TO_RDR_XFRBLOCK:
-		case CMD_PC_TO_RDR_SECURE:
-		case CMD_PC_TO_RDR_TEST_SECURE:
-		case CMD_PC_TO_RDR_OK_SECURE:
-			dev->timeout = CCID_DRIVER_BULK_DEFAULT_TIMEOUT;
-			break;
-
-		case CMD_PC_TO_RDR_ICCPOWERON:
-			dev->timeout = CCID_DRIVER_ASYNC_POWERUP_TIMEOUT;
-			break;
-
-		case CMD_PC_TO_RDR_GETSLOTSTATUS:
-		case CMD_PC_TO_RDR_ICCPOWEROFF:
-		case CMD_PC_TO_RDR_GETPARAMETERS:
-		case CMD_PC_TO_RDR_RESETPARAMETERS:
-		case CMD_PC_TO_RDR_SETPARAMETERS:
-		case CMD_PC_TO_RDR_ESCAPE:
-		case CMD_PC_TO_RDR_ICCCLOCK:
-		default:
-			dev->timeout = CCID_DRIVER_MINIMUM_TIMEOUT;
-			break;
-	}
-
-	rc = write_sync_reg(SCR_HOST_TO_READER_START, dev);
-	if (rc <= 0) {
-		DEBUGP(5, dev, "write_sync_reg c=%.2zx\n", rc);
-		DEBUGP(2, dev, "<- cm4040_write (failed)\n");
-		if (rc == -ERESTARTSYS)
-			return rc;
-		else
-			return -EIO;
-	}
-
-	DEBUGP(4, dev, "start \n");
-
-	for (i = 0; i < bytes_to_write; i++) {
-		rc = wait_for_bulk_out_ready(dev);
-		if (rc <= 0) {
-			DEBUGP(5, dev, "wait_for_bulk_out_ready rc=%.2zx\n",
-			       rc);
-			DEBUGP(2, dev, "<- cm4040_write (failed)\n");
-			if (rc == -ERESTARTSYS)
-				return rc;
-			else
-				return -EIO;
-		}
-
-		xoutb(dev->s_buf[i],iobase + REG_OFFSET_BULK_OUT);
-	}
-	DEBUGP(4, dev, "end\n");
-
-	rc = write_sync_reg(SCR_HOST_TO_READER_DONE, dev);
-
-	if (rc <= 0) {
-		DEBUGP(5, dev, "write_sync_reg c=%.2zx\n", rc);
-		DEBUGP(2, dev, "<- cm4040_write (failed)\n");
-		if (rc == -ERESTARTSYS)
-			return rc;
-		else
-			return -EIO;
-	}
-
-	DEBUGP(2, dev, "<- cm4040_write (successfully)\n");
-	return count;
-}
-
-static __poll_t cm4040_poll(struct file *filp, poll_table *wait)
-{
-	struct reader_dev *dev = filp->private_data;
-	__poll_t mask = 0;
-
-	poll_wait(filp, &dev->poll_wait, wait);
-
-	if (test_and_clear_bit(BS_READABLE, &dev->buffer_status))
-		mask |= EPOLLIN | EPOLLRDNORM;
-	if (test_and_clear_bit(BS_WRITABLE, &dev->buffer_status))
-		mask |= EPOLLOUT | EPOLLWRNORM;
-
-	DEBUGP(2, dev, "<- cm4040_poll(%u)\n", mask);
-
-	return mask;
-}
-
-static int cm4040_open(struct inode *inode, struct file *filp)
-{
-	struct reader_dev *dev;
-	struct pcmcia_device *link;
-	int minor = iminor(inode);
-	int ret;
-
-	if (minor >= CM_MAX_DEV)
-		return -ENODEV;
-
-	mutex_lock(&cm4040_mutex);
-	link = dev_table[minor];
-	if (link == NULL || !pcmcia_dev_present(link)) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (link->open) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	dev = link->priv;
-	filp->private_data = dev;
-
-	if (filp->f_flags & O_NONBLOCK) {
-		DEBUGP(4, dev, "filep->f_flags O_NONBLOCK set\n");
-		ret = -EAGAIN;
-		goto out;
-	}
-
-	link->open = 1;
-
-	mod_timer(&dev->poll_timer, jiffies + POLL_PERIOD);
-
-	DEBUGP(2, dev, "<- cm4040_open (successfully)\n");
-	ret = nonseekable_open(inode, filp);
-out:
-	mutex_unlock(&cm4040_mutex);
-	return ret;
-}
-
-static int cm4040_close(struct inode *inode, struct file *filp)
-{
-	struct reader_dev *dev = filp->private_data;
-	struct pcmcia_device *link;
-	int minor = iminor(inode);
-
-	DEBUGP(2, dev, "-> cm4040_close(maj/min=%d.%d)\n", imajor(inode),
-	      iminor(inode));
-
-	if (minor >= CM_MAX_DEV)
-		return -ENODEV;
-
-	link = dev_table[minor];
-	if (link == NULL)
-		return -ENODEV;
-
-	cm4040_stop_poll(dev);
-
-	link->open = 0;
-	wake_up(&dev->devq);
-
-	DEBUGP(2, dev, "<- cm4040_close\n");
-	return 0;
-}
-
-static void cm4040_reader_release(struct pcmcia_device *link)
-{
-	struct reader_dev *dev = link->priv;
-
-	DEBUGP(3, dev, "-> cm4040_reader_release\n");
-	while (link->open) {
-		DEBUGP(3, dev, MODULE_NAME ": delaying release "
-		       "until process has terminated\n");
- 		wait_event(dev->devq, (link->open == 0));
-	}
-	DEBUGP(3, dev, "<- cm4040_reader_release\n");
-	return;
-}
-
-static int cm4040_config_check(struct pcmcia_device *p_dev, void *priv_data)
-{
-	return pcmcia_request_io(p_dev);
-}
-
-
-static int reader_config(struct pcmcia_device *link, int devno)
-{
-	struct reader_dev *dev;
-	int fail_rc;
-
-	link->config_flags |= CONF_AUTO_SET_IO;
-
-	if (pcmcia_loop_config(link, cm4040_config_check, NULL))
-		goto cs_release;
-
-	fail_rc = pcmcia_enable_device(link);
-	if (fail_rc != 0) {
-		dev_info(&link->dev, "pcmcia_enable_device failed 0x%x\n",
-			 fail_rc);
-		goto cs_release;
-	}
-
-	dev = link->priv;
-
-	DEBUGP(2, dev, "device " DEVICE_NAME "%d at %pR\n", devno,
-	      link->resource[0]);
-	DEBUGP(2, dev, "<- reader_config (succ)\n");
-
-	return 0;
-
-cs_release:
-	reader_release(link);
-	return -ENODEV;
-}
-
-static void reader_release(struct pcmcia_device *link)
-{
-	cm4040_reader_release(link);
-	pcmcia_disable_device(link);
-}
-
-static int reader_probe(struct pcmcia_device *link)
-{
-	struct reader_dev *dev;
-	int i, ret;
-
-	for (i = 0; i < CM_MAX_DEV; i++) {
-		if (dev_table[i] == NULL)
-			break;
-	}
-
-	if (i == CM_MAX_DEV)
-		return -ENODEV;
-
-	dev = kzalloc(sizeof(struct reader_dev), GFP_KERNEL);
-	if (dev == NULL)
-		return -ENOMEM;
-
-	dev->timeout = CCID_DRIVER_MINIMUM_TIMEOUT;
-	dev->buffer_status = 0;
-
-	link->priv = dev;
-	dev->p_dev = link;
-
-	dev_table[i] = link;
-
-	init_waitqueue_head(&dev->devq);
-	init_waitqueue_head(&dev->poll_wait);
-	init_waitqueue_head(&dev->read_wait);
-	init_waitqueue_head(&dev->write_wait);
-	timer_setup(&dev->poll_timer, cm4040_do_poll, 0);
-
-	ret = reader_config(link, i);
-	if (ret) {
-		dev_table[i] = NULL;
-		kfree(dev);
-		return ret;
-	}
-
-	device_create(cmx_class, NULL, MKDEV(major, i), NULL, "cmx%d", i);
-
-	return 0;
-}
-
-static void reader_detach(struct pcmcia_device *link)
-{
-	struct reader_dev *dev = link->priv;
-	int devno;
-
-	/* find device */
-	for (devno = 0; devno < CM_MAX_DEV; devno++) {
-		if (dev_table[devno] == link)
-			break;
-	}
-	if (devno == CM_MAX_DEV)
-		return;
-
-	reader_release(link);
-
-	dev_table[devno] = NULL;
-	kfree(dev);
-
-	device_destroy(cmx_class, MKDEV(major, devno));
-
-	return;
-}
-
-static const struct file_operations reader_fops = {
-	.owner		= THIS_MODULE,
-	.read		= cm4040_read,
-	.write		= cm4040_write,
-	.open		= cm4040_open,
-	.release	= cm4040_close,
-	.poll		= cm4040_poll,
-	.llseek		= no_llseek,
-};
-
-static const struct pcmcia_device_id cm4040_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x0223, 0x0200),
-	PCMCIA_DEVICE_PROD_ID12("OMNIKEY", "CardMan 4040",
-				0xE32CDD8C, 0x8F23318B),
-	PCMCIA_DEVICE_NULL,
-};
-MODULE_DEVICE_TABLE(pcmcia, cm4040_ids);
-
-static struct pcmcia_driver reader_driver = {
-  	.owner		= THIS_MODULE,
-	.name		= "cm4040_cs",
-	.probe		= reader_probe,
-	.remove		= reader_detach,
-	.id_table	= cm4040_ids,
-};
-
-static int __init cm4040_init(void)
-{
-	int rc;
-
-	cmx_class = class_create(THIS_MODULE, "cardman_4040");
-	if (IS_ERR(cmx_class))
-		return PTR_ERR(cmx_class);
-
-	major = register_chrdev(0, DEVICE_NAME, &reader_fops);
-	if (major < 0) {
-		printk(KERN_WARNING MODULE_NAME
-			": could not get major number\n");
-		class_destroy(cmx_class);
-		return major;
-	}
-
-	rc = pcmcia_register_driver(&reader_driver);
-	if (rc < 0) {
-		unregister_chrdev(major, DEVICE_NAME);
-		class_destroy(cmx_class);
-		return rc;
-	}
-
-	return 0;
-}
-
-static void __exit cm4040_exit(void)
-{
-	pcmcia_unregister_driver(&reader_driver);
-	unregister_chrdev(major, DEVICE_NAME);
-	class_destroy(cmx_class);
-}
-
-module_init(cm4040_init);
-module_exit(cm4040_exit);
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/char/pcmcia/cm4040_cs.h b/drivers/char/pcmcia/cm4040_cs.h
deleted file mode 100644
index e2ffff995d51..000000000000
--- a/drivers/char/pcmcia/cm4040_cs.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef	_CM4040_H_
-#define	_CM4040_H_
-
-#define	CM_MAX_DEV		4
-
-#define	DEVICE_NAME		"cmx"
-#define	MODULE_NAME		"cm4040_cs"
-
-#define REG_OFFSET_BULK_OUT      0
-#define REG_OFFSET_BULK_IN       0
-#define REG_OFFSET_BUFFER_STATUS 1
-#define REG_OFFSET_SYNC_CONTROL  2
-
-#define BSR_BULK_IN_FULL  0x02
-#define BSR_BULK_OUT_FULL 0x01
-
-#define SCR_HOST_TO_READER_START 0x80
-#define SCR_ABORT                0x40
-#define SCR_EN_NOTIFY            0x20
-#define SCR_ACK_NOTIFY           0x10
-#define SCR_READER_TO_HOST_DONE  0x08
-#define SCR_HOST_TO_READER_DONE  0x04
-#define SCR_PULSE_INTERRUPT      0x02
-#define SCR_POWER_DOWN           0x01
-
-
-#define  CMD_PC_TO_RDR_ICCPOWERON       0x62
-#define  CMD_PC_TO_RDR_GETSLOTSTATUS    0x65
-#define  CMD_PC_TO_RDR_ICCPOWEROFF      0x63
-#define  CMD_PC_TO_RDR_SECURE           0x69
-#define  CMD_PC_TO_RDR_GETPARAMETERS    0x6C
-#define  CMD_PC_TO_RDR_RESETPARAMETERS  0x6D
-#define  CMD_PC_TO_RDR_SETPARAMETERS    0x61
-#define  CMD_PC_TO_RDR_XFRBLOCK         0x6F
-#define  CMD_PC_TO_RDR_ESCAPE           0x6B
-#define  CMD_PC_TO_RDR_ICCCLOCK         0x6E
-#define  CMD_PC_TO_RDR_TEST_SECURE      0x74
-#define  CMD_PC_TO_RDR_OK_SECURE        0x89
-
-
-#define  CMD_RDR_TO_PC_SLOTSTATUS         0x81
-#define  CMD_RDR_TO_PC_DATABLOCK          0x80
-#define  CMD_RDR_TO_PC_PARAMETERS         0x82
-#define  CMD_RDR_TO_PC_ESCAPE             0x83
-#define  CMD_RDR_TO_PC_OK_SECURE          0x89
-
-#endif	/* _CM4040_H_ */
diff --git a/drivers/char/pcmcia/scr24x_cs.c b/drivers/char/pcmcia/scr24x_cs.c
deleted file mode 100644
index 1bdce08fae3d..000000000000
--- a/drivers/char/pcmcia/scr24x_cs.c
+++ /dev/null
@@ -1,359 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SCR24x PCMCIA Smart Card Reader Driver
- *
- * Copyright (C) 2005-2006 TL Sudheendran
- * Copyright (C) 2016 Lubomir Rintel
- *
- * Derived from "scr24x_v4.2.6_Release.tar.gz" driver by TL Sudheendran.
- */
-
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/cdev.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/ds.h>
-
-#define CCID_HEADER_SIZE	10
-#define CCID_LENGTH_OFFSET	1
-#define CCID_MAX_LEN		271
-
-#define SCR24X_DATA(n)		(1 + n)
-#define SCR24X_CMD_STATUS	7
-#define CMD_START		0x40
-#define CMD_WRITE_BYTE		0x41
-#define CMD_READ_BYTE		0x42
-#define STATUS_BUSY		0x80
-
-struct scr24x_dev {
-	struct device *dev;
-	struct cdev c_dev;
-	unsigned char buf[CCID_MAX_LEN];
-	int devno;
-	struct mutex lock;
-	struct kref refcnt;
-	u8 __iomem *regs;
-};
-
-#define SCR24X_DEVS 8
-static DECLARE_BITMAP(scr24x_minors, SCR24X_DEVS);
-
-static struct class *scr24x_class;
-static dev_t scr24x_devt;
-
-static void scr24x_delete(struct kref *kref)
-{
-	struct scr24x_dev *dev = container_of(kref, struct scr24x_dev,
-								refcnt);
-
-	kfree(dev);
-}
-
-static int scr24x_wait_ready(struct scr24x_dev *dev)
-{
-	u_char status;
-	int timeout = 100;
-
-	do {
-		status = ioread8(dev->regs + SCR24X_CMD_STATUS);
-		if (!(status & STATUS_BUSY))
-			return 0;
-
-		msleep(20);
-	} while (--timeout);
-
-	return -EIO;
-}
-
-static int scr24x_open(struct inode *inode, struct file *filp)
-{
-	struct scr24x_dev *dev = container_of(inode->i_cdev,
-				struct scr24x_dev, c_dev);
-
-	kref_get(&dev->refcnt);
-	filp->private_data = dev;
-
-	return stream_open(inode, filp);
-}
-
-static int scr24x_release(struct inode *inode, struct file *filp)
-{
-	struct scr24x_dev *dev = filp->private_data;
-
-	/* We must not take the dev->lock here as scr24x_delete()
-	 * might be called to remove the dev structure altogether.
-	 * We don't need the lock anyway, since after the reference
-	 * acquired in probe() is released in remove() the chrdev
-	 * is already unregistered and noone can possibly acquire
-	 * a reference via open() anymore. */
-	kref_put(&dev->refcnt, scr24x_delete);
-	return 0;
-}
-
-static int read_chunk(struct scr24x_dev *dev, size_t offset, size_t limit)
-{
-	size_t i, y;
-	int ret;
-
-	for (i = offset; i < limit; i += 5) {
-		iowrite8(CMD_READ_BYTE, dev->regs + SCR24X_CMD_STATUS);
-		ret = scr24x_wait_ready(dev);
-		if (ret < 0)
-			return ret;
-
-		for (y = 0; y < 5 && i + y < limit; y++)
-			dev->buf[i + y] = ioread8(dev->regs + SCR24X_DATA(y));
-	}
-
-	return 0;
-}
-
-static ssize_t scr24x_read(struct file *filp, char __user *buf, size_t count,
-								loff_t *ppos)
-{
-	struct scr24x_dev *dev = filp->private_data;
-	int ret;
-	int len;
-
-	if (count < CCID_HEADER_SIZE)
-		return -EINVAL;
-
-	if (mutex_lock_interruptible(&dev->lock))
-		return -ERESTARTSYS;
-
-	if (!dev->dev) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	ret = scr24x_wait_ready(dev);
-	if (ret < 0)
-		goto out;
-	len = CCID_HEADER_SIZE;
-	ret = read_chunk(dev, 0, len);
-	if (ret < 0)
-		goto out;
-
-	len += le32_to_cpu(*(__le32 *)(&dev->buf[CCID_LENGTH_OFFSET]));
-	if (len > sizeof(dev->buf)) {
-		ret = -EIO;
-		goto out;
-	}
-	ret = read_chunk(dev, CCID_HEADER_SIZE, len);
-	if (ret < 0)
-		goto out;
-
-	if (len < count)
-		count = len;
-
-	if (copy_to_user(buf, dev->buf, count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	ret = count;
-out:
-	mutex_unlock(&dev->lock);
-	return ret;
-}
-
-static ssize_t scr24x_write(struct file *filp, const char __user *buf,
-					size_t count, loff_t *ppos)
-{
-	struct scr24x_dev *dev = filp->private_data;
-	size_t i, y;
-	int ret;
-
-	if (mutex_lock_interruptible(&dev->lock))
-		return -ERESTARTSYS;
-
-	if (!dev->dev) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (count > sizeof(dev->buf)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (copy_from_user(dev->buf, buf, count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	ret = scr24x_wait_ready(dev);
-	if (ret < 0)
-		goto out;
-
-	iowrite8(CMD_START, dev->regs + SCR24X_CMD_STATUS);
-	ret = scr24x_wait_ready(dev);
-	if (ret < 0)
-		goto out;
-
-	for (i = 0; i < count; i += 5) {
-		for (y = 0; y < 5 && i + y < count; y++)
-			iowrite8(dev->buf[i + y], dev->regs + SCR24X_DATA(y));
-
-		iowrite8(CMD_WRITE_BYTE, dev->regs + SCR24X_CMD_STATUS);
-		ret = scr24x_wait_ready(dev);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = count;
-out:
-	mutex_unlock(&dev->lock);
-	return ret;
-}
-
-static const struct file_operations scr24x_fops = {
-	.owner		= THIS_MODULE,
-	.read		= scr24x_read,
-	.write		= scr24x_write,
-	.open		= scr24x_open,
-	.release	= scr24x_release,
-	.llseek		= no_llseek,
-};
-
-static int scr24x_config_check(struct pcmcia_device *link, void *priv_data)
-{
-	if (resource_size(link->resource[PCMCIA_IOPORT_0]) != 0x11)
-		return -ENODEV;
-	return pcmcia_request_io(link);
-}
-
-static int scr24x_probe(struct pcmcia_device *link)
-{
-	struct scr24x_dev *dev;
-	int ret;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
-
-	dev->devno = find_first_zero_bit(scr24x_minors, SCR24X_DEVS);
-	if (dev->devno >= SCR24X_DEVS) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	mutex_init(&dev->lock);
-	kref_init(&dev->refcnt);
-
-	link->priv = dev;
-	link->config_flags |= CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
-
-	ret = pcmcia_loop_config(link, scr24x_config_check, NULL);
-	if (ret < 0)
-		goto err;
-
-	dev->dev = &link->dev;
-	dev->regs = devm_ioport_map(&link->dev,
-				link->resource[PCMCIA_IOPORT_0]->start,
-				resource_size(link->resource[PCMCIA_IOPORT_0]));
-	if (!dev->regs) {
-		ret = -EIO;
-		goto err;
-	}
-
-	cdev_init(&dev->c_dev, &scr24x_fops);
-	dev->c_dev.owner = THIS_MODULE;
-	ret = cdev_add(&dev->c_dev, MKDEV(MAJOR(scr24x_devt), dev->devno), 1);
-	if (ret < 0)
-		goto err;
-
-	ret = pcmcia_enable_device(link);
-	if (ret < 0) {
-		pcmcia_disable_device(link);
-		goto err;
-	}
-
-	device_create(scr24x_class, NULL, MKDEV(MAJOR(scr24x_devt), dev->devno),
-		      NULL, "scr24x%d", dev->devno);
-
-	dev_info(&link->dev, "SCR24x Chip Card Interface\n");
-	return 0;
-
-err:
-	if (dev->devno < SCR24X_DEVS)
-		clear_bit(dev->devno, scr24x_minors);
-	kfree (dev);
-	return ret;
-}
-
-static void scr24x_remove(struct pcmcia_device *link)
-{
-	struct scr24x_dev *dev = (struct scr24x_dev *)link->priv;
-
-	device_destroy(scr24x_class, MKDEV(MAJOR(scr24x_devt), dev->devno));
-	mutex_lock(&dev->lock);
-	pcmcia_disable_device(link);
-	cdev_del(&dev->c_dev);
-	clear_bit(dev->devno, scr24x_minors);
-	dev->dev = NULL;
-	mutex_unlock(&dev->lock);
-
-	kref_put(&dev->refcnt, scr24x_delete);
-}
-
-static const struct pcmcia_device_id scr24x_ids[] = {
-	PCMCIA_DEVICE_PROD_ID12("HP", "PC Card Smart Card Reader",
-					0x53cb94f9, 0xbfdf89a5),
-	PCMCIA_DEVICE_PROD_ID1("SCR241 PCMCIA", 0x6271efa3),
-	PCMCIA_DEVICE_PROD_ID1("SCR243 PCMCIA", 0x2054e8de),
-	PCMCIA_DEVICE_PROD_ID1("SCR24x PCMCIA", 0x54a33665),
-	PCMCIA_DEVICE_NULL
-};
-MODULE_DEVICE_TABLE(pcmcia, scr24x_ids);
-
-static struct pcmcia_driver scr24x_driver = {
-	.owner		= THIS_MODULE,
-	.name		= "scr24x_cs",
-	.probe		= scr24x_probe,
-	.remove		= scr24x_remove,
-	.id_table	= scr24x_ids,
-};
-
-static int __init scr24x_init(void)
-{
-	int ret;
-
-	scr24x_class = class_create(THIS_MODULE, "scr24x");
-	if (IS_ERR(scr24x_class))
-		return PTR_ERR(scr24x_class);
-
-	ret = alloc_chrdev_region(&scr24x_devt, 0, SCR24X_DEVS, "scr24x");
-	if (ret < 0)  {
-		class_destroy(scr24x_class);
-		return ret;
-	}
-
-	ret = pcmcia_register_driver(&scr24x_driver);
-	if (ret < 0) {
-		unregister_chrdev_region(scr24x_devt, SCR24X_DEVS);
-		class_destroy(scr24x_class);
-	}
-
-	return ret;
-}
-
-static void __exit scr24x_exit(void)
-{
-	pcmcia_unregister_driver(&scr24x_driver);
-	unregister_chrdev_region(scr24x_devt, SCR24X_DEVS);
-	class_destroy(scr24x_class);
-}
-
-module_init(scr24x_init);
-module_exit(scr24x_exit);
-
-MODULE_AUTHOR("Lubomir Rintel");
-MODULE_DESCRIPTION("SCR24x PCMCIA Smart Card Reader Driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
deleted file mode 100644
index 6ddfeb2fe98f..000000000000
--- a/drivers/char/pcmcia/synclink_cs.c
+++ /dev/null
@@ -1,4290 +0,0 @@
-/*
- * linux/drivers/char/pcmcia/synclink_cs.c
- *
- * $Id: synclink_cs.c,v 4.34 2005/09/08 13:20:54 paulkf Exp $
- *
- * Device driver for Microgate SyncLink PC Card
- * multiprotocol serial adapter.
- *
- * written by Paul Fulghum for Microgate Corporation
- * paulkf@microgate.com
- *
- * Microgate and SyncLink are trademarks of Microgate Corporation
- *
- * This code is released under the GNU General Public License (GPL)
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define VERSION(ver,rel,seq) (((ver)<<16) | ((rel)<<8) | (seq))
-#if defined(__i386__)
-#  define BREAKPOINT() asm("   int $3");
-#else
-#  define BREAKPOINT() { }
-#endif
-
-#define MAX_DEVICE_COUNT 4
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/time.h>
-#include <linux/interrupt.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/serial.h>
-#include <linux/major.h>
-#include <linux/string.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/ioctl.h>
-#include <linux/synclink.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/dma.h>
-#include <linux/bitops.h>
-#include <asm/types.h>
-#include <linux/termios.h>
-#include <linux/workqueue.h>
-#include <linux/hdlc.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/cisreg.h>
-#include <pcmcia/ds.h>
-
-#if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_CS_MODULE))
-#define SYNCLINK_GENERIC_HDLC 1
-#else
-#define SYNCLINK_GENERIC_HDLC 0
-#endif
-
-#define GET_USER(error,value,addr) error = get_user(value,addr)
-#define COPY_FROM_USER(error,dest,src,size) error = copy_from_user(dest,src,size) ? -EFAULT : 0
-#define PUT_USER(error,value,addr) error = put_user(value,addr)
-#define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? -EFAULT : 0
-
-#include <linux/uaccess.h>
-
-static MGSL_PARAMS default_params = {
-	MGSL_MODE_HDLC,			/* unsigned long mode */
-	0,				/* unsigned char loopback; */
-	HDLC_FLAG_UNDERRUN_ABORT15,	/* unsigned short flags; */
-	HDLC_ENCODING_NRZI_SPACE,	/* unsigned char encoding; */
-	0,				/* unsigned long clock_speed; */
-	0xff,				/* unsigned char addr_filter; */
-	HDLC_CRC_16_CCITT,		/* unsigned short crc_type; */
-	HDLC_PREAMBLE_LENGTH_8BITS,	/* unsigned char preamble_length; */
-	HDLC_PREAMBLE_PATTERN_NONE,	/* unsigned char preamble; */
-	9600,				/* unsigned long data_rate; */
-	8,				/* unsigned char data_bits; */
-	1,				/* unsigned char stop_bits; */
-	ASYNC_PARITY_NONE		/* unsigned char parity; */
-};
-
-typedef struct {
-	int count;
-	unsigned char status;
-	char data[1];
-} RXBUF;
-
-/* The queue of BH actions to be performed */
-
-#define BH_RECEIVE  1
-#define BH_TRANSMIT 2
-#define BH_STATUS   4
-
-#define IO_PIN_SHUTDOWN_LIMIT 100
-
-#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
-
-struct _input_signal_events {
-	int	ri_up;
-	int	ri_down;
-	int	dsr_up;
-	int	dsr_down;
-	int	dcd_up;
-	int	dcd_down;
-	int	cts_up;
-	int	cts_down;
-};
-
-
-/*
- * Device instance data structure
- */
-
-typedef struct _mgslpc_info {
-	struct tty_port		port;
-	void *if_ptr;	/* General purpose pointer (used by SPPP) */
-	int			magic;
-	int			line;
-
-	struct mgsl_icount	icount;
-
-	int			timeout;
-	int			x_char;		/* xon/xoff character */
-	unsigned char		read_status_mask;
-	unsigned char		ignore_status_mask;
-
-	unsigned char *tx_buf;
-	int            tx_put;
-	int            tx_get;
-	int            tx_count;
-
-	/* circular list of fixed length rx buffers */
-
-	unsigned char  *rx_buf;        /* memory allocated for all rx buffers */
-	int            rx_buf_total_size; /* size of memory allocated for rx buffers */
-	int            rx_put;         /* index of next empty rx buffer */
-	int            rx_get;         /* index of next full rx buffer */
-	int            rx_buf_size;    /* size in bytes of single rx buffer */
-	int            rx_buf_count;   /* total number of rx buffers */
-	int            rx_frame_count; /* number of full rx buffers */
-
-	wait_queue_head_t	status_event_wait_q;
-	wait_queue_head_t	event_wait_q;
-	struct timer_list	tx_timer;	/* HDLC transmit timeout timer */
-	struct _mgslpc_info	*next_device;	/* device list link */
-
-	unsigned short imra_value;
-	unsigned short imrb_value;
-	unsigned char  pim_value;
-
-	spinlock_t lock;
-	struct work_struct task;		/* task structure for scheduling bh */
-
-	u32 max_frame_size;
-
-	u32 pending_bh;
-
-	bool bh_running;
-	bool bh_requested;
-
-	int dcd_chkcount; /* check counts to prevent */
-	int cts_chkcount; /* too many IRQs if a signal */
-	int dsr_chkcount; /* is floating */
-	int ri_chkcount;
-
-	bool rx_enabled;
-	bool rx_overflow;
-
-	bool tx_enabled;
-	bool tx_active;
-	bool tx_aborting;
-	u32 idle_mode;
-
-	int if_mode; /* serial interface selection (RS-232, v.35 etc) */
-
-	char device_name[25];		/* device instance name */
-
-	unsigned int io_base;	/* base I/O address of adapter */
-	unsigned int irq_level;
-
-	MGSL_PARAMS params;		/* communications parameters */
-
-	unsigned char serial_signals;	/* current serial signal states */
-
-	bool irq_occurred;		/* for diagnostics use */
-	char testing_irq;
-	unsigned int init_error;	/* startup error (DIAGS)	*/
-
-	char *flag_buf;
-	bool drop_rts_on_tx_done;
-
-	struct	_input_signal_events	input_signal_events;
-
-	/* PCMCIA support */
-	struct pcmcia_device	*p_dev;
-	int		      stop;
-
-	/* SPPP/Cisco HDLC device parts */
-	int netcount;
-	spinlock_t netlock;
-
-#if SYNCLINK_GENERIC_HDLC
-	struct net_device *netdev;
-#endif
-
-} MGSLPC_INFO;
-
-#define MGSLPC_MAGIC 0x5402
-
-/*
- * The size of the serial xmit buffer is 1 page, or 4096 bytes
- */
-#define TXBUFSIZE 4096
-
-
-#define CHA     0x00   /* channel A offset */
-#define CHB     0x40   /* channel B offset */
-
-/*
- *  FIXME: PPC has PVR defined in asm/reg.h.  For now we just undef it.
- */
-#undef PVR
-
-#define RXFIFO  0
-#define TXFIFO  0
-#define STAR    0x20
-#define CMDR    0x20
-#define RSTA    0x21
-#define PRE     0x21
-#define MODE    0x22
-#define TIMR    0x23
-#define XAD1    0x24
-#define XAD2    0x25
-#define RAH1    0x26
-#define RAH2    0x27
-#define DAFO    0x27
-#define RAL1    0x28
-#define RFC     0x28
-#define RHCR    0x29
-#define RAL2    0x29
-#define RBCL    0x2a
-#define XBCL    0x2a
-#define RBCH    0x2b
-#define XBCH    0x2b
-#define CCR0    0x2c
-#define CCR1    0x2d
-#define CCR2    0x2e
-#define CCR3    0x2f
-#define VSTR    0x34
-#define BGR     0x34
-#define RLCR    0x35
-#define AML     0x36
-#define AMH     0x37
-#define GIS     0x38
-#define IVA     0x38
-#define IPC     0x39
-#define ISR     0x3a
-#define IMR     0x3a
-#define PVR     0x3c
-#define PIS     0x3d
-#define PIM     0x3d
-#define PCR     0x3e
-#define CCR4    0x3f
-
-// IMR/ISR
-
-#define IRQ_BREAK_ON    BIT15   // rx break detected
-#define IRQ_DATAOVERRUN BIT14	// receive data overflow
-#define IRQ_ALLSENT     BIT13	// all sent
-#define IRQ_UNDERRUN    BIT12	// transmit data underrun
-#define IRQ_TIMER       BIT11	// timer interrupt
-#define IRQ_CTS         BIT10	// CTS status change
-#define IRQ_TXREPEAT    BIT9	// tx message repeat
-#define IRQ_TXFIFO      BIT8	// transmit pool ready
-#define IRQ_RXEOM       BIT7	// receive message end
-#define IRQ_EXITHUNT    BIT6	// receive frame start
-#define IRQ_RXTIME      BIT6    // rx char timeout
-#define IRQ_DCD         BIT2	// carrier detect status change
-#define IRQ_OVERRUN     BIT1	// receive frame overflow
-#define IRQ_RXFIFO      BIT0	// receive pool full
-
-// STAR
-
-#define XFW   BIT6		// transmit FIFO write enable
-#define CEC   BIT2		// command executing
-#define CTS   BIT1		// CTS state
-
-#define PVR_DTR      BIT0
-#define PVR_DSR      BIT1
-#define PVR_RI       BIT2
-#define PVR_AUTOCTS  BIT3
-#define PVR_RS232    0x20   /* 0010b */
-#define PVR_V35      0xe0   /* 1110b */
-#define PVR_RS422    0x40   /* 0100b */
-
-/* Register access functions */
-
-#define write_reg(info, reg, val) outb((val),(info)->io_base + (reg))
-#define read_reg(info, reg) inb((info)->io_base + (reg))
-
-#define read_reg16(info, reg) inw((info)->io_base + (reg))
-#define write_reg16(info, reg, val) outw((val), (info)->io_base + (reg))
-
-#define set_reg_bits(info, reg, mask) \
-	write_reg(info, (reg), \
-		 (unsigned char) (read_reg(info, (reg)) | (mask)))
-#define clear_reg_bits(info, reg, mask) \
-	write_reg(info, (reg), \
-		 (unsigned char) (read_reg(info, (reg)) & ~(mask)))
-/*
- * interrupt enable/disable routines
- */
-static void irq_disable(MGSLPC_INFO *info, unsigned char channel, unsigned short mask)
-{
-	if (channel == CHA) {
-		info->imra_value |= mask;
-		write_reg16(info, CHA + IMR, info->imra_value);
-	} else {
-		info->imrb_value |= mask;
-		write_reg16(info, CHB + IMR, info->imrb_value);
-	}
-}
-static void irq_enable(MGSLPC_INFO *info, unsigned char channel, unsigned short mask)
-{
-	if (channel == CHA) {
-		info->imra_value &= ~mask;
-		write_reg16(info, CHA + IMR, info->imra_value);
-	} else {
-		info->imrb_value &= ~mask;
-		write_reg16(info, CHB + IMR, info->imrb_value);
-	}
-}
-
-#define port_irq_disable(info, mask) \
-	{ info->pim_value |= (mask); write_reg(info, PIM, info->pim_value); }
-
-#define port_irq_enable(info, mask) \
-	{ info->pim_value &= ~(mask); write_reg(info, PIM, info->pim_value); }
-
-static void rx_start(MGSLPC_INFO *info);
-static void rx_stop(MGSLPC_INFO *info);
-
-static void tx_start(MGSLPC_INFO *info, struct tty_struct *tty);
-static void tx_stop(MGSLPC_INFO *info);
-static void tx_set_idle(MGSLPC_INFO *info);
-
-static void get_signals(MGSLPC_INFO *info);
-static void set_signals(MGSLPC_INFO *info);
-
-static void reset_device(MGSLPC_INFO *info);
-
-static void hdlc_mode(MGSLPC_INFO *info);
-static void async_mode(MGSLPC_INFO *info);
-
-static void tx_timeout(struct timer_list *t);
-
-static bool carrier_raised(struct tty_port *port);
-static void dtr_rts(struct tty_port *port, bool active);
-
-#if SYNCLINK_GENERIC_HDLC
-#define dev_to_port(D) (dev_to_hdlc(D)->priv)
-static void hdlcdev_tx_done(MGSLPC_INFO *info);
-static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int size);
-static int  hdlcdev_init(MGSLPC_INFO *info);
-static void hdlcdev_exit(MGSLPC_INFO *info);
-#endif
-
-static void trace_block(MGSLPC_INFO *info,const char* data, int count, int xmit);
-
-static bool register_test(MGSLPC_INFO *info);
-static bool irq_test(MGSLPC_INFO *info);
-static int adapter_test(MGSLPC_INFO *info);
-
-static int claim_resources(MGSLPC_INFO *info);
-static void release_resources(MGSLPC_INFO *info);
-static int mgslpc_add_device(MGSLPC_INFO *info);
-static void mgslpc_remove_device(MGSLPC_INFO *info);
-
-static bool rx_get_frame(MGSLPC_INFO *info, struct tty_struct *tty);
-static void rx_reset_buffers(MGSLPC_INFO *info);
-static int  rx_alloc_buffers(MGSLPC_INFO *info);
-static void rx_free_buffers(MGSLPC_INFO *info);
-
-static irqreturn_t mgslpc_isr(int irq, void *dev_id);
-
-/*
- * Bottom half interrupt handlers
- */
-static void bh_handler(struct work_struct *work);
-static void bh_transmit(MGSLPC_INFO *info, struct tty_struct *tty);
-static void bh_status(MGSLPC_INFO *info);
-
-/*
- * ioctl handlers
- */
-static int tiocmget(struct tty_struct *tty);
-static int tiocmset(struct tty_struct *tty,
-					unsigned int set, unsigned int clear);
-static int get_stats(MGSLPC_INFO *info, struct mgsl_icount __user *user_icount);
-static int get_params(MGSLPC_INFO *info, MGSL_PARAMS __user *user_params);
-static int set_params(MGSLPC_INFO *info, MGSL_PARAMS __user *new_params, struct tty_struct *tty);
-static int get_txidle(MGSLPC_INFO *info, int __user *idle_mode);
-static int set_txidle(MGSLPC_INFO *info, int idle_mode);
-static int set_txenable(MGSLPC_INFO *info, int enable, struct tty_struct *tty);
-static int tx_abort(MGSLPC_INFO *info);
-static int set_rxenable(MGSLPC_INFO *info, int enable);
-static int wait_events(MGSLPC_INFO *info, int __user *mask);
-
-static MGSLPC_INFO *mgslpc_device_list = NULL;
-static int mgslpc_device_count = 0;
-
-/*
- * Set this param to non-zero to load eax with the
- * .text section address and breakpoint on module load.
- * This is useful for use with gdb and add-symbol-file command.
- */
-static bool break_on_load;
-
-/*
- * Driver major number, defaults to zero to get auto
- * assigned major number. May be forced as module parameter.
- */
-static int ttymajor=0;
-
-static int debug_level = 0;
-static int maxframe[MAX_DEVICE_COUNT] = {0,};
-
-module_param(break_on_load, bool, 0);
-module_param(ttymajor, int, 0);
-module_param(debug_level, int, 0);
-module_param_array(maxframe, int, NULL, 0);
-
-MODULE_LICENSE("GPL");
-
-static char *driver_name = "SyncLink PC Card driver";
-static char *driver_version = "$Revision: 4.34 $";
-
-static struct tty_driver *serial_driver;
-
-/* number of characters left in xmit buffer before we ask for more */
-#define WAKEUP_CHARS 256
-
-static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty);
-static void mgslpc_wait_until_sent(struct tty_struct *tty, int timeout);
-
-/* PCMCIA prototypes */
-
-static int mgslpc_config(struct pcmcia_device *link);
-static void mgslpc_release(u_long arg);
-static void mgslpc_detach(struct pcmcia_device *p_dev);
-
-/*
- * 1st function defined in .text section. Calling this function in
- * init_module() followed by a breakpoint allows a remote debugger
- * (gdb) to get the .text address for the add-symbol-file command.
- * This allows remote debugging of dynamically loadable modules.
- */
-static void* mgslpc_get_text_ptr(void)
-{
-	return mgslpc_get_text_ptr;
-}
-
-/*
- * line discipline callback wrappers
- *
- * The wrappers maintain line discipline references
- * while calling into the line discipline.
- *
- * ldisc_receive_buf  - pass receive data to line discipline
- */
-
-static void ldisc_receive_buf(struct tty_struct *tty,
-			      const __u8 *data, char *flags, int count)
-{
-	struct tty_ldisc *ld;
-	if (!tty)
-		return;
-	ld = tty_ldisc_ref(tty);
-	if (ld) {
-		if (ld->ops->receive_buf)
-			ld->ops->receive_buf(tty, data, flags, count);
-		tty_ldisc_deref(ld);
-	}
-}
-
-static const struct tty_port_operations mgslpc_port_ops = {
-	.carrier_raised = carrier_raised,
-	.dtr_rts = dtr_rts
-};
-
-static int mgslpc_probe(struct pcmcia_device *link)
-{
-	MGSLPC_INFO *info;
-	int ret;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("mgslpc_attach\n");
-
-	info = kzalloc(sizeof(MGSLPC_INFO), GFP_KERNEL);
-	if (!info) {
-		printk("Error can't allocate device instance data\n");
-		return -ENOMEM;
-	}
-
-	info->magic = MGSLPC_MAGIC;
-	tty_port_init(&info->port);
-	info->port.ops = &mgslpc_port_ops;
-	INIT_WORK(&info->task, bh_handler);
-	info->max_frame_size = 4096;
-	init_waitqueue_head(&info->status_event_wait_q);
-	init_waitqueue_head(&info->event_wait_q);
-	spin_lock_init(&info->lock);
-	spin_lock_init(&info->netlock);
-	memcpy(&info->params,&default_params,sizeof(MGSL_PARAMS));
-	info->idle_mode = HDLC_TXIDLE_FLAGS;
-	info->imra_value = 0xffff;
-	info->imrb_value = 0xffff;
-	info->pim_value = 0xff;
-
-	info->p_dev = link;
-	link->priv = info;
-
-	/* Initialize the struct pcmcia_device structure */
-
-	ret = mgslpc_config(link);
-	if (ret != 0)
-		goto failed;
-
-	ret = mgslpc_add_device(info);
-	if (ret != 0)
-		goto failed_release;
-
-	return 0;
-
-failed_release:
-	mgslpc_release((u_long)link);
-failed:
-	tty_port_destroy(&info->port);
-	kfree(info);
-	return ret;
-}
-
-/* Card has been inserted.
- */
-
-static int mgslpc_ioprobe(struct pcmcia_device *p_dev, void *priv_data)
-{
-	return pcmcia_request_io(p_dev);
-}
-
-static int mgslpc_config(struct pcmcia_device *link)
-{
-	MGSLPC_INFO *info = link->priv;
-	int ret;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("mgslpc_config(0x%p)\n", link);
-
-	link->config_flags |= CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
-
-	ret = pcmcia_loop_config(link, mgslpc_ioprobe, NULL);
-	if (ret != 0)
-		goto failed;
-
-	link->config_index = 8;
-	link->config_regs = PRESENT_OPTION;
-
-	ret = pcmcia_request_irq(link, mgslpc_isr);
-	if (ret)
-		goto failed;
-	ret = pcmcia_enable_device(link);
-	if (ret)
-		goto failed;
-
-	info->io_base = link->resource[0]->start;
-	info->irq_level = link->irq;
-	return 0;
-
-failed:
-	mgslpc_release((u_long)link);
-	return -ENODEV;
-}
-
-/* Card has been removed.
- * Unregister device and release PCMCIA configuration.
- * If device is open, postpone until it is closed.
- */
-static void mgslpc_release(u_long arg)
-{
-	struct pcmcia_device *link = (struct pcmcia_device *)arg;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("mgslpc_release(0x%p)\n", link);
-
-	pcmcia_disable_device(link);
-}
-
-static void mgslpc_detach(struct pcmcia_device *link)
-{
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("mgslpc_detach(0x%p)\n", link);
-
-	((MGSLPC_INFO *)link->priv)->stop = 1;
-	mgslpc_release((u_long)link);
-
-	mgslpc_remove_device((MGSLPC_INFO *)link->priv);
-}
-
-static int mgslpc_suspend(struct pcmcia_device *link)
-{
-	MGSLPC_INFO *info = link->priv;
-
-	info->stop = 1;
-
-	return 0;
-}
-
-static int mgslpc_resume(struct pcmcia_device *link)
-{
-	MGSLPC_INFO *info = link->priv;
-
-	info->stop = 0;
-
-	return 0;
-}
-
-
-static inline bool mgslpc_paranoia_check(MGSLPC_INFO *info,
-					char *name, const char *routine)
-{
-#ifdef MGSLPC_PARANOIA_CHECK
-	static const char *badmagic =
-		"Warning: bad magic number for mgsl struct (%s) in %s\n";
-	static const char *badinfo =
-		"Warning: null mgslpc_info for (%s) in %s\n";
-
-	if (!info) {
-		printk(badinfo, name, routine);
-		return true;
-	}
-	if (info->magic != MGSLPC_MAGIC) {
-		printk(badmagic, name, routine);
-		return true;
-	}
-#else
-	if (!info)
-		return true;
-#endif
-	return false;
-}
-
-
-#define CMD_RXFIFO      BIT7	// release current rx FIFO
-#define CMD_RXRESET     BIT6	// receiver reset
-#define CMD_RXFIFO_READ BIT5
-#define CMD_START_TIMER BIT4
-#define CMD_TXFIFO      BIT3	// release current tx FIFO
-#define CMD_TXEOM       BIT1	// transmit end message
-#define CMD_TXRESET     BIT0	// transmit reset
-
-static bool wait_command_complete(MGSLPC_INFO *info, unsigned char channel)
-{
-	int i = 0;
-	/* wait for command completion */
-	while (read_reg(info, (unsigned char)(channel+STAR)) & BIT2) {
-		udelay(1);
-		if (i++ == 1000)
-			return false;
-	}
-	return true;
-}
-
-static void issue_command(MGSLPC_INFO *info, unsigned char channel, unsigned char cmd)
-{
-	wait_command_complete(info, channel);
-	write_reg(info, (unsigned char) (channel + CMDR), cmd);
-}
-
-static void tx_pause(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (mgslpc_paranoia_check(info, tty->name, "tx_pause"))
-		return;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("tx_pause(%s)\n", info->device_name);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (info->tx_enabled)
-		tx_stop(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-}
-
-static void tx_release(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (mgslpc_paranoia_check(info, tty->name, "tx_release"))
-		return;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("tx_release(%s)\n", info->device_name);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (!info->tx_enabled)
-		tx_start(info, tty);
-	spin_unlock_irqrestore(&info->lock, flags);
-}
-
-/* Return next bottom half action to perform.
- * or 0 if nothing to do.
- */
-static int bh_action(MGSLPC_INFO *info)
-{
-	unsigned long flags;
-	int rc = 0;
-
-	spin_lock_irqsave(&info->lock, flags);
-
-	if (info->pending_bh & BH_RECEIVE) {
-		info->pending_bh &= ~BH_RECEIVE;
-		rc = BH_RECEIVE;
-	} else if (info->pending_bh & BH_TRANSMIT) {
-		info->pending_bh &= ~BH_TRANSMIT;
-		rc = BH_TRANSMIT;
-	} else if (info->pending_bh & BH_STATUS) {
-		info->pending_bh &= ~BH_STATUS;
-		rc = BH_STATUS;
-	}
-
-	if (!rc) {
-		/* Mark BH routine as complete */
-		info->bh_running = false;
-		info->bh_requested = false;
-	}
-
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return rc;
-}
-
-static void bh_handler(struct work_struct *work)
-{
-	MGSLPC_INFO *info = container_of(work, MGSLPC_INFO, task);
-	struct tty_struct *tty;
-	int action;
-
-	if (debug_level >= DEBUG_LEVEL_BH)
-		printk("%s(%d):bh_handler(%s) entry\n",
-			__FILE__,__LINE__,info->device_name);
-
-	info->bh_running = true;
-	tty = tty_port_tty_get(&info->port);
-
-	while((action = bh_action(info)) != 0) {
-
-		/* Process work item */
-		if (debug_level >= DEBUG_LEVEL_BH)
-			printk("%s(%d):bh_handler() work item action=%d\n",
-				__FILE__,__LINE__,action);
-
-		switch (action) {
-
-		case BH_RECEIVE:
-			while(rx_get_frame(info, tty));
-			break;
-		case BH_TRANSMIT:
-			bh_transmit(info, tty);
-			break;
-		case BH_STATUS:
-			bh_status(info);
-			break;
-		default:
-			/* unknown work item ID */
-			printk("Unknown work item ID=%08X!\n", action);
-			break;
-		}
-	}
-
-	tty_kref_put(tty);
-	if (debug_level >= DEBUG_LEVEL_BH)
-		printk("%s(%d):bh_handler(%s) exit\n",
-			__FILE__,__LINE__,info->device_name);
-}
-
-static void bh_transmit(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	if (debug_level >= DEBUG_LEVEL_BH)
-		printk("bh_transmit() entry on %s\n", info->device_name);
-
-	if (tty)
-		tty_wakeup(tty);
-}
-
-static void bh_status(MGSLPC_INFO *info)
-{
-	info->ri_chkcount = 0;
-	info->dsr_chkcount = 0;
-	info->dcd_chkcount = 0;
-	info->cts_chkcount = 0;
-}
-
-/* eom: non-zero = end of frame */
-static void rx_ready_hdlc(MGSLPC_INFO *info, int eom)
-{
-	unsigned char data[2];
-	unsigned char fifo_count, read_count, i;
-	RXBUF *buf = (RXBUF*)(info->rx_buf + (info->rx_put * info->rx_buf_size));
-
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):rx_ready_hdlc(eom=%d)\n", __FILE__, __LINE__, eom);
-
-	if (!info->rx_enabled)
-		return;
-
-	if (info->rx_frame_count >= info->rx_buf_count) {
-		/* no more free buffers */
-		issue_command(info, CHA, CMD_RXRESET);
-		info->pending_bh |= BH_RECEIVE;
-		info->rx_overflow = true;
-		info->icount.buf_overrun++;
-		return;
-	}
-
-	if (eom) {
-		/* end of frame, get FIFO count from RBCL register */
-		fifo_count = (unsigned char)(read_reg(info, CHA+RBCL) & 0x1f);
-		if (fifo_count == 0)
-			fifo_count = 32;
-	} else
-		fifo_count = 32;
-
-	do {
-		if (fifo_count == 1) {
-			read_count = 1;
-			data[0] = read_reg(info, CHA + RXFIFO);
-		} else {
-			read_count = 2;
-			*((unsigned short *) data) = read_reg16(info, CHA + RXFIFO);
-		}
-		fifo_count -= read_count;
-		if (!fifo_count && eom)
-			buf->status = data[--read_count];
-
-		for (i = 0; i < read_count; i++) {
-			if (buf->count >= info->max_frame_size) {
-				/* frame too large, reset receiver and reset current buffer */
-				issue_command(info, CHA, CMD_RXRESET);
-				buf->count = 0;
-				return;
-			}
-			*(buf->data + buf->count) = data[i];
-			buf->count++;
-		}
-	} while (fifo_count);
-
-	if (eom) {
-		info->pending_bh |= BH_RECEIVE;
-		info->rx_frame_count++;
-		info->rx_put++;
-		if (info->rx_put >= info->rx_buf_count)
-			info->rx_put = 0;
-	}
-	issue_command(info, CHA, CMD_RXFIFO);
-}
-
-static void rx_ready_async(MGSLPC_INFO *info, int tcd)
-{
-	struct tty_port *port = &info->port;
-	unsigned char data, status, flag;
-	int fifo_count;
-	int work = 0;
-	struct mgsl_icount *icount = &info->icount;
-
-	if (tcd) {
-		/* early termination, get FIFO count from RBCL register */
-		fifo_count = (unsigned char)(read_reg(info, CHA+RBCL) & 0x1f);
-
-		/* Zero fifo count could mean 0 or 32 bytes available.
-		 * If BIT5 of STAR is set then at least 1 byte is available.
-		 */
-		if (!fifo_count && (read_reg(info,CHA+STAR) & BIT5))
-			fifo_count = 32;
-	} else
-		fifo_count = 32;
-
-	tty_buffer_request_room(port, fifo_count);
-	/* Flush received async data to receive data buffer. */
-	while (fifo_count) {
-		data   = read_reg(info, CHA + RXFIFO);
-		status = read_reg(info, CHA + RXFIFO);
-		fifo_count -= 2;
-
-		icount->rx++;
-		flag = TTY_NORMAL;
-
-		// if no frameing/crc error then save data
-		// BIT7:parity error
-		// BIT6:framing error
-
-		if (status & (BIT7 | BIT6)) {
-			if (status & BIT7)
-				icount->parity++;
-			else
-				icount->frame++;
-
-			/* discard char if tty control flags say so */
-			if (status & info->ignore_status_mask)
-				continue;
-
-			status &= info->read_status_mask;
-
-			if (status & BIT7)
-				flag = TTY_PARITY;
-			else if (status & BIT6)
-				flag = TTY_FRAME;
-		}
-		work += tty_insert_flip_char(port, data, flag);
-	}
-	issue_command(info, CHA, CMD_RXFIFO);
-
-	if (debug_level >= DEBUG_LEVEL_ISR) {
-		printk("%s(%d):rx_ready_async",
-			__FILE__,__LINE__);
-		printk("%s(%d):rx=%d brk=%d parity=%d frame=%d overrun=%d\n",
-			__FILE__,__LINE__,icount->rx,icount->brk,
-			icount->parity,icount->frame,icount->overrun);
-	}
-
-	if (work)
-		tty_flip_buffer_push(port);
-}
-
-
-static void tx_done(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	if (!info->tx_active)
-		return;
-
-	info->tx_active = false;
-	info->tx_aborting = false;
-
-	if (info->params.mode == MGSL_MODE_ASYNC)
-		return;
-
-	info->tx_count = info->tx_put = info->tx_get = 0;
-	del_timer(&info->tx_timer);
-
-	if (info->drop_rts_on_tx_done) {
-		get_signals(info);
-		if (info->serial_signals & SerialSignal_RTS) {
-			info->serial_signals &= ~SerialSignal_RTS;
-			set_signals(info);
-		}
-		info->drop_rts_on_tx_done = false;
-	}
-
-#if SYNCLINK_GENERIC_HDLC
-	if (info->netcount)
-		hdlcdev_tx_done(info);
-	else
-#endif
-	{
-		if (tty && (tty->flow.stopped || tty->hw_stopped)) {
-			tx_stop(info);
-			return;
-		}
-		info->pending_bh |= BH_TRANSMIT;
-	}
-}
-
-static void tx_ready(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	unsigned char fifo_count = 32;
-	int c;
-
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):tx_ready(%s)\n", __FILE__, __LINE__, info->device_name);
-
-	if (info->params.mode == MGSL_MODE_HDLC) {
-		if (!info->tx_active)
-			return;
-	} else {
-		if (tty && (tty->flow.stopped || tty->hw_stopped)) {
-			tx_stop(info);
-			return;
-		}
-		if (!info->tx_count)
-			info->tx_active = false;
-	}
-
-	if (!info->tx_count)
-		return;
-
-	while (info->tx_count && fifo_count) {
-		c = min(2, min_t(int, fifo_count, min(info->tx_count, TXBUFSIZE - info->tx_get)));
-
-		if (c == 1) {
-			write_reg(info, CHA + TXFIFO, *(info->tx_buf + info->tx_get));
-		} else {
-			write_reg16(info, CHA + TXFIFO,
-					  *((unsigned short*)(info->tx_buf + info->tx_get)));
-		}
-		info->tx_count -= c;
-		info->tx_get = (info->tx_get + c) & (TXBUFSIZE - 1);
-		fifo_count -= c;
-	}
-
-	if (info->params.mode == MGSL_MODE_ASYNC) {
-		if (info->tx_count < WAKEUP_CHARS)
-			info->pending_bh |= BH_TRANSMIT;
-		issue_command(info, CHA, CMD_TXFIFO);
-	} else {
-		if (info->tx_count)
-			issue_command(info, CHA, CMD_TXFIFO);
-		else
-			issue_command(info, CHA, CMD_TXFIFO + CMD_TXEOM);
-	}
-}
-
-static void cts_change(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	get_signals(info);
-	if ((info->cts_chkcount)++ >= IO_PIN_SHUTDOWN_LIMIT)
-		irq_disable(info, CHB, IRQ_CTS);
-	info->icount.cts++;
-	if (info->serial_signals & SerialSignal_CTS)
-		info->input_signal_events.cts_up++;
-	else
-		info->input_signal_events.cts_down++;
-	wake_up_interruptible(&info->status_event_wait_q);
-	wake_up_interruptible(&info->event_wait_q);
-
-	if (tty && tty_port_cts_enabled(&info->port)) {
-		if (tty->hw_stopped) {
-			if (info->serial_signals & SerialSignal_CTS) {
-				if (debug_level >= DEBUG_LEVEL_ISR)
-					printk("CTS tx start...");
-				tty->hw_stopped = 0;
-				tx_start(info, tty);
-				info->pending_bh |= BH_TRANSMIT;
-				return;
-			}
-		} else {
-			if (!(info->serial_signals & SerialSignal_CTS)) {
-				if (debug_level >= DEBUG_LEVEL_ISR)
-					printk("CTS tx stop...");
-				tty->hw_stopped = 1;
-				tx_stop(info);
-			}
-		}
-	}
-	info->pending_bh |= BH_STATUS;
-}
-
-static void dcd_change(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	get_signals(info);
-	if ((info->dcd_chkcount)++ >= IO_PIN_SHUTDOWN_LIMIT)
-		irq_disable(info, CHB, IRQ_DCD);
-	info->icount.dcd++;
-	if (info->serial_signals & SerialSignal_DCD) {
-		info->input_signal_events.dcd_up++;
-	}
-	else
-		info->input_signal_events.dcd_down++;
-#if SYNCLINK_GENERIC_HDLC
-	if (info->netcount) {
-		if (info->serial_signals & SerialSignal_DCD)
-			netif_carrier_on(info->netdev);
-		else
-			netif_carrier_off(info->netdev);
-	}
-#endif
-	wake_up_interruptible(&info->status_event_wait_q);
-	wake_up_interruptible(&info->event_wait_q);
-
-	if (tty_port_check_carrier(&info->port)) {
-		if (debug_level >= DEBUG_LEVEL_ISR)
-			printk("%s CD now %s...", info->device_name,
-			       (info->serial_signals & SerialSignal_DCD) ? "on" : "off");
-		if (info->serial_signals & SerialSignal_DCD)
-			wake_up_interruptible(&info->port.open_wait);
-		else {
-			if (debug_level >= DEBUG_LEVEL_ISR)
-				printk("doing serial hangup...");
-			if (tty)
-				tty_hangup(tty);
-		}
-	}
-	info->pending_bh |= BH_STATUS;
-}
-
-static void dsr_change(MGSLPC_INFO *info)
-{
-	get_signals(info);
-	if ((info->dsr_chkcount)++ >= IO_PIN_SHUTDOWN_LIMIT)
-		port_irq_disable(info, PVR_DSR);
-	info->icount.dsr++;
-	if (info->serial_signals & SerialSignal_DSR)
-		info->input_signal_events.dsr_up++;
-	else
-		info->input_signal_events.dsr_down++;
-	wake_up_interruptible(&info->status_event_wait_q);
-	wake_up_interruptible(&info->event_wait_q);
-	info->pending_bh |= BH_STATUS;
-}
-
-static void ri_change(MGSLPC_INFO *info)
-{
-	get_signals(info);
-	if ((info->ri_chkcount)++ >= IO_PIN_SHUTDOWN_LIMIT)
-		port_irq_disable(info, PVR_RI);
-	info->icount.rng++;
-	if (info->serial_signals & SerialSignal_RI)
-		info->input_signal_events.ri_up++;
-	else
-		info->input_signal_events.ri_down++;
-	wake_up_interruptible(&info->status_event_wait_q);
-	wake_up_interruptible(&info->event_wait_q);
-	info->pending_bh |= BH_STATUS;
-}
-
-/* Interrupt service routine entry point.
- *
- * Arguments:
- *
- * irq     interrupt number that caused interrupt
- * dev_id  device ID supplied during interrupt registration
- */
-static irqreturn_t mgslpc_isr(int dummy, void *dev_id)
-{
-	MGSLPC_INFO *info = dev_id;
-	struct tty_struct *tty;
-	unsigned short isr;
-	unsigned char gis, pis;
-	int count=0;
-
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("mgslpc_isr(%d) entry.\n", info->irq_level);
-
-	if (!(info->p_dev->_locked))
-		return IRQ_HANDLED;
-
-	tty = tty_port_tty_get(&info->port);
-
-	spin_lock(&info->lock);
-
-	while ((gis = read_reg(info, CHA + GIS))) {
-		if (debug_level >= DEBUG_LEVEL_ISR)
-			printk("mgslpc_isr %s gis=%04X\n", info->device_name,gis);
-
-		if ((gis & 0x70) || count > 1000) {
-			printk("synclink_cs:hardware failed or ejected\n");
-			break;
-		}
-		count++;
-
-		if (gis & (BIT1 | BIT0)) {
-			isr = read_reg16(info, CHB + ISR);
-			if (isr & IRQ_DCD)
-				dcd_change(info, tty);
-			if (isr & IRQ_CTS)
-				cts_change(info, tty);
-		}
-		if (gis & (BIT3 | BIT2))
-		{
-			isr = read_reg16(info, CHA + ISR);
-			if (isr & IRQ_TIMER) {
-				info->irq_occurred = true;
-				irq_disable(info, CHA, IRQ_TIMER);
-			}
-
-			/* receive IRQs */
-			if (isr & IRQ_EXITHUNT) {
-				info->icount.exithunt++;
-				wake_up_interruptible(&info->event_wait_q);
-			}
-			if (isr & IRQ_BREAK_ON) {
-				info->icount.brk++;
-				if (info->port.flags & ASYNC_SAK)
-					do_SAK(tty);
-			}
-			if (isr & IRQ_RXTIME) {
-				issue_command(info, CHA, CMD_RXFIFO_READ);
-			}
-			if (isr & (IRQ_RXEOM | IRQ_RXFIFO)) {
-				if (info->params.mode == MGSL_MODE_HDLC)
-					rx_ready_hdlc(info, isr & IRQ_RXEOM);
-				else
-					rx_ready_async(info, isr & IRQ_RXEOM);
-			}
-
-			/* transmit IRQs */
-			if (isr & IRQ_UNDERRUN) {
-				if (info->tx_aborting)
-					info->icount.txabort++;
-				else
-					info->icount.txunder++;
-				tx_done(info, tty);
-			}
-			else if (isr & IRQ_ALLSENT) {
-				info->icount.txok++;
-				tx_done(info, tty);
-			}
-			else if (isr & IRQ_TXFIFO)
-				tx_ready(info, tty);
-		}
-		if (gis & BIT7) {
-			pis = read_reg(info, CHA + PIS);
-			if (pis & BIT1)
-				dsr_change(info);
-			if (pis & BIT2)
-				ri_change(info);
-		}
-	}
-
-	/* Request bottom half processing if there's something
-	 * for it to do and the bh is not already running
-	 */
-
-	if (info->pending_bh && !info->bh_running && !info->bh_requested) {
-		if (debug_level >= DEBUG_LEVEL_ISR)
-			printk("%s(%d):%s queueing bh task.\n",
-				__FILE__,__LINE__,info->device_name);
-		schedule_work(&info->task);
-		info->bh_requested = true;
-	}
-
-	spin_unlock(&info->lock);
-	tty_kref_put(tty);
-
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):mgslpc_isr(%d)exit.\n",
-		       __FILE__, __LINE__, info->irq_level);
-
-	return IRQ_HANDLED;
-}
-
-/* Initialize and start device.
- */
-static int startup(MGSLPC_INFO * info, struct tty_struct *tty)
-{
-	int retval = 0;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):startup(%s)\n", __FILE__, __LINE__, info->device_name);
-
-	if (tty_port_initialized(&info->port))
-		return 0;
-
-	if (!info->tx_buf) {
-		/* allocate a page of memory for a transmit buffer */
-		info->tx_buf = (unsigned char *)get_zeroed_page(GFP_KERNEL);
-		if (!info->tx_buf) {
-			printk(KERN_ERR"%s(%d):%s can't allocate transmit buffer\n",
-				__FILE__, __LINE__, info->device_name);
-			return -ENOMEM;
-		}
-	}
-
-	info->pending_bh = 0;
-
-	memset(&info->icount, 0, sizeof(info->icount));
-
-	timer_setup(&info->tx_timer, tx_timeout, 0);
-
-	/* Allocate and claim adapter resources */
-	retval = claim_resources(info);
-
-	/* perform existence check and diagnostics */
-	if (!retval)
-		retval = adapter_test(info);
-
-	if (retval) {
-		if (capable(CAP_SYS_ADMIN) && tty)
-			set_bit(TTY_IO_ERROR, &tty->flags);
-		release_resources(info);
-		return retval;
-	}
-
-	/* program hardware for current parameters */
-	mgslpc_change_params(info, tty);
-
-	if (tty)
-		clear_bit(TTY_IO_ERROR, &tty->flags);
-
-	tty_port_set_initialized(&info->port, true);
-
-	return 0;
-}
-
-/* Called by mgslpc_close() and mgslpc_hangup() to shutdown hardware
- */
-static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty)
-{
-	unsigned long flags;
-
-	if (!tty_port_initialized(&info->port))
-		return;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_shutdown(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	/* clear status wait queue because status changes */
-	/* can't happen after shutting down the hardware */
-	wake_up_interruptible(&info->status_event_wait_q);
-	wake_up_interruptible(&info->event_wait_q);
-
-	del_timer_sync(&info->tx_timer);
-
-	if (info->tx_buf) {
-		free_page((unsigned long) info->tx_buf);
-		info->tx_buf = NULL;
-	}
-
-	spin_lock_irqsave(&info->lock, flags);
-
-	rx_stop(info);
-	tx_stop(info);
-
-	/* TODO:disable interrupts instead of reset to preserve signal states */
-	reset_device(info);
-
-	if (!tty || C_HUPCL(tty)) {
-		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
-		set_signals(info);
-	}
-
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	release_resources(info);
-
-	if (tty)
-		set_bit(TTY_IO_ERROR, &tty->flags);
-
-	tty_port_set_initialized(&info->port, false);
-}
-
-static void mgslpc_program_hw(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-
-	rx_stop(info);
-	tx_stop(info);
-	info->tx_count = info->tx_put = info->tx_get = 0;
-
-	if (info->params.mode == MGSL_MODE_HDLC || info->netcount)
-		hdlc_mode(info);
-	else
-		async_mode(info);
-
-	set_signals(info);
-
-	info->dcd_chkcount = 0;
-	info->cts_chkcount = 0;
-	info->ri_chkcount = 0;
-	info->dsr_chkcount = 0;
-
-	irq_enable(info, CHB, IRQ_DCD | IRQ_CTS);
-	port_irq_enable(info, (unsigned char) PVR_DSR | PVR_RI);
-	get_signals(info);
-
-	if (info->netcount || (tty && C_CREAD(tty)))
-		rx_start(info);
-
-	spin_unlock_irqrestore(&info->lock, flags);
-}
-
-/* Reconfigure adapter based on new parameters
- */
-static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	unsigned cflag;
-	int bits_per_char;
-
-	if (!tty)
-		return;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_change_params(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	cflag = tty->termios.c_cflag;
-
-	/* if B0 rate (hangup) specified then negate RTS and DTR */
-	/* otherwise assert RTS and DTR */
-	if (cflag & CBAUD)
-		info->serial_signals |= SerialSignal_RTS | SerialSignal_DTR;
-	else
-		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
-
-	/* byte size and parity */
-	if ((cflag & CSIZE) != CS8) {
-		cflag &= ~CSIZE;
-		cflag |= CS7;
-		tty->termios.c_cflag = cflag;
-	}
-	info->params.data_bits = tty_get_char_size(cflag);
-
-	if (cflag & CSTOPB)
-		info->params.stop_bits = 2;
-	else
-		info->params.stop_bits = 1;
-
-	info->params.parity = ASYNC_PARITY_NONE;
-	if (cflag & PARENB) {
-		if (cflag & PARODD)
-			info->params.parity = ASYNC_PARITY_ODD;
-		else
-			info->params.parity = ASYNC_PARITY_EVEN;
-		if (cflag & CMSPAR)
-			info->params.parity = ASYNC_PARITY_SPACE;
-	}
-
-	/* calculate number of jiffies to transmit a full
-	 * FIFO (32 bytes) at specified data rate
-	 */
-	bits_per_char = info->params.data_bits +
-			info->params.stop_bits + 1;
-
-	/* if port data rate is set to 460800 or less then
-	 * allow tty settings to override, otherwise keep the
-	 * current data rate.
-	 */
-	if (info->params.data_rate <= 460800) {
-		info->params.data_rate = tty_get_baud_rate(tty);
-	}
-
-	if (info->params.data_rate) {
-		info->timeout = (32*HZ*bits_per_char) /
-				info->params.data_rate;
-	}
-	info->timeout += HZ/50;		/* Add .02 seconds of slop */
-
-	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
-
-	/* process tty input control flags */
-
-	info->read_status_mask = 0;
-	if (I_INPCK(tty))
-		info->read_status_mask |= BIT7 | BIT6;
-	if (I_IGNPAR(tty))
-		info->ignore_status_mask |= BIT7 | BIT6;
-
-	mgslpc_program_hw(info, tty);
-}
-
-/* Add a character to the transmit buffer
- */
-static int mgslpc_put_char(struct tty_struct *tty, unsigned char ch)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO) {
-		printk("%s(%d):mgslpc_put_char(%d) on %s\n",
-			__FILE__, __LINE__, ch, info->device_name);
-	}
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_put_char"))
-		return 0;
-
-	if (!info->tx_buf)
-		return 0;
-
-	spin_lock_irqsave(&info->lock, flags);
-
-	if (info->params.mode == MGSL_MODE_ASYNC || !info->tx_active) {
-		if (info->tx_count < TXBUFSIZE - 1) {
-			info->tx_buf[info->tx_put++] = ch;
-			info->tx_put &= TXBUFSIZE-1;
-			info->tx_count++;
-		}
-	}
-
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 1;
-}
-
-/* Enable transmitter so remaining characters in the
- * transmit buffer are sent.
- */
-static void mgslpc_flush_chars(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_flush_chars() entry on %s tx_count=%d\n",
-			__FILE__, __LINE__, info->device_name, info->tx_count);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_flush_chars"))
-		return;
-
-	if (info->tx_count <= 0 || tty->flow.stopped ||
-	    tty->hw_stopped || !info->tx_buf)
-		return;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_flush_chars() entry on %s starting transmitter\n",
-			__FILE__, __LINE__, info->device_name);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (!info->tx_active)
-		tx_start(info, tty);
-	spin_unlock_irqrestore(&info->lock, flags);
-}
-
-/* Send a block of data
- *
- * Arguments:
- *
- * tty        pointer to tty information structure
- * buf	      pointer to buffer containing send data
- * count      size of send data in bytes
- *
- * Returns: number of characters written
- */
-static int mgslpc_write(struct tty_struct * tty,
-			const unsigned char *buf, int count)
-{
-	int c, ret = 0;
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_write(%s) count=%d\n",
-			__FILE__, __LINE__, info->device_name, count);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_write") ||
-		!info->tx_buf)
-		goto cleanup;
-
-	if (info->params.mode == MGSL_MODE_HDLC) {
-		if (count > TXBUFSIZE) {
-			ret = -EIO;
-			goto cleanup;
-		}
-		if (info->tx_active)
-			goto cleanup;
-		else if (info->tx_count)
-			goto start;
-	}
-
-	for (;;) {
-		c = min(count,
-			min(TXBUFSIZE - info->tx_count - 1,
-			    TXBUFSIZE - info->tx_put));
-		if (c <= 0)
-			break;
-
-		memcpy(info->tx_buf + info->tx_put, buf, c);
-
-		spin_lock_irqsave(&info->lock, flags);
-		info->tx_put = (info->tx_put + c) & (TXBUFSIZE-1);
-		info->tx_count += c;
-		spin_unlock_irqrestore(&info->lock, flags);
-
-		buf += c;
-		count -= c;
-		ret += c;
-	}
-start:
-	if (info->tx_count && !tty->flow.stopped && !tty->hw_stopped) {
-		spin_lock_irqsave(&info->lock, flags);
-		if (!info->tx_active)
-			tx_start(info, tty);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-cleanup:
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_write(%s) returning=%d\n",
-			__FILE__, __LINE__, info->device_name, ret);
-	return ret;
-}
-
-/* Return the count of free bytes in transmit buffer
- */
-static unsigned int mgslpc_write_room(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	int ret;
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_write_room"))
-		return 0;
-
-	if (info->params.mode == MGSL_MODE_HDLC) {
-		/* HDLC (frame oriented) mode */
-		if (info->tx_active)
-			return 0;
-		else
-			return HDLC_MAX_FRAME_SIZE;
-	} else {
-		ret = TXBUFSIZE - info->tx_count - 1;
-		if (ret < 0)
-			ret = 0;
-	}
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_write_room(%s)=%d\n",
-			 __FILE__, __LINE__, info->device_name, ret);
-	return ret;
-}
-
-/* Return the count of bytes in transmit buffer
- */
-static unsigned int mgslpc_chars_in_buffer(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned int rc;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_chars_in_buffer(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_chars_in_buffer"))
-		return 0;
-
-	if (info->params.mode == MGSL_MODE_HDLC)
-		rc = info->tx_active ? info->max_frame_size : 0;
-	else
-		rc = info->tx_count;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_chars_in_buffer(%s)=%u\n",
-			 __FILE__, __LINE__, info->device_name, rc);
-
-	return rc;
-}
-
-/* Discard all data in the send buffer
- */
-static void mgslpc_flush_buffer(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_flush_buffer(%s) entry\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_flush_buffer"))
-		return;
-
-	spin_lock_irqsave(&info->lock, flags);
-	info->tx_count = info->tx_put = info->tx_get = 0;
-	del_timer(&info->tx_timer);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	wake_up_interruptible(&tty->write_wait);
-	tty_wakeup(tty);
-}
-
-/* Send a high-priority XON/XOFF character
- */
-static void mgslpc_send_xchar(struct tty_struct *tty, char ch)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_send_xchar(%s,%d)\n",
-			 __FILE__, __LINE__, info->device_name, ch);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_send_xchar"))
-		return;
-
-	info->x_char = ch;
-	if (ch) {
-		spin_lock_irqsave(&info->lock, flags);
-		if (!info->tx_enabled)
-			tx_start(info, tty);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-}
-
-/* Signal remote device to throttle send data (our receive data)
- */
-static void mgslpc_throttle(struct tty_struct * tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_throttle(%s) entry\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_throttle"))
-		return;
-
-	if (I_IXOFF(tty))
-		mgslpc_send_xchar(tty, STOP_CHAR(tty));
-
-	if (C_CRTSCTS(tty)) {
-		spin_lock_irqsave(&info->lock, flags);
-		info->serial_signals &= ~SerialSignal_RTS;
-		set_signals(info);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-}
-
-/* Signal remote device to stop throttling send data (our receive data)
- */
-static void mgslpc_unthrottle(struct tty_struct * tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_unthrottle(%s) entry\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_unthrottle"))
-		return;
-
-	if (I_IXOFF(tty)) {
-		if (info->x_char)
-			info->x_char = 0;
-		else
-			mgslpc_send_xchar(tty, START_CHAR(tty));
-	}
-
-	if (C_CRTSCTS(tty)) {
-		spin_lock_irqsave(&info->lock, flags);
-		info->serial_signals |= SerialSignal_RTS;
-		set_signals(info);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-}
-
-/* get the current serial statistics
- */
-static int get_stats(MGSLPC_INFO * info, struct mgsl_icount __user *user_icount)
-{
-	int err;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("get_params(%s)\n", info->device_name);
-	if (!user_icount) {
-		memset(&info->icount, 0, sizeof(info->icount));
-	} else {
-		COPY_TO_USER(err, user_icount, &info->icount, sizeof(struct mgsl_icount));
-		if (err)
-			return -EFAULT;
-	}
-	return 0;
-}
-
-/* get the current serial parameters
- */
-static int get_params(MGSLPC_INFO * info, MGSL_PARAMS __user *user_params)
-{
-	int err;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("get_params(%s)\n", info->device_name);
-	COPY_TO_USER(err,user_params, &info->params, sizeof(MGSL_PARAMS));
-	if (err)
-		return -EFAULT;
-	return 0;
-}
-
-/* set the serial parameters
- *
- * Arguments:
- *
- *	info		pointer to device instance data
- *	new_params	user buffer containing new serial params
- *
- * Returns:	0 if success, otherwise error code
- */
-static int set_params(MGSLPC_INFO * info, MGSL_PARAMS __user *new_params, struct tty_struct *tty)
-{
-	unsigned long flags;
-	MGSL_PARAMS tmp_params;
-	int err;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):set_params %s\n", __FILE__,__LINE__,
-			info->device_name);
-	COPY_FROM_USER(err,&tmp_params, new_params, sizeof(MGSL_PARAMS));
-	if (err) {
-		if (debug_level >= DEBUG_LEVEL_INFO)
-			printk("%s(%d):set_params(%s) user buffer copy failed\n",
-				__FILE__, __LINE__, info->device_name);
-		return -EFAULT;
-	}
-
-	spin_lock_irqsave(&info->lock, flags);
-	memcpy(&info->params,&tmp_params,sizeof(MGSL_PARAMS));
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	mgslpc_change_params(info, tty);
-
-	return 0;
-}
-
-static int get_txidle(MGSLPC_INFO * info, int __user *idle_mode)
-{
-	int err;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("get_txidle(%s)=%d\n", info->device_name, info->idle_mode);
-	COPY_TO_USER(err,idle_mode, &info->idle_mode, sizeof(int));
-	if (err)
-		return -EFAULT;
-	return 0;
-}
-
-static int set_txidle(MGSLPC_INFO * info, int idle_mode)
-{
-	unsigned long flags;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("set_txidle(%s,%d)\n", info->device_name, idle_mode);
-	spin_lock_irqsave(&info->lock, flags);
-	info->idle_mode = idle_mode;
-	tx_set_idle(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-static int get_interface(MGSLPC_INFO * info, int __user *if_mode)
-{
-	int err;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("get_interface(%s)=%d\n", info->device_name, info->if_mode);
-	COPY_TO_USER(err,if_mode, &info->if_mode, sizeof(int));
-	if (err)
-		return -EFAULT;
-	return 0;
-}
-
-static int set_interface(MGSLPC_INFO * info, int if_mode)
-{
-	unsigned long flags;
-	unsigned char val;
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("set_interface(%s,%d)\n", info->device_name, if_mode);
-	spin_lock_irqsave(&info->lock, flags);
-	info->if_mode = if_mode;
-
-	val = read_reg(info, PVR) & 0x0f;
-	switch (info->if_mode)
-	{
-	case MGSL_INTERFACE_RS232: val |= PVR_RS232; break;
-	case MGSL_INTERFACE_V35:   val |= PVR_V35;   break;
-	case MGSL_INTERFACE_RS422: val |= PVR_RS422; break;
-	}
-	write_reg(info, PVR, val);
-
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-static int set_txenable(MGSLPC_INFO * info, int enable, struct tty_struct *tty)
-{
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("set_txenable(%s,%d)\n", info->device_name, enable);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (enable) {
-		if (!info->tx_enabled)
-			tx_start(info, tty);
-	} else {
-		if (info->tx_enabled)
-			tx_stop(info);
-	}
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-static int tx_abort(MGSLPC_INFO * info)
-{
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("tx_abort(%s)\n", info->device_name);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (info->tx_active && info->tx_count &&
-	    info->params.mode == MGSL_MODE_HDLC) {
-		/* clear data count so FIFO is not filled on next IRQ.
-		 * This results in underrun and abort transmission.
-		 */
-		info->tx_count = info->tx_put = info->tx_get = 0;
-		info->tx_aborting = true;
-	}
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-static int set_rxenable(MGSLPC_INFO * info, int enable)
-{
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("set_rxenable(%s,%d)\n", info->device_name, enable);
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (enable) {
-		if (!info->rx_enabled)
-			rx_start(info);
-	} else {
-		if (info->rx_enabled)
-			rx_stop(info);
-	}
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-/* wait for specified event to occur
- *
- * Arguments:		info	pointer to device instance data
- *			mask	pointer to bitmask of events to wait for
- * Return Value:	0	if successful and bit mask updated with
- *				of events triggerred,
- *			otherwise error code
- */
-static int wait_events(MGSLPC_INFO * info, int __user *mask_ptr)
-{
-	unsigned long flags;
-	int s;
-	int rc=0;
-	struct mgsl_icount cprev, cnow;
-	int events;
-	int mask;
-	struct	_input_signal_events oldsigs, newsigs;
-	DECLARE_WAITQUEUE(wait, current);
-
-	COPY_FROM_USER(rc,&mask, mask_ptr, sizeof(int));
-	if (rc)
-		return  -EFAULT;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("wait_events(%s,%d)\n", info->device_name, mask);
-
-	spin_lock_irqsave(&info->lock, flags);
-
-	/* return immediately if state matches requested events */
-	get_signals(info);
-	s = info->serial_signals;
-	events = mask &
-		( ((s & SerialSignal_DSR) ? MgslEvent_DsrActive:MgslEvent_DsrInactive) +
-		  ((s & SerialSignal_DCD) ? MgslEvent_DcdActive:MgslEvent_DcdInactive) +
-		  ((s & SerialSignal_CTS) ? MgslEvent_CtsActive:MgslEvent_CtsInactive) +
-		  ((s & SerialSignal_RI)  ? MgslEvent_RiActive :MgslEvent_RiInactive) );
-	if (events) {
-		spin_unlock_irqrestore(&info->lock, flags);
-		goto exit;
-	}
-
-	/* save current irq counts */
-	cprev = info->icount;
-	oldsigs = info->input_signal_events;
-
-	if ((info->params.mode == MGSL_MODE_HDLC) &&
-	    (mask & MgslEvent_ExitHuntMode))
-		irq_enable(info, CHA, IRQ_EXITHUNT);
-
-	set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&info->event_wait_q, &wait);
-
-	spin_unlock_irqrestore(&info->lock, flags);
-
-
-	for(;;) {
-		schedule();
-		if (signal_pending(current)) {
-			rc = -ERESTARTSYS;
-			break;
-		}
-
-		/* get current irq counts */
-		spin_lock_irqsave(&info->lock, flags);
-		cnow = info->icount;
-		newsigs = info->input_signal_events;
-		set_current_state(TASK_INTERRUPTIBLE);
-		spin_unlock_irqrestore(&info->lock, flags);
-
-		/* if no change, wait aborted for some reason */
-		if (newsigs.dsr_up   == oldsigs.dsr_up   &&
-		    newsigs.dsr_down == oldsigs.dsr_down &&
-		    newsigs.dcd_up   == oldsigs.dcd_up   &&
-		    newsigs.dcd_down == oldsigs.dcd_down &&
-		    newsigs.cts_up   == oldsigs.cts_up   &&
-		    newsigs.cts_down == oldsigs.cts_down &&
-		    newsigs.ri_up    == oldsigs.ri_up    &&
-		    newsigs.ri_down  == oldsigs.ri_down  &&
-		    cnow.exithunt    == cprev.exithunt   &&
-		    cnow.rxidle      == cprev.rxidle) {
-			rc = -EIO;
-			break;
-		}
-
-		events = mask &
-			( (newsigs.dsr_up   != oldsigs.dsr_up   ? MgslEvent_DsrActive:0)   +
-			  (newsigs.dsr_down != oldsigs.dsr_down ? MgslEvent_DsrInactive:0) +
-			  (newsigs.dcd_up   != oldsigs.dcd_up   ? MgslEvent_DcdActive:0)   +
-			  (newsigs.dcd_down != oldsigs.dcd_down ? MgslEvent_DcdInactive:0) +
-			  (newsigs.cts_up   != oldsigs.cts_up   ? MgslEvent_CtsActive:0)   +
-			  (newsigs.cts_down != oldsigs.cts_down ? MgslEvent_CtsInactive:0) +
-			  (newsigs.ri_up    != oldsigs.ri_up    ? MgslEvent_RiActive:0)    +
-			  (newsigs.ri_down  != oldsigs.ri_down  ? MgslEvent_RiInactive:0)  +
-			  (cnow.exithunt    != cprev.exithunt   ? MgslEvent_ExitHuntMode:0) +
-			  (cnow.rxidle      != cprev.rxidle     ? MgslEvent_IdleReceived:0) );
-		if (events)
-			break;
-
-		cprev = cnow;
-		oldsigs = newsigs;
-	}
-
-	remove_wait_queue(&info->event_wait_q, &wait);
-	set_current_state(TASK_RUNNING);
-
-	if (mask & MgslEvent_ExitHuntMode) {
-		spin_lock_irqsave(&info->lock, flags);
-		if (!waitqueue_active(&info->event_wait_q))
-			irq_disable(info, CHA, IRQ_EXITHUNT);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-exit:
-	if (rc == 0)
-		PUT_USER(rc, events, mask_ptr);
-	return rc;
-}
-
-static int modem_input_wait(MGSLPC_INFO *info,int arg)
-{
-	unsigned long flags;
-	int rc;
-	struct mgsl_icount cprev, cnow;
-	DECLARE_WAITQUEUE(wait, current);
-
-	/* save current irq counts */
-	spin_lock_irqsave(&info->lock, flags);
-	cprev = info->icount;
-	add_wait_queue(&info->status_event_wait_q, &wait);
-	set_current_state(TASK_INTERRUPTIBLE);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	for(;;) {
-		schedule();
-		if (signal_pending(current)) {
-			rc = -ERESTARTSYS;
-			break;
-		}
-
-		/* get new irq counts */
-		spin_lock_irqsave(&info->lock, flags);
-		cnow = info->icount;
-		set_current_state(TASK_INTERRUPTIBLE);
-		spin_unlock_irqrestore(&info->lock, flags);
-
-		/* if no change, wait aborted for some reason */
-		if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr &&
-		    cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) {
-			rc = -EIO;
-			break;
-		}
-
-		/* check for change in caller specified modem input */
-		if ((arg & TIOCM_RNG && cnow.rng != cprev.rng) ||
-		    (arg & TIOCM_DSR && cnow.dsr != cprev.dsr) ||
-		    (arg & TIOCM_CD  && cnow.dcd != cprev.dcd) ||
-		    (arg & TIOCM_CTS && cnow.cts != cprev.cts)) {
-			rc = 0;
-			break;
-		}
-
-		cprev = cnow;
-	}
-	remove_wait_queue(&info->status_event_wait_q, &wait);
-	set_current_state(TASK_RUNNING);
-	return rc;
-}
-
-/* return the state of the serial control and status signals
- */
-static int tiocmget(struct tty_struct *tty)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned int result;
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	get_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	result = ((info->serial_signals & SerialSignal_RTS) ? TIOCM_RTS:0) +
-		((info->serial_signals & SerialSignal_DTR) ? TIOCM_DTR:0) +
-		((info->serial_signals & SerialSignal_DCD) ? TIOCM_CAR:0) +
-		((info->serial_signals & SerialSignal_RI)  ? TIOCM_RNG:0) +
-		((info->serial_signals & SerialSignal_DSR) ? TIOCM_DSR:0) +
-		((info->serial_signals & SerialSignal_CTS) ? TIOCM_CTS:0);
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):%s tiocmget() value=%08X\n",
-			 __FILE__, __LINE__, info->device_name, result);
-	return result;
-}
-
-/* set modem control signals (DTR/RTS)
- */
-static int tiocmset(struct tty_struct *tty,
-		    unsigned int set, unsigned int clear)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):%s tiocmset(%x,%x)\n",
-			__FILE__, __LINE__, info->device_name, set, clear);
-
-	if (set & TIOCM_RTS)
-		info->serial_signals |= SerialSignal_RTS;
-	if (set & TIOCM_DTR)
-		info->serial_signals |= SerialSignal_DTR;
-	if (clear & TIOCM_RTS)
-		info->serial_signals &= ~SerialSignal_RTS;
-	if (clear & TIOCM_DTR)
-		info->serial_signals &= ~SerialSignal_DTR;
-
-	spin_lock_irqsave(&info->lock, flags);
-	set_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return 0;
-}
-
-/* Set or clear transmit break condition
- *
- * Arguments:		tty		pointer to tty instance data
- *			break_state	-1=set break condition, 0=clear
- */
-static int mgslpc_break(struct tty_struct *tty, int break_state)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_break(%s,%d)\n",
-			 __FILE__, __LINE__, info->device_name, break_state);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_break"))
-		return -EINVAL;
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (break_state == -1)
-		set_reg_bits(info, CHA+DAFO, BIT6);
-	else
-		clear_reg_bits(info, CHA+DAFO, BIT6);
-	spin_unlock_irqrestore(&info->lock, flags);
-	return 0;
-}
-
-static int mgslpc_get_icount(struct tty_struct *tty,
-				struct serial_icounter_struct *icount)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-	struct mgsl_icount cnow;	/* kernel counter temps */
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	cnow = info->icount;
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	icount->cts = cnow.cts;
-	icount->dsr = cnow.dsr;
-	icount->rng = cnow.rng;
-	icount->dcd = cnow.dcd;
-	icount->rx = cnow.rx;
-	icount->tx = cnow.tx;
-	icount->frame = cnow.frame;
-	icount->overrun = cnow.overrun;
-	icount->parity = cnow.parity;
-	icount->brk = cnow.brk;
-	icount->buf_overrun = cnow.buf_overrun;
-
-	return 0;
-}
-
-/* Service an IOCTL request
- *
- * Arguments:
- *
- *	tty	pointer to tty instance data
- *	cmd	IOCTL command code
- *	arg	command argument/context
- *
- * Return Value:	0 if success, otherwise error code
- */
-static int mgslpc_ioctl(struct tty_struct *tty,
-			unsigned int cmd, unsigned long arg)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-	void __user *argp = (void __user *)arg;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_ioctl %s cmd=%08X\n", __FILE__, __LINE__,
-			info->device_name, cmd);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_ioctl"))
-		return -ENODEV;
-
-	if (cmd != TIOCMIWAIT) {
-		if (tty_io_error(tty))
-		    return -EIO;
-	}
-
-	switch (cmd) {
-	case MGSL_IOCGPARAMS:
-		return get_params(info, argp);
-	case MGSL_IOCSPARAMS:
-		return set_params(info, argp, tty);
-	case MGSL_IOCGTXIDLE:
-		return get_txidle(info, argp);
-	case MGSL_IOCSTXIDLE:
-		return set_txidle(info, (int)arg);
-	case MGSL_IOCGIF:
-		return get_interface(info, argp);
-	case MGSL_IOCSIF:
-		return set_interface(info,(int)arg);
-	case MGSL_IOCTXENABLE:
-		return set_txenable(info,(int)arg, tty);
-	case MGSL_IOCRXENABLE:
-		return set_rxenable(info,(int)arg);
-	case MGSL_IOCTXABORT:
-		return tx_abort(info);
-	case MGSL_IOCGSTATS:
-		return get_stats(info, argp);
-	case MGSL_IOCWAITEVENT:
-		return wait_events(info, argp);
-	case TIOCMIWAIT:
-		return modem_input_wait(info,(int)arg);
-	default:
-		return -ENOIOCTLCMD;
-	}
-	return 0;
-}
-
-/* Set new termios settings
- *
- * Arguments:
- *
- *	tty		pointer to tty structure
- *	termios		pointer to buffer to hold returned old termios
- */
-static void mgslpc_set_termios(struct tty_struct *tty,
-			       const struct ktermios *old_termios)
-{
-	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_set_termios %s\n", __FILE__, __LINE__,
-			tty->driver->name);
-
-	/* just return if nothing has changed */
-	if ((tty->termios.c_cflag == old_termios->c_cflag)
-	    && (RELEVANT_IFLAG(tty->termios.c_iflag)
-		== RELEVANT_IFLAG(old_termios->c_iflag)))
-	  return;
-
-	mgslpc_change_params(info, tty);
-
-	/* Handle transition to B0 status */
-	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
-		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
-		spin_lock_irqsave(&info->lock, flags);
-		set_signals(info);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-
-	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
-		info->serial_signals |= SerialSignal_DTR;
-		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
-			info->serial_signals |= SerialSignal_RTS;
-		spin_lock_irqsave(&info->lock, flags);
-		set_signals(info);
-		spin_unlock_irqrestore(&info->lock, flags);
-	}
-
-	/* Handle turning off CRTSCTS */
-	if (old_termios->c_cflag & CRTSCTS && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
-		tx_release(tty);
-	}
-}
-
-static void mgslpc_close(struct tty_struct *tty, struct file * filp)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-	struct tty_port *port = &info->port;
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_close"))
-		return;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_close(%s) entry, count=%d\n",
-			 __FILE__, __LINE__, info->device_name, port->count);
-
-	if (tty_port_close_start(port, tty, filp) == 0)
-		goto cleanup;
-
-	if (tty_port_initialized(port))
-		mgslpc_wait_until_sent(tty, info->timeout);
-
-	mgslpc_flush_buffer(tty);
-
-	tty_ldisc_flush(tty);
-	shutdown(info, tty);
-	
-	tty_port_close_end(port, tty);
-	tty_port_tty_set(port, NULL);
-cleanup:
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_close(%s) exit, count=%d\n", __FILE__, __LINE__,
-			tty->driver->name, port->count);
-}
-
-/* Wait until the transmitter is empty.
- */
-static void mgslpc_wait_until_sent(struct tty_struct *tty, int timeout)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-	unsigned long orig_jiffies, char_time;
-
-	if (!info)
-		return;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_wait_until_sent(%s) entry\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_wait_until_sent"))
-		return;
-
-	if (!tty_port_initialized(&info->port))
-		goto exit;
-
-	orig_jiffies = jiffies;
-
-	/* Set check interval to 1/5 of estimated time to
-	 * send a character, and make it at least 1. The check
-	 * interval should also be less than the timeout.
-	 * Note: use tight timings here to satisfy the NIST-PCTS.
-	 */
-
-	if (info->params.data_rate) {
-	     	char_time = info->timeout/(32 * 5);
-		if (!char_time)
-			char_time++;
-	} else
-		char_time = 1;
-
-	if (timeout)
-		char_time = min_t(unsigned long, char_time, timeout);
-
-	if (info->params.mode == MGSL_MODE_HDLC) {
-		while (info->tx_active) {
-			msleep_interruptible(jiffies_to_msecs(char_time));
-			if (signal_pending(current))
-				break;
-			if (timeout && time_after(jiffies, orig_jiffies + timeout))
-				break;
-		}
-	} else {
-		while ((info->tx_count || info->tx_active) &&
-			info->tx_enabled) {
-			msleep_interruptible(jiffies_to_msecs(char_time));
-			if (signal_pending(current))
-				break;
-			if (timeout && time_after(jiffies, orig_jiffies + timeout))
-				break;
-		}
-	}
-
-exit:
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_wait_until_sent(%s) exit\n",
-			 __FILE__, __LINE__, info->device_name);
-}
-
-/* Called by tty_hangup() when a hangup is signaled.
- * This is the same as closing all open files for the port.
- */
-static void mgslpc_hangup(struct tty_struct *tty)
-{
-	MGSLPC_INFO * info = (MGSLPC_INFO *)tty->driver_data;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_hangup(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_hangup"))
-		return;
-
-	mgslpc_flush_buffer(tty);
-	shutdown(info, tty);
-	tty_port_hangup(&info->port);
-}
-
-static bool carrier_raised(struct tty_port *port)
-{
-	MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port);
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	get_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return info->serial_signals & SerialSignal_DCD;
-}
-
-static void dtr_rts(struct tty_port *port, bool active)
-{
-	MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port);
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	if (active)
-		info->serial_signals |= SerialSignal_RTS | SerialSignal_DTR;
-	else
-		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
-	set_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-}
-
-
-static int mgslpc_open(struct tty_struct *tty, struct file * filp)
-{
-	MGSLPC_INFO	*info;
-	struct tty_port *port;
-	int		retval, line;
-	unsigned long	flags;
-
-	/* verify range of specified line number */
-	line = tty->index;
-	if (line >= mgslpc_device_count) {
-		printk("%s(%d):mgslpc_open with invalid line #%d.\n",
-			__FILE__, __LINE__, line);
-		return -ENODEV;
-	}
-
-	/* find the info structure for the specified line */
-	info = mgslpc_device_list;
-	while(info && info->line != line)
-		info = info->next_device;
-	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_open"))
-		return -ENODEV;
-
-	port = &info->port;
-	tty->driver_data = info;
-	tty_port_tty_set(port, tty);
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_open(%s), old ref count = %d\n",
-			 __FILE__, __LINE__, tty->driver->name, port->count);
-
-	spin_lock_irqsave(&info->netlock, flags);
-	if (info->netcount) {
-		retval = -EBUSY;
-		spin_unlock_irqrestore(&info->netlock, flags);
-		goto cleanup;
-	}
-	spin_lock(&port->lock);
-	port->count++;
-	spin_unlock(&port->lock);
-	spin_unlock_irqrestore(&info->netlock, flags);
-
-	if (port->count == 1) {
-		/* 1st open on this device, init hardware */
-		retval = startup(info, tty);
-		if (retval < 0)
-			goto cleanup;
-	}
-
-	retval = tty_port_block_til_ready(&info->port, tty, filp);
-	if (retval) {
-		if (debug_level >= DEBUG_LEVEL_INFO)
-			printk("%s(%d):block_til_ready(%s) returned %d\n",
-				 __FILE__, __LINE__, info->device_name, retval);
-		goto cleanup;
-	}
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_open(%s) success\n",
-			 __FILE__, __LINE__, info->device_name);
-	retval = 0;
-
-cleanup:
-	return retval;
-}
-
-/*
- * /proc fs routines....
- */
-
-static inline void line_info(struct seq_file *m, MGSLPC_INFO *info)
-{
-	char	stat_buf[30];
-	unsigned long flags;
-
-	seq_printf(m, "%s:io:%04X irq:%d",
-		      info->device_name, info->io_base, info->irq_level);
-
-	/* output current serial signal states */
-	spin_lock_irqsave(&info->lock, flags);
-	get_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	stat_buf[0] = 0;
-	stat_buf[1] = 0;
-	if (info->serial_signals & SerialSignal_RTS)
-		strcat(stat_buf, "|RTS");
-	if (info->serial_signals & SerialSignal_CTS)
-		strcat(stat_buf, "|CTS");
-	if (info->serial_signals & SerialSignal_DTR)
-		strcat(stat_buf, "|DTR");
-	if (info->serial_signals & SerialSignal_DSR)
-		strcat(stat_buf, "|DSR");
-	if (info->serial_signals & SerialSignal_DCD)
-		strcat(stat_buf, "|CD");
-	if (info->serial_signals & SerialSignal_RI)
-		strcat(stat_buf, "|RI");
-
-	if (info->params.mode == MGSL_MODE_HDLC) {
-		seq_printf(m, " HDLC txok:%d rxok:%d",
-			      info->icount.txok, info->icount.rxok);
-		if (info->icount.txunder)
-			seq_printf(m, " txunder:%d", info->icount.txunder);
-		if (info->icount.txabort)
-			seq_printf(m, " txabort:%d", info->icount.txabort);
-		if (info->icount.rxshort)
-			seq_printf(m, " rxshort:%d", info->icount.rxshort);
-		if (info->icount.rxlong)
-			seq_printf(m, " rxlong:%d", info->icount.rxlong);
-		if (info->icount.rxover)
-			seq_printf(m, " rxover:%d", info->icount.rxover);
-		if (info->icount.rxcrc)
-			seq_printf(m, " rxcrc:%d", info->icount.rxcrc);
-	} else {
-		seq_printf(m, " ASYNC tx:%d rx:%d",
-			      info->icount.tx, info->icount.rx);
-		if (info->icount.frame)
-			seq_printf(m, " fe:%d", info->icount.frame);
-		if (info->icount.parity)
-			seq_printf(m, " pe:%d", info->icount.parity);
-		if (info->icount.brk)
-			seq_printf(m, " brk:%d", info->icount.brk);
-		if (info->icount.overrun)
-			seq_printf(m, " oe:%d", info->icount.overrun);
-	}
-
-	/* Append serial signal status to end */
-	seq_printf(m, " %s\n", stat_buf+1);
-
-	seq_printf(m, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
-		       info->tx_active,info->bh_requested,info->bh_running,
-		       info->pending_bh);
-}
-
-/* Called to print information about devices
- */
-static int mgslpc_proc_show(struct seq_file *m, void *v)
-{
-	MGSLPC_INFO *info;
-
-	seq_printf(m, "synclink driver:%s\n", driver_version);
-
-	info = mgslpc_device_list;
-	while (info) {
-		line_info(m, info);
-		info = info->next_device;
-	}
-	return 0;
-}
-
-static int rx_alloc_buffers(MGSLPC_INFO *info)
-{
-	/* each buffer has header and data */
-	info->rx_buf_size = sizeof(RXBUF) + info->max_frame_size;
-
-	/* calculate total allocation size for 8 buffers */
-	info->rx_buf_total_size = info->rx_buf_size * 8;
-
-	/* limit total allocated memory */
-	if (info->rx_buf_total_size > 0x10000)
-		info->rx_buf_total_size = 0x10000;
-
-	/* calculate number of buffers */
-	info->rx_buf_count = info->rx_buf_total_size / info->rx_buf_size;
-
-	info->rx_buf = kmalloc(info->rx_buf_total_size, GFP_KERNEL);
-	if (info->rx_buf == NULL)
-		return -ENOMEM;
-
-	/* unused flag buffer to satisfy receive_buf calling interface */
-	info->flag_buf = kzalloc(info->max_frame_size, GFP_KERNEL);
-	if (!info->flag_buf) {
-		kfree(info->rx_buf);
-		info->rx_buf = NULL;
-		return -ENOMEM;
-	}
-	
-	rx_reset_buffers(info);
-	return 0;
-}
-
-static void rx_free_buffers(MGSLPC_INFO *info)
-{
-	kfree(info->rx_buf);
-	info->rx_buf = NULL;
-	kfree(info->flag_buf);
-	info->flag_buf = NULL;
-}
-
-static int claim_resources(MGSLPC_INFO *info)
-{
-	if (rx_alloc_buffers(info) < 0) {
-		printk("Can't allocate rx buffer %s\n", info->device_name);
-		release_resources(info);
-		return -ENODEV;
-	}
-	return 0;
-}
-
-static void release_resources(MGSLPC_INFO *info)
-{
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("release_resources(%s)\n", info->device_name);
-	rx_free_buffers(info);
-}
-
-/* Add the specified device instance data structure to the
- * global linked list of devices and increment the device count.
- *
- * Arguments:		info	pointer to device instance data
- */
-static int mgslpc_add_device(MGSLPC_INFO *info)
-{
-	MGSLPC_INFO *current_dev = NULL;
-	struct device *tty_dev;
-	int ret;
-
-	info->next_device = NULL;
-	info->line = mgslpc_device_count;
-	sprintf(info->device_name,"ttySLP%d",info->line);
-
-	if (info->line < MAX_DEVICE_COUNT) {
-		if (maxframe[info->line])
-			info->max_frame_size = maxframe[info->line];
-	}
-
-	mgslpc_device_count++;
-
-	if (!mgslpc_device_list)
-		mgslpc_device_list = info;
-	else {
-		current_dev = mgslpc_device_list;
-		while (current_dev->next_device)
-			current_dev = current_dev->next_device;
-		current_dev->next_device = info;
-	}
-
-	if (info->max_frame_size < 4096)
-		info->max_frame_size = 4096;
-	else if (info->max_frame_size > 65535)
-		info->max_frame_size = 65535;
-
-	printk("SyncLink PC Card %s:IO=%04X IRQ=%d\n",
-		info->device_name, info->io_base, info->irq_level);
-
-#if SYNCLINK_GENERIC_HDLC
-	ret = hdlcdev_init(info);
-	if (ret != 0)
-		goto failed;
-#endif
-
-	tty_dev = tty_port_register_device(&info->port, serial_driver, info->line,
-			&info->p_dev->dev);
-	if (IS_ERR(tty_dev)) {
-		ret = PTR_ERR(tty_dev);
-#if SYNCLINK_GENERIC_HDLC
-		hdlcdev_exit(info);
-#endif
-		goto failed;
-	}
-
-	return 0;
-
-failed:
-	if (current_dev)
-		current_dev->next_device = NULL;
-	else
-		mgslpc_device_list = NULL;
-	mgslpc_device_count--;
-	return ret;
-}
-
-static void mgslpc_remove_device(MGSLPC_INFO *remove_info)
-{
-	MGSLPC_INFO *info = mgslpc_device_list;
-	MGSLPC_INFO *last = NULL;
-
-	while(info) {
-		if (info == remove_info) {
-			if (last)
-				last->next_device = info->next_device;
-			else
-				mgslpc_device_list = info->next_device;
-			tty_unregister_device(serial_driver, info->line);
-#if SYNCLINK_GENERIC_HDLC
-			hdlcdev_exit(info);
-#endif
-			release_resources(info);
-			tty_port_destroy(&info->port);
-			kfree(info);
-			mgslpc_device_count--;
-			return;
-		}
-		last = info;
-		info = info->next_device;
-	}
-}
-
-static const struct pcmcia_device_id mgslpc_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x02c5, 0x0050),
-	PCMCIA_DEVICE_NULL
-};
-MODULE_DEVICE_TABLE(pcmcia, mgslpc_ids);
-
-static struct pcmcia_driver mgslpc_driver = {
-	.owner		= THIS_MODULE,
-	.name		= "synclink_cs",
-	.probe		= mgslpc_probe,
-	.remove		= mgslpc_detach,
-	.id_table	= mgslpc_ids,
-	.suspend	= mgslpc_suspend,
-	.resume		= mgslpc_resume,
-};
-
-static const struct tty_operations mgslpc_ops = {
-	.open = mgslpc_open,
-	.close = mgslpc_close,
-	.write = mgslpc_write,
-	.put_char = mgslpc_put_char,
-	.flush_chars = mgslpc_flush_chars,
-	.write_room = mgslpc_write_room,
-	.chars_in_buffer = mgslpc_chars_in_buffer,
-	.flush_buffer = mgslpc_flush_buffer,
-	.ioctl = mgslpc_ioctl,
-	.throttle = mgslpc_throttle,
-	.unthrottle = mgslpc_unthrottle,
-	.send_xchar = mgslpc_send_xchar,
-	.break_ctl = mgslpc_break,
-	.wait_until_sent = mgslpc_wait_until_sent,
-	.set_termios = mgslpc_set_termios,
-	.stop = tx_pause,
-	.start = tx_release,
-	.hangup = mgslpc_hangup,
-	.tiocmget = tiocmget,
-	.tiocmset = tiocmset,
-	.get_icount = mgslpc_get_icount,
-	.proc_show = mgslpc_proc_show,
-};
-
-static int __init synclink_cs_init(void)
-{
-	int rc;
-
-	if (break_on_load) {
-		mgslpc_get_text_ptr();
-		BREAKPOINT();
-	}
-
-	serial_driver = tty_alloc_driver(MAX_DEVICE_COUNT,
-			TTY_DRIVER_REAL_RAW |
-			TTY_DRIVER_DYNAMIC_DEV);
-	if (IS_ERR(serial_driver)) {
-		rc = PTR_ERR(serial_driver);
-		goto err;
-	}
-
-	/* Initialize the tty_driver structure */
-	serial_driver->driver_name = "synclink_cs";
-	serial_driver->name = "ttySLP";
-	serial_driver->major = ttymajor;
-	serial_driver->minor_start = 64;
-	serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
-	serial_driver->subtype = SERIAL_TYPE_NORMAL;
-	serial_driver->init_termios = tty_std_termios;
-	serial_driver->init_termios.c_cflag =
-	B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	tty_set_operations(serial_driver, &mgslpc_ops);
-
-	rc = tty_register_driver(serial_driver);
-	if (rc < 0) {
-		printk(KERN_ERR "%s(%d):Couldn't register serial driver\n",
-				__FILE__, __LINE__);
-		goto err_put_tty;
-	}
-
-	rc = pcmcia_register_driver(&mgslpc_driver);
-	if (rc < 0)
-		goto err_unreg_tty;
-
-	printk(KERN_INFO "%s %s, tty major#%d\n", driver_name, driver_version,
-			serial_driver->major);
-
-	return 0;
-err_unreg_tty:
-	tty_unregister_driver(serial_driver);
-err_put_tty:
-	tty_driver_kref_put(serial_driver);
-err:
-	return rc;
-}
-
-static void __exit synclink_cs_exit(void)
-{
-	pcmcia_unregister_driver(&mgslpc_driver);
-	tty_unregister_driver(serial_driver);
-	tty_driver_kref_put(serial_driver);
-}
-
-module_init(synclink_cs_init);
-module_exit(synclink_cs_exit);
-
-static void mgslpc_set_rate(MGSLPC_INFO *info, unsigned char channel, unsigned int rate)
-{
-	unsigned int M, N;
-	unsigned char val;
-
-	/* note:standard BRG mode is broken in V3.2 chip
-	 * so enhanced mode is always used
-	 */
-
-	if (rate) {
-		N = 3686400 / rate;
-		if (!N)
-			N = 1;
-		N >>= 1;
-		for (M = 1; N > 64 && M < 16; M++)
-			N >>= 1;
-		N--;
-
-		/* BGR[5..0] = N
-		 * BGR[9..6] = M
-		 * BGR[7..0] contained in BGR register
-		 * BGR[9..8] contained in CCR2[7..6]
-		 * divisor = (N+1)*2^M
-		 *
-		 * Note: M *must* not be zero (causes asymetric duty cycle)
-		 */
-		write_reg(info, (unsigned char) (channel + BGR),
-				  (unsigned char) ((M << 6) + N));
-		val = read_reg(info, (unsigned char) (channel + CCR2)) & 0x3f;
-		val |= ((M << 4) & 0xc0);
-		write_reg(info, (unsigned char) (channel + CCR2), val);
-	}
-}
-
-/* Enabled the AUX clock output at the specified frequency.
- */
-static void enable_auxclk(MGSLPC_INFO *info)
-{
-	unsigned char val;
-
-	/* MODE
-	 *
-	 * 07..06  MDS[1..0] 10 = transparent HDLC mode
-	 * 05      ADM Address Mode, 0 = no addr recognition
-	 * 04      TMD Timer Mode, 0 = external
-	 * 03      RAC Receiver Active, 0 = inactive
-	 * 02      RTS 0=RTS active during xmit, 1=RTS always active
-	 * 01      TRS Timer Resolution, 1=512
-	 * 00      TLP Test Loop, 0 = no loop
-	 *
-	 * 1000 0010
-	 */
-	val = 0x82;
-
-	/* channel B RTS is used to enable AUXCLK driver on SP505 */
-	if (info->params.mode == MGSL_MODE_HDLC && info->params.clock_speed)
-		val |= BIT2;
-	write_reg(info, CHB + MODE, val);
-
-	/* CCR0
-	 *
-	 * 07      PU Power Up, 1=active, 0=power down
-	 * 06      MCE Master Clock Enable, 1=enabled
-	 * 05      Reserved, 0
-	 * 04..02  SC[2..0] Encoding
-	 * 01..00  SM[1..0] Serial Mode, 00=HDLC
-	 *
-	 * 11000000
-	 */
-	write_reg(info, CHB + CCR0, 0xc0);
-
-	/* CCR1
-	 *
-	 * 07      SFLG Shared Flag, 0 = disable shared flags
-	 * 06      GALP Go Active On Loop, 0 = not used
-	 * 05      GLP Go On Loop, 0 = not used
-	 * 04      ODS Output Driver Select, 1=TxD is push-pull output
-	 * 03      ITF Interframe Time Fill, 0=mark, 1=flag
-	 * 02..00  CM[2..0] Clock Mode
-	 *
-	 * 0001 0111
-	 */
-	write_reg(info, CHB + CCR1, 0x17);
-
-	/* CCR2 (Channel B)
-	 *
-	 * 07..06  BGR[9..8] Baud rate bits 9..8
-	 * 05      BDF Baud rate divisor factor, 0=1, 1=BGR value
-	 * 04      SSEL Clock source select, 1=submode b
-	 * 03      TOE 0=TxCLK is input, 1=TxCLK is output
-	 * 02      RWX Read/Write Exchange 0=disabled
-	 * 01      C32, CRC select, 0=CRC-16, 1=CRC-32
-	 * 00      DIV, data inversion 0=disabled, 1=enabled
-	 *
-	 * 0011 1000
-	 */
-	if (info->params.mode == MGSL_MODE_HDLC && info->params.clock_speed)
-		write_reg(info, CHB + CCR2, 0x38);
-	else
-		write_reg(info, CHB + CCR2, 0x30);
-
-	/* CCR4
-	 *
-	 * 07      MCK4 Master Clock Divide by 4, 1=enabled
-	 * 06      EBRG Enhanced Baud Rate Generator Mode, 1=enabled
-	 * 05      TST1 Test Pin, 0=normal operation
-	 * 04      ICD Ivert Carrier Detect, 1=enabled (active low)
-	 * 03..02  Reserved, must be 0
-	 * 01..00  RFT[1..0] RxFIFO Threshold 00=32 bytes
-	 *
-	 * 0101 0000
-	 */
-	write_reg(info, CHB + CCR4, 0x50);
-
-	/* if auxclk not enabled, set internal BRG so
-	 * CTS transitions can be detected (requires TxC)
-	 */
-	if (info->params.mode == MGSL_MODE_HDLC && info->params.clock_speed)
-		mgslpc_set_rate(info, CHB, info->params.clock_speed);
-	else
-		mgslpc_set_rate(info, CHB, 921600);
-}
-
-static void loopback_enable(MGSLPC_INFO *info)
-{
-	unsigned char val;
-
-	/* CCR1:02..00  CM[2..0] Clock Mode = 111 (clock mode 7) */
-	val = read_reg(info, CHA + CCR1) | (BIT2 | BIT1 | BIT0);
-	write_reg(info, CHA + CCR1, val);
-
-	/* CCR2:04 SSEL Clock source select, 1=submode b */
-	val = read_reg(info, CHA + CCR2) | (BIT4 | BIT5);
-	write_reg(info, CHA + CCR2, val);
-
-	/* set LinkSpeed if available, otherwise default to 2Mbps */
-	if (info->params.clock_speed)
-		mgslpc_set_rate(info, CHA, info->params.clock_speed);
-	else
-		mgslpc_set_rate(info, CHA, 1843200);
-
-	/* MODE:00 TLP Test Loop, 1=loopback enabled */
-	val = read_reg(info, CHA + MODE) | BIT0;
-	write_reg(info, CHA + MODE, val);
-}
-
-static void hdlc_mode(MGSLPC_INFO *info)
-{
-	unsigned char val;
-	unsigned char clkmode, clksubmode;
-
-	/* disable all interrupts */
-	irq_disable(info, CHA, 0xffff);
-	irq_disable(info, CHB, 0xffff);
-	port_irq_disable(info, 0xff);
-
-	/* assume clock mode 0a, rcv=RxC xmt=TxC */
-	clkmode = clksubmode = 0;
-	if (info->params.flags & HDLC_FLAG_RXC_DPLL
-	    && info->params.flags & HDLC_FLAG_TXC_DPLL) {
-		/* clock mode 7a, rcv = DPLL, xmt = DPLL */
-		clkmode = 7;
-	} else if (info->params.flags & HDLC_FLAG_RXC_BRG
-		 && info->params.flags & HDLC_FLAG_TXC_BRG) {
-		/* clock mode 7b, rcv = BRG, xmt = BRG */
-		clkmode = 7;
-		clksubmode = 1;
-	} else if (info->params.flags & HDLC_FLAG_RXC_DPLL) {
-		if (info->params.flags & HDLC_FLAG_TXC_BRG) {
-			/* clock mode 6b, rcv = DPLL, xmt = BRG/16 */
-			clkmode = 6;
-			clksubmode = 1;
-		} else {
-			/* clock mode 6a, rcv = DPLL, xmt = TxC */
-			clkmode = 6;
-		}
-	} else if (info->params.flags & HDLC_FLAG_TXC_BRG) {
-		/* clock mode 0b, rcv = RxC, xmt = BRG */
-		clksubmode = 1;
-	}
-
-	/* MODE
-	 *
-	 * 07..06  MDS[1..0] 10 = transparent HDLC mode
-	 * 05      ADM Address Mode, 0 = no addr recognition
-	 * 04      TMD Timer Mode, 0 = external
-	 * 03      RAC Receiver Active, 0 = inactive
-	 * 02      RTS 0=RTS active during xmit, 1=RTS always active
-	 * 01      TRS Timer Resolution, 1=512
-	 * 00      TLP Test Loop, 0 = no loop
-	 *
-	 * 1000 0010
-	 */
-	val = 0x82;
-	if (info->params.loopback)
-		val |= BIT0;
-
-	/* preserve RTS state */
-	if (info->serial_signals & SerialSignal_RTS)
-		val |= BIT2;
-	write_reg(info, CHA + MODE, val);
-
-	/* CCR0
-	 *
-	 * 07      PU Power Up, 1=active, 0=power down
-	 * 06      MCE Master Clock Enable, 1=enabled
-	 * 05      Reserved, 0
-	 * 04..02  SC[2..0] Encoding
-	 * 01..00  SM[1..0] Serial Mode, 00=HDLC
-	 *
-	 * 11000000
-	 */
-	val = 0xc0;
-	switch (info->params.encoding)
-	{
-	case HDLC_ENCODING_NRZI:
-		val |= BIT3;
-		break;
-	case HDLC_ENCODING_BIPHASE_SPACE:
-		val |= BIT4;
-		break;		// FM0
-	case HDLC_ENCODING_BIPHASE_MARK:
-		val |= BIT4 | BIT2;
-		break;		// FM1
-	case HDLC_ENCODING_BIPHASE_LEVEL:
-		val |= BIT4 | BIT3;
-		break;		// Manchester
-	}
-	write_reg(info, CHA + CCR0, val);
-
-	/* CCR1
-	 *
-	 * 07      SFLG Shared Flag, 0 = disable shared flags
-	 * 06      GALP Go Active On Loop, 0 = not used
-	 * 05      GLP Go On Loop, 0 = not used
-	 * 04      ODS Output Driver Select, 1=TxD is push-pull output
-	 * 03      ITF Interframe Time Fill, 0=mark, 1=flag
-	 * 02..00  CM[2..0] Clock Mode
-	 *
-	 * 0001 0000
-	 */
-	val = 0x10 + clkmode;
-	write_reg(info, CHA + CCR1, val);
-
-	/* CCR2
-	 *
-	 * 07..06  BGR[9..8] Baud rate bits 9..8
-	 * 05      BDF Baud rate divisor factor, 0=1, 1=BGR value
-	 * 04      SSEL Clock source select, 1=submode b
-	 * 03      TOE 0=TxCLK is input, 0=TxCLK is input
-	 * 02      RWX Read/Write Exchange 0=disabled
-	 * 01      C32, CRC select, 0=CRC-16, 1=CRC-32
-	 * 00      DIV, data inversion 0=disabled, 1=enabled
-	 *
-	 * 0000 0000
-	 */
-	val = 0x00;
-	if (clkmode == 2 || clkmode == 3 || clkmode == 6
-	    || clkmode == 7 || (clkmode == 0 && clksubmode == 1))
-		val |= BIT5;
-	if (clksubmode)
-		val |= BIT4;
-	if (info->params.crc_type == HDLC_CRC_32_CCITT)
-		val |= BIT1;
-	if (info->params.encoding == HDLC_ENCODING_NRZB)
-		val |= BIT0;
-	write_reg(info, CHA + CCR2, val);
-
-	/* CCR3
-	 *
-	 * 07..06  PRE[1..0] Preamble count 00=1, 01=2, 10=4, 11=8
-	 * 05      EPT Enable preamble transmission, 1=enabled
-	 * 04      RADD Receive address pushed to FIFO, 0=disabled
-	 * 03      CRL CRC Reset Level, 0=FFFF
-	 * 02      RCRC Rx CRC 0=On 1=Off
-	 * 01      TCRC Tx CRC 0=On 1=Off
-	 * 00      PSD DPLL Phase Shift Disable
-	 *
-	 * 0000 0000
-	 */
-	val = 0x00;
-	if (info->params.crc_type == HDLC_CRC_NONE)
-		val |= BIT2 | BIT1;
-	if (info->params.preamble != HDLC_PREAMBLE_PATTERN_NONE)
-		val |= BIT5;
-	switch (info->params.preamble_length)
-	{
-	case HDLC_PREAMBLE_LENGTH_16BITS:
-		val |= BIT6;
-		break;
-	case HDLC_PREAMBLE_LENGTH_32BITS:
-		val |= BIT6;
-		break;
-	case HDLC_PREAMBLE_LENGTH_64BITS:
-		val |= BIT7 | BIT6;
-		break;
-	}
-	write_reg(info, CHA + CCR3, val);
-
-	/* PRE - Preamble pattern */
-	val = 0;
-	switch (info->params.preamble)
-	{
-	case HDLC_PREAMBLE_PATTERN_FLAGS: val = 0x7e; break;
-	case HDLC_PREAMBLE_PATTERN_10:    val = 0xaa; break;
-	case HDLC_PREAMBLE_PATTERN_01:    val = 0x55; break;
-	case HDLC_PREAMBLE_PATTERN_ONES:  val = 0xff; break;
-	}
-	write_reg(info, CHA + PRE, val);
-
-	/* CCR4
-	 *
-	 * 07      MCK4 Master Clock Divide by 4, 1=enabled
-	 * 06      EBRG Enhanced Baud Rate Generator Mode, 1=enabled
-	 * 05      TST1 Test Pin, 0=normal operation
-	 * 04      ICD Ivert Carrier Detect, 1=enabled (active low)
-	 * 03..02  Reserved, must be 0
-	 * 01..00  RFT[1..0] RxFIFO Threshold 00=32 bytes
-	 *
-	 * 0101 0000
-	 */
-	val = 0x50;
-	write_reg(info, CHA + CCR4, val);
-	if (info->params.flags & HDLC_FLAG_RXC_DPLL)
-		mgslpc_set_rate(info, CHA, info->params.clock_speed * 16);
-	else
-		mgslpc_set_rate(info, CHA, info->params.clock_speed);
-
-	/* RLCR Receive length check register
-	 *
-	 * 7     1=enable receive length check
-	 * 6..0  Max frame length = (RL + 1) * 32
-	 */
-	write_reg(info, CHA + RLCR, 0);
-
-	/* XBCH Transmit Byte Count High
-	 *
-	 * 07      DMA mode, 0 = interrupt driven
-	 * 06      NRM, 0=ABM (ignored)
-	 * 05      CAS Carrier Auto Start
-	 * 04      XC Transmit Continuously (ignored)
-	 * 03..00  XBC[10..8] Transmit byte count bits 10..8
-	 *
-	 * 0000 0000
-	 */
-	val = 0x00;
-	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
-		val |= BIT5;
-	write_reg(info, CHA + XBCH, val);
-	enable_auxclk(info);
-	if (info->params.loopback || info->testing_irq)
-		loopback_enable(info);
-	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
-	{
-		irq_enable(info, CHB, IRQ_CTS);
-		/* PVR[3] 1=AUTO CTS active */
-		set_reg_bits(info, CHA + PVR, BIT3);
-	} else
-		clear_reg_bits(info, CHA + PVR, BIT3);
-
-	irq_enable(info, CHA,
-			 IRQ_RXEOM | IRQ_RXFIFO | IRQ_ALLSENT |
-			 IRQ_UNDERRUN | IRQ_TXFIFO);
-	issue_command(info, CHA, CMD_TXRESET + CMD_RXRESET);
-	wait_command_complete(info, CHA);
-	read_reg16(info, CHA + ISR);	/* clear pending IRQs */
-
-	/* Master clock mode enabled above to allow reset commands
-	 * to complete even if no data clocks are present.
-	 *
-	 * Disable master clock mode for normal communications because
-	 * V3.2 of the ESCC2 has a bug that prevents the transmit all sent
-	 * IRQ when in master clock mode.
-	 *
-	 * Leave master clock mode enabled for IRQ test because the
-	 * timer IRQ used by the test can only happen in master clock mode.
-	 */
-	if (!info->testing_irq)
-		clear_reg_bits(info, CHA + CCR0, BIT6);
-
-	tx_set_idle(info);
-
-	tx_stop(info);
-	rx_stop(info);
-}
-
-static void rx_stop(MGSLPC_INFO *info)
-{
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):rx_stop(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	/* MODE:03 RAC Receiver Active, 0=inactive */
-	clear_reg_bits(info, CHA + MODE, BIT3);
-
-	info->rx_enabled = false;
-	info->rx_overflow = false;
-}
-
-static void rx_start(MGSLPC_INFO *info)
-{
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):rx_start(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	rx_reset_buffers(info);
-	info->rx_enabled = false;
-	info->rx_overflow = false;
-
-	/* MODE:03 RAC Receiver Active, 1=active */
-	set_reg_bits(info, CHA + MODE, BIT3);
-
-	info->rx_enabled = true;
-}
-
-static void tx_start(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):tx_start(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	if (info->tx_count) {
-		/* If auto RTS enabled and RTS is inactive, then assert */
-		/* RTS and set a flag indicating that the driver should */
-		/* negate RTS when the transmission completes. */
-		info->drop_rts_on_tx_done = false;
-
-		if (info->params.flags & HDLC_FLAG_AUTO_RTS) {
-			get_signals(info);
-			if (!(info->serial_signals & SerialSignal_RTS)) {
-				info->serial_signals |= SerialSignal_RTS;
-				set_signals(info);
-				info->drop_rts_on_tx_done = true;
-			}
-		}
-
-		if (info->params.mode == MGSL_MODE_ASYNC) {
-			if (!info->tx_active) {
-				info->tx_active = true;
-				tx_ready(info, tty);
-			}
-		} else {
-			info->tx_active = true;
-			tx_ready(info, tty);
-			mod_timer(&info->tx_timer, jiffies +
-					msecs_to_jiffies(5000));
-		}
-	}
-
-	if (!info->tx_enabled)
-		info->tx_enabled = true;
-}
-
-static void tx_stop(MGSLPC_INFO *info)
-{
-	if (debug_level >= DEBUG_LEVEL_ISR)
-		printk("%s(%d):tx_stop(%s)\n",
-			 __FILE__, __LINE__, info->device_name);
-
-	del_timer(&info->tx_timer);
-
-	info->tx_enabled = false;
-	info->tx_active = false;
-}
-
-/* Reset the adapter to a known state and prepare it for further use.
- */
-static void reset_device(MGSLPC_INFO *info)
-{
-	/* power up both channels (set BIT7) */
-	write_reg(info, CHA + CCR0, 0x80);
-	write_reg(info, CHB + CCR0, 0x80);
-	write_reg(info, CHA + MODE, 0);
-	write_reg(info, CHB + MODE, 0);
-
-	/* disable all interrupts */
-	irq_disable(info, CHA, 0xffff);
-	irq_disable(info, CHB, 0xffff);
-	port_irq_disable(info, 0xff);
-
-	/* PCR Port Configuration Register
-	 *
-	 * 07..04  DEC[3..0] Serial I/F select outputs
-	 * 03      output, 1=AUTO CTS control enabled
-	 * 02      RI Ring Indicator input 0=active
-	 * 01      DSR input 0=active
-	 * 00      DTR output 0=active
-	 *
-	 * 0000 0110
-	 */
-	write_reg(info, PCR, 0x06);
-
-	/* PVR Port Value Register
-	 *
-	 * 07..04  DEC[3..0] Serial I/F select (0000=disabled)
-	 * 03      AUTO CTS output 1=enabled
-	 * 02      RI Ring Indicator input
-	 * 01      DSR input
-	 * 00      DTR output (1=inactive)
-	 *
-	 * 0000 0001
-	 */
-//	write_reg(info, PVR, PVR_DTR);
-
-	/* IPC Interrupt Port Configuration
-	 *
-	 * 07      VIS 1=Masked interrupts visible
-	 * 06..05  Reserved, 0
-	 * 04..03  SLA Slave address, 00 ignored
-	 * 02      CASM Cascading Mode, 1=daisy chain
-	 * 01..00  IC[1..0] Interrupt Config, 01=push-pull output, active low
-	 *
-	 * 0000 0101
-	 */
-	write_reg(info, IPC, 0x05);
-}
-
-static void async_mode(MGSLPC_INFO *info)
-{
-	unsigned char val;
-
-	/* disable all interrupts */
-	irq_disable(info, CHA, 0xffff);
-	irq_disable(info, CHB, 0xffff);
-	port_irq_disable(info, 0xff);
-
-	/* MODE
-	 *
-	 * 07      Reserved, 0
-	 * 06      FRTS RTS State, 0=active
-	 * 05      FCTS Flow Control on CTS
-	 * 04      FLON Flow Control Enable
-	 * 03      RAC Receiver Active, 0 = inactive
-	 * 02      RTS 0=Auto RTS, 1=manual RTS
-	 * 01      TRS Timer Resolution, 1=512
-	 * 00      TLP Test Loop, 0 = no loop
-	 *
-	 * 0000 0110
-	 */
-	val = 0x06;
-	if (info->params.loopback)
-		val |= BIT0;
-
-	/* preserve RTS state */
-	if (!(info->serial_signals & SerialSignal_RTS))
-		val |= BIT6;
-	write_reg(info, CHA + MODE, val);
-
-	/* CCR0
-	 *
-	 * 07      PU Power Up, 1=active, 0=power down
-	 * 06      MCE Master Clock Enable, 1=enabled
-	 * 05      Reserved, 0
-	 * 04..02  SC[2..0] Encoding, 000=NRZ
-	 * 01..00  SM[1..0] Serial Mode, 11=Async
-	 *
-	 * 1000 0011
-	 */
-	write_reg(info, CHA + CCR0, 0x83);
-
-	/* CCR1
-	 *
-	 * 07..05  Reserved, 0
-	 * 04      ODS Output Driver Select, 1=TxD is push-pull output
-	 * 03      BCR Bit Clock Rate, 1=16x
-	 * 02..00  CM[2..0] Clock Mode, 111=BRG
-	 *
-	 * 0001 1111
-	 */
-	write_reg(info, CHA + CCR1, 0x1f);
-
-	/* CCR2 (channel A)
-	 *
-	 * 07..06  BGR[9..8] Baud rate bits 9..8
-	 * 05      BDF Baud rate divisor factor, 0=1, 1=BGR value
-	 * 04      SSEL Clock source select, 1=submode b
-	 * 03      TOE 0=TxCLK is input, 0=TxCLK is input
-	 * 02      RWX Read/Write Exchange 0=disabled
-	 * 01      Reserved, 0
-	 * 00      DIV, data inversion 0=disabled, 1=enabled
-	 *
-	 * 0001 0000
-	 */
-	write_reg(info, CHA + CCR2, 0x10);
-
-	/* CCR3
-	 *
-	 * 07..01  Reserved, 0
-	 * 00      PSD DPLL Phase Shift Disable
-	 *
-	 * 0000 0000
-	 */
-	write_reg(info, CHA + CCR3, 0);
-
-	/* CCR4
-	 *
-	 * 07      MCK4 Master Clock Divide by 4, 1=enabled
-	 * 06      EBRG Enhanced Baud Rate Generator Mode, 1=enabled
-	 * 05      TST1 Test Pin, 0=normal operation
-	 * 04      ICD Ivert Carrier Detect, 1=enabled (active low)
-	 * 03..00  Reserved, must be 0
-	 *
-	 * 0101 0000
-	 */
-	write_reg(info, CHA + CCR4, 0x50);
-	mgslpc_set_rate(info, CHA, info->params.data_rate * 16);
-
-	/* DAFO Data Format
-	 *
-	 * 07      Reserved, 0
-	 * 06      XBRK transmit break, 0=normal operation
-	 * 05      Stop bits (0=1, 1=2)
-	 * 04..03  PAR[1..0] Parity (01=odd, 10=even)
-	 * 02      PAREN Parity Enable
-	 * 01..00  CHL[1..0] Character Length (00=8, 01=7)
-	 *
-	 */
-	val = 0x00;
-	if (info->params.data_bits != 8)
-		val |= BIT0;	/* 7 bits */
-	if (info->params.stop_bits != 1)
-		val |= BIT5;
-	if (info->params.parity != ASYNC_PARITY_NONE)
-	{
-		val |= BIT2;	/* Parity enable */
-		if (info->params.parity == ASYNC_PARITY_ODD)
-			val |= BIT3;
-		else
-			val |= BIT4;
-	}
-	write_reg(info, CHA + DAFO, val);
-
-	/* RFC Rx FIFO Control
-	 *
-	 * 07      Reserved, 0
-	 * 06      DPS, 1=parity bit not stored in data byte
-	 * 05      DXS, 0=all data stored in FIFO (including XON/XOFF)
-	 * 04      RFDF Rx FIFO Data Format, 1=status byte stored in FIFO
-	 * 03..02  RFTH[1..0], rx threshold, 11=16 status + 16 data byte
-	 * 01      Reserved, 0
-	 * 00      TCDE Terminate Char Detect Enable, 0=disabled
-	 *
-	 * 0101 1100
-	 */
-	write_reg(info, CHA + RFC, 0x5c);
-
-	/* RLCR Receive length check register
-	 *
-	 * Max frame length = (RL + 1) * 32
-	 */
-	write_reg(info, CHA + RLCR, 0);
-
-	/* XBCH Transmit Byte Count High
-	 *
-	 * 07      DMA mode, 0 = interrupt driven
-	 * 06      NRM, 0=ABM (ignored)
-	 * 05      CAS Carrier Auto Start
-	 * 04      XC Transmit Continuously (ignored)
-	 * 03..00  XBC[10..8] Transmit byte count bits 10..8
-	 *
-	 * 0000 0000
-	 */
-	val = 0x00;
-	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
-		val |= BIT5;
-	write_reg(info, CHA + XBCH, val);
-	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
-		irq_enable(info, CHA, IRQ_CTS);
-
-	/* MODE:03 RAC Receiver Active, 1=active */
-	set_reg_bits(info, CHA + MODE, BIT3);
-	enable_auxclk(info);
-	if (info->params.flags & HDLC_FLAG_AUTO_CTS) {
-		irq_enable(info, CHB, IRQ_CTS);
-		/* PVR[3] 1=AUTO CTS active */
-		set_reg_bits(info, CHA + PVR, BIT3);
-	} else
-		clear_reg_bits(info, CHA + PVR, BIT3);
-	irq_enable(info, CHA,
-			  IRQ_RXEOM | IRQ_RXFIFO | IRQ_BREAK_ON | IRQ_RXTIME |
-			  IRQ_ALLSENT | IRQ_TXFIFO);
-	issue_command(info, CHA, CMD_TXRESET + CMD_RXRESET);
-	wait_command_complete(info, CHA);
-	read_reg16(info, CHA + ISR);	/* clear pending IRQs */
-}
-
-/* Set the HDLC idle mode for the transmitter.
- */
-static void tx_set_idle(MGSLPC_INFO *info)
-{
-	/* Note: ESCC2 only supports flags and one idle modes */
-	if (info->idle_mode == HDLC_TXIDLE_FLAGS)
-		set_reg_bits(info, CHA + CCR1, BIT3);
-	else
-		clear_reg_bits(info, CHA + CCR1, BIT3);
-}
-
-/* get state of the V24 status (input) signals.
- */
-static void get_signals(MGSLPC_INFO *info)
-{
-	unsigned char status = 0;
-
-	/* preserve RTS and DTR */
-	info->serial_signals &= SerialSignal_RTS | SerialSignal_DTR;
-
-	if (read_reg(info, CHB + VSTR) & BIT7)
-		info->serial_signals |= SerialSignal_DCD;
-	if (read_reg(info, CHB + STAR) & BIT1)
-		info->serial_signals |= SerialSignal_CTS;
-
-	status = read_reg(info, CHA + PVR);
-	if (!(status & PVR_RI))
-		info->serial_signals |= SerialSignal_RI;
-	if (!(status & PVR_DSR))
-		info->serial_signals |= SerialSignal_DSR;
-}
-
-/* Set the state of RTS and DTR based on contents of
- * serial_signals member of device extension.
- */
-static void set_signals(MGSLPC_INFO *info)
-{
-	unsigned char val;
-
-	val = read_reg(info, CHA + MODE);
-	if (info->params.mode == MGSL_MODE_ASYNC) {
-		if (info->serial_signals & SerialSignal_RTS)
-			val &= ~BIT6;
-		else
-			val |= BIT6;
-	} else {
-		if (info->serial_signals & SerialSignal_RTS)
-			val |= BIT2;
-		else
-			val &= ~BIT2;
-	}
-	write_reg(info, CHA + MODE, val);
-
-	if (info->serial_signals & SerialSignal_DTR)
-		clear_reg_bits(info, CHA + PVR, PVR_DTR);
-	else
-		set_reg_bits(info, CHA + PVR, PVR_DTR);
-}
-
-static void rx_reset_buffers(MGSLPC_INFO *info)
-{
-	RXBUF *buf;
-	int i;
-
-	info->rx_put = 0;
-	info->rx_get = 0;
-	info->rx_frame_count = 0;
-	for (i=0 ; i < info->rx_buf_count ; i++) {
-		buf = (RXBUF*)(info->rx_buf + (i * info->rx_buf_size));
-		buf->status = buf->count = 0;
-	}
-}
-
-/* Attempt to return a received HDLC frame
- * Only frames received without errors are returned.
- *
- * Returns true if frame returned, otherwise false
- */
-static bool rx_get_frame(MGSLPC_INFO *info, struct tty_struct *tty)
-{
-	unsigned short status;
-	RXBUF *buf;
-	unsigned int framesize = 0;
-	unsigned long flags;
-	bool return_frame = false;
-
-	if (info->rx_frame_count == 0)
-		return false;
-
-	buf = (RXBUF*)(info->rx_buf + (info->rx_get * info->rx_buf_size));
-
-	status = buf->status;
-
-	/* 07  VFR  1=valid frame
-	 * 06  RDO  1=data overrun
-	 * 05  CRC  1=OK, 0=error
-	 * 04  RAB  1=frame aborted
-	 */
-	if ((status & 0xf0) != 0xA0) {
-		if (!(status & BIT7) || (status & BIT4))
-			info->icount.rxabort++;
-		else if (status & BIT6)
-			info->icount.rxover++;
-		else if (!(status & BIT5)) {
-			info->icount.rxcrc++;
-			if (info->params.crc_type & HDLC_CRC_RETURN_EX)
-				return_frame = true;
-		}
-		framesize = 0;
-#if SYNCLINK_GENERIC_HDLC
-		{
-			info->netdev->stats.rx_errors++;
-			info->netdev->stats.rx_frame_errors++;
-		}
-#endif
-	} else
-		return_frame = true;
-
-	if (return_frame)
-		framesize = buf->count;
-
-	if (debug_level >= DEBUG_LEVEL_BH)
-		printk("%s(%d):rx_get_frame(%s) status=%04X size=%d\n",
-			__FILE__, __LINE__, info->device_name, status, framesize);
-
-	if (debug_level >= DEBUG_LEVEL_DATA)
-		trace_block(info, buf->data, framesize, 0);
-
-	if (framesize) {
-		if ((info->params.crc_type & HDLC_CRC_RETURN_EX &&
-		      framesize+1 > info->max_frame_size) ||
-		    framesize > info->max_frame_size)
-			info->icount.rxlong++;
-		else {
-			if (status & BIT5)
-				info->icount.rxok++;
-
-			if (info->params.crc_type & HDLC_CRC_RETURN_EX) {
-				*(buf->data + framesize) = status & BIT5 ? RX_OK:RX_CRC_ERROR;
-				++framesize;
-			}
-
-#if SYNCLINK_GENERIC_HDLC
-			if (info->netcount)
-				hdlcdev_rx(info, buf->data, framesize);
-			else
-#endif
-				ldisc_receive_buf(tty, buf->data, info->flag_buf, framesize);
-		}
-	}
-
-	spin_lock_irqsave(&info->lock, flags);
-	buf->status = buf->count = 0;
-	info->rx_frame_count--;
-	info->rx_get++;
-	if (info->rx_get >= info->rx_buf_count)
-		info->rx_get = 0;
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return true;
-}
-
-static bool register_test(MGSLPC_INFO *info)
-{
-	static unsigned char patterns[] =
-	    { 0x00, 0xff, 0xaa, 0x55, 0x69, 0x96, 0x0f };
-	static unsigned int count = ARRAY_SIZE(patterns);
-	unsigned int i;
-	bool rc = true;
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	reset_device(info);
-
-	for (i = 0; i < count; i++) {
-		write_reg(info, XAD1, patterns[i]);
-		write_reg(info, XAD2, patterns[(i + 1) % count]);
-		if ((read_reg(info, XAD1) != patterns[i]) ||
-		    (read_reg(info, XAD2) != patterns[(i + 1) % count])) {
-			rc = false;
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&info->lock, flags);
-	return rc;
-}
-
-static bool irq_test(MGSLPC_INFO *info)
-{
-	unsigned long end_time;
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->lock, flags);
-	reset_device(info);
-
-	info->testing_irq = true;
-	hdlc_mode(info);
-
-	info->irq_occurred = false;
-
-	/* init hdlc mode */
-
-	irq_enable(info, CHA, IRQ_TIMER);
-	write_reg(info, CHA + TIMR, 0);	/* 512 cycles */
-	issue_command(info, CHA, CMD_START_TIMER);
-
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	end_time=100;
-	while(end_time-- && !info->irq_occurred) {
-		msleep_interruptible(10);
-	}
-
-	info->testing_irq = false;
-
-	spin_lock_irqsave(&info->lock, flags);
-	reset_device(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return info->irq_occurred;
-}
-
-static int adapter_test(MGSLPC_INFO *info)
-{
-	if (!register_test(info)) {
-		info->init_error = DiagStatus_AddressFailure;
-		printk("%s(%d):Register test failure for device %s Addr=%04X\n",
-			__FILE__, __LINE__, info->device_name, (unsigned short)(info->io_base));
-		return -ENODEV;
-	}
-
-	if (!irq_test(info)) {
-		info->init_error = DiagStatus_IrqFailure;
-		printk("%s(%d):Interrupt test failure for device %s IRQ=%d\n",
-			__FILE__, __LINE__, info->device_name, (unsigned short)(info->irq_level));
-		return -ENODEV;
-	}
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):device %s passed diagnostics\n",
-			__FILE__, __LINE__, info->device_name);
-	return 0;
-}
-
-static void trace_block(MGSLPC_INFO *info,const char* data, int count, int xmit)
-{
-	int i;
-	int linecount;
-	if (xmit)
-		printk("%s tx data:\n", info->device_name);
-	else
-		printk("%s rx data:\n", info->device_name);
-
-	while(count) {
-		if (count > 16)
-			linecount = 16;
-		else
-			linecount = count;
-
-		for(i=0;i<linecount;i++)
-			printk("%02X ", (unsigned char)data[i]);
-		for(;i<17;i++)
-			printk("   ");
-		for(i=0;i<linecount;i++) {
-			if (data[i]>=040 && data[i]<=0176)
-				printk("%c", data[i]);
-			else
-				printk(".");
-		}
-		printk("\n");
-
-		data  += linecount;
-		count -= linecount;
-	}
-}
-
-/* HDLC frame time out
- * update stats and do tx completion processing
- */
-static void tx_timeout(struct timer_list *t)
-{
-	MGSLPC_INFO *info = from_timer(info, t, tx_timer);
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):tx_timeout(%s)\n",
-			__FILE__, __LINE__, info->device_name);
-	if (info->tx_active &&
-	    info->params.mode == MGSL_MODE_HDLC) {
-		info->icount.txtimeout++;
-	}
-	spin_lock_irqsave(&info->lock, flags);
-	info->tx_active = false;
-	info->tx_count = info->tx_put = info->tx_get = 0;
-
-	spin_unlock_irqrestore(&info->lock, flags);
-
-#if SYNCLINK_GENERIC_HDLC
-	if (info->netcount)
-		hdlcdev_tx_done(info);
-	else
-#endif
-	{
-		struct tty_struct *tty = tty_port_tty_get(&info->port);
-		bh_transmit(info, tty);
-		tty_kref_put(tty);
-	}
-}
-
-#if SYNCLINK_GENERIC_HDLC
-
-/*
- * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
- * set encoding and frame check sequence (FCS) options
- *
- * dev       pointer to network device structure
- * encoding  serial encoding setting
- * parity    FCS setting
- *
- * returns 0 if success, otherwise error code
- */
-static int hdlcdev_attach(struct net_device *dev, unsigned short encoding,
-			  unsigned short parity)
-{
-	MGSLPC_INFO *info = dev_to_port(dev);
-	struct tty_struct *tty;
-	unsigned char  new_encoding;
-	unsigned short new_crctype;
-
-	/* return error if TTY interface open */
-	if (info->port.count)
-		return -EBUSY;
-
-	switch (encoding)
-	{
-	case ENCODING_NRZ:        new_encoding = HDLC_ENCODING_NRZ; break;
-	case ENCODING_NRZI:       new_encoding = HDLC_ENCODING_NRZI_SPACE; break;
-	case ENCODING_FM_MARK:    new_encoding = HDLC_ENCODING_BIPHASE_MARK; break;
-	case ENCODING_FM_SPACE:   new_encoding = HDLC_ENCODING_BIPHASE_SPACE; break;
-	case ENCODING_MANCHESTER: new_encoding = HDLC_ENCODING_BIPHASE_LEVEL; break;
-	default: return -EINVAL;
-	}
-
-	switch (parity)
-	{
-	case PARITY_NONE:            new_crctype = HDLC_CRC_NONE; break;
-	case PARITY_CRC16_PR1_CCITT: new_crctype = HDLC_CRC_16_CCITT; break;
-	case PARITY_CRC32_PR1_CCITT: new_crctype = HDLC_CRC_32_CCITT; break;
-	default: return -EINVAL;
-	}
-
-	info->params.encoding = new_encoding;
-	info->params.crc_type = new_crctype;
-
-	/* if network interface up, reprogram hardware */
-	if (info->netcount) {
-		tty = tty_port_tty_get(&info->port);
-		mgslpc_program_hw(info, tty);
-		tty_kref_put(tty);
-	}
-
-	return 0;
-}
-
-/*
- * called by generic HDLC layer to send frame
- *
- * skb  socket buffer containing HDLC frame
- * dev  pointer to network device structure
- */
-static netdev_tx_t hdlcdev_xmit(struct sk_buff *skb,
-				      struct net_device *dev)
-{
-	MGSLPC_INFO *info = dev_to_port(dev);
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk(KERN_INFO "%s:hdlc_xmit(%s)\n", __FILE__, dev->name);
-
-	/* stop sending until this frame completes */
-	netif_stop_queue(dev);
-
-	/* copy data to device buffers */
-	skb_copy_from_linear_data(skb, info->tx_buf, skb->len);
-	info->tx_get = 0;
-	info->tx_put = info->tx_count = skb->len;
-
-	/* update network statistics */
-	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += skb->len;
-
-	/* done with socket buffer, so free it */
-	dev_kfree_skb(skb);
-
-	/* save start time for transmit timeout detection */
-	netif_trans_update(dev);
-
-	/* start hardware transmitter if necessary */
-	spin_lock_irqsave(&info->lock, flags);
-	if (!info->tx_active) {
-		struct tty_struct *tty = tty_port_tty_get(&info->port);
-		tx_start(info, tty);
-		tty_kref_put(tty);
-	}
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	return NETDEV_TX_OK;
-}
-
-/*
- * called by network layer when interface enabled
- * claim resources and initialize hardware
- *
- * dev  pointer to network device structure
- *
- * returns 0 if success, otherwise error code
- */
-static int hdlcdev_open(struct net_device *dev)
-{
-	MGSLPC_INFO *info = dev_to_port(dev);
-	struct tty_struct *tty;
-	int rc;
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s:hdlcdev_open(%s)\n", __FILE__, dev->name);
-
-	/* generic HDLC layer open processing */
-	rc = hdlc_open(dev);
-	if (rc != 0)
-		return rc;
-
-	/* arbitrate between network and tty opens */
-	spin_lock_irqsave(&info->netlock, flags);
-	if (info->port.count != 0 || info->netcount != 0) {
-		printk(KERN_WARNING "%s: hdlc_open returning busy\n", dev->name);
-		spin_unlock_irqrestore(&info->netlock, flags);
-		return -EBUSY;
-	}
-	info->netcount=1;
-	spin_unlock_irqrestore(&info->netlock, flags);
-
-	tty = tty_port_tty_get(&info->port);
-	/* claim resources and init adapter */
-	rc = startup(info, tty);
-	if (rc != 0) {
-		tty_kref_put(tty);
-		spin_lock_irqsave(&info->netlock, flags);
-		info->netcount=0;
-		spin_unlock_irqrestore(&info->netlock, flags);
-		return rc;
-	}
-	/* assert RTS and DTR, apply hardware settings */
-	info->serial_signals |= SerialSignal_RTS | SerialSignal_DTR;
-	mgslpc_program_hw(info, tty);
-	tty_kref_put(tty);
-
-	/* enable network layer transmit */
-	netif_trans_update(dev);
-	netif_start_queue(dev);
-
-	/* inform generic HDLC layer of current DCD status */
-	spin_lock_irqsave(&info->lock, flags);
-	get_signals(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-	if (info->serial_signals & SerialSignal_DCD)
-		netif_carrier_on(dev);
-	else
-		netif_carrier_off(dev);
-	return 0;
-}
-
-/*
- * called by network layer when interface is disabled
- * shutdown hardware and release resources
- *
- * dev  pointer to network device structure
- *
- * returns 0 if success, otherwise error code
- */
-static int hdlcdev_close(struct net_device *dev)
-{
-	MGSLPC_INFO *info = dev_to_port(dev);
-	struct tty_struct *tty = tty_port_tty_get(&info->port);
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s:hdlcdev_close(%s)\n", __FILE__, dev->name);
-
-	netif_stop_queue(dev);
-
-	/* shutdown adapter and release resources */
-	shutdown(info, tty);
-	tty_kref_put(tty);
-	hdlc_close(dev);
-
-	spin_lock_irqsave(&info->netlock, flags);
-	info->netcount=0;
-	spin_unlock_irqrestore(&info->netlock, flags);
-
-	return 0;
-}
-
-/*
- * called by network layer to process IOCTL call to network device
- *
- * dev  pointer to network device structure
- * ifs  pointer to network interface settings structure
- *
- * returns 0 if success, otherwise error code
- */
-static int hdlcdev_wan_ioctl(struct net_device *dev, struct if_settings *ifs)
-{
-	const size_t size = sizeof(sync_serial_settings);
-	sync_serial_settings new_line;
-	sync_serial_settings __user *line = ifs->ifs_ifsu.sync;
-	MGSLPC_INFO *info = dev_to_port(dev);
-	unsigned int flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s:hdlcdev_ioctl(%s)\n", __FILE__, dev->name);
-
-	/* return error if TTY interface open */
-	if (info->port.count)
-		return -EBUSY;
-
-	memset(&new_line, 0, size);
-
-	switch (ifs->type) {
-	case IF_GET_IFACE: /* return current sync_serial_settings */
-
-		ifs->type = IF_IFACE_SYNC_SERIAL;
-		if (ifs->size < size) {
-			ifs->size = size; /* data size wanted */
-			return -ENOBUFS;
-		}
-
-		flags = info->params.flags & (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
-					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
-					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
-					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
-
-		switch (flags){
-		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN): new_line.clock_type = CLOCK_EXT; break;
-		case (HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_INT; break;
-		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_TXINT; break;
-		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN): new_line.clock_type = CLOCK_TXFROMRX; break;
-		default: new_line.clock_type = CLOCK_DEFAULT;
-		}
-
-		new_line.clock_rate = info->params.clock_speed;
-		new_line.loopback   = info->params.loopback ? 1:0;
-
-		if (copy_to_user(line, &new_line, size))
-			return -EFAULT;
-		return 0;
-
-	case IF_IFACE_SYNC_SERIAL: /* set sync_serial_settings */
-
-		if(!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (copy_from_user(&new_line, line, size))
-			return -EFAULT;
-
-		switch (new_line.clock_type)
-		{
-		case CLOCK_EXT:      flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN; break;
-		case CLOCK_TXFROMRX: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN; break;
-		case CLOCK_INT:      flags = HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG;    break;
-		case CLOCK_TXINT:    flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG;    break;
-		case CLOCK_DEFAULT:  flags = info->params.flags &
-					     (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
-					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
-					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
-					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN); break;
-		default: return -EINVAL;
-		}
-
-		if (new_line.loopback != 0 && new_line.loopback != 1)
-			return -EINVAL;
-
-		info->params.flags &= ~(HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
-					HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
-					HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
-					HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
-		info->params.flags |= flags;
-
-		info->params.loopback = new_line.loopback;
-
-		if (flags & (HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG))
-			info->params.clock_speed = new_line.clock_rate;
-		else
-			info->params.clock_speed = 0;
-
-		/* if network interface up, reprogram hardware */
-		if (info->netcount) {
-			struct tty_struct *tty = tty_port_tty_get(&info->port);
-			mgslpc_program_hw(info, tty);
-			tty_kref_put(tty);
-		}
-		return 0;
-	default:
-		return hdlc_ioctl(dev, ifs);
-	}
-}
-
-/*
- * called by network layer when transmit timeout is detected
- *
- * dev  pointer to network device structure
- */
-static void hdlcdev_tx_timeout(struct net_device *dev, unsigned int txqueue)
-{
-	MGSLPC_INFO *info = dev_to_port(dev);
-	unsigned long flags;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("hdlcdev_tx_timeout(%s)\n", dev->name);
-
-	dev->stats.tx_errors++;
-	dev->stats.tx_aborted_errors++;
-
-	spin_lock_irqsave(&info->lock, flags);
-	tx_stop(info);
-	spin_unlock_irqrestore(&info->lock, flags);
-
-	netif_wake_queue(dev);
-}
-
-/*
- * called by device driver when transmit completes
- * reenable network layer transmit if stopped
- *
- * info  pointer to device instance information
- */
-static void hdlcdev_tx_done(MGSLPC_INFO *info)
-{
-	if (netif_queue_stopped(info->netdev))
-		netif_wake_queue(info->netdev);
-}
-
-/*
- * called by device driver when frame received
- * pass frame to network layer
- *
- * info  pointer to device instance information
- * buf   pointer to buffer contianing frame data
- * size  count of data bytes in buf
- */
-static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int size)
-{
-	struct sk_buff *skb = dev_alloc_skb(size);
-	struct net_device *dev = info->netdev;
-
-	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("hdlcdev_rx(%s)\n", dev->name);
-
-	if (skb == NULL) {
-		printk(KERN_NOTICE "%s: can't alloc skb, dropping packet\n", dev->name);
-		dev->stats.rx_dropped++;
-		return;
-	}
-
-	skb_put_data(skb, buf, size);
-
-	skb->protocol = hdlc_type_trans(skb, dev);
-
-	dev->stats.rx_packets++;
-	dev->stats.rx_bytes += size;
-
-	netif_rx(skb);
-}
-
-static const struct net_device_ops hdlcdev_ops = {
-	.ndo_open       = hdlcdev_open,
-	.ndo_stop       = hdlcdev_close,
-	.ndo_start_xmit = hdlc_start_xmit,
-	.ndo_siocwandev = hdlcdev_wan_ioctl,
-	.ndo_tx_timeout = hdlcdev_tx_timeout,
-};
-
-/*
- * called by device driver when adding device instance
- * do generic HDLC initialization
- *
- * info  pointer to device instance information
- *
- * returns 0 if success, otherwise error code
- */
-static int hdlcdev_init(MGSLPC_INFO *info)
-{
-	int rc;
-	struct net_device *dev;
-	hdlc_device *hdlc;
-
-	/* allocate and initialize network and HDLC layer objects */
-
-	dev = alloc_hdlcdev(info);
-	if (dev == NULL) {
-		printk(KERN_ERR "%s:hdlc device allocation failure\n", __FILE__);
-		return -ENOMEM;
-	}
-
-	/* for network layer reporting purposes only */
-	dev->base_addr = info->io_base;
-	dev->irq       = info->irq_level;
-
-	/* network layer callbacks and settings */
-	dev->netdev_ops	    = &hdlcdev_ops;
-	dev->watchdog_timeo = 10 * HZ;
-	dev->tx_queue_len   = 50;
-
-	/* generic HDLC layer callbacks and settings */
-	hdlc         = dev_to_hdlc(dev);
-	hdlc->attach = hdlcdev_attach;
-	hdlc->xmit   = hdlcdev_xmit;
-
-	/* register objects with HDLC layer */
-	rc = register_hdlc_device(dev);
-	if (rc) {
-		printk(KERN_WARNING "%s:unable to register hdlc device\n", __FILE__);
-		free_netdev(dev);
-		return rc;
-	}
-
-	info->netdev = dev;
-	return 0;
-}
-
-/*
- * called by device driver when removing device instance
- * do generic HDLC cleanup
- *
- * info  pointer to device instance information
- */
-static void hdlcdev_exit(MGSLPC_INFO *info)
-{
-	unregister_hdlc_device(info->netdev);
-	free_netdev(info->netdev);
-	info->netdev = NULL;
-}
-
-#endif /* CONFIG_HDLC */
-
diff --git a/include/linux/cm4000_cs.h b/include/linux/cm4000_cs.h
deleted file mode 100644
index ea4958e07a14..000000000000
--- a/include/linux/cm4000_cs.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef	_CM4000_H_
-#define	_CM4000_H_
-
-#include <uapi/linux/cm4000_cs.h>
-
-
-#define	DEVICE_NAME		"cmm"
-#define	MODULE_NAME		"cm4000_cs"
-
-#endif	/* _CM4000_H_ */
diff --git a/include/uapi/linux/cm4000_cs.h b/include/uapi/linux/cm4000_cs.h
deleted file mode 100644
index c70a62ec8a49..000000000000
--- a/include/uapi/linux/cm4000_cs.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_CM4000_H_
-#define _UAPI_CM4000_H_
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-#define	MAX_ATR			33
-
-#define	CM4000_MAX_DEV		4
-
-/* those two structures are passed via ioctl() from/to userspace.  They are
- * used by existing userspace programs, so I kepth the awkward "bIFSD" naming
- * not to break compilation of userspace apps. -HW */
-
-typedef struct atreq {
-	__s32 atr_len;
-	unsigned char atr[64];
-	__s32 power_act;
-	unsigned char bIFSD;
-	unsigned char bIFSC;
-} atreq_t;
-
-
-/* what is particularly stupid in the original driver is the arch-dependent
- * member sizes. This leads to CONFIG_COMPAT breakage, since 32bit userspace
- * will lay out the structure members differently than the 64bit kernel.
- *
- * I've changed "ptsreq.protocol" from "unsigned long" to "__u32".
- * On 32bit this will make no difference.  With 64bit kernels, it will make
- * 32bit apps work, too.
- */
-
-typedef struct ptsreq {
-	__u32 protocol; /*T=0: 2^0, T=1:  2^1*/
- 	unsigned char flags;
- 	unsigned char pts1;
- 	unsigned char pts2;
-	unsigned char pts3;
-} ptsreq_t;
-
-#define	CM_IOC_MAGIC		'c'
-#define	CM_IOC_MAXNR	        255
-
-#define	CM_IOCGSTATUS		_IOR (CM_IOC_MAGIC, 0, unsigned char *)
-#define	CM_IOCGATR		_IOWR(CM_IOC_MAGIC, 1, atreq_t *)
-#define	CM_IOCSPTS		_IOW (CM_IOC_MAGIC, 2, ptsreq_t *)
-#define	CM_IOCSRDR		_IO  (CM_IOC_MAGIC, 3)
-#define CM_IOCARDOFF            _IO  (CM_IOC_MAGIC, 4)
-
-#define CM_IOSDBGLVL            _IOW(CM_IOC_MAGIC, 250, int*)
-
-/* card and device states */
-#define	CM_CARD_INSERTED		0x01
-#define	CM_CARD_POWERED			0x02
-#define	CM_ATR_PRESENT			0x04
-#define	CM_ATR_VALID	 		0x08
-#define	CM_STATE_VALID			0x0f
-/* extra info only from CM4000 */
-#define	CM_NO_READER			0x10
-#define	CM_BAD_CARD			0x20
-
-
-#endif /* _UAPI_CM4000_H_ */
-- 
cgit v1.2.3


From 8efc52743ecb5c69d2e4faba965a3e14418c2494 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 7 Mar 2023 15:38:16 -0600
Subject: misc: alcor_pci: Use PCI core to manage ASPM instead of open-coding

"priv->ext_config_dev_aspm" was never set to a non-zero value.  Therefore,
alcor_pci_aspm_ctrl(priv, 1) did nothing, and alcor_pci_aspm_ctrl(priv, 0)
always disabled ASPM in the device and the upstream bridge.

The driver disabled ASPM in alcor_pci_probe() and alcor_resume(), so it's
possible the device doesn't work well when ASPM is enabled.

Remove all the ASPM-related code and replace the alcor_pci_aspm_ctrl(0)
calls with pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S |
PCIE_LINK_STATE_L1), which asks the PCI core to disable ASPM.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20230307213816.886308-1-helgaas@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/cardreader/alcor_pci.c | 144 +-----------------------------------
 include/linux/alcor_pci.h           |   7 --
 2 files changed, 4 insertions(+), 147 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/cardreader/alcor_pci.c b/drivers/misc/cardreader/alcor_pci.c
index 9080f9f150a2..5b637171c46c 100644
--- a/drivers/misc/cardreader/alcor_pci.c
+++ b/drivers/misc/cardreader/alcor_pci.c
@@ -95,137 +95,6 @@ u32 alcor_read32be(struct alcor_pci_priv *priv, unsigned int addr)
 }
 EXPORT_SYMBOL_GPL(alcor_read32be);
 
-static int alcor_pci_find_cap_offset(struct alcor_pci_priv *priv,
-				     struct pci_dev *pci)
-{
-	int where;
-	u8 val8;
-	u32 val32;
-
-	where = ALCOR_CAP_START_OFFSET;
-	pci_read_config_byte(pci, where, &val8);
-	if (!val8)
-		return 0;
-
-	where = (int)val8;
-	while (1) {
-		pci_read_config_dword(pci, where, &val32);
-		if (val32 == 0xffffffff) {
-			dev_dbg(priv->dev, "find_cap_offset invalid value %x.\n",
-				val32);
-			return 0;
-		}
-
-		if ((val32 & 0xff) == 0x10) {
-			dev_dbg(priv->dev, "pcie cap offset: %x\n", where);
-			return where;
-		}
-
-		if ((val32 & 0xff00) == 0x00) {
-			dev_dbg(priv->dev, "pci_find_cap_offset invalid value %x.\n",
-				val32);
-			break;
-		}
-		where = (int)((val32 >> 8) & 0xff);
-	}
-
-	return 0;
-}
-
-static void alcor_pci_init_check_aspm(struct alcor_pci_priv *priv)
-{
-	struct pci_dev *pci;
-	int where;
-	u32 val32;
-
-	priv->pdev_cap_off    = alcor_pci_find_cap_offset(priv, priv->pdev);
-	/*
-	 * A device might be attached to root complex directly and
-	 * priv->parent_pdev will be NULL. In this case we don't check its
-	 * capability and disable ASPM completely.
-	 */
-	if (priv->parent_pdev)
-		priv->parent_cap_off = alcor_pci_find_cap_offset(priv,
-							 priv->parent_pdev);
-
-	if ((priv->pdev_cap_off == 0) || (priv->parent_cap_off == 0)) {
-		dev_dbg(priv->dev, "pci_cap_off: %x, parent_cap_off: %x\n",
-			priv->pdev_cap_off, priv->parent_cap_off);
-		return;
-	}
-
-	/* link capability */
-	pci   = priv->pdev;
-	where = priv->pdev_cap_off + ALCOR_PCIE_LINK_CAP_OFFSET;
-	pci_read_config_dword(pci, where, &val32);
-	priv->pdev_aspm_cap = (u8)(val32 >> 10) & 0x03;
-
-	pci   = priv->parent_pdev;
-	where = priv->parent_cap_off + ALCOR_PCIE_LINK_CAP_OFFSET;
-	pci_read_config_dword(pci, where, &val32);
-	priv->parent_aspm_cap = (u8)(val32 >> 10) & 0x03;
-
-	if (priv->pdev_aspm_cap != priv->parent_aspm_cap) {
-		u8 aspm_cap;
-
-		dev_dbg(priv->dev, "pdev_aspm_cap: %x, parent_aspm_cap: %x\n",
-			priv->pdev_aspm_cap, priv->parent_aspm_cap);
-		aspm_cap = priv->pdev_aspm_cap & priv->parent_aspm_cap;
-		priv->pdev_aspm_cap    = aspm_cap;
-		priv->parent_aspm_cap = aspm_cap;
-	}
-
-	dev_dbg(priv->dev, "ext_config_dev_aspm: %x, pdev_aspm_cap: %x\n",
-		priv->ext_config_dev_aspm, priv->pdev_aspm_cap);
-	priv->ext_config_dev_aspm &= priv->pdev_aspm_cap;
-}
-
-static void alcor_pci_aspm_ctrl(struct alcor_pci_priv *priv, u8 aspm_enable)
-{
-	struct pci_dev *pci;
-	u8 aspm_ctrl, i;
-	int where;
-	u32 val32;
-
-	if ((!priv->pdev_cap_off) || (!priv->parent_cap_off)) {
-		dev_dbg(priv->dev, "pci_cap_off: %x, parent_cap_off: %x\n",
-			priv->pdev_cap_off, priv->parent_cap_off);
-		return;
-	}
-
-	if (!priv->pdev_aspm_cap)
-		return;
-
-	aspm_ctrl = 0;
-	if (aspm_enable) {
-		aspm_ctrl = priv->ext_config_dev_aspm;
-
-		if (!aspm_ctrl) {
-			dev_dbg(priv->dev, "aspm_ctrl == 0\n");
-			return;
-		}
-	}
-
-	for (i = 0; i < 2; i++) {
-
-		if (i) {
-			pci   = priv->parent_pdev;
-			where = priv->parent_cap_off
-				+ ALCOR_PCIE_LINK_CTRL_OFFSET;
-		} else {
-			pci   = priv->pdev;
-			where = priv->pdev_cap_off
-				+ ALCOR_PCIE_LINK_CTRL_OFFSET;
-		}
-
-		pci_read_config_dword(pci, where, &val32);
-		val32 &= (~0x03);
-		val32 |= (aspm_ctrl & priv->pdev_aspm_cap);
-		pci_write_config_byte(pci, where, (u8)val32);
-	}
-
-}
-
 static inline void alcor_mask_sd_irqs(struct alcor_pci_priv *priv)
 {
 	alcor_write32(priv, 0, AU6601_REG_INT_ENABLE);
@@ -308,7 +177,6 @@ static int alcor_pci_probe(struct pci_dev *pdev,
 
 	pci_set_master(pdev);
 	pci_set_drvdata(pdev, priv);
-	alcor_pci_init_check_aspm(priv);
 
 	for (i = 0; i < ARRAY_SIZE(alcor_pci_cells); i++) {
 		alcor_pci_cells[i].platform_data = priv;
@@ -319,7 +187,7 @@ static int alcor_pci_probe(struct pci_dev *pdev,
 	if (ret < 0)
 		goto error_clear_drvdata;
 
-	alcor_pci_aspm_ctrl(priv, 0);
+	pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);
 
 	return 0;
 
@@ -339,8 +207,6 @@ static void alcor_pci_remove(struct pci_dev *pdev)
 
 	priv = pci_get_drvdata(pdev);
 
-	alcor_pci_aspm_ctrl(priv, 1);
-
 	mfd_remove_devices(&pdev->dev);
 
 	ida_free(&alcor_pci_idr, priv->id);
@@ -353,18 +219,16 @@ static void alcor_pci_remove(struct pci_dev *pdev)
 #ifdef CONFIG_PM_SLEEP
 static int alcor_suspend(struct device *dev)
 {
-	struct alcor_pci_priv *priv = dev_get_drvdata(dev);
-
-	alcor_pci_aspm_ctrl(priv, 1);
 	return 0;
 }
 
 static int alcor_resume(struct device *dev)
 {
-
 	struct alcor_pci_priv *priv = dev_get_drvdata(dev);
 
-	alcor_pci_aspm_ctrl(priv, 0);
+	pci_disable_link_state(priv->pdev,
+			       PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);
+
 	return 0;
 }
 #endif /* CONFIG_PM_SLEEP */
diff --git a/include/linux/alcor_pci.h b/include/linux/alcor_pci.h
index 8274ed525e9f..c4a0b23846d8 100644
--- a/include/linux/alcor_pci.h
+++ b/include/linux/alcor_pci.h
@@ -268,13 +268,6 @@ struct alcor_pci_priv {
 	unsigned long id; /* idr id */
 
 	struct alcor_dev_cfg	*cfg;
-
-	/* PCI ASPM related vars */
-	int pdev_cap_off;
-	u8  pdev_aspm_cap;
-	int parent_cap_off;
-	u8  parent_aspm_cap;
-	u8 ext_config_dev_aspm;
 };
 
 void alcor_write8(struct alcor_pci_priv *priv, u8 val, unsigned int addr);
-- 
cgit v1.2.3


From 8f118f61540e23fb0e4ec84aec361f6758dbe93a Mon Sep 17 00:00:00 2001
From: Nava kishore Manne <nava.kishore.manne@amd.com>
Date: Fri, 24 Feb 2023 17:37:37 +0530
Subject: firmware: xilinx: Add pm api function for PL config reg readback

Adds PM API for performing Programmable Logic(PL) configuration
register readback. It provides an interface to the firmware(pmufw)
to readback the FPGA configuration register.

Signed-off-by: Nava kishore Manne <nava.kishore.manne@amd.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20230224120738.329416-2-nava.kishore.manne@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 33 +++++++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp.h | 11 +++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index acd83d29c866..3b3ffb223c4c 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -971,6 +971,39 @@ int zynqmp_pm_fpga_get_status(u32 *value)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_fpga_get_status);
 
+/**
+ * zynqmp_pm_fpga_get_config_status - Get the FPGA configuration status.
+ * @value: Buffer to store FPGA configuration status.
+ *
+ * This function provides access to the pmufw to get the FPGA configuration
+ * status
+ *
+ * Return: 0 on success, a negative value on error
+ */
+int zynqmp_pm_fpga_get_config_status(u32 *value)
+{
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	u32 buf, lower_addr, upper_addr;
+	int ret;
+
+	if (!value)
+		return -EINVAL;
+
+	lower_addr = lower_32_bits((u64)&buf);
+	upper_addr = upper_32_bits((u64)&buf);
+
+	ret = zynqmp_pm_invoke_fn(PM_FPGA_READ,
+				  XILINX_ZYNQMP_PM_FPGA_CONFIG_STAT_OFFSET,
+				  lower_addr, upper_addr,
+				  XILINX_ZYNQMP_PM_FPGA_READ_CONFIG_REG,
+				  ret_payload);
+
+	*value = ret_payload[1];
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_fpga_get_config_status);
+
 /**
  * zynqmp_pm_pinctrl_request - Request Pin from firmware
  * @pin: Pin number to request
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 0e4c70987e6a..f5da51677069 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -71,6 +71,10 @@
 #define XILINX_ZYNQMP_PM_FPGA_FULL	0x0U
 #define XILINX_ZYNQMP_PM_FPGA_PARTIAL	BIT(0)
 
+/* FPGA Status Reg */
+#define XILINX_ZYNQMP_PM_FPGA_CONFIG_STAT_OFFSET	7U
+#define XILINX_ZYNQMP_PM_FPGA_READ_CONFIG_REG		0U
+
 /*
  * Node IDs for the Error Events.
  */
@@ -124,6 +128,7 @@ enum pm_api_id {
 	PM_CLOCK_GETRATE = 42,
 	PM_CLOCK_SETPARENT = 43,
 	PM_CLOCK_GETPARENT = 44,
+	PM_FPGA_READ = 46,
 	PM_SECURE_AES = 47,
 	PM_FEATURE_CHECK = 63,
 };
@@ -519,6 +524,7 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out);
 int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags);
 int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags);
 int zynqmp_pm_fpga_get_status(u32 *value);
+int zynqmp_pm_fpga_get_config_status(u32 *value);
 int zynqmp_pm_write_ggs(u32 index, u32 value);
 int zynqmp_pm_read_ggs(u32 index, u32 *value);
 int zynqmp_pm_write_pggs(u32 index, u32 value);
@@ -725,6 +731,11 @@ static inline int zynqmp_pm_fpga_get_status(u32 *value)
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_fpga_get_config_status(u32 *value)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_write_ggs(u32 index, u32 value)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 5a70f4a63000ba68004fb3c1aaf2f90303dd228f Mon Sep 17 00:00:00 2001
From: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
Date: Thu, 9 Mar 2023 14:38:23 +0100
Subject: bpf: Fix a typo for BPF_F_ANY_ALIGNMENT in bpf.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix s/BPF_PROF_LOAD/BPF_PROG_LOAD/ typo in the documentation comment
for BPF_F_ANY_ALIGNMENT in bpf.h.

Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230309133823.944097-1-michael.weiss@aisec.fraunhofer.de
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4abddb668a10..d8c534e05b0a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1108,7 +1108,7 @@ enum bpf_link_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will allow any alignment whatsoever.  On platforms
  * with strict alignment requirements for loads ands stores (such
  * as sparc and mips) the verifier validates that all loads and
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4abddb668a10..d8c534e05b0a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1108,7 +1108,7 @@ enum bpf_link_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will allow any alignment whatsoever.  On platforms
  * with strict alignment requirements for loads ands stores (such
  * as sparc and mips) the verifier validates that all loads and
-- 
cgit v1.2.3


From ac3b43283923440900b4f36ca5f9f0b1ca43b70e Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Feb 2023 16:28:02 -0800
Subject: module: replace module_layout with module_memory

module_layout manages different types of memory (text, data, rodata, etc.)
in one allocation, which is problematic for some reasons:

1. It is hard to enable CONFIG_STRICT_MODULE_RWX.
2. It is hard to use huge pages in modules (and not break strict rwx).
3. Many archs uses module_layout for arch-specific data, but it is not
   obvious how these data are used (are they RO, RX, or RW?)

Improve the scenario by replacing 2 (or 3) module_layout per module with
up to 7 module_memory per module:

        MOD_TEXT,
        MOD_DATA,
        MOD_RODATA,
        MOD_RO_AFTER_INIT,
        MOD_INIT_TEXT,
        MOD_INIT_DATA,
        MOD_INIT_RODATA,

and allocating them separately. This adds slightly more entries to
mod_tree (from up to 3 entries per module, to up to 7 entries per
module). However, this at most adds a small constant overhead to
__module_address(), which is expected to be fast.

Various archs use module_layout for different data. These data are put
into different module_memory based on their location in module_layout.
IOW, data that used to go with text is allocated with MOD_MEM_TYPE_TEXT;
data that used to go with data is allocated with MOD_MEM_TYPE_DATA, etc.

module_memory simplifies quite some of the module code. For example,
ARCH_WANTS_MODULES_DATA_IN_VMALLOC is a lot cleaner, as it just uses a
different allocator for the data. kernel/module/strict_rwx.c is also
much cleaner with module_memory.

Signed-off-by: Song Liu <song@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/arc/kernel/unwind.c        |  12 +-
 arch/arm/kernel/module-plts.c   |   9 +-
 arch/arm64/kernel/module-plts.c |  13 +-
 arch/ia64/kernel/module.c       |  24 +--
 arch/mips/kernel/vpe.c          |  11 +-
 arch/parisc/kernel/module.c     |  51 ++----
 arch/powerpc/kernel/module_32.c |   7 +-
 arch/s390/kernel/module.c       |  26 +--
 arch/x86/kernel/callthunks.c    |   4 +-
 arch/x86/kernel/module.c        |   4 +-
 include/linux/module.h          |  89 +++++++---
 kernel/module/internal.h        |  40 ++---
 kernel/module/kallsyms.c        |  58 ++++---
 kernel/module/kdb.c             |  17 +-
 kernel/module/main.c            | 375 ++++++++++++++++++++--------------------
 kernel/module/procfs.c          |  16 +-
 kernel/module/strict_rwx.c      |  99 ++---------
 kernel/module/tree_lookup.c     |  39 ++---
 18 files changed, 427 insertions(+), 467 deletions(-)

(limited to 'include')

diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c
index 200270a94558..9270d0a713c3 100644
--- a/arch/arc/kernel/unwind.c
+++ b/arch/arc/kernel/unwind.c
@@ -369,6 +369,8 @@ void *unwind_add_table(struct module *module, const void *table_start,
 		       unsigned long table_size)
 {
 	struct unwind_table *table;
+	struct module_memory *core_text;
+	struct module_memory *init_text;
 
 	if (table_size <= 0)
 		return NULL;
@@ -377,11 +379,11 @@ void *unwind_add_table(struct module *module, const void *table_start,
 	if (!table)
 		return NULL;
 
-	init_unwind_table(table, module->name,
-			  module->core_layout.base, module->core_layout.size,
-			  module->init_layout.base, module->init_layout.size,
-			  table_start, table_size,
-			  NULL, 0);
+	core_text = &module->mem[MOD_TEXT];
+	init_text = &module->mem[MOD_INIT_TEXT];
+
+	init_unwind_table(table, module->name, core_text->base, core_text->size,
+			  init_text->base, init_text->size, table_start, table_size, NULL, 0);
 
 	init_unwind_hdr(table, unw_hdr_alloc);
 
diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c
index af7c322ebed6..f5a43fd8c163 100644
--- a/arch/arm/kernel/module-plts.c
+++ b/arch/arm/kernel/module-plts.c
@@ -28,11 +28,6 @@ static const u32 fixed_plts[] = {
 #endif
 };
 
-static bool in_init(const struct module *mod, unsigned long loc)
-{
-	return loc - (u32)mod->init_layout.base < mod->init_layout.size;
-}
-
 static void prealloc_fixed(struct mod_plt_sec *pltsec, struct plt_entries *plt)
 {
 	int i;
@@ -50,8 +45,8 @@ static void prealloc_fixed(struct mod_plt_sec *pltsec, struct plt_entries *plt)
 
 u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
 {
-	struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core :
-							  &mod->arch.init;
+	struct mod_plt_sec *pltsec = !within_module_init(loc, mod) ?
+						&mod->arch.core : &mod->arch.init;
 	struct plt_entries *plt;
 	int idx;
 
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index 5a0a8f552a61..543493bf924d 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -65,17 +65,12 @@ static bool plt_entries_equal(const struct plt_entry *a,
 	       (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
 }
 
-static bool in_init(const struct module *mod, void *loc)
-{
-	return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size;
-}
-
 u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
 			  void *loc, const Elf64_Rela *rela,
 			  Elf64_Sym *sym)
 {
-	struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core :
-							  &mod->arch.init;
+	struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ?
+						&mod->arch.core : &mod->arch.init;
 	struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr;
 	int i = pltsec->plt_num_entries;
 	int j = i - 1;
@@ -105,8 +100,8 @@ u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
 u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs,
 				void *loc, u64 val)
 {
-	struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core :
-							  &mod->arch.init;
+	struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ?
+						&mod->arch.core : &mod->arch.init;
 	struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr;
 	int i = pltsec->plt_num_entries++;
 	u32 br;
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 8f62cf97f691..3661135da9d9 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -485,19 +485,19 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 	return 0;
 }
 
-static inline int
+static inline bool
 in_init (const struct module *mod, uint64_t addr)
 {
-	return addr - (uint64_t) mod->init_layout.base < mod->init_layout.size;
+	return within_module_init(addr, mod);
 }
 
-static inline int
+static inline bool
 in_core (const struct module *mod, uint64_t addr)
 {
-	return addr - (uint64_t) mod->core_layout.base < mod->core_layout.size;
+	return within_module_core(addr, mod);
 }
 
-static inline int
+static inline bool
 is_internal (const struct module *mod, uint64_t value)
 {
 	return in_init(mod, value) || in_core(mod, value);
@@ -677,7 +677,8 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
 		break;
 
 	      case RV_BDREL:
-		val -= (uint64_t) (in_init(mod, val) ? mod->init_layout.base : mod->core_layout.base);
+		val -= (uint64_t) (in_init(mod, val) ? mod->mem[MOD_INIT_TEXT].base :
+				   mod->mem[MOD_TEXT].base);
 		break;
 
 	      case RV_LTV:
@@ -812,15 +813,18 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind
 		 *     addresses have been selected...
 		 */
 		uint64_t gp;
-		if (mod->core_layout.size > MAX_LTOFF)
+		struct module_memory *mod_mem;
+
+		mod_mem = &mod->mem[MOD_DATA];
+		if (mod_mem->size > MAX_LTOFF)
 			/*
 			 * This takes advantage of fact that SHF_ARCH_SMALL gets allocated
 			 * at the end of the module.
 			 */
-			gp = mod->core_layout.size - MAX_LTOFF / 2;
+			gp = mod_mem->size - MAX_LTOFF / 2;
 		else
-			gp = mod->core_layout.size / 2;
-		gp = (uint64_t) mod->core_layout.base + ((gp + 7) & -8);
+			gp = mod_mem->size / 2;
+		gp = (uint64_t) mod_mem->base + ((gp + 7) & -8);
 		mod->arch.gp = gp;
 		DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp);
 	}
diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 13294972707b..ab114c686f9d 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -199,18 +199,17 @@ static void layout_sections(struct module *mod, const Elf_Ehdr *hdr,
 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
 		for (i = 0; i < hdr->e_shnum; ++i) {
 			Elf_Shdr *s = &sechdrs[i];
+			struct module_memory *mod_mem;
+
+			mod_mem = &mod->mem[MOD_TEXT];
 
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL)
 				continue;
 			s->sh_entsize =
-				get_offset((unsigned long *)&mod->core_layout.size, s);
+				get_offset((unsigned long *)&mod_mem->size, s);
 		}
-
-		if (m == 0)
-			mod->core_layout.text_size = mod->core_layout.size;
-
 	}
 }
 
@@ -641,7 +640,7 @@ static int vpe_elfload(struct vpe *v)
 		layout_sections(&mod, hdr, sechdrs, secstrings);
 	}
 
-	v->load_addr = alloc_progmem(mod.core_layout.size);
+	v->load_addr = alloc_progmem(mod.mod_mem[MOD_TEXT].size);
 	if (!v->load_addr)
 		return -ENOMEM;
 
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index 7df140545b22..f6e38c4d3904 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -27,9 +27,9 @@
  *      We are not doing SEGREL32 handling correctly. According to the ABI, we
  *      should do a value offset, like this:
  *			if (in_init(me, (void *)val))
- *				val -= (uint32_t)me->init_layout.base;
+ *				val -= (uint32_t)me->mem[MOD_INIT_TEXT].base;
  *			else
- *				val -= (uint32_t)me->core_layout.base;
+ *				val -= (uint32_t)me->mem[MOD_TEXT].base;
  *	However, SEGREL32 is used only for PARISC unwind entries, and we want
  *	those entries to have an absolute address, and not just an offset.
  *
@@ -76,25 +76,6 @@
  * allows us to allocate up to 4095 GOT entries. */
 #define MAX_GOTS	4095
 
-/* three functions to determine where in the module core
- * or init pieces the location is */
-static inline int in_init(struct module *me, void *loc)
-{
-	return (loc >= me->init_layout.base &&
-		loc <= (me->init_layout.base + me->init_layout.size));
-}
-
-static inline int in_core(struct module *me, void *loc)
-{
-	return (loc >= me->core_layout.base &&
-		loc <= (me->core_layout.base + me->core_layout.size));
-}
-
-static inline int in_local(struct module *me, void *loc)
-{
-	return in_init(me, loc) || in_core(me, loc);
-}
-
 #ifndef CONFIG_64BIT
 struct got_entry {
 	Elf32_Addr addr;
@@ -302,6 +283,7 @@ int module_frob_arch_sections(CONST Elf_Ehdr *hdr,
 {
 	unsigned long gots = 0, fdescs = 0, len;
 	unsigned int i;
+	struct module_memory *mod_mem;
 
 	len = hdr->e_shnum * sizeof(me->arch.section[0]);
 	me->arch.section = kzalloc(len, GFP_KERNEL);
@@ -346,14 +328,15 @@ int module_frob_arch_sections(CONST Elf_Ehdr *hdr,
 		me->arch.section[s].stub_entries += count;
 	}
 
+	mod_mem = &me->mem[MOD_TEXT];
 	/* align things a bit */
-	me->core_layout.size = ALIGN(me->core_layout.size, 16);
-	me->arch.got_offset = me->core_layout.size;
-	me->core_layout.size += gots * sizeof(struct got_entry);
+	mod_mem->size = ALIGN(mod_mem->size, 16);
+	me->arch.got_offset = mod_mem->size;
+	mod_mem->size += gots * sizeof(struct got_entry);
 
-	me->core_layout.size = ALIGN(me->core_layout.size, 16);
-	me->arch.fdesc_offset = me->core_layout.size;
-	me->core_layout.size += fdescs * sizeof(Elf_Fdesc);
+	mod_mem->size = ALIGN(mod_mem->size, 16);
+	me->arch.fdesc_offset = mod_mem->size;
+	mod_mem->size += fdescs * sizeof(Elf_Fdesc);
 
 	me->arch.got_max = gots;
 	me->arch.fdesc_max = fdescs;
@@ -371,7 +354,7 @@ static Elf64_Word get_got(struct module *me, unsigned long value, long addend)
 
 	BUG_ON(value == 0);
 
-	got = me->core_layout.base + me->arch.got_offset;
+	got = me->mem[MOD_TEXT].base + me->arch.got_offset;
 	for (i = 0; got[i].addr; i++)
 		if (got[i].addr == value)
 			goto out;
@@ -389,7 +372,7 @@ static Elf64_Word get_got(struct module *me, unsigned long value, long addend)
 #ifdef CONFIG_64BIT
 static Elf_Addr get_fdesc(struct module *me, unsigned long value)
 {
-	Elf_Fdesc *fdesc = me->core_layout.base + me->arch.fdesc_offset;
+	Elf_Fdesc *fdesc = me->mem[MOD_TEXT].base + me->arch.fdesc_offset;
 
 	if (!value) {
 		printk(KERN_ERR "%s: zero OPD requested!\n", me->name);
@@ -407,7 +390,7 @@ static Elf_Addr get_fdesc(struct module *me, unsigned long value)
 
 	/* Create new one */
 	fdesc->addr = value;
-	fdesc->gp = (Elf_Addr)me->core_layout.base + me->arch.got_offset;
+	fdesc->gp = (Elf_Addr)me->mem[MOD_TEXT].base + me->arch.got_offset;
 	return (Elf_Addr)fdesc;
 }
 #endif /* CONFIG_64BIT */
@@ -742,7 +725,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs,
 			       loc, val);
 			val += addend;
 			/* can we reach it locally? */
-			if (in_local(me, (void *)val)) {
+			if (within_module(val, me)) {
 				/* this is the case where the symbol is local
 				 * to the module, but in a different section,
 				 * so stub the jump in case it's more than 22
@@ -801,7 +784,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs,
 			break;
 		case R_PARISC_FPTR64:
 			/* 64-bit function address */
-			if(in_local(me, (void *)(val + addend))) {
+			if (within_module(val + addend, me)) {
 				*loc64 = get_fdesc(me, val+addend);
 				pr_debug("FDESC for %s at %llx points to %llx\n",
 				       strtab + sym->st_name, *loc64,
@@ -839,7 +822,7 @@ register_unwind_table(struct module *me,
 
 	table = (unsigned char *)sechdrs[me->arch.unwind_section].sh_addr;
 	end = table + sechdrs[me->arch.unwind_section].sh_size;
-	gp = (Elf_Addr)me->core_layout.base + me->arch.got_offset;
+	gp = (Elf_Addr)me->mem[MOD_TEXT].base + me->arch.got_offset;
 
 	pr_debug("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n",
 	       me->arch.unwind_section, table, end, gp);
@@ -977,7 +960,7 @@ void module_arch_cleanup(struct module *mod)
 #ifdef CONFIG_64BIT
 void *dereference_module_function_descriptor(struct module *mod, void *ptr)
 {
-	unsigned long start_opd = (Elf64_Addr)mod->core_layout.base +
+	unsigned long start_opd = (Elf64_Addr)mod->mem[MOD_TEXT].base +
 				   mod->arch.fdesc_offset;
 	unsigned long end_opd = start_opd +
 				mod->arch.fdesc_count * sizeof(Elf64_Fdesc);
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index ea6536171778..816a63fd71fb 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -163,8 +163,7 @@ static uint32_t do_plt_call(void *location,
 
 	pr_debug("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location);
 	/* Init, or core PLT? */
-	if (location >= mod->core_layout.base
-	    && location < mod->core_layout.base + mod->core_layout.size)
+	if (within_module_core((unsigned long)location, mod))
 		entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr;
 	else
 		entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr;
@@ -322,14 +321,14 @@ notrace int module_trampoline_target(struct module *mod, unsigned long addr,
 
 int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs)
 {
-	module->arch.tramp = do_plt_call(module->core_layout.base,
+	module->arch.tramp = do_plt_call(module->mem[MOD_TEXT].base,
 					 (unsigned long)ftrace_caller,
 					 sechdrs, module);
 	if (!module->arch.tramp)
 		return -ENOENT;
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
-	module->arch.tramp_regs = do_plt_call(module->core_layout.base,
+	module->arch.tramp_regs = do_plt_call(module->mem[MOD_TEXT].base,
 					      (unsigned long)ftrace_regs_caller,
 					      sechdrs, module);
 	if (!module->arch.tramp_regs)
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 2d159b32885b..b9dbf0c483ed 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -126,6 +126,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 	Elf_Rela *rela;
 	char *strings;
 	int nrela, i, j;
+	struct module_memory *mod_mem;
 
 	/* Find symbol table and string table. */
 	symtab = NULL;
@@ -173,14 +174,15 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 
 	/* Increase core size by size of got & plt and set start
 	   offsets for got and plt. */
-	me->core_layout.size = ALIGN(me->core_layout.size, 4);
-	me->arch.got_offset = me->core_layout.size;
-	me->core_layout.size += me->arch.got_size;
-	me->arch.plt_offset = me->core_layout.size;
+	mod_mem = &me->mem[MOD_TEXT];
+	mod_mem->size = ALIGN(mod_mem->size, 4);
+	me->arch.got_offset = mod_mem->size;
+	mod_mem->size += me->arch.got_size;
+	me->arch.plt_offset = mod_mem->size;
 	if (me->arch.plt_size) {
 		if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable)
 			me->arch.plt_size += PLT_ENTRY_SIZE;
-		me->core_layout.size += me->arch.plt_size;
+		mod_mem->size += me->arch.plt_size;
 	}
 	return 0;
 }
@@ -304,7 +306,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 	case R_390_GOTPLT64:	/* 64 bit offset to jump slot.	*/
 	case R_390_GOTPLTENT:	/* 32 bit rel. offset to jump slot >> 1. */
 		if (info->got_initialized == 0) {
-			Elf_Addr *gotent = me->core_layout.base +
+			Elf_Addr *gotent = me->mem[MOD_TEXT].base +
 					   me->arch.got_offset +
 					   info->got_offset;
 
@@ -329,7 +331,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			rc = apply_rela_bits(loc, val, 0, 64, 0, write);
 		else if (r_type == R_390_GOTENT ||
 			 r_type == R_390_GOTPLTENT) {
-			val += (Elf_Addr) me->core_layout.base - loc;
+			val += (Elf_Addr) me->mem[MOD_TEXT].base - loc;
 			rc = apply_rela_bits(loc, val, 1, 32, 1, write);
 		}
 		break;
@@ -345,7 +347,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			char *plt_base;
 			char *ip;
 
-			plt_base = me->core_layout.base + me->arch.plt_offset;
+			plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset;
 			ip = plt_base + info->plt_offset;
 			*(int *)insn = 0x0d10e310;	/* basr 1,0  */
 			*(int *)&insn[4] = 0x100c0004;	/* lg	1,12(1) */
@@ -375,7 +377,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			       val - loc + 0xffffUL < 0x1ffffeUL) ||
 			      (r_type == R_390_PLT32DBL &&
 			       val - loc + 0xffffffffULL < 0x1fffffffeULL)))
-				val = (Elf_Addr) me->core_layout.base +
+				val = (Elf_Addr) me->mem[MOD_TEXT].base +
 					me->arch.plt_offset +
 					info->plt_offset;
 			val += rela->r_addend - loc;
@@ -397,7 +399,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 	case R_390_GOTOFF32:	/* 32 bit offset to GOT.  */
 	case R_390_GOTOFF64:	/* 64 bit offset to GOT. */
 		val = val + rela->r_addend -
-			((Elf_Addr) me->core_layout.base + me->arch.got_offset);
+			((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset);
 		if (r_type == R_390_GOTOFF16)
 			rc = apply_rela_bits(loc, val, 0, 16, 0, write);
 		else if (r_type == R_390_GOTOFF32)
@@ -407,7 +409,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 		break;
 	case R_390_GOTPC:	/* 32 bit PC relative offset to GOT. */
 	case R_390_GOTPCDBL:	/* 32 bit PC rel. off. to GOT shifted by 1. */
-		val = (Elf_Addr) me->core_layout.base + me->arch.got_offset +
+		val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset +
 			rela->r_addend - loc;
 		if (r_type == R_390_GOTPC)
 			rc = apply_rela_bits(loc, val, 1, 32, 0, write);
@@ -515,7 +517,7 @@ int module_finalize(const Elf_Ehdr *hdr,
 	    !nospec_disable && me->arch.plt_size) {
 		unsigned int *ij;
 
-		ij = me->core_layout.base + me->arch.plt_offset +
+		ij = me->mem[MOD_TEXT].base + me->arch.plt_offset +
 			me->arch.plt_size - PLT_ENTRY_SIZE;
 		ij[0] = 0xc6000000;	/* exrl	%r0,.+10	*/
 		ij[1] = 0x0005a7f4;	/* j	.		*/
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index ffea98f9064b..22ab13966427 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -330,8 +330,8 @@ void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
 					    struct module *mod)
 {
 	struct core_text ct = {
-		.base = (unsigned long)mod->core_layout.base,
-		.end  = (unsigned long)mod->core_layout.base + mod->core_layout.size,
+		.base = (unsigned long)mod->mem[MOD_TEXT].base,
+		.end  = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size,
 		.name = mod->name,
 	};
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 84ad0e61ba6e..b05f62ee2344 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -362,8 +362,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 	}
 	if (locks) {
 		void *lseg = (void *)locks->sh_addr;
-		void *text = me->core_layout.base;
-		void *text_end = text + me->core_layout.text_size;
+		void *text = me->mem[MOD_TEXT].base;
+		void *text_end = text + me->mem[MOD_TEXT].size;
 		alternatives_smp_module_add(me, me->name,
 					    lseg, lseg + locks->sh_size,
 					    text, text_end);
diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..c0593a96e158 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -320,17 +320,47 @@ struct mod_tree_node {
 	struct latch_tree_node node;
 };
 
-struct module_layout {
-	/* The actual code + data. */
+enum mod_mem_type {
+	MOD_TEXT = 0,
+	MOD_DATA,
+	MOD_RODATA,
+	MOD_RO_AFTER_INIT,
+	MOD_INIT_TEXT,
+	MOD_INIT_DATA,
+	MOD_INIT_RODATA,
+
+	MOD_MEM_NUM_TYPES,
+	MOD_INVALID = -1,
+};
+
+#define mod_mem_type_is_init(type)	\
+	((type) == MOD_INIT_TEXT ||	\
+	 (type) == MOD_INIT_DATA ||	\
+	 (type) == MOD_INIT_RODATA)
+
+#define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type))
+
+#define mod_mem_type_is_text(type)	\
+	 ((type) == MOD_TEXT ||		\
+	  (type) == MOD_INIT_TEXT)
+
+#define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type))
+
+#define mod_mem_type_is_core_data(type)	\
+	(mod_mem_type_is_core(type) &&	\
+	 mod_mem_type_is_data(type))
+
+#define for_each_mod_mem_type(type)			\
+	for (enum mod_mem_type (type) = 0;		\
+	     (type) < MOD_MEM_NUM_TYPES; (type)++)
+
+#define for_class_mod_mem_type(type, class)		\
+	for_each_mod_mem_type(type)			\
+		if (mod_mem_type_is_##class(type))
+
+struct module_memory {
 	void *base;
-	/* Total size. */
 	unsigned int size;
-	/* The size of the executable code.  */
-	unsigned int text_size;
-	/* Size of RO section of the module (text+rodata) */
-	unsigned int ro_size;
-	/* Size of RO after init section */
-	unsigned int ro_after_init_size;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 	struct mod_tree_node mtn;
@@ -339,9 +369,9 @@ struct module_layout {
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 /* Only touch one cacheline for common rbtree-for-core-layout case. */
-#define __module_layout_align ____cacheline_aligned
+#define __module_memory_align ____cacheline_aligned
 #else
-#define __module_layout_align
+#define __module_memory_align
 #endif
 
 struct mod_kallsyms {
@@ -426,12 +456,7 @@ struct module {
 	/* Startup function. */
 	int (*init)(void);
 
-	/* Core layout: rbtree is accessed frequently, so keep together. */
-	struct module_layout core_layout __module_layout_align;
-	struct module_layout init_layout;
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	struct module_layout data_layout;
-#endif
+	struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align;
 
 	/* Arch-specific module values */
 	struct mod_arch_specific arch;
@@ -581,23 +606,35 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
 bool is_module_percpu_address(unsigned long addr);
 bool is_module_text_address(unsigned long addr);
 
+static inline bool within_module_mem_type(unsigned long addr,
+					  const struct module *mod,
+					  enum mod_mem_type type)
+{
+	unsigned long base, size;
+
+	base = (unsigned long)mod->mem[type].base;
+	size = mod->mem[type].size;
+	return addr - base < size;
+}
+
 static inline bool within_module_core(unsigned long addr,
 				      const struct module *mod)
 {
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	if ((unsigned long)mod->data_layout.base <= addr &&
-	    addr < (unsigned long)mod->data_layout.base + mod->data_layout.size)
-		return true;
-#endif
-	return (unsigned long)mod->core_layout.base <= addr &&
-	       addr < (unsigned long)mod->core_layout.base + mod->core_layout.size;
+	for_class_mod_mem_type(type, core) {
+		if (within_module_mem_type(addr, mod, type))
+			return true;
+	}
+	return false;
 }
 
 static inline bool within_module_init(unsigned long addr,
 				      const struct module *mod)
 {
-	return (unsigned long)mod->init_layout.base <= addr &&
-	       addr < (unsigned long)mod->init_layout.base + mod->init_layout.size;
+	for_class_mod_mem_type(type, init) {
+		if (within_module_mem_type(addr, mod, type))
+			return true;
+	}
+	return false;
 }
 
 static inline bool within_module(unsigned long addr, const struct module *mod)
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..c6c44ceca07a 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -17,27 +17,19 @@
 #define ARCH_SHF_SMALL 0
 #endif
 
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1))
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
-#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-#define	data_layout core_layout
-#endif
-
 /*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_STRICT_MODULE_RWX=y
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
  */
-static inline unsigned int strict_align(unsigned int size)
-{
-	if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
-		return PAGE_ALIGN(size);
-	else
-		return size;
-}
+
+#define SH_ENTSIZE_TYPE_BITS	4
+#define SH_ENTSIZE_TYPE_SHIFT	(BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK	((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK	((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
 
 extern struct mutex module_mutex;
 extern struct list_head modules;
@@ -101,8 +93,8 @@ int try_to_force_load(struct module *mod, const char *reason);
 bool find_symbol(struct find_symbol_arg *fsa);
 struct module *find_module_all(const char *name, size_t len, bool even_unformed);
 int cmp_name(const void *name, const void *sym);
-long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
-		       unsigned int section);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+				Elf_Shdr *sechdr, unsigned int section);
 char *module_flags(struct module *mod, char *buf, bool show_state);
 size_t module_flags_taint(unsigned long taints, char *buf);
 
@@ -190,10 +182,13 @@ struct mod_tree_root {
 #endif
 	unsigned long addr_min;
 	unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	unsigned long data_addr_min;
+	unsigned long data_addr_max;
+#endif
 };
 
 extern struct mod_tree_root mod_tree;
-extern struct mod_tree_root mod_data_tree;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 void mod_tree_insert(struct module *mod);
@@ -224,7 +219,6 @@ void module_enable_nx(const struct module *mod);
 void module_enable_x(const struct module *mod);
 int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 				char *secstrings, struct module *mod);
-bool module_check_misalignment(const struct module *mod);
 
 #ifdef CONFIG_MODULE_SIG
 int module_sig_check(struct load_info *info, int flags);
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index ab2376a1be88..8e89f459ffdd 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -78,6 +78,7 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
 			   unsigned int shnum, unsigned int pcpundx)
 {
 	const Elf_Shdr *sec;
+	enum mod_mem_type type;
 
 	if (src->st_shndx == SHN_UNDEF ||
 	    src->st_shndx >= shnum ||
@@ -90,11 +91,12 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
 #endif
 
 	sec = sechdrs + src->st_shndx;
+	type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
 	if (!(sec->sh_flags & SHF_ALLOC)
 #ifndef CONFIG_KALLSYMS_ALL
 	    || !(sec->sh_flags & SHF_EXECINSTR)
 #endif
-	    || (sec->sh_entsize & INIT_OFFSET_MASK))
+	    || mod_mem_type_is_init(type))
 		return false;
 
 	return true;
@@ -113,11 +115,13 @@ void layout_symtab(struct module *mod, struct load_info *info)
 	Elf_Shdr *strsect = info->sechdrs + info->index.str;
 	const Elf_Sym *src;
 	unsigned int i, nsrc, ndst, strtab_size = 0;
+	struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+	struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
 
 	/* Put symbol section at end of init part of module. */
 	symsect->sh_flags |= SHF_ALLOC;
-	symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect,
-						info->index.sym) | INIT_OFFSET_MASK;
+	symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+							 symsect, info->index.sym);
 	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
 
 	src = (void *)info->hdr + symsect->sh_offset;
@@ -134,28 +138,27 @@ void layout_symtab(struct module *mod, struct load_info *info)
 	}
 
 	/* Append room for core symbols at end of core part. */
-	info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1);
-	info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
-	mod->data_layout.size += strtab_size;
+	info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+	info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+	mod_mem_data->size += strtab_size;
 	/* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
-	info->core_typeoffs = mod->data_layout.size;
-	mod->data_layout.size += ndst * sizeof(char);
-	mod->data_layout.size = strict_align(mod->data_layout.size);
+	info->core_typeoffs = mod_mem_data->size;
+	mod_mem_data->size += ndst * sizeof(char);
 
 	/* Put string table section at end of init part of module. */
 	strsect->sh_flags |= SHF_ALLOC;
-	strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect,
-						info->index.str) | INIT_OFFSET_MASK;
+	strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+							 strsect, info->index.str);
 	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
 
 	/* We'll tack temporary mod_kallsyms on the end. */
-	mod->init_layout.size = ALIGN(mod->init_layout.size,
-				      __alignof__(struct mod_kallsyms));
-	info->mod_kallsyms_init_off = mod->init_layout.size;
-	mod->init_layout.size += sizeof(struct mod_kallsyms);
-	info->init_typeoffs = mod->init_layout.size;
-	mod->init_layout.size += nsrc * sizeof(char);
-	mod->init_layout.size = strict_align(mod->init_layout.size);
+	mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+					__alignof__(struct mod_kallsyms));
+	info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+	mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+	info->init_typeoffs = mod_mem_init_data->size;
+	mod_mem_init_data->size += nsrc * sizeof(char);
 }
 
 /*
@@ -171,9 +174,11 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	char *s;
 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
 	unsigned long strtab_size;
+	void *data_base = mod->mem[MOD_DATA].base;
+	void *init_data_base = mod->mem[MOD_INIT_DATA].base;
 
 	/* Set up to point into init section. */
-	mod->kallsyms = (void __rcu *)mod->init_layout.base +
+	mod->kallsyms = (void __rcu *)init_data_base +
 		info->mod_kallsyms_init_off;
 
 	rcu_read_lock();
@@ -183,15 +188,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	/* Make sure we get permanent strtab: don't use info->strtab. */
 	rcu_dereference(mod->kallsyms)->strtab =
 		(void *)info->sechdrs[info->index.str].sh_addr;
-	rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
+	rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs;
 
 	/*
 	 * Now populate the cut down core kallsyms for after init
 	 * and set types up while we still have access to sections.
 	 */
-	mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs;
-	mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
-	mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
+	mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+	mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+	mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
 	strtab_size = info->core_typeoffs - info->stroffs;
 	src = rcu_dereference(mod->kallsyms)->symtab;
 	for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
@@ -267,12 +272,15 @@ static const char *find_kallsyms_symbol(struct module *mod,
 	unsigned int i, best = 0;
 	unsigned long nextval, bestval;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+	struct module_memory *mod_mem;
 
 	/* At worse, next value is at end of module */
 	if (within_module_init(addr, mod))
-		nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size;
+		mod_mem = &mod->mem[MOD_INIT_TEXT];
 	else
-		nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size;
+		mod_mem = &mod->mem[MOD_TEXT];
+
+	nextval = (unsigned long)mod_mem->base + mod_mem->size;
 
 	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
 
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
index f4317f92e189..995c32d3698f 100644
--- a/kernel/module/kdb.c
+++ b/kernel/module/kdb.c
@@ -26,10 +26,11 @@ int kdb_lsmod(int argc, const char **argv)
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
 
-		kdb_printf("%-20s%8u", mod->name, mod->core_layout.size);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-		kdb_printf("/%8u", mod->data_layout.size);
-#endif
+		kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+		kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+		kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+		kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
 		kdb_printf("  0x%px ", (void *)mod);
 #ifdef CONFIG_MODULE_UNLOAD
 		kdb_printf("%4d ", module_refcount(mod));
@@ -40,10 +41,10 @@ int kdb_lsmod(int argc, const char **argv)
 			kdb_printf(" (Loading)");
 		else
 			kdb_printf(" (Live)");
-		kdb_printf(" 0x%px", mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-		kdb_printf("/0x%px", mod->data_layout.base);
-#endif
+		kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+		kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+		kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+		kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
 
 #ifdef CONFIG_MODULE_UNLOAD
 		{
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d3be89de706d..2724bc1b9a90 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -80,12 +80,6 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
 	.addr_min = -1UL,
 };
 
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-struct mod_tree_root mod_data_tree __cacheline_aligned = {
-	.addr_min = -1UL,
-};
-#endif
-
 struct symsearch {
 	const struct kernel_symbol *start, *stop;
 	const s32 *crcs;
@@ -93,14 +87,24 @@ struct symsearch {
 };
 
 /*
- * Bounds of module text, for speeding up __module_address.
+ * Bounds of module memory, for speeding up __module_address.
  * Protected by module_mutex.
  */
-static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree)
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+				unsigned int size, struct mod_tree_root *tree)
 {
 	unsigned long min = (unsigned long)base;
 	unsigned long max = min + size;
 
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	if (mod_mem_type_is_core_data(type)) {
+		if (min < tree->data_addr_min)
+			tree->data_addr_min = min;
+		if (max > tree->data_addr_max)
+			tree->data_addr_max = max;
+		return;
+	}
+#endif
 	if (min < tree->addr_min)
 		tree->addr_min = min;
 	if (max > tree->addr_max)
@@ -109,12 +113,12 @@ static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_r
 
 static void mod_update_bounds(struct module *mod)
 {
-	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree);
-	if (mod->init_layout.size)
-		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	__mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree);
-#endif
+	for_each_mod_mem_type(type) {
+		struct module_memory *mod_mem = &mod->mem[type];
+
+		if (mod_mem->size)
+			__mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+	}
 }
 
 /* Block module loading/unloading? */
@@ -926,7 +930,13 @@ struct module_attribute module_uevent =
 static ssize_t show_coresize(struct module_attribute *mattr,
 			     struct module_kobject *mk, char *buffer)
 {
-	return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
+	unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+	if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+		for_class_mod_mem_type(type, core_data)
+			size += mk->mod->mem[type].size;
+	}
+	return sprintf(buffer, "%u\n", size);
 }
 
 static struct module_attribute modinfo_coresize =
@@ -936,7 +946,11 @@ static struct module_attribute modinfo_coresize =
 static ssize_t show_datasize(struct module_attribute *mattr,
 			     struct module_kobject *mk, char *buffer)
 {
-	return sprintf(buffer, "%u\n", mk->mod->data_layout.size);
+	unsigned int size = 0;
+
+	for_class_mod_mem_type(type, core_data)
+		size += mk->mod->mem[type].size;
+	return sprintf(buffer, "%u\n", size);
 }
 
 static struct module_attribute modinfo_datasize =
@@ -946,7 +960,11 @@ static struct module_attribute modinfo_datasize =
 static ssize_t show_initsize(struct module_attribute *mattr,
 			     struct module_kobject *mk, char *buffer)
 {
-	return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
+	unsigned int size = 0;
+
+	for_class_mod_mem_type(type, init)
+		size += mk->mod->mem[type].size;
+	return sprintf(buffer, "%u\n", size);
 }
 
 static struct module_attribute modinfo_initsize =
@@ -1143,6 +1161,46 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+static bool mod_mem_use_vmalloc(enum mod_mem_type type)
+{
+	return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) &&
+		mod_mem_type_is_core_data(type);
+}
+
+static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
+{
+	if (mod_mem_use_vmalloc(type))
+		return vzalloc(size);
+	return module_alloc(size);
+}
+
+static void module_memory_free(void *ptr, enum mod_mem_type type)
+{
+	if (mod_mem_use_vmalloc(type))
+		vfree(ptr);
+	else
+		module_memfree(ptr);
+}
+
+static void free_mod_mem(struct module *mod)
+{
+	for_each_mod_mem_type(type) {
+		struct module_memory *mod_mem = &mod->mem[type];
+
+		if (type == MOD_DATA)
+			continue;
+
+		/* Free lock-classes; relies on the preceding sync_rcu(). */
+		lockdep_free_key_range(mod_mem->base, mod_mem->size);
+		if (mod_mem->size)
+			module_memory_free(mod_mem->base, type);
+	}
+
+	/* MOD_DATA hosts mod, so free it at last */
+	lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+	module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+}
+
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
@@ -1189,18 +1247,10 @@ static void free_module(struct module *mod)
 
 	/* This may be empty, but that's OK */
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
 	kfree(mod->args);
 	percpu_modfree(mod);
 
-	/* Free lock-classes; relies on the preceding sync_rcu(). */
-	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
-
-	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	vfree(mod->data_layout.base);
-#endif
+	free_mod_mem(mod);
 }
 
 void *__symbol_get(const char *symbol)
@@ -1387,16 +1437,18 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod,
 	return 0;
 }
 
-/* Update size with this section: return offset. */
-long module_get_offset(struct module *mod, unsigned int *size,
-		       Elf_Shdr *sechdr, unsigned int section)
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+				Elf_Shdr *sechdr, unsigned int section)
 {
-	long ret;
+	long offset;
+	long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
 
-	*size += arch_mod_section_prepend(mod, section);
-	ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
-	*size = ret + sechdr->sh_size;
-	return ret;
+	mod->mem[type].size += arch_mod_section_prepend(mod, section);
+	offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+	mod->mem[type].size = offset + sechdr->sh_size;
+
+	WARN_ON_ONCE(offset & mask);
+	return offset | mask;
 }
 
 static bool module_init_layout_section(const char *sname)
@@ -1408,15 +1460,11 @@ static bool module_init_layout_section(const char *sname)
 	return module_init_section(sname);
 }
 
-/*
- * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- * might -- code, read-only data, read-write data, small data.  Tally
- * sizes, and place the offsets into sh_entsize fields: high bit means it
- * belongs in init.
- */
-static void layout_sections(struct module *mod, struct load_info *info)
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
 {
-	static unsigned long const masks[][2] = {
+	unsigned int m, i;
+
+	static const unsigned long masks[][2] = {
 		/*
 		 * NOTE: all executable code must be the first section
 		 * in this array; otherwise modify the text_size
@@ -1428,84 +1476,63 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
 	};
-	unsigned int m, i;
-
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		info->sechdrs[i].sh_entsize = ~0UL;
+	static const int core_m_to_mem_type[] = {
+		MOD_TEXT,
+		MOD_RODATA,
+		MOD_RO_AFTER_INIT,
+		MOD_DATA,
+		MOD_INVALID,	/* This is needed to match the masks array */
+	};
+	static const int init_m_to_mem_type[] = {
+		MOD_INIT_TEXT,
+		MOD_INIT_RODATA,
+		MOD_INVALID,
+		MOD_INIT_DATA,
+		MOD_INVALID,	/* This is needed to match the masks array */
+	};
 
-	pr_debug("Core section allocation order:\n");
 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+		enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
 		for (i = 0; i < info->hdr->e_shnum; ++i) {
 			Elf_Shdr *s = &info->sechdrs[i];
 			const char *sname = info->secstrings + s->sh_name;
-			unsigned int *sizep;
 
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
-			    || module_init_layout_section(sname))
+			    || is_init != module_init_layout_section(sname))
 				continue;
-			sizep = m ? &mod->data_layout.size : &mod->core_layout.size;
-			s->sh_entsize = module_get_offset(mod, sizep, s, i);
-			pr_debug("\t%s\n", sname);
-		}
-		switch (m) {
-		case 0: /* executable */
-			mod->core_layout.size = strict_align(mod->core_layout.size);
-			mod->core_layout.text_size = mod->core_layout.size;
-			break;
-		case 1: /* RO: text and ro-data */
-			mod->data_layout.size = strict_align(mod->data_layout.size);
-			mod->data_layout.ro_size = mod->data_layout.size;
-			break;
-		case 2: /* RO after init */
-			mod->data_layout.size = strict_align(mod->data_layout.size);
-			mod->data_layout.ro_after_init_size = mod->data_layout.size;
-			break;
-		case 4: /* whole core */
-			mod->data_layout.size = strict_align(mod->data_layout.size);
-			break;
-		}
-	}
-
-	pr_debug("Init section allocation order:\n");
-	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
-		for (i = 0; i < info->hdr->e_shnum; ++i) {
-			Elf_Shdr *s = &info->sechdrs[i];
-			const char *sname = info->secstrings + s->sh_name;
 
-			if ((s->sh_flags & masks[m][0]) != masks[m][0]
-			    || (s->sh_flags & masks[m][1])
-			    || s->sh_entsize != ~0UL
-			    || !module_init_layout_section(sname))
+			if (WARN_ON_ONCE(type == MOD_INVALID))
 				continue;
-			s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i)
-					 | INIT_OFFSET_MASK);
+
+			s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
 			pr_debug("\t%s\n", sname);
 		}
-		switch (m) {
-		case 0: /* executable */
-			mod->init_layout.size = strict_align(mod->init_layout.size);
-			mod->init_layout.text_size = mod->init_layout.size;
-			break;
-		case 1: /* RO: text and ro-data */
-			mod->init_layout.size = strict_align(mod->init_layout.size);
-			mod->init_layout.ro_size = mod->init_layout.size;
-			break;
-		case 2:
-			/*
-			 * RO after init doesn't apply to init_layout (only
-			 * core_layout), so it just takes the value of ro_size.
-			 */
-			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
-			break;
-		case 4: /* whole init */
-			mod->init_layout.size = strict_align(mod->init_layout.size);
-			break;
-		}
 	}
 }
 
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data.  Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+	unsigned int i;
+
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		info->sechdrs[i].sh_entsize = ~0UL;
+
+	pr_debug("Core section allocation order:\n");
+	__layout_sections(mod, info, false);
+
+	pr_debug("Init section allocation order:\n");
+	__layout_sections(mod, info, true);
+}
+
 static void set_license(struct module *mod, const char *license)
 {
 	if (!license)
@@ -2122,72 +2149,41 @@ static int move_module(struct module *mod, struct load_info *info)
 {
 	int i;
 	void *ptr;
+	enum mod_mem_type t;
 
-	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
-	/*
-	 * The pointer to this block is stored in the module structure
-	 * which is inside the block. Just mark it as not being a
-	 * leak.
-	 */
-	kmemleak_not_leak(ptr);
-	if (!ptr)
-		return -ENOMEM;
-
-	memset(ptr, 0, mod->core_layout.size);
-	mod->core_layout.base = ptr;
+	for_each_mod_mem_type(type) {
+		if (!mod->mem[type].size) {
+			mod->mem[type].base = NULL;
+			continue;
+		}
+		mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
+		ptr = module_memory_alloc(mod->mem[type].size, type);
 
-	if (mod->init_layout.size) {
-		ptr = module_alloc(mod->init_layout.size);
 		/*
 		 * The pointer to this block is stored in the module structure
-		 * which is inside the block. This block doesn't need to be
-		 * scanned as it contains data and code that will be freed
-		 * after the module is initialized.
+		 * which is inside the block. Just mark it as not being a
+		 * leak.
 		 */
 		kmemleak_ignore(ptr);
 		if (!ptr) {
-			module_memfree(mod->core_layout.base);
-			return -ENOMEM;
+			t = type;
+			goto out_enomem;
 		}
-		memset(ptr, 0, mod->init_layout.size);
-		mod->init_layout.base = ptr;
-	} else
-		mod->init_layout.base = NULL;
-
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	/* Do the allocs. */
-	ptr = vzalloc(mod->data_layout.size);
-	/*
-	 * The pointer to this block is stored in the module structure
-	 * which is inside the block. Just mark it as not being a
-	 * leak.
-	 */
-	kmemleak_not_leak(ptr);
-	if (!ptr) {
-		module_memfree(mod->core_layout.base);
-		module_memfree(mod->init_layout.base);
-		return -ENOMEM;
+		memset(ptr, 0, mod->mem[type].size);
+		mod->mem[type].base = ptr;
 	}
 
-	mod->data_layout.base = ptr;
-#endif
 	/* Transfer each section which specifies SHF_ALLOC */
 	pr_debug("final section addresses:\n");
 	for (i = 0; i < info->hdr->e_shnum; i++) {
 		void *dest;
 		Elf_Shdr *shdr = &info->sechdrs[i];
+		enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
 
 		if (!(shdr->sh_flags & SHF_ALLOC))
 			continue;
 
-		if (shdr->sh_entsize & INIT_OFFSET_MASK)
-			dest = mod->init_layout.base
-				+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
-		else if (!(shdr->sh_flags & SHF_EXECINSTR))
-			dest = mod->data_layout.base + shdr->sh_entsize;
-		else
-			dest = mod->core_layout.base + shdr->sh_entsize;
+		dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
 
 		if (shdr->sh_type != SHT_NOBITS)
 			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
@@ -2198,6 +2194,10 @@ static int move_module(struct module *mod, struct load_info *info)
 	}
 
 	return 0;
+out_enomem:
+	for (t--; t >= 0; t--)
+		module_memory_free(mod->mem[t].base, t);
+	return -ENOMEM;
 }
 
 static int check_module_license_and_versions(struct module *mod)
@@ -2242,12 +2242,14 @@ static void flush_module_icache(const struct module *mod)
 	 * Do it before processing of module parameters, so the module
 	 * can provide parameter accessor functions of its own.
 	 */
-	if (mod->init_layout.base)
-		flush_icache_range((unsigned long)mod->init_layout.base,
-				   (unsigned long)mod->init_layout.base
-				   + mod->init_layout.size);
-	flush_icache_range((unsigned long)mod->core_layout.base,
-			   (unsigned long)mod->core_layout.base + mod->core_layout.size);
+	for_each_mod_mem_type(type) {
+		const struct module_memory *mod_mem = &mod->mem[type];
+
+		if (mod_mem->size) {
+			flush_icache_range((unsigned long)mod_mem->base,
+					   (unsigned long)mod_mem->base + mod_mem->size);
+		}
+	}
 }
 
 bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
@@ -2350,11 +2352,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 {
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	vfree(mod->data_layout.base);
-#endif
+
+	free_mod_mem(mod);
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2415,7 +2414,9 @@ static void do_mod_ctors(struct module *mod)
 /* For freeing module_init on success, in case kallsyms traversing */
 struct mod_initfree {
 	struct llist_node node;
-	void *module_init;
+	void *init_text;
+	void *init_data;
+	void *init_rodata;
 };
 
 static void do_free_init(struct work_struct *w)
@@ -2429,7 +2430,9 @@ static void do_free_init(struct work_struct *w)
 
 	llist_for_each_safe(pos, n, list) {
 		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->module_init);
+		module_memfree(initfree->init_text);
+		module_memfree(initfree->init_data);
+		module_memfree(initfree->init_rodata);
 		kfree(initfree);
 	}
 }
@@ -2456,7 +2459,9 @@ static noinline int do_init_module(struct module *mod)
 		ret = -ENOMEM;
 		goto fail;
 	}
-	freeinit->module_init = mod->init_layout.base;
+	freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+	freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+	freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
 
 	do_mod_ctors(mod);
 	/* Start the module */
@@ -2492,8 +2497,8 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested)
 		async_synchronize_full();
 
-	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
-			mod->init_layout.size);
+	ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+			mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
@@ -2505,11 +2510,10 @@ static noinline int do_init_module(struct module *mod)
 	module_enable_ro(mod, true);
 	mod_tree_remove_init(mod);
 	module_arch_freeing_init(mod);
-	mod->init_layout.base = NULL;
-	mod->init_layout.size = 0;
-	mod->init_layout.ro_size = 0;
-	mod->init_layout.ro_after_init_size = 0;
-	mod->init_layout.text_size = 0;
+	for_class_mod_mem_type(type, init) {
+		mod->mem[type].base = NULL;
+		mod->mem[type].size = 0;
+	}
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
 	mod->btf_data = NULL;
@@ -2628,9 +2632,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	module_bug_finalize(info->hdr, info->sechdrs, mod);
 	module_cfi_finalize(info->hdr, info->sechdrs, mod);
 
-	if (module_check_misalignment(mod))
-		goto out_misaligned;
-
 	module_enable_ro(mod, false);
 	module_enable_nx(mod);
 	module_enable_x(mod);
@@ -2644,8 +2645,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
 
 	return 0;
 
-out_misaligned:
-	err = -EINVAL;
 out:
 	mutex_unlock(&module_mutex);
 	return err;
@@ -2909,7 +2908,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	mutex_unlock(&module_mutex);
  free_module:
 	/* Free lock-classes; relies on the preceding sync_rcu() */
-	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
+	for_class_mod_mem_type(type, core_data) {
+		lockdep_free_key_range(mod->mem[type].base,
+				       mod->mem[type].size);
+	}
 
 	module_deallocate(mod, info);
  free_copy:
@@ -3060,20 +3062,21 @@ bool is_module_address(unsigned long addr)
 struct module *__module_address(unsigned long addr)
 {
 	struct module *mod;
-	struct mod_tree_root *tree;
 
 	if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
-		tree = &mod_tree;
+		goto lookup;
+
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max)
-		tree = &mod_data_tree;
+	if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+		goto lookup;
 #endif
-	else
-		return NULL;
 
+	return NULL;
+
+lookup:
 	module_assert_mutex_or_preempt();
 
-	mod = mod_find(addr, tree);
+	mod = mod_find(addr, &mod_tree);
 	if (mod) {
 		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
@@ -3113,8 +3116,8 @@ struct module *__module_text_address(unsigned long addr)
 	struct module *mod = __module_address(addr);
 	if (mod) {
 		/* Make sure it's within the text section. */
-		if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
-		    && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
+		if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+		    !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
 			mod = NULL;
 	}
 	return mod;
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
index cf5b9f1e6ec4..0a4841e88adb 100644
--- a/kernel/module/procfs.c
+++ b/kernel/module/procfs.c
@@ -62,6 +62,15 @@ static void m_stop(struct seq_file *m, void *p)
 	mutex_unlock(&module_mutex);
 }
 
+static unsigned int module_total_size(struct module *mod)
+{
+	int size = 0;
+
+	for_each_mod_mem_type(type)
+		size += mod->mem[type].size;
+	return size;
+}
+
 static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
@@ -73,10 +82,7 @@ static int m_show(struct seq_file *m, void *p)
 	if (mod->state == MODULE_STATE_UNFORMED)
 		return 0;
 
-	size = mod->init_layout.size + mod->core_layout.size;
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	size += mod->data_layout.size;
-#endif
+	size = module_total_size(mod);
 	seq_printf(m, "%s %u", mod->name, size);
 	print_unload_info(m, mod);
 
@@ -86,7 +92,7 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading" :
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	value = m->private ? NULL : mod->core_layout.base;
+	value = m->private ? NULL : mod->mem[MOD_TEXT].base;
 	seq_printf(m, " 0x%px", value);
 
 	/* Taints info */
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 14fbea66f12f..a2b656b4e3d2 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -11,82 +11,25 @@
 #include <linux/set_memory.h>
 #include "internal.h"
 
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- *          [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^                ^               ^               ^
- * ro_size ------------------------|               |               |
- * ro_after_init_size -----------------------------|               |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base) when
- * CONFIG_STRICT_MODULE_RWX is set.
- */
+static void module_set_memory(const struct module *mod, enum mod_mem_type type,
+			      int (*set_memory)(unsigned long start, int num_pages))
+{
+	const struct module_memory *mod_mem = &mod->mem[type];
+
+	set_vm_flush_reset_perms(mod_mem->base);
+	set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
 
 /*
  * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
  * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
  * are strict.
  */
-static void frob_text(const struct module_layout *layout,
-		      int (*set_memory)(unsigned long start, int num_pages))
-{
-	set_memory((unsigned long)layout->base,
-		   PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_rodata(const struct module_layout *layout,
-		 int (*set_memory)(unsigned long start, int num_pages))
-{
-	set_memory((unsigned long)layout->base + layout->text_size,
-		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
-			int (*set_memory)(unsigned long start, int num_pages))
-{
-	set_memory((unsigned long)layout->base + layout->ro_size,
-		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
-			int (*set_memory)(unsigned long start, int num_pages))
-{
-	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
-		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static bool layout_check_misalignment(const struct module_layout *layout)
-{
-	return WARN_ON(!PAGE_ALIGNED(layout->base)) ||
-	       WARN_ON(!PAGE_ALIGNED(layout->text_size)) ||
-	       WARN_ON(!PAGE_ALIGNED(layout->ro_size)) ||
-	       WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) ||
-	       WARN_ON(!PAGE_ALIGNED(layout->size));
-}
-
-bool module_check_misalignment(const struct module *mod)
-{
-	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
-		return false;
-
-	return layout_check_misalignment(&mod->core_layout) ||
-	       layout_check_misalignment(&mod->data_layout) ||
-	       layout_check_misalignment(&mod->init_layout);
-}
-
 void module_enable_x(const struct module *mod)
 {
-	if (!PAGE_ALIGNED(mod->core_layout.base) ||
-	    !PAGE_ALIGNED(mod->init_layout.base))
-		return;
-
-	frob_text(&mod->core_layout, set_memory_x);
-	frob_text(&mod->init_layout, set_memory_x);
+	for_class_mod_mem_type(type, text)
+		module_set_memory(mod, type, set_memory_x);
 }
 
 void module_enable_ro(const struct module *mod, bool after_init)
@@ -98,16 +41,13 @@ void module_enable_ro(const struct module *mod, bool after_init)
 		return;
 #endif
 
-	set_vm_flush_reset_perms(mod->core_layout.base);
-	set_vm_flush_reset_perms(mod->init_layout.base);
-	frob_text(&mod->core_layout, set_memory_ro);
-
-	frob_rodata(&mod->data_layout, set_memory_ro);
-	frob_text(&mod->init_layout, set_memory_ro);
-	frob_rodata(&mod->init_layout, set_memory_ro);
+	module_set_memory(mod, MOD_TEXT, set_memory_ro);
+	module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
+	module_set_memory(mod, MOD_RODATA, set_memory_ro);
+	module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
 
 	if (after_init)
-		frob_ro_after_init(&mod->data_layout, set_memory_ro);
+		module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
 }
 
 void module_enable_nx(const struct module *mod)
@@ -115,11 +55,8 @@ void module_enable_nx(const struct module *mod)
 	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 		return;
 
-	frob_rodata(&mod->data_layout, set_memory_nx);
-	frob_ro_after_init(&mod->data_layout, set_memory_nx);
-	frob_writable_data(&mod->data_layout, set_memory_nx);
-	frob_rodata(&mod->init_layout, set_memory_nx);
-	frob_writable_data(&mod->init_layout, set_memory_nx);
+	for_class_mod_mem_type(type, data)
+		module_set_memory(mod, type, set_memory_nx);
 }
 
 int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index 8ec5cfd60496..277197977d43 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -21,16 +21,16 @@
 
 static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
 {
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+	struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
 
-	return (unsigned long)layout->base;
+	return (unsigned long)mod_mem->base;
 }
 
 static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
 {
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+	struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
 
-	return (unsigned long)layout->size;
+	return (unsigned long)mod_mem->size;
 }
 
 static __always_inline bool
@@ -77,32 +77,27 @@ static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *
  */
 void mod_tree_insert(struct module *mod)
 {
-	mod->core_layout.mtn.mod = mod;
-	mod->init_layout.mtn.mod = mod;
-
-	__mod_tree_insert(&mod->core_layout.mtn, &mod_tree);
-	if (mod->init_layout.size)
-		__mod_tree_insert(&mod->init_layout.mtn, &mod_tree);
-
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	mod->data_layout.mtn.mod = mod;
-	__mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree);
-#endif
+	for_each_mod_mem_type(type) {
+		mod->mem[type].mtn.mod = mod;
+		if (mod->mem[type].size)
+			__mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+	}
 }
 
 void mod_tree_remove_init(struct module *mod)
 {
-	if (mod->init_layout.size)
-		__mod_tree_remove(&mod->init_layout.mtn, &mod_tree);
+	for_class_mod_mem_type(type, init) {
+		if (mod->mem[type].size)
+			__mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+	}
 }
 
 void mod_tree_remove(struct module *mod)
 {
-	__mod_tree_remove(&mod->core_layout.mtn, &mod_tree);
-	mod_tree_remove_init(mod);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-	__mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree);
-#endif
+	for_each_mod_mem_type(type) {
+		if (mod->mem[type].size)
+			__mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+	}
 }
 
 struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
-- 
cgit v1.2.3


From 042edf1ebb49a63f299c4cdfb9c1e2ba299c0b91 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 4 Feb 2023 05:44:46 +0000
Subject: module: make module_ktype structure constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.")
the driver core allows the usage of const struct kobj_type.

Take advantage of this to constify the structure definition to prevent
modification at runtime.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module.h | 2 +-
 kernel/params.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index c0593a96e158..dbc9124b71bc 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -871,7 +871,7 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr)
 
 #ifdef CONFIG_SYSFS
 extern struct kset *module_kset;
-extern struct kobj_type module_ktype;
+extern const struct kobj_type module_ktype;
 #endif /* CONFIG_SYSFS */
 
 #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)
diff --git a/kernel/params.c b/kernel/params.c
index 6e34ca89ebae..6a7548979aa9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -948,7 +948,7 @@ static void module_kobj_release(struct kobject *kobj)
 	complete(mk->kobj_completion);
 }
 
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
 	.release   =	module_kobj_release,
 	.sysfs_ops =	&module_sysfs_ops,
 };
-- 
cgit v1.2.3


From 7deabd67498869640c937c9bd83472574b7dea0b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 3 Mar 2023 11:50:56 -0500
Subject: dyndbg: use the module notifier callbacks

Bring dynamic debug in line with other subsystems by using the module
notifier callbacks. This results in a net decrease in core module
code.

Additionally, Jim Cromie has a new dynamic debug classmap feature,
which requires that jump labels be initialized prior to dynamic debug.
Specifically, the new feature toggles a jump label from the existing
dynamic_debug_setup() function. However, this does not currently work
properly, because jump labels are initialized via the
'module_notify_list' notifier chain, which is invoked after the
current call to dynamic_debug_setup(). Thus, this patch ensures that
jump labels are initialized prior to dynamic debug by setting the
dynamic debug notifier priority to 0, while jump labels have the
higher priority of 1.

Tested by Jim using his new test case, and I've verfied the correct
printing via: # modprobe test_dynamic_debug dyndbg.

Link: https://lore.kernel.org/lkml/20230113193016.749791-21-jim.cromie@gmail.com/
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202302190427.9iIK2NfJ-lkp@intel.com/
Tested-by: Jim Cromie <jim.cromie@gmail.com>
Reviewed-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
CC: Jim Cromie <jim.cromie@gmail.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/dynamic_debug.h | 13 -----------
 include/linux/module.h        |  4 ++++
 kernel/module/internal.h      |  2 --
 kernel/module/main.c          | 30 +++++++-------------------
 lib/dynamic_debug.c           | 50 +++++++++++++++++++++++++++++++++++--------
 5 files changed, 53 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 41682278d2e8..f401414341fb 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -130,9 +130,6 @@ struct ddebug_class_param {
 
 #if defined(CONFIG_DYNAMIC_DEBUG_CORE)
 
-int ddebug_add_module(struct _ddebug_info *dyndbg, const char *modname);
-
-extern int ddebug_remove_module(const char *mod_name);
 extern __printf(2, 3)
 void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...);
 
@@ -304,16 +301,6 @@ int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp);
 #include <linux/errno.h>
 #include <linux/printk.h>
 
-static inline int ddebug_add_module(struct _ddebug_info *dinfo, const char *modname)
-{
-	return 0;
-}
-
-static inline int ddebug_remove_module(const char *mod)
-{
-	return 0;
-}
-
 static inline int ddebug_dyndbg_module_param_cb(char *param, char *val,
 						const char *modname)
 {
diff --git a/include/linux/module.h b/include/linux/module.h
index dbc9124b71bc..91726444d55f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -27,6 +27,7 @@
 #include <linux/tracepoint-defs.h>
 #include <linux/srcu.h>
 #include <linux/static_call_types.h>
+#include <linux/dynamic_debug.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -579,6 +580,9 @@ struct module {
 	struct error_injection_entry *ei_funcs;
 	unsigned int num_ei_funcs;
 #endif
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+	struct _ddebug_info dyndbg_info;
+#endif
 } ____cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index c6c44ceca07a..e3883b7d4840 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -45,7 +45,6 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const s32 __start___kcrctab[];
 extern const s32 __start___kcrctab_gpl[];
 
-#include <linux/dynamic_debug.h>
 struct load_info {
 	const char *name;
 	/* pointer to module in temporary copy, freed at end of load_module() */
@@ -55,7 +54,6 @@ struct load_info {
 	Elf_Shdr *sechdrs;
 	char *secstrings, *strtab;
 	unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
-	struct _ddebug_info dyndbg;
 	bool sig_ok;
 #ifdef CONFIG_KALLSYMS
 	unsigned long mod_kallsyms_init_off;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 1fa529668a45..b4759f1695b7 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1216,9 +1216,6 @@ static void free_module(struct module *mod)
 	mod->state = MODULE_STATE_UNFORMED;
 	mutex_unlock(&module_mutex);
 
-	/* Remove dynamic debug info */
-	ddebug_remove_module(mod->name);
-
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -1619,19 +1616,6 @@ static void free_modinfo(struct module *mod)
 	}
 }
 
-static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg)
-{
-	if (!dyndbg->num_descs)
-		return;
-	ddebug_add_module(dyndbg, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg)
-{
-	if (dyndbg->num_descs)
-		ddebug_remove_module(mod->name);
-}
-
 void * __weak module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
@@ -2137,10 +2121,14 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	if (section_addr(info, "__obsparm"))
 		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
 
-	info->dyndbg.descs = section_objs(info, "__dyndbg",
-					sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs);
-	info->dyndbg.classes = section_objs(info, "__dyndbg_classes",
-					sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes);
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+	mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+					      sizeof(*mod->dyndbg_info.descs),
+					      &mod->dyndbg_info.num_descs);
+	mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+						sizeof(*mod->dyndbg_info.classes),
+						&mod->dyndbg_info.num_classes);
+#endif
 
 	return 0;
 }
@@ -2824,7 +2812,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	}
 
 	init_build_id(mod, info);
-	dynamic_debug_setup(mod, &info->dyndbg);
 
 	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
 	ftrace_module_init(mod);
@@ -2888,7 +2875,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
  ddebug_cleanup:
 	ftrace_release_mod(mod);
-	dynamic_debug_remove(mod, &info->dyndbg);
 	synchronize_rcu();
 	kfree(mod->args);
  free_arch_cleanup:
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 8136e5236b7b..fdd6d9800a70 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -1223,7 +1223,7 @@ static void ddebug_attach_module_classes(struct ddebug_table *dt,
  * Allocate a new ddebug_table for the given module
  * and add it to the global list.
  */
-static int __ddebug_add_module(struct _ddebug_info *di, const char *modname)
+static int ddebug_add_module(struct _ddebug_info *di, const char *modname)
 {
 	struct ddebug_table *dt;
 
@@ -1262,11 +1262,6 @@ static int __ddebug_add_module(struct _ddebug_info *di, const char *modname)
 	return 0;
 }
 
-int ddebug_add_module(struct _ddebug_info *di, const char *modname)
-{
-	return __ddebug_add_module(di, modname);
-}
-
 /* helper for ddebug_dyndbg_(boot|module)_param_cb */
 static int ddebug_dyndbg_param_cb(char *param, char *val,
 				const char *modname, int on_err)
@@ -1313,11 +1308,13 @@ static void ddebug_table_free(struct ddebug_table *dt)
 	kfree(dt);
 }
 
+#ifdef CONFIG_MODULES
+
 /*
  * Called in response to a module being unloaded.  Removes
  * any ddebug_table's which point at the module.
  */
-int ddebug_remove_module(const char *mod_name)
+static int ddebug_remove_module(const char *mod_name)
 {
 	struct ddebug_table *dt, *nextdt;
 	int ret = -ENOENT;
@@ -1336,6 +1333,33 @@ int ddebug_remove_module(const char *mod_name)
 	return ret;
 }
 
+static int ddebug_module_notify(struct notifier_block *self, unsigned long val,
+				void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ret = ddebug_add_module(&mod->dyndbg_info, mod->name);
+		if (ret)
+			WARN(1, "Failed to allocate memory: dyndbg may not work properly.\n");
+		break;
+	case MODULE_STATE_GOING:
+		ddebug_remove_module(mod->name);
+		break;
+	}
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block ddebug_module_nb = {
+	.notifier_call = ddebug_module_notify,
+	.priority = 0, /* dynamic debug depends on jump label */
+};
+
+#endif /* CONFIG_MODULES */
+
 static void ddebug_remove_all_tables(void)
 {
 	mutex_lock(&ddebug_lock);
@@ -1387,6 +1411,14 @@ static int __init dynamic_debug_init(void)
 		.num_classes = __stop___dyndbg_classes - __start___dyndbg_classes,
 	};
 
+#ifdef CONFIG_MODULES
+	ret = register_module_notifier(&ddebug_module_nb);
+	if (ret) {
+		pr_warn("Failed to register dynamic debug module notifier\n");
+		return ret;
+	}
+#endif /* CONFIG_MODULES */
+
 	if (&__start___dyndbg == &__stop___dyndbg) {
 		if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) {
 			pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
@@ -1407,7 +1439,7 @@ static int __init dynamic_debug_init(void)
 			mod_ct++;
 			di.num_descs = mod_sites;
 			di.descs = iter_mod_start;
-			ret = __ddebug_add_module(&di, modname);
+			ret = ddebug_add_module(&di, modname);
 			if (ret)
 				goto out_err;
 
@@ -1418,7 +1450,7 @@ static int __init dynamic_debug_init(void)
 	}
 	di.num_descs = mod_sites;
 	di.descs = iter_mod_start;
-	ret = __ddebug_add_module(&di, modname);
+	ret = ddebug_add_module(&di, modname);
 	if (ret)
 		goto out_err;
 
-- 
cgit v1.2.3


From 3f4b9cb4133a4ecf16447cbd5fdb8ed618593bf8 Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Wed, 1 Mar 2023 11:45:09 +0300
Subject: scsi: target: core: Add RTPI field to target port

SAM-5 4.6.5.2 (Relative Port Identifier attribute) defines the attribute as
unique across SCSI target ports.

The change introduces RTPI attribute to se_portal group. The value is
unique across all enabled SCSI target ports. It also limits number of SCSI
target ports to 65535.

Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://lore.kernel.org/r/20230301084512.21956-2-d.bogdanov@yadro.com
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_fabric_configfs.c |  9 +++--
 drivers/target/target_core_internal.h        |  2 ++
 drivers/target/target_core_tpg.c             | 54 ++++++++++++++++++++++++++++
 include/target/target_core_base.h            |  2 ++
 4 files changed, 62 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 67b18a67317a..873da49ab704 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -836,13 +836,12 @@ static ssize_t target_fabric_tpg_base_enable_store(struct config_item *item,
 
 	if (se_tpg->enabled == op)
 		return count;
-
-	ret = se_tpg->se_tpg_tfo->fabric_enable_tpg(se_tpg, op);
+	if (op)
+		ret = target_tpg_enable(se_tpg);
+	else
+		ret = target_tpg_disable(se_tpg);
 	if (ret)
 		return ret;
-
-	se_tpg->enabled = op;
-
 	return count;
 }
 
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 38a6d08f75b3..82fd5768a662 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -132,6 +132,8 @@ void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
 struct se_node_acl *core_tpg_add_initiator_node_acl(struct se_portal_group *tpg,
 		const char *initiatorname);
 void core_tpg_del_initiator_node_acl(struct se_node_acl *acl);
+int target_tpg_enable(struct se_portal_group *se_tpg);
+int target_tpg_disable(struct se_portal_group *se_tpg);
 
 /* target_core_transport.c */
 int	init_se_kmem_caches(void);
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index 736847c933e5..0de3385b94c5 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -31,6 +31,7 @@
 #include "target_core_ua.h"
 
 extern struct se_device *g_lun0_dev;
+static DEFINE_XARRAY_ALLOC(tpg_xa);
 
 /*	__core_tpg_get_initiator_node_acl():
  *
@@ -439,6 +440,57 @@ static void core_tpg_lun_ref_release(struct percpu_ref *ref)
 	complete(&lun->lun_shutdown_comp);
 }
 
+static int target_tpg_register_rtpi(struct se_portal_group *se_tpg)
+{
+	u32 val;
+	int ret;
+
+	ret = xa_alloc(&tpg_xa, &val, se_tpg,
+		       XA_LIMIT(1, USHRT_MAX), GFP_KERNEL);
+	if (!ret)
+		se_tpg->tpg_rtpi = val;
+
+	return ret;
+}
+
+static void target_tpg_deregister_rtpi(struct se_portal_group *se_tpg)
+{
+	if (se_tpg->tpg_rtpi && se_tpg->enabled)
+		xa_erase(&tpg_xa, se_tpg->tpg_rtpi);
+}
+
+int target_tpg_enable(struct se_portal_group *se_tpg)
+{
+	int ret;
+
+	ret = target_tpg_register_rtpi(se_tpg);
+	if (ret)
+		return ret;
+
+	ret = se_tpg->se_tpg_tfo->fabric_enable_tpg(se_tpg, true);
+	if (ret) {
+		target_tpg_deregister_rtpi(se_tpg);
+		return ret;
+	}
+
+	se_tpg->enabled = true;
+
+	return 0;
+}
+
+int target_tpg_disable(struct se_portal_group *se_tpg)
+{
+	int ret;
+
+	target_tpg_deregister_rtpi(se_tpg);
+
+	ret = se_tpg->se_tpg_tfo->fabric_enable_tpg(se_tpg, false);
+	if (!ret)
+		se_tpg->enabled = false;
+
+	return ret;
+}
+
 /* Does not change se_wwn->priv. */
 int core_tpg_register(
 	struct se_wwn *se_wwn,
@@ -535,6 +587,8 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
 		kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
 	}
 
+	target_tpg_deregister_rtpi(se_tpg);
+
 	return 0;
 }
 EXPORT_SYMBOL(core_tpg_deregister);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 12c9ba16217e..814edf746395 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -920,6 +920,8 @@ struct se_portal_group {
 	 */
 	int			proto_id;
 	bool			enabled;
+	/* RELATIVE TARGET PORT IDENTIFIER */
+	u16			tpg_rtpi;
 	/* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
 	atomic_t		tpg_pr_ref_count;
 	/* Spinlock for adding/removing ACLed Nodes */
-- 
cgit v1.2.3


From 5fe99dace4313b0061d46f69e32f981956c92445 Mon Sep 17 00:00:00 2001
From: Roman Bolshakov <r.bolshakov@yadro.com>
Date: Wed, 1 Mar 2023 11:45:11 +0300
Subject: scsi: target: core: Drop device-based RTPI

The code is not needed since target port-based RTPI allocation replaced it.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://lore.kernel.org/r/20230301084512.21956-4-d.bogdanov@yadro.com
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c   | 41 -----------------------------------
 drivers/target/target_core_internal.h |  1 -
 drivers/target/target_core_tpg.c      |  6 -----
 include/target/target_core_base.h     |  4 ----
 4 files changed, 52 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 93f7f050fdf1..a3292eade6ea 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -479,47 +479,6 @@ void core_clear_lun_from_tpg(struct se_lun *lun, struct se_portal_group *tpg)
 	mutex_unlock(&tpg->acl_node_mutex);
 }
 
-int core_alloc_rtpi(struct se_lun *lun, struct se_device *dev)
-{
-	struct se_lun *tmp;
-
-	spin_lock(&dev->se_port_lock);
-	if (dev->export_count == 0x0000ffff) {
-		pr_warn("Reached dev->dev_port_count =="
-				" 0x0000ffff\n");
-		spin_unlock(&dev->se_port_lock);
-		return -ENOSPC;
-	}
-again:
-	/*
-	 * Allocate the next RELATIVE TARGET PORT IDENTIFIER for this struct se_device
-	 * Here is the table from spc4r17 section 7.7.3.8.
-	 *
-	 *    Table 473 -- RELATIVE TARGET PORT IDENTIFIER field
-	 *
-	 * Code      Description
-	 * 0h        Reserved
-	 * 1h        Relative port 1, historically known as port A
-	 * 2h        Relative port 2, historically known as port B
-	 * 3h to FFFFh    Relative port 3 through 65 535
-	 */
-	lun->lun_rtpi = dev->dev_rpti_counter++;
-	if (!lun->lun_rtpi)
-		goto again;
-
-	list_for_each_entry(tmp, &dev->dev_sep_list, lun_dev_link) {
-		/*
-		 * Make sure RELATIVE TARGET PORT IDENTIFIER is unique
-		 * for 16-bit wrap..
-		 */
-		if (lun->lun_rtpi == tmp->lun_rtpi)
-			goto again;
-	}
-	spin_unlock(&dev->se_port_lock);
-
-	return 0;
-}
-
 static void se_release_vpd_for_dev(struct se_device *dev)
 {
 	struct t10_vpd *vpd, *vpd_tmp;
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 82fd5768a662..9c3bca552bb9 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -59,7 +59,6 @@ struct target_fabric_configfs {
 extern struct t10_alua_lu_gp *default_lu_gp;
 
 /* target_core_device.c */
-int	core_alloc_rtpi(struct se_lun *lun, struct se_device *dev);
 struct se_dev_entry *core_get_se_deve_from_rtpi(struct se_node_acl *, u16);
 void	target_pr_kref_release(struct kref *);
 void	core_free_device_list_for_node(struct se_node_acl *,
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index 0de3385b94c5..b1d9383386ec 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -632,10 +632,6 @@ int core_tpg_add_lun(
 	if (ret < 0)
 		goto out;
 
-	ret = core_alloc_rtpi(lun, dev);
-	if (ret)
-		goto out_kill_ref;
-
 	if (!(dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA) &&
 	    !(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE))
 		target_attach_tg_pt_gp(lun, dev->t10_alua.default_tg_pt_gp);
@@ -659,8 +655,6 @@ int core_tpg_add_lun(
 
 	return 0;
 
-out_kill_ref:
-	percpu_ref_exit(&lun->lun_ref);
 out:
 	return ret;
 }
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 814edf746395..008e0e4500d1 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -735,8 +735,6 @@ struct se_lun {
 	bool			lun_access_ro;
 	u32			lun_index;
 
-	/* RELATIVE TARGET PORT IDENTIFER */
-	u16			lun_rtpi;
 	atomic_t		lun_acl_count;
 	struct se_device __rcu	*lun_se_dev;
 
@@ -788,8 +786,6 @@ struct se_device_queue {
 };
 
 struct se_device {
-	/* RELATIVE TARGET PORT IDENTIFER Counter */
-	u16			dev_rpti_counter;
 	/* Used for SAM Task Attribute ordering */
 	u32			dev_cur_ordered_id;
 	u32			dev_flags;
-- 
cgit v1.2.3


From 31177b74790cc566200f30705bf9a83d168da893 Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Wed, 1 Mar 2023 11:45:12 +0300
Subject: scsi: target: core: Add RTPI attribute for target port

RELATIVE TARGET PORT IDENTIFIER can be read and configured via configfs:

$ echo 0x10 > $TARGET/tpgt_N/rtpi

RTPI can be changed only on disabled target ports.

Co-developed-by: Roman Bolshakov <r.bolshakov@yadro.com>
Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://lore.kernel.org/r/20230301084512.21956-5-d.bogdanov@yadro.com
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_fabric_configfs.c | 39 ++++++++++++++++++++++++++--
 drivers/target/target_core_tpg.c             | 19 +++++++++++---
 include/target/target_core_base.h            |  1 +
 3 files changed, 53 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 873da49ab704..0ce47e21e0c8 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -844,15 +844,48 @@ static ssize_t target_fabric_tpg_base_enable_store(struct config_item *item,
 		return ret;
 	return count;
 }
+static ssize_t target_fabric_tpg_base_rtpi_show(struct config_item *item, char *page)
+{
+	struct se_portal_group *se_tpg = to_tpg(item);
+
+	return sysfs_emit(page, "%#x\n", se_tpg->tpg_rtpi);
+}
+
+static ssize_t target_fabric_tpg_base_rtpi_store(struct config_item *item,
+				   const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = to_tpg(item);
+	u16 val;
+	int ret;
+
+	ret = kstrtou16(page, 0, &val);
+	if (ret < 0)
+		return ret;
+	if (val == 0)
+		return -EINVAL;
+
+	if (se_tpg->enabled) {
+		pr_info("%s_TPG[%hu] - Can not change RTPI on enabled TPG",
+			se_tpg->se_tpg_tfo->fabric_name,
+			se_tpg->se_tpg_tfo->tpg_get_tag(se_tpg));
+		return -EINVAL;
+	}
+
+	se_tpg->tpg_rtpi = val;
+	se_tpg->rtpi_manual = true;
+
+	return count;
+}
 
 CONFIGFS_ATTR(target_fabric_tpg_base_, enable);
+CONFIGFS_ATTR(target_fabric_tpg_base_, rtpi);
 
 static int
 target_fabric_setup_tpg_base_cit(struct target_fabric_configfs *tf)
 {
 	struct config_item_type *cit = &tf->tf_tpg_base_cit;
 	struct configfs_attribute **attrs = NULL;
-	size_t nr_attrs = 0;
+	size_t nr_attrs = 1;
 	int i = 0;
 
 	if (tf->tf_ops->tfc_tpg_base_attrs)
@@ -875,7 +908,9 @@ target_fabric_setup_tpg_base_cit(struct target_fabric_configfs *tf)
 			attrs[i] = tf->tf_ops->tfc_tpg_base_attrs[i];
 
 	if (tf->tf_ops->fabric_enable_tpg)
-		attrs[i] = &target_fabric_tpg_base_attr_enable;
+		attrs[i++] = &target_fabric_tpg_base_attr_enable;
+
+	attrs[i++] = &target_fabric_tpg_base_attr_rtpi;
 
 done:
 	cit->ct_item_ops = &target_fabric_tpg_base_item_ops;
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index b1d9383386ec..2e079c6a8e8c 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -445,10 +445,21 @@ static int target_tpg_register_rtpi(struct se_portal_group *se_tpg)
 	u32 val;
 	int ret;
 
-	ret = xa_alloc(&tpg_xa, &val, se_tpg,
-		       XA_LIMIT(1, USHRT_MAX), GFP_KERNEL);
-	if (!ret)
-		se_tpg->tpg_rtpi = val;
+	if (se_tpg->rtpi_manual) {
+		ret = xa_insert(&tpg_xa, se_tpg->tpg_rtpi, se_tpg, GFP_KERNEL);
+		if (ret) {
+			pr_info("%s_TPG[%hu] - Can not set RTPI %#x, it is already busy",
+				se_tpg->se_tpg_tfo->fabric_name,
+				se_tpg->se_tpg_tfo->tpg_get_tag(se_tpg),
+				se_tpg->tpg_rtpi);
+			return -EINVAL;
+		}
+	} else {
+		ret = xa_alloc(&tpg_xa, &val, se_tpg,
+			       XA_LIMIT(1, USHRT_MAX), GFP_KERNEL);
+		if (!ret)
+			se_tpg->tpg_rtpi = val;
+	}
 
 	return ret;
 }
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 008e0e4500d1..e52d0915b3d8 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -918,6 +918,7 @@ struct se_portal_group {
 	bool			enabled;
 	/* RELATIVE TARGET PORT IDENTIFIER */
 	u16			tpg_rtpi;
+	bool			rtpi_manual;
 	/* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
 	atomic_t		tpg_pr_ref_count;
 	/* Spinlock for adding/removing ACLed Nodes */
-- 
cgit v1.2.3


From 6978052448f9eb19f7b03243ac0416104e5ee50d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 8 Mar 2023 15:20:06 +0100
Subject: netlink: remove unused 'compare' function

No users in the tree.  Tested with allmodconfig build.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230308142006.20879-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h  | 1 -
 net/netlink/af_netlink.c | 2 --
 net/netlink/af_netlink.h | 1 -
 3 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index c43ac7690eca..3e8743252167 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -50,7 +50,6 @@ struct netlink_kernel_cfg {
 	struct mutex	*cb_mutex;
 	int		(*bind)(struct net *net, int group);
 	void		(*unbind)(struct net *net, int group);
-	bool		(*compare)(struct net *net, struct sock *sk);
 };
 
 struct sock *__netlink_kernel_create(struct net *net, int unit,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c64277659753..877f1da1a8ac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2097,8 +2097,6 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
 			nl_table[unit].bind = cfg->bind;
 			nl_table[unit].unbind = cfg->unbind;
 			nl_table[unit].flags = cfg->flags;
-			if (cfg->compare)
-				nl_table[unit].compare = cfg->compare;
 		}
 		nl_table[unit].registered = 1;
 	} else {
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 5f454c8de6a4..90a3198a9b7f 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -64,7 +64,6 @@ struct netlink_table {
 	struct module		*module;
 	int			(*bind)(struct net *net, int group);
 	void			(*unbind)(struct net *net, int group);
-	bool			(*compare)(struct net *net, struct sock *sock);
 	int			registered;
 };
 
-- 
cgit v1.2.3


From 62423bd2d2e231951245d77740a58027a2d81ef9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Mar 2023 18:26:48 +0000
Subject: net: sched: remove qdisc_watchdog->last_expires

This field mirrors hrtimer softexpires, we can instead
use the existing helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230308182648.1150762-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h | 1 -
 net/sched/sch_api.c     | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 2016839991a4..bb0bd69fb655 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -64,7 +64,6 @@ static inline psched_time_t psched_get_time(void)
 }
 
 struct qdisc_watchdog {
-	u64		last_expires;
 	struct hrtimer	timer;
 	struct Qdisc	*qdisc;
 };
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index aba789c30a2e..fdb8f429333d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -639,14 +639,16 @@ void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
 		return;
 
 	if (hrtimer_is_queued(&wd->timer)) {
+		u64 softexpires;
+
+		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
 		/* If timer is already set in [expires, expires + delta_ns],
 		 * do not reprogram it.
 		 */
-		if (wd->last_expires - expires <= delta_ns)
+		if (softexpires - expires <= delta_ns)
 			return;
 	}
 
-	wd->last_expires = expires;
 	hrtimer_start_range_ns(&wd->timer,
 			       ns_to_ktime(expires),
 			       delta_ns,
-- 
cgit v1.2.3


From 76b9bf965c98c9b53ef7420b3b11438dbd764f92 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 8 Mar 2023 11:23:13 +0200
Subject: neighbour: delete neigh_lookup_nodev as not used

neigh_lookup_nodev isn't used in the kernel after removal
of DECnet. So let's remove it.

Fixes: 1202cdd66531 ("Remove DECnet support from kernel")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/eb5656200d7964b2d177a36b77efa3c597d6d72d.1678267343.git.leonro@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  2 --
 net/core/neighbour.c    | 31 -------------------------------
 2 files changed, 33 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2f2a6023fb0e..234799ca527e 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -336,8 +336,6 @@ void neigh_table_init(int index, struct neigh_table *tbl);
 int neigh_table_clear(int index, struct neigh_table *tbl);
 struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 			       struct net_device *dev);
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
-				     const void *pkey);
 struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 				 struct net_device *dev, bool want_ref);
 static inline struct neighbour *neigh_create(struct neigh_table *tbl,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6798f6d2423b..0116b0ff91a7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -627,37 +627,6 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 }
 EXPORT_SYMBOL(neigh_lookup);
 
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
-				     const void *pkey)
-{
-	struct neighbour *n;
-	unsigned int key_len = tbl->key_len;
-	u32 hash_val;
-	struct neigh_hash_table *nht;
-
-	NEIGH_CACHE_STAT_INC(tbl, lookups);
-
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
-	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
-
-	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
-	     n != NULL;
-	     n = rcu_dereference_bh(n->next)) {
-		if (!memcmp(n->primary_key, pkey, key_len) &&
-		    net_eq(dev_net(n->dev), net)) {
-			if (!refcount_inc_not_zero(&n->refcnt))
-				n = NULL;
-			NEIGH_CACHE_STAT_INC(tbl, hits);
-			break;
-		}
-	}
-
-	rcu_read_unlock_bh();
-	return n;
-}
-EXPORT_SYMBOL(neigh_lookup_nodev);
-
 static struct neighbour *
 ___neigh_create(struct neigh_table *tbl, const void *pkey,
 		struct net_device *dev, u32 flags,
-- 
cgit v1.2.3


From 0433686c6092f65b552eadad651f620e51b6aad1 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 24 Feb 2023 22:07:45 +0200
Subject: devres: Pass unique name of the resource to devm_add_action()

Pass the unique name of the resource to devm_add_action(),
so it will be easier to debug managed resources.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230224200745.17324-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/devres.c  | 11 ++++++-----
 include/linux/device.h |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index c0e100074aa3..5c998cfac335 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -722,20 +722,21 @@ static void devm_action_release(struct device *dev, void *res)
 }
 
 /**
- * devm_add_action() - add a custom action to list of managed resources
+ * __devm_add_action() - add a custom action to list of managed resources
  * @dev: Device that owns the action
  * @action: Function that should be called
  * @data: Pointer to data passed to @action implementation
+ * @name: Name of the resource (for debugging purposes)
  *
  * This adds a custom action to the list of managed resources so that
  * it gets executed as part of standard resource unwinding.
  */
-int devm_add_action(struct device *dev, void (*action)(void *), void *data)
+int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name)
 {
 	struct action_devres *devres;
 
-	devres = devres_alloc(devm_action_release,
-			      sizeof(struct action_devres), GFP_KERNEL);
+	devres = __devres_alloc_node(devm_action_release, sizeof(struct action_devres),
+				     GFP_KERNEL, NUMA_NO_NODE, name);
 	if (!devres)
 		return -ENOMEM;
 
@@ -745,7 +746,7 @@ int devm_add_action(struct device *dev, void (*action)(void *), void *data)
 	devres_add(dev, devres);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(devm_add_action);
+EXPORT_SYMBOL_GPL(__devm_add_action);
 
 /**
  * devm_remove_action() - removes previously added custom action
diff --git a/include/linux/device.h b/include/linux/device.h
index 19b6ba478fbf..0f128520f6e5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -243,10 +243,13 @@ void __iomem *devm_of_iomap(struct device *dev,
 			    resource_size_t *size);
 
 /* allows to add/remove a custom action to devres stack */
-int devm_add_action(struct device *dev, void (*action)(void *), void *data);
 void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
 void devm_release_action(struct device *dev, void (*action)(void *), void *data);
 
+int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name);
+#define devm_add_action(release, action, data) \
+	__devm_add_action(release, action, data, #action)
+
 static inline int devm_add_action_or_reset(struct device *dev,
 					   void (*action)(void *), void *data)
 {
-- 
cgit v1.2.3


From 4c9edf17c0b44655c565b59a956161a2ee125cca Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 13 Feb 2023 11:10:05 +0800
Subject: crypto: acomp - Be more careful with request flags

The request flags for acompress is split into two parts.  Part of
it may be set by the user while the other part (ALLOC_OUTPUT) is
managed by the API.

This patch makes the split more explicit by not touching the other
bits at all in the two "set" functions that let the user modify the
flags.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/acompress.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index e4bc96528902..c14cfc9a3b79 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -219,7 +219,8 @@ static inline void acomp_request_set_callback(struct acomp_req *req,
 {
 	req->base.complete = cmpl;
 	req->base.data = data;
-	req->base.flags = flgs;
+	req->base.flags &= CRYPTO_ACOMP_ALLOC_OUTPUT;
+	req->base.flags |= flgs & ~CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
 
 /**
@@ -246,6 +247,7 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 	req->slen = slen;
 	req->dlen = dlen;
 
+	req->flags &= ~CRYPTO_ACOMP_ALLOC_OUTPUT;
 	if (!req->dst)
 		req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
-- 
cgit v1.2.3


From 740b2f03f80e8f50c3f96309bc123bea040de8d0 Mon Sep 17 00:00:00 2001
From: fengqi <fengqi@xiaomi.com>
Date: Thu, 16 Feb 2023 16:48:30 +0800
Subject: HID: add KEY_CAMERA_FOCUS event in HID

Our HID device need KEY_CAMERA_FOCUS event to control camera,
but this event is non-existent in current HID driver.
So we add this event in hid-input.c.

Signed-off-by: fengqi <fengqi@xiaomi.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-input.c | 10 ++++++++++
 include/linux/hid.h     |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 7fc967964dd8..fbf5db9a0d67 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -1261,6 +1261,16 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 			return;
 		}
 		goto unknown;
+	case HID_UP_CAMERA:
+		switch (usage->hid & HID_USAGE) {
+		case 0x020:
+			map_key_clear(KEY_CAMERA_FOCUS);	break;
+		case 0x021:
+			map_key_clear(KEY_CAMERA);		break;
+		default:
+			goto ignore;
+		}
+		break;
 
 	case HID_UP_HPVENDOR:	/* Reported on a Dutch layout HP5308 */
 		set_bit(EV_REP, input->evbit);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 1ea8c7a3570b..bff9f459db36 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -156,6 +156,7 @@ struct hid_item {
 #define HID_UP_DIGITIZER	0x000d0000
 #define HID_UP_PID		0x000f0000
 #define HID_UP_BATTERY		0x00850000
+#define HID_UP_CAMERA		0x00900000
 #define HID_UP_HPVENDOR         0xff7f0000
 #define HID_UP_HPVENDOR2        0xff010000
 #define HID_UP_MSVENDOR		0xff000000
-- 
cgit v1.2.3


From fd9c55c0acac59013dae9f8eedb0e0eb2e82e276 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Tue, 7 Mar 2023 11:56:40 +0100
Subject: soc: renesas: rcar-sysc: Remove R-Car H3 ES1.* handling

R-Car H3 ES1.* was only available to an internal development group and
needed a lot of quirks and workarounds.  These become a maintenance
burden now, so our development group decided to remove upstream support
and disable booting for this SoC.  Public users only have ES2 onwards.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/20230307105645.5285-5-wsa+renesas@sang-engineering.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/soc/renesas/r8a7795-sysc.c       | 10 ----------
 include/dt-bindings/power/r8a7795-sysc.h |  1 -
 2 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/renesas/r8a7795-sysc.c b/drivers/soc/renesas/r8a7795-sysc.c
index 91074411b8cf..cbe1ff0fc583 100644
--- a/drivers/soc/renesas/r8a7795-sysc.c
+++ b/drivers/soc/renesas/r8a7795-sysc.c
@@ -38,8 +38,6 @@ static struct rcar_sysc_area r8a7795_areas[] __initdata = {
 	{ "a3vp",	0x340, 0, R8A7795_PD_A3VP,	R8A7795_PD_ALWAYS_ON },
 	{ "cr7",	0x240, 0, R8A7795_PD_CR7,	R8A7795_PD_ALWAYS_ON },
 	{ "a3vc",	0x380, 0, R8A7795_PD_A3VC,	R8A7795_PD_ALWAYS_ON },
-	/* A2VC0 exists on ES1.x only */
-	{ "a2vc0",	0x3c0, 0, R8A7795_PD_A2VC0,	R8A7795_PD_A3VC },
 	{ "a2vc1",	0x3c0, 1, R8A7795_PD_A2VC1,	R8A7795_PD_A3VC },
 	{ "3dg-a",	0x100, 0, R8A7795_PD_3DG_A,	R8A7795_PD_ALWAYS_ON },
 	{ "3dg-b",	0x100, 1, R8A7795_PD_3DG_B,	R8A7795_PD_3DG_A },
@@ -54,14 +52,10 @@ static struct rcar_sysc_area r8a7795_areas[] __initdata = {
 	 * Fixups for R-Car H3 revisions
 	 */
 
-#define HAS_A2VC0	BIT(0)		/* Power domain A2VC0 is present */
 #define NO_EXTMASK	BIT(1)		/* Missing SYSCEXTMASK register */
 
 static const struct soc_device_attribute r8a7795_quirks_match[] __initconst = {
 	{
-		.soc_id = "r8a7795", .revision = "ES1.*",
-		.data = (void *)(HAS_A2VC0 | NO_EXTMASK),
-	}, {
 		.soc_id = "r8a7795", .revision = "ES2.*",
 		.data = (void *)(NO_EXTMASK),
 	},
@@ -77,10 +71,6 @@ static int __init r8a7795_sysc_init(void)
 	if (attr)
 		quirks = (uintptr_t)attr->data;
 
-	if (!(quirks & HAS_A2VC0))
-		rcar_sysc_nullify(r8a7795_areas, ARRAY_SIZE(r8a7795_areas),
-				  R8A7795_PD_A2VC0);
-
 	if (quirks & NO_EXTMASK)
 		r8a7795_sysc_info.extmask_val = 0;
 
diff --git a/include/dt-bindings/power/r8a7795-sysc.h b/include/dt-bindings/power/r8a7795-sysc.h
index eea6ad69f0b0..ff5323858572 100644
--- a/include/dt-bindings/power/r8a7795-sysc.h
+++ b/include/dt-bindings/power/r8a7795-sysc.h
@@ -30,7 +30,6 @@
 #define R8A7795_PD_CA53_SCU		21
 #define R8A7795_PD_3DG_E		22
 #define R8A7795_PD_A3IR			24
-#define R8A7795_PD_A2VC0		25	/* ES1.x only */
 #define R8A7795_PD_A2VC1		26
 
 /* Always-on power area */
-- 
cgit v1.2.3


From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Mar 2023 22:01:49 -0800
Subject: bpf: ensure state checkpointing at iter_next() call sites

State equivalence check and checkpointing performed in is_state_visited()
employs certain heuristics to try to save memory by avoiding state checkpoints
if not enough jumps and instructions happened since last checkpoint. This leads
to unpredictability of whether a particular instruction will be checkpointed
and how regularly. While normally this is not causing much problems (except
inconveniences for predictable verifier tests, which we overcome with
BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded
iterators.

Checking and saving state checkpoints at iter_next() call is crucial for fast
convergence of open-coded iterator loop logic, so we need to force it. If we
don't do that, is_state_visited() might skip saving a checkpoint, causing
unnecessarily long sequence of not checkpointed instructions and jumps, leading
to exhaustion of jump history buffer, and potentially other undesired outcomes.
It is expected that with correct open-coded iterators convergence will happen
quickly, so we don't run a risk of exhausting memory.

This patch adds, in addition to prune and jump instruction marks, also a
"forced checkpoint" mark, and makes sure that any iter_next() call instruction
is marked as such.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  6 +++++-
 kernel/bpf/verifier.c        | 31 ++++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c052bc79940..81d525d057c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -477,8 +477,12 @@ struct bpf_insn_aux_data {
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
-	bool prune_point;
 	bool jmp_point;
+	bool prune_point;
+	/* ensure we check state equivalence and save state checkpoint and
+	 * this instruction, regardless of any heuristics
+	 */
+	bool force_checkpoint;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45a082284464..13fd4c893f3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13865,6 +13865,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].prune_point;
 }
 
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+	env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
 enum {
 	DONE_EXPLORING = 0,
 	KEEP_EXPLORING = 1,
@@ -13984,8 +13995,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			struct bpf_kfunc_call_arg_meta meta;
 
 			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
-			if (ret == 0 && is_iter_next_kfunc(&meta))
+			if (ret == 0 && is_iter_next_kfunc(&meta)) {
 				mark_prune_point(env, t);
+				/* Checking and saving state checkpoints at iter_next() call
+				 * is crucial for fast convergence of open-coded iterator loop
+				 * logic, so we need to force it. If we don't do that,
+				 * is_state_visited() might skip saving a checkpoint, causing
+				 * unnecessarily long sequence of not checkpointed
+				 * instructions and jumps, leading to exhaustion of jump
+				 * history buffer, and potentially other undesired outcomes.
+				 * It is expected that with correct open-coded iterators
+				 * convergence will happen quickly, so we don't run a risk of
+				 * exhausting memory.
+				 */
+				mark_force_checkpoint(env, t);
+			}
 		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
@@ -15172,7 +15196,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
-	bool add_new_state = env->test_state_freq ? true : false;
+	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+	bool add_new_state = force_new_state;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -15269,7 +15294,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
 skip_inf_loop_check:
-			if (!env->test_state_freq &&
+			if (!force_new_state &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
 				add_new_state = false;
-- 
cgit v1.2.3


From 0b04d4c0542e8573a837b1d81b94209e48723b25 Mon Sep 17 00:00:00 2001
From: Douglas Raillard <douglas.raillard@arm.com>
Date: Mon, 6 Mar 2023 12:25:49 +0000
Subject: f2fs: Fix f2fs_truncate_partial_nodes ftrace event

Fix the nid_t field so that its size is correctly reported in the text
format embedded in trace.dat files. As it stands, it is reported as
being of size 4:

        field:nid_t nid[3];     offset:24;      size:4; signed:0;

Instead of 12:

        field:nid_t nid[3];     offset:24;      size:12;        signed:0;

This also fixes the reported offset of subsequent fields so that they
match with the actual struct layout.

Signed-off-by: Douglas Raillard <douglas.raillard@arm.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/trace/events/f2fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 1322d34a5dfc..99cbc5949e3c 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -512,7 +512,7 @@ TRACE_EVENT(f2fs_truncate_partial_nodes,
 	TP_STRUCT__entry(
 		__field(dev_t,	dev)
 		__field(ino_t,	ino)
-		__field(nid_t,	nid[3])
+		__array(nid_t,	nid, 3)
 		__field(int,	depth)
 		__field(int,	err)
 	),
-- 
cgit v1.2.3


From 557aafac11530a283bebf2dea4cec62765d8df0f Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 10 Mar 2023 10:55:59 -0800
Subject: kernel/module: add documentation for try_module_get()

There is quite a bit of tribal knowledge around proper use of try_module_get()
and requiring *somehow* the module to still exist to use this call in a way
that is safe. Document this bit of tribal knowledge. To be clear, you should
only use try_module_get() *iff* you are 100% sure the module already does
exist and is not on its way out.

You can be sure the module still exists and is alive through:

1) Direct protection with its refcount: you know some earlier caller called
   __module_get() safely
2) Implied protection: there is an implied protection against module removal

Having an idea of when you are sure __module_get() might be called earlier is
easy to understand however the implied protection requires an example. We use
sysfs an an example for implied protection without a direct module reference
count bump. kernfs / sysfs uses its own internal reference counting for files
being actively used, when such file are active they completely prevent
the module from being removed. kernfs protects this with its kernfs_active().
Effort has been put into verifying the kernfs implied protection works by
using a currently out-of-tree test_sysfs selftest test #32 [0]:

./tools/testing/selftests/sysfs/sysfs.sh -t 0032

Without kernfs / sysfs preventing module removal through its active reference
count (kernfs_active()) the write would fail or worse, a crash would happen in
this test and it does not.

Similar safeguards are required for other users of try_module_get() *iff*
they are not ensuring the above rule 1) is followed.

[0] https://lore.kernel.org/lkml/20211029184500.2821444-4-mcgrof@kernel.org/

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module.h | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 91726444d55f..c3b357196470 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -671,10 +671,46 @@ void symbol_put_addr(void *addr);
    to handle the error case (which only happens with rmmod --wait). */
 extern void __module_get(struct module *module);
 
-/* This is the Right Way to get a module: if it fails, it's being removed,
- * so pretend it's not there. */
+/**
+ * try_module_get() - take module refcount unless module is being removed
+ * @module: the module we should check for
+ *
+ * Only try to get a module reference count if the module is not being removed.
+ * This call will fail if the module is already being removed.
+ *
+ * Care must also be taken to ensure the module exists and is alive prior to
+ * usage of this call. This can be gauranteed through two means:
+ *
+ * 1) Direct protection: you know an earlier caller must have increased the
+ *    module reference through __module_get(). This can typically be achieved
+ *    by having another entity other than the module itself increment the
+ *    module reference count.
+ *
+ * 2) Implied protection: there is an implied protection against module
+ *    removal. An example of this is the implied protection used by kernfs /
+ *    sysfs. The sysfs store / read file operations are guaranteed to exist
+ *    through the use of kernfs's active reference (see kernfs_active()) and a
+ *    sysfs / kernfs file removal cannot happen unless the same file is not
+ *    active. Therefore, if a sysfs file is being read or written to the module
+ *    which created it must still exist. It is therefore safe to use
+ *    try_module_get() on module sysfs store / read ops.
+ *
+ * One of the real values to try_module_get() is the module_is_live() check
+ * which ensures that the caller of try_module_get() can yield to userspace
+ * module removal requests and gracefully fail if the module is on its way out.
+ *
+ * Returns true if the reference count was successfully incremented.
+ */
 extern bool try_module_get(struct module *module);
 
+/**
+ * module_put() - release a reference count to a module
+ * @module: the module we should release a reference count for
+ *
+ * If you successfully bump a reference count to a module with try_module_get(),
+ * when you are finished you must call module_put() to release that reference
+ * count.
+ */
 extern void module_put(struct module *module);
 
 #else /*!CONFIG_MODULE_UNLOAD*/
-- 
cgit v1.2.3


From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:20 -0800
Subject: bpf: Move a few bpf_local_storage functions to static scope

This patch moves the bpf_local_storage_free_rcu() and
bpf_selem_unlink_map() to static because they are
not used outside of bpf_local_storage.c.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 3 ---
 kernel/bpf/bpf_local_storage.c    | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index d934248b8e81..502ad7093f13 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
@@ -163,7 +161,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index d3ba3f2db640..1904a4245ebe 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -95,7 +95,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
@@ -251,7 +251,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 	hlist_add_head_rcu(&selem->snode, &local_storage->list);
 }
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
-- 
cgit v1.2.3


From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:21 -0800
Subject: bpf: Refactor codes into bpf_local_storage_destroy

This patch first renames bpf_local_storage_unlink_nolock to
bpf_local_storage_destroy(). It better reflects that it is only
used when the storage's owner (sk/task/cgrp/inode) is being kfree().

All bpf_local_storage_destroy's caller is taking the spin lock and
then free the storage. This patch also moves these two steps into
the bpf_local_storage_destroy.

This is a preparation work for a later patch that uses
bpf_mem_cache_alloc/free in the bpf_local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 2 +-
 kernel/bpf/bpf_cgrp_storage.c     | 9 +--------
 kernel/bpf/bpf_inode_storage.c    | 8 +-------
 kernel/bpf/bpf_local_storage.c    | 8 ++++++--
 kernel/bpf/bpf_task_storage.c     | 9 +--------
 net/core/bpf_sk_storage.c         | 8 +-------
 6 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 502ad7093f13..5908a954ddc2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage);
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
 
 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 9ae07aedaf23..492594d69a86 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
 void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_cgroup_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
 	}
 
 	bpf_cgrp_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_cgrp_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_cgroup_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static struct bpf_local_storage_data *
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 43e2619c8167..2d25bcfa371b 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 void bpf_inode_storage_free(struct inode *inode)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_inode_storage = false;
 	struct bpf_storage_blob *bsb;
 
 	bsb = bpf_inode(inode);
@@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode)
 		return;
 	}
 
-	raw_spin_lock_bh(&local_storage->lock);
-	free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_bh(&local_storage->lock);
+	bpf_local_storage_destroy(local_storage);
 	rcu_read_unlock();
-
-	if (free_inode_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 1904a4245ebe..e19f9f50a60d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -652,11 +652,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 	return 0;
 }
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
 	struct bpf_local_storage_elem *selem;
 	bool free_storage = false;
 	struct hlist_node *n;
+	unsigned long flags;
 
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
@@ -667,6 +668,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 	 * when unlinking elem from the local_storage->list and
 	 * the map's bucket->list.
 	 */
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
 		/* Always unlink from map before unlinking from
 		 * local_storage.
@@ -681,8 +683,10 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 		free_storage = bpf_selem_unlink_storage_nolock(
 			local_storage, selem, false, false);
 	}
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	return free_storage;
+	if (free_storage)
+		kfree_rcu(local_storage, rcu);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 20f942229f3c..4dcef28744d1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 void bpf_task_storage_free(struct task_struct *task)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_task_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
 	}
 
 	bpf_task_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_task_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_task_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 7a36353dbc22..8f56438c104b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -49,7 +49,6 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_local_storage *sk_storage;
-	bool free_sk_storage = false;
 
 	rcu_read_lock();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -58,13 +57,8 @@ void bpf_sk_storage_free(struct sock *sk)
 		return;
 	}
 
-	raw_spin_lock_bh(&sk_storage->lock);
-	free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage);
-	raw_spin_unlock_bh(&sk_storage->lock);
+	bpf_local_storage_destroy(sk_storage);
 	rcu_read_unlock();
-
-	if (free_sk_storage)
-		kfree_rcu(sk_storage, rcu);
 }
 
 static void bpf_sk_storage_map_free(struct bpf_map *map)
-- 
cgit v1.2.3


From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:24 -0800
Subject: bpf: Remember smap in bpf_local_storage

This patch remembers which smap triggers the allocation
of a 'struct bpf_local_storage' object. The local_storage is
allocated during the very first selem added to the owner.
The smap pointer is needed when using the bpf_mem_cache_free
in a later patch because it needs to free to the correct
smap's bpf_mem_alloc object.

When a selem is being removed, it needs to check if it is
the selem that triggers the creation of the local_storage.
If it is, the local_storage->smap pointer will be reset to NULL.
This NULL reset is done under the local_storage->lock in
bpf_selem_unlink_storage_nolock() when a selem is being removed.
Also note that the local_storage may not go away even
local_storage->smap is NULL because there may be other
selem still stored in the local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 1 +
 kernel/bpf/bpf_local_storage.c    | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 5908a954ddc2..613b1805ed9f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -83,6 +83,7 @@ struct bpf_local_storage_elem {
 
 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct bpf_local_storage_map __rcu *smap;
 	struct hlist_head list; /* List of bpf_local_storage_elem */
 	void *owner;		/* The object that owns the above "list" of
 				 * bpf_local_storage_elem.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 70df8dcb2066..5585dbfd9c66 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -213,6 +213,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 			kfree_rcu(selem, rcu);
 	}
 
+	if (rcu_access_pointer(local_storage->smap) == smap)
+		RCU_INIT_POINTER(local_storage->smap, NULL);
+
 	return free_local_storage;
 }
 
@@ -368,6 +371,7 @@ int bpf_local_storage_alloc(void *owner,
 		goto uncharge;
 	}
 
+	RCU_INIT_POINTER(storage->smap, smap);
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
 	storage->owner = owner;
-- 
cgit v1.2.3


From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:25 -0800
Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage

This patch re-purpose the use_trace_rcu to mean
if the freed memory can be reused immediately or not.
The use_trace_rcu is renamed to reuse_now. Other than
the boolean test is reversed, it should be a no-op.

The following explains the reason for the rename and how it will
be used in a later patch.

In a later patch, bpf_mem_cache_alloc/free will be used
in the bpf_local_storage. The bpf mem allocator will reuse
the freed memory immediately. Some of the free paths in
bpf_local_storage does not support memory to be reused immediately.
These paths are the "delete" elem cases from the bpf_*_storage_delete()
helper and the map_delete_elem() syscall. Note that "delete" elem
before the owner's (sk/task/cgrp/inode) lifetime ended is not
the common usage for the local storage.

The common free path, bpf_local_storage_destroy(), can reuse the
memory immediately. This common path means the storage stays with
its owner until the owner is destroyed.

The above mentioned "delete" elem paths that cannot
reuse immediately always has the 'use_trace_rcu ==  true'.
The cases that is safe for immediate reuse always have
'use_trace_rcu == false'. Instead of adding another arg
in a later patch, this patch re-purpose this arg
to reuse_now and have the test logic reversed.

In a later patch, 'reuse_now == true' will free to the
bpf_mem_cache_free() where the memory can be reused
immediately. 'reuse_now == false' will go through the
call_rcu_tasks_trace().

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  2 +-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 24 ++++++++++++------------
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 613b1805ed9f..18a31add2255 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 492594d69a86..c975cacdd16b 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -121,7 +121,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 	return 0;
 }
 
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2d25bcfa371b..ad2ab0187e45 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -122,7 +122,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 5585dbfd9c66..70c34a948c3c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -147,7 +147,7 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
  */
 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 					    struct bpf_local_storage_elem *selem,
-					    bool uncharge_mem, bool use_trace_rcu)
+					    bool uncharge_mem, bool reuse_now)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -201,7 +201,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	 * any special fields.
 	 */
 	rec = smap->map.record;
-	if (use_trace_rcu) {
+	if (!reuse_now) {
 		if (!IS_ERR_OR_NULL(rec))
 			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
 		else
@@ -220,7 +220,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 }
 
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
-				     bool use_trace_rcu)
+				     bool reuse_now)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -235,11 +235,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, use_trace_rcu);
+			local_storage, selem, true, reuse_now);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage) {
-		if (use_trace_rcu)
+		if (!reuse_now)
 			call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_rcu);
 		else
@@ -284,14 +284,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 {
 	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
 	 * the local_storage.
 	 */
 	bpf_selem_unlink_map(selem);
-	bpf_selem_unlink_storage(selem, use_trace_rcu);
+	bpf_selem_unlink_storage(selem, reuse_now);
 }
 
 /* If cacheit_lockit is false, this lookup function is lockless */
@@ -538,7 +538,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false, true);
+						false, false);
 	}
 
 unlock:
@@ -651,7 +651,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false, false);
+			local_storage, selem, false, true);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
@@ -745,7 +745,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 				migrate_disable();
 				this_cpu_inc(*busy_counter);
 			}
-			bpf_selem_unlink(selem, false);
+			bpf_selem_unlink(selem, true);
 			if (busy_counter) {
 				this_cpu_dec(*busy_counter);
 				migrate_enable();
@@ -783,8 +783,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
 		 * is true, because while call_rcu invocation is skipped in that
 		 * case in bpf_selem_free_fields_trace_rcu (and all local
-		 * storage maps pass use_trace_rcu = true), there can be
-		 * call_rcu callbacks based on use_trace_rcu = false in the
+		 * storage maps pass reuse_now = false), there can be
+		 * call_rcu callbacks based on reuse_now = true in the
 		 * while ((selem = ...)) loop above or when owner's free path
 		 * calls bpf_local_storage_unlink_nolock.
 		 */
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 4dcef28744d1..c88cc04c17c1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -168,7 +168,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	if (!nobusy)
 		return -EBUSY;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 8f56438c104b..a5f185b8e50a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
-- 
cgit v1.2.3


From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:28 -0800
Subject: bpf: Add bpf_selem_free()

This patch refactors the selem freeing logic into bpf_selem_free().
It is a preparation work for a later patch using
bpf_mem_cache_alloc/free. The other kfree(selem) cases
are also changed to bpf_selem_free(..., reuse_now = true).

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  4 ++++
 kernel/bpf/bpf_local_storage.c    | 21 ++++++++++++++-------
 net/core/bpf_sk_storage.c         |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 18a31add2255..a34f61467a2f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -152,6 +152,10 @@ struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now);
+
 int
 bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146e9caeda96..512943aac435 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -125,6 +125,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now)
+{
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	if (!reuse_now)
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	else
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
@@ -175,11 +186,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	bpf_selem_free(selem, smap, reuse_now);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -423,7 +430,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 
 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
-			kfree(selem);
+			bpf_selem_free(selem, smap, true);
 			mem_uncharge(smap, owner, smap->elem_size);
 			return ERR_PTR(err);
 		}
@@ -517,7 +524,7 @@ unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	if (selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		kfree(selem);
+		bpf_selem_free(selem, smap, true);
 	}
 	return ERR_PTR(err);
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index a5f185b8e50a..24c3dc0d62e5 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -197,7 +197,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		} else {
 			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
-				kfree(copy_selem);
+				bpf_selem_free(copy_selem, smap, true);
 				atomic_sub(smap->elem_size,
 					   &newsk->sk_omem_alloc);
 				bpf_map_put(map);
-- 
cgit v1.2.3


From 5fe5a7586c27a13a42b7946334d41a96c776a188 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Fri, 10 Mar 2023 17:07:53 +0100
Subject: Move COMPAT_ATM_ADDPARTY to net/atm/svc.c

This used to be behind an #ifdef COMPAT_COMPAT, so most of userspace
wouldn't have seen the definition before.  Unfortunately this header
file became visible to userspace, so the definition has instead been
moved to net/atm/svc.c (the only user).

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Reviewed-by: Andrew Waterman <waterman@eecs.berkeley.edu>
Reviewed-by: Albert Ou <aou@eecs.berkeley.edu>
Message-Id: <1447119071-19392-4-git-send-email-palmer@dabbelt.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/uapi/linux/atmdev.h | 4 ----
 net/atm/svc.c               | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/atmdev.h b/include/uapi/linux/atmdev.h
index a5c15cf23bd7..20b0215084fc 100644
--- a/include/uapi/linux/atmdev.h
+++ b/include/uapi/linux/atmdev.h
@@ -101,10 +101,6 @@ struct atm_dev_stats {
 					/* use backend to make new if */
 #define ATM_ADDPARTY  	_IOW('a', ATMIOC_SPECIAL+4,struct atm_iobuf)
  					/* add party to p2mp call */
-#ifdef CONFIG_COMPAT
-/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */
-#define COMPAT_ATM_ADDPARTY  	_IOW('a', ATMIOC_SPECIAL+4,struct compat_atm_iobuf)
-#endif
 #define ATM_DROPPARTY 	_IOW('a', ATMIOC_SPECIAL+5,int)
 					/* drop party from p2mp call */
 
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 4a02bcaad279..d83556d8beb9 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -28,6 +28,11 @@
 #include "signaling.h"
 #include "addr.h"
 
+#ifdef CONFIG_COMPAT
+/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */
+#define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf)
+#endif
+
 static int svc_create(struct net *net, struct socket *sock, int protocol,
 		      int kern);
 
-- 
cgit v1.2.3


From 063f3ed9faf38e41449838bb3258a7d00f75c5d1 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Fri, 10 Mar 2023 17:07:54 +0100
Subject: Move ep_take_care_of_epollwakeup() to fs/eventpoll.c

This doesn't make any sense to expose to userspace, so it's been moved
to the one user.  This was introduced by commit 95f19f658ce1 ("epoll:
drop EPOLLWAKEUP if PM_SLEEP is disabled").

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Reviewed-by: Andrew Waterman <waterman@eecs.berkeley.edu>
Reviewed-by: Albert Ou <aou@eecs.berkeley.edu>
Message-Id: <1447119071-19392-7-git-send-email-palmer@dabbelt.com>
[thuth: Rebased to fix contextual conflicts]
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/eventpoll.c                 | 13 +++++++++++++
 include/uapi/linux/eventpoll.h | 12 ------------
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 64659b110973..e2a5d2cc9051 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2042,6 +2042,19 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 	return do_epoll_create(0);
 }
 
+#ifdef CONFIG_PM_SLEEP
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
+		epev->events &= ~EPOLLWAKEUP;
+}
+#else
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+	epev->events &= ~EPOLLWAKEUP;
+}
+#endif
+
 static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
 				   bool nonblock)
 {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index e687658843b1..cfbcc4cc49ac 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -85,16 +85,4 @@ struct epoll_event {
 	__u64 data;
 } EPOLL_PACKED;
 
-#ifdef CONFIG_PM_SLEEP
-static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
-{
-	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
-		epev->events &= ~EPOLLWAKEUP;
-}
-#else
-static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
-{
-	epev->events &= ~EPOLLWAKEUP;
-}
-#endif
 #endif /* _UAPI_LINUX_EVENTPOLL_H */
-- 
cgit v1.2.3


From d3c7ec7588508bd59bbb0e468d3820f3d3cba690 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Fri, 10 Mar 2023 17:07:55 +0100
Subject: Move bp_type_idx to include/linux/hw_breakpoint.h

This has a "#ifdef CONFIG_*" that used to be exposed to userspace.

The names in here are so generic that I don't think it's a good idea
to expose them to userspace (or even the rest of the kernel).  There are
multiple in-kernel users, so it's been moved to a kernel header file.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Reviewed-by: Andrew Waterman <waterman@eecs.berkeley.edu>
Reviewed-by: Albert Ou <aou@eecs.berkeley.edu>
Message-Id: <1447119071-19392-10-git-send-email-palmer@dabbelt.com>
[thuth: Remove it also from tools/include/uapi/linux/hw_breakpoint.h]
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/hw_breakpoint.h            | 10 ++++++++++
 include/uapi/linux/hw_breakpoint.h       | 10 ----------
 tools/include/uapi/linux/hw_breakpoint.h | 10 ----------
 3 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index f319bd26b030..7fbb45911273 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -7,6 +7,16 @@
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
+enum bp_type_idx {
+	TYPE_INST	= 0,
+#if defined(CONFIG_HAVE_MIXED_BREAKPOINTS_REGS)
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 extern int __init init_hw_breakpoint(void);
 
 static inline void hw_breakpoint_init(struct perf_event_attr *attr)
diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h
index 965e4d8606d8..1575d3ca6f0d 100644
--- a/include/uapi/linux/hw_breakpoint.h
+++ b/include/uapi/linux/hw_breakpoint.h
@@ -22,14 +22,4 @@ enum {
 	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
-
 #endif /* _UAPI_LINUX_HW_BREAKPOINT_H */
diff --git a/tools/include/uapi/linux/hw_breakpoint.h b/tools/include/uapi/linux/hw_breakpoint.h
index 965e4d8606d8..1575d3ca6f0d 100644
--- a/tools/include/uapi/linux/hw_breakpoint.h
+++ b/tools/include/uapi/linux/hw_breakpoint.h
@@ -22,14 +22,4 @@ enum {
 	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
-
 #endif /* _UAPI_LINUX_HW_BREAKPOINT_H */
-- 
cgit v1.2.3


From f5bdc61eb6089e10181b8f51b0a180bbd47a89fc Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Fri, 10 Mar 2023 17:07:56 +0100
Subject: pktcdvd: Remove CONFIG_CDROM_PKTCDVD_WCACHE from uapi header

CONFIG_* switches should not be exposed in uapi headers, thus let's get
rid of the USE_WCACHING macro here (which was also named way to generic)
and integrate the logic directly in the only function that needs it.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/block/pktcdvd.c      | 13 +++++++++----
 include/uapi/linux/pktcdvd.h | 11 -----------
 2 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 2f1a92509271..5ae2a80db2c3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1869,12 +1869,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
 /*
  * enable/disable write caching on drive
  */
-static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
-						int set)
+static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd)
 {
 	struct packet_command cgc;
 	struct scsi_sense_hdr sshdr;
 	unsigned char buf[64];
+	bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE);
 	int ret;
 
 	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
@@ -1890,7 +1890,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
 	if (ret)
 		return ret;
 
-	buf[pd->mode_offset + 10] |= (!!set << 2);
+	/*
+	 * use drive write caching -- we need deferred error handling to be
+	 * able to successfully recover with this option (drive will return good
+	 * status as soon as the cdb is validated).
+	 */
+	buf[pd->mode_offset + 10] |= (set << 2);
 
 	cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
 	ret = pkt_mode_select(pd, &cgc);
@@ -2085,7 +2090,7 @@ static int pkt_open_write(struct pktcdvd_device *pd)
 		return -EIO;
 	}
 
-	pkt_write_caching(pd, USE_WCACHING);
+	pkt_write_caching(pd);
 
 	ret = pkt_get_max_speed(pd, &write_speed);
 	if (ret)
diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h
index 9cbb55d21c94..6a5552dfd6af 100644
--- a/include/uapi/linux/pktcdvd.h
+++ b/include/uapi/linux/pktcdvd.h
@@ -29,17 +29,6 @@
  */
 #define PACKET_WAIT_TIME	(HZ * 5 / 1000)
 
-/*
- * use drive write caching -- we need deferred error handling to be
- * able to successfully recover with this option (drive will return good
- * status as soon as the cdb is validated).
- */
-#if defined(CONFIG_CDROM_PKTCDVD_WCACHE)
-#define USE_WCACHING		1
-#else
-#define USE_WCACHING		0
-#endif
-
 /*
  * No user-servicable parts beyond this point ->
  */
-- 
cgit v1.2.3


From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 9 Mar 2023 10:01:08 -0800
Subject: bpf: Change btf_record_find enum parameter to field_mask

btf_record_find's 3rd parameter can be multiple enum btf_field_type's
masked together. The function is called with BPF_KPTR in two places in
verifier.c, so it works with masked values already.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 +-
 kernel/bpf/syscall.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e64ff1e89fb2..3a38db315f7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog);
 void bpf_map_free_id(struct bpf_map *map);
 
 struct btf_field *btf_record_find(const struct btf_record *rec,
-				  u32 offset, enum btf_field_type type);
+				  u32 offset, u32 field_mask);
 void btf_record_free(struct btf_record *rec);
 void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f406dfa13792..cc4b7684910c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b)
 }
 
 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
-				  enum btf_field_type type)
+				  u32 field_mask)
 {
 	struct btf_field *field;
 
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
 		return NULL;
 	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
-	if (!field || !(field->type & type))
+	if (!field || !(field->type & field_mask))
 		return NULL;
 	return field;
 }
-- 
cgit v1.2.3


From 887d85a0736ff346cbfe5efaf51cf20c7ca195a3 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 8 Mar 2023 20:39:50 +0000
Subject: kunit: fix bug in debugfs logs of parameterized tests

Fix bug in debugfs logs that causes individual parameterized results to not
appear because the log is reinitialized (cleared) when each parameter is
run.

Ensure these results appear in the debugfs logs, increase log size to
allow for the size of parameterized results. As a result, append lines to
the log directly rather than using an intermediate variable that can cause
stack size warnings due to the increased log size.

Here is the debugfs log of ext4_inode_test which uses parameterized tests
before the fix:

     KTAP version 1

     # Subtest: ext4_inode_test
     1..1
 # Totals: pass:16 fail:0 skip:0 total:16
 ok 1 ext4_inode_test

As you can see, this log does not include any of the individual
parametrized results.

After (in combination with the next two fixes to remove extra empty line
and ensure KTAP valid format):

 KTAP version 1
 1..1
     KTAP version 1
     # Subtest: ext4_inode_test
     1..1
        KTAP version 1
         # Subtest: inode_test_xtimestamp_decoding
         ok 1 1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits
         ... (the rest of the individual parameterized tests)
         ok 16 2446-05-10 Upper bound of 32bit >=0 timestamp. All extra
     # inode_test_xtimestamp_decoding: pass:16 fail:0 skip:0 total:16
     ok 1 inode_test_xtimestamp_decoding
 # Totals: pass:16 fail:0 skip:0 total:16
 ok 1 ext4_inode_test

Signed-off-by: Rae Moar <rmoar@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h |  2 +-
 lib/kunit/test.c     | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 08d3559dd703..0668d29f3453 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -34,7 +34,7 @@ DECLARE_STATIC_KEY_FALSE(kunit_running);
 struct kunit;
 
 /* Size of log associated with test. */
-#define KUNIT_LOG_SIZE	512
+#define KUNIT_LOG_SIZE 1500
 
 /* Maximum size of parameter description string. */
 #define KUNIT_PARAM_DESC_SIZE 128
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index c9e15bb60058..c4d6304edd61 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -114,22 +114,27 @@ static void kunit_print_test_stats(struct kunit *test,
  */
 void kunit_log_append(char *log, const char *fmt, ...)
 {
-	char line[KUNIT_LOG_SIZE];
 	va_list args;
-	int len_left;
+	int len, log_len, len_left;
 
 	if (!log)
 		return;
 
-	len_left = KUNIT_LOG_SIZE - strlen(log) - 1;
+	log_len = strlen(log);
+	len_left = KUNIT_LOG_SIZE - log_len - 1;
 	if (len_left <= 0)
 		return;
 
+	/* Evaluate length of line to add to log */
 	va_start(args, fmt);
-	vsnprintf(line, sizeof(line), fmt, args);
+	len = vsnprintf(NULL, 0, fmt, args) + 1;
+	va_end(args);
+
+	/* Print formatted line to the log */
+	va_start(args, fmt);
+	vsnprintf(log + log_len, min(len, len_left), fmt, args);
 	va_end(args);
 
-	strncat(log, line, len_left);
 }
 EXPORT_SYMBOL_GPL(kunit_log_append);
 
@@ -437,7 +442,6 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
 	struct kunit_try_catch_context context;
 	struct kunit_try_catch *try_catch;
 
-	kunit_init_test(test, test_case->name, test_case->log);
 	try_catch = &test->try_catch;
 
 	kunit_try_catch_init(try_catch,
@@ -533,6 +537,8 @@ int kunit_run_tests(struct kunit_suite *suite)
 		struct kunit_result_stats param_stats = { 0 };
 		test_case->status = KUNIT_SKIPPED;
 
+		kunit_init_test(&test, test_case->name, test_case->log);
+
 		if (!test_case->generate_params) {
 			/* Non-parameterised test. */
 			kunit_run_case_catch_errors(suite, test_case, &test);
-- 
cgit v1.2.3


From 2c6a96dad5797e57b4cf04101d6c8d5c7a571603 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 8 Mar 2023 20:39:52 +0000
Subject: kunit: fix bug of extra newline characters in debugfs logs

Fix bug of the extra newline characters in debugfs logs. When a
line is added to debugfs with a newline character at the end,
an extra line appears in the debugfs log.

This is due to a discrepancy between how the lines are printed and how they
are added to the logs. Remove this discrepancy by checking if a newline
character is present before adding a newline character. This should closely
match the printk behavior.

Add kunit_log_newline_test to provide test coverage for this issue.  (Also,
move kunit_log_test above suite definition to remove the unnecessary
declaration prior to the suite definition)

As an example, say we add these two lines to the log:

kunit_log(..., "KTAP version 1\n");
kunit_log(..., "1..1");

The debugfs log before this fix:

 KTAP version 1

 1..1

The debugfs log after this fix:

 KTAP version 1
 1..1

Signed-off-by: Rae Moar <rmoar@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h   |  2 +-
 lib/kunit/kunit-test.c | 35 +++++++++++++++++++++++------------
 lib/kunit/test.c       | 18 ++++++++++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 0668d29f3453..9721584027d8 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -420,7 +420,7 @@ void __printf(2, 3) kunit_log_append(char *log, const char *fmt, ...);
 #define kunit_log(lvl, test_or_suite, fmt, ...)				\
 	do {								\
 		printk(lvl fmt, ##__VA_ARGS__);				\
-		kunit_log_append((test_or_suite)->log,	fmt "\n",	\
+		kunit_log_append((test_or_suite)->log,	fmt,		\
 				 ##__VA_ARGS__);			\
 	} while (0)
 
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 4df0335d0d06..b63595d3e241 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -443,18 +443,6 @@ static struct kunit_suite kunit_resource_test_suite = {
 	.test_cases = kunit_resource_test_cases,
 };
 
-static void kunit_log_test(struct kunit *test);
-
-static struct kunit_case kunit_log_test_cases[] = {
-	KUNIT_CASE(kunit_log_test),
-	{}
-};
-
-static struct kunit_suite kunit_log_test_suite = {
-	.name = "kunit-log-test",
-	.test_cases = kunit_log_test_cases,
-};
-
 static void kunit_log_test(struct kunit *test)
 {
 	struct kunit_suite suite;
@@ -481,6 +469,29 @@ static void kunit_log_test(struct kunit *test)
 #endif
 }
 
+static void kunit_log_newline_test(struct kunit *test)
+{
+	kunit_info(test, "Add newline\n");
+	if (test->log) {
+		KUNIT_ASSERT_NOT_NULL_MSG(test, strstr(test->log, "Add newline\n"),
+			"Missing log line, full log:\n%s", test->log);
+		KUNIT_EXPECT_NULL(test, strstr(test->log, "Add newline\n\n"));
+	} else {
+		kunit_skip(test, "only useful when debugfs is enabled");
+	}
+}
+
+static struct kunit_case kunit_log_test_cases[] = {
+	KUNIT_CASE(kunit_log_test),
+	KUNIT_CASE(kunit_log_newline_test),
+	{}
+};
+
+static struct kunit_suite kunit_log_test_suite = {
+	.name = "kunit-log-test",
+	.test_cases = kunit_log_test_cases,
+};
+
 static void kunit_status_set_failure_test(struct kunit *test)
 {
 	struct kunit fake;
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 811fcc376d2f..e2910b261112 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -108,6 +108,22 @@ static void kunit_print_test_stats(struct kunit *test,
 		  stats.total);
 }
 
+/**
+ * kunit_log_newline() - Add newline to the end of log if one is not
+ * already present.
+ * @log: The log to add the newline to.
+ */
+static void kunit_log_newline(char *log)
+{
+	int log_len, len_left;
+
+	log_len = strlen(log);
+	len_left = KUNIT_LOG_SIZE - log_len - 1;
+
+	if (log_len > 0 && log[log_len - 1] != '\n')
+		strncat(log, "\n", len_left);
+}
+
 /*
  * Append formatted message to log, size of which is limited to
  * KUNIT_LOG_SIZE bytes (including null terminating byte).
@@ -135,6 +151,8 @@ void kunit_log_append(char *log, const char *fmt, ...)
 	vsnprintf(log + log_len, min(len, len_left), fmt, args);
 	va_end(args);
 
+	/* Add newline to end of log if not already present. */
+	kunit_log_newline(log);
 }
 EXPORT_SYMBOL_GPL(kunit_log_append);
 
-- 
cgit v1.2.3


From 42994ee3cd7298b27698daa6848ed7168e72d056 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Fri, 10 Mar 2023 09:53:59 +0100
Subject: security: Introduce LSM_ORDER_LAST and set it for the integrity LSM

Introduce LSM_ORDER_LAST, to satisfy the requirement of LSMs needing to be
last, e.g. the 'integrity' LSM, without changing the kernel command line or
configuration.

Also, set this order for the 'integrity' LSM. While not enforced, this is
the only LSM expected to use it.

Similarly to LSM_ORDER_FIRST, LSMs with LSM_ORDER_LAST are always enabled
and put at the end of the LSM list, if selected in the kernel
configuration. Setting one of these orders alone, does not cause the LSMs
to be selected and compiled built-in in the kernel.

Finally, for LSM_ORDER_MUTABLE LSMs, set the found variable to true if an
LSM is found, regardless of its order. In this way, the kernel would not
wrongly report that the LSM is not built-in in the kernel if its order is
LSM_ORDER_LAST.

Fixes: 79f7865d844c ("LSM: Introduce "lsm=" for boottime LSM selection")
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h |  1 +
 security/integrity/iint.c |  1 +
 security/security.c       | 12 +++++++++---
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index ddbbe89a7a48..c2be66c669ae 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -92,6 +92,7 @@ extern void security_add_hooks(struct security_hook_list *hooks, int count,
 enum lsm_order {
 	LSM_ORDER_FIRST = -1,	/* This is only for capabilities. */
 	LSM_ORDER_MUTABLE = 0,
+	LSM_ORDER_LAST = 1,	/* This is only for integrity. */
 };
 
 struct lsm_info {
diff --git a/security/integrity/iint.c b/security/integrity/iint.c
index 8638976f7990..b97eb59e0e32 100644
--- a/security/integrity/iint.c
+++ b/security/integrity/iint.c
@@ -182,6 +182,7 @@ static int __init integrity_iintcache_init(void)
 DEFINE_LSM(integrity) = {
 	.name = "integrity",
 	.init = integrity_iintcache_init,
+	.order = LSM_ORDER_LAST,
 };
 
 
diff --git a/security/security.c b/security/security.c
index e6c275fff001..b808e1b86551 100644
--- a/security/security.c
+++ b/security/security.c
@@ -285,9 +285,9 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		bool found = false;
 
 		for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-			if (lsm->order == LSM_ORDER_MUTABLE &&
-			    strcmp(lsm->name, name) == 0) {
-				append_ordered_lsm(lsm, origin);
+			if (strcmp(lsm->name, name) == 0) {
+				if (lsm->order == LSM_ORDER_MUTABLE)
+					append_ordered_lsm(lsm, origin);
 				found = true;
 			}
 		}
@@ -307,6 +307,12 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		}
 	}
 
+	/* LSM_ORDER_LAST is always last. */
+	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+		if (lsm->order == LSM_ORDER_LAST)
+			append_ordered_lsm(lsm, "   last");
+	}
+
 	/* Disable all LSMs not in the ordered list. */
 	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
 		if (exists_ordered_lsm(lsm))
-- 
cgit v1.2.3


From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 10 Mar 2023 15:07:41 -0800
Subject: bpf: Support __kptr to local kptrs

If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module
BTF - it must have been allocated by bpf_obj_new and therefore must be
free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local
kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new.

This patch adds support for treating __kptr-tagged pointers to "local
kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr
acquire / release semantics. Consider the following example:

  struct node_data {
          long key;
          long data;
          struct bpf_rb_node node;
  };

  struct map_value {
          struct node_data __kptr *node;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __type(key, int);
          __type(value, struct map_value);
          __uint(max_entries, 1);
  } some_nodes SEC(".maps");

If struct node_data had a matching definition in kernel BTF, the verifier would
expect a destructor for the type to be registered. Since struct node_data does
not match any type in kernel BTF, the verifier knows that there is no kfunc
that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can
only come from bpf_obj_new. So instead of searching for a registered dtor,
a bpf_obj_drop dtor can be assumed.

This allows the runtime to properly destruct such kptrs in
bpf_obj_free_fields, which enables maps to clean up map_vals w/ such
kptrs when going away.

Implementation notes:
  * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr.
    Before this patch, the variable would only ever point to vmlinux or
    module BTFs, but now it can point to some program BTF for local kptr
    type. It's later used to populate the (btf, btf_id) pair in kptr btf
    field.
  * It's necessary to btf_get the program BTF when populating btf_field
    for local kptr. btf_record_free later does a btf_put.
  * Behavior for non-local referenced kptrs is not modified, as
    bpf_find_btf_id helper only searches vmlinux and module BTFs for
    matching BTF type. If such a type is found, btf_field_kptr's btf will
    pass btf_is_kernel check, and the associated release function is
    some one-argument dtor. If btf_is_kernel check fails, associated
    release function is two-arg bpf_obj_drop_impl. Before this patch
    only btf_field_kptr's w/ kernel or module BTFs were created.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 11 ++++++++++-
 include/linux/btf.h  |  2 --
 kernel/bpf/btf.c     | 37 ++++++++++++++++++++++++++++---------
 kernel/bpf/helpers.c | 11 ++++++++---
 kernel/bpf/syscall.c | 14 +++++++++++++-
 5 files changed, 59 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a38db315f7f..756b85f0d0d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -189,10 +189,19 @@ enum btf_field_type {
 				 BPF_RB_NODE | BPF_RB_ROOT,
 };
 
+typedef void (*btf_dtor_kfunc_t)(void *);
+typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
+
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	btf_dtor_kfunc_t dtor;
+	union {
+		/* dtor used if btf_is_kernel(btf), otherwise the type
+		 * is program-allocated and obj_drop is used
+		 */
+		btf_dtor_kfunc_t dtor;
+		btf_dtor_obj_drop obj_drop;
+	};
 	u32 btf_id;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bba0827e8c4..d53b10cc55f2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -121,8 +121,6 @@ struct btf_struct_metas {
 	struct btf_struct_meta types[];
 };
 
-typedef void (*btf_dtor_kfunc_t)(void *);
-
 extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 37779ceefd09..66fad7a16b6c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,12 +3551,17 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
 	struct module *mod = NULL;
 	const struct btf_type *t;
-	struct btf *kernel_btf;
+	/* If a matching btf type is found in kernel or module BTFs, kptr_ref
+	 * is that BTF, otherwise it's program BTF
+	 */
+	struct btf *kptr_btf;
 	int ret;
 	s32 id;
 
@@ -3565,7 +3570,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 	 */
 	t = btf_type_by_id(btf, info->kptr.type_id);
 	id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
-			     &kernel_btf);
+			     &kptr_btf);
+	if (id == -ENOENT) {
+		/* btf_parse_kptr should only be called w/ btf = program BTF */
+		WARN_ON_ONCE(btf_is_kernel(btf));
+
+		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+		 * kptr allocated via bpf_obj_new
+		 */
+		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		id = info->kptr.type_id;
+		kptr_btf = (struct btf *)btf;
+		btf_get(kptr_btf);
+		goto found_dtor;
+	}
 	if (id < 0)
 		return id;
 
@@ -3582,20 +3600,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		 * can be used as a referenced pointer and be stored in a map at
 		 * the same time.
 		 */
-		dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+		dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
 		if (dtor_btf_id < 0) {
 			ret = dtor_btf_id;
 			goto end_btf;
 		}
 
-		dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+		dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
 		if (!dtor_func) {
 			ret = -ENOENT;
 			goto end_btf;
 		}
 
-		if (btf_is_module(kernel_btf)) {
-			mod = btf_try_get_module(kernel_btf);
+		if (btf_is_module(kptr_btf)) {
+			mod = btf_try_get_module(kptr_btf);
 			if (!mod) {
 				ret = -ENXIO;
 				goto end_btf;
@@ -3605,7 +3623,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* We already verified dtor_func to be btf_type_is_func
 		 * in register_btf_id_dtor_kfuncs.
 		 */
-		dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+		dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
 		addr = kallsyms_lookup_name(dtor_func_name);
 		if (!addr) {
 			ret = -EINVAL;
@@ -3614,14 +3632,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		field->kptr.dtor = (void *)addr;
 	}
 
+found_dtor:
 	field->kptr.btf_id = id;
-	field->kptr.btf = kernel_btf;
+	field->kptr.btf = kptr_btf;
 	field->kptr.module = mod;
 	return 0;
 end_mod:
 	module_put(mod);
 end_btf:
-	btf_put(kernel_btf);
+	btf_put(kptr_btf);
 	return ret;
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f9b7eeedce08..77d64b6951b9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1896,14 +1896,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+	if (rec)
+		bpf_obj_free_fields(rec, p);
+	bpf_mem_free(&bpf_global_ma, p);
+}
+
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	if (meta)
-		bpf_obj_free_fields(meta->record, p);
-	bpf_mem_free(&bpf_global_ma, p);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
 }
 
 static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cc4b7684910c..0684febc447a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		return;
 	fields = rec->fields;
 	for (i = 0; i < rec->cnt; i++) {
+		struct btf_struct_meta *pointee_struct_meta;
 		const struct btf_field *field = &fields[i];
 		void *field_ptr = obj + field->offset;
+		void *xchgd_field;
 
 		switch (fields[i].type) {
 		case BPF_SPIN_LOCK:
@@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
-			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+			if (!btf_is_kernel(field->kptr.btf)) {
+				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+									   field->kptr.btf_id);
+				WARN_ON_ONCE(!pointee_struct_meta);
+				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
+								  pointee_struct_meta->record :
+								  NULL);
+			} else {
+				field->kptr.dtor(xchgd_field);
+			}
 			break;
 		case BPF_LIST_HEAD:
 			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
-- 
cgit v1.2.3


From 4ad682e04c1694aba95627f60e89f83178c277c2 Mon Sep 17 00:00:00 2001
From: Mehdi Djait <mehdi.djait.k@gmail.com>
Date: Thu, 2 Mar 2023 14:04:35 +0100
Subject: iio: Improve the kernel-doc of iio_trigger_poll

Move the kernel-doc of the function to industrialio-trigger.c
Add a note on the context where the function is expected to be called.

Signed-off-by: Mehdi Djait <mehdi.djait.k@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/bd84fc17e9d22eab998bf48720297f9a77689f45.1677761379.git.mehdi.djait.k@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-trigger.c | 6 ++++++
 include/linux/iio/trigger.h        | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index a2f3cc2f65ef..270ec0a9f641 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -192,6 +192,12 @@ static void iio_trigger_notify_done_atomic(struct iio_trigger *trig)
 		schedule_work(&trig->reenable_work);
 }
 
+/**
+ * iio_trigger_poll() - Call the IRQ trigger handler of the consumers
+ * @trig: trigger which occurred
+ *
+ * This function should only be called from a hard IRQ context.
+ */
 void iio_trigger_poll(struct iio_trigger *trig)
 {
 	int i;
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index f6360d9a492d..42da55dc3aa7 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -151,12 +151,6 @@ void iio_trigger_unregister(struct iio_trigger *trig_info);
  **/
 int iio_trigger_set_immutable(struct iio_dev *indio_dev, struct iio_trigger *trig);
 
-/**
- * iio_trigger_poll() - called on a trigger occurring
- * @trig:	trigger which occurred
- *
- * Typically called in relevant hardware interrupt handler.
- **/
 void iio_trigger_poll(struct iio_trigger *trig);
 void iio_trigger_poll_chained(struct iio_trigger *trig);
 
-- 
cgit v1.2.3


From f700e55ef6ef9ec723164659ed4385900981c872 Mon Sep 17 00:00:00 2001
From: Mehdi Djait <mehdi.djait.k@gmail.com>
Date: Thu, 2 Mar 2023 14:04:36 +0100
Subject: iio: Rename iio_trigger_poll_chained and add kernel-doc

Rename the function to iio_trigger_poll_nested. Add kernel-doc with
a note on the context where the function is expected to be called.

Signed-off-by: Mehdi Djait <mehdi.djait.k@gmail.com>
Link: https://lore.kernel.org/r/841b533cba28ca25a8e87280c44e45979166e8e2.1677761379.git.mehdi.djait.k@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/bma400_core.c                    |  2 +-
 drivers/iio/accel/kionix-kx022a.c                  |  2 +-
 drivers/iio/accel/mma8452.c                        |  2 +-
 drivers/iio/accel/msa311.c                         |  2 +-
 drivers/iio/adc/ad7606.c                           |  2 +-
 drivers/iio/adc/at91-sama5d2_adc.c                 |  2 +-
 drivers/iio/adc/max11410.c                         |  2 +-
 drivers/iio/common/st_sensors/st_sensors_trigger.c |  4 ++--
 drivers/iio/gyro/fxas21002c_core.c                 |  2 +-
 drivers/iio/gyro/mpu3050-core.c                    |  2 +-
 drivers/iio/humidity/hts221_buffer.c               |  2 +-
 drivers/iio/industrialio-trigger.c                 | 11 +++++++++--
 drivers/iio/light/acpi-als.c                       |  2 +-
 drivers/iio/light/rpr0521.c                        |  2 +-
 drivers/iio/light/st_uvis25_core.c                 |  2 +-
 drivers/iio/light/vcnl4000.c                       |  2 +-
 drivers/iio/light/vcnl4035.c                       |  2 +-
 drivers/iio/potentiostat/lmp91000.c                |  2 +-
 drivers/iio/pressure/zpa2326.c                     |  2 +-
 drivers/iio/proximity/as3935.c                     |  2 +-
 drivers/iio/trigger/iio-trig-loop.c                |  2 +-
 include/linux/iio/trigger.h                        |  2 +-
 22 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/accel/bma400_core.c b/drivers/iio/accel/bma400_core.c
index 623f37cbaf50..a68b845f5b4f 100644
--- a/drivers/iio/accel/bma400_core.c
+++ b/drivers/iio/accel/bma400_core.c
@@ -1688,7 +1688,7 @@ static irqreturn_t bma400_interrupt(int irq, void *private)
 
 	if (FIELD_GET(BMA400_INT_DRDY_MSK, le16_to_cpu(data->status))) {
 		mutex_unlock(&data->mutex);
-		iio_trigger_poll_chained(data->trig);
+		iio_trigger_poll_nested(data->trig);
 		return IRQ_HANDLED;
 	}
 
diff --git a/drivers/iio/accel/kionix-kx022a.c b/drivers/iio/accel/kionix-kx022a.c
index 9f8c7631f098..8dac0337c249 100644
--- a/drivers/iio/accel/kionix-kx022a.c
+++ b/drivers/iio/accel/kionix-kx022a.c
@@ -899,7 +899,7 @@ static irqreturn_t kx022a_irq_thread_handler(int irq, void *private)
 	mutex_lock(&data->mutex);
 
 	if (data->trigger_enabled) {
-		iio_trigger_poll_chained(data->trig);
+		iio_trigger_poll_nested(data->trig);
 		ret = IRQ_HANDLED;
 	}
 
diff --git a/drivers/iio/accel/mma8452.c b/drivers/iio/accel/mma8452.c
index f97fb68e3a71..ea14e3aaa30a 100644
--- a/drivers/iio/accel/mma8452.c
+++ b/drivers/iio/accel/mma8452.c
@@ -1067,7 +1067,7 @@ static irqreturn_t mma8452_interrupt(int irq, void *p)
 		return IRQ_NONE;
 
 	if (src & MMA8452_INT_DRDY) {
-		iio_trigger_poll_chained(indio_dev->trig);
+		iio_trigger_poll_nested(indio_dev->trig);
 		ret = IRQ_HANDLED;
 	}
 
diff --git a/drivers/iio/accel/msa311.c b/drivers/iio/accel/msa311.c
index af94d3adf6d8..6690fa37da8f 100644
--- a/drivers/iio/accel/msa311.c
+++ b/drivers/iio/accel/msa311.c
@@ -951,7 +951,7 @@ static irqreturn_t msa311_irq_thread(int irq, void *p)
 	}
 
 	if (new_data_int_enabled)
-		iio_trigger_poll_chained(msa311->new_data_trig);
+		iio_trigger_poll_nested(msa311->new_data_trig);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/iio/adc/ad7606.c b/drivers/iio/adc/ad7606.c
index dd6b603f65ea..1928d9ae5bcf 100644
--- a/drivers/iio/adc/ad7606.c
+++ b/drivers/iio/adc/ad7606.c
@@ -477,7 +477,7 @@ static irqreturn_t ad7606_interrupt(int irq, void *dev_id)
 
 	if (iio_buffer_enabled(indio_dev)) {
 		gpiod_set_value(st->gpio_convst, 0);
-		iio_trigger_poll_chained(st->trig);
+		iio_trigger_poll_nested(st->trig);
 	} else {
 		complete(&st->completion);
 	}
diff --git a/drivers/iio/adc/at91-sama5d2_adc.c b/drivers/iio/adc/at91-sama5d2_adc.c
index 50d02e5fc6fc..b5d0c9ee284c 100644
--- a/drivers/iio/adc/at91-sama5d2_adc.c
+++ b/drivers/iio/adc/at91-sama5d2_adc.c
@@ -1194,7 +1194,7 @@ static void at91_dma_buffer_done(void *data)
 {
 	struct iio_dev *indio_dev = data;
 
-	iio_trigger_poll_chained(indio_dev->trig);
+	iio_trigger_poll_nested(indio_dev->trig);
 }
 
 static int at91_adc_dma_start(struct iio_dev *indio_dev)
diff --git a/drivers/iio/adc/max11410.c b/drivers/iio/adc/max11410.c
index b74b689ee7de..237b2ce3f264 100644
--- a/drivers/iio/adc/max11410.c
+++ b/drivers/iio/adc/max11410.c
@@ -678,7 +678,7 @@ static irqreturn_t max11410_interrupt(int irq, void *dev_id)
 	struct max11410_state *st = iio_priv(indio_dev);
 
 	if (iio_buffer_enabled(indio_dev))
-		iio_trigger_poll_chained(st->trig);
+		iio_trigger_poll_nested(st->trig);
 	else
 		complete(&st->completion);
 
diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c
index 899b640c0a70..a0df9250a69f 100644
--- a/drivers/iio/common/st_sensors/st_sensors_trigger.c
+++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c
@@ -85,7 +85,7 @@ static irqreturn_t st_sensors_irq_thread(int irq, void *p)
 	 */
 	if (sdata->hw_irq_trigger &&
 	    st_sensors_new_samples_available(indio_dev, sdata)) {
-		iio_trigger_poll_chained(p);
+		iio_trigger_poll_nested(p);
 	} else {
 		dev_dbg(indio_dev->dev.parent, "spurious IRQ\n");
 		return IRQ_NONE;
@@ -110,7 +110,7 @@ static irqreturn_t st_sensors_irq_thread(int irq, void *p)
 		dev_dbg(indio_dev->dev.parent,
 			"more samples came in during polling\n");
 		sdata->hw_timestamp = iio_get_time_ns(indio_dev);
-		iio_trigger_poll_chained(p);
+		iio_trigger_poll_nested(p);
 	}
 
 	return IRQ_HANDLED;
diff --git a/drivers/iio/gyro/fxas21002c_core.c b/drivers/iio/gyro/fxas21002c_core.c
index 3ea1d4613080..c28d17ca6f5e 100644
--- a/drivers/iio/gyro/fxas21002c_core.c
+++ b/drivers/iio/gyro/fxas21002c_core.c
@@ -813,7 +813,7 @@ static irqreturn_t fxas21002c_data_rdy_thread(int irq, void *private)
 	if (!data_ready)
 		return IRQ_NONE;
 
-	iio_trigger_poll_chained(data->dready_trig);
+	iio_trigger_poll_nested(data->dready_trig);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/iio/gyro/mpu3050-core.c b/drivers/iio/gyro/mpu3050-core.c
index 6a6d84a3deda..a791ba3a693a 100644
--- a/drivers/iio/gyro/mpu3050-core.c
+++ b/drivers/iio/gyro/mpu3050-core.c
@@ -939,7 +939,7 @@ static irqreturn_t mpu3050_irq_thread(int irq, void *p)
 	if (!(val & MPU3050_INT_STATUS_RAW_RDY))
 		return IRQ_NONE;
 
-	iio_trigger_poll_chained(p);
+	iio_trigger_poll_nested(p);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/iio/humidity/hts221_buffer.c b/drivers/iio/humidity/hts221_buffer.c
index 2a4107a79662..11ef38994a95 100644
--- a/drivers/iio/humidity/hts221_buffer.c
+++ b/drivers/iio/humidity/hts221_buffer.c
@@ -68,7 +68,7 @@ static irqreturn_t hts221_trigger_handler_thread(int irq, void *private)
 	if (!(status & HTS221_RH_DRDY_MASK))
 		return IRQ_NONE;
 
-	iio_trigger_poll_chained(hw->trig);
+	iio_trigger_poll_nested(hw->trig);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index 270ec0a9f641..784dc1e00310 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -222,7 +222,14 @@ irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private)
 }
 EXPORT_SYMBOL(iio_trigger_generic_data_rdy_poll);
 
-void iio_trigger_poll_chained(struct iio_trigger *trig)
+/**
+ * iio_trigger_poll_nested() - Call the threaded trigger handler of the
+ * consumers
+ * @trig: trigger which occurred
+ *
+ * This function should only be called from a kernel thread context.
+ */
+void iio_trigger_poll_nested(struct iio_trigger *trig)
 {
 	int i;
 
@@ -237,7 +244,7 @@ void iio_trigger_poll_chained(struct iio_trigger *trig)
 		}
 	}
 }
-EXPORT_SYMBOL(iio_trigger_poll_chained);
+EXPORT_SYMBOL(iio_trigger_poll_nested);
 
 void iio_trigger_notify_done(struct iio_trigger *trig)
 {
diff --git a/drivers/iio/light/acpi-als.c b/drivers/iio/light/acpi-als.c
index e1ff6f524f4b..2d91caf24dd0 100644
--- a/drivers/iio/light/acpi-als.c
+++ b/drivers/iio/light/acpi-als.c
@@ -108,7 +108,7 @@ static void acpi_als_notify(struct acpi_device *device, u32 event)
 	if (iio_buffer_enabled(indio_dev) && iio_trigger_using_own(indio_dev)) {
 		switch (event) {
 		case ACPI_ALS_NOTIFY_ILLUMINANCE:
-			iio_trigger_poll_chained(als->trig);
+			iio_trigger_poll_nested(als->trig);
 			break;
 		default:
 			/* Unhandled event */
diff --git a/drivers/iio/light/rpr0521.c b/drivers/iio/light/rpr0521.c
index 668e444f6049..9d0218b7426e 100644
--- a/drivers/iio/light/rpr0521.c
+++ b/drivers/iio/light/rpr0521.c
@@ -431,7 +431,7 @@ static irqreturn_t rpr0521_drdy_irq_thread(int irq, void *private)
 	struct rpr0521_data *data = iio_priv(indio_dev);
 
 	if (rpr0521_is_triggered(data)) {
-		iio_trigger_poll_chained(data->drdy_trigger0);
+		iio_trigger_poll_nested(data->drdy_trigger0);
 		return IRQ_HANDLED;
 	}
 
diff --git a/drivers/iio/light/st_uvis25_core.c b/drivers/iio/light/st_uvis25_core.c
index c737d3e193ae..50f95c5d2060 100644
--- a/drivers/iio/light/st_uvis25_core.c
+++ b/drivers/iio/light/st_uvis25_core.c
@@ -161,7 +161,7 @@ static irqreturn_t st_uvis25_trigger_handler_thread(int irq, void *private)
 	if (!(status & ST_UVIS25_REG_UV_DA_MASK))
 		return IRQ_NONE;
 
-	iio_trigger_poll_chained(hw->trig);
+	iio_trigger_poll_nested(hw->trig);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c
index 6bdfce9747f9..b9460c165d37 100644
--- a/drivers/iio/light/vcnl4000.c
+++ b/drivers/iio/light/vcnl4000.c
@@ -1078,7 +1078,7 @@ static irqreturn_t vcnl4010_irq_thread(int irq, void *p)
 	}
 
 	if (isr & VCNL4010_INT_DRDY && iio_buffer_enabled(indio_dev))
-		iio_trigger_poll_chained(indio_dev->trig);
+		iio_trigger_poll_nested(indio_dev->trig);
 
 end:
 	return IRQ_HANDLED;
diff --git a/drivers/iio/light/vcnl4035.c b/drivers/iio/light/vcnl4035.c
index 84148b944000..14e29330e972 100644
--- a/drivers/iio/light/vcnl4035.c
+++ b/drivers/iio/light/vcnl4035.c
@@ -89,7 +89,7 @@ static irqreturn_t vcnl4035_drdy_irq_thread(int irq, void *private)
 							IIO_EV_TYPE_THRESH,
 							IIO_EV_DIR_EITHER),
 				iio_get_time_ns(indio_dev));
-		iio_trigger_poll_chained(data->drdy_trigger0);
+		iio_trigger_poll_nested(data->drdy_trigger0);
 		return IRQ_HANDLED;
 	}
 
diff --git a/drivers/iio/potentiostat/lmp91000.c b/drivers/iio/potentiostat/lmp91000.c
index b82f093f1e6a..0083e858c21e 100644
--- a/drivers/iio/potentiostat/lmp91000.c
+++ b/drivers/iio/potentiostat/lmp91000.c
@@ -118,7 +118,7 @@ static int lmp91000_read(struct lmp91000_data *data, int channel, int *val)
 
 	data->chan_select = channel != LMP91000_REG_MODECN_3LEAD;
 
-	iio_trigger_poll_chained(data->trig);
+	iio_trigger_poll_nested(data->trig);
 
 	ret = wait_for_completion_timeout(&data->completion, HZ);
 	reinit_completion(&data->completion);
diff --git a/drivers/iio/pressure/zpa2326.c b/drivers/iio/pressure/zpa2326.c
index 67119a9b95fc..421e059d1f19 100644
--- a/drivers/iio/pressure/zpa2326.c
+++ b/drivers/iio/pressure/zpa2326.c
@@ -829,7 +829,7 @@ static irqreturn_t zpa2326_handle_threaded_irq(int irq, void *data)
 	}
 
 	/* New sample available: dispatch internal trigger consumers. */
-	iio_trigger_poll_chained(priv->trigger);
+	iio_trigger_poll_nested(priv->trigger);
 
 	if (cont)
 		/*
diff --git a/drivers/iio/proximity/as3935.c b/drivers/iio/proximity/as3935.c
index ebc95cf8f5f4..96fa97451cbf 100644
--- a/drivers/iio/proximity/as3935.c
+++ b/drivers/iio/proximity/as3935.c
@@ -257,7 +257,7 @@ static void as3935_event_work(struct work_struct *work)
 
 	switch (val) {
 	case AS3935_EVENT_INT:
-		iio_trigger_poll_chained(st->trig);
+		iio_trigger_poll_nested(st->trig);
 		break;
 	case AS3935_DISTURB_INT:
 	case AS3935_NOISE_INT:
diff --git a/drivers/iio/trigger/iio-trig-loop.c b/drivers/iio/trigger/iio-trig-loop.c
index 96ec06bbe546..7aaed0611899 100644
--- a/drivers/iio/trigger/iio-trig-loop.c
+++ b/drivers/iio/trigger/iio-trig-loop.c
@@ -46,7 +46,7 @@ static int iio_loop_thread(void *data)
 	set_freezable();
 
 	do {
-		iio_trigger_poll_chained(trig);
+		iio_trigger_poll_nested(trig);
 	} while (likely(!kthread_freezable_should_stop(NULL)));
 
 	return 0;
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index 42da55dc3aa7..51f52c5c6092 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -152,7 +152,7 @@ void iio_trigger_unregister(struct iio_trigger *trig_info);
 int iio_trigger_set_immutable(struct iio_dev *indio_dev, struct iio_trigger *trig);
 
 void iio_trigger_poll(struct iio_trigger *trig);
-void iio_trigger_poll_chained(struct iio_trigger *trig);
+void iio_trigger_poll_nested(struct iio_trigger *trig);
 
 irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private);
 
-- 
cgit v1.2.3


From 9e264f3f85a56cc109cc2d6010a48aa89d5c1ff1 Mon Sep 17 00:00:00 2001
From: Amit Kumar Mahapatra via Alsa-devel <alsa-devel@alsa-project.org>
Date: Fri, 10 Mar 2023 23:02:03 +0530
Subject: spi: Replace all spi->chip_select and spi->cs_gpiod references with
 function call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Supporting multi-cs in spi drivers would require the chip_select & cs_gpiod
members of struct spi_device to be an array. But changing the type of these
members to array would break the spi driver functionality. To make the
transition smoother introduced four new APIs to get/set the
spi->chip_select & spi->cs_gpiod and replaced all spi->chip_select and
spi->cs_gpiod references with get or set API calls.
While adding multi-cs support in further patches the chip_select & cs_gpiod
members of the spi_device structure would be converted to arrays & the
"idx" parameter of the APIs would be used as array index i.e.,
spi->chip_select[idx] & spi->cs_gpiod[idx] respectively.

Signed-off-by: Amit Kumar Mahapatra <amit.kumar-mahapatra@amd.com>
Acked-by: Heiko Stuebner <heiko@sntech.de> # Rockchip drivers
Reviewed-by: Michal Simek <michal.simek@amd.com>
Reviewed-by: Cédric Le Goater <clg@kaod.org> # Aspeed driver
Reviewed-by: Dhruva Gole <d-gole@ti.com> # SPI Cadence QSPI
Reviewed-by: Patrice Chotard <patrice.chotard@foss.st.com> # spi-stm32-qspi
Acked-by: William Zhang <william.zhang@broadcom.com> # bcm63xx-hsspi driver
Reviewed-by: Serge Semin <fancer.lancer@gmail.com> # DW SSI part
Link: https://lore.kernel.org/r/167847070432.26.15076794204368669839@mailman-core.alsa-project.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-altera-core.c     |  2 +-
 drivers/spi/spi-amd.c             |  4 ++--
 drivers/spi/spi-ar934x.c          |  2 +-
 drivers/spi/spi-armada-3700.c     |  4 ++--
 drivers/spi/spi-aspeed-smc.c      | 13 +++++++------
 drivers/spi/spi-at91-usart.c      |  2 +-
 drivers/spi/spi-ath79.c           |  4 ++--
 drivers/spi/spi-atmel.c           | 26 +++++++++++++-------------
 drivers/spi/spi-au1550.c          |  4 ++--
 drivers/spi/spi-axi-spi-engine.c  |  2 +-
 drivers/spi/spi-bcm-qspi.c        | 10 +++++-----
 drivers/spi/spi-bcm2835.c         | 19 ++++++++++---------
 drivers/spi/spi-bcm2835aux.c      |  4 ++--
 drivers/spi/spi-bcm63xx-hsspi.c   | 30 +++++++++++++++---------------
 drivers/spi/spi-bcm63xx.c         |  2 +-
 drivers/spi/spi-bcmbca-hsspi.c    | 30 +++++++++++++++---------------
 drivers/spi/spi-cadence-quadspi.c |  5 +++--
 drivers/spi/spi-cadence-xspi.c    |  4 ++--
 drivers/spi/spi-cadence.c         |  4 ++--
 drivers/spi/spi-cavium.c          |  8 ++++----
 drivers/spi/spi-coldfire-qspi.c   |  8 ++++----
 drivers/spi/spi-davinci.c         | 18 +++++++++---------
 drivers/spi/spi-dln2.c            |  6 +++---
 drivers/spi/spi-dw-core.c         |  2 +-
 drivers/spi/spi-dw-mmio.c         |  4 ++--
 drivers/spi/spi-falcon.c          |  2 +-
 drivers/spi/spi-fsi.c             |  2 +-
 drivers/spi/spi-fsl-dspi.c        | 16 ++++++++--------
 drivers/spi/spi-fsl-espi.c        |  6 +++---
 drivers/spi/spi-fsl-lpspi.c       |  2 +-
 drivers/spi/spi-fsl-qspi.c        |  6 +++---
 drivers/spi/spi-fsl-spi.c         |  2 +-
 drivers/spi/spi-geni-qcom.c       |  6 +++---
 drivers/spi/spi-gpio.c            |  4 ++--
 drivers/spi/spi-gxp.c             |  4 ++--
 drivers/spi/spi-hisi-sfc-v3xx.c   |  2 +-
 drivers/spi/spi-img-spfi.c        | 14 +++++++-------
 drivers/spi/spi-imx.c             | 30 +++++++++++++++---------------
 drivers/spi/spi-ingenic.c         |  4 ++--
 drivers/spi/spi-intel.c           |  2 +-
 drivers/spi/spi-jcore.c           |  4 ++--
 drivers/spi/spi-lantiq-ssc.c      |  6 +++---
 drivers/spi/spi-mem.c             |  4 ++--
 drivers/spi/spi-meson-spicc.c     |  2 +-
 drivers/spi/spi-microchip-core.c  |  6 +++---
 drivers/spi/spi-mpc512x-psc.c     |  8 ++++----
 drivers/spi/spi-mpc52xx.c         |  2 +-
 drivers/spi/spi-mt65xx.c          |  6 +++---
 drivers/spi/spi-mt7621.c          |  2 +-
 drivers/spi/spi-mux.c             |  8 ++++----
 drivers/spi/spi-mxic.c            | 10 +++++-----
 drivers/spi/spi-mxs.c             |  2 +-
 drivers/spi/spi-npcm-fiu.c        | 20 ++++++++++----------
 drivers/spi/spi-nxp-fspi.c        | 10 +++++-----
 drivers/spi/spi-omap-uwire.c      |  8 ++++----
 drivers/spi/spi-omap2-mcspi.c     | 24 ++++++++++++------------
 drivers/spi/spi-orion.c           |  4 ++--
 drivers/spi/spi-pci1xxxx.c        |  4 ++--
 drivers/spi/spi-pic32-sqi.c       |  2 +-
 drivers/spi/spi-pic32.c           |  4 ++--
 drivers/spi/spi-pl022.c           |  4 ++--
 drivers/spi/spi-pxa2xx.c          |  6 +++---
 drivers/spi/spi-qcom-qspi.c       |  2 +-
 drivers/spi/spi-rb4xx.c           |  2 +-
 drivers/spi/spi-rockchip-sfc.c    |  2 +-
 drivers/spi/spi-rockchip.c        | 26 ++++++++++++++------------
 drivers/spi/spi-rspi.c            | 10 +++++-----
 drivers/spi/spi-s3c64xx.c         |  2 +-
 drivers/spi/spi-sc18is602.c       |  4 ++--
 drivers/spi/spi-sh-msiof.c        |  6 +++---
 drivers/spi/spi-sh-sci.c          |  2 +-
 drivers/spi/spi-sifive.c          |  6 +++---
 drivers/spi/spi-sn-f-ospi.c       |  2 +-
 drivers/spi/spi-st-ssc4.c         |  2 +-
 drivers/spi/spi-stm32-qspi.c      | 12 ++++++------
 drivers/spi/spi-sun4i.c           |  2 +-
 drivers/spi/spi-sun6i.c           |  2 +-
 drivers/spi/spi-synquacer.c       |  6 +++---
 drivers/spi/spi-tegra114.c        | 28 ++++++++++++++--------------
 drivers/spi/spi-tegra20-sflash.c  |  2 +-
 drivers/spi/spi-tegra20-slink.c   |  6 +++---
 drivers/spi/spi-tegra210-quad.c   |  8 ++++----
 drivers/spi/spi-ti-qspi.c         | 16 ++++++++--------
 drivers/spi/spi-topcliff-pch.c    |  4 ++--
 drivers/spi/spi-wpcm-fiu.c        | 12 ++++++------
 drivers/spi/spi-xcomm.c           |  2 +-
 drivers/spi/spi-xilinx.c          |  6 +++---
 drivers/spi/spi-xlp.c             |  4 ++--
 drivers/spi/spi-zynq-qspi.c       |  2 +-
 drivers/spi/spi-zynqmp-gqspi.c    |  2 +-
 drivers/spi/spidev.c              |  6 +++---
 include/trace/events/spi.h        | 10 +++++-----
 92 files changed, 333 insertions(+), 328 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi-altera-core.c b/drivers/spi/spi-altera-core.c
index 94fe6bf1b9a6..87e37f48f196 100644
--- a/drivers/spi/spi-altera-core.c
+++ b/drivers/spi/spi-altera-core.c
@@ -80,7 +80,7 @@ static void altera_spi_set_cs(struct spi_device *spi, bool is_high)
 		altr_spi_writel(hw, ALTERA_SPI_TARGET_SEL, 0);
 	} else {
 		altr_spi_writel(hw, ALTERA_SPI_TARGET_SEL,
-				BIT(spi->chip_select));
+				BIT(spi_get_chipselect(spi, 0)));
 		hw->imr |= ALTERA_SPI_CONTROL_SSO_MSK;
 		altr_spi_writel(hw, ALTERA_SPI_CONTROL, hw->imr);
 	}
diff --git a/drivers/spi/spi-amd.c b/drivers/spi/spi-amd.c
index bfc3ab5f39ea..fecead757a3c 100644
--- a/drivers/spi/spi-amd.c
+++ b/drivers/spi/spi-amd.c
@@ -347,7 +347,7 @@ fin_msg:
 	case AMD_SPI_V1:
 		break;
 	case AMD_SPI_V2:
-		amd_spi_clear_chip(amd_spi, message->spi->chip_select);
+		amd_spi_clear_chip(amd_spi, spi_get_chipselect(message->spi, 0));
 		break;
 	default:
 		return -ENODEV;
@@ -364,7 +364,7 @@ static int amd_spi_master_transfer(struct spi_master *master,
 	struct amd_spi *amd_spi = spi_master_get_devdata(master);
 	struct spi_device *spi = msg->spi;
 
-	amd_spi_select_chip(amd_spi, spi->chip_select);
+	amd_spi_select_chip(amd_spi, spi_get_chipselect(spi, 0));
 
 	/*
 	 * Extract spi_transfers from the spi message and
diff --git a/drivers/spi/spi-ar934x.c b/drivers/spi/spi-ar934x.c
index c71c8348eeaa..9dcada8c4cb9 100644
--- a/drivers/spi/spi-ar934x.c
+++ b/drivers/spi/spi-ar934x.c
@@ -125,7 +125,7 @@ static int ar934x_spi_transfer_one_message(struct spi_controller *ctlr,
 				iowrite32(reg, sp->base + AR934X_SPI_DATAOUT);
 			}
 
-			reg = AR934X_SPI_SHIFT_VAL(spi->chip_select, term,
+			reg = AR934X_SPI_SHIFT_VAL(spi_get_chipselect(spi, 0), term,
 						   trx_cur * 8);
 			iowrite32(reg, sp->base + AR934X_SPI_REG_SHIFT_CTRL);
 			stat = readl_poll_timeout(
diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c
index 6a7e605f73bf..a7fb7c94e70e 100644
--- a/drivers/spi/spi-armada-3700.c
+++ b/drivers/spi/spi-armada-3700.c
@@ -437,9 +437,9 @@ static void a3700_spi_set_cs(struct spi_device *spi, bool enable)
 	struct a3700_spi *a3700_spi = spi_controller_get_devdata(spi->controller);
 
 	if (!enable)
-		a3700_spi_activate_cs(a3700_spi, spi->chip_select);
+		a3700_spi_activate_cs(a3700_spi, spi_get_chipselect(spi, 0));
 	else
-		a3700_spi_deactivate_cs(a3700_spi, spi->chip_select);
+		a3700_spi_deactivate_cs(a3700_spi, spi_get_chipselect(spi, 0));
 }
 
 static void a3700_spi_header_set(struct a3700_spi *a3700_spi)
diff --git a/drivers/spi/spi-aspeed-smc.c b/drivers/spi/spi-aspeed-smc.c
index 3f2548860317..e75b0d51f06a 100644
--- a/drivers/spi/spi-aspeed-smc.c
+++ b/drivers/spi/spi-aspeed-smc.c
@@ -296,7 +296,7 @@ static const struct aspeed_spi_data ast2400_spi_data;
 static int do_aspeed_spi_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct aspeed_spi *aspi = spi_controller_get_devdata(mem->spi->master);
-	struct aspeed_spi_chip *chip = &aspi->chips[mem->spi->chip_select];
+	struct aspeed_spi_chip *chip = &aspi->chips[spi_get_chipselect(mem->spi, 0)];
 	u32 addr_mode, addr_mode_backup;
 	u32 ctl_val;
 	int ret = 0;
@@ -377,7 +377,8 @@ static const char *aspeed_spi_get_name(struct spi_mem *mem)
 	struct aspeed_spi *aspi = spi_controller_get_devdata(mem->spi->master);
 	struct device *dev = aspi->dev;
 
-	return devm_kasprintf(dev, GFP_KERNEL, "%s.%d", dev_name(dev), mem->spi->chip_select);
+	return devm_kasprintf(dev, GFP_KERNEL, "%s.%d", dev_name(dev),
+			      spi_get_chipselect(mem->spi, 0));
 }
 
 struct aspeed_spi_window {
@@ -553,7 +554,7 @@ static int aspeed_spi_do_calibration(struct aspeed_spi_chip *chip);
 static int aspeed_spi_dirmap_create(struct spi_mem_dirmap_desc *desc)
 {
 	struct aspeed_spi *aspi = spi_controller_get_devdata(desc->mem->spi->master);
-	struct aspeed_spi_chip *chip = &aspi->chips[desc->mem->spi->chip_select];
+	struct aspeed_spi_chip *chip = &aspi->chips[spi_get_chipselect(desc->mem->spi, 0)];
 	struct spi_mem_op *op = &desc->info.op_tmpl;
 	u32 ctl_val;
 	int ret = 0;
@@ -620,7 +621,7 @@ static ssize_t aspeed_spi_dirmap_read(struct spi_mem_dirmap_desc *desc,
 				      u64 offset, size_t len, void *buf)
 {
 	struct aspeed_spi *aspi = spi_controller_get_devdata(desc->mem->spi->master);
-	struct aspeed_spi_chip *chip = &aspi->chips[desc->mem->spi->chip_select];
+	struct aspeed_spi_chip *chip = &aspi->chips[spi_get_chipselect(desc->mem->spi, 0)];
 
 	/* Switch to USER command mode if mapping window is too small */
 	if (chip->ahb_window_size < offset + len) {
@@ -670,7 +671,7 @@ static int aspeed_spi_setup(struct spi_device *spi)
 {
 	struct aspeed_spi *aspi = spi_controller_get_devdata(spi->master);
 	const struct aspeed_spi_data *data = aspi->data;
-	unsigned int cs = spi->chip_select;
+	unsigned int cs = spi_get_chipselect(spi, 0);
 	struct aspeed_spi_chip *chip = &aspi->chips[cs];
 
 	chip->aspi = aspi;
@@ -697,7 +698,7 @@ static int aspeed_spi_setup(struct spi_device *spi)
 static void aspeed_spi_cleanup(struct spi_device *spi)
 {
 	struct aspeed_spi *aspi = spi_controller_get_devdata(spi->master);
-	unsigned int cs = spi->chip_select;
+	unsigned int cs = spi_get_chipselect(spi, 0);
 
 	aspeed_spi_chip_enable(aspi, cs, false);
 
diff --git a/drivers/spi/spi-at91-usart.c b/drivers/spi/spi-at91-usart.c
index 4fb3653b5941..7854d9790fe9 100644
--- a/drivers/spi/spi-at91-usart.c
+++ b/drivers/spi/spi-at91-usart.c
@@ -390,7 +390,7 @@ static int at91_usart_spi_setup(struct spi_device *spi)
 
 	dev_dbg(&spi->dev,
 		"setup: bpw %u mode 0x%x -> mr %d %08x\n",
-		spi->bits_per_word, spi->mode, spi->chip_select, mr);
+		spi->bits_per_word, spi->mode, spi_get_chipselect(spi, 0), mr);
 
 	return 0;
 }
diff --git a/drivers/spi/spi-ath79.c b/drivers/spi/spi-ath79.c
index b4d25b3bee19..d3dd21386f12 100644
--- a/drivers/spi/spi-ath79.c
+++ b/drivers/spi/spi-ath79.c
@@ -71,7 +71,7 @@ static void ath79_spi_chipselect(struct spi_device *spi, int is_active)
 {
 	struct ath79_spi *sp = ath79_spidev_to_sp(spi);
 	int cs_high = (spi->mode & SPI_CS_HIGH) ? is_active : !is_active;
-	u32 cs_bit = AR71XX_SPI_IOC_CS(spi->chip_select);
+	u32 cs_bit = AR71XX_SPI_IOC_CS(spi_get_chipselect(spi, 0));
 
 	if (cs_high)
 		sp->ioc_base |= cs_bit;
@@ -140,7 +140,7 @@ static int ath79_exec_mem_op(struct spi_mem *mem,
 	struct ath79_spi *sp = ath79_spidev_to_sp(mem->spi);
 
 	/* Ensures that reading is performed on device connected to hardware cs0 */
-	if (mem->spi->chip_select || mem->spi->cs_gpiod)
+	if (spi_get_chipselect(mem->spi, 0) || spi_get_csgpiod(mem->spi, 0))
 		return -ENOTSUPP;
 
 	/* Only use for fast-read op. */
diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c
index 73f80c8ac2ff..7f06305e16cb 100644
--- a/drivers/spi/spi-atmel.c
+++ b/drivers/spi/spi-atmel.c
@@ -327,10 +327,10 @@ static void cs_activate(struct atmel_spi *as, struct spi_device *spi)
 	int chip_select;
 	u32 mr;
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		chip_select = as->native_cs_for_gpio;
 	else
-		chip_select = spi->chip_select;
+		chip_select = spi_get_chipselect(spi, 0);
 
 	if (atmel_spi_is_v2(as)) {
 		spi_writel(as, CSR0 + 4 * chip_select, asd->csr);
@@ -378,10 +378,10 @@ static void cs_deactivate(struct atmel_spi *as, struct spi_device *spi)
 	int chip_select;
 	u32 mr;
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		chip_select = as->native_cs_for_gpio;
 	else
-		chip_select = spi->chip_select;
+		chip_select = spi_get_chipselect(spi, 0);
 
 	/* only deactivate *this* device; sometimes transfers to
 	 * another device may be active when this routine is called.
@@ -394,7 +394,7 @@ static void cs_deactivate(struct atmel_spi *as, struct spi_device *spi)
 
 	dev_dbg(&spi->dev, "DEactivate NPCS, mr %08x\n", mr);
 
-	if (!spi->cs_gpiod)
+	if (!spi_get_csgpiod(spi, 0))
 		spi_writel(as, CR, SPI_BIT(LASTXFER));
 }
 
@@ -800,10 +800,10 @@ static int atmel_spi_set_xfer_speed(struct atmel_spi *as,
 	unsigned long		bus_hz;
 	int chip_select;
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		chip_select = as->native_cs_for_gpio;
 	else
-		chip_select = spi->chip_select;
+		chip_select = spi_get_chipselect(spi, 0);
 
 	/* v1 chips start out at half the peripheral bus speed. */
 	bus_hz = as->spi_clk;
@@ -1189,7 +1189,7 @@ static int atmel_spi_setup(struct spi_device *spi)
 	as = spi_controller_get_devdata(spi->controller);
 
 	/* see notes above re chipselect */
-	if (!spi->cs_gpiod && (spi->mode & SPI_CS_HIGH)) {
+	if (!spi_get_csgpiod(spi, 0) && (spi->mode & SPI_CS_HIGH)) {
 		dev_warn(&spi->dev, "setup: non GPIO CS can't be active-high\n");
 		return -EINVAL;
 	}
@@ -1201,16 +1201,16 @@ static int atmel_spi_setup(struct spi_device *spi)
 	 */
 	initialize_native_cs_for_gpio(as);
 
-	if (spi->cs_gpiod && as->native_cs_free) {
+	if (spi_get_csgpiod(spi, 0) && as->native_cs_free) {
 		dev_err(&spi->dev,
 			"No native CS available to support this GPIO CS\n");
 		return -EBUSY;
 	}
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		chip_select = as->native_cs_for_gpio;
 	else
-		chip_select = spi->chip_select;
+		chip_select = spi_get_chipselect(spi, 0);
 
 	csr = SPI_BF(BITS, bits - 8);
 	if (spi->mode & SPI_CPOL)
@@ -1218,7 +1218,7 @@ static int atmel_spi_setup(struct spi_device *spi)
 	if (!(spi->mode & SPI_CPHA))
 		csr |= SPI_BIT(NCPHA);
 
-	if (!spi->cs_gpiod)
+	if (!spi_get_csgpiod(spi, 0))
 		csr |= SPI_BIT(CSAAT);
 	csr |= SPI_BF(DLYBS, 0);
 
@@ -1244,7 +1244,7 @@ static int atmel_spi_setup(struct spi_device *spi)
 
 	dev_dbg(&spi->dev,
 		"setup: bpw %u mode 0x%x -> csr%d %08x\n",
-		bits, spi->mode, spi->chip_select, csr);
+		bits, spi->mode, spi_get_chipselect(spi, 0), csr);
 
 	if (!atmel_spi_is_v2(as))
 		spi_writel(as, CSR0 + 4 * chip_select, csr);
diff --git a/drivers/spi/spi-au1550.c b/drivers/spi/spi-au1550.c
index 8151bed8a117..0b57e6afce0f 100644
--- a/drivers/spi/spi-au1550.c
+++ b/drivers/spi/spi-au1550.c
@@ -166,7 +166,7 @@ static void au1550_spi_chipsel(struct spi_device *spi, int value)
 	switch (value) {
 	case BITBANG_CS_INACTIVE:
 		if (hw->pdata->deactivate_cs)
-			hw->pdata->deactivate_cs(hw->pdata, spi->chip_select,
+			hw->pdata->deactivate_cs(hw->pdata, spi_get_chipselect(spi, 0),
 					cspol);
 		break;
 
@@ -211,7 +211,7 @@ static void au1550_spi_chipsel(struct spi_device *spi, int value)
 		} while ((stat & PSC_SPISTAT_DR) == 0);
 
 		if (hw->pdata->activate_cs)
-			hw->pdata->activate_cs(hw->pdata, spi->chip_select,
+			hw->pdata->activate_cs(hw->pdata, spi_get_chipselect(spi, 0),
 					cspol);
 		break;
 	}
diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c
index c5a3a3189164..89661f3b0d44 100644
--- a/drivers/spi/spi-axi-spi-engine.c
+++ b/drivers/spi/spi-axi-spi-engine.c
@@ -193,7 +193,7 @@ static void spi_engine_gen_cs(struct spi_engine_program *p, bool dry,
 	unsigned int mask = 0xff;
 
 	if (assert)
-		mask ^= BIT(spi->chip_select);
+		mask ^= BIT(spi_get_chipselect(spi, 0));
 
 	spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_ASSERT(1, mask));
 }
diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 0eee574d3e1f..7c2f1d1fb3f7 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -986,7 +986,7 @@ static int write_to_hw(struct bcm_qspi *qspi, struct spi_device *spi)
 		if (has_bspi(qspi))
 			mspi_cdram &= ~1;
 		else
-			mspi_cdram |= (~(1 << spi->chip_select) &
+			mspi_cdram |= (~(1 << spi_get_chipselect(spi, 0)) &
 				       MSPI_CDRAM_PCS);
 
 		write_cdram_slot(qspi, slot, mspi_cdram);
@@ -1046,8 +1046,8 @@ static int bcm_qspi_bspi_exec_mem_op(struct spi_device *spi,
 			return -EIO;
 
 	from = op->addr.val;
-	if (!spi->cs_gpiod)
-		bcm_qspi_chip_select(qspi, spi->chip_select);
+	if (!spi_get_csgpiod(spi, 0))
+		bcm_qspi_chip_select(qspi, spi_get_chipselect(spi, 0));
 	bcm_qspi_write(qspi, MSPI, MSPI_WRITE_LOCK, 0);
 
 	/*
@@ -1126,8 +1126,8 @@ static int bcm_qspi_transfer_one(struct spi_master *master,
 	int slots;
 	unsigned long timeo = msecs_to_jiffies(100);
 
-	if (!spi->cs_gpiod)
-		bcm_qspi_chip_select(qspi, spi->chip_select);
+	if (!spi_get_csgpiod(spi, 0))
+		bcm_qspi_chip_select(qspi, spi_get_chipselect(spi, 0));
 	qspi->trans_pos.trans = trans;
 	qspi->trans_pos.byte = 0;
 
diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index 747e03228c48..6b7a3fa93fdf 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -1274,9 +1274,9 @@ static int bcm2835_spi_setup(struct spi_device *spi)
 	 * The SPI core has successfully requested the CS GPIO line from the
 	 * device tree, so we are done.
 	 */
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		return 0;
-	if (spi->chip_select > 1) {
+	if (spi_get_chipselect(spi, 0) > 1) {
 		/* error in the case of native CS requested with CS > 1
 		 * officially there is a CS2, but it is not documented
 		 * which GPIO is connected with that...
@@ -1301,18 +1301,19 @@ static int bcm2835_spi_setup(struct spi_device *spi)
 	if (!chip)
 		return 0;
 
-	spi->cs_gpiod = gpiochip_request_own_desc(chip, 8 - spi->chip_select,
-						  DRV_NAME,
-						  GPIO_LOOKUP_FLAGS_DEFAULT,
-						  GPIOD_OUT_LOW);
-	if (IS_ERR(spi->cs_gpiod)) {
-		ret = PTR_ERR(spi->cs_gpiod);
+	spi_set_csgpiod(spi, 0, gpiochip_request_own_desc(chip,
+							  8 - (spi_get_chipselect(spi, 0)),
+							  DRV_NAME,
+							  GPIO_LOOKUP_FLAGS_DEFAULT,
+							  GPIOD_OUT_LOW));
+	if (IS_ERR(spi_get_csgpiod(spi, 0))) {
+		ret = PTR_ERR(spi_get_csgpiod(spi, 0));
 		goto err_cleanup;
 	}
 
 	/* and set up the "mode" and level */
 	dev_info(&spi->dev, "setting up native-CS%i to use GPIO\n",
-		 spi->chip_select);
+		 spi_get_chipselect(spi, 0));
 
 	return 0;
 
diff --git a/drivers/spi/spi-bcm2835aux.c b/drivers/spi/spi-bcm2835aux.c
index 7f2546fd900a..288f7b994b36 100644
--- a/drivers/spi/spi-bcm2835aux.c
+++ b/drivers/spi/spi-bcm2835aux.c
@@ -448,7 +448,7 @@ static int bcm2835aux_spi_setup(struct spi_device *spi)
 	if (spi->mode & SPI_NO_CS)
 		return 0;
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		return 0;
 
 	/* for dt-backwards compatibility: only support native on CS0
@@ -465,7 +465,7 @@ static int bcm2835aux_spi_setup(struct spi_device *spi)
 	dev_warn(&spi->dev,
 		 "Native CS is not supported - please configure cs-gpio in device-tree\n");
 
-	if (spi->chip_select == 0)
+	if (spi_get_chipselect(spi, 0) == 0)
 		return 0;
 
 	dev_warn(&spi->dev, "Native CS is not working for cs > 0\n");
diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c
index f2708caa2f33..ee2528dad02d 100644
--- a/drivers/spi/spi-bcm63xx-hsspi.c
+++ b/drivers/spi/spi-bcm63xx-hsspi.c
@@ -349,7 +349,7 @@ static int bcm63xx_hsspi_do_prepend_txrx(struct spi_device *spi,
 					 struct spi_transfer *t)
 {
 	struct bcm63xx_hsspi *bs = spi_master_get_devdata(spi->master);
-	unsigned int chip_select = spi->chip_select;
+	unsigned int chip_select = spi_get_chipselect(spi, 0);
 	u16 opcode = 0, val;
 	const u8 *tx = t->tx_buf;
 	u8 *rx = t->rx_buf;
@@ -441,7 +441,7 @@ static void bcm63xx_hsspi_set_cs(struct bcm63xx_hsspi *bs, unsigned int cs,
 static void bcm63xx_hsspi_set_clk(struct bcm63xx_hsspi *bs,
 				  struct spi_device *spi, int hz)
 {
-	unsigned int profile = spi->chip_select;
+	unsigned int profile = spi_get_chipselect(spi, 0);
 	u32 reg;
 
 	reg = DIV_ROUND_UP(2048, DIV_ROUND_UP(bs->speed_hz, hz));
@@ -468,7 +468,7 @@ static void bcm63xx_hsspi_set_clk(struct bcm63xx_hsspi *bs,
 static int bcm63xx_hsspi_do_txrx(struct spi_device *spi, struct spi_transfer *t)
 {
 	struct bcm63xx_hsspi *bs = spi_master_get_devdata(spi->master);
-	unsigned int chip_select = spi->chip_select;
+	unsigned int chip_select = spi_get_chipselect(spi, 0);
 	u16 opcode = 0, val;
 	int pending = t->len;
 	int step_size = HSSPI_BUFFER_LEN;
@@ -478,7 +478,7 @@ static int bcm63xx_hsspi_do_txrx(struct spi_device *spi, struct spi_transfer *t)
 
 	bcm63xx_hsspi_set_clk(bs, spi, t->speed_hz);
 	if (!t->cs_off)
-		bcm63xx_hsspi_set_cs(bs, spi->chip_select, true);
+		bcm63xx_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), true);
 
 	if (tx && rx)
 		opcode = HSSPI_OP_READ_WRITE;
@@ -545,14 +545,14 @@ static int bcm63xx_hsspi_setup(struct spi_device *spi)
 	u32 reg;
 
 	reg = __raw_readl(bs->regs +
-			  HSSPI_PROFILE_SIGNAL_CTRL_REG(spi->chip_select));
+			  HSSPI_PROFILE_SIGNAL_CTRL_REG(spi_get_chipselect(spi, 0)));
 	reg &= ~(SIGNAL_CTRL_LAUNCH_RISING | SIGNAL_CTRL_LATCH_RISING);
 	if (spi->mode & SPI_CPHA)
 		reg |= SIGNAL_CTRL_LAUNCH_RISING;
 	else
 		reg |= SIGNAL_CTRL_LATCH_RISING;
 	__raw_writel(reg, bs->regs +
-		     HSSPI_PROFILE_SIGNAL_CTRL_REG(spi->chip_select));
+		     HSSPI_PROFILE_SIGNAL_CTRL_REG(spi_get_chipselect(spi, 0)));
 
 	mutex_lock(&bs->bus_mutex);
 	reg = __raw_readl(bs->regs + HSSPI_GLOBAL_CTRL_REG);
@@ -560,16 +560,16 @@ static int bcm63xx_hsspi_setup(struct spi_device *spi)
 	/* only change actual polarities if there is no transfer */
 	if ((reg & GLOBAL_CTRL_CS_POLARITY_MASK) == bs->cs_polarity) {
 		if (spi->mode & SPI_CS_HIGH)
-			reg |= BIT(spi->chip_select);
+			reg |= BIT(spi_get_chipselect(spi, 0));
 		else
-			reg &= ~BIT(spi->chip_select);
+			reg &= ~BIT(spi_get_chipselect(spi, 0));
 		__raw_writel(reg, bs->regs + HSSPI_GLOBAL_CTRL_REG);
 	}
 
 	if (spi->mode & SPI_CS_HIGH)
-		bs->cs_polarity |= BIT(spi->chip_select);
+		bs->cs_polarity |= BIT(spi_get_chipselect(spi, 0));
 	else
-		bs->cs_polarity &= ~BIT(spi->chip_select);
+		bs->cs_polarity &= ~BIT(spi_get_chipselect(spi, 0));
 
 	mutex_unlock(&bs->bus_mutex);
 
@@ -600,7 +600,7 @@ static int bcm63xx_hsspi_do_dummy_cs_txrx(struct spi_device *spi,
 	 * e. At the end restore the polarities again to their default values.
 	 */
 
-	dummy_cs = !spi->chip_select;
+	dummy_cs = !spi_get_chipselect(spi, 0);
 	bcm63xx_hsspi_set_cs(bs, dummy_cs, true);
 
 	list_for_each_entry(t, &msg->transfers, transfer_list) {
@@ -633,22 +633,22 @@ static int bcm63xx_hsspi_do_dummy_cs_txrx(struct spi_device *spi,
 				keep_cs = true;
 			} else {
 				if (!t->cs_off)
-					bcm63xx_hsspi_set_cs(bs, spi->chip_select, false);
+					bcm63xx_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), false);
 
 				spi_transfer_cs_change_delay_exec(msg, t);
 
 				if (!list_next_entry(t, transfer_list)->cs_off)
-					bcm63xx_hsspi_set_cs(bs, spi->chip_select, true);
+					bcm63xx_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), true);
 			}
 		} else if (!list_is_last(&t->transfer_list, &msg->transfers) &&
 			   t->cs_off != list_next_entry(t, transfer_list)->cs_off) {
-			bcm63xx_hsspi_set_cs(bs, spi->chip_select, t->cs_off);
+			bcm63xx_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), t->cs_off);
 		}
 	}
 
 	bcm63xx_hsspi_set_cs(bs, dummy_cs, false);
 	if (status || !keep_cs)
-		bcm63xx_hsspi_set_cs(bs, spi->chip_select, false);
+		bcm63xx_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), false);
 
 	return status;
 }
diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c
index 77b8328c8a6d..96633a0051b1 100644
--- a/drivers/spi/spi-bcm63xx.c
+++ b/drivers/spi/spi-bcm63xx.c
@@ -282,7 +282,7 @@ static int bcm63xx_txrx_bufs(struct spi_device *spi, struct spi_transfer *first,
 	/* Issue the transfer */
 	cmd = SPI_CMD_START_IMMEDIATE;
 	cmd |= (prepend_len << SPI_CMD_PREPEND_BYTE_CNT_SHIFT);
-	cmd |= (spi->chip_select << SPI_CMD_DEVICE_ID_SHIFT);
+	cmd |= (spi_get_chipselect(spi, 0) << SPI_CMD_DEVICE_ID_SHIFT);
 	bcm_spi_writew(bs, cmd, SPI_CMD);
 
 	/* Enable the CMD_DONE interrupt */
diff --git a/drivers/spi/spi-bcmbca-hsspi.c b/drivers/spi/spi-bcmbca-hsspi.c
index c7a44832bc9c..8cbd01619789 100644
--- a/drivers/spi/spi-bcmbca-hsspi.c
+++ b/drivers/spi/spi-bcmbca-hsspi.c
@@ -193,7 +193,7 @@ static void bcmbca_hsspi_set_cs(struct bcmbca_hsspi *bs, unsigned int cs,
 static void bcmbca_hsspi_set_clk(struct bcmbca_hsspi *bs,
 				  struct spi_device *spi, int hz)
 {
-	unsigned int profile = spi->chip_select;
+	unsigned int profile = spi_get_chipselect(spi, 0);
 	u32 reg;
 
 	reg = DIV_ROUND_UP(2048, DIV_ROUND_UP(bs->speed_hz, hz));
@@ -251,7 +251,7 @@ static int bcmbca_hsspi_do_txrx(struct spi_device *spi, struct spi_transfer *t,
 								struct spi_message *msg)
 {
 	struct bcmbca_hsspi *bs = spi_master_get_devdata(spi->master);
-	unsigned int chip_select = spi->chip_select;
+	unsigned int chip_select = spi_get_chipselect(spi, 0);
 	u16 opcode = 0, val;
 	int pending = t->len;
 	int step_size = HSSPI_BUFFER_LEN;
@@ -312,7 +312,7 @@ static int bcmbca_hsspi_do_txrx(struct spi_device *spi, struct spi_transfer *t,
 			    PINGPONG_COMMAND_START_NOW;
 		__raw_writel(reg, bs->regs + HSSPI_PINGPONG_COMMAND_REG(0));
 
-		if (bcmbca_hsspi_wait_cmd(bs, spi->chip_select))
+		if (bcmbca_hsspi_wait_cmd(bs, spi_get_chipselect(spi, 0)))
 			return -ETIMEDOUT;
 
 		pending -= curr_step;
@@ -332,33 +332,33 @@ static int bcmbca_hsspi_setup(struct spi_device *spi)
 	u32 reg;
 
 	reg = __raw_readl(bs->regs +
-			  HSSPI_PROFILE_SIGNAL_CTRL_REG(spi->chip_select));
+			  HSSPI_PROFILE_SIGNAL_CTRL_REG(spi_get_chipselect(spi, 0)));
 	reg &= ~(SIGNAL_CTRL_LAUNCH_RISING | SIGNAL_CTRL_LATCH_RISING);
 	if (spi->mode & SPI_CPHA)
 		reg |= SIGNAL_CTRL_LAUNCH_RISING;
 	else
 		reg |= SIGNAL_CTRL_LATCH_RISING;
 	__raw_writel(reg, bs->regs +
-		     HSSPI_PROFILE_SIGNAL_CTRL_REG(spi->chip_select));
+		     HSSPI_PROFILE_SIGNAL_CTRL_REG(spi_get_chipselect(spi, 0)));
 
 	mutex_lock(&bs->bus_mutex);
 	reg = __raw_readl(bs->regs + HSSPI_GLOBAL_CTRL_REG);
 
 	if (spi->mode & SPI_CS_HIGH)
-		reg |= BIT(spi->chip_select);
+		reg |= BIT(spi_get_chipselect(spi, 0));
 	else
-		reg &= ~BIT(spi->chip_select);
+		reg &= ~BIT(spi_get_chipselect(spi, 0));
 	__raw_writel(reg, bs->regs + HSSPI_GLOBAL_CTRL_REG);
 
 	if (spi->mode & SPI_CS_HIGH)
-		bs->cs_polarity |= BIT(spi->chip_select);
+		bs->cs_polarity |= BIT(spi_get_chipselect(spi, 0));
 	else
-		bs->cs_polarity &= ~BIT(spi->chip_select);
+		bs->cs_polarity &= ~BIT(spi_get_chipselect(spi, 0));
 
 	reg = __raw_readl(bs->spim_ctrl);
-	reg &= ~BIT(spi->chip_select + SPIM_CTRL_CS_OVERRIDE_VAL_SHIFT);
+	reg &= ~BIT(spi_get_chipselect(spi, 0) + SPIM_CTRL_CS_OVERRIDE_VAL_SHIFT);
 	if (spi->mode & SPI_CS_HIGH)
-		reg |= BIT(spi->chip_select + SPIM_CTRL_CS_OVERRIDE_VAL_SHIFT);
+		reg |= BIT(spi_get_chipselect(spi, 0) + SPIM_CTRL_CS_OVERRIDE_VAL_SHIFT);
 	__raw_writel(reg, bs->spim_ctrl);
 
 	mutex_unlock(&bs->bus_mutex);
@@ -388,16 +388,16 @@ static int bcmbca_hsspi_transfer_one(struct spi_master *master,
 				keep_cs = true;
 			} else {
 				if (!t->cs_off)
-					bcmbca_hsspi_set_cs(bs, spi->chip_select, false);
+					bcmbca_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), false);
 
 				spi_transfer_cs_change_delay_exec(msg, t);
 
 				if (!list_next_entry(t, transfer_list)->cs_off)
-					bcmbca_hsspi_set_cs(bs, spi->chip_select, true);
+					bcmbca_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), true);
 			}
 		} else if (!list_is_last(&t->transfer_list, &msg->transfers) &&
 			   t->cs_off != list_next_entry(t, transfer_list)->cs_off) {
-			bcmbca_hsspi_set_cs(bs, spi->chip_select, t->cs_off);
+			bcmbca_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), t->cs_off);
 		}
 
 		msg->actual_length += t->len;
@@ -406,7 +406,7 @@ static int bcmbca_hsspi_transfer_one(struct spi_master *master,
 	mutex_unlock(&bs->msg_mutex);
 
 	if (status || !keep_cs)
-		bcmbca_hsspi_set_cs(bs, spi->chip_select, false);
+		bcmbca_hsspi_set_cs(bs, spi_get_chipselect(spi, 0), false);
 
 	msg->status = status;
 	spi_finalize_current_message(master);
diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index 463fd49f4f80..79ab7e309644 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -1355,7 +1355,7 @@ static int cqspi_mem_process(struct spi_mem *mem, const struct spi_mem_op *op)
 	struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
 	struct cqspi_flash_pdata *f_pdata;
 
-	f_pdata = &cqspi->f_pdata[mem->spi->chip_select];
+	f_pdata = &cqspi->f_pdata[spi_get_chipselect(mem->spi, 0)];
 	cqspi_configure(f_pdata, mem->spi->max_speed_hz);
 
 	if (op->data.dir == SPI_MEM_DATA_IN && op->data.buf.in) {
@@ -1561,7 +1561,8 @@ static const char *cqspi_get_name(struct spi_mem *mem)
 	struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
 	struct device *dev = &cqspi->pdev->dev;
 
-	return devm_kasprintf(dev, GFP_KERNEL, "%s.%d", dev_name(dev), mem->spi->chip_select);
+	return devm_kasprintf(dev, GFP_KERNEL, "%s.%d", dev_name(dev),
+			      spi_get_chipselect(mem->spi, 0));
 }
 
 static const struct spi_controller_mem_ops cqspi_mem_ops = {
diff --git a/drivers/spi/spi-cadence-xspi.c b/drivers/spi/spi-cadence-xspi.c
index 91db3c973167..ce4a3145f065 100644
--- a/drivers/spi/spi-cadence-xspi.c
+++ b/drivers/spi/spi-cadence-xspi.c
@@ -409,8 +409,8 @@ static int cdns_xspi_mem_op(struct cdns_xspi_dev *cdns_xspi,
 {
 	enum spi_mem_data_dir dir = op->data.dir;
 
-	if (cdns_xspi->cur_cs != mem->spi->chip_select)
-		cdns_xspi->cur_cs = mem->spi->chip_select;
+	if (cdns_xspi->cur_cs != spi_get_chipselect(mem->spi, 0))
+		cdns_xspi->cur_cs = spi_get_chipselect(mem->spi, 0);
 
 	return cdns_xspi_send_stig_command(cdns_xspi, op,
 					   (dir != SPI_MEM_NO_DATA));
diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c
index 1ad86e5d2b2f..49936237e583 100644
--- a/drivers/spi/spi-cadence.c
+++ b/drivers/spi/spi-cadence.c
@@ -185,11 +185,11 @@ static void cdns_spi_chipselect(struct spi_device *spi, bool is_high)
 		/* Select the slave */
 		ctrl_reg &= ~CDNS_SPI_CR_SSCTRL;
 		if (!(xspi->is_decoded_cs))
-			ctrl_reg |= ((~(CDNS_SPI_SS0 << spi->chip_select)) <<
+			ctrl_reg |= ((~(CDNS_SPI_SS0 << spi_get_chipselect(spi, 0))) <<
 				     CDNS_SPI_SS_SHIFT) &
 				     CDNS_SPI_CR_SSCTRL;
 		else
-			ctrl_reg |= (spi->chip_select << CDNS_SPI_SS_SHIFT) &
+			ctrl_reg |= (spi_get_chipselect(spi, 0) << CDNS_SPI_SS_SHIFT) &
 				     CDNS_SPI_CR_SSCTRL;
 	}
 
diff --git a/drivers/spi/spi-cavium.c b/drivers/spi/spi-cavium.c
index 6854c3ce423b..dfe224defd6e 100644
--- a/drivers/spi/spi-cavium.c
+++ b/drivers/spi/spi-cavium.c
@@ -57,8 +57,8 @@ static int octeon_spi_do_transfer(struct octeon_spi *p,
 	mpi_cfg.s.cslate = cpha ? 1 : 0;
 	mpi_cfg.s.enable = 1;
 
-	if (spi->chip_select < 4)
-		p->cs_enax |= 1ull << (12 + spi->chip_select);
+	if (spi_get_chipselect(spi, 0) < 4)
+		p->cs_enax |= 1ull << (12 + spi_get_chipselect(spi, 0));
 	mpi_cfg.u64 |= p->cs_enax;
 
 	if (mpi_cfg.u64 != p->last_cfg) {
@@ -78,7 +78,7 @@ static int octeon_spi_do_transfer(struct octeon_spi *p,
 			writeq(d, p->register_base + OCTEON_SPI_DAT0(p) + (8 * i));
 		}
 		mpi_tx.u64 = 0;
-		mpi_tx.s.csid = spi->chip_select;
+		mpi_tx.s.csid = spi_get_chipselect(spi, 0);
 		mpi_tx.s.leavecs = 1;
 		mpi_tx.s.txnum = tx_buf ? OCTEON_SPI_MAX_BYTES : 0;
 		mpi_tx.s.totnum = OCTEON_SPI_MAX_BYTES;
@@ -103,7 +103,7 @@ static int octeon_spi_do_transfer(struct octeon_spi *p,
 	}
 
 	mpi_tx.u64 = 0;
-	mpi_tx.s.csid = spi->chip_select;
+	mpi_tx.s.csid = spi_get_chipselect(spi, 0);
 	if (last_xfer)
 		mpi_tx.s.leavecs = xfer->cs_change;
 	else
diff --git a/drivers/spi/spi-coldfire-qspi.c b/drivers/spi/spi-coldfire-qspi.c
index d98e74c6e4b2..b1bd8a6b5bf9 100644
--- a/drivers/spi/spi-coldfire-qspi.c
+++ b/drivers/spi/spi-coldfire-qspi.c
@@ -290,9 +290,9 @@ static void mcfqspi_set_cs(struct spi_device *spi, bool enable)
 	bool cs_high = spi->mode & SPI_CS_HIGH;
 
 	if (enable)
-		mcfqspi_cs_select(mcfqspi, spi->chip_select, cs_high);
+		mcfqspi_cs_select(mcfqspi, spi_get_chipselect(spi, 0), cs_high);
 	else
-		mcfqspi_cs_deselect(mcfqspi, spi->chip_select, cs_high);
+		mcfqspi_cs_deselect(mcfqspi, spi_get_chipselect(spi, 0), cs_high);
 }
 
 static int mcfqspi_transfer_one(struct spi_master *master,
@@ -324,11 +324,11 @@ static int mcfqspi_transfer_one(struct spi_master *master,
 static int mcfqspi_setup(struct spi_device *spi)
 {
 	mcfqspi_cs_deselect(spi_master_get_devdata(spi->master),
-			    spi->chip_select, spi->mode & SPI_CS_HIGH);
+			    spi_get_chipselect(spi, 0), spi->mode & SPI_CS_HIGH);
 
 	dev_dbg(&spi->dev,
 			"bits per word %d, chip select %d, speed %d KHz\n",
-			spi->bits_per_word, spi->chip_select,
+			spi->bits_per_word, spi_get_chipselect(spi, 0),
 			(MCFQSPI_BUSCLK / mcfqspi_qmr_baud(spi->max_speed_hz))
 			/ 1000);
 
diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c
index add1e198a439..b04811c911e2 100644
--- a/drivers/spi/spi-davinci.c
+++ b/drivers/spi/spi-davinci.c
@@ -199,7 +199,7 @@ static void davinci_spi_chipselect(struct spi_device *spi, int value)
 {
 	struct davinci_spi *dspi;
 	struct davinci_spi_config *spicfg = spi->controller_data;
-	u8 chip_sel = spi->chip_select;
+	u8 chip_sel = spi_get_chipselect(spi, 0);
 	u16 spidat1 = CS_DEFAULT;
 
 	dspi = spi_master_get_devdata(spi->master);
@@ -212,11 +212,11 @@ static void davinci_spi_chipselect(struct spi_device *spi, int value)
 	 * Board specific chip select logic decides the polarity and cs
 	 * line for the controller
 	 */
-	if (spi->cs_gpiod) {
+	if (spi_get_csgpiod(spi, 0)) {
 		if (value == BITBANG_CS_ACTIVE)
-			gpiod_set_value(spi->cs_gpiod, 1);
+			gpiod_set_value(spi_get_csgpiod(spi, 0), 1);
 		else
-			gpiod_set_value(spi->cs_gpiod, 0);
+			gpiod_set_value(spi_get_csgpiod(spi, 0), 0);
 	} else {
 		if (value == BITBANG_CS_ACTIVE) {
 			if (!(spi->mode & SPI_CS_WORD))
@@ -293,11 +293,11 @@ static int davinci_spi_setup_transfer(struct spi_device *spi,
 	if (bits_per_word <= 8) {
 		dspi->get_rx = davinci_spi_rx_buf_u8;
 		dspi->get_tx = davinci_spi_tx_buf_u8;
-		dspi->bytes_per_word[spi->chip_select] = 1;
+		dspi->bytes_per_word[spi_get_chipselect(spi, 0)] = 1;
 	} else {
 		dspi->get_rx = davinci_spi_rx_buf_u16;
 		dspi->get_tx = davinci_spi_tx_buf_u16;
-		dspi->bytes_per_word[spi->chip_select] = 2;
+		dspi->bytes_per_word[spi_get_chipselect(spi, 0)] = 2;
 	}
 
 	if (!hz)
@@ -415,11 +415,11 @@ static int davinci_spi_setup(struct spi_device *spi)
 	dspi = spi_master_get_devdata(spi->master);
 
 	if (!(spi->mode & SPI_NO_CS)) {
-		if (np && spi->cs_gpiod)
+		if (np && spi_get_csgpiod(spi, 0))
 			internal_cs = false;
 
 		if (internal_cs)
-			set_io_bits(dspi->base + SPIPC0, 1 << spi->chip_select);
+			set_io_bits(dspi->base + SPIPC0, 1 << spi_get_chipselect(spi, 0));
 	}
 
 	if (spi->mode & SPI_READY)
@@ -579,7 +579,7 @@ static int davinci_spi_bufs(struct spi_device *spi, struct spi_transfer *t)
 		spicfg = &davinci_spi_default_cfg;
 
 	/* convert len to words based on bits_per_word */
-	data_type = dspi->bytes_per_word[spi->chip_select];
+	data_type = dspi->bytes_per_word[spi_get_chipselect(spi, 0)];
 
 	dspi->tx = t->tx_buf;
 	dspi->rx = t->rx_buf;
diff --git a/drivers/spi/spi-dln2.c b/drivers/spi/spi-dln2.c
index 821bfc6b0dd6..6bd93c47853c 100644
--- a/drivers/spi/spi-dln2.c
+++ b/drivers/spi/spi-dln2.c
@@ -596,12 +596,12 @@ static int dln2_spi_prepare_message(struct spi_master *master,
 	struct dln2_spi *dln2 = spi_master_get_devdata(master);
 	struct spi_device *spi = message->spi;
 
-	if (dln2->cs != spi->chip_select) {
-		ret = dln2_spi_cs_set_one(dln2, spi->chip_select);
+	if (dln2->cs != spi_get_chipselect(spi, 0)) {
+		ret = dln2_spi_cs_set_one(dln2, spi_get_chipselect(spi, 0));
 		if (ret < 0)
 			return ret;
 
-		dln2->cs = spi->chip_select;
+		dln2->cs = spi_get_chipselect(spi, 0);
 	}
 
 	return 0;
diff --git a/drivers/spi/spi-dw-core.c b/drivers/spi/spi-dw-core.c
index c3bfb6c84cab..ae3108c70f50 100644
--- a/drivers/spi/spi-dw-core.c
+++ b/drivers/spi/spi-dw-core.c
@@ -103,7 +103,7 @@ void dw_spi_set_cs(struct spi_device *spi, bool enable)
 	 * support active-high or active-low CS level.
 	 */
 	if (cs_high == enable)
-		dw_writel(dws, DW_SPI_SER, BIT(spi->chip_select));
+		dw_writel(dws, DW_SPI_SER, BIT(spi_get_chipselect(spi, 0)));
 	else
 		dw_writel(dws, DW_SPI_SER, 0);
 }
diff --git a/drivers/spi/spi-dw-mmio.c b/drivers/spi/spi-dw-mmio.c
index 6ae124c30969..328541b5fb52 100644
--- a/drivers/spi/spi-dw-mmio.c
+++ b/drivers/spi/spi-dw-mmio.c
@@ -65,7 +65,7 @@ static void dw_spi_mscc_set_cs(struct spi_device *spi, bool enable)
 	struct dw_spi *dws = spi_master_get_devdata(spi->master);
 	struct dw_spi_mmio *dwsmmio = container_of(dws, struct dw_spi_mmio, dws);
 	struct dw_spi_mscc *dwsmscc = dwsmmio->priv;
-	u32 cs = spi->chip_select;
+	u32 cs = spi_get_chipselect(spi, 0);
 
 	if (cs < 4) {
 		u32 sw_mode = MSCC_SPI_MST_SW_MODE_SW_PIN_CTRL_MODE;
@@ -138,7 +138,7 @@ static void dw_spi_sparx5_set_cs(struct spi_device *spi, bool enable)
 	struct dw_spi *dws = spi_master_get_devdata(spi->master);
 	struct dw_spi_mmio *dwsmmio = container_of(dws, struct dw_spi_mmio, dws);
 	struct dw_spi_mscc *dwsmscc = dwsmmio->priv;
-	u8 cs = spi->chip_select;
+	u8 cs = spi_get_chipselect(spi, 0);
 
 	if (!enable) {
 		/* CS override drive enable */
diff --git a/drivers/spi/spi-falcon.c b/drivers/spi/spi-falcon.c
index a7d4dffac66b..4c103dff0d44 100644
--- a/drivers/spi/spi-falcon.c
+++ b/drivers/spi/spi-falcon.c
@@ -131,7 +131,7 @@ int falcon_sflash_xfer(struct spi_device *spi, struct spi_transfer *t,
 				 * especially alen and dumlen.
 				 */
 
-				priv->sfcmd = ((spi->chip_select
+				priv->sfcmd = ((spi_get_chipselect(spi, 0)
 						<< SFCMD_CS_OFFSET)
 					       & SFCMD_CS_MASK);
 				priv->sfcmd |= SFCMD_KEEP_CS_KEEP_SELECTED;
diff --git a/drivers/spi/spi-fsi.c b/drivers/spi/spi-fsi.c
index cf1e4f9ebd72..ba3b17d7c9ec 100644
--- a/drivers/spi/spi-fsi.c
+++ b/drivers/spi/spi-fsi.c
@@ -425,7 +425,7 @@ static int fsi_spi_transfer_one_message(struct spi_controller *ctlr,
 					struct spi_message *mesg)
 {
 	int rc;
-	u8 seq_slave = SPI_FSI_SEQUENCE_SEL_SLAVE(mesg->spi->chip_select + 1);
+	u8 seq_slave = SPI_FSI_SEQUENCE_SEL_SLAVE(spi_get_chipselect(mesg->spi, 0) + 1);
 	unsigned int len;
 	struct spi_transfer *transfer;
 	struct fsi_spi *ctx = spi_controller_get_devdata(ctlr);
diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 49df00f52ea4..ee42285b5c52 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -902,19 +902,19 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id)
 
 static void dspi_assert_cs(struct spi_device *spi, bool *cs)
 {
-	if (!spi->cs_gpiod || *cs)
+	if (!spi_get_csgpiod(spi, 0) || *cs)
 		return;
 
-	gpiod_set_value_cansleep(spi->cs_gpiod, true);
+	gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), true);
 	*cs = true;
 }
 
 static void dspi_deassert_cs(struct spi_device *spi, bool *cs)
 {
-	if (!spi->cs_gpiod || !*cs)
+	if (!spi_get_csgpiod(spi, 0) || !*cs)
 		return;
 
-	gpiod_set_value_cansleep(spi->cs_gpiod, false);
+	gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), false);
 	*cs = false;
 }
 
@@ -938,8 +938,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 
 		/* Prepare command word for CMD FIFO */
 		dspi->tx_cmd = SPI_PUSHR_CMD_CTAS(0);
-		if (!spi->cs_gpiod)
-			dspi->tx_cmd |= SPI_PUSHR_CMD_PCS(spi->chip_select);
+		if (!spi_get_csgpiod(spi, 0))
+			dspi->tx_cmd |= SPI_PUSHR_CMD_PCS(spi_get_chipselect(spi, 0));
 
 		if (list_is_last(&dspi->cur_transfer->transfer_list,
 				 &dspi->cur_msg->transfers)) {
@@ -1058,7 +1058,7 @@ static int dspi_setup(struct spi_device *spi)
 			chip->ctar_val |= SPI_CTAR_LSBFE;
 	}
 
-	gpiod_direction_output(spi->cs_gpiod, false);
+	gpiod_direction_output(spi_get_csgpiod(spi, 0), false);
 	dspi_deassert_cs(spi, &cs);
 
 	spi_set_ctldata(spi, chip);
@@ -1071,7 +1071,7 @@ static void dspi_cleanup(struct spi_device *spi)
 	struct chip_data *chip = spi_get_ctldata((struct spi_device *)spi);
 
 	dev_dbg(&spi->dev, "spi_device %u.%u cleanup\n",
-		spi->controller->bus_num, spi->chip_select);
+		spi->controller->bus_num, spi_get_chipselect(spi, 0));
 
 	kfree(chip);
 }
diff --git a/drivers/spi/spi-fsl-espi.c b/drivers/spi/spi-fsl-espi.c
index 42a3ed79e7dc..b3d2d3db5850 100644
--- a/drivers/spi/spi-fsl-espi.c
+++ b/drivers/spi/spi-fsl-espi.c
@@ -345,7 +345,7 @@ static void fsl_espi_setup_transfer(struct spi_device *spi,
 
 	/* don't write the mode register if the mode doesn't change */
 	if (cs->hw_mode != hw_mode_old)
-		fsl_espi_write_reg(espi, ESPI_SPMODEx(spi->chip_select),
+		fsl_espi_write_reg(espi, ESPI_SPMODEx(spi_get_chipselect(spi, 0)),
 				   cs->hw_mode);
 }
 
@@ -359,7 +359,7 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
 	reinit_completion(&espi->done);
 
 	/* Set SPCOM[CS] and SPCOM[TRANLEN] field */
-	spcom = SPCOM_CS(spi->chip_select);
+	spcom = SPCOM_CS(spi_get_chipselect(spi, 0));
 	spcom |= SPCOM_TRANLEN(t->len - 1);
 
 	/* configure RXSKIP mode */
@@ -492,7 +492,7 @@ static int fsl_espi_setup(struct spi_device *spi)
 
 	pm_runtime_get_sync(espi->dev);
 
-	cs->hw_mode = fsl_espi_read_reg(espi, ESPI_SPMODEx(spi->chip_select));
+	cs->hw_mode = fsl_espi_read_reg(espi, ESPI_SPMODEx(spi_get_chipselect(spi, 0)));
 	/* mask out bits we are going to set */
 	cs->hw_mode &= ~(CSMODE_CP_BEGIN_EDGECLK | CSMODE_CI_INACTIVEHIGH
 			 | CSMODE_REV);
diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c
index b9e8b7b241a4..f2341ab99556 100644
--- a/drivers/spi/spi-fsl-lpspi.c
+++ b/drivers/spi/spi-fsl-lpspi.c
@@ -425,7 +425,7 @@ static int fsl_lpspi_setup_transfer(struct spi_controller *controller,
 	if (fsl_lpspi->is_only_cs1)
 		fsl_lpspi->config.chip_select = 1;
 	else
-		fsl_lpspi->config.chip_select = spi->chip_select;
+		fsl_lpspi->config.chip_select = spi_get_chipselect(spi, 0);
 
 	if (!fsl_lpspi->config.speed_hz)
 		fsl_lpspi->config.speed_hz = spi->max_speed_hz;
diff --git a/drivers/spi/spi-fsl-qspi.c b/drivers/spi/spi-fsl-qspi.c
index bacc54836959..8ade61e5ebc0 100644
--- a/drivers/spi/spi-fsl-qspi.c
+++ b/drivers/spi/spi-fsl-qspi.c
@@ -528,7 +528,7 @@ static void fsl_qspi_select_mem(struct fsl_qspi *q, struct spi_device *spi)
 	unsigned long rate = spi->max_speed_hz;
 	int ret;
 
-	if (q->selected == spi->chip_select)
+	if (q->selected == spi_get_chipselect(spi, 0))
 		return;
 
 	if (needs_4x_clock(q))
@@ -544,7 +544,7 @@ static void fsl_qspi_select_mem(struct fsl_qspi *q, struct spi_device *spi)
 	if (ret)
 		return;
 
-	q->selected = spi->chip_select;
+	q->selected = spi_get_chipselect(spi, 0);
 
 	fsl_qspi_invalidate(q);
 }
@@ -823,7 +823,7 @@ static const char *fsl_qspi_get_name(struct spi_mem *mem)
 
 	name = devm_kasprintf(dev, GFP_KERNEL,
 			      "%s-%d", dev_name(q->dev),
-			      mem->spi->chip_select);
+			      spi_get_chipselect(mem->spi, 0));
 
 	if (!name) {
 		dev_err(dev, "failed to get memory for custom flash name\n");
diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c
index 725d043488a1..702756c25aa7 100644
--- a/drivers/spi/spi-fsl-spi.c
+++ b/drivers/spi/spi-fsl-spi.c
@@ -503,7 +503,7 @@ static void fsl_spi_grlib_cs_control(struct spi_device *spi, bool on)
 	struct mpc8xxx_spi *mpc8xxx_spi = spi_master_get_devdata(spi->master);
 	struct fsl_spi_reg __iomem *reg_base = mpc8xxx_spi->reg_base;
 	u32 slvsel;
-	u16 cs = spi->chip_select;
+	u16 cs = spi_get_chipselect(spi, 0);
 
 	if (cs < mpc8xxx_spi->native_chipselects) {
 		slvsel = mpc8xxx_spi_read_reg(&reg_base->slvsel);
diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c
index f80635532b4d..ba7be505ec4e 100644
--- a/drivers/spi/spi-geni-qcom.c
+++ b/drivers/spi/spi-geni-qcom.c
@@ -391,9 +391,9 @@ static int setup_fifo_params(struct spi_device *spi_slv,
 			cpha = CPHA;
 
 		if (spi_slv->mode & SPI_CS_HIGH)
-			demux_output_inv = BIT(spi_slv->chip_select);
+			demux_output_inv = BIT(spi_get_chipselect(spi_slv, 0));
 
-		demux_sel = spi_slv->chip_select;
+		demux_sel = spi_get_chipselect(spi_slv, 0);
 		mas->cur_bits_per_word = spi_slv->bits_per_word;
 
 		spi_setup_word_len(mas, spi_slv->mode, spi_slv->bits_per_word);
@@ -469,7 +469,7 @@ static int setup_gsi_xfer(struct spi_transfer *xfer, struct spi_geni_master *mas
 	peripheral.loopback_en = !!(spi_slv->mode & SPI_LOOP);
 	peripheral.clock_pol_high = !!(spi_slv->mode & SPI_CPOL);
 	peripheral.data_pol_high = !!(spi_slv->mode & SPI_CPHA);
-	peripheral.cs = spi_slv->chip_select;
+	peripheral.cs = spi_get_chipselect(spi_slv, 0);
 	peripheral.pack_en = true;
 	peripheral.word_len = xfer->bits_per_word - MIN_WORD_LEN;
 
diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c
index 9c8c7948044e..092afc7679d4 100644
--- a/drivers/spi/spi-gpio.c
+++ b/drivers/spi/spi-gpio.c
@@ -230,7 +230,7 @@ static void spi_gpio_chipselect(struct spi_device *spi, int is_active)
 
 	/* Drive chip select line, if we have one */
 	if (spi_gpio->cs_gpios) {
-		struct gpio_desc *cs = spi_gpio->cs_gpios[spi->chip_select];
+		struct gpio_desc *cs = spi_gpio->cs_gpios[spi_get_chipselect(spi, 0)];
 
 		/* SPI chip selects are normally active-low */
 		gpiod_set_value_cansleep(cs, (spi->mode & SPI_CS_HIGH) ? is_active : !is_active);
@@ -248,7 +248,7 @@ static int spi_gpio_setup(struct spi_device *spi)
 	 * initialized from the descriptor lookup.
 	 */
 	if (spi_gpio->cs_gpios) {
-		cs = spi_gpio->cs_gpios[spi->chip_select];
+		cs = spi_gpio->cs_gpios[spi_get_chipselect(spi, 0)];
 		if (!spi->controller_state && cs)
 			status = gpiod_direction_output(cs,
 						  !(spi->mode & SPI_CS_HIGH));
diff --git a/drivers/spi/spi-gxp.c b/drivers/spi/spi-gxp.c
index c900c2f39b57..684d63f402f3 100644
--- a/drivers/spi/spi-gxp.c
+++ b/drivers/spi/spi-gxp.c
@@ -201,7 +201,7 @@ static ssize_t gxp_spi_write(struct gxp_spi_chip *chip, const struct spi_mem_op
 static int do_gxp_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct gxp_spi *spifi = spi_controller_get_devdata(mem->spi->master);
-	struct gxp_spi_chip *chip = &spifi->chips[mem->spi->chip_select];
+	struct gxp_spi_chip *chip = &spifi->chips[spi_get_chipselect(mem->spi, 0)];
 	int ret;
 
 	if (op->data.dir == SPI_MEM_DATA_IN) {
@@ -237,7 +237,7 @@ static const struct spi_controller_mem_ops gxp_spi_mem_ops = {
 static int gxp_spi_setup(struct spi_device *spi)
 {
 	struct gxp_spi *spifi = spi_controller_get_devdata(spi->master);
-	unsigned int cs = spi->chip_select;
+	unsigned int cs = spi_get_chipselect(spi, 0);
 	struct gxp_spi_chip *chip = &spifi->chips[cs];
 
 	chip->spifi = spifi;
diff --git a/drivers/spi/spi-hisi-sfc-v3xx.c b/drivers/spi/spi-hisi-sfc-v3xx.c
index f07d1045a30a..7cbcb065bb44 100644
--- a/drivers/spi/spi-hisi-sfc-v3xx.c
+++ b/drivers/spi/spi-hisi-sfc-v3xx.c
@@ -361,7 +361,7 @@ static int hisi_sfc_v3xx_exec_op(struct spi_mem *mem,
 {
 	struct hisi_sfc_v3xx_host *host;
 	struct spi_device *spi = mem->spi;
-	u8 chip_select = spi->chip_select;
+	u8 chip_select = spi_get_chipselect(spi, 0);
 
 	host = spi_controller_get_devdata(spi->master);
 
diff --git a/drivers/spi/spi-img-spfi.c b/drivers/spi/spi-img-spfi.c
index c64e4fd3fdf0..d775f87770e3 100644
--- a/drivers/spi/spi-img-spfi.c
+++ b/drivers/spi/spi-img-spfi.c
@@ -413,15 +413,15 @@ static int img_spfi_prepare(struct spi_master *master, struct spi_message *msg)
 	val = spfi_readl(spfi, SPFI_PORT_STATE);
 	val &= ~(SPFI_PORT_STATE_DEV_SEL_MASK <<
 		 SPFI_PORT_STATE_DEV_SEL_SHIFT);
-	val |= msg->spi->chip_select << SPFI_PORT_STATE_DEV_SEL_SHIFT;
+	val |= spi_get_chipselect(msg->spi, 0) << SPFI_PORT_STATE_DEV_SEL_SHIFT;
 	if (msg->spi->mode & SPI_CPHA)
-		val |= SPFI_PORT_STATE_CK_PHASE(msg->spi->chip_select);
+		val |= SPFI_PORT_STATE_CK_PHASE(spi_get_chipselect(msg->spi, 0));
 	else
-		val &= ~SPFI_PORT_STATE_CK_PHASE(msg->spi->chip_select);
+		val &= ~SPFI_PORT_STATE_CK_PHASE(spi_get_chipselect(msg->spi, 0));
 	if (msg->spi->mode & SPI_CPOL)
-		val |= SPFI_PORT_STATE_CK_POL(msg->spi->chip_select);
+		val |= SPFI_PORT_STATE_CK_POL(spi_get_chipselect(msg->spi, 0));
 	else
-		val &= ~SPFI_PORT_STATE_CK_POL(msg->spi->chip_select);
+		val &= ~SPFI_PORT_STATE_CK_POL(spi_get_chipselect(msg->spi, 0));
 	spfi_writel(spfi, val, SPFI_PORT_STATE);
 
 	return 0;
@@ -450,11 +450,11 @@ static void img_spfi_config(struct spi_master *master, struct spi_device *spi,
 	div = DIV_ROUND_UP(clk_get_rate(spfi->spfi_clk), xfer->speed_hz);
 	div = clamp(512 / (1 << get_count_order(div)), 1, 128);
 
-	val = spfi_readl(spfi, SPFI_DEVICE_PARAMETER(spi->chip_select));
+	val = spfi_readl(spfi, SPFI_DEVICE_PARAMETER(spi_get_chipselect(spi, 0)));
 	val &= ~(SPFI_DEVICE_PARAMETER_BITCLK_MASK <<
 		 SPFI_DEVICE_PARAMETER_BITCLK_SHIFT);
 	val |= div << SPFI_DEVICE_PARAMETER_BITCLK_SHIFT;
-	spfi_writel(spfi, val, SPFI_DEVICE_PARAMETER(spi->chip_select));
+	spfi_writel(spfi, val, SPFI_DEVICE_PARAMETER(spi_get_chipselect(spi, 0)));
 
 	spfi_writel(spfi, xfer->len << SPFI_TRANSACTION_TSIZE_SHIFT,
 		    SPFI_TRANSACTION);
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index e4ccd0c329d0..620bce96b1f9 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -528,7 +528,7 @@ static int mx51_ecspi_prepare_message(struct spi_imx_data *spi_imx,
 		ctrl |= MX51_ECSPI_CTRL_DRCTL(spi_imx->spi_drctl);
 
 	/* set chip select to use */
-	ctrl |= MX51_ECSPI_CTRL_CS(spi->chip_select);
+	ctrl |= MX51_ECSPI_CTRL_CS(spi_get_chipselect(spi, 0));
 
 	/*
 	 * The ctrl register must be written first, with the EN bit set other
@@ -549,22 +549,22 @@ static int mx51_ecspi_prepare_message(struct spi_imx_data *spi_imx,
 	 * BURST_LENGTH + 1 bits are received
 	 */
 	if (spi_imx->slave_mode && is_imx53_ecspi(spi_imx))
-		cfg &= ~MX51_ECSPI_CONFIG_SBBCTRL(spi->chip_select);
+		cfg &= ~MX51_ECSPI_CONFIG_SBBCTRL(spi_get_chipselect(spi, 0));
 	else
-		cfg |= MX51_ECSPI_CONFIG_SBBCTRL(spi->chip_select);
+		cfg |= MX51_ECSPI_CONFIG_SBBCTRL(spi_get_chipselect(spi, 0));
 
 	if (spi->mode & SPI_CPOL) {
-		cfg |= MX51_ECSPI_CONFIG_SCLKPOL(spi->chip_select);
-		cfg |= MX51_ECSPI_CONFIG_SCLKCTL(spi->chip_select);
+		cfg |= MX51_ECSPI_CONFIG_SCLKPOL(spi_get_chipselect(spi, 0));
+		cfg |= MX51_ECSPI_CONFIG_SCLKCTL(spi_get_chipselect(spi, 0));
 	} else {
-		cfg &= ~MX51_ECSPI_CONFIG_SCLKPOL(spi->chip_select);
-		cfg &= ~MX51_ECSPI_CONFIG_SCLKCTL(spi->chip_select);
+		cfg &= ~MX51_ECSPI_CONFIG_SCLKPOL(spi_get_chipselect(spi, 0));
+		cfg &= ~MX51_ECSPI_CONFIG_SCLKCTL(spi_get_chipselect(spi, 0));
 	}
 
 	if (spi->mode & SPI_CS_HIGH)
-		cfg |= MX51_ECSPI_CONFIG_SSBPOL(spi->chip_select);
+		cfg |= MX51_ECSPI_CONFIG_SSBPOL(spi_get_chipselect(spi, 0));
 	else
-		cfg &= ~MX51_ECSPI_CONFIG_SSBPOL(spi->chip_select);
+		cfg &= ~MX51_ECSPI_CONFIG_SSBPOL(spi_get_chipselect(spi, 0));
 
 	if (cfg == current_cfg)
 		return 0;
@@ -614,9 +614,9 @@ static void mx51_configure_cpha(struct spi_imx_data *spi_imx,
 	cpha ^= flip_cpha;
 
 	if (cpha)
-		cfg |= MX51_ECSPI_CONFIG_SCLKPHA(spi->chip_select);
+		cfg |= MX51_ECSPI_CONFIG_SCLKPHA(spi_get_chipselect(spi, 0));
 	else
-		cfg &= ~MX51_ECSPI_CONFIG_SCLKPHA(spi->chip_select);
+		cfg &= ~MX51_ECSPI_CONFIG_SCLKPHA(spi_get_chipselect(spi, 0));
 
 	writel(cfg, spi_imx->base + MX51_ECSPI_CONFIG);
 }
@@ -768,8 +768,8 @@ static int mx31_prepare_transfer(struct spi_imx_data *spi_imx,
 		reg |= MX31_CSPICTRL_POL;
 	if (spi->mode & SPI_CS_HIGH)
 		reg |= MX31_CSPICTRL_SSPOL;
-	if (!spi->cs_gpiod)
-		reg |= (spi->chip_select) <<
+	if (!spi_get_csgpiod(spi, 0))
+		reg |= (spi_get_chipselect(spi, 0)) <<
 			(is_imx35_cspi(spi_imx) ? MX35_CSPICTRL_CS_SHIFT :
 						  MX31_CSPICTRL_CS_SHIFT);
 
@@ -868,8 +868,8 @@ static int mx21_prepare_transfer(struct spi_imx_data *spi_imx,
 		reg |= MX21_CSPICTRL_POL;
 	if (spi->mode & SPI_CS_HIGH)
 		reg |= MX21_CSPICTRL_SSPOL;
-	if (!spi->cs_gpiod)
-		reg |= spi->chip_select << MX21_CSPICTRL_CS_SHIFT;
+	if (!spi_get_csgpiod(spi, 0))
+		reg |= spi_get_chipselect(spi, 0) << MX21_CSPICTRL_CS_SHIFT;
 
 	writel(reg, spi_imx->base + MXC_CSPICTRL);
 
diff --git a/drivers/spi/spi-ingenic.c b/drivers/spi/spi-ingenic.c
index 713a238bee63..7d4b515a160d 100644
--- a/drivers/spi/spi-ingenic.c
+++ b/drivers/spi/spi-ingenic.c
@@ -263,7 +263,7 @@ static int spi_ingenic_prepare_message(struct spi_controller *ctlr,
 {
 	struct ingenic_spi *priv = spi_controller_get_devdata(ctlr);
 	struct spi_device *spi = message->spi;
-	unsigned int cs = REG_SSICR1_FRMHL << spi->chip_select;
+	unsigned int cs = REG_SSICR1_FRMHL << spi_get_chipselect(spi, 0);
 	unsigned int ssicr0_mask = REG_SSICR0_LOOP | REG_SSICR0_FSEL;
 	unsigned int ssicr1_mask = REG_SSICR1_PHA | REG_SSICR1_POL | cs;
 	unsigned int ssicr0 = 0, ssicr1 = 0;
@@ -282,7 +282,7 @@ static int spi_ingenic_prepare_message(struct spi_controller *ctlr,
 
 	if (spi->mode & SPI_LOOP)
 		ssicr0 |= REG_SSICR0_LOOP;
-	if (spi->chip_select)
+	if (spi_get_chipselect(spi, 0))
 		ssicr0 |= REG_SSICR0_FSEL;
 
 	if (spi->mode & SPI_CPHA)
diff --git a/drivers/spi/spi-intel.c b/drivers/spi/spi-intel.c
index f4679868c49f..bc6d22149e7e 100644
--- a/drivers/spi/spi-intel.c
+++ b/drivers/spi/spi-intel.c
@@ -451,7 +451,7 @@ static u32 intel_spi_chip_addr(const struct intel_spi *ispi,
 	/* Pick up the correct start address */
 	if (!mem)
 		return 0;
-	return mem->spi->chip_select == 1 ? ispi->chip0_size : 0;
+	return (spi_get_chipselect(mem->spi, 0) == 1) ? ispi->chip0_size : 0;
 }
 
 static int intel_spi_read_reg(struct intel_spi *ispi, const struct spi_mem *mem,
diff --git a/drivers/spi/spi-jcore.c b/drivers/spi/spi-jcore.c
index 74c8319c29f1..c42a3358e8c9 100644
--- a/drivers/spi/spi-jcore.c
+++ b/drivers/spi/spi-jcore.c
@@ -68,9 +68,9 @@ static void jcore_spi_program(struct jcore_spi *hw)
 static void jcore_spi_chipsel(struct spi_device *spi, bool value)
 {
 	struct jcore_spi *hw = spi_master_get_devdata(spi->master);
-	u32 csbit = 1U << (2 * spi->chip_select);
+	u32 csbit = 1U << (2 * spi_get_chipselect(spi, 0));
 
-	dev_dbg(hw->master->dev.parent, "chipselect %d\n", spi->chip_select);
+	dev_dbg(hw->master->dev.parent, "chipselect %d\n", spi_get_chipselect(spi, 0));
 
 	if (value)
 		hw->cs_reg |= csbit;
diff --git a/drivers/spi/spi-lantiq-ssc.c b/drivers/spi/spi-lantiq-ssc.c
index 76cf2a66f874..8d6ecc5d6f70 100644
--- a/drivers/spi/spi-lantiq-ssc.c
+++ b/drivers/spi/spi-lantiq-ssc.c
@@ -388,11 +388,11 @@ static int lantiq_ssc_setup(struct spi_device *spidev)
 {
 	struct spi_master *master = spidev->master;
 	struct lantiq_ssc_spi *spi = spi_master_get_devdata(master);
-	unsigned int cs = spidev->chip_select;
+	unsigned int cs = spi_get_chipselect(spidev, 0);
 	u32 gpocon;
 
 	/* GPIOs are used for CS */
-	if (spidev->cs_gpiod)
+	if (spi_get_csgpiod(spidev, 0))
 		return 0;
 
 	dev_dbg(spi->dev, "using internal chipselect %u\n", cs);
@@ -796,7 +796,7 @@ static void lantiq_ssc_handle_err(struct spi_master *master,
 static void lantiq_ssc_set_cs(struct spi_device *spidev, bool enable)
 {
 	struct lantiq_ssc_spi *spi = spi_master_get_devdata(spidev->master);
-	unsigned int cs = spidev->chip_select;
+	unsigned int cs = spi_get_chipselect(spidev, 0);
 	u32 fgpo;
 
 	if (!!(spidev->mode & SPI_CS_HIGH) == enable)
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index 701838b6f0c4..edd7430d4c05 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -325,7 +325,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 	if (!spi_mem_internal_supports_op(mem, op))
 		return -ENOTSUPP;
 
-	if (ctlr->mem_ops && ctlr->mem_ops->exec_op && !mem->spi->cs_gpiod) {
+	if (ctlr->mem_ops && ctlr->mem_ops->exec_op && !spi_get_csgpiod(mem->spi, 0)) {
 		ret = spi_mem_access_start(mem);
 		if (ret)
 			return ret;
@@ -808,7 +808,7 @@ int spi_mem_poll_status(struct spi_mem *mem,
 	    op->data.dir != SPI_MEM_DATA_IN)
 		return -EINVAL;
 
-	if (ctlr->mem_ops && ctlr->mem_ops->poll_status && !mem->spi->cs_gpiod) {
+	if (ctlr->mem_ops && ctlr->mem_ops->poll_status && !spi_get_csgpiod(mem->spi, 0)) {
 		ret = spi_mem_access_start(mem);
 		if (ret)
 			return ret;
diff --git a/drivers/spi/spi-meson-spicc.c b/drivers/spi/spi-meson-spicc.c
index b9f812837cd6..141562c882f1 100644
--- a/drivers/spi/spi-meson-spicc.c
+++ b/drivers/spi/spi-meson-spicc.c
@@ -505,7 +505,7 @@ static int meson_spicc_prepare_message(struct spi_master *master,
 		conf |= FIELD_PREP(SPICC_DRCTL_MASK, SPICC_DRCTL_IGNORE);
 
 	/* Select CS */
-	conf |= FIELD_PREP(SPICC_CS_MASK, spi->chip_select);
+	conf |= FIELD_PREP(SPICC_CS_MASK, spi_get_chipselect(spi, 0));
 
 	/* Default 8bit word */
 	conf |= FIELD_PREP(SPICC_BITLENGTH_MASK, 8 - 1);
diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index e6cf6ff08061..b59e8a0c5b97 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -247,8 +247,8 @@ static void mchp_corespi_set_cs(struct spi_device *spi, bool disable)
 	struct mchp_corespi *corespi = spi_master_get_devdata(spi->master);
 
 	reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT);
-	reg &= ~BIT(spi->chip_select);
-	reg |= !disable << spi->chip_select;
+	reg &= ~BIT(spi_get_chipselect(spi, 0));
+	reg |= !disable << spi_get_chipselect(spi, 0);
 
 	mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg);
 }
@@ -265,7 +265,7 @@ static int mchp_corespi_setup(struct spi_device *spi)
 	 */
 	if (spi->mode & SPI_CS_HIGH) {
 		reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT);
-		reg |= BIT(spi->chip_select);
+		reg |= BIT(spi_get_chipselect(spi, 0));
 		mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg);
 	}
 	return 0;
diff --git a/drivers/spi/spi-mpc512x-psc.c b/drivers/spi/spi-mpc512x-psc.c
index 0b86f1804682..99aeef28a477 100644
--- a/drivers/spi/spi-mpc512x-psc.c
+++ b/drivers/spi/spi-mpc512x-psc.c
@@ -121,17 +121,17 @@ static void mpc512x_psc_spi_activate_cs(struct spi_device *spi)
 	out_be32(psc_addr(mps, ccr), ccr);
 	mps->bits_per_word = cs->bits_per_word;
 
-	if (spi->cs_gpiod) {
+	if (spi_get_csgpiod(spi, 0)) {
 		/* gpiolib will deal with the inversion */
-		gpiod_set_value(spi->cs_gpiod, 1);
+		gpiod_set_value(spi_get_csgpiod(spi, 0), 1);
 	}
 }
 
 static void mpc512x_psc_spi_deactivate_cs(struct spi_device *spi)
 {
-	if (spi->cs_gpiod) {
+	if (spi_get_csgpiod(spi, 0)) {
 		/* gpiolib will deal with the inversion */
-		gpiod_set_value(spi->cs_gpiod, 0);
+		gpiod_set_value(spi_get_csgpiod(spi, 0), 0);
 	}
 }
 
diff --git a/drivers/spi/spi-mpc52xx.c b/drivers/spi/spi-mpc52xx.c
index b652fb196622..ab7df5f64342 100644
--- a/drivers/spi/spi-mpc52xx.c
+++ b/drivers/spi/spi-mpc52xx.c
@@ -101,7 +101,7 @@ static void mpc52xx_spi_chipsel(struct mpc52xx_spi *ms, int value)
 	int cs;
 
 	if (ms->gpio_cs_count > 0) {
-		cs = ms->message->spi->chip_select;
+		cs = spi_get_chipselect(ms->message->spi, 0);
 		gpiod_set_value(ms->gpio_cs[cs], value);
 	} else {
 		out_8(ms->regs + SPI_PORTDATA, value ? 0 : 0x08);
diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c
index 9eab6c20dbc5..21c321f43766 100644
--- a/drivers/spi/spi-mt65xx.c
+++ b/drivers/spi/spi-mt65xx.c
@@ -421,7 +421,7 @@ static int mtk_spi_hw_init(struct spi_master *master,
 
 	/* pad select */
 	if (mdata->dev_comp->need_pad_sel)
-		writel(mdata->pad_sel[spi->chip_select],
+		writel(mdata->pad_sel[spi_get_chipselect(spi, 0)],
 		       mdata->base + SPI_PAD_SEL_REG);
 
 	/* tick delay */
@@ -735,9 +735,9 @@ static int mtk_spi_setup(struct spi_device *spi)
 	if (!spi->controller_data)
 		spi->controller_data = (void *)&mtk_default_chip_info;
 
-	if (mdata->dev_comp->need_pad_sel && spi->cs_gpiod)
+	if (mdata->dev_comp->need_pad_sel && spi_get_csgpiod(spi, 0))
 		/* CS de-asserted, gpiolib will handle inversion */
-		gpiod_direction_output(spi->cs_gpiod, 0);
+		gpiod_direction_output(spi_get_csgpiod(spi, 0), 0);
 
 	return 0;
 }
diff --git a/drivers/spi/spi-mt7621.c b/drivers/spi/spi-mt7621.c
index c4cc8e2f85e2..3e9d396b33bd 100644
--- a/drivers/spi/spi-mt7621.c
+++ b/drivers/spi/spi-mt7621.c
@@ -76,7 +76,7 @@ static inline void mt7621_spi_write(struct mt7621_spi *rs, u32 reg, u32 val)
 static void mt7621_spi_set_cs(struct spi_device *spi, int enable)
 {
 	struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
-	int cs = spi->chip_select;
+	int cs = spi_get_chipselect(spi, 0);
 	u32 polar = 0;
 	u32 master;
 
diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c
index 0709e987bd5a..fa8c1f740c70 100644
--- a/drivers/spi/spi-mux.c
+++ b/drivers/spi/spi-mux.c
@@ -51,22 +51,22 @@ static int spi_mux_select(struct spi_device *spi)
 	struct spi_mux_priv *priv = spi_controller_get_devdata(spi->controller);
 	int ret;
 
-	ret = mux_control_select(priv->mux, spi->chip_select);
+	ret = mux_control_select(priv->mux, spi_get_chipselect(spi, 0));
 	if (ret)
 		return ret;
 
-	if (priv->current_cs == spi->chip_select)
+	if (priv->current_cs == spi_get_chipselect(spi, 0))
 		return 0;
 
 	dev_dbg(&priv->spi->dev, "setting up the mux for cs %d\n",
-		spi->chip_select);
+		spi_get_chipselect(spi, 0));
 
 	/* copy the child device's settings except for the cs */
 	priv->spi->max_speed_hz = spi->max_speed_hz;
 	priv->spi->mode = spi->mode;
 	priv->spi->bits_per_word = spi->bits_per_word;
 
-	priv->current_cs = spi->chip_select;
+	priv->current_cs = spi_get_chipselect(spi, 0);
 
 	return 0;
 }
diff --git a/drivers/spi/spi-mxic.c b/drivers/spi/spi-mxic.c
index 10727ea53043..00617fd4b2c3 100644
--- a/drivers/spi/spi-mxic.c
+++ b/drivers/spi/spi-mxic.c
@@ -306,8 +306,8 @@ static u32 mxic_spi_prep_hc_cfg(struct spi_device *spi, u32 flags)
 		nio = 2;
 
 	return flags | HC_CFG_NIO(nio) |
-	       HC_CFG_TYPE(spi->chip_select, HC_CFG_TYPE_SPI_NOR) |
-	       HC_CFG_SLV_ACT(spi->chip_select) | HC_CFG_IDLE_SIO_LVL(1);
+	       HC_CFG_TYPE(spi_get_chipselect(spi, 0), HC_CFG_TYPE_SPI_NOR) |
+	       HC_CFG_SLV_ACT(spi_get_chipselect(spi, 0)) | HC_CFG_IDLE_SIO_LVL(1);
 }
 
 static u32 mxic_spi_mem_prep_op_cfg(const struct spi_mem_op *op,
@@ -405,7 +405,7 @@ static ssize_t mxic_spi_mem_dirmap_read(struct spi_mem_dirmap_desc *desc,
 	len = min_t(size_t, len, mxic->linear.size);
 	writel(len, mxic->regs + LRD_RANGE);
 	writel(LMODE_CMD0(desc->info.op_tmpl.cmd.opcode) |
-	       LMODE_SLV_ACT(desc->mem->spi->chip_select) |
+	       LMODE_SLV_ACT(spi_get_chipselect(desc->mem->spi, 0)) |
 	       LMODE_EN,
 	       mxic->regs + LRD_CTRL);
 
@@ -449,7 +449,7 @@ static ssize_t mxic_spi_mem_dirmap_write(struct spi_mem_dirmap_desc *desc,
 	len = min_t(size_t, len, mxic->linear.size);
 	writel(len, mxic->regs + LWR_RANGE);
 	writel(LMODE_CMD0(desc->info.op_tmpl.cmd.opcode) |
-	       LMODE_SLV_ACT(desc->mem->spi->chip_select) |
+	       LMODE_SLV_ACT(spi_get_chipselect(desc->mem->spi, 0)) |
 	       LMODE_EN,
 	       mxic->regs + LWR_CTRL);
 
@@ -524,7 +524,7 @@ static int mxic_spi_mem_exec_op(struct spi_mem *mem,
 	writel(HC_EN_BIT, mxic->regs + HC_EN);
 
 	writel(mxic_spi_mem_prep_op_cfg(op, op->data.nbytes),
-	       mxic->regs + SS_CTRL(mem->spi->chip_select));
+	       mxic->regs + SS_CTRL(spi_get_chipselect(mem->spi, 0)));
 
 	writel(readl(mxic->regs + HC_CFG) | HC_CFG_MAN_CS_ASSERT,
 	       mxic->regs + HC_CFG);
diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c
index 10fb31a5e409..963a53dd680b 100644
--- a/drivers/spi/spi-mxs.c
+++ b/drivers/spi/spi-mxs.c
@@ -369,7 +369,7 @@ static int mxs_spi_transfer_one(struct spi_master *master,
 	/* Program CS register bits here, it will be used for all transfers. */
 	writel(BM_SSP_CTRL0_WAIT_FOR_CMD | BM_SSP_CTRL0_WAIT_FOR_IRQ,
 	       ssp->base + HW_SSP_CTRL0 + STMP_OFFSET_REG_CLR);
-	writel(mxs_spi_cs_to_reg(m->spi->chip_select),
+	writel(mxs_spi_cs_to_reg(spi_get_chipselect(m->spi, 0)),
 	       ssp->base + HW_SSP_CTRL0 + STMP_OFFSET_REG_SET);
 
 	list_for_each_entry(t, &m->transfers, transfer_list) {
diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c
index 8d7698d167ef..eb353561509a 100644
--- a/drivers/spi/spi-npcm-fiu.c
+++ b/drivers/spi/spi-npcm-fiu.c
@@ -288,7 +288,7 @@ static ssize_t npcm_fiu_direct_read(struct spi_mem_dirmap_desc *desc,
 {
 	struct npcm_fiu_spi *fiu =
 		spi_controller_get_devdata(desc->mem->spi->master);
-	struct npcm_fiu_chip *chip = &fiu->chip[desc->mem->spi->chip_select];
+	struct npcm_fiu_chip *chip = &fiu->chip[spi_get_chipselect(desc->mem->spi, 0)];
 	void __iomem *src = (void __iomem *)(chip->flash_region_mapped_ptr +
 					     offs);
 	u8 *buf_rx = buf;
@@ -315,7 +315,7 @@ static ssize_t npcm_fiu_direct_write(struct spi_mem_dirmap_desc *desc,
 {
 	struct npcm_fiu_spi *fiu =
 		spi_controller_get_devdata(desc->mem->spi->master);
-	struct npcm_fiu_chip *chip = &fiu->chip[desc->mem->spi->chip_select];
+	struct npcm_fiu_chip *chip = &fiu->chip[spi_get_chipselect(desc->mem->spi, 0)];
 	void __iomem *dst = (void __iomem *)(chip->flash_region_mapped_ptr +
 					     offs);
 	const u8 *buf_tx = buf;
@@ -344,7 +344,7 @@ static int npcm_fiu_uma_read(struct spi_mem *mem,
 
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CTS,
 			   NPCM_FIU_UMA_CTS_DEV_NUM,
-			   (mem->spi->chip_select <<
+			   (spi_get_chipselect(mem->spi, 0) <<
 			    NPCM_FIU_UMA_CTS_DEV_NUM_SHIFT));
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CMD,
 			   NPCM_FIU_UMA_CMD_CMD, op->cmd.opcode);
@@ -398,7 +398,7 @@ static int npcm_fiu_uma_write(struct spi_mem *mem,
 
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CTS,
 			   NPCM_FIU_UMA_CTS_DEV_NUM,
-			   (mem->spi->chip_select <<
+			   (spi_get_chipselect(mem->spi, 0) <<
 			    NPCM_FIU_UMA_CTS_DEV_NUM_SHIFT));
 
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CMD,
@@ -451,7 +451,7 @@ static int npcm_fiu_manualwrite(struct spi_mem *mem,
 
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CTS,
 			   NPCM_FIU_UMA_CTS_DEV_NUM,
-			   (mem->spi->chip_select <<
+			   (spi_get_chipselect(mem->spi, 0) <<
 			    NPCM_FIU_UMA_CTS_DEV_NUM_SHIFT));
 	regmap_update_bits(fiu->regmap, NPCM_FIU_UMA_CTS,
 			   NPCM_FIU_UMA_CTS_SW_CS, 0);
@@ -545,7 +545,7 @@ static int npcm_fiu_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct npcm_fiu_spi *fiu =
 		spi_controller_get_devdata(mem->spi->master);
-	struct npcm_fiu_chip *chip = &fiu->chip[mem->spi->chip_select];
+	struct npcm_fiu_chip *chip = &fiu->chip[spi_get_chipselect(mem->spi, 0)];
 	int ret = 0;
 	u8 *buf;
 
@@ -605,7 +605,7 @@ static int npcm_fiu_dirmap_create(struct spi_mem_dirmap_desc *desc)
 {
 	struct npcm_fiu_spi *fiu =
 		spi_controller_get_devdata(desc->mem->spi->master);
-	struct npcm_fiu_chip *chip = &fiu->chip[desc->mem->spi->chip_select];
+	struct npcm_fiu_chip *chip = &fiu->chip[spi_get_chipselect(desc->mem->spi, 0)];
 	struct regmap *gcr_regmap;
 
 	if (!fiu->res_mem) {
@@ -624,7 +624,7 @@ static int npcm_fiu_dirmap_create(struct spi_mem_dirmap_desc *desc)
 		chip->flash_region_mapped_ptr =
 			devm_ioremap(fiu->dev, (fiu->res_mem->start +
 							(fiu->info->max_map_size *
-						    desc->mem->spi->chip_select)),
+						    spi_get_chipselect(desc->mem->spi, 0))),
 					     (u32)desc->info.length);
 		if (!chip->flash_region_mapped_ptr) {
 			dev_warn(fiu->dev, "Error mapping memory region, direct read disabled\n");
@@ -669,9 +669,9 @@ static int npcm_fiu_setup(struct spi_device *spi)
 	struct npcm_fiu_spi *fiu = spi_controller_get_devdata(ctrl);
 	struct npcm_fiu_chip *chip;
 
-	chip = &fiu->chip[spi->chip_select];
+	chip = &fiu->chip[spi_get_chipselect(spi, 0)];
 	chip->fiu = fiu;
-	chip->chipselect = spi->chip_select;
+	chip->chipselect = spi_get_chipselect(spi, 0);
 	chip->clkrate = spi->max_speed_hz;
 
 	fiu->clkrate = clk_get_rate(fiu->clk);
diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c
index 71be1ec3fbde..76168cc1e00d 100644
--- a/drivers/spi/spi-nxp-fspi.c
+++ b/drivers/spi/spi-nxp-fspi.c
@@ -663,7 +663,7 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi)
 	 * Return, if previously selected slave device is same as current
 	 * requested slave device.
 	 */
-	if (f->selected == spi->chip_select)
+	if (f->selected == spi_get_chipselect(spi, 0))
 		return;
 
 	/* Reset FLSHxxCR0 registers */
@@ -676,9 +676,9 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi)
 	size_kb = FSPI_FLSHXCR0_SZ(f->memmap_phy_size);
 
 	fspi_writel(f, size_kb, f->iobase + FSPI_FLSHA1CR0 +
-		    4 * spi->chip_select);
+		    4 * spi_get_chipselect(spi, 0));
 
-	dev_dbg(f->dev, "Slave device [CS:%x] selected\n", spi->chip_select);
+	dev_dbg(f->dev, "Slave device [CS:%x] selected\n", spi_get_chipselect(spi, 0));
 
 	nxp_fspi_clk_disable_unprep(f);
 
@@ -690,7 +690,7 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi)
 	if (ret)
 		return;
 
-	f->selected = spi->chip_select;
+	f->selected = spi_get_chipselect(spi, 0);
 }
 
 static int nxp_fspi_read_ahb(struct nxp_fspi *f, const struct spi_mem_op *op)
@@ -1055,7 +1055,7 @@ static const char *nxp_fspi_get_name(struct spi_mem *mem)
 
 	name = devm_kasprintf(dev, GFP_KERNEL,
 			      "%s-%d", dev_name(f->dev),
-			      mem->spi->chip_select);
+			      spi_get_chipselect(mem->spi, 0));
 
 	if (!name) {
 		dev_err(dev, "failed to get memory for custom flash name\n");
diff --git a/drivers/spi/spi-omap-uwire.c b/drivers/spi/spi-omap-uwire.c
index 6da77de19e2b..902d2e0c1f2f 100644
--- a/drivers/spi/spi-omap-uwire.c
+++ b/drivers/spi/spi-omap-uwire.c
@@ -179,7 +179,7 @@ static void uwire_chipselect(struct spi_device *spi, int value)
 
 	w = uwire_read_reg(UWIRE_CSR);
 	old_cs = (w >> 10) & 0x03;
-	if (value == BITBANG_CS_INACTIVE || old_cs != spi->chip_select) {
+	if (value == BITBANG_CS_INACTIVE || old_cs != spi_get_chipselect(spi, 0)) {
 		/* Deselect this CS, or the previous CS */
 		w &= ~CS_CMD;
 		uwire_write_reg(UWIRE_CSR, w);
@@ -193,7 +193,7 @@ static void uwire_chipselect(struct spi_device *spi, int value)
 		else
 			uwire_write_reg(UWIRE_SR4, 0);
 
-		w = spi->chip_select << 10;
+		w = spi_get_chipselect(spi, 0) << 10;
 		w |= CS_CMD;
 		uwire_write_reg(UWIRE_CSR, w);
 	}
@@ -210,7 +210,7 @@ static int uwire_txrx(struct spi_device *spi, struct spi_transfer *t)
 	if (!t->tx_buf && !t->rx_buf)
 		return 0;
 
-	w = spi->chip_select << 10;
+	w = spi_get_chipselect(spi, 0) << 10;
 	w |= CS_CMD;
 
 	if (t->tx_buf) {
@@ -408,7 +408,7 @@ static int uwire_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 		rate /= 8;
 		break;
 	}
-	omap_uwire_configure_mode(spi->chip_select, flags);
+	omap_uwire_configure_mode(spi_get_chipselect(spi, 0), flags);
 	pr_debug("%s: uwire flags %02x, armxor %lu KHz, SCK %lu KHz\n",
 			__func__, flags,
 			clk_get_rate(uwire->ck) / 1000,
diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index ce3cdd540420..1a967930f4e5 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -379,7 +379,7 @@ static void omap2_mcspi_rx_callback(void *data)
 {
 	struct spi_device *spi = data;
 	struct omap2_mcspi *mcspi = spi_master_get_devdata(spi->master);
-	struct omap2_mcspi_dma *mcspi_dma = &mcspi->dma_channels[spi->chip_select];
+	struct omap2_mcspi_dma *mcspi_dma = &mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	/* We must disable the DMA RX request */
 	omap2_mcspi_set_dma_req(spi, 1, 0);
@@ -391,7 +391,7 @@ static void omap2_mcspi_tx_callback(void *data)
 {
 	struct spi_device *spi = data;
 	struct omap2_mcspi *mcspi = spi_master_get_devdata(spi->master);
-	struct omap2_mcspi_dma *mcspi_dma = &mcspi->dma_channels[spi->chip_select];
+	struct omap2_mcspi_dma *mcspi_dma = &mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	/* We must disable the DMA TX request */
 	omap2_mcspi_set_dma_req(spi, 0, 0);
@@ -408,7 +408,7 @@ static void omap2_mcspi_tx_dma(struct spi_device *spi,
 	struct dma_async_tx_descriptor *tx;
 
 	mcspi = spi_master_get_devdata(spi->master);
-	mcspi_dma = &mcspi->dma_channels[spi->chip_select];
+	mcspi_dma = &mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	dmaengine_slave_config(mcspi_dma->dma_tx, &cfg);
 
@@ -446,7 +446,7 @@ omap2_mcspi_rx_dma(struct spi_device *spi, struct spi_transfer *xfer,
 	struct dma_async_tx_descriptor *tx;
 
 	mcspi = spi_master_get_devdata(spi->master);
-	mcspi_dma = &mcspi->dma_channels[spi->chip_select];
+	mcspi_dma = &mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 	count = xfer->len;
 
 	/*
@@ -591,7 +591,7 @@ omap2_mcspi_txrx_dma(struct spi_device *spi, struct spi_transfer *xfer)
 	int			wait_res;
 
 	mcspi = spi_master_get_devdata(spi->master);
-	mcspi_dma = &mcspi->dma_channels[spi->chip_select];
+	mcspi_dma = &mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	if (cs->word_len <= 8) {
 		width = DMA_SLAVE_BUSWIDTH_1_BYTE;
@@ -1062,8 +1062,8 @@ static int omap2_mcspi_setup(struct spi_device *spi)
 		cs = kzalloc(sizeof(*cs), GFP_KERNEL);
 		if (!cs)
 			return -ENOMEM;
-		cs->base = mcspi->base + spi->chip_select * 0x14;
-		cs->phys = mcspi->phys + spi->chip_select * 0x14;
+		cs->base = mcspi->base + spi_get_chipselect(spi, 0) * 0x14;
+		cs->phys = mcspi->phys + spi_get_chipselect(spi, 0) * 0x14;
 		cs->mode = 0;
 		cs->chconf0 = 0;
 		cs->chctrl0 = 0;
@@ -1142,7 +1142,7 @@ static int omap2_mcspi_transfer_one(struct spi_master *master,
 	u32				chconf;
 
 	mcspi = spi_master_get_devdata(master);
-	mcspi_dma = mcspi->dma_channels + spi->chip_select;
+	mcspi_dma = mcspi->dma_channels + spi_get_chipselect(spi, 0);
 	cs = spi->controller_state;
 	cd = spi->controller_data;
 
@@ -1158,7 +1158,7 @@ static int omap2_mcspi_transfer_one(struct spi_master *master,
 
 	omap2_mcspi_set_enable(spi, 0);
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		omap2_mcspi_set_cs(spi, spi->mode & SPI_CS_HIGH);
 
 	if (par_override ||
@@ -1247,7 +1247,7 @@ out:
 
 	omap2_mcspi_set_enable(spi, 0);
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		omap2_mcspi_set_cs(spi, !(spi->mode & SPI_CS_HIGH));
 
 	if (mcspi->fifo_depth > 0 && t)
@@ -1289,7 +1289,7 @@ static bool omap2_mcspi_can_dma(struct spi_master *master,
 {
 	struct omap2_mcspi *mcspi = spi_master_get_devdata(spi->master);
 	struct omap2_mcspi_dma *mcspi_dma =
-		&mcspi->dma_channels[spi->chip_select];
+		&mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	if (!mcspi_dma->dma_rx || !mcspi_dma->dma_tx)
 		return false;
@@ -1307,7 +1307,7 @@ static size_t omap2_mcspi_max_xfer_size(struct spi_device *spi)
 {
 	struct omap2_mcspi *mcspi = spi_master_get_devdata(spi->master);
 	struct omap2_mcspi_dma *mcspi_dma =
-		&mcspi->dma_channels[spi->chip_select];
+		&mcspi->dma_channels[spi_get_chipselect(spi, 0)];
 
 	if (mcspi->max_xfer_len && mcspi_dma->dma_rx)
 		return mcspi->max_xfer_len;
diff --git a/drivers/spi/spi-orion.c b/drivers/spi/spi-orion.c
index e79d1fe0bca4..bb244b596143 100644
--- a/drivers/spi/spi-orion.c
+++ b/drivers/spi/spi-orion.c
@@ -346,7 +346,7 @@ static void orion_spi_set_cs(struct spi_device *spi, bool enable)
 	 * as it is handled by a GPIO, but that doesn't matter. What we need
 	 * is to deassert the old chip select and assert some other chip select.
 	 */
-	val |= ORION_SPI_CS(spi->chip_select);
+	val |= ORION_SPI_CS(spi_get_chipselect(spi, 0));
 
 	/*
 	 * Chip select logic is inverted from spi_set_cs(). For lines using a
@@ -470,7 +470,7 @@ orion_spi_write_read(struct spi_device *spi, struct spi_transfer *xfer)
 	unsigned int count;
 	int word_len;
 	struct orion_spi *orion_spi;
-	int cs = spi->chip_select;
+	int cs = spi_get_chipselect(spi, 0);
 	void __iomem *vaddr;
 
 	word_len = spi->bits_per_word;
diff --git a/drivers/spi/spi-pci1xxxx.c b/drivers/spi/spi-pci1xxxx.c
index a31c3b612a43..1c5731641a04 100644
--- a/drivers/spi/spi-pci1xxxx.c
+++ b/drivers/spi/spi-pci1xxxx.c
@@ -116,11 +116,11 @@ static void pci1xxxx_spi_set_cs(struct spi_device *spi, bool enable)
 	regval = readl(par->reg_base + SPI_MST_CTL_REG_OFFSET(p->hw_inst));
 	if (enable) {
 		regval &= ~SPI_MST_CTL_DEVSEL_MASK;
-		regval |= (spi->chip_select << 25);
+		regval |= (spi_get_chipselect(spi, 0) << 25);
 		writel(regval,
 		       par->reg_base + SPI_MST_CTL_REG_OFFSET(p->hw_inst));
 	} else {
-		regval &= ~(spi->chip_select << 25);
+		regval &= ~(spi_get_chipselect(spi, 0) << 25);
 		writel(regval,
 		       par->reg_base + SPI_MST_CTL_REG_OFFSET(p->hw_inst));
 
diff --git a/drivers/spi/spi-pic32-sqi.c b/drivers/spi/spi-pic32-sqi.c
index 4c8493f34fca..51dfb49523f3 100644
--- a/drivers/spi/spi-pic32-sqi.c
+++ b/drivers/spi/spi-pic32-sqi.c
@@ -267,7 +267,7 @@ static int pic32_sqi_one_transfer(struct pic32_sqi *sqi,
 	u32 nbits;
 
 	/* Device selection */
-	bd_ctrl = spi->chip_select << BD_DEVSEL_SHIFT;
+	bd_ctrl = spi_get_chipselect(spi, 0) << BD_DEVSEL_SHIFT;
 
 	/* half-duplex: select transfer buffer, direction and lane */
 	if (xfer->rx_buf) {
diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c
index 5a64ad0c94fe..8a02073a354d 100644
--- a/drivers/spi/spi-pic32.c
+++ b/drivers/spi/spi-pic32.c
@@ -591,7 +591,7 @@ static int pic32_spi_setup(struct spi_device *spi)
 	 * unreliable/erroneous SPI transactions.
 	 * To avoid that we will always handle /CS by toggling GPIO.
 	 */
-	if (!spi->cs_gpiod)
+	if (!spi_get_csgpiod(spi, 0))
 		return -EINVAL;
 
 	return 0;
@@ -600,7 +600,7 @@ static int pic32_spi_setup(struct spi_device *spi)
 static void pic32_spi_cleanup(struct spi_device *spi)
 {
 	/* de-activate cs-gpio, gpiolib will handle inversion */
-	gpiod_direction_output(spi->cs_gpiod, 0);
+	gpiod_direction_output(spi_get_csgpiod(spi, 0), 0);
 }
 
 static int pic32_spi_dma_prep(struct pic32_spi *pic32s, struct device *dev)
diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c
index a17ff839117f..982407bc5d9f 100644
--- a/drivers/spi/spi-pl022.c
+++ b/drivers/spi/spi-pl022.c
@@ -1587,9 +1587,9 @@ static int pl022_transfer_one_message(struct spi_master *master,
 
 	/* Setup the SPI using the per chip configuration */
 	pl022->cur_chip = spi_get_ctldata(msg->spi);
-	pl022->cur_cs = msg->spi->chip_select;
+	pl022->cur_cs = spi_get_chipselect(msg->spi, 0);
 	/* This is always available but may be set to -ENOENT */
-	pl022->cur_gpiod = msg->spi->cs_gpiod;
+	pl022->cur_gpiod = spi_get_csgpiod(msg->spi, 0);
 
 	restore_state(pl022);
 	flush(pl022);
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index a75ba2993f3c..d04e8cb987e9 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -368,7 +368,7 @@ static void lpss_ssp_select_cs(struct spi_device *spi,
 
 	value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
 
-	cs = spi->chip_select;
+	cs = spi_get_chipselect(spi, 0);
 	cs <<= config->cs_sel_shift;
 	if (cs != (value & config->cs_sel_mask)) {
 		/*
@@ -429,7 +429,7 @@ static void cs_assert(struct spi_device *spi)
 		spi_controller_get_devdata(spi->controller);
 
 	if (drv_data->ssp_type == CE4100_SSP) {
-		pxa2xx_spi_write(drv_data, SSSR, spi->chip_select);
+		pxa2xx_spi_write(drv_data, SSSR, spi_get_chipselect(spi, 0));
 		return;
 	}
 
@@ -1217,7 +1217,7 @@ static int setup(struct spi_device *spi)
 			return -ENOMEM;
 
 		if (drv_data->ssp_type == CE4100_SSP) {
-			if (spi->chip_select > 4) {
+			if (spi_get_chipselect(spi, 0) > 4) {
 				dev_err(&spi->dev,
 					"failed setup: cs number must not be > 4.\n");
 				kfree(chip);
diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index 15c4e21cd562..aa91a6c9ab55 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -311,7 +311,7 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
 
 	mstr_cfg = readl(ctrl->base + MSTR_CONFIG);
 	mstr_cfg &= ~CHIP_SELECT_NUM;
-	if (message->spi->chip_select)
+	if (spi_get_chipselect(message->spi, 0))
 		mstr_cfg |= CHIP_SELECT_NUM;
 
 	mstr_cfg |= FB_CLK_EN | PIN_WPN | PIN_HOLDN | SBL_EN | FULL_CYCLE_MODE;
diff --git a/drivers/spi/spi-rb4xx.c b/drivers/spi/spi-rb4xx.c
index e312b30b733b..5073736d3d1f 100644
--- a/drivers/spi/spi-rb4xx.c
+++ b/drivers/spi/spi-rb4xx.c
@@ -107,7 +107,7 @@ static int rb4xx_transfer_one(struct spi_master *master,
 	 * command set was designed to almost not clash with that of the
 	 * boot flash.
 	 */
-	if (spi->chip_select == 2)
+	if (spi_get_chipselect(spi, 0) == 2)
 		/* MMC */
 		spi_ioc = AR71XX_SPI_IOC_CS0;
 	else
diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c
index 80e1ee110d31..4fabd2e0439f 100644
--- a/drivers/spi/spi-rockchip-sfc.c
+++ b/drivers/spi/spi-rockchip-sfc.c
@@ -346,7 +346,7 @@ static int rockchip_sfc_xfer_setup(struct rockchip_sfc *sfc,
 
 	/* set the Controller */
 	ctrl |= SFC_CTRL_PHASE_SEL_NEGETIVE;
-	cmd |= mem->spi->chip_select << SFC_CMD_CS_SHIFT;
+	cmd |= spi_get_chipselect(mem->spi, 0) << SFC_CMD_CS_SHIFT;
 
 	dev_dbg(sfc->dev, "sfc addr.nbytes=%x(x%d) dummy.nbytes=%x(x%d)\n",
 		op->addr.nbytes, op->addr.buswidth,
diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c
index a66fff0ee20e..143ede958ac1 100644
--- a/drivers/spi/spi-rockchip.c
+++ b/drivers/spi/spi-rockchip.c
@@ -246,28 +246,30 @@ static void rockchip_spi_set_cs(struct spi_device *spi, bool enable)
 	bool cs_asserted = spi->mode & SPI_CS_HIGH ? enable : !enable;
 
 	/* Return immediately for no-op */
-	if (cs_asserted == rs->cs_asserted[spi->chip_select])
+	if (cs_asserted == rs->cs_asserted[spi_get_chipselect(spi, 0)])
 		return;
 
 	if (cs_asserted) {
 		/* Keep things powered as long as CS is asserted */
 		pm_runtime_get_sync(rs->dev);
 
-		if (spi->cs_gpiod)
+		if (spi_get_csgpiod(spi, 0))
 			ROCKCHIP_SPI_SET_BITS(rs->regs + ROCKCHIP_SPI_SER, 1);
 		else
-			ROCKCHIP_SPI_SET_BITS(rs->regs + ROCKCHIP_SPI_SER, BIT(spi->chip_select));
+			ROCKCHIP_SPI_SET_BITS(rs->regs + ROCKCHIP_SPI_SER,
+					      BIT(spi_get_chipselect(spi, 0)));
 	} else {
-		if (spi->cs_gpiod)
+		if (spi_get_csgpiod(spi, 0))
 			ROCKCHIP_SPI_CLR_BITS(rs->regs + ROCKCHIP_SPI_SER, 1);
 		else
-			ROCKCHIP_SPI_CLR_BITS(rs->regs + ROCKCHIP_SPI_SER, BIT(spi->chip_select));
+			ROCKCHIP_SPI_CLR_BITS(rs->regs + ROCKCHIP_SPI_SER,
+					      BIT(spi_get_chipselect(spi, 0)));
 
 		/* Drop reference from when we first asserted CS */
 		pm_runtime_put(rs->dev);
 	}
 
-	rs->cs_asserted[spi->chip_select] = cs_asserted;
+	rs->cs_asserted[spi_get_chipselect(spi, 0)] = cs_asserted;
 }
 
 static void rockchip_spi_handle_err(struct spi_controller *ctlr,
@@ -541,7 +543,7 @@ static int rockchip_spi_config(struct rockchip_spi *rs,
 	if (spi->mode & SPI_LSB_FIRST)
 		cr0 |= CR0_FBM_LSB << CR0_FBM_OFFSET;
 	if (spi->mode & SPI_CS_HIGH)
-		cr0 |= BIT(spi->chip_select) << CR0_SOI_OFFSET;
+		cr0 |= BIT(spi_get_chipselect(spi, 0)) << CR0_SOI_OFFSET;
 
 	if (xfer->rx_buf && xfer->tx_buf)
 		cr0 |= CR0_XFM_TR << CR0_XFM_OFFSET;
@@ -724,7 +726,7 @@ static int rockchip_spi_setup(struct spi_device *spi)
 	struct rockchip_spi *rs = spi_controller_get_devdata(spi->controller);
 	u32 cr0;
 
-	if (!spi->cs_gpiod && (spi->mode & SPI_CS_HIGH) && !rs->cs_high_supported) {
+	if (!spi_get_csgpiod(spi, 0) && (spi->mode & SPI_CS_HIGH) && !rs->cs_high_supported) {
 		dev_warn(&spi->dev, "setup: non GPIO CS can't be active-high\n");
 		return -EINVAL;
 	}
@@ -735,10 +737,10 @@ static int rockchip_spi_setup(struct spi_device *spi)
 
 	cr0 &= ~(0x3 << CR0_SCPH_OFFSET);
 	cr0 |= ((spi->mode & 0x3) << CR0_SCPH_OFFSET);
-	if (spi->mode & SPI_CS_HIGH && spi->chip_select <= 1)
-		cr0 |= BIT(spi->chip_select) << CR0_SOI_OFFSET;
-	else if (spi->chip_select <= 1)
-		cr0 &= ~(BIT(spi->chip_select) << CR0_SOI_OFFSET);
+	if (spi->mode & SPI_CS_HIGH && spi_get_chipselect(spi, 0) <= 1)
+		cr0 |= BIT(spi_get_chipselect(spi, 0)) << CR0_SOI_OFFSET;
+	else if (spi_get_chipselect(spi, 0) <= 1)
+		cr0 &= ~(BIT(spi_get_chipselect(spi, 0)) << CR0_SOI_OFFSET);
 
 	writel_relaxed(cr0, rs->regs + ROCKCHIP_SPI_CTRLR0);
 
diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c
index f494c86bafea..f50a39efaf96 100644
--- a/drivers/spi/spi-rspi.c
+++ b/drivers/spi/spi-rspi.c
@@ -950,7 +950,7 @@ static int rspi_setup(struct spi_device *spi)
 	struct rspi_data *rspi = spi_controller_get_devdata(spi->controller);
 	u8 sslp;
 
-	if (spi->cs_gpiod)
+	if (spi_get_csgpiod(spi, 0))
 		return 0;
 
 	pm_runtime_get_sync(&rspi->pdev->dev);
@@ -958,9 +958,9 @@ static int rspi_setup(struct spi_device *spi)
 
 	sslp = rspi_read8(rspi, RSPI_SSLP);
 	if (spi->mode & SPI_CS_HIGH)
-		sslp |= SSLP_SSLP(spi->chip_select);
+		sslp |= SSLP_SSLP(spi_get_chipselect(spi, 0));
 	else
-		sslp &= ~SSLP_SSLP(spi->chip_select);
+		sslp &= ~SSLP_SSLP(spi_get_chipselect(spi, 0));
 	rspi_write8(rspi, sslp, RSPI_SSLP);
 
 	spin_unlock_irq(&rspi->lock);
@@ -1001,8 +1001,8 @@ static int rspi_prepare_message(struct spi_controller *ctlr,
 		rspi->spcmd |= SPCMD_LSBF;
 
 	/* Configure slave signal to assert */
-	rspi->spcmd |= SPCMD_SSLA(spi->cs_gpiod ? rspi->ctlr->unused_native_cs
-						: spi->chip_select);
+	rspi->spcmd |= SPCMD_SSLA(spi_get_csgpiod(spi, 0) ? rspi->ctlr->unused_native_cs
+						: spi_get_chipselect(spi, 0));
 
 	/* CMOS output mode and MOSI signal from previous transfer */
 	rspi->sppcr = 0;
diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c
index cc69f8ffdbdc..7ac17f0d18a9 100644
--- a/drivers/spi/spi-s3c64xx.c
+++ b/drivers/spi/spi-s3c64xx.c
@@ -891,7 +891,7 @@ static int s3c64xx_spi_setup(struct spi_device *spi)
 
 	/* NULL is fine, we just avoid using the FB delay (=0) */
 	if (IS_ERR(cs)) {
-		dev_err(&spi->dev, "No CS for SPI(%d)\n", spi->chip_select);
+		dev_err(&spi->dev, "No CS for SPI(%d)\n", spi_get_chipselect(spi, 0));
 		return -ENODEV;
 	}
 
diff --git a/drivers/spi/spi-sc18is602.c b/drivers/spi/spi-sc18is602.c
index 983b3621bc2a..a12adc68731b 100644
--- a/drivers/spi/spi-sc18is602.c
+++ b/drivers/spi/spi-sc18is602.c
@@ -70,7 +70,7 @@ static int sc18is602_txrx(struct sc18is602 *hw, struct spi_message *msg,
 
 	if (hw->tlen == 0) {
 		/* First byte (I2C command) is chip select */
-		hw->buffer[0] = 1 << msg->spi->chip_select;
+		hw->buffer[0] = 1 << spi_get_chipselect(msg->spi, 0);
 		hw->tlen = 1;
 		hw->rindex = 0;
 	}
@@ -229,7 +229,7 @@ static int sc18is602_setup(struct spi_device *spi)
 	struct sc18is602 *hw = spi_master_get_devdata(spi->master);
 
 	/* SC18IS602 does not support CS2 */
-	if (hw->id == sc18is602 && spi->chip_select == 2)
+	if (hw->id == sc18is602 && (spi_get_chipselect(spi, 0) == 2))
 		return -ENXIO;
 
 	return 0;
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index d828a3b370b8..50498c4eb661 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -554,7 +554,7 @@ static int sh_msiof_spi_setup(struct spi_device *spi)
 		spi_controller_get_devdata(spi->controller);
 	u32 clr, set, tmp;
 
-	if (spi->cs_gpiod || spi_controller_is_slave(p->ctlr))
+	if (spi_get_csgpiod(spi, 0) || spi_controller_is_slave(p->ctlr))
 		return 0;
 
 	if (p->native_cs_inited &&
@@ -587,11 +587,11 @@ static int sh_msiof_prepare_message(struct spi_controller *ctlr,
 	u32 ss, cs_high;
 
 	/* Configure pins before asserting CS */
-	if (spi->cs_gpiod) {
+	if (spi_get_csgpiod((struct spi_device *)spi, 0)) {
 		ss = ctlr->unused_native_cs;
 		cs_high = p->native_cs_high;
 	} else {
-		ss = spi->chip_select;
+		ss = spi_get_chipselect((struct spi_device *)spi, 0);
 		cs_high = !!(spi->mode & SPI_CS_HIGH);
 	}
 	sh_msiof_spi_set_pin_regs(p, ss, !!(spi->mode & SPI_CPOL),
diff --git a/drivers/spi/spi-sh-sci.c b/drivers/spi/spi-sh-sci.c
index 0fdfec2de47a..92ca3f2d61ba 100644
--- a/drivers/spi/spi-sh-sci.c
+++ b/drivers/spi/spi-sh-sci.c
@@ -108,7 +108,7 @@ static void sh_sci_spi_chipselect(struct spi_device *dev, int value)
 	struct sh_sci_spi *sp = spi_master_get_devdata(dev->master);
 
 	if (sp->info->chip_select)
-		(sp->info->chip_select)(sp->info, dev->chip_select, value);
+		(sp->info->chip_select)(sp->info, spi_get_chipselect(dev, 0), value);
 }
 
 static int sh_sci_spi_probe(struct platform_device *dev)
diff --git a/drivers/spi/spi-sifive.c b/drivers/spi/spi-sifive.c
index 055de44e0d22..dae9e097c333 100644
--- a/drivers/spi/spi-sifive.c
+++ b/drivers/spi/spi-sifive.c
@@ -135,13 +135,13 @@ sifive_spi_prepare_message(struct spi_master *master, struct spi_message *msg)
 
 	/* Update the chip select polarity */
 	if (device->mode & SPI_CS_HIGH)
-		spi->cs_inactive &= ~BIT(device->chip_select);
+		spi->cs_inactive &= ~BIT(spi_get_chipselect(device, 0));
 	else
-		spi->cs_inactive |= BIT(device->chip_select);
+		spi->cs_inactive |= BIT(spi_get_chipselect(device, 0));
 	sifive_spi_write(spi, SIFIVE_SPI_REG_CSDEF, spi->cs_inactive);
 
 	/* Select the correct device */
-	sifive_spi_write(spi, SIFIVE_SPI_REG_CSID, device->chip_select);
+	sifive_spi_write(spi, SIFIVE_SPI_REG_CSID, spi_get_chipselect(device, 0));
 
 	/* Set clock mode */
 	sifive_spi_write(spi, SIFIVE_SPI_REG_SCKMODE,
diff --git a/drivers/spi/spi-sn-f-ospi.c b/drivers/spi/spi-sn-f-ospi.c
index 644ae34f623b..d64100f88181 100644
--- a/drivers/spi/spi-sn-f-ospi.c
+++ b/drivers/spi/spi-sn-f-ospi.c
@@ -267,7 +267,7 @@ static void f_ospi_config_indir_protocol(struct f_ospi *ospi,
 	int unit;
 
 	/* Set one chip select */
-	writel(BIT(spi->chip_select), ospi->base + OSPI_SSEL);
+	writel(BIT(spi_get_chipselect(spi, 0)), ospi->base + OSPI_SSEL);
 
 	mode = f_ospi_get_mode(ospi, op->cmd.buswidth, 1);
 	prot |= FIELD_PREP(OSPI_PROT_MODE_CODE_MASK, mode);
diff --git a/drivers/spi/spi-st-ssc4.c b/drivers/spi/spi-st-ssc4.c
index 9141f19c7f8e..7fcff9c539e2 100644
--- a/drivers/spi/spi-st-ssc4.c
+++ b/drivers/spi/spi-st-ssc4.c
@@ -183,7 +183,7 @@ static int spi_st_setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	if (!spi->cs_gpiod) {
+	if (!spi_get_csgpiod(spi, 0)) {
 		dev_err(&spi->dev, "no valid gpio assigned\n");
 		return -EINVAL;
 	}
diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c
index 29125af0afdb..2b6804aa6901 100644
--- a/drivers/spi/spi-stm32-qspi.c
+++ b/drivers/spi/spi-stm32-qspi.c
@@ -359,7 +359,7 @@ static int stm32_qspi_get_mode(u8 buswidth)
 static int stm32_qspi_send(struct spi_device *spi, const struct spi_mem_op *op)
 {
 	struct stm32_qspi *qspi = spi_controller_get_devdata(spi->master);
-	struct stm32_qspi_flash *flash = &qspi->flash[spi->chip_select];
+	struct stm32_qspi_flash *flash = &qspi->flash[spi_get_chipselect(spi, 0)];
 	u32 ccr, cr;
 	int timeout, err = 0, err_poll_status = 0;
 
@@ -564,7 +564,7 @@ static int stm32_qspi_transfer_one_message(struct spi_controller *ctrl,
 	struct spi_mem_op op;
 	int ret = 0;
 
-	if (!spi->cs_gpiod)
+	if (!spi_get_csgpiod(spi, 0))
 		return -EOPNOTSUPP;
 
 	ret = pm_runtime_resume_and_get(qspi->dev);
@@ -573,7 +573,7 @@ static int stm32_qspi_transfer_one_message(struct spi_controller *ctrl,
 
 	mutex_lock(&qspi->lock);
 
-	gpiod_set_value_cansleep(spi->cs_gpiod, true);
+	gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), true);
 
 	list_for_each_entry(transfer, &msg->transfers, transfer_list) {
 		u8 dummy_bytes = 0;
@@ -626,7 +626,7 @@ static int stm32_qspi_transfer_one_message(struct spi_controller *ctrl,
 	}
 
 end_of_transfer:
-	gpiod_set_value_cansleep(spi->cs_gpiod, false);
+	gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), false);
 
 	mutex_unlock(&qspi->lock);
 
@@ -669,8 +669,8 @@ static int stm32_qspi_setup(struct spi_device *spi)
 
 	presc = DIV_ROUND_UP(qspi->clk_rate, spi->max_speed_hz) - 1;
 
-	flash = &qspi->flash[spi->chip_select];
-	flash->cs = spi->chip_select;
+	flash = &qspi->flash[spi_get_chipselect(spi, 0)];
+	flash->cs = spi_get_chipselect(spi, 0);
 	flash->presc = presc;
 
 	mutex_lock(&qspi->lock);
diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c
index 994d0fb50e68..b8947265d329 100644
--- a/drivers/spi/spi-sun4i.c
+++ b/drivers/spi/spi-sun4i.c
@@ -167,7 +167,7 @@ static void sun4i_spi_set_cs(struct spi_device *spi, bool enable)
 	reg = sun4i_spi_read(sspi, SUN4I_CTL_REG);
 
 	reg &= ~SUN4I_CTL_CS_MASK;
-	reg |= SUN4I_CTL_CS(spi->chip_select);
+	reg |= SUN4I_CTL_CS(spi_get_chipselect(spi, 0));
 
 	/* We want to control the chip select manually */
 	reg |= SUN4I_CTL_CS_MANUAL;
diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c
index 43c29afea6bb..7532c85a352c 100644
--- a/drivers/spi/spi-sun6i.c
+++ b/drivers/spi/spi-sun6i.c
@@ -174,7 +174,7 @@ static void sun6i_spi_set_cs(struct spi_device *spi, bool enable)
 
 	reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG);
 	reg &= ~SUN6I_TFR_CTL_CS_MASK;
-	reg |= SUN6I_TFR_CTL_CS(spi->chip_select);
+	reg |= SUN6I_TFR_CTL_CS(spi_get_chipselect(spi, 0));
 
 	if (enable)
 		reg |= SUN6I_TFR_CTL_CS_LEVEL;
diff --git a/drivers/spi/spi-synquacer.c b/drivers/spi/spi-synquacer.c
index 3a92f722a6c5..aeaf7db022f0 100644
--- a/drivers/spi/spi-synquacer.c
+++ b/drivers/spi/spi-synquacer.c
@@ -250,7 +250,7 @@ static int synquacer_spi_config(struct spi_master *master,
 	}
 
 	mode = spi->mode;
-	cs = spi->chip_select;
+	cs = spi_get_chipselect(spi, 0);
 	speed = xfer->speed_hz;
 	bpw = xfer->bits_per_word;
 
@@ -344,7 +344,7 @@ static int synquacer_spi_config(struct spi_master *master,
 	sspi->bpw = bpw;
 	sspi->mode = mode;
 	sspi->speed = speed;
-	sspi->cs = spi->chip_select;
+	sspi->cs = spi_get_chipselect(spi, 0);
 	sspi->bus_width = bus_width;
 
 	return 0;
@@ -488,7 +488,7 @@ static void synquacer_spi_set_cs(struct spi_device *spi, bool enable)
 	val = readl(sspi->regs + SYNQUACER_HSSPI_REG_DMSTART);
 	val &= ~(SYNQUACER_HSSPI_DMPSEL_CS_MASK <<
 		 SYNQUACER_HSSPI_DMPSEL_CS_SHIFT);
-	val |= spi->chip_select << SYNQUACER_HSSPI_DMPSEL_CS_SHIFT;
+	val |= spi_get_chipselect(spi, 0) << SYNQUACER_HSSPI_DMPSEL_CS_SHIFT;
 
 	if (!enable)
 		val |= SYNQUACER_HSSPI_DMSTOP_STOP;
diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c
index b6bee922a92c..9e7d762d0ee6 100644
--- a/drivers/spi/spi-tegra114.c
+++ b/drivers/spi/spi-tegra114.c
@@ -747,7 +747,7 @@ static int tegra_spi_set_hw_cs_timing(struct spi_device *spi)
 	if (setup_dly && hold_dly) {
 		setup_hold = SPI_SETUP_HOLD(setup_dly - 1, hold_dly - 1);
 		spi_cs_timing = SPI_CS_SETUP_HOLD(tspi->spi_cs_timing1,
-						  spi->chip_select,
+						  spi_get_chipselect(spi, 0),
 						  setup_hold);
 		if (tspi->spi_cs_timing1 != spi_cs_timing) {
 			tspi->spi_cs_timing1 = spi_cs_timing;
@@ -760,9 +760,9 @@ static int tegra_spi_set_hw_cs_timing(struct spi_device *spi)
 		inactive_cycles--;
 	cs_state = inactive_cycles ? 0 : 1;
 	spi_cs_timing = tspi->spi_cs_timing2;
-	SPI_SET_CS_ACTIVE_BETWEEN_PACKETS(spi_cs_timing, spi->chip_select,
+	SPI_SET_CS_ACTIVE_BETWEEN_PACKETS(spi_cs_timing, spi_get_chipselect(spi, 0),
 					  cs_state);
-	SPI_SET_CYCLES_BETWEEN_PACKETS(spi_cs_timing, spi->chip_select,
+	SPI_SET_CYCLES_BETWEEN_PACKETS(spi_cs_timing, spi_get_chipselect(spi, 0),
 				       inactive_cycles);
 	if (tspi->spi_cs_timing2 != spi_cs_timing) {
 		tspi->spi_cs_timing2 = spi_cs_timing;
@@ -831,8 +831,8 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi,
 			tegra_spi_writel(tspi, command1, SPI_COMMAND1);
 
 		/* GPIO based chip select control */
-		if (spi->cs_gpiod)
-			gpiod_set_value(spi->cs_gpiod, 1);
+		if (spi_get_csgpiod(spi, 0))
+			gpiod_set_value(spi_get_csgpiod(spi, 0), 1);
 
 		if (is_single_xfer && !(t->cs_change)) {
 			tspi->use_hw_based_cs = true;
@@ -846,7 +846,7 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi,
 				command1 &= ~SPI_CS_SW_VAL;
 		}
 
-		if (tspi->last_used_cs != spi->chip_select) {
+		if (tspi->last_used_cs != spi_get_chipselect(spi, 0)) {
 			if (cdata && cdata->tx_clk_tap_delay)
 				tx_tap = cdata->tx_clk_tap_delay;
 			if (cdata && cdata->rx_clk_tap_delay)
@@ -855,7 +855,7 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi,
 				   SPI_RX_TAP_DELAY(rx_tap);
 			if (command2 != tspi->def_command2_reg)
 				tegra_spi_writel(tspi, command2, SPI_COMMAND2);
-			tspi->last_used_cs = spi->chip_select;
+			tspi->last_used_cs = spi_get_chipselect(spi, 0);
 		}
 
 	} else {
@@ -896,7 +896,7 @@ static int tegra_spi_start_transfer_one(struct spi_device *spi,
 		command1 |= SPI_TX_EN;
 		tspi->cur_direction |= DATA_DIR_TX;
 	}
-	command1 |= SPI_CS_SEL(spi->chip_select);
+	command1 |= SPI_CS_SEL(spi_get_chipselect(spi, 0));
 	tegra_spi_writel(tspi, command1, SPI_COMMAND1);
 	tspi->command1_reg = command1;
 
@@ -980,14 +980,14 @@ static int tegra_spi_setup(struct spi_device *spi)
 
 	spin_lock_irqsave(&tspi->lock, flags);
 	/* GPIO based chip select control */
-	if (spi->cs_gpiod)
-		gpiod_set_value(spi->cs_gpiod, 0);
+	if (spi_get_csgpiod(spi, 0))
+		gpiod_set_value(spi_get_csgpiod(spi, 0), 0);
 
 	val = tspi->def_command1_reg;
 	if (spi->mode & SPI_CS_HIGH)
-		val &= ~SPI_CS_POL_INACTIVE(spi->chip_select);
+		val &= ~SPI_CS_POL_INACTIVE(spi_get_chipselect(spi, 0));
 	else
-		val |= SPI_CS_POL_INACTIVE(spi->chip_select);
+		val |= SPI_CS_POL_INACTIVE(spi_get_chipselect(spi, 0));
 	tspi->def_command1_reg = val;
 	tegra_spi_writel(tspi, tspi->def_command1_reg, SPI_COMMAND1);
 	spin_unlock_irqrestore(&tspi->lock, flags);
@@ -1002,8 +1002,8 @@ static void tegra_spi_transfer_end(struct spi_device *spi)
 	int cs_val = (spi->mode & SPI_CS_HIGH) ? 0 : 1;
 
 	/* GPIO based chip select control */
-	if (spi->cs_gpiod)
-		gpiod_set_value(spi->cs_gpiod, 0);
+	if (spi_get_csgpiod(spi, 0))
+		gpiod_set_value(spi_get_csgpiod(spi, 0), 0);
 
 	if (!tspi->use_hw_based_cs) {
 		if (cs_val)
diff --git a/drivers/spi/spi-tegra20-sflash.c b/drivers/spi/spi-tegra20-sflash.c
index ed82530ea64b..4286310628a2 100644
--- a/drivers/spi/spi-tegra20-sflash.c
+++ b/drivers/spi/spi-tegra20-sflash.c
@@ -280,7 +280,7 @@ static int tegra_sflash_start_transfer_one(struct spi_device *spi,
 			command |= SPI_ACTIVE_SCLK_DRIVE_HIGH;
 		else
 			command |= SPI_ACTIVE_SCLK_DRIVE_LOW;
-		command |= SPI_CS0_EN << spi->chip_select;
+		command |= SPI_CS0_EN << spi_get_chipselect(spi, 0);
 	} else {
 		command = tsd->command_reg;
 		command &= ~SPI_BIT_LENGTH(~0);
diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c
index ac7933bc03e2..c2915f7672cc 100644
--- a/drivers/spi/spi-tegra20-slink.c
+++ b/drivers/spi/spi-tegra20-slink.c
@@ -758,9 +758,9 @@ static int tegra_slink_setup(struct spi_device *spi)
 	spin_lock_irqsave(&tspi->lock, flags);
 	val = tspi->def_command_reg;
 	if (spi->mode & SPI_CS_HIGH)
-		val |= cs_pol_bit[spi->chip_select];
+		val |= cs_pol_bit[spi_get_chipselect(spi, 0)];
 	else
-		val &= ~cs_pol_bit[spi->chip_select];
+		val &= ~cs_pol_bit[spi_get_chipselect(spi, 0)];
 	tspi->def_command_reg = val;
 	tegra_slink_writel(tspi, tspi->def_command_reg, SLINK_COMMAND);
 	spin_unlock_irqrestore(&tspi->lock, flags);
@@ -781,7 +781,7 @@ static int tegra_slink_prepare_message(struct spi_master *master,
 	tspi->command_reg |= SLINK_CS_SW | SLINK_CS_VALUE;
 
 	tspi->command2_reg = tspi->def_command2_reg;
-	tspi->command2_reg |= SLINK_SS_EN_CS(spi->chip_select);
+	tspi->command2_reg |= SLINK_SS_EN_CS(spi_get_chipselect(spi, 0));
 
 	tspi->command_reg &= ~SLINK_MODES;
 	if (spi->mode & SPI_CPHA)
diff --git a/drivers/spi/spi-tegra210-quad.c b/drivers/spi/spi-tegra210-quad.c
index fd0d532364e2..325b4427491c 100644
--- a/drivers/spi/spi-tegra210-quad.c
+++ b/drivers/spi/spi-tegra210-quad.c
@@ -829,7 +829,7 @@ static u32 tegra_qspi_setup_transfer_one(struct spi_device *spi, struct spi_tran
 		tegra_qspi_mask_clear_irq(tqspi);
 
 		command1 = tqspi->def_command1_reg;
-		command1 |= QSPI_CS_SEL(spi->chip_select);
+		command1 |= QSPI_CS_SEL(spi_get_chipselect(spi, 0));
 		command1 |= QSPI_BIT_LENGTH(bits_per_word - 1);
 
 		command1 &= ~QSPI_CONTROL_MODE_MASK;
@@ -960,11 +960,11 @@ static int tegra_qspi_setup(struct spi_device *spi)
 
 	/* keep default cs state to inactive */
 	val = tqspi->def_command1_reg;
-	val |= QSPI_CS_SEL(spi->chip_select);
+	val |= QSPI_CS_SEL(spi_get_chipselect(spi, 0));
 	if (spi->mode & SPI_CS_HIGH)
-		val &= ~QSPI_CS_POL_INACTIVE(spi->chip_select);
+		val &= ~QSPI_CS_POL_INACTIVE(spi_get_chipselect(spi, 0));
 	else
-		val |= QSPI_CS_POL_INACTIVE(spi->chip_select);
+		val |= QSPI_CS_POL_INACTIVE(spi_get_chipselect(spi, 0));
 
 	tqspi->def_command1_reg = val;
 	tegra_qspi_writel(tqspi, tqspi->def_command1_reg, QSPI_COMMAND1);
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c
index 60086869bcae..5914335ff63d 100644
--- a/drivers/spi/spi-ti-qspi.c
+++ b/drivers/spi/spi-ti-qspi.c
@@ -533,10 +533,10 @@ static void ti_qspi_enable_memory_map(struct spi_device *spi)
 	if (qspi->ctrl_base) {
 		regmap_update_bits(qspi->ctrl_base, qspi->ctrl_reg,
 				   MEM_CS_MASK,
-				   MEM_CS_EN(spi->chip_select));
+				   MEM_CS_EN(spi_get_chipselect(spi, 0)));
 	}
 	qspi->mmap_enabled = true;
-	qspi->current_cs = spi->chip_select;
+	qspi->current_cs = spi_get_chipselect(spi, 0);
 }
 
 static void ti_qspi_disable_memory_map(struct spi_device *spi)
@@ -572,7 +572,7 @@ static void ti_qspi_setup_mmap_read(struct spi_device *spi, u8 opcode,
 	memval |= ((addr_width - 1) << QSPI_SETUP_ADDR_SHIFT |
 		   dummy_bytes << QSPI_SETUP_DUMMY_SHIFT);
 	ti_qspi_write(qspi, memval,
-		      QSPI_SPI_SETUP_REG(spi->chip_select));
+		      QSPI_SPI_SETUP_REG(spi_get_chipselect(spi, 0)));
 }
 
 static int ti_qspi_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op)
@@ -623,7 +623,7 @@ static int ti_qspi_exec_mem_op(struct spi_mem *mem,
 
 	mutex_lock(&qspi->list_lock);
 
-	if (!qspi->mmap_enabled || qspi->current_cs != mem->spi->chip_select) {
+	if (!qspi->mmap_enabled || qspi->current_cs != spi_get_chipselect(mem->spi, 0)) {
 		ti_qspi_setup_clk(qspi, mem->spi->max_speed_hz);
 		ti_qspi_enable_memory_map(mem->spi);
 	}
@@ -673,11 +673,11 @@ static int ti_qspi_start_transfer_one(struct spi_master *master,
 	qspi->dc = 0;
 
 	if (spi->mode & SPI_CPHA)
-		qspi->dc |= QSPI_CKPHA(spi->chip_select);
+		qspi->dc |= QSPI_CKPHA(spi_get_chipselect(spi, 0));
 	if (spi->mode & SPI_CPOL)
-		qspi->dc |= QSPI_CKPOL(spi->chip_select);
+		qspi->dc |= QSPI_CKPOL(spi_get_chipselect(spi, 0));
 	if (spi->mode & SPI_CS_HIGH)
-		qspi->dc |= QSPI_CSPOL(spi->chip_select);
+		qspi->dc |= QSPI_CSPOL(spi_get_chipselect(spi, 0));
 
 	frame_len_words = 0;
 	list_for_each_entry(t, &m->transfers, transfer_list)
@@ -686,7 +686,7 @@ static int ti_qspi_start_transfer_one(struct spi_master *master,
 
 	/* setup command reg */
 	qspi->cmd = 0;
-	qspi->cmd |= QSPI_EN_CS(spi->chip_select);
+	qspi->cmd |= QSPI_EN_CS(spi_get_chipselect(spi, 0));
 	qspi->cmd |= QSPI_FLEN(frame_len_words);
 
 	ti_qspi_write(qspi, qspi->dc, QSPI_SPI_DC_REG);
diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c
index 1679a0ba3d62..af5846cfe5e9 100644
--- a/drivers/spi/spi-topcliff-pch.c
+++ b/drivers/spi/spi-topcliff-pch.c
@@ -499,7 +499,7 @@ static inline void pch_spi_select_chip(struct pch_spi_data *data,
 				       struct spi_device *pspi)
 {
 	if (data->current_chip != NULL) {
-		if (pspi->chip_select != data->n_curnt_chip) {
+		if (spi_get_chipselect(pspi, 0) != data->n_curnt_chip) {
 			dev_dbg(&pspi->dev, "%s : different slave\n", __func__);
 			data->current_chip = NULL;
 		}
@@ -507,7 +507,7 @@ static inline void pch_spi_select_chip(struct pch_spi_data *data,
 
 	data->current_chip = pspi;
 
-	data->n_curnt_chip = data->current_chip->chip_select;
+	data->n_curnt_chip = spi_get_chipselect(data->current_chip, 0);
 
 	dev_dbg(&pspi->dev, "%s :Invoking pch_spi_setup_transfer\n", __func__);
 	pch_spi_setup_transfer(pspi);
diff --git a/drivers/spi/spi-wpcm-fiu.c b/drivers/spi/spi-wpcm-fiu.c
index ab33710d50ac..f15312fdcdaf 100644
--- a/drivers/spi/spi-wpcm-fiu.c
+++ b/drivers/spi/spi-wpcm-fiu.c
@@ -158,7 +158,7 @@ static int wpcm_fiu_normal_exec(struct spi_mem *mem, const struct spi_mem_op *op
 	if (op->data.dir == SPI_MEM_DATA_OUT)
 		wpcm_fiu_set_data(fiu, op->data.buf.out, op->data.nbytes);
 
-	ret = wpcm_fiu_do_uma(fiu, mem->spi->chip_select, op->addr.nbytes == 3,
+	ret = wpcm_fiu_do_uma(fiu, spi_get_chipselect(mem->spi, 0), op->addr.nbytes == 3,
 			      op->data.dir == SPI_MEM_DATA_OUT, op->data.nbytes);
 
 	if (op->data.dir == SPI_MEM_DATA_IN)
@@ -196,7 +196,7 @@ static bool wpcm_fiu_4ba_match(const struct spi_mem_op *op)
 static int wpcm_fiu_4ba_exec(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct wpcm_fiu_spi *fiu = spi_controller_get_devdata(mem->spi->controller);
-	int cs = mem->spi->chip_select;
+	int cs = spi_get_chipselect(mem->spi, 0);
 
 	wpcm_fiu_ects_assert(fiu, cs);
 
@@ -241,7 +241,7 @@ static bool wpcm_fiu_rdid_match(const struct spi_mem_op *op)
 static int wpcm_fiu_rdid_exec(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct wpcm_fiu_spi *fiu = spi_controller_get_devdata(mem->spi->controller);
-	int cs = mem->spi->chip_select;
+	int cs = spi_get_chipselect(mem->spi, 0);
 
 	/* First transfer */
 	wpcm_fiu_set_opcode(fiu, op->cmd.opcode);
@@ -278,7 +278,7 @@ static bool wpcm_fiu_dummy_match(const struct spi_mem_op *op)
 static int wpcm_fiu_dummy_exec(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct wpcm_fiu_spi *fiu = spi_controller_get_devdata(mem->spi->controller);
-	int cs = mem->spi->chip_select;
+	int cs = spi_get_chipselect(mem->spi, 0);
 
 	wpcm_fiu_ects_assert(fiu, cs);
 
@@ -376,7 +376,7 @@ static int wpcm_fiu_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op)
 static int wpcm_fiu_dirmap_create(struct spi_mem_dirmap_desc *desc)
 {
 	struct wpcm_fiu_spi *fiu = spi_controller_get_devdata(desc->mem->spi->controller);
-	int cs = desc->mem->spi->chip_select;
+	int cs = spi_get_chipselect(desc->mem->spi, 0);
 
 	if (desc->info.op_tmpl.data.dir != SPI_MEM_DATA_IN)
 		return -ENOTSUPP;
@@ -400,7 +400,7 @@ static int wpcm_fiu_dirmap_create(struct spi_mem_dirmap_desc *desc)
 static ssize_t wpcm_fiu_direct_read(struct spi_mem_dirmap_desc *desc, u64 offs, size_t len, void *buf)
 {
 	struct wpcm_fiu_spi *fiu = spi_controller_get_devdata(desc->mem->spi->controller);
-	int cs = desc->mem->spi->chip_select;
+	int cs = spi_get_chipselect(desc->mem->spi, 0);
 
 	if (offs >= MAX_MEMORY_SIZE_PER_CS)
 		return -ENOTSUPP;
diff --git a/drivers/spi/spi-xcomm.c b/drivers/spi/spi-xcomm.c
index 8628241ec99e..5d23411f2a3e 100644
--- a/drivers/spi/spi-xcomm.c
+++ b/drivers/spi/spi-xcomm.c
@@ -58,7 +58,7 @@ static int spi_xcomm_sync_config(struct spi_xcomm *spi_xcomm, unsigned int len)
 static void spi_xcomm_chipselect(struct spi_xcomm *spi_xcomm,
 	struct spi_device *spi, int is_active)
 {
-	unsigned long cs = spi->chip_select;
+	unsigned long cs = spi_get_chipselect(spi, 0);
 	uint16_t chipselect = spi_xcomm->chipselect;
 
 	if (is_active)
diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index d2f9eea5e093..5dd3d1901412 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -213,7 +213,7 @@ static void xilinx_spi_chipselect(struct spi_device *spi, int is_on)
 	 */
 
 	cs = xspi->cs_inactive;
-	cs ^= BIT(spi->chip_select);
+	cs ^= BIT(spi_get_chipselect(spi, 0));
 
 	/* Activate the chip select */
 	xspi->write_fn(cs, xspi->regs + XSPI_SSR_OFFSET);
@@ -228,9 +228,9 @@ static int xilinx_spi_setup_transfer(struct spi_device *spi,
 	struct xilinx_spi *xspi = spi_master_get_devdata(spi->master);
 
 	if (spi->mode & SPI_CS_HIGH)
-		xspi->cs_inactive &= ~BIT(spi->chip_select);
+		xspi->cs_inactive &= ~BIT(spi_get_chipselect(spi, 0));
 	else
-		xspi->cs_inactive |= BIT(spi->chip_select);
+		xspi->cs_inactive |= BIT(spi_get_chipselect(spi, 0));
 
 	return 0;
 }
diff --git a/drivers/spi/spi-xlp.c b/drivers/spi/spi-xlp.c
index e5707fe5c8f1..3b91cdd5ae21 100644
--- a/drivers/spi/spi-xlp.c
+++ b/drivers/spi/spi-xlp.c
@@ -139,7 +139,7 @@ static int xlp_spi_setup(struct spi_device *spi)
 	int cs;
 
 	xspi = spi_master_get_devdata(spi->master);
-	cs = spi->chip_select;
+	cs = spi_get_chipselect(spi, 0);
 	/*
 	 * The value of fdiv must be between 4 and 65535.
 	 */
@@ -350,7 +350,7 @@ static int xlp_spi_transfer_one(struct spi_master *master,
 	struct xlp_spi_priv *xspi = spi_master_get_devdata(master);
 	int ret = 0;
 
-	xspi->cs = spi->chip_select;
+	xspi->cs = spi_get_chipselect(spi, 0);
 	xspi->dev = spi->dev;
 
 	if (spi_transfer_is_last(master, t))
diff --git a/drivers/spi/spi-zynq-qspi.c b/drivers/spi/spi-zynq-qspi.c
index 8558c0fe3775..ee1995b91287 100644
--- a/drivers/spi/spi-zynq-qspi.c
+++ b/drivers/spi/spi-zynq-qspi.c
@@ -296,7 +296,7 @@ static void zynq_qspi_chipselect(struct spi_device *spi, bool assert)
 	/* Select the lower (CS0) or upper (CS1) memory */
 	if (ctlr->num_chipselect > 1) {
 		config_reg = zynq_qspi_read(xqspi, ZYNQ_QSPI_LINEAR_CFG_OFFSET);
-		if (!spi->chip_select)
+		if (!spi_get_chipselect(spi, 0))
 			config_reg &= ~ZYNQ_QSPI_LCFG_U_PAGE;
 		else
 			config_reg |= ZYNQ_QSPI_LCFG_U_PAGE;
diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c
index 270d28a3f8eb..fb2ca9b90eab 100644
--- a/drivers/spi/spi-zynqmp-gqspi.c
+++ b/drivers/spi/spi-zynqmp-gqspi.c
@@ -468,7 +468,7 @@ static void zynqmp_qspi_chipselect(struct spi_device *qspi, bool is_high)
 	genfifoentry |= GQSPI_GENFIFO_MODE_SPI;
 
 	if (!is_high) {
-		if (!qspi->chip_select) {
+		if (!spi_get_chipselect(qspi, 0)) {
 			xqspi->genfifobus = GQSPI_GENFIFO_BUS_LOWER;
 			xqspi->genfifocs = GQSPI_GENFIFO_CS_LOWER;
 		} else {
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index 5a038c667401..a399c617ca25 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -393,7 +393,7 @@ spidev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			struct spi_controller *ctlr = spi->controller;
 
 			if (ctlr->use_gpio_descriptors && ctlr->cs_gpiods &&
-			    ctlr->cs_gpiods[spi->chip_select])
+			    ctlr->cs_gpiods[spi_get_chipselect(spi, 0)])
 				tmp &= ~SPI_CS_HIGH;
 		}
 
@@ -432,7 +432,7 @@ spidev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			}
 
 			if (ctlr->use_gpio_descriptors && ctlr->cs_gpiods &&
-			    ctlr->cs_gpiods[spi->chip_select])
+			    ctlr->cs_gpiods[spi_get_chipselect(spi, 0)])
 				tmp |= SPI_CS_HIGH;
 
 			tmp |= spi->mode & ~SPI_MODE_MASK;
@@ -805,7 +805,7 @@ static int spidev_probe(struct spi_device *spi)
 		spidev->devt = MKDEV(SPIDEV_MAJOR, minor);
 		dev = device_create(spidev_class, &spi->dev, spidev->devt,
 				    spidev, "spidev%d.%d",
-				    spi->master->bus_num, spi->chip_select);
+				    spi->master->bus_num, spi_get_chipselect(spi, 0));
 		status = PTR_ERR_OR_ZERO(dev);
 	} else {
 		dev_dbg(&spi->dev, "no minor number available!\n");
diff --git a/include/trace/events/spi.h b/include/trace/events/spi.h
index c0d9844befd7..c0248a8fa79c 100644
--- a/include/trace/events/spi.h
+++ b/include/trace/events/spi.h
@@ -57,7 +57,7 @@ TRACE_EVENT(spi_setup,
 
 	TP_fast_assign(
 		__entry->bus_num = spi->controller->bus_num;
-		__entry->chip_select = spi->chip_select;
+		__entry->chip_select =  spi_get_chipselect(spi, 0);
 		__entry->mode = spi->mode;
 		__entry->bits_per_word = spi->bits_per_word;
 		__entry->max_speed_hz = spi->max_speed_hz;
@@ -88,7 +88,7 @@ TRACE_EVENT(spi_set_cs,
 
 	TP_fast_assign(
 		__entry->bus_num = spi->controller->bus_num;
-		__entry->chip_select = spi->chip_select;
+		__entry->chip_select = spi_get_chipselect(spi, 0);
 		__entry->mode = spi->mode;
 		__entry->enable = enable;
 	),
@@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(spi_message,
 
 	TP_fast_assign(
 		__entry->bus_num = msg->spi->controller->bus_num;
-		__entry->chip_select = msg->spi->chip_select;
+		__entry->chip_select = spi_get_chipselect(msg->spi, 0);
 		__entry->msg = msg;
 	),
 
@@ -154,7 +154,7 @@ TRACE_EVENT(spi_message_done,
 
 	TP_fast_assign(
 		__entry->bus_num = msg->spi->controller->bus_num;
-		__entry->chip_select = msg->spi->chip_select;
+		__entry->chip_select = spi_get_chipselect(msg->spi, 0);
 		__entry->msg = msg;
 		__entry->frame = msg->frame_length;
 		__entry->actual = msg->actual_length;
@@ -197,7 +197,7 @@ DECLARE_EVENT_CLASS(spi_transfer,
 
 	TP_fast_assign(
 		__entry->bus_num = msg->spi->controller->bus_num;
-		__entry->chip_select = msg->spi->chip_select;
+		__entry->chip_select = spi_get_chipselect(msg->spi, 0);
 		__entry->xfer = xfer;
 		__entry->len = xfer->len;
 
-- 
cgit v1.2.3


From cf587db2ee0261c74d04f61f39783db88a0b65e4 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:23 -0600
Subject: kernel: Allow a kernel thread's name to be set in copy_process

This patch allows kernel users to pass in the thread name so it can be
set during creation instead of having to use set_task_comm after the
thread is created.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched/task.h | 4 +++-
 init/main.c                | 2 +-
 kernel/fork.c              | 7 ++++++-
 kernel/kthread.c           | 3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 357e0068497c..32c9f01af0a6 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -23,6 +23,7 @@ struct kernel_clone_args {
 	int __user *pidfd;
 	int __user *child_tid;
 	int __user *parent_tid;
+	const char *name;
 	int exit_signal;
 	unsigned long stack;
 	unsigned long stack_size;
@@ -91,7 +92,8 @@ extern void exit_itimers(struct task_struct *);
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+			    unsigned long flags);
 extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 int kernel_wait(pid_t pid, int *stat);
diff --git a/init/main.c b/init/main.c
index 4425d1783d5c..49b5e1ef53f0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -707,7 +707,7 @@ noinline void __ref rest_init(void)
 	rcu_read_unlock();
 
 	numa_default_policy();
-	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+	pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES);
 	rcu_read_lock();
 	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
 	rcu_read_unlock();
diff --git a/kernel/fork.c b/kernel/fork.c
index f68954d05e89..81dfb2532148 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2112,6 +2112,9 @@ static __latent_entropy struct task_struct *copy_process(
 		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 	}
 
+	if (args->name)
+		strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -2730,7 +2733,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 /*
  * Create a kernel thread.
  */
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+		    unsigned long flags)
 {
 	struct kernel_clone_args args = {
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
@@ -2738,6 +2742,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
 		.fn		= fn,
 		.fn_arg		= arg,
+		.name		= name,
 		.kthread	= 1,
 	};
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..fa4a12ddf7fb 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -396,7 +396,8 @@ static void create_kthread(struct kthread_create_info *create)
 	current->pref_node_fork = create->node;
 #endif
 	/* We want our own signal handler (we take no signals by default). */
-	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+	pid = kernel_thread(kthread, create, NULL,
+			    CLONE_FS | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
 		/* Release the structure when caller killed by a fatal signal. */
 		struct completion *done = xchg(&create->done, NULL);
-- 
cgit v1.2.3


From c81cc5819faf5dd77124f5086aa654482281ac37 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:25 -0600
Subject: kernel: Make io_thread and kthread bit fields

We only set args->io_thread/kthread to 0 or 1 then test if they are set,
so make them bit fields.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched/task.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 32c9f01af0a6..268c77a42155 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -25,6 +25,8 @@ struct kernel_clone_args {
 	int __user *parent_tid;
 	const char *name;
 	int exit_signal;
+	u32 kthread:1;
+	u32 io_thread:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
@@ -32,8 +34,6 @@ struct kernel_clone_args {
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
 	int cgroup;
-	int io_thread;
-	int kthread;
 	int idle;
 	int (*fn)(void *);
 	void *fn_arg;
-- 
cgit v1.2.3


From 54e6842d0775ba76db65cbe21311c3ca466e663d Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:26 -0600
Subject: fork/vm: Move common PF_IO_WORKER behavior to new flag

This adds a new flag, PF_USER_WORKER, that's used for behavior common to
to both PF_IO_WORKER and users like vhost which will use a new helper
instead of create_io_thread because they require different behavior for
operations like signal handling.

The common behavior PF_USER_WORKER covers is the vm reclaim handling.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched.h      | 2 +-
 include/linux/sched/task.h | 1 +
 kernel/fork.c              | 3 +++
 mm/vmscan.c                | 4 ++--
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..e1e605b1255b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1729,7 +1729,7 @@ extern struct pid *cad_pid;
 #define PF_MEMALLOC		0x00000800	/* Allocating memory */
 #define PF_NPROC_EXCEEDED	0x00001000	/* set_user() noticed that RLIMIT_NPROC was exceeded */
 #define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
-#define PF__HOLE__00004000	0x00004000
+#define PF_USER_WORKER		0x00004000	/* Kernel thread cloned from userspace thread */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 #define PF__HOLE__00010000	0x00010000
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 268c77a42155..2950e83d5382 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -27,6 +27,7 @@ struct kernel_clone_args {
 	int exit_signal;
 	u32 kthread:1;
 	u32 io_thread:1;
+	u32 user_worker:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index 81dfb2532148..fb8ec192c199 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2103,6 +2103,8 @@ static __latent_entropy struct task_struct *copy_process(
 	p->flags &= ~PF_KTHREAD;
 	if (args->kthread)
 		p->flags |= PF_KTHREAD;
+	if (args->user_worker)
+		p->flags |= PF_USER_WORKER;
 	if (args->io_thread) {
 		/*
 		 * Mark us an IO worker, and block any signal that isn't
@@ -2630,6 +2632,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.fn		= fn,
 		.fn_arg		= arg,
 		.io_thread	= 1,
+		.user_worker	= 1,
 	};
 
 	return copy_process(NULL, 0, node, &args);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..7ba6bfdd9a5f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1151,12 +1151,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 	DEFINE_WAIT(wait);
 
 	/*
-	 * Do not throttle IO workers, kthreads other than kswapd or
+	 * Do not throttle user workers, kthreads other than kswapd or
 	 * workqueues. They may be required for reclaim to make
 	 * forward progress (e.g. journalling workqueues or kthreads).
 	 */
 	if (!current_is_kswapd() &&
-	    current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
+	    current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
 		cond_resched();
 		return;
 	}
-- 
cgit v1.2.3


From 11f3f500ec8a75c96087f3bed87aa2b1c5de7498 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:27 -0600
Subject: fork: add kernel_clone_args flag to not dup/clone files

Each vhost device gets a thread that is used to perform IO and management
operations. Instead of a thread that is accessing a device, the thread is
part of the device, so when it creates a thread using a helper based on
copy_process we can't dup or clone the parent's files/FDS because it
would do an extra increment on ourself.

Later, when we do:

Qemu process exits:
        do_exit -> exit_files -> put_files_struct -> close_files

we would leak the device's resources because of that extra refcount
on the fd or file_struct.

This patch adds a no_files option so these worker threads can prevent
taking an extra refcount on themselves.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched/task.h |  1 +
 kernel/fork.c              | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 2950e83d5382..4f816048794f 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -28,6 +28,7 @@ struct kernel_clone_args {
 	u32 kthread:1;
 	u32 io_thread:1;
 	u32 user_worker:1;
+	u32 no_files:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index fb8ec192c199..f8d530f4406e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1627,7 +1627,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+		      int no_files)
 {
 	struct files_struct *oldf, *newf;
 	int error = 0;
@@ -1639,6 +1640,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldf)
 		goto out;
 
+	if (no_files) {
+		tsk->files = NULL;
+		goto out;
+	}
+
 	if (clone_flags & CLONE_FILES) {
 		atomic_inc(&oldf->count);
 		goto out;
@@ -2259,7 +2265,7 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_semundo(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_security;
-	retval = copy_files(clone_flags, p);
+	retval = copy_files(clone_flags, p, args->no_files);
 	if (retval)
 		goto bad_fork_cleanup_semundo;
 	retval = copy_fs(clone_flags, p);
-- 
cgit v1.2.3


From 094717586bf71ac20ae3b240d2654d826634b21e Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:28 -0600
Subject: fork: Add kernel_clone_args flag to ignore signals

Since:

commit 10ab825bdef8 ("change kernel threads to ignore signals instead of
blocking them")

kthreads have been ignoring signals by default, and the vhost layer has
never had a need to change that. This patch adds an option flag,
USER_WORKER_SIG_IGN, handled in copy_process() after copy_sighand()
and copy_signals() so vhost_tasks added in the next patches can continue
to ignore singals.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched/task.h | 1 +
 kernel/fork.c              | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 4f816048794f..00c54bfac0b5 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -29,6 +29,7 @@ struct kernel_clone_args {
 	u32 io_thread:1;
 	u32 user_worker:1;
 	u32 no_files:1;
+	u32 ignore_signals:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index f8d530f4406e..2918c685ede5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2290,6 +2290,9 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
+	if (args->ignore_signals)
+		ignore_signals(p);
+
 	stackleak_task_init(p);
 
 	if (pid != &init_struct_pid) {
-- 
cgit v1.2.3


From 89c8e98d8cfb0656dbeb648572df5b13e372247d Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:29 -0600
Subject: fork: allow kernel code to call copy_process

The next patch adds helpers like create_io_thread, but for use by the
vhost layer. There are several functions, so they are in their own file
instead of cluttering up fork.c. This patch allows that new file to
call copy_process.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/sched/task.h | 2 ++
 kernel/fork.c              | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 00c54bfac0b5..537cbf9a2ade 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -93,6 +93,8 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct task_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *copy_process(struct pid *pid, int trace, int node,
+				 struct kernel_clone_args *args);
 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
diff --git a/kernel/fork.c b/kernel/fork.c
index 2918c685ede5..3363f284c6cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2016,7 +2016,7 @@ static void rv_task_fork(struct task_struct *p)
  * parts of the process environment (as per the clone
  * flags). The actual kick-off is left to the caller.
  */
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
 					struct pid *pid,
 					int trace,
 					int node,
-- 
cgit v1.2.3


From 2d683175827171c982f91996fdbef4f3fd8b1b01 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 15:31:25 +0100
Subject: mm,jfs: move write_one_page/folio_write_one to jfs

The last remaining user of folio_write_one through the write_one_page
wrapper is jfs, so move the functionality there and hard code the
call to metapage_writepage.

Note that the use of the pagecache by the JFS 'metapage' buffer cache
is a bit odd, and we could probably do without VM-level dirty tracking
at all, but that's a change for another time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/jfs_metapage.c   | 39 ++++++++++++++++++++++++++++++++++-----
 include/linux/pagemap.h |  6 ------
 mm/page-writeback.c     | 40 ----------------------------------------
 3 files changed, 34 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2e8461ce74de..961569c11159 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp)
 	unlock_page(mp->page);
 }
 
+static int metapage_write_one(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+	struct address_space *mapping = folio->mapping;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = folio_nr_pages(folio),
+	};
+	int ret = 0;
+
+	BUG_ON(!folio_test_locked(folio));
+
+	folio_wait_writeback(folio);
+
+	if (folio_clear_dirty_for_io(folio)) {
+		folio_get(folio);
+		ret = metapage_writepage(page, &wbc);
+		if (ret == 0)
+			folio_wait_writeback(folio);
+		folio_put(folio);
+	} else {
+		folio_unlock(folio);
+	}
+
+	if (!ret)
+		ret = filemap_check_errors(mapping);
+	return ret;
+}
+
 void force_metapage(struct metapage *mp)
 {
 	struct page *page = mp->page;
@@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp)
 	get_page(page);
 	lock_page(page);
 	set_page_dirty(page);
-	if (write_one_page(page))
-		jfs_error(mp->sb, "write_one_page() failed\n");
+	if (metapage_write_one(page))
+		jfs_error(mp->sb, "metapage_write_one() failed\n");
 	clear_bit(META_forcewrite, &mp->flag);
 	put_page(page);
 }
@@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp)
 		set_page_dirty(page);
 		if (test_bit(META_sync, &mp->flag)) {
 			clear_bit(META_sync, &mp->flag);
-			if (write_one_page(page))
-				jfs_error(mp->sb, "write_one_page() failed\n");
-			lock_page(page); /* write_one_page unlocks the page */
+			if (metapage_write_one(page))
+				jfs_error(mp->sb, "metapage_write_one() failed\n");
+			lock_page(page);
 		}
 	} else if (mp->lsn)	/* discard_metapage doesn't remove it */
 		remove_from_logsync(mp);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..853184a46411 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1066,12 +1066,6 @@ static inline void folio_cancel_dirty(struct folio *folio)
 bool folio_clear_dirty_for_io(struct folio *folio);
 bool clear_page_dirty_for_io(struct page *page);
 void folio_invalidate(struct folio *folio, size_t offset, size_t length);
-int __must_check folio_write_one(struct folio *folio);
-static inline int __must_check write_one_page(struct page *page)
-{
-	return folio_write_one(page_folio(page));
-}
-
 int __set_page_dirty_nobuffers(struct page *page);
 bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 516b1aa247e8..db7943999007 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2583,46 +2583,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return ret;
 }
 
-/**
- * folio_write_one - write out a single folio and wait on I/O.
- * @folio: The folio to write.
- *
- * The folio must be locked by the caller and will be unlocked upon return.
- *
- * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
- * function returns.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int folio_write_one(struct folio *folio)
-{
-	struct address_space *mapping = folio->mapping;
-	int ret = 0;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = folio_nr_pages(folio),
-	};
-
-	BUG_ON(!folio_test_locked(folio));
-
-	folio_wait_writeback(folio);
-
-	if (folio_clear_dirty_for_io(folio)) {
-		folio_get(folio);
-		ret = mapping->a_ops->writepage(&folio->page, &wbc);
-		if (ret == 0)
-			folio_wait_writeback(folio);
-		folio_put(folio);
-	} else {
-		folio_unlock(folio);
-	}
-
-	if (!ret)
-		ret = filemap_check_errors(mapping);
-	return ret;
-}
-EXPORT_SYMBOL(folio_write_one);
-
 /*
  * For address_spaces which do not use buffers nor write back.
  */
-- 
cgit v1.2.3


From ce48f955860db841eeb4bcf9bb973ea3ef177b3a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 2 Mar 2023 18:12:06 +0200
Subject: i3c: Correct reference to the I²C device data type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I²C peripheral devices that are connected to the controller are
represented in the Linux kernel as objects of the struct i2c_client.
Fix this in the header file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230302161206.38106-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 604a126b78c8..a12cda9dc715 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -22,9 +22,10 @@
 #define I3C_BROADCAST_ADDR		0x7e
 #define I3C_MAX_ADDR			GENMASK(6, 0)
 
+struct i2c_client;
+
 struct i3c_master_controller;
 struct i3c_bus;
-struct i2c_device;
 struct i3c_device;
 
 /**
-- 
cgit v1.2.3


From e9d198f2be851f8c977b9ac47748edc4f9d0f26e Mon Sep 17 00:00:00 2001
From: Thorsten Scherer <t.scherer@eckelmann.de>
Date: Sun, 12 Mar 2023 13:23:15 +0100
Subject: slab: Adjust comment after refactoring of gfp.h

Reflect the change from the commit below.

Fixes: cb5a065b4ea9 ("headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>")
Signed-off-by: Thorsten Scherer <t.scherer@eckelmann.de>
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 45af70315a94..87d687c43d8c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -526,7 +526,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
  * to be at least to the size.
  *
  * The @flags argument may be one of the GFP flags defined at
- * include/linux/gfp.h and described at
+ * include/linux/gfp_types.h and described at
  * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
  *
  * The recommended usage of the @flags is described at
-- 
cgit v1.2.3


From 2e4ef6f4798c1d2951dd7bb3ae5f0d41ec3d31e8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Mon, 13 Mar 2023 13:03:40 +0200
Subject: ASoC: SOF: uapi: header: Convert sof_abi_hdr comments to kernel style

Replace the comments for sof_abi_hdr to kernel style.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Jaska Uimonen <jaska.uimonen@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20230313110344.16644-4-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/header.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h
index e9bba93a5399..e53b62b9e2c5 100644
--- a/include/uapi/sound/sof/header.h
+++ b/include/uapi/sound/sof/header.h
@@ -11,19 +11,25 @@
 
 #include <linux/types.h>
 
-/*
- * Header for all non IPC ABI data.
+/**
+ * struct sof_abi_hdr - Header for all non IPC ABI data.
+ * @magic: Magic number for validation: 0x00464F53 ('S', 'O', 'F', '\0')
+ * @type: Component specific type
+ * @size: The size in bytes of the data, excluding this struct
+ * @abi: SOF ABI version
+ * @reserved: Reserved for future use
+ * @data: Component data - opaque to core
  *
  * Identifies data type, size and ABI.
  * Used by any bespoke component data structures or binary blobs.
  */
 struct sof_abi_hdr {
-	__u32 magic;		/**< 'S', 'O', 'F', '\0' */
-	__u32 type;		/**< component specific type */
-	__u32 size;		/**< size in bytes of data excl. this struct */
-	__u32 abi;		/**< SOF ABI version */
-	__u32 reserved[4];	/**< reserved for future use */
-	__u32 data[];		/**< Component data - opaque to core */
+	__u32 magic;
+	__u32 type;
+	__u32 size;
+	__u32 abi;
+	__u32 reserved[4];
+	__u32 data[];
 }  __packed;
 
 #define SOF_MANIFEST_DATA_TYPE_NHLT 1
-- 
cgit v1.2.3


From ea4a4e82f625ae451175a2a74779776b006d25a1 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Mon, 13 Mar 2023 13:03:41 +0200
Subject: ASoC: SOF: uapi: header: Update sof_abi_hdr doc for IPC4 use

With IPC4 the sof_abit_hdr is only used between user space
(and in topology) and kernel.
The same abi header is used with small differencies like different
magic number and the type field have slightly different name, but
similar function in IPC4 (param_id).

Update the kernel documentation to highlight the differences.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Jaska Uimonen <jaska.uimonen@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Link: https://lore.kernel.org/r/20230313110344.16644-5-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/abi.h    |  2 ++
 include/uapi/sound/sof/header.h | 11 ++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h
index 3566630ca965..45c657c3919e 100644
--- a/include/uapi/sound/sof/abi.h
+++ b/include/uapi/sound/sof/abi.h
@@ -60,5 +60,7 @@
 
 /* SOF ABI magic number "SOF\0". */
 #define SOF_ABI_MAGIC		0x00464F53
+/* SOF IPC4 ABI magic number "SOF4". */
+#define SOF_IPC4_ABI_MAGIC	0x34464F53
 
 #endif
diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h
index e53b62b9e2c5..cb3c1ace69e3 100644
--- a/include/uapi/sound/sof/header.h
+++ b/include/uapi/sound/sof/header.h
@@ -13,10 +13,15 @@
 
 /**
  * struct sof_abi_hdr - Header for all non IPC ABI data.
- * @magic: Magic number for validation: 0x00464F53 ('S', 'O', 'F', '\0')
- * @type: Component specific type
+ * @magic: Magic number for validation
+ *	   for IPC3 data: 0x00464F53 ('S', 'O', 'F', '\0')
+ *	   for IPC4 data: 0x34464F53 ('S', 'O', 'F', '4')
+ * @type: module specific parameter
+ *	  for IPC3: Component specific type
+ *	  for IPC4: parameter ID (param_id) of the data
  * @size: The size in bytes of the data, excluding this struct
- * @abi: SOF ABI version
+ * @abi: SOF ABI version. The version is valid in scope of the 'magic', IPC3 and
+ *	 IPC4 ABI version numbers have no relationship.
  * @reserved: Reserved for future use
  * @data: Component data - opaque to core
  *
-- 
cgit v1.2.3


From 3f738e4a126c9ee082d814edeb7416697f9e2b37 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Mon, 13 Mar 2023 14:48:46 +0200
Subject: ASoC: SOF: rename a couple of tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename SOF_TKN_CAVS_AUDIO_FORMAT_IN_VALID and
SOF_TKN_CAVS_AUDIO_FORMAT_OUT_VALID as
SOF_TKN_CAVS_AUDIO_FORMAT_IN_VALID_BIT_DEPTH and
SOF_TKN_CAVS_AUDIO_FORMAT_OUT_VALID_BIT_DEPTH respectively.

These are currently not used.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230313124856.8140-2-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index bacaf8a6317e..a1ef6b5c0d45 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -173,7 +173,7 @@
 /* CAVS AUDIO FORMAT */
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_RATE	1900
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_BIT_DEPTH	1901
-#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_VALID_BIT	1902
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_VALID_BIT_DEPTH	1902
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CHANNELS	1903
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_MAP	1904
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_CFG	1905
@@ -183,7 +183,7 @@
 /* intentional token numbering discontinuity, reserved for future use */
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_RATE	1930
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_BIT_DEPTH	1931
-#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_VALID_BIT	1932
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_VALID_BIT_DEPTH 1932
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CHANNELS	1933
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_MAP	1934
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_CFG	1935
-- 
cgit v1.2.3


From bb79f2a608245cd92b3183d77aec6902e51de950 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Mon, 13 Mar 2023 14:48:47 +0200
Subject: ASoC: SOF: Use input/output pin consistently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently we use input/output and sink/source pins interchangeably.
Remove the references to sink/source pins and replace with input/output
pins everywhere for consistency and clarity.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230313124856.8140-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h | 12 +++----
 sound/soc/sof/ipc4-topology.c   | 48 ++++++++++++-------------
 sound/soc/sof/sof-audio.h       | 32 ++++++++---------
 sound/soc/sof/topology.c        | 80 ++++++++++++++++++++---------------------
 4 files changed, 86 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index a1ef6b5c0d45..0b5110427132 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -88,14 +88,14 @@
 #define SOF_TKN_COMP_CPC			406
 #define SOF_TKN_COMP_IS_PAGES			409
 #define SOF_TKN_COMP_NUM_AUDIO_FORMATS		410
-#define SOF_TKN_COMP_NUM_SINK_PINS		411
-#define SOF_TKN_COMP_NUM_SOURCE_PINS		412
+#define SOF_TKN_COMP_NUM_INPUT_PINS		411
+#define SOF_TKN_COMP_NUM_OUTPUT_PINS		412
 /*
- * The token for sink/source pin binding, it specifies the widget
- * name that the sink/source pin is connected from/to.
+ * The token for input/output pin binding, it specifies the widget
+ * name that the input/output pin is connected from/to.
  */
-#define SOF_TKN_COMP_SINK_PIN_BINDING_WNAME	413
-#define SOF_TKN_COMP_SRC_PIN_BINDING_WNAME	414
+#define SOF_TKN_COMP_INPUT_PIN_BINDING_WNAME	413
+#define SOF_TKN_COMP_OUTPUT_PIN_BINDING_WNAME	414
 
 
 /* SSP */
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 0bb16ed38e48..a501eb9befa8 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -1698,17 +1698,17 @@ static int sof_ipc4_get_queue_id(struct snd_sof_widget *src_widget,
 	u32 num_pins;
 	int i;
 
-	if (pin_type == SOF_PIN_TYPE_SOURCE) {
+	if (pin_type == SOF_PIN_TYPE_OUTPUT) {
 		current_swidget = src_widget;
-		pin_binding = src_widget->src_pin_binding;
-		queue_ida = &src_widget->src_queue_ida;
-		num_pins = src_widget->num_source_pins;
+		pin_binding = src_widget->output_pin_binding;
+		queue_ida = &src_widget->output_queue_ida;
+		num_pins = src_widget->num_output_pins;
 		buddy_name = sink_widget->widget->name;
 	} else {
 		current_swidget = sink_widget;
-		pin_binding = sink_widget->sink_pin_binding;
-		queue_ida = &sink_widget->sink_queue_ida;
-		num_pins = sink_widget->num_sink_pins;
+		pin_binding = sink_widget->input_pin_binding;
+		queue_ida = &sink_widget->input_queue_ida;
+		num_pins = sink_widget->num_input_pins;
 		buddy_name = src_widget->widget->name;
 	}
 
@@ -1716,12 +1716,12 @@ static int sof_ipc4_get_queue_id(struct snd_sof_widget *src_widget,
 
 	if (num_pins < 1) {
 		dev_err(scomp->dev, "invalid %s num_pins: %d for queue allocation for %s\n",
-			(pin_type == SOF_PIN_TYPE_SOURCE ? "source" : "sink"),
+			(pin_type == SOF_PIN_TYPE_OUTPUT ? "output" : "input"),
 			num_pins, current_swidget->widget->name);
 		return -EINVAL;
 	}
 
-	/* If there is only one sink/source pin, queue id must be 0 */
+	/* If there is only one input/output pin, queue id must be 0 */
 	if (num_pins == 1)
 		return 0;
 
@@ -1736,7 +1736,7 @@ static int sof_ipc4_get_queue_id(struct snd_sof_widget *src_widget,
 		 * mixed use pin binding array and ida for queue ID allocation.
 		 */
 		dev_err(scomp->dev, "no %s queue id found from pin binding array for %s\n",
-			(pin_type == SOF_PIN_TYPE_SOURCE ? "source" : "sink"),
+			(pin_type == SOF_PIN_TYPE_OUTPUT ? "output" : "input"),
 			current_swidget->widget->name);
 		return -EINVAL;
 	}
@@ -1752,14 +1752,14 @@ static void sof_ipc4_put_queue_id(struct snd_sof_widget *swidget, int queue_id,
 	char **pin_binding;
 	int num_pins;
 
-	if (pin_type == SOF_PIN_TYPE_SOURCE) {
-		pin_binding = swidget->src_pin_binding;
-		queue_ida = &swidget->src_queue_ida;
-		num_pins = swidget->num_source_pins;
+	if (pin_type == SOF_PIN_TYPE_OUTPUT) {
+		pin_binding = swidget->output_pin_binding;
+		queue_ida = &swidget->output_queue_ida;
+		num_pins = swidget->num_output_pins;
 	} else {
-		pin_binding = swidget->sink_pin_binding;
-		queue_ida = &swidget->sink_queue_ida;
-		num_pins = swidget->num_sink_pins;
+		pin_binding = swidget->input_pin_binding;
+		queue_ida = &swidget->input_queue_ida;
+		num_pins = swidget->num_input_pins;
 	}
 
 	/* Nothing to free if queue ID is not allocated with ida. */
@@ -1829,7 +1829,7 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
 	int ret;
 
 	sroute->src_queue_id = sof_ipc4_get_queue_id(src_widget, sink_widget,
-						     SOF_PIN_TYPE_SOURCE);
+						     SOF_PIN_TYPE_OUTPUT);
 	if (sroute->src_queue_id < 0) {
 		dev_err(sdev->dev, "failed to get queue ID for source widget: %s\n",
 			src_widget->widget->name);
@@ -1837,12 +1837,12 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
 	}
 
 	sroute->dst_queue_id = sof_ipc4_get_queue_id(src_widget, sink_widget,
-						     SOF_PIN_TYPE_SINK);
+						     SOF_PIN_TYPE_INPUT);
 	if (sroute->dst_queue_id < 0) {
 		dev_err(sdev->dev, "failed to get queue ID for sink widget: %s\n",
 			sink_widget->widget->name);
 		sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id,
-				      SOF_PIN_TYPE_SOURCE);
+				      SOF_PIN_TYPE_OUTPUT);
 		return sroute->dst_queue_id;
 	}
 
@@ -1886,8 +1886,8 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
 	return ret;
 
 out:
-	sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id, SOF_PIN_TYPE_SOURCE);
-	sof_ipc4_put_queue_id(sink_widget, sroute->dst_queue_id, SOF_PIN_TYPE_SINK);
+	sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id, SOF_PIN_TYPE_OUTPUT);
+	sof_ipc4_put_queue_id(sink_widget, sroute->dst_queue_id, SOF_PIN_TYPE_INPUT);
 	return ret;
 }
 
@@ -1932,8 +1932,8 @@ static int sof_ipc4_route_free(struct snd_sof_dev *sdev, struct snd_sof_route *s
 			src_widget->widget->name, sroute->src_queue_id,
 			sink_widget->widget->name, sroute->dst_queue_id);
 out:
-	sof_ipc4_put_queue_id(sink_widget, sroute->dst_queue_id, SOF_PIN_TYPE_SINK);
-	sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id, SOF_PIN_TYPE_SOURCE);
+	sof_ipc4_put_queue_id(sink_widget, sroute->dst_queue_id, SOF_PIN_TYPE_INPUT);
+	sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id, SOF_PIN_TYPE_OUTPUT);
 
 	return ret;
 }
diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h
index 239b82f37976..a1c4d3f34153 100644
--- a/sound/soc/sof/sof-audio.h
+++ b/sound/soc/sof/sof-audio.h
@@ -30,9 +30,9 @@
  */
 #define SOF_WIDGET_MAX_NUM_PINS	8
 
-/* The type of a widget pin is either sink or source */
-#define SOF_PIN_TYPE_SINK	0
-#define SOF_PIN_TYPE_SOURCE	1
+/* Widget pin type */
+#define SOF_PIN_TYPE_INPUT	0
+#define SOF_PIN_TYPE_OUTPUT	1
 
 /* max number of FE PCMs before BEs */
 #define SOF_BE_PCM_BASE		16
@@ -433,31 +433,31 @@ struct snd_sof_widget {
 	struct snd_sof_tuple *tuples;
 
 	/*
-	 * The allowed range for num_sink/source_pins is [0, SOF_WIDGET_MAX_NUM_PINS].
-	 * Widgets may have zero sink or source pins, for example the tone widget has
-	 * zero sink pins.
+	 * The allowed range for num_input/output_pins is [0, SOF_WIDGET_MAX_NUM_PINS].
+	 * Widgets may have zero input or output pins, for example the tone widget has
+	 * zero input pins.
 	 */
-	u32 num_sink_pins;
-	u32 num_source_pins;
+	u32 num_input_pins;
+	u32 num_output_pins;
 
 	/*
-	 * The sink/source pin binding array, it takes the form of
+	 * The input/output pin binding array, it takes the form of
 	 * [widget_name_connected_to_pin0, widget_name_connected_to_pin1, ...],
 	 * with the index as the queue ID.
 	 *
 	 * The array is used for special pin binding. Note that even if there
-	 * is only one sink/source pin requires special pin binding, pin binding
-	 * should be defined for all sink/source pins in topology, for pin(s) that
+	 * is only one input/output pin requires special pin binding, pin binding
+	 * should be defined for all input/output pins in topology, for pin(s) that
 	 * are not used, give the value "NotConnected".
 	 *
 	 * If pin binding is not defined in topology, nothing to parse in the kernel,
-	 * sink_pin_binding and src_pin_binding shall be NULL.
+	 * input_pin_binding and output_pin_binding shall be NULL.
 	 */
-	char **sink_pin_binding;
-	char **src_pin_binding;
+	char **input_pin_binding;
+	char **output_pin_binding;
 
-	struct ida src_queue_ida;
-	struct ida sink_queue_ida;
+	struct ida output_queue_ida;
+	struct ida input_queue_ida;
 
 	void *private;		/* core does not touch this */
 };
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index 9f3a038fe21a..9f84b4c362c3 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -416,19 +416,19 @@ static const struct sof_topology_token led_tokens[] = {
 };
 
 static const struct sof_topology_token comp_pin_tokens[] = {
-	{SOF_TKN_COMP_NUM_SINK_PINS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct snd_sof_widget, num_sink_pins)},
-	{SOF_TKN_COMP_NUM_SOURCE_PINS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct snd_sof_widget, num_source_pins)},
+	{SOF_TKN_COMP_NUM_INPUT_PINS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct snd_sof_widget, num_input_pins)},
+	{SOF_TKN_COMP_NUM_OUTPUT_PINS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct snd_sof_widget, num_output_pins)},
 };
 
-static const struct sof_topology_token comp_sink_pin_binding_tokens[] = {
-	{SOF_TKN_COMP_SINK_PIN_BINDING_WNAME, SND_SOC_TPLG_TUPLE_TYPE_STRING,
+static const struct sof_topology_token comp_input_pin_binding_tokens[] = {
+	{SOF_TKN_COMP_INPUT_PIN_BINDING_WNAME, SND_SOC_TPLG_TUPLE_TYPE_STRING,
 		get_token_string, 0},
 };
 
-static const struct sof_topology_token comp_src_pin_binding_tokens[] = {
-	{SOF_TKN_COMP_SRC_PIN_BINDING_WNAME, SND_SOC_TPLG_TUPLE_TYPE_STRING,
+static const struct sof_topology_token comp_output_pin_binding_tokens[] = {
+	{SOF_TKN_COMP_OUTPUT_PIN_BINDING_WNAME, SND_SOC_TPLG_TUPLE_TYPE_STRING,
 		get_token_string, 0},
 };
 
@@ -1286,12 +1286,12 @@ static void sof_free_pin_binding(struct snd_sof_widget *swidget,
 	u32 num_pins;
 	int i;
 
-	if (pin_type == SOF_PIN_TYPE_SINK) {
-		pin_binding = swidget->sink_pin_binding;
-		num_pins = swidget->num_sink_pins;
+	if (pin_type == SOF_PIN_TYPE_INPUT) {
+		pin_binding = swidget->input_pin_binding;
+		num_pins = swidget->num_input_pins;
 	} else {
-		pin_binding = swidget->src_pin_binding;
-		num_pins = swidget->num_source_pins;
+		pin_binding = swidget->output_pin_binding;
+		num_pins = swidget->num_output_pins;
 	}
 
 	if (pin_binding) {
@@ -1313,14 +1313,14 @@ static int sof_parse_pin_binding(struct snd_sof_widget *swidget,
 	int ret;
 	int i;
 
-	if (pin_type == SOF_PIN_TYPE_SINK) {
-		num_pins = swidget->num_sink_pins;
-		pin_binding_token = comp_sink_pin_binding_tokens;
-		token_count = ARRAY_SIZE(comp_sink_pin_binding_tokens);
+	if (pin_type == SOF_PIN_TYPE_INPUT) {
+		num_pins = swidget->num_input_pins;
+		pin_binding_token = comp_input_pin_binding_tokens;
+		token_count = ARRAY_SIZE(comp_input_pin_binding_tokens);
 	} else {
-		num_pins = swidget->num_source_pins;
-		pin_binding_token = comp_src_pin_binding_tokens;
-		token_count = ARRAY_SIZE(comp_src_pin_binding_tokens);
+		num_pins = swidget->num_output_pins;
+		pin_binding_token = comp_output_pin_binding_tokens;
+		token_count = ARRAY_SIZE(comp_output_pin_binding_tokens);
 	}
 
 	memset(pin_binding, 0, SOF_WIDGET_MAX_NUM_PINS * sizeof(char *));
@@ -1337,10 +1337,10 @@ static int sof_parse_pin_binding(struct snd_sof_widget *swidget,
 			ret = -ENOMEM;
 			goto err;
 		}
-		if (pin_type == SOF_PIN_TYPE_SINK)
-			swidget->sink_pin_binding = pb;
+		if (pin_type == SOF_PIN_TYPE_INPUT)
+			swidget->input_pin_binding = pb;
 		else
-			swidget->src_pin_binding = pb;
+			swidget->output_pin_binding = pb;
 	}
 
 	return 0;
@@ -1379,8 +1379,8 @@ static int sof_widget_ready(struct snd_soc_component *scomp, int index,
 	swidget->private = NULL;
 	mutex_init(&swidget->setup_mutex);
 
-	ida_init(&swidget->src_queue_ida);
-	ida_init(&swidget->sink_queue_ida);
+	ida_init(&swidget->output_queue_ida);
+	ida_init(&swidget->input_queue_ida);
 
 	ret = sof_parse_tokens(scomp, swidget, comp_pin_tokens,
 			       ARRAY_SIZE(comp_pin_tokens), priv->array,
@@ -1391,29 +1391,29 @@ static int sof_widget_ready(struct snd_soc_component *scomp, int index,
 		goto widget_free;
 	}
 
-	if (swidget->num_sink_pins > SOF_WIDGET_MAX_NUM_PINS ||
-	    swidget->num_source_pins > SOF_WIDGET_MAX_NUM_PINS) {
-		dev_err(scomp->dev, "invalid pins for %s: [sink: %d, src: %d]\n",
-			swidget->widget->name, swidget->num_sink_pins, swidget->num_source_pins);
+	if (swidget->num_input_pins > SOF_WIDGET_MAX_NUM_PINS ||
+	    swidget->num_output_pins > SOF_WIDGET_MAX_NUM_PINS) {
+		dev_err(scomp->dev, "invalid pins for %s: [input: %d, output: %d]\n",
+			swidget->widget->name, swidget->num_input_pins, swidget->num_output_pins);
 		ret = -EINVAL;
 		goto widget_free;
 	}
 
-	if (swidget->num_sink_pins > 1) {
-		ret = sof_parse_pin_binding(swidget, priv, SOF_PIN_TYPE_SINK);
+	if (swidget->num_input_pins > 1) {
+		ret = sof_parse_pin_binding(swidget, priv, SOF_PIN_TYPE_INPUT);
 		/* on parsing error, pin binding is not allocated, nothing to free. */
 		if (ret < 0) {
-			dev_err(scomp->dev, "failed to parse sink pin binding for %s\n",
+			dev_err(scomp->dev, "failed to parse input pin binding for %s\n",
 				w->name);
 			goto widget_free;
 		}
 	}
 
-	if (swidget->num_source_pins > 1) {
-		ret = sof_parse_pin_binding(swidget, priv, SOF_PIN_TYPE_SOURCE);
+	if (swidget->num_output_pins > 1) {
+		ret = sof_parse_pin_binding(swidget, priv, SOF_PIN_TYPE_OUTPUT);
 		/* on parsing error, pin binding is not allocated, nothing to free. */
 		if (ret < 0) {
-			dev_err(scomp->dev, "failed to parse source pin binding for %s\n",
+			dev_err(scomp->dev, "failed to parse output pin binding for %s\n",
 				w->name);
 			goto widget_free;
 		}
@@ -1422,7 +1422,7 @@ static int sof_widget_ready(struct snd_soc_component *scomp, int index,
 	dev_dbg(scomp->dev,
 		"tplg: widget %d (%s) is ready [type: %d, pipe: %d, pins: %d / %d, stream: %s]\n",
 		swidget->comp_id, w->name, swidget->id, index,
-		swidget->num_sink_pins, swidget->num_source_pins,
+		swidget->num_input_pins, swidget->num_output_pins,
 		strnlen(w->sname, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) > 0 ? w->sname : "none");
 
 	widget_ops = tplg_ops ? tplg_ops->widget : NULL;
@@ -1643,11 +1643,11 @@ out:
 	if (widget_ops && widget_ops[swidget->id].ipc_free)
 		widget_ops[swidget->id].ipc_free(swidget);
 
-	ida_destroy(&swidget->src_queue_ida);
-	ida_destroy(&swidget->sink_queue_ida);
+	ida_destroy(&swidget->output_queue_ida);
+	ida_destroy(&swidget->input_queue_ida);
 
-	sof_free_pin_binding(swidget, SOF_PIN_TYPE_SINK);
-	sof_free_pin_binding(swidget, SOF_PIN_TYPE_SOURCE);
+	sof_free_pin_binding(swidget, SOF_PIN_TYPE_INPUT);
+	sof_free_pin_binding(swidget, SOF_PIN_TYPE_OUTPUT);
 
 	kfree(swidget->tuples);
 
-- 
cgit v1.2.3


From 594c1bb9ff7365b90cb4d325deb8c38ddda90557 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Mon, 13 Mar 2023 14:48:49 +0200
Subject: ASoC: SOF: ipc4-topology: Do not parse the DMA_BUFFER_SIZE token
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not parse the SOF_TKN_CAVS_AUDIO_FORMAT_DMA_BUFFER_SIZE token as the
dma_buffer_size can be derived from the input/output buffer size and the
type of widget during copier prepare. For the deep buffer case,
introduce a new token that will be used to get the deep buffer DMA size
for the host copier from topology.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230313124856.8140-5-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h |   1 +
 sound/soc/sof/ipc4-topology.c   | 112 +++++++++++++++++-----------------------
 sound/soc/sof/ipc4-topology.h   |   5 +-
 sound/soc/sof/sof-audio.h       |   2 +-
 sound/soc/sof/topology.c        |   1 -
 5 files changed, 52 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index 0b5110427132..9e91e2640dd4 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -197,6 +197,7 @@
 
 /* COPIER */
 #define SOF_TKN_INTEL_COPIER_NODE_TYPE		1980
+#define SOF_TKN_INTEL_COPIER_DEEP_BUFFER_DMA_MS	1981
 
 /* ACP I2S */
 #define SOF_TKN_AMD_ACPI2S_RATE			1700
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index d56e0f12b5d0..50cd4fe3141c 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -78,8 +78,8 @@ static const struct sof_topology_token ipc4_out_audio_format_tokens[] = {
 		offsetof(struct sof_ipc4_audio_format, fmt_cfg)},
 };
 
-static const struct sof_topology_token ipc4_copier_gateway_cfg_tokens[] = {
-	{SOF_TKN_CAVS_AUDIO_FORMAT_DMA_BUFFER_SIZE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, 0},
+static const struct sof_topology_token ipc4_copier_deep_buffer_tokens[] = {
+	{SOF_TKN_INTEL_COPIER_DEEP_BUFFER_DMA_MS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, 0},
 };
 
 static const struct sof_topology_token ipc4_copier_tokens[] = {
@@ -138,8 +138,8 @@ static const struct sof_token_info ipc4_token_list[SOF_TOKEN_COUNT] = {
 	[SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS] = {"IPC4 Audio format buffer size tokens",
 		ipc4_audio_format_buffer_size_tokens,
 		ARRAY_SIZE(ipc4_audio_format_buffer_size_tokens)},
-	[SOF_COPIER_GATEWAY_CFG_TOKENS] = {"IPC4 Copier gateway config tokens",
-		ipc4_copier_gateway_cfg_tokens, ARRAY_SIZE(ipc4_copier_gateway_cfg_tokens)},
+	[SOF_COPIER_DEEP_BUFFER_TOKENS] = {"IPC4 Copier deep buffer tokens",
+		ipc4_copier_deep_buffer_tokens, ARRAY_SIZE(ipc4_copier_deep_buffer_tokens)},
 	[SOF_COPIER_TOKENS] = {"IPC4 Copier tokens", ipc4_copier_tokens,
 		ARRAY_SIZE(ipc4_copier_tokens)},
 	[SOF_AUDIO_FMT_NUM_TOKENS] = {"IPC4 Audio format number tokens",
@@ -348,7 +348,7 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget)
 	struct snd_soc_component *scomp = swidget->scomp;
 	struct sof_ipc4_copier *ipc4_copier;
 	int node_type = 0;
-	int ret, i;
+	int ret;
 
 	ipc4_copier = kzalloc(sizeof(*ipc4_copier), GFP_KERNEL);
 	if (!ipc4_copier)
@@ -363,13 +363,6 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget)
 	if (ret)
 		goto free_copier;
 
-	available_fmt->dma_buffer_size = kcalloc(available_fmt->audio_fmt_num, sizeof(u32),
-						 GFP_KERNEL);
-	if (!available_fmt->dma_buffer_size) {
-		ret = -ENOMEM;
-		goto free_available_fmt;
-	}
-
 	/*
 	 * This callback is used by host copier and module-to-module copier,
 	 * and only host copier needs to set gtw_cfg.
@@ -377,21 +370,6 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget)
 	if (!WIDGET_IS_AIF(swidget->id))
 		goto skip_gtw_cfg;
 
-	ret = sof_update_ipc_object(scomp, available_fmt->dma_buffer_size,
-				    SOF_COPIER_GATEWAY_CFG_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(u32),
-				    available_fmt->audio_fmt_num);
-	if (ret) {
-		dev_err(scomp->dev, "Failed to parse dma buffer size in audio format for %s\n",
-			swidget->widget->name);
-		goto err;
-	}
-
-	dev_dbg(scomp->dev, "dma buffer size:\n");
-	for (i = 0; i < available_fmt->audio_fmt_num; i++)
-		dev_dbg(scomp->dev, "%d: %u\n", i,
-			available_fmt->dma_buffer_size[i]);
-
 	ret = sof_update_ipc_object(scomp, &node_type,
 				    SOF_COPIER_TOKENS, swidget->tuples,
 				    swidget->num_tuples, sizeof(node_type), 1);
@@ -399,7 +377,7 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget)
 	if (ret) {
 		dev_err(scomp->dev, "parse host copier node type token failed %d\n",
 			ret);
-		goto err;
+		goto free_available_fmt;
 	}
 	dev_dbg(scomp->dev, "host copier '%s' node_type %u\n", swidget->widget->name, node_type);
 
@@ -407,7 +385,7 @@ skip_gtw_cfg:
 	ipc4_copier->gtw_attr = kzalloc(sizeof(*ipc4_copier->gtw_attr), GFP_KERNEL);
 	if (!ipc4_copier->gtw_attr) {
 		ret = -ENOMEM;
-		goto err;
+		goto free_available_fmt;
 	}
 
 	ipc4_copier->copier_config = (uint32_t *)ipc4_copier->gtw_attr;
@@ -438,8 +416,6 @@ skip_gtw_cfg:
 
 free_gtw_attr:
 	kfree(ipc4_copier->gtw_attr);
-err:
-	kfree(available_fmt->dma_buffer_size);
 free_available_fmt:
 	sof_ipc4_free_audio_fmt(available_fmt);
 free_copier:
@@ -457,7 +433,6 @@ static void sof_ipc4_widget_free_comp_pcm(struct snd_sof_widget *swidget)
 		return;
 
 	available_fmt = &ipc4_copier->available_fmt;
-	kfree(available_fmt->dma_buffer_size);
 	kfree(available_fmt->base_config);
 	kfree(available_fmt->out_audio_fmt);
 	kfree(ipc4_copier->gtw_attr);
@@ -472,7 +447,7 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 	struct snd_sof_dai *dai = swidget->private;
 	struct sof_ipc4_copier *ipc4_copier;
 	int node_type = 0;
-	int ret, i;
+	int ret;
 
 	ipc4_copier = kzalloc(sizeof(*ipc4_copier), GFP_KERNEL);
 	if (!ipc4_copier)
@@ -486,33 +461,12 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 	if (ret)
 		goto free_copier;
 
-	available_fmt->dma_buffer_size = kcalloc(available_fmt->audio_fmt_num, sizeof(u32),
-						 GFP_KERNEL);
-	if (!available_fmt->dma_buffer_size) {
-		ret = -ENOMEM;
-		goto free_available_fmt;
-	}
-
-	ret = sof_update_ipc_object(scomp, available_fmt->dma_buffer_size,
-				    SOF_COPIER_GATEWAY_CFG_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(u32),
-				    available_fmt->audio_fmt_num);
-	if (ret) {
-		dev_err(scomp->dev, "Failed to parse dma buffer size in audio format for %s\n",
-			swidget->widget->name);
-		goto err;
-	}
-
-	for (i = 0; i < available_fmt->audio_fmt_num; i++)
-		dev_dbg(scomp->dev, "%d: dma buffer size: %u\n", i,
-			available_fmt->dma_buffer_size[i]);
-
 	ret = sof_update_ipc_object(scomp, &node_type,
 				    SOF_COPIER_TOKENS, swidget->tuples,
 				    swidget->num_tuples, sizeof(node_type), 1);
 	if (ret) {
 		dev_err(scomp->dev, "parse dai node type failed %d\n", ret);
-		goto err;
+		goto free_available_fmt;
 	}
 
 	ret = sof_update_ipc_object(scomp, ipc4_copier,
@@ -520,7 +474,7 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 				    swidget->num_tuples, sizeof(u32), 1);
 	if (ret) {
 		dev_err(scomp->dev, "parse dai copier node token failed %d\n", ret);
-		goto err;
+		goto free_available_fmt;
 	}
 
 	dev_dbg(scomp->dev, "dai %s node_type %u dai_type %u dai_index %d\n", swidget->widget->name,
@@ -554,7 +508,7 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 		blob = kzalloc(sizeof(*blob), GFP_KERNEL);
 		if (!blob) {
 			ret = -ENOMEM;
-			goto err;
+			goto free_available_fmt;
 		}
 
 		list_for_each_entry(w, &sdev->widget_list, list) {
@@ -583,7 +537,7 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 		ipc4_copier->gtw_attr = kzalloc(sizeof(*ipc4_copier->gtw_attr), GFP_KERNEL);
 		if (!ipc4_copier->gtw_attr) {
 			ret = -ENOMEM;
-			goto err;
+			goto free_available_fmt;
 		}
 
 		ipc4_copier->copier_config = (uint32_t *)ipc4_copier->gtw_attr;
@@ -604,8 +558,6 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 
 free_copier_config:
 	kfree(ipc4_copier->copier_config);
-err:
-	kfree(available_fmt->dma_buffer_size);
 free_available_fmt:
 	sof_ipc4_free_audio_fmt(available_fmt);
 free_copier:
@@ -633,7 +585,6 @@ static void sof_ipc4_widget_free_comp_dai(struct snd_sof_widget *swidget)
 	ipc4_copier = dai->private;
 	available_fmt = &ipc4_copier->available_fmt;
 
-	kfree(available_fmt->dma_buffer_size);
 	kfree(available_fmt->base_config);
 	kfree(available_fmt->out_audio_fmt);
 	if (ipc4_copier->dai_type != SOF_DAI_INTEL_SSP &&
@@ -1166,6 +1117,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 	int *ipc_config_size;
 	u32 **data;
 	int ipc_size, ret;
+	u32 deep_buffer_dma_ms = 0;
 
 	dev_dbg(sdev->dev, "copier %s, type %d", swidget->widget->name, swidget->id);
 
@@ -1177,6 +1129,16 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		struct snd_sof_widget *pipe_widget;
 		struct sof_ipc4_pipeline *pipeline;
 
+		/* parse the deep buffer dma size */
+		ret = sof_update_ipc_object(scomp, &deep_buffer_dma_ms,
+					    SOF_COPIER_DEEP_BUFFER_TOKENS, swidget->tuples,
+					    swidget->num_tuples, sizeof(u32), 1);
+		if (ret) {
+			dev_err(scomp->dev, "Failed to parse deep buffer dma size for %s\n",
+				swidget->widget->name);
+			return ret;
+		}
+
 		pipe_widget = swidget->spipe->pipe_widget;
 		pipeline = pipe_widget->private;
 		ipc4_copier = (struct sof_ipc4_copier *)swidget->private;
@@ -1367,8 +1329,29 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		return -EINVAL;
 	}
 
-	/* set the gateway dma_buffer_size using the matched ID returned above */
-	copier_data->gtw_cfg.dma_buffer_size = available_fmt->dma_buffer_size[ret];
+	/*
+	 * Set the gateway dma_buffer_size to 2ms buffer size to meet the FW expectation. In the
+	 * deep buffer case, set the dma_buffer_size depending on the deep_buffer_dma_ms set
+	 * in topology.
+	 */
+	switch (swidget->id) {
+	case snd_soc_dapm_dai_in:
+		copier_data->gtw_cfg.dma_buffer_size =
+			SOF_IPC4_MIN_DMA_BUFFER_SIZE * copier_data->base_config.ibs;
+		break;
+	case snd_soc_dapm_aif_in:
+			copier_data->gtw_cfg.dma_buffer_size =
+				max((u32)SOF_IPC4_MIN_DMA_BUFFER_SIZE, deep_buffer_dma_ms) *
+					copier_data->base_config.ibs;
+		break;
+	case snd_soc_dapm_dai_out:
+	case snd_soc_dapm_aif_out:
+		copier_data->gtw_cfg.dma_buffer_size =
+			SOF_IPC4_MIN_DMA_BUFFER_SIZE * copier_data->base_config.obs;
+		break;
+	default:
+		break;
+	}
 
 	data = &ipc4_copier->copier_config;
 	ipc_config_size = &ipc4_copier->ipc_config_size;
@@ -2170,7 +2153,7 @@ static enum sof_tokens common_copier_token_list[] = {
 	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
-	SOF_COPIER_GATEWAY_CFG_TOKENS,
+	SOF_COPIER_DEEP_BUFFER_TOKENS,
 	SOF_COPIER_TOKENS,
 	SOF_COMP_EXT_TOKENS,
 };
@@ -2186,7 +2169,6 @@ static enum sof_tokens dai_token_list[] = {
 	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
-	SOF_COPIER_GATEWAY_CFG_TOKENS,
 	SOF_COPIER_TOKENS,
 	SOF_DAI_TOKENS,
 	SOF_COMP_EXT_TOKENS,
diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h
index addc5b55cc10..696d6c39a21b 100644
--- a/sound/soc/sof/ipc4-topology.h
+++ b/sound/soc/sof/ipc4-topology.h
@@ -55,6 +55,9 @@
 
 #define SOF_IPC4_INVALID_NODE_ID	0xffffffff
 
+/* FW requires minimum 2ms DMA buffer size */
+#define SOF_IPC4_MIN_DMA_BUFFER_SIZE	2
+
 /*
  * The base of multi-gateways. Multi-gateways addressing starts from
  * ALH_MULTI_GTW_BASE and there are ALH_MULTI_GTW_COUNT multi-sources
@@ -148,7 +151,6 @@ struct ipc4_pipeline_set_state_data {
  * @out_audio_fmt: Available output audio format
  * @input_audio_fmts: Available input audio formats
  * @ref_audio_fmt: Reference audio format to match runtime audio format
- * @dma_buffer_size: Available Gateway DMA buffer size (in bytes)
  * @audio_fmt_num: Number of available audio formats
  */
 struct sof_ipc4_available_audio_format {
@@ -156,7 +158,6 @@ struct sof_ipc4_available_audio_format {
 	struct sof_ipc4_audio_format *out_audio_fmt;
 	struct sof_ipc4_audio_format *input_audio_fmts;
 	struct sof_ipc4_audio_format *ref_audio_fmt;
-	u32 *dma_buffer_size;
 	int audio_fmt_num;
 };
 
diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h
index a1c4d3f34153..4504f9efdc50 100644
--- a/sound/soc/sof/sof-audio.h
+++ b/sound/soc/sof/sof-audio.h
@@ -257,7 +257,7 @@ enum sof_tokens {
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
-	SOF_COPIER_GATEWAY_CFG_TOKENS,
+	SOF_COPIER_DEEP_BUFFER_TOKENS,
 	SOF_COPIER_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
 	SOF_COPIER_FORMAT_TOKENS,
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index 9f84b4c362c3..2d76ab13b3d1 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -1232,7 +1232,6 @@ static int sof_widget_parse_tokens(struct snd_soc_component *scomp, struct snd_s
 			continue;
 		case SOF_IN_AUDIO_FORMAT_TOKENS:
 		case SOF_OUT_AUDIO_FORMAT_TOKENS:
-		case SOF_COPIER_GATEWAY_CFG_TOKENS:
 		case SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS:
 			num_sets = sof_get_token_value(SOF_TKN_COMP_NUM_AUDIO_FORMATS,
 						       swidget->tuples, swidget->num_tuples);
-- 
cgit v1.2.3


From 7ab6b1e8302cf7a9bc8808c43b3e751e4148a351 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Mon, 13 Mar 2023 14:48:52 +0200
Subject: ASoC: SOF: ipc4-topology: Modify the type of available input/output
 formats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new struct sof_ipc4_pin_format which contains the pin index
and the buffer size. Replace the type of available input/output audio
formats in struct sof_ipc4_available_audio_format with this new struct
type and rename them to input_pin_fmts and output_pin_fmts.

Also, add a new token, SOF_TKN_CAVS_AUDIO_FORMAT_PIN_INDEX that will be
used to parse the pin index for the audio format from topology.
Currently we only set the audio format for Pin 0 in topology, so the
default value will be 0 for all audio formats.

Finally, parse the pin_index and the input/output buffer sizes
along with audio formats into the pin_format arrays in struct
sof_ipc4_available_audio_format. This makes the base_config array in struct
sof_ipc4_available_audio_format redundant. So remove it. This change
will allow the addition of audio formats for the non-zero pins in
topology transparent to the topology parser in the kernel.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230313124856.8140-8-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h |   1 +
 sound/soc/sof/ipc4-pcm.c        |   2 +-
 sound/soc/sof/ipc4-topology.c   | 181 ++++++++++++++++------------------------
 sound/soc/sof/ipc4-topology.h   |  27 ++++--
 sound/soc/sof/sof-audio.h       |   1 -
 sound/soc/sof/topology.c        |   1 -
 6 files changed, 96 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index 9e91e2640dd4..92360601b49c 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -180,6 +180,7 @@
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_INTERLEAVING_STYLE	1906
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_FMT_CFG	1907
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_SAMPLE_TYPE	1908
+#define SOF_TKN_CAVS_AUDIO_FORMAT_PIN_INDEX		1909
 /* intentional token numbering discontinuity, reserved for future use */
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_RATE	1930
 #define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_BIT_DEPTH	1931
diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c
index a679c08d9bb1..701da5ee4e4e 100644
--- a/sound/soc/sof/ipc4-pcm.c
+++ b/sound/soc/sof/ipc4-pcm.c
@@ -389,7 +389,7 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd,
 	snd_mask_none(fmt);
 	snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S32_LE);
 
-	rate->min = ipc4_copier->available_fmt.input_audio_fmts->sampling_frequency;
+	rate->min = ipc4_copier->available_fmt.input_pin_fmts->audio_fmt.sampling_frequency;
 	rate->max = rate->min;
 
 	switch (ipc4_copier->dai_type) {
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 0e1e4fc9224c..c462db95f3b2 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -41,41 +41,44 @@ static const struct sof_topology_token ipc4_comp_tokens[] = {
 		offsetof(struct sof_ipc4_base_module_cfg, is_pages)},
 };
 
-static const struct sof_topology_token ipc4_audio_format_buffer_size_tokens[] = {
-	{SOF_TKN_CAVS_AUDIO_FORMAT_IBS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_base_module_cfg, ibs)},
-	{SOF_TKN_CAVS_AUDIO_FORMAT_OBS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_base_module_cfg, obs)},
-};
-
 static const struct sof_topology_token ipc4_in_audio_format_tokens[] = {
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_RATE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, sampling_frequency)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.sampling_frequency)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_BIT_DEPTH, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, bit_depth)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.bit_depth)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_MAP, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, ch_map)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.ch_map)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_CFG, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, ch_cfg)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.ch_cfg)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_INTERLEAVING_STYLE, SND_SOC_TPLG_TUPLE_TYPE_WORD,
-		get_token_u32, offsetof(struct sof_ipc4_audio_format, interleaving_style)},
+		get_token_u32, offsetof(struct sof_ipc4_pin_format,
+		audio_fmt.interleaving_style)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_IN_FMT_CFG, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, fmt_cfg)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.fmt_cfg)},
+	{SOF_TKN_CAVS_AUDIO_FORMAT_PIN_INDEX, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_pin_format, pin_index)},
+	{SOF_TKN_CAVS_AUDIO_FORMAT_IBS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_pin_format, buffer_size)},
 };
 
 static const struct sof_topology_token ipc4_out_audio_format_tokens[] = {
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_RATE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, sampling_frequency)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.sampling_frequency)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_BIT_DEPTH, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, bit_depth)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.bit_depth)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_MAP, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, ch_map)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.ch_map)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_CFG, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, ch_cfg)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.ch_cfg)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_INTERLEAVING_STYLE, SND_SOC_TPLG_TUPLE_TYPE_WORD,
-		get_token_u32, offsetof(struct sof_ipc4_audio_format, interleaving_style)},
+		get_token_u32, offsetof(struct sof_ipc4_pin_format,
+		audio_fmt.interleaving_style)},
 	{SOF_TKN_CAVS_AUDIO_FORMAT_OUT_FMT_CFG, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		offsetof(struct sof_ipc4_audio_format, fmt_cfg)},
+		offsetof(struct sof_ipc4_pin_format, audio_fmt.fmt_cfg)},
+	{SOF_TKN_CAVS_AUDIO_FORMAT_PIN_INDEX, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_pin_format, pin_index)},
+	{SOF_TKN_CAVS_AUDIO_FORMAT_OBS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_pin_format, buffer_size)},
 };
 
 static const struct sof_topology_token ipc4_copier_deep_buffer_tokens[] = {
@@ -135,9 +138,6 @@ static const struct sof_token_info ipc4_token_list[SOF_TOKEN_COUNT] = {
 		ipc4_in_audio_format_tokens, ARRAY_SIZE(ipc4_in_audio_format_tokens)},
 	[SOF_OUT_AUDIO_FORMAT_TOKENS] = {"IPC4 Output Audio format tokens",
 		ipc4_out_audio_format_tokens, ARRAY_SIZE(ipc4_out_audio_format_tokens)},
-	[SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS] = {"IPC4 Audio format buffer size tokens",
-		ipc4_audio_format_buffer_size_tokens,
-		ARRAY_SIZE(ipc4_audio_format_buffer_size_tokens)},
 	[SOF_COPIER_DEEP_BUFFER_TOKENS] = {"IPC4 Copier deep buffer tokens",
 		ipc4_copier_deep_buffer_tokens, ARRAY_SIZE(ipc4_copier_deep_buffer_tokens)},
 	[SOF_COPIER_TOKENS] = {"IPC4 Copier tokens", ipc4_copier_tokens,
@@ -148,20 +148,18 @@ static const struct sof_token_info ipc4_token_list[SOF_TOKEN_COUNT] = {
 	[SOF_SRC_TOKENS] = {"SRC tokens", src_tokens, ARRAY_SIZE(src_tokens)},
 };
 
-static void sof_ipc4_dbg_audio_format(struct device *dev,
-				      struct sof_ipc4_audio_format *format,
-				      size_t object_size, int num_format)
+static void sof_ipc4_dbg_audio_format(struct device *dev, struct sof_ipc4_pin_format *pin_fmt,
+				      int num_formats)
 {
-	struct sof_ipc4_audio_format *fmt;
-	void *ptr = format;
 	int i;
 
-	for (i = 0; i < num_format; i++, ptr = (u8 *)ptr + object_size) {
-		fmt = ptr;
+	for (i = 0; i < num_formats; i++) {
+		struct sof_ipc4_audio_format *fmt = &pin_fmt[i].audio_fmt;
 		dev_dbg(dev,
-			" #%d: %uHz, %ubit (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x)\n",
-			i, fmt->sampling_frequency, fmt->bit_depth, fmt->ch_map,
-			fmt->ch_cfg, fmt->interleaving_style, fmt->fmt_cfg);
+			"Pin index #%d: %uHz, %ubit (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x) buffer size %d\n",
+			pin_fmt[i].pin_index, fmt->sampling_frequency, fmt->bit_depth, fmt->ch_map,
+			fmt->ch_cfg, fmt->interleaving_style, fmt->fmt_cfg,
+			pin_fmt[i].buffer_size);
 	}
 }
 
@@ -179,10 +177,9 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 				  struct sof_ipc4_available_audio_format *available_fmt,
 				  struct sof_ipc4_base_module_cfg *module_base_cfg)
 {
-	struct sof_ipc4_base_module_cfg *base_config;
-	struct sof_ipc4_audio_format *out_format, *in_format;
+	struct sof_ipc4_pin_format *out_format, *in_format;
 	int audio_fmt_num = 0;
-	int ret, i;
+	int ret;
 
 	ret = sof_update_ipc_object(scomp, &audio_fmt_num,
 				    SOF_AUDIO_FMT_NUM_TOKENS, swidget->tuples,
@@ -195,47 +192,25 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 
 	dev_dbg(scomp->dev, "Number of audio formats: %d\n", available_fmt->audio_fmt_num);
 
-	base_config = kcalloc(available_fmt->audio_fmt_num, sizeof(*base_config), GFP_KERNEL);
-	if (!base_config)
-		return -ENOMEM;
-
 	/* set cpc and is_pages in the module's base_config */
 	ret = sof_update_ipc_object(scomp, module_base_cfg, SOF_COMP_TOKENS, swidget->tuples,
 				    swidget->num_tuples, sizeof(*module_base_cfg), 1);
 	if (ret) {
 		dev_err(scomp->dev, "parse comp tokens for %s failed, error: %d\n",
 			swidget->widget->name, ret);
-		goto err;
+		return ret;
 	}
 
 	dev_dbg(scomp->dev, "widget %s cpc: %d is_pages: %d\n",
 		swidget->widget->name, module_base_cfg->cpc, module_base_cfg->is_pages);
 
-	/* copy the ibs/obs for each base_cfg */
-	ret = sof_update_ipc_object(scomp, base_config,
-				    SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(*base_config),
-				    available_fmt->audio_fmt_num);
-	if (ret) {
-		dev_err(scomp->dev, "parse buffer size tokens failed %d\n", ret);
-		goto err;
-	}
-
-	for (i = 0; i < available_fmt->audio_fmt_num; i++)
-		dev_dbg(scomp->dev, "%d: ibs: %d obs: %d\n", i,
-			base_config[i].ibs, base_config[i].obs);
-
-	in_format = kcalloc(available_fmt->audio_fmt_num,
-			    sizeof(struct sof_ipc4_audio_format), GFP_KERNEL);
-	if (!in_format) {
-		kfree(base_config);
-		ret = -ENOMEM;
-		goto err;
-	}
+	in_format = kcalloc(available_fmt->audio_fmt_num, sizeof(*in_format), GFP_KERNEL);
+	if (!in_format)
+		return -ENOMEM;
 
-	ret = sof_update_ipc_object(scomp, available_fmt->input_audio_fmts,
+	ret = sof_update_ipc_object(scomp, in_format,
 				    SOF_IN_AUDIO_FORMAT_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(struct sof_ipc4_audio_format),
+				    swidget->num_tuples, sizeof(*in_format),
 				    available_fmt->audio_fmt_num);
 	if (ret) {
 		dev_err(scomp->dev, "parse base_config audio_fmt tokens failed %d\n", ret);
@@ -243,8 +218,7 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 	}
 
 	dev_dbg(scomp->dev, "Get input audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(scomp->dev, available_fmt->input_audio_fmts,
-				  sizeof(struct sof_ipc4_audio_format),
+	sof_ipc4_dbg_audio_format(scomp->dev, available_fmt->input_pin_fmts,
 				  available_fmt->audio_fmt_num);
 
 	out_format = kcalloc(available_fmt->audio_fmt_num, sizeof(*out_format), GFP_KERNEL);
@@ -264,12 +238,10 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 	}
 
 	dev_dbg(scomp->dev, "Get output audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(scomp->dev, out_format, sizeof(*out_format),
-				  available_fmt->audio_fmt_num);
+	sof_ipc4_dbg_audio_format(scomp->dev, out_format, available_fmt->audio_fmt_num);
 
-	available_fmt->base_config = base_config;
-	available_fmt->out_audio_fmt = out_format;
-	available_fmt->input_audio_fmts = in_format;
+	available_fmt->output_pin_fmts = out_format;
+	available_fmt->input_pin_fmts = in_format;
 
 	return 0;
 
@@ -277,8 +249,6 @@ err_out:
 	kfree(out_format);
 err_in:
 	kfree(in_format);
-err:
-	kfree(base_config);
 	return ret;
 }
 
@@ -286,12 +256,10 @@ err:
 static void sof_ipc4_free_audio_fmt(struct sof_ipc4_available_audio_format *available_fmt)
 
 {
-	kfree(available_fmt->base_config);
-	available_fmt->base_config = NULL;
-	kfree(available_fmt->out_audio_fmt);
-	available_fmt->out_audio_fmt = NULL;
-	kfree(available_fmt->input_audio_fmts);
-	available_fmt->input_audio_fmts = NULL;
+	kfree(available_fmt->output_pin_fmts);
+	available_fmt->output_pin_fmts = NULL;
+	kfree(available_fmt->input_pin_fmts);
+	available_fmt->input_pin_fmts = NULL;
 }
 
 static void sof_ipc4_widget_free_comp_pipeline(struct snd_sof_widget *swidget)
@@ -431,8 +399,7 @@ static void sof_ipc4_widget_free_comp_pcm(struct snd_sof_widget *swidget)
 		return;
 
 	available_fmt = &ipc4_copier->available_fmt;
-	kfree(available_fmt->base_config);
-	kfree(available_fmt->out_audio_fmt);
+	kfree(available_fmt->output_pin_fmts);
 	kfree(ipc4_copier->gtw_attr);
 	kfree(ipc4_copier);
 	swidget->private = NULL;
@@ -584,8 +551,7 @@ static void sof_ipc4_widget_free_comp_dai(struct snd_sof_widget *swidget)
 	ipc4_copier = dai->private;
 	available_fmt = &ipc4_copier->available_fmt;
 
-	kfree(available_fmt->base_config);
-	kfree(available_fmt->out_audio_fmt);
+	kfree(available_fmt->output_pin_fmts);
 	if (ipc4_copier->dai_type != SOF_DAI_INTEL_SSP &&
 	    ipc4_copier->dai_type != SOF_DAI_INTEL_DMIC)
 		kfree(ipc4_copier->copier_config);
@@ -866,14 +832,14 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 				   struct snd_pcm_hw_params *params,
 				   struct sof_ipc4_available_audio_format *available_fmt)
 {
-	struct sof_ipc4_audio_format *fmt = available_fmt->ref_audio_fmt;
+	struct sof_ipc4_pin_format *pin_fmt = available_fmt->ref_audio_fmt;
 	u32 valid_bits;
 	u32 channels;
 	u32 rate;
 	int sample_valid_bits;
 	int i;
 
-	if (!fmt) {
+	if (!pin_fmt) {
 		dev_err(sdev->dev, "no reference formats for %s\n", swidget->widget->name);
 		return -EINVAL;
 	}
@@ -902,7 +868,9 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 	 * Search supported audio formats to match rate, channels ,and
 	 * sample_valid_bytes from runtime params
 	 */
-	for (i = 0; i < available_fmt->audio_fmt_num; i++, fmt++) {
+	for (i = 0; i < available_fmt->audio_fmt_num; i++) {
+		struct sof_ipc4_audio_format *fmt = &pin_fmt[i].audio_fmt;
+
 		rate = fmt->sampling_frequency;
 		channels = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(fmt->fmt_cfg);
 		valid_bits = SOF_IPC4_AUDIO_FORMAT_CFG_V_BIT_DEPTH(fmt->fmt_cfg);
@@ -921,15 +889,16 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 	}
 
 	/* copy input format */
-	memcpy(&base_config->audio_fmt, &available_fmt->input_audio_fmts[i], sizeof(*fmt));
+	memcpy(&base_config->audio_fmt, &available_fmt->input_pin_fmts[i].audio_fmt,
+	       sizeof(*out_format));
 
 	/* set base_cfg ibs/obs */
-	base_config->ibs = available_fmt->base_config[i].ibs;
-	base_config->obs = available_fmt->base_config[i].obs;
+	base_config->ibs = available_fmt->input_pin_fmts[i].buffer_size;
+	base_config->obs = available_fmt->output_pin_fmts[i].buffer_size;
 
 	dev_dbg(sdev->dev, "Init input audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(sdev->dev, &base_config->audio_fmt,
-				  sizeof(*base_config), 1);
+	sof_ipc4_dbg_audio_format(sdev->dev, &available_fmt->input_pin_fmts[i], 1);
+
 	if (out_format) {
 		/*
 		 * Current topology defines pin 0 input and output formats only in pairs.
@@ -938,10 +907,10 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 		 * input format. This logic will need to be updated when the format definitions
 		 * in topology change.
 		 */
-		memcpy(out_format, &available_fmt->out_audio_fmt[i], sizeof(*fmt));
+		memcpy(out_format, &available_fmt->output_pin_fmts[i].audio_fmt,
+		       sizeof(*out_format));
 		dev_dbg(sdev->dev, "Init output audio formats for %s\n", swidget->widget->name);
-		sof_ipc4_dbg_audio_format(sdev->dev, out_format,
-					  sizeof(*out_format), 1);
+		sof_ipc4_dbg_audio_format(sdev->dev, &available_fmt->output_pin_fmts[i], 1);
 	}
 
 	/* Return the index of the matched format */
@@ -1144,13 +1113,13 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		available_fmt = &ipc4_copier->available_fmt;
 
 		/*
-		 * Use the input_audio_fmts to match pcm params for playback and the out_audio_fmt
+		 * Use the input_pin_fmts to match pcm params for playback and the output_pin_fmts
 		 * for capture.
 		 */
 		if (dir == SNDRV_PCM_STREAM_PLAYBACK)
-			available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+			available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 		else
-			available_fmt->ref_audio_fmt = available_fmt->out_audio_fmt;
+			available_fmt->ref_audio_fmt = available_fmt->output_pin_fmts;
 
 		copier_data->gtw_cfg.node_id &= ~SOF_IPC4_NODE_INDEX_MASK;
 		copier_data->gtw_cfg.node_id |=
@@ -1170,7 +1139,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		copier_data = &ipc4_copier->data;
 		available_fmt = &ipc4_copier->available_fmt;
 		if (dir == SNDRV_PCM_STREAM_CAPTURE) {
-			available_fmt->ref_audio_fmt = available_fmt->out_audio_fmt;
+			available_fmt->ref_audio_fmt = available_fmt->output_pin_fmts;
 
 			/*
 			 * modify the input params for the dai copier as it only supports
@@ -1180,7 +1149,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 			snd_mask_none(fmt);
 			snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S32_LE);
 		} else {
-			available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+			available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 		}
 
 		ref_params = pipeline_params;
@@ -1201,7 +1170,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		available_fmt = &ipc4_copier->available_fmt;
 
 		/* Use the input formats as the reference to match pcm params */
-		available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+		available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 		ref_params = pipeline_params;
 
 		break;
@@ -1388,7 +1357,7 @@ static int sof_ipc4_prepare_gain_module(struct snd_sof_widget *swidget,
 	struct sof_ipc4_available_audio_format *available_fmt = &gain->available_fmt;
 	int ret;
 
-	available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+	available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 
 	/* output format is not required to be sent to the FW for gain */
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &gain->base_config,
@@ -1414,7 +1383,7 @@ static int sof_ipc4_prepare_mixer_module(struct snd_sof_widget *swidget,
 	int ret;
 
 	/* only 32bit is supported by mixer */
-	available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+	available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 
 	/* output format is not required to be sent to the FW for mixer */
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &mixer->base_config,
@@ -1440,7 +1409,7 @@ static int sof_ipc4_prepare_src_module(struct snd_sof_widget *swidget,
 	struct snd_interval *rate;
 	int ret;
 
-	available_fmt->ref_audio_fmt = available_fmt->input_audio_fmts;
+	available_fmt->ref_audio_fmt = available_fmt->input_pin_fmts;
 
 	/* output format is not required to be sent to the FW for SRC */
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &src->base_config,
@@ -2147,7 +2116,6 @@ static int sof_ipc4_link_setup(struct snd_sof_dev *sdev, struct snd_soc_dai_link
 static enum sof_tokens common_copier_token_list[] = {
 	SOF_COMP_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_COPIER_DEEP_BUFFER_TOKENS,
@@ -2163,7 +2131,6 @@ static enum sof_tokens pipeline_token_list[] = {
 static enum sof_tokens dai_token_list[] = {
 	SOF_COMP_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_COPIER_TOKENS,
@@ -2175,8 +2142,8 @@ static enum sof_tokens pga_token_list[] = {
 	SOF_COMP_TOKENS,
 	SOF_GAIN_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
+	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_COMP_EXT_TOKENS,
 };
 
@@ -2184,7 +2151,7 @@ static enum sof_tokens mixer_token_list[] = {
 	SOF_COMP_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
+	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_COMP_EXT_TOKENS,
 };
 
@@ -2193,7 +2160,7 @@ static enum sof_tokens src_token_list[] = {
 	SOF_SRC_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
+	SOF_OUT_AUDIO_FORMAT_TOKENS,
 	SOF_COMP_EXT_TOKENS,
 };
 
diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h
index 696d6c39a21b..2d9b1ba549f7 100644
--- a/sound/soc/sof/ipc4-topology.h
+++ b/sound/soc/sof/ipc4-topology.h
@@ -145,19 +145,32 @@ struct ipc4_pipeline_set_state_data {
 	DECLARE_FLEX_ARRAY(u32, pipeline_ids);
 } __packed;
 
+/**
+ * struct sof_ipc4_pin_format - Module pin format
+ * @pin_index: pin index
+ * @buffer_size: buffer size in bytes
+ * @audio_fmt: audio format for the pin
+ *
+ * This structure can be used for both output or input pins and the pin_index is relative to the
+ * pin type i.e output/input pin
+ */
+struct sof_ipc4_pin_format {
+	u32 pin_index;
+	u32 buffer_size;
+	struct sof_ipc4_audio_format audio_fmt;
+};
+
 /**
  * struct sof_ipc4_available_audio_format - Available audio formats
- * @base_config: Available base config
- * @out_audio_fmt: Available output audio format
- * @input_audio_fmts: Available input audio formats
+ * @output_pin_fmts: Available output pin formats
+ * @input_pin_fmts: Available input pin formats
  * @ref_audio_fmt: Reference audio format to match runtime audio format
  * @audio_fmt_num: Number of available audio formats
  */
 struct sof_ipc4_available_audio_format {
-	struct sof_ipc4_base_module_cfg *base_config;
-	struct sof_ipc4_audio_format *out_audio_fmt;
-	struct sof_ipc4_audio_format *input_audio_fmts;
-	struct sof_ipc4_audio_format *ref_audio_fmt;
+	struct sof_ipc4_pin_format *output_pin_fmts;
+	struct sof_ipc4_pin_format *input_pin_fmts;
+	struct sof_ipc4_pin_format *ref_audio_fmt;
 	int audio_fmt_num;
 };
 
diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h
index 4504f9efdc50..d220af5f08fb 100644
--- a/sound/soc/sof/sof-audio.h
+++ b/sound/soc/sof/sof-audio.h
@@ -256,7 +256,6 @@ enum sof_tokens {
 	SOF_COMP_EXT_TOKENS,
 	SOF_IN_AUDIO_FORMAT_TOKENS,
 	SOF_OUT_AUDIO_FORMAT_TOKENS,
-	SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS,
 	SOF_COPIER_DEEP_BUFFER_TOKENS,
 	SOF_COPIER_TOKENS,
 	SOF_AUDIO_FMT_NUM_TOKENS,
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index 2d76ab13b3d1..3a091f18731f 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -1232,7 +1232,6 @@ static int sof_widget_parse_tokens(struct snd_soc_component *scomp, struct snd_s
 			continue;
 		case SOF_IN_AUDIO_FORMAT_TOKENS:
 		case SOF_OUT_AUDIO_FORMAT_TOKENS:
-		case SOF_AUDIO_FORMAT_BUFFER_SIZE_TOKENS:
 			num_sets = sof_get_token_value(SOF_TKN_COMP_NUM_AUDIO_FORMATS,
 						       swidget->tuples, swidget->num_tuples);
 
-- 
cgit v1.2.3


From 4fdef47a44d6ff735902dfe740918f23932225ca Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Mon, 13 Mar 2023 14:48:55 +0200
Subject: ASoC: SOF: ipc4-topology: Add new tokens for input/output pin format
 count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for handling processing modules with different
input/output pin counts, introduce two new tokens for input/output
audio format counts. Use these token values to parse all the available
audio formats from topology.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230313124856.8140-11-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h |   2 +
 sound/soc/sof/ipc4-topology.c   | 143 ++++++++++++++++++++++++----------------
 sound/soc/sof/ipc4-topology.h   |   6 +-
 sound/soc/sof/topology.c        |  42 +++++++-----
 4 files changed, 116 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index 92360601b49c..bd02842124f9 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -96,6 +96,8 @@
  */
 #define SOF_TKN_COMP_INPUT_PIN_BINDING_WNAME	413
 #define SOF_TKN_COMP_OUTPUT_PIN_BINDING_WNAME	414
+#define SOF_TKN_COMP_NUM_INPUT_AUDIO_FORMATS	415
+#define SOF_TKN_COMP_NUM_OUTPUT_AUDIO_FORMATS	416
 
 
 /* SSP */
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 3aacc440a5e4..c91a90dd8a6e 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -90,8 +90,10 @@ static const struct sof_topology_token ipc4_copier_tokens[] = {
 };
 
 static const struct sof_topology_token ipc4_audio_fmt_num_tokens[] = {
-	{SOF_TKN_COMP_NUM_AUDIO_FORMATS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
-		0},
+	{SOF_TKN_COMP_NUM_INPUT_AUDIO_FORMATS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_available_audio_format, num_input_formats)},
+	{SOF_TKN_COMP_NUM_OUTPUT_AUDIO_FORMATS, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc4_available_audio_format, num_output_formats)},
 };
 
 static const struct sof_topology_token dai_tokens[] = {
@@ -178,19 +180,24 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 				  struct sof_ipc4_base_module_cfg *module_base_cfg)
 {
 	struct sof_ipc4_pin_format *out_format, *in_format;
-	int audio_fmt_num = 0;
 	int ret;
 
-	ret = sof_update_ipc_object(scomp, &audio_fmt_num,
+	ret = sof_update_ipc_object(scomp, available_fmt,
 				    SOF_AUDIO_FMT_NUM_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(audio_fmt_num), 1);
-	if (ret || audio_fmt_num <= 0) {
-		dev_err(scomp->dev, "Invalid number of audio formats: %d\n", audio_fmt_num);
+				    swidget->num_tuples, sizeof(available_fmt), 1);
+	if (ret) {
+		dev_err(scomp->dev, "Failed to parse audio format token count\n");
+		return ret;
+	}
+
+	if (!available_fmt->num_input_formats && !available_fmt->num_output_formats) {
+		dev_err(scomp->dev, "No input/output pin formats set in topology\n");
 		return -EINVAL;
 	}
-	available_fmt->audio_fmt_num = audio_fmt_num;
 
-	dev_dbg(scomp->dev, "Number of audio formats: %d\n", available_fmt->audio_fmt_num);
+	dev_dbg(scomp->dev,
+		"Number of input audio formats: %d. Number of output audio formats: %d\n",
+		available_fmt->num_input_formats, available_fmt->num_output_formats);
 
 	/* set cpc and is_pages in the module's base_config */
 	ret = sof_update_ipc_object(scomp, module_base_cfg, SOF_COMP_TOKENS, swidget->tuples,
@@ -204,51 +211,57 @@ static int sof_ipc4_get_audio_fmt(struct snd_soc_component *scomp,
 	dev_dbg(scomp->dev, "widget %s cpc: %d is_pages: %d\n",
 		swidget->widget->name, module_base_cfg->cpc, module_base_cfg->is_pages);
 
-	in_format = kcalloc(available_fmt->audio_fmt_num, sizeof(*in_format), GFP_KERNEL);
-	if (!in_format)
-		return -ENOMEM;
+	if (available_fmt->num_input_formats) {
+		in_format = kcalloc(available_fmt->num_input_formats,
+				    sizeof(*in_format), GFP_KERNEL);
+		if (!in_format)
+			return -ENOMEM;
+		available_fmt->input_pin_fmts = in_format;
+
+		ret = sof_update_ipc_object(scomp, in_format,
+					    SOF_IN_AUDIO_FORMAT_TOKENS, swidget->tuples,
+					    swidget->num_tuples, sizeof(*in_format),
+					    available_fmt->num_input_formats);
+		if (ret) {
+			dev_err(scomp->dev, "parse input audio fmt tokens failed %d\n", ret);
+			goto err_in;
+		}
 
-	ret = sof_update_ipc_object(scomp, in_format,
-				    SOF_IN_AUDIO_FORMAT_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(*in_format),
-				    available_fmt->audio_fmt_num);
-	if (ret) {
-		dev_err(scomp->dev, "parse base_config audio_fmt tokens failed %d\n", ret);
-		goto err_in;
+		dev_dbg(scomp->dev, "Input audio formats for %s\n", swidget->widget->name);
+		sof_ipc4_dbg_audio_format(scomp->dev, in_format,
+					  available_fmt->num_input_formats);
 	}
 
-	dev_dbg(scomp->dev, "Get input audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(scomp->dev, available_fmt->input_pin_fmts,
-				  available_fmt->audio_fmt_num);
-
-	out_format = kcalloc(available_fmt->audio_fmt_num, sizeof(*out_format), GFP_KERNEL);
-	if (!out_format) {
-		ret = -ENOMEM;
-		goto err_in;
-	}
+	if (available_fmt->num_output_formats) {
+		out_format = kcalloc(available_fmt->num_output_formats, sizeof(*out_format),
+				     GFP_KERNEL);
+		if (!out_format) {
+			ret = -ENOMEM;
+			goto err_in;
+		}
 
-	ret = sof_update_ipc_object(scomp, out_format,
-				    SOF_OUT_AUDIO_FORMAT_TOKENS, swidget->tuples,
-				    swidget->num_tuples, sizeof(*out_format),
-				    available_fmt->audio_fmt_num);
+		ret = sof_update_ipc_object(scomp, out_format,
+					    SOF_OUT_AUDIO_FORMAT_TOKENS, swidget->tuples,
+					    swidget->num_tuples, sizeof(*out_format),
+					    available_fmt->num_output_formats);
+		if (ret) {
+			dev_err(scomp->dev, "parse output audio fmt tokens failed\n");
+			goto err_out;
+		}
 
-	if (ret) {
-		dev_err(scomp->dev, "parse output audio_fmt tokens failed\n");
-		goto err_out;
+		available_fmt->output_pin_fmts = out_format;
+		dev_dbg(scomp->dev, "Output audio formats for %s\n", swidget->widget->name);
+		sof_ipc4_dbg_audio_format(scomp->dev, out_format,
+					  available_fmt->num_output_formats);
 	}
 
-	dev_dbg(scomp->dev, "Get output audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(scomp->dev, out_format, available_fmt->audio_fmt_num);
-
-	available_fmt->output_pin_fmts = out_format;
-	available_fmt->input_pin_fmts = in_format;
-
 	return 0;
 
 err_out:
 	kfree(out_format);
 err_in:
 	kfree(in_format);
+	available_fmt->input_pin_fmts = NULL;
 	return ret;
 }
 
@@ -830,7 +843,7 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 				   struct sof_ipc4_base_module_cfg *base_config,
 				   struct snd_pcm_hw_params *params,
 				   struct sof_ipc4_available_audio_format *available_fmt,
-				   struct sof_ipc4_pin_format *pin_fmts)
+				   struct sof_ipc4_pin_format *pin_fmts, u32 pin_fmts_size)
 {
 	u32 valid_bits;
 	u32 channels;
@@ -858,7 +871,7 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 		return -EINVAL;
 	}
 
-	if (!available_fmt->audio_fmt_num) {
+	if (!pin_fmts_size) {
 		dev_err(sdev->dev, "no formats available for %s\n", swidget->widget->name);
 		return -EINVAL;
 	}
@@ -867,7 +880,7 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 	 * Search supported audio formats to match rate, channels ,and
 	 * sample_valid_bytes from runtime params
 	 */
-	for (i = 0; i < available_fmt->audio_fmt_num; i++) {
+	for (i = 0; i < pin_fmts_size; i++) {
 		struct sof_ipc4_audio_format *fmt = &pin_fmts[i].audio_fmt;
 
 		rate = fmt->sampling_frequency;
@@ -881,22 +894,26 @@ static int sof_ipc4_init_audio_fmt(struct snd_sof_dev *sdev,
 		}
 	}
 
-	if (i == available_fmt->audio_fmt_num) {
+	if (i == pin_fmts_size) {
 		dev_err(sdev->dev, "%s: Unsupported audio format: %uHz, %ubit, %u channels\n",
 			__func__, params_rate(params), sample_valid_bits, params_channels(params));
 		return -EINVAL;
 	}
 
 	/* copy input format */
-	memcpy(&base_config->audio_fmt, &available_fmt->input_pin_fmts[i].audio_fmt,
-	       sizeof(struct sof_ipc4_audio_format));
+	if (available_fmt->num_input_formats && i < available_fmt->num_input_formats) {
+		memcpy(&base_config->audio_fmt, &available_fmt->input_pin_fmts[i].audio_fmt,
+		       sizeof(struct sof_ipc4_audio_format));
+
+		/* set base_cfg ibs/obs */
+		base_config->ibs = available_fmt->input_pin_fmts[i].buffer_size;
 
-	/* set base_cfg ibs/obs */
-	base_config->ibs = available_fmt->input_pin_fmts[i].buffer_size;
-	base_config->obs = available_fmt->output_pin_fmts[i].buffer_size;
+		dev_dbg(sdev->dev, "Init input audio formats for %s\n", swidget->widget->name);
+		sof_ipc4_dbg_audio_format(sdev->dev, &available_fmt->input_pin_fmts[i], 1);
+	}
 
-	dev_dbg(sdev->dev, "Init input audio formats for %s\n", swidget->widget->name);
-	sof_ipc4_dbg_audio_format(sdev->dev, &available_fmt->input_pin_fmts[i], 1);
+	if (available_fmt->num_output_formats && i < available_fmt->num_output_formats)
+		base_config->obs = available_fmt->output_pin_fmts[i].buffer_size;
 
 	/* Return the index of the matched format */
 	return i;
@@ -1070,6 +1087,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 	u32 **data;
 	int ipc_size, ret;
 	u32 deep_buffer_dma_ms = 0;
+	u32 format_list_count;
 
 	dev_dbg(sdev->dev, "copier %s, type %d", swidget->widget->name, swidget->id);
 
@@ -1102,10 +1120,13 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		 * Use the input_pin_fmts to match pcm params for playback and the output_pin_fmts
 		 * for capture.
 		 */
-		if (dir == SNDRV_PCM_STREAM_PLAYBACK)
+		if (dir == SNDRV_PCM_STREAM_PLAYBACK) {
 			format_list_to_search = available_fmt->input_pin_fmts;
-		else
+			format_list_count = available_fmt->num_input_formats;
+		} else {
 			format_list_to_search = available_fmt->output_pin_fmts;
+			format_list_count = available_fmt->num_output_formats;
+		}
 
 		copier_data->gtw_cfg.node_id &= ~SOF_IPC4_NODE_INDEX_MASK;
 		copier_data->gtw_cfg.node_id |=
@@ -1126,6 +1147,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 		available_fmt = &ipc4_copier->available_fmt;
 		if (dir == SNDRV_PCM_STREAM_CAPTURE) {
 			format_list_to_search = available_fmt->output_pin_fmts;
+			format_list_count = available_fmt->num_output_formats;
 
 			/*
 			 * modify the input params for the dai copier as it only supports
@@ -1136,6 +1158,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 			snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S32_LE);
 		} else {
 			format_list_to_search = available_fmt->input_pin_fmts;
+			format_list_count = available_fmt->num_input_formats;
 		}
 
 		ref_params = pipeline_params;
@@ -1157,6 +1180,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 
 		/* Use the input formats to match pcm params */
 		format_list_to_search = available_fmt->input_pin_fmts;
+		format_list_count = available_fmt->num_input_formats;
 		ref_params = pipeline_params;
 
 		break;
@@ -1169,7 +1193,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 
 	/* set input and output audio formats */
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &copier_data->base_config, ref_params,
-				      available_fmt, format_list_to_search);
+				      available_fmt, format_list_to_search, format_list_count);
 	if (ret < 0)
 		return ret;
 
@@ -1356,7 +1380,8 @@ static int sof_ipc4_prepare_gain_module(struct snd_sof_widget *swidget,
 
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &gain->base_config,
 				      pipeline_params, available_fmt,
-				      available_fmt->input_pin_fmts);
+				      available_fmt->input_pin_fmts,
+				      available_fmt->num_input_formats);
 	if (ret < 0)
 		return ret;
 
@@ -1379,7 +1404,8 @@ static int sof_ipc4_prepare_mixer_module(struct snd_sof_widget *swidget,
 
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &mixer->base_config,
 				      pipeline_params, available_fmt,
-				      available_fmt->input_pin_fmts);
+				      available_fmt->input_pin_fmts,
+				      available_fmt->num_input_formats);
 	if (ret < 0)
 		return ret;
 
@@ -1403,7 +1429,8 @@ static int sof_ipc4_prepare_src_module(struct snd_sof_widget *swidget,
 
 	ret = sof_ipc4_init_audio_fmt(sdev, swidget, &src->base_config,
 				      pipeline_params, available_fmt,
-				      available_fmt->input_pin_fmts);
+				      available_fmt->input_pin_fmts,
+				      available_fmt->num_input_formats);
 	if (ret < 0)
 		return ret;
 
diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h
index 6359ef8736ae..fad7a628f782 100644
--- a/sound/soc/sof/ipc4-topology.h
+++ b/sound/soc/sof/ipc4-topology.h
@@ -164,12 +164,14 @@ struct sof_ipc4_pin_format {
  * struct sof_ipc4_available_audio_format - Available audio formats
  * @output_pin_fmts: Available output pin formats
  * @input_pin_fmts: Available input pin formats
- * @audio_fmt_num: Number of available audio formats
+ * @num_input_formats: Number of input pin formats
+ * @num_output_formats: Number of output pin formats
  */
 struct sof_ipc4_available_audio_format {
 	struct sof_ipc4_pin_format *output_pin_fmts;
 	struct sof_ipc4_pin_format *input_pin_fmts;
-	int audio_fmt_num;
+	u32 num_input_formats;
+	u32 num_output_formats;
 };
 
 /**
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index 3a091f18731f..b642835e14df 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -1231,35 +1231,43 @@ static int sof_widget_parse_tokens(struct snd_soc_component *scomp, struct snd_s
 
 			continue;
 		case SOF_IN_AUDIO_FORMAT_TOKENS:
-		case SOF_OUT_AUDIO_FORMAT_TOKENS:
-			num_sets = sof_get_token_value(SOF_TKN_COMP_NUM_AUDIO_FORMATS,
+			num_sets = sof_get_token_value(SOF_TKN_COMP_NUM_INPUT_AUDIO_FORMATS,
 						       swidget->tuples, swidget->num_tuples);
-
 			if (num_sets < 0) {
-				dev_err(sdev->dev, "Invalid audio format count for %s\n",
+				dev_err(sdev->dev, "Invalid input audio format count for %s\n",
 					swidget->widget->name);
 				ret = num_sets;
 				goto err;
 			}
-
-			if (num_sets > 1) {
-				struct snd_sof_tuple *new_tuples;
-
-				num_tuples += token_list[object_token_list[i]].count * num_sets;
-				new_tuples = krealloc(swidget->tuples,
-						      sizeof(*new_tuples) * num_tuples, GFP_KERNEL);
-				if (!new_tuples) {
-					ret = -ENOMEM;
-					goto err;
-				}
-
-				swidget->tuples = new_tuples;
+			break;
+		case SOF_OUT_AUDIO_FORMAT_TOKENS:
+			num_sets = sof_get_token_value(SOF_TKN_COMP_NUM_OUTPUT_AUDIO_FORMATS,
+						       swidget->tuples, swidget->num_tuples);
+			if (num_sets < 0) {
+				dev_err(sdev->dev, "Invalid output audio format count for %s\n",
+					swidget->widget->name);
+				ret = num_sets;
+				goto err;
 			}
 			break;
 		default:
 			break;
 		}
 
+		if (num_sets > 1) {
+			struct snd_sof_tuple *new_tuples;
+
+			num_tuples += token_list[object_token_list[i]].count * num_sets;
+			new_tuples = krealloc(swidget->tuples,
+					      sizeof(*new_tuples) * num_tuples, GFP_KERNEL);
+			if (!new_tuples) {
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			swidget->tuples = new_tuples;
+		}
+
 		/* copy one set of tuples per token ID into swidget->tuples */
 		ret = sof_copy_tuples(sdev, private->array, le32_to_cpu(private->size),
 				      object_token_list[i], num_sets, swidget->tuples,
-- 
cgit v1.2.3


From cc4b15670340315fb0b25d886c06bffb5f128f02 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 13 Mar 2023 11:58:35 +0100
Subject: spi: Constify spi_get_ctldata()'s spi parameter

The "spi" parameter of spi_get_ctldata() can be const.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/8960e07adaad8d92d2c3aa045af9ee3c5d2130a8.1678704562.git.geert+renesas@glider.be
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index bdb35a91b4bf..6097d2f51266 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -244,7 +244,7 @@ static inline void spi_dev_put(struct spi_device *spi)
 }
 
 /* ctldata is for the bus_controller driver's runtime state */
-static inline void *spi_get_ctldata(struct spi_device *spi)
+static inline void *spi_get_ctldata(const struct spi_device *spi)
 {
 	return spi->controller_state;
 }
-- 
cgit v1.2.3


From 38dca04d659a422d842f7edcecd32253c7a6fb5e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 13 Mar 2023 11:58:36 +0100
Subject: spi: Constify spi_get_drvdata()'s spi parameter

The "spi" parameter of spi_get_drvdata() can be const.
dev_get_drvdata() has been taking a const pointer since commit
7d1d8999b4bec0ba ("i2c: Constify i2c_get_clientdata's parameter").

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/0f1700ade27a8f3935d04480ff7bef8a887331eb.1678704562.git.geert+renesas@glider.be
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 6097d2f51266..e09a61dd3459 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -261,7 +261,7 @@ static inline void spi_set_drvdata(struct spi_device *spi, void *data)
 	dev_set_drvdata(&spi->dev, data);
 }
 
-static inline void *spi_get_drvdata(struct spi_device *spi)
+static inline void *spi_get_drvdata(const struct spi_device *spi)
 {
 	return dev_get_drvdata(&spi->dev);
 }
-- 
cgit v1.2.3


From d2f19eec510424caa55ea949f016ddabe2d8173a Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 13 Mar 2023 11:58:37 +0100
Subject: spi: Constify spi parameters of chip select APIs

The "spi" parameters of spi_get_chipselect() and spi_get_csgpiod() can
be const.

Fixes: 303feb3cc06ac066 ("spi: Add APIs in spi core to set/get spi->chip_select and spi->cs_gpiod")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/b112de79e7a1e9095a3b6ff22b639f39e39d7748.1678704562.git.geert+renesas@glider.be
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index e09a61dd3459..74bff5a2f53d 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -266,7 +266,7 @@ static inline void *spi_get_drvdata(const struct spi_device *spi)
 	return dev_get_drvdata(&spi->dev);
 }
 
-static inline u8 spi_get_chipselect(struct spi_device *spi, u8 idx)
+static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx)
 {
 	return spi->chip_select;
 }
@@ -276,7 +276,7 @@ static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipsel
 	spi->chip_select = chipselect;
 }
 
-static inline struct gpio_desc *spi_get_csgpiod(struct spi_device *spi, u8 idx)
+static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx)
 {
 	return spi->cs_gpiod;
 }
-- 
cgit v1.2.3


From 22df776a9a866713d9decfb92b633bcfdb571954 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 15 Feb 2023 17:30:33 -0600
Subject: tasks: Extract rcu_users out of union

In commit 3fbd7ee285b2b ("tasks: Add a count of task RCU users"), a
count on the number of RCU users was added to struct task_struct. This
was done so as to enable the removal of task_rcu_dereference(), and
allow tasks to be protected by RCU even after exiting and being removed
from the runqueue. In this commit, the 'refcount_t rcu_users' field that
keeps track of this refcount was put into a union co-located with
'struct rcu_head rcu', so as to avoid taking up any extra space in
task_struct. This was possible to do safely, because the field was only
ever decremented by a static set of specific callers, and then never
incremented again.

While this restriction of there only being a small, static set of users
of this field has worked fine, it prevents us from leveraging the field
to use RCU to protect tasks in other contexts.

During tracing, for example, it would be useful to be able to collect
some tasks that performed a certain operation, put them in a map, and
then periodically summarize who they are, which cgroup they're in, how
much CPU time they've utilized, etc. While this can currently be done
with 'usage', it becomes tricky when a task is already in a map, or if a
reference should only be taken if a task is valid and will not soon be
reaped. Ideally, we could do something like pass a reference to a map
value, and then try to acquire a reference to the task in an RCU read
region by using refcount_inc_not_zero().

Similarly, in sched_ext, schedulers are using integer pids to remember
tasks, and then looking them up with find_task_by_pid_ns(). This is
slow, error prone, and adds complexity. It would be more convenient and
performant if BPF schedulers could instead store tasks directly in maps,
and then leverage RCU to ensure they can be safely accessed with low
overhead.

Finally, overloading fields like this is error prone. Someone that wants
to use 'rcu_users' could easily overlook the fact that once the rcu
callback is scheduled, the refcount will go back to being nonzero, thus
precluding the use of refcount_inc_not_zero(). Furthermore, as described
below, it's possible to extract the fields of the union without changing
the size of task_struct.

There are several possible ways to enable this:

1. The lightest touch approach is likely the one proposed in this patch,
   which is to simply extract 'rcu_users' and 'rcu' from the union, so
   that scheduling the 'rcu' callback doesn't overwrite the 'rcu_users'
   refcount. If we have a trusted task pointer, this would allow us to
   use refcnt_inc_not_zero() inside of an RCU region to determine if we
   can safely acquire a reference to the task and store it in a map. As
   mentioned below, this can be done without changing the size of
   task_struct, by moving the location of the union to another location
   that has padding gaps we can fill in.

2. Removing 'refcount_t rcu_users', and instead having the entire task
   be freed in an rcu callback. This is likely the most sound overall
   design, though it changes the behavioral semantics exposed to
   callers, who currently expect that a task that's successfully looked
   up in e.g. the pid_list with find_task_by_pid_ns(), can always have a
   'usage' reference acquired on them, as it's guaranteed to be >
   0 until after the next gp. In order for this approach to work, we'd
   have to audit all callers. This approach also slightly changes
   behavior observed by user space by not invoking
   trace_sched_process_free() until the whole task_struct is actually being
   freed, rather than just after it's exited. It also may change
   timings, as memory will be freed in an RCU callback rather than
   immediately when the final 'usage' refcount drops to 0. This also is
   arguably a benefit, as it provides more predictable performance to
   callers who are refcounting tasks.

3. There may be other solutions as well that don't require changing the
   layout of task_struct. For example, we could possibly do something
   complex from the BPF side, such as listen for task exit and remove a
   task from a map when the task is exiting. This would likely require
   significant custom handling for task_struct in the verifier, so a
   more generalizable solution is likely warranted.

As mentioned above, this patch proposes the lightest-touch approach
which allows callers elsewhere in the kernel to use 'rcu_users' to
ensure the lifetime of a task, by extracting 'rcu_users' and 'rcu' from
the union. There is no size change in task_struct with this patch.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: David Vernet <void@manifault.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20230215233033.889644-1-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/sched.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..b11b4517760f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,11 +1318,6 @@ struct task_struct {
 
 	struct tlbflush_unmap_batch	tlb_ubc;
 
-	union {
-		refcount_t		rcu_users;
-		struct rcu_head		rcu;
-	};
-
 	/* Cache last used pipe for splice(): */
 	struct pipe_inode_info		*splice_pipe;
 
@@ -1459,6 +1454,8 @@ struct task_struct {
 	unsigned long			saved_state_change;
 # endif
 #endif
+	struct rcu_head			rcu;
+	refcount_t			rcu_users;
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
-- 
cgit v1.2.3


From a5b9c5c548bd2c6997581e6a8991cea264cfd657 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 8 Feb 2023 10:13:35 +0100
Subject: dt-bindings: clock: Add Qcom SM6125 GPUCC

Add device tree bindings for graphics clock controller for Qualcomm
Technology Inc's SM6125 SoCs.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230208091340.124641-6-konrad.dybcio@linaro.org
---
 .../bindings/clock/qcom,sm6125-gpucc.yaml          | 64 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,sm6125-gpucc.h      | 31 +++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sm6125-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml
new file mode 100644
index 000000000000..374a1844a159
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6125-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6125
+
+maintainers:
+  - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+  Qualcomm graphics clock control module provides clocks and power domains on
+  Qualcomm SoCs.
+
+  See also:: include/dt-bindings/clock/qcom,sm6125-gpucc.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,sm6125-gpucc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 main branch source
+
+  '#clock-cells':
+    const: 1
+
+  '#power-domain-cells':
+    const: 1
+
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - '#clock-cells'
+  - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sm6125.h>
+    #include <dt-bindings/clock/qcom,rpmcc.h>
+
+    soc {
+        #address-cells = <1>;
+        #size-cells = <1>;
+
+        clock-controller@5990000 {
+            compatible = "qcom,sm6125-gpucc";
+            reg = <0x05990000 0x9000>;
+            clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_CLK_SRC>;
+            #clock-cells = <1>;
+            #power-domain-cells = <1>;
+        };
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sm6125-gpucc.h b/include/dt-bindings/clock/qcom,sm6125-gpucc.h
new file mode 100644
index 000000000000..ce5bd920f2c4
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm6125-gpucc.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_SM6125_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_SM6125_H
+
+/* Clocks */
+#define GPU_CC_PLL0_OUT_AUX2			0
+#define GPU_CC_PLL1_OUT_AUX2			1
+#define GPU_CC_CRC_AHB_CLK			2
+#define GPU_CC_CX_APB_CLK			3
+#define GPU_CC_CX_GFX3D_CLK			4
+#define GPU_CC_CX_GMU_CLK			5
+#define GPU_CC_CX_SNOC_DVM_CLK			6
+#define GPU_CC_CXO_AON_CLK			7
+#define GPU_CC_CXO_CLK				8
+#define GPU_CC_GMU_CLK_SRC			9
+#define GPU_CC_SLEEP_CLK			10
+#define GPU_CC_GX_GFX3D_CLK			11
+#define GPU_CC_GX_GFX3D_CLK_SRC			12
+#define GPU_CC_AHB_CLK				13
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK		14
+
+/* GDSCs */
+#define GPU_CX_GDSC				0
+#define GPU_GX_GDSC				1
+
+#endif
-- 
cgit v1.2.3


From 94329ce6e38fc420a7cc67b4174c7c8254e1493f Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 8 Feb 2023 10:13:37 +0100
Subject: dt-bindings: clock: Add Qcom SM6375 GPUCC

Add device tree bindings for graphics clock controller for Qualcomm
Technology Inc's SM6375 SoCs.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230208091340.124641-8-konrad.dybcio@linaro.org
---
 .../bindings/clock/qcom,sm6375-gpucc.yaml          | 60 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,sm6375-gpucc.h      | 36 +++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sm6375-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml
new file mode 100644
index 000000000000..b480ead5bd69
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6375-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6375
+
+maintainers:
+  - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+  Qualcomm graphics clock control module provides clocks, resets and power
+  domains on Qualcomm SoCs.
+
+  See also:: include/dt-bindings/clock/qcom,sm6375-gpucc.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,sm6375-gpucc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 main branch source
+      - description: GPLL0 div branch source
+      - description: SNoC DVM GFX source
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,sm6375-gcc.h>
+    #include <dt-bindings/clock/qcom,rpmcc.h>
+
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        clock-controller@5990000 {
+            compatible = "qcom,sm6375-gpucc";
+            reg = <0 0x05990000 0 0x9000>;
+            clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>,
+                     <&gcc GCC_GPU_SNOC_DVM_GFX_CLK>;
+            #clock-cells = <1>;
+            #reset-cells = <1>;
+            #power-domain-cells = <1>;
+        };
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sm6375-gpucc.h b/include/dt-bindings/clock/qcom,sm6375-gpucc.h
new file mode 100644
index 000000000000..0887ac03825e
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm6375-gpucc.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_BLAIR_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_BLAIR_H
+
+/* GPU CC clocks */
+#define GPU_CC_PLL0					0
+#define GPU_CC_PLL1					1
+#define GPU_CC_AHB_CLK					2
+#define GPU_CC_CX_GFX3D_CLK				3
+#define GPU_CC_CX_GFX3D_SLV_CLK				4
+#define GPU_CC_CX_GMU_CLK				5
+#define GPU_CC_CX_SNOC_DVM_CLK				6
+#define GPU_CC_CXO_AON_CLK				7
+#define GPU_CC_CXO_CLK					8
+#define GPU_CC_GMU_CLK_SRC				9
+#define GPU_CC_GX_CXO_CLK				10
+#define GPU_CC_GX_GFX3D_CLK				11
+#define GPU_CC_GX_GFX3D_CLK_SRC				12
+#define GPU_CC_GX_GMU_CLK				13
+#define GPU_CC_SLEEP_CLK				14
+
+/* GDSCs */
+#define GPU_CX_GDSC					0
+#define GPU_GX_GDSC					1
+
+/* Resets */
+#define GPU_GX_BCR					0
+#define GPU_ACD_BCR					1
+#define GPU_GX_ACD_MISC_BCR				2
+
+#endif
-- 
cgit v1.2.3


From c413f34e7aafbabdab16fd8014ac4c87038dfd08 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 8 Feb 2023 10:13:39 +0100
Subject: dt-bindings: clock: Add Qcom SM6115 GPUCC

Add device tree bindings for graphics clock controller for Qualcomm
Technology Inc's SM6115 SoCs.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230208091340.124641-10-konrad.dybcio@linaro.org
---
 .../bindings/clock/qcom,sm6115-gpucc.yaml          | 58 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,sm6115-gpucc.h      | 36 ++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sm6115-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml
new file mode 100644
index 000000000000..cf19f44af774
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6115-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6115
+
+maintainers:
+  - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+  Qualcomm graphics clock control module provides clocks, resets and power
+  domains on Qualcomm SoCs.
+
+  See also:: include/dt-bindings/clock/qcom,sm6115-gpucc.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,sm6115-gpucc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 main branch source
+      - description: GPLL0 main div source
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sm6115.h>
+    #include <dt-bindings/clock/qcom,rpmcc.h>
+
+    soc {
+        #address-cells = <1>;
+        #size-cells = <1>;
+
+        clock-controller@5990000 {
+            compatible = "qcom,sm6115-gpucc";
+            reg = <0x05990000 0x9000>;
+            clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>;
+            #clock-cells = <1>;
+            #reset-cells = <1>;
+            #power-domain-cells = <1>;
+        };
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sm6115-gpucc.h b/include/dt-bindings/clock/qcom,sm6115-gpucc.h
new file mode 100644
index 000000000000..945f21a7d745
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm6115-gpucc.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_SM6115_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_SM6115_H
+
+/* GPU_CC clocks */
+#define GPU_CC_PLL0			0
+#define GPU_CC_PLL0_OUT_AUX2		1
+#define GPU_CC_PLL1			2
+#define GPU_CC_PLL1_OUT_AUX		3
+#define GPU_CC_AHB_CLK			4
+#define GPU_CC_CRC_AHB_CLK		5
+#define GPU_CC_CX_GFX3D_CLK		6
+#define GPU_CC_CX_GMU_CLK		7
+#define GPU_CC_CX_SNOC_DVM_CLK		8
+#define GPU_CC_CXO_AON_CLK		9
+#define GPU_CC_CXO_CLK			10
+#define GPU_CC_GMU_CLK_SRC		11
+#define GPU_CC_GX_CXO_CLK		12
+#define GPU_CC_GX_GFX3D_CLK		13
+#define GPU_CC_GX_GFX3D_CLK_SRC		14
+#define GPU_CC_SLEEP_CLK		15
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK	16
+
+/* Resets */
+#define GPU_GX_BCR			0
+
+/* GDSCs */
+#define GPU_CX_GDSC			0
+#define GPU_GX_GDSC			1
+
+#endif
-- 
cgit v1.2.3


From be50da3e9d4ad1958f7b11322d44d94d5c25a4c1 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 9 Mar 2023 10:45:59 +0100
Subject: net: virtio_net: implement exact header length guest feature

Virtio spec introduced a feature VIRTIO_NET_F_GUEST_HDRLEN which when
set implicates that device benefits from knowing the exact size
of the header. For compatibility, to signal to the device that
the header is reliable driver also needs to set this feature.
Without this feature set by driver, device has to figure
out the header size itself.

Quoting the original virtio spec:
"hdr_len is a hint to the device as to how much of the header needs to
 be kept to copy into each packet"

"a hint" might not be clear for the reader what does it mean, if it is
"maybe like that" of "exactly like that". This feature just makes it
crystal clear and let the device count on the hdr_len being filled up
by the exact length of header.

Also note the spec already has following note about hdr_len:
"Due to various bugs in implementations, this field is not useful
 as a guarantee of the transport header size."

Without this feature the device needs to parse the header in core
data path handling. Accurate information helps the device to eliminate
such header parsing and directly use the hardware accelerators
for GSO operation.

virtio_net_hdr_from_skb() fills up hdr_len to skb_headlen(skb).
The driver already complies to fill the correct value. Introduce the
feature and advertise it.

Note that virtio spec also includes following note for device
implementation:
"Caution should be taken by the implementation so as to prevent
 a malicious driver from attacking the device by setting
 an incorrect hdr_len."

There is a plan to support this feature in our emulated device.
A device of SolidRun offers this feature bit. They claim this feature
will save the device a few cycles for every GSO packet.

Link: https://docs.oasis-open.org/virtio/virtio/v1.2/cs01/virtio-v1.2-cs01.html#x1-230006x3
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20230309094559.917857-1-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/virtio_net.c        | 6 ++++--
 include/uapi/linux/virtio_net.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fb5e68ed3ec2..e85b03988733 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -62,7 +62,8 @@ static const unsigned long guest_offloads[] = {
 	VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_GUEST_CSUM,
 	VIRTIO_NET_F_GUEST_USO4,
-	VIRTIO_NET_F_GUEST_USO6
+	VIRTIO_NET_F_GUEST_USO6,
+	VIRTIO_NET_F_GUEST_HDRLEN
 };
 
 #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
@@ -4213,7 +4214,8 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
 	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
-	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
+	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL, \
+	VIRTIO_NET_F_GUEST_HDRLEN
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index b4062bed186a..12c1c9699935 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -61,6 +61,7 @@
 #define VIRTIO_NET_F_GUEST_USO6	55	/* Guest can handle USOv6 in. */
 #define VIRTIO_NET_F_HOST_USO	56	/* Host can handle USO in. */
 #define VIRTIO_NET_F_HASH_REPORT  57	/* Supports hash report */
+#define VIRTIO_NET_F_GUEST_HDRLEN  59	/* Guest provides the exact hdr_len value. */
 #define VIRTIO_NET_F_RSS	  60	/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT	  61	/* extended coalescing info */
 #define VIRTIO_NET_F_STANDBY	  62	/* Act as standby for another device
-- 
cgit v1.2.3


From 26a4bf805c6cd88847dd045f6b29d8a20f02d8df Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 9 Feb 2023 10:57:52 +0100
Subject: dt-bindings: arm: qcom: add the SoC ID for SA8775P

Add the SoC ID entry for SA8775P.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Eric Chanudet <echanude@redhat.com>
Tested-by: Eric Chanudet <echanude@redhat.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230209095753.447347-3-brgl@bgdev.pl
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index aa95439708dc..c3c078a8c0da 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -229,6 +229,7 @@
 #define QCOM_ID_SC7180P			495
 #define QCOM_ID_SM6375			507
 #define QCOM_ID_SM8550			519
+#define QCOM_ID_SA8775P			534
 #define QCOM_ID_QRU1000			539
 #define QCOM_ID_QDU1000			545
 #define QCOM_ID_QDU1010			587
-- 
cgit v1.2.3


From bad27783c962acf837ebb3338843b1ed8272d200 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Mon, 13 Feb 2023 19:53:17 +0300
Subject: dt-bindings: clock: Add SM7150 GCC clocks

Add device tree bindings for global clock subsystem clock
controller for Qualcomm Technology Inc's SM7150 SoCs.

Co-developed-by: David Wronek <davidwronek@gmail.com>
Signed-off-by: David Wronek <davidwronek@gmail.com>
Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230213165318.127160-2-danila@jiaxyga.com
---
 .../devicetree/bindings/clock/qcom,sm7150-gcc.yaml |  52 ++++++
 include/dt-bindings/clock/qcom,sm7150-gcc.h        | 186 +++++++++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sm7150-gcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml
new file mode 100644
index 000000000000..0eb76d9d51c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm7150-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SM7150
+
+maintainers:
+  - Bjorn Andersson <andersson@kernel.org>
+  - Danila Tikhonov <danila@jiaxyga.com>
+  - David Wronek <davidwronek@gmail.com>
+
+description: |
+  Qualcomm global clock control module provides the clocks, resets and power
+  domains on SM7150
+
+  See also:: include/dt-bindings/clock/qcom,sm7150-gcc.h
+
+properties:
+  compatible:
+    const: qcom,sm7150-gcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Board XO Active-Only source
+      - description: Sleep clock source
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    clock-controller@100000 {
+      compatible = "qcom,sm7150-gcc";
+      reg = <0x00100000 0x001f0000>;
+      clocks = <&rpmhcc RPMH_CXO_CLK>,
+               <&rpmhcc RPMH_CXO_CLK_A>,
+               <&sleep_clk>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sm7150-gcc.h b/include/dt-bindings/clock/qcom,sm7150-gcc.h
new file mode 100644
index 000000000000..7719ffc86139
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm7150-gcc.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023, Danila Tikhonov <danila@jiaxyga.com>
+ * Copyright (c) 2023, David Wronek <davidwronek@gmail.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GCC_SM7150_H
+#define _DT_BINDINGS_CLK_QCOM_GCC_SM7150_H
+
+/* GCC clock registers */
+#define GCC_GPLL0_MAIN_DIV_CDIV				0
+#define GPLL0						1
+#define GPLL0_OUT_EVEN					2
+#define GPLL6						3
+#define GPLL7						4
+#define GCC_AGGRE_NOC_PCIE_TBU_CLK			5
+#define GCC_AGGRE_UFS_PHY_AXI_CLK			6
+#define GCC_AGGRE_UFS_PHY_AXI_HW_CTL_CLK		7
+#define GCC_AGGRE_USB3_PRIM_AXI_CLK			8
+#define GCC_APC_VS_CLK					9
+#define GCC_BOOT_ROM_AHB_CLK				10
+#define GCC_CAMERA_HF_AXI_CLK				11
+#define GCC_CAMERA_SF_AXI_CLK				12
+#define GCC_CE1_AHB_CLK					13
+#define GCC_CE1_AXI_CLK					14
+#define GCC_CE1_CLK					15
+#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK			16
+#define GCC_CPUSS_AHB_CLK				17
+#define GCC_CPUSS_AHB_CLK_SRC				18
+#define GCC_CPUSS_RBCPR_CLK				19
+#define GCC_CPUSS_RBCPR_CLK_SRC				20
+#define GCC_DDRSS_GPU_AXI_CLK				21
+#define GCC_DISP_GPLL0_CLK_SRC				22
+#define GCC_DISP_GPLL0_DIV_CLK_SRC			23
+#define GCC_DISP_HF_AXI_CLK				24
+#define GCC_DISP_SF_AXI_CLK				25
+#define GCC_GP1_CLK					26
+#define GCC_GP1_CLK_SRC					27
+#define GCC_GP2_CLK					28
+#define GCC_GP2_CLK_SRC					29
+#define GCC_GP3_CLK					30
+#define GCC_GP3_CLK_SRC					31
+#define GCC_GPU_GPLL0_CLK_SRC				32
+#define GCC_GPU_GPLL0_DIV_CLK_SRC			33
+#define GCC_GPU_MEMNOC_GFX_CLK				34
+#define GCC_GPU_SNOC_DVM_GFX_CLK			35
+#define GCC_GPU_VS_CLK					36
+#define GCC_NPU_AXI_CLK					37
+#define GCC_NPU_CFG_AHB_CLK				38
+#define GCC_NPU_GPLL0_CLK_SRC				39
+#define GCC_NPU_GPLL0_DIV_CLK_SRC			40
+#define GCC_PCIE_0_AUX_CLK				41
+#define GCC_PCIE_0_AUX_CLK_SRC				42
+#define GCC_PCIE_0_CFG_AHB_CLK				43
+#define GCC_PCIE_0_CLKREF_CLK				44
+#define GCC_PCIE_0_MSTR_AXI_CLK				45
+#define GCC_PCIE_0_PIPE_CLK				46
+#define GCC_PCIE_0_SLV_AXI_CLK				47
+#define GCC_PCIE_0_SLV_Q2A_AXI_CLK			48
+#define GCC_PCIE_PHY_AUX_CLK				49
+#define GCC_PCIE_PHY_REFGEN_CLK				50
+#define GCC_PCIE_PHY_REFGEN_CLK_SRC			51
+#define GCC_PDM2_CLK					52
+#define GCC_PDM2_CLK_SRC				53
+#define GCC_PDM_AHB_CLK					54
+#define GCC_PDM_XO4_CLK					55
+#define GCC_PRNG_AHB_CLK				56
+#define GCC_QUPV3_WRAP0_CORE_2X_CLK			57
+#define GCC_QUPV3_WRAP0_CORE_CLK			58
+#define GCC_QUPV3_WRAP0_S0_CLK				59
+#define GCC_QUPV3_WRAP0_S0_CLK_SRC			60
+#define GCC_QUPV3_WRAP0_S1_CLK				61
+#define GCC_QUPV3_WRAP0_S1_CLK_SRC			62
+#define GCC_QUPV3_WRAP0_S2_CLK				63
+#define GCC_QUPV3_WRAP0_S2_CLK_SRC			64
+#define GCC_QUPV3_WRAP0_S3_CLK				65
+#define GCC_QUPV3_WRAP0_S3_CLK_SRC			66
+#define GCC_QUPV3_WRAP0_S4_CLK				67
+#define GCC_QUPV3_WRAP0_S4_CLK_SRC			68
+#define GCC_QUPV3_WRAP0_S5_CLK				69
+#define GCC_QUPV3_WRAP0_S5_CLK_SRC			70
+#define GCC_QUPV3_WRAP0_S6_CLK				71
+#define GCC_QUPV3_WRAP0_S6_CLK_SRC			72
+#define GCC_QUPV3_WRAP0_S7_CLK				73
+#define GCC_QUPV3_WRAP0_S7_CLK_SRC			74
+#define GCC_QUPV3_WRAP1_CORE_2X_CLK			75
+#define GCC_QUPV3_WRAP1_CORE_CLK			76
+#define GCC_QUPV3_WRAP1_S0_CLK				77
+#define GCC_QUPV3_WRAP1_S0_CLK_SRC			78
+#define GCC_QUPV3_WRAP1_S1_CLK				79
+#define GCC_QUPV3_WRAP1_S1_CLK_SRC			80
+#define GCC_QUPV3_WRAP1_S2_CLK				81
+#define GCC_QUPV3_WRAP1_S2_CLK_SRC			82
+#define GCC_QUPV3_WRAP1_S3_CLK				83
+#define GCC_QUPV3_WRAP1_S3_CLK_SRC			84
+#define GCC_QUPV3_WRAP1_S4_CLK				85
+#define GCC_QUPV3_WRAP1_S4_CLK_SRC			86
+#define GCC_QUPV3_WRAP1_S5_CLK				87
+#define GCC_QUPV3_WRAP1_S5_CLK_SRC			88
+#define GCC_QUPV3_WRAP1_S6_CLK				89
+#define GCC_QUPV3_WRAP1_S6_CLK_SRC			90
+#define GCC_QUPV3_WRAP1_S7_CLK				91
+#define GCC_QUPV3_WRAP1_S7_CLK_SRC			92
+#define GCC_QUPV3_WRAP_0_M_AHB_CLK			93
+#define GCC_QUPV3_WRAP_0_S_AHB_CLK			94
+#define GCC_QUPV3_WRAP_1_M_AHB_CLK			95
+#define GCC_QUPV3_WRAP_1_S_AHB_CLK			96
+#define GCC_SDCC1_AHB_CLK				97
+#define GCC_SDCC1_APPS_CLK				98
+#define GCC_SDCC1_APPS_CLK_SRC				99
+#define GCC_SDCC1_ICE_CORE_CLK				100
+#define GCC_SDCC1_ICE_CORE_CLK_SRC			101
+#define GCC_SDCC2_AHB_CLK				102
+#define GCC_SDCC2_APPS_CLK				103
+#define GCC_SDCC2_APPS_CLK_SRC				104
+#define GCC_SDCC4_AHB_CLK				105
+#define GCC_SDCC4_APPS_CLK				106
+#define GCC_SDCC4_APPS_CLK_SRC				107
+#define GCC_SYS_NOC_CPUSS_AHB_CLK			108
+#define GCC_TSIF_AHB_CLK				109
+#define GCC_TSIF_INACTIVITY_TIMERS_CLK			110
+#define GCC_TSIF_REF_CLK				111
+#define GCC_TSIF_REF_CLK_SRC				112
+#define GCC_UFS_MEM_CLKREF_CLK				113
+#define GCC_UFS_PHY_AHB_CLK				114
+#define GCC_UFS_PHY_AXI_CLK				115
+#define GCC_UFS_PHY_AXI_CLK_SRC				116
+#define GCC_UFS_PHY_AXI_HW_CTL_CLK			117
+#define GCC_UFS_PHY_ICE_CORE_CLK			118
+#define GCC_UFS_PHY_ICE_CORE_CLK_SRC			119
+#define GCC_UFS_PHY_ICE_CORE_HW_CTL_CLK			120
+#define GCC_UFS_PHY_PHY_AUX_CLK				121
+#define GCC_UFS_PHY_PHY_AUX_CLK_SRC			122
+#define GCC_UFS_PHY_PHY_AUX_HW_CTL_CLK			123
+#define GCC_UFS_PHY_RX_SYMBOL_0_CLK			124
+#define GCC_UFS_PHY_TX_SYMBOL_0_CLK			125
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK			126
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC			127
+#define GCC_UFS_PHY_UNIPRO_CORE_HW_CTL_CLK		128
+#define GCC_USB30_PRIM_MASTER_CLK			129
+#define GCC_USB30_PRIM_MASTER_CLK_SRC			130
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK			131
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC		132
+#define GCC_USB30_PRIM_SLEEP_CLK			133
+#define GCC_USB3_PRIM_CLKREF_CLK			134
+#define GCC_USB3_PRIM_PHY_AUX_CLK			135
+#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC			136
+#define GCC_USB3_PRIM_PHY_COM_AUX_CLK			137
+#define GCC_USB3_PRIM_PHY_PIPE_CLK			138
+#define GCC_USB_PHY_CFG_AHB2PHY_CLK			139
+#define GCC_VDDA_VS_CLK					140
+#define GCC_VDDCX_VS_CLK				141
+#define GCC_VDDMX_VS_CLK				142
+#define GCC_VIDEO_AXI_CLK				143
+#define GCC_VS_CTRL_AHB_CLK				144
+#define GCC_VS_CTRL_CLK					145
+#define GCC_VS_CTRL_CLK_SRC				146
+#define GCC_VSENSOR_CLK_SRC				147
+
+/* GCC Resets */
+#define GCC_PCIE_0_BCR					0
+#define GCC_PCIE_PHY_BCR				1
+#define GCC_PCIE_PHY_COM_BCR				2
+#define GCC_UFS_PHY_BCR					3
+#define GCC_USB30_PRIM_BCR				4
+#define GCC_USB3_DP_PHY_PRIM_BCR			5
+#define GCC_USB3_DP_PHY_SEC_BCR				6
+#define GCC_USB3_PHY_PRIM_BCR				7
+#define GCC_USB3_PHY_SEC_BCR				8
+#define GCC_QUSB2PHY_PRIM_BCR				9
+#define GCC_VIDEO_AXI_CLK_BCR				10
+
+/* GCC GDSCRs */
+#define PCIE_0_GDSC					0
+#define UFS_PHY_GDSC					1
+#define USB30_PRIM_GDSC					2
+#define HLOS1_VOTE_AGGRE_NOC_MMU_AUDIO_TBU_GDSC		3
+#define HLOS1_VOTE_AGGRE_NOC_MMU_PCIE_TBU_GDSC		4
+#define HLOS1_VOTE_AGGRE_NOC_MMU_TBU1_GDSC		5
+#define HLOS1_VOTE_AGGRE_NOC_MMU_TBU2_GDSC		6
+#define HLOS1_VOTE_MMNOC_MMU_TBU_HF0_GDSC		7
+#define HLOS1_VOTE_MMNOC_MMU_TBU_HF1_GDSC		8
+#define HLOS1_VOTE_MMNOC_MMU_TBU_SF_GDSC		9
+
+#endif
-- 
cgit v1.2.3


From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Mar 2023 14:46:41 -0700
Subject: bpf: Disable migration when freeing stashed local kptr using obj drop

When a local kptr is stashed in a map and freed when the map goes away,
currently an error like the below appears:

[   39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875
[   39.196549] caller is bpf_mem_free+0x56/0xc0
[   39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G           O       6.2.0-13016-g22df776a9a86 #4477
[   39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[   39.198949] Workqueue: events_unbound bpf_map_free_deferred
[   39.199470] Call Trace:
[   39.199703]  <TASK>
[   39.199911]  dump_stack_lvl+0x60/0x70
[   39.200267]  check_preemption_disabled+0xbf/0xe0
[   39.200704]  bpf_mem_free+0x56/0xc0
[   39.201032]  ? bpf_obj_new_impl+0xa0/0xa0
[   39.201430]  bpf_obj_free_fields+0x1cd/0x200
[   39.201838]  array_map_free+0xad/0x220
[   39.202193]  ? finish_task_switch+0xe5/0x3c0
[   39.202614]  bpf_map_free_deferred+0xea/0x210
[   39.203006]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.203460]  process_one_work+0x64f/0xbe0
[   39.203822]  ? pwq_dec_nr_in_flight+0x110/0x110
[   39.204264]  ? do_raw_spin_lock+0x107/0x1c0
[   39.204662]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.205107]  worker_thread+0x74/0x7a0
[   39.205451]  ? process_one_work+0xbe0/0xbe0
[   39.205818]  kthread+0x171/0x1a0
[   39.206111]  ? kthread_complete_and_exit+0x20/0x20
[   39.206552]  ret_from_fork+0x1f/0x30
[   39.206886]  </TASK>

This happens because the call to __bpf_obj_drop_impl I added in the patch
adding support for stashing local kptrs doesn't disable migration. Prior
to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF
progarm, whereas now it can be called from map free path, so it's
necessary to explicitly disable migration.

Also, refactor a bit to just call __bpf_obj_drop_impl directly instead
of bothering w/ dtor union and setting pointer-to-obj_drop.

Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 12 ++++--------
 kernel/bpf/btf.c     |  4 +---
 kernel/bpf/syscall.c | 10 +++++++---
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 756b85f0d0d3..71cc92a4ba48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,18 +190,14 @@ enum btf_field_type {
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
-typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
 
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	union {
-		/* dtor used if btf_is_kernel(btf), otherwise the type
-		 * is program-allocated and obj_drop is used
-		 */
-		btf_dtor_kfunc_t dtor;
-		btf_dtor_obj_drop obj_drop;
-	};
+	/* dtor used if btf_is_kernel(btf), otherwise the type is
+	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
+	 */
+	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 66fad7a16b6c..b7e5a5510b91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,8 +3551,6 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
@@ -3578,7 +3576,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
 		 * kptr allocated via bpf_obj_new
 		 */
-		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		field->kptr.dtor = NULL;
 		id = info->kptr.type_id;
 		kptr_btf = (struct btf *)btf;
 		btf_get(kptr_btf);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0684febc447a..5b88301a2ae0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -679,9 +681,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
 				WARN_ON_ONCE(!pointee_struct_meta);
-				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
-								  pointee_struct_meta->record :
-								  NULL);
+				migrate_disable();
+				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+								 pointee_struct_meta->record :
+								 NULL);
+				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
-- 
cgit v1.2.3


From fe8aa1ba078366ba23fc676efab57183b12a3a81 Mon Sep 17 00:00:00 2001
From: Visweswara Tanuku <quic_vtanuku@quicinc.com>
Date: Tue, 14 Feb 2023 21:05:28 -0800
Subject: soc: qcom: geni-se: Update Tx and Rx fifo depth based on QUP HW
 version

From QUP HW Version 3.10 and above the Tx and Rx
fifo depth bits are increased to 23:16 bits from
21:16 bits in SE_HW_PARAM registers accomodating
256bytes of fifo depth.

Updated geni_se_get_tx_fifo_depth and
geni_se_get_rx_fifo_depth to retrieve right fifo
depth based on QUP HW version.

Signed-off-by: Visweswara Tanuku <quic_vtanuku@quicinc.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230215050528.9507-1-quic_vtanuku@quicinc.com
---
 include/linux/soc/qcom/geni-se.h | 42 ++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h
index 400213daa461..c55a0bc8cb0e 100644
--- a/include/linux/soc/qcom/geni-se.h
+++ b/include/linux/soc/qcom/geni-se.h
@@ -245,12 +245,22 @@ struct geni_se {
 /* SE_HW_PARAM_0 fields */
 #define TX_FIFO_WIDTH_MSK		GENMASK(29, 24)
 #define TX_FIFO_WIDTH_SHFT		24
+/*
+ * For QUP HW Version >= 3.10 Tx fifo depth support is increased
+ * to 256bytes and corresponding bits are 16 to 23
+ */
+#define TX_FIFO_DEPTH_MSK_256_BYTES	GENMASK(23, 16)
 #define TX_FIFO_DEPTH_MSK		GENMASK(21, 16)
 #define TX_FIFO_DEPTH_SHFT		16
 
 /* SE_HW_PARAM_1 fields */
 #define RX_FIFO_WIDTH_MSK		GENMASK(29, 24)
 #define RX_FIFO_WIDTH_SHFT		24
+/*
+ * For QUP HW Version >= 3.10 Rx fifo depth support is increased
+ * to 256bytes and corresponding bits are 16 to 23
+ */
+#define RX_FIFO_DEPTH_MSK_256_BYTES	GENMASK(23, 16)
 #define RX_FIFO_DEPTH_MSK		GENMASK(21, 16)
 #define RX_FIFO_DEPTH_SHFT		16
 
@@ -391,7 +401,8 @@ static inline void geni_se_abort_s_cmd(struct geni_se *se)
 
 /**
  * geni_se_get_tx_fifo_depth() - Get the TX fifo depth of the serial engine
- * @se:	Pointer to the concerned serial engine.
+ * based on QUP HW version
+ * @se: Pointer to the concerned serial engine.
  *
  * This function is used to get the depth i.e. number of elements in the
  * TX fifo of the serial engine.
@@ -400,11 +411,20 @@ static inline void geni_se_abort_s_cmd(struct geni_se *se)
  */
 static inline u32 geni_se_get_tx_fifo_depth(struct geni_se *se)
 {
-	u32 val;
+	u32 val, hw_version, hw_major, hw_minor, tx_fifo_depth_mask;
+
+	hw_version = geni_se_get_qup_hw_version(se);
+	hw_major = GENI_SE_VERSION_MAJOR(hw_version);
+	hw_minor = GENI_SE_VERSION_MINOR(hw_version);
+
+	if ((hw_major == 3 && hw_minor >= 10) || hw_major > 3)
+		tx_fifo_depth_mask = TX_FIFO_DEPTH_MSK_256_BYTES;
+	else
+		tx_fifo_depth_mask = TX_FIFO_DEPTH_MSK;
 
 	val = readl_relaxed(se->base + SE_HW_PARAM_0);
 
-	return (val & TX_FIFO_DEPTH_MSK) >> TX_FIFO_DEPTH_SHFT;
+	return (val & tx_fifo_depth_mask) >> TX_FIFO_DEPTH_SHFT;
 }
 
 /**
@@ -427,7 +447,8 @@ static inline u32 geni_se_get_tx_fifo_width(struct geni_se *se)
 
 /**
  * geni_se_get_rx_fifo_depth() - Get the RX fifo depth of the serial engine
- * @se:	Pointer to the concerned serial engine.
+ * based on QUP HW version
+ * @se: Pointer to the concerned serial engine.
  *
  * This function is used to get the depth i.e. number of elements in the
  * RX fifo of the serial engine.
@@ -436,11 +457,20 @@ static inline u32 geni_se_get_tx_fifo_width(struct geni_se *se)
  */
 static inline u32 geni_se_get_rx_fifo_depth(struct geni_se *se)
 {
-	u32 val;
+	u32 val, hw_version, hw_major, hw_minor, rx_fifo_depth_mask;
+
+	hw_version = geni_se_get_qup_hw_version(se);
+	hw_major = GENI_SE_VERSION_MAJOR(hw_version);
+	hw_minor = GENI_SE_VERSION_MINOR(hw_version);
+
+	if ((hw_major == 3 && hw_minor >= 10) || hw_major > 3)
+		rx_fifo_depth_mask = RX_FIFO_DEPTH_MSK_256_BYTES;
+	else
+		rx_fifo_depth_mask = RX_FIFO_DEPTH_MSK;
 
 	val = readl_relaxed(se->base + SE_HW_PARAM_1);
 
-	return (val & RX_FIFO_DEPTH_MSK) >> RX_FIFO_DEPTH_SHFT;
+	return (val & rx_fifo_depth_mask) >> RX_FIFO_DEPTH_SHFT;
 }
 
 void geni_se_init(struct geni_se *se, u32 rx_wm, u32 rx_rfr);
-- 
cgit v1.2.3


From 27d7fdf06fdb84455ff585b58c8034e2fab42583 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@google.com>
Date: Mon, 13 Mar 2023 14:56:27 -0600
Subject: bpf: use canonical ftrace path

The canonical location for the tracefs filesystem is at /sys/kernel/tracing.

But, from Documentation/trace/ftrace.rst:

  Before 4.1, all ftrace tracing control files were within the debugfs
  file system, which is typically located at /sys/kernel/debug/tracing.
  For backward compatibility, when mounting the debugfs file system,
  the tracefs file system will be automatically mounted at:

  /sys/kernel/debug/tracing

Many comments and samples in the bpf code still refer to this older
debugfs path, so let's update them to avoid confusion.  There are a few
spots where the bpf code explicitly checks both tracefs and debugfs
(tools/bpf/bpftool/tracelog.c and tools/lib/api/fs/fs.c) and I've left
those alone so that the tools can continue to work with both paths.

Signed-off-by: Ross Zwisler <zwisler@google.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230313205628.1058720-2-zwisler@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h           | 8 ++++----
 samples/bpf/cpustat_kern.c         | 4 ++--
 samples/bpf/hbm.c                  | 4 ++--
 samples/bpf/ibumad_kern.c          | 4 ++--
 samples/bpf/lwt_len_hist.sh        | 2 +-
 samples/bpf/offwaketime_kern.c     | 2 +-
 samples/bpf/task_fd_query_user.c   | 4 ++--
 samples/bpf/test_lwt_bpf.sh        | 2 +-
 samples/bpf/test_overhead_tp.bpf.c | 4 ++--
 tools/include/uapi/linux/bpf.h     | 8 ++++----
 10 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d8c534e05b0a..13129df937cd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1647,17 +1647,17 @@ union bpf_attr {
  * 	Description
  * 		This helper is a "printk()-like" facility for debugging. It
  * 		prints a message defined by format *fmt* (of size *fmt_size*)
- * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		to file *\/sys/kernel/tracing/trace* from TraceFS, if
  * 		available. It can take up to three additional **u64**
  * 		arguments (as an eBPF helpers, the total number of arguments is
  * 		limited to five).
  *
  * 		Each time the helper is called, it appends a line to the trace.
- * 		Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * 		open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 		Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * 		open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
  * 		The format of the trace is customizable, and the exact output
  * 		one will get depends on the options set in
- * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*\/sys/kernel/tracing/trace_options* (see also the
  * 		*README* file under the same directory). However, it usually
  * 		defaults to something like:
  *
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
index 5aefd19cdfa1..944f13fe164a 100644
--- a/samples/bpf/cpustat_kern.c
+++ b/samples/bpf/cpustat_kern.c
@@ -76,8 +76,8 @@ struct {
 
 /*
  * The trace events for cpu_idle and cpu_frequency are taken from:
- * /sys/kernel/debug/tracing/events/power/cpu_idle/format
- * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
+ * /sys/kernel/tracing/events/power/cpu_idle/format
+ * /sys/kernel/tracing/events/power/cpu_frequency/format
  *
  * These two events have same format, so define one common structure.
  */
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index 516fbac28b71..ff58ec43f56a 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -65,7 +65,7 @@ static void Usage(void);
 static void read_trace_pipe2(void);
 static void do_error(char *msg, bool errno_flag);
 
-#define DEBUGFS "/sys/kernel/debug/tracing/"
+#define TRACEFS "/sys/kernel/tracing/"
 
 static struct bpf_program *bpf_prog;
 static struct bpf_object *obj;
@@ -77,7 +77,7 @@ static void read_trace_pipe2(void)
 	FILE *outf;
 	char *outFname = "hbm_out.log";
 
-	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0);
 	if (trace_fd < 0) {
 		printf("Error opening trace_pipe\n");
 		return;
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
index 9b193231024a..f07474c72525 100644
--- a/samples/bpf/ibumad_kern.c
+++ b/samples/bpf/ibumad_kern.c
@@ -39,8 +39,8 @@ struct {
 /* Taken from the current format defined in
  * include/trace/events/ib_umad.h
  * and
- * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read/format
- * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_read/format
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_write/format
  */
 struct ib_umad_rw_args {
 	u64 pad;
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
index 7078bfcc4f4d..381b2c634784 100755
--- a/samples/bpf/lwt_len_hist.sh
+++ b/samples/bpf/lwt_len_hist.sh
@@ -5,7 +5,7 @@ NS1=lwt_ns1
 VETH0=tst_lwt1a
 VETH1=tst_lwt1b
 BPF_PROG=lwt_len_hist.bpf.o
-TRACE_ROOT=/sys/kernel/debug/tracing
+TRACE_ROOT=/sys/kernel/tracing
 
 function cleanup {
 	# To reset saved histogram, remove pinned map
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
index eb4d94742e6b..23f12b47e9e5 100644
--- a/samples/bpf/offwaketime_kern.c
+++ b/samples/bpf/offwaketime_kern.c
@@ -110,7 +110,7 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta)
 }
 
 #if 1
-/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+/* taken from /sys/kernel/tracing/events/sched/sched_switch/format */
 struct sched_switch_args {
 	unsigned long long pad;
 	char prev_comm[TASK_COMM_LEN];
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index a33d74bd3a4b..1e61f2180470 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -235,7 +235,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
 	struct bpf_link *link;
 	ssize_t bytes;
 
-	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+	snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events",
 		 event_type);
 	kfd = open(buf, O_WRONLY | O_TRUNC, 0);
 	CHECK_PERROR_RET(kfd < 0);
@@ -252,7 +252,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
 	close(kfd);
 	kfd = -1;
 
-	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+	snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s/id",
 		 event_type, event_alias);
 	efd = open(buf, O_RDONLY, 0);
 	CHECK_PERROR_RET(efd < 0);
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
index 2e9f5126963b..0bf2d0f6bf4b 100755
--- a/samples/bpf/test_lwt_bpf.sh
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -21,7 +21,7 @@ IP_LOCAL="192.168.99.1"
 
 PROG_SRC="test_lwt_bpf.c"
 BPF_PROG="test_lwt_bpf.o"
-TRACE_ROOT=/sys/kernel/debug/tracing
+TRACE_ROOT=/sys/kernel/tracing
 CONTEXT_INFO=$(cat ${TRACE_ROOT}/trace_options | grep context)
 
 function lookup_mac()
diff --git a/samples/bpf/test_overhead_tp.bpf.c b/samples/bpf/test_overhead_tp.bpf.c
index 67cab3881969..8b498328e961 100644
--- a/samples/bpf/test_overhead_tp.bpf.c
+++ b/samples/bpf/test_overhead_tp.bpf.c
@@ -7,7 +7,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 
-/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+/* from /sys/kernel/tracing/events/task/task_rename/format */
 struct task_rename {
 	__u64 pad;
 	__u32 pid;
@@ -21,7 +21,7 @@ int prog(struct task_rename *ctx)
 	return 0;
 }
 
-/* from /sys/kernel/debug/tracing/events/fib/fib_table_lookup/format */
+/* from /sys/kernel/tracing/events/fib/fib_table_lookup/format */
 struct fib_table_lookup {
 	__u64 pad;
 	__u32 tb_id;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d8c534e05b0a..13129df937cd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1647,17 +1647,17 @@ union bpf_attr {
  * 	Description
  * 		This helper is a "printk()-like" facility for debugging. It
  * 		prints a message defined by format *fmt* (of size *fmt_size*)
- * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		to file *\/sys/kernel/tracing/trace* from TraceFS, if
  * 		available. It can take up to three additional **u64**
  * 		arguments (as an eBPF helpers, the total number of arguments is
  * 		limited to five).
  *
  * 		Each time the helper is called, it appends a line to the trace.
- * 		Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * 		open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 		Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * 		open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
  * 		The format of the trace is customizable, and the exact output
  * 		one will get depends on the options set in
- * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*\/sys/kernel/tracing/trace_options* (see also the
  * 		*README* file under the same directory). However, it usually
  * 		defaults to something like:
  *
-- 
cgit v1.2.3


From ed0733eaa579c49dbfeaec14d4071a69a49fdde4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:09 +0800
Subject: crypto: algapi - Move stat reporting into algapi

The stats code resurrected the unions from the early days of
kernel crypto.  This patch starts the process of moving them
out to the individual type structures as we do for everything
else.

In particular, add a report_stat function to cra_type and call
that from the stats code if available.  This allows us to move
the actual code over one-by-one.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_stat.c | 6 ++++++
 include/crypto/algapi.h   | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 154884bf9275..2369814029fa 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -204,6 +204,12 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		goto out;
 	}
 
+	if (alg->cra_type && alg->cra_type->report_stat) {
+		if (alg->cra_type->report_stat(skb, alg))
+			goto nla_put_failure;
+		goto out;
+	}
+
 	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
 	case CRYPTO_ALG_TYPE_AEAD:
 		if (crypto_report_aead(skb, alg))
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index fede394ae2ab..dcc1fd4ef1b4 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -50,6 +50,9 @@ struct crypto_type {
 	void (*show)(struct seq_file *m, struct crypto_alg *alg);
 	int (*report)(struct sk_buff *skb, struct crypto_alg *alg);
 	void (*free)(struct crypto_instance *inst);
+#ifdef CONFIG_CRYPTO_STATS
+	int (*report_stat)(struct sk_buff *skb, struct crypto_alg *alg);
+#endif
 
 	unsigned int type;
 	unsigned int maskclear;
-- 
cgit v1.2.3


From 0df4adf8682a017e43579ac8c9ec1a31c538e940 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:11 +0800
Subject: crypto: aead - Count error stats differently

Move all stat code specific to aead into the aead code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/aead.c             | 86 ++++++++++++++++++++++++++++++++++++++++-------
 crypto/algapi.c           | 26 --------------
 crypto/crypto_user_stat.c | 21 ------------
 include/crypto/aead.h     | 22 ++++++++++++
 include/linux/crypto.h    | 24 -------------
 5 files changed, 95 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/crypto/aead.c b/crypto/aead.c
index 16991095270d..5ea65c433608 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -8,17 +8,27 @@
  */
 
 #include <crypto/internal/aead.h>
+#include <linux/cryptouser.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/cryptouser.h>
+#include <linux/string.h>
 #include <net/netlink.h>
 
 #include "internal.h"
 
+static inline struct crypto_istat_aead *aead_get_stat(struct aead_alg *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
 static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 			    unsigned int keylen)
 {
@@ -80,39 +90,62 @@ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 }
 EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);
 
+static inline int crypto_aead_errstat(struct crypto_istat_aead *istat, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&istat->err_cnt);
+
+	return err;
+}
+
 int crypto_aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct crypto_alg *alg = aead->base.__crt_alg;
-	unsigned int cryptlen = req->cryptlen;
+	struct aead_alg *alg = crypto_aead_alg(aead);
+	struct crypto_istat_aead *istat;
 	int ret;
 
-	crypto_stats_get(alg);
+	istat = aead_get_stat(alg);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		atomic64_inc(&istat->encrypt_cnt);
+		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
+	}
+
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
-		ret = crypto_aead_alg(aead)->encrypt(req);
-	crypto_stats_aead_encrypt(cryptlen, alg, ret);
-	return ret;
+		ret = alg->encrypt(req);
+
+	return crypto_aead_errstat(istat, ret);
 }
 EXPORT_SYMBOL_GPL(crypto_aead_encrypt);
 
 int crypto_aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct crypto_alg *alg = aead->base.__crt_alg;
-	unsigned int cryptlen = req->cryptlen;
+	struct aead_alg *alg = crypto_aead_alg(aead);
+	struct crypto_istat_aead *istat;
 	int ret;
 
-	crypto_stats_get(alg);
+	istat = aead_get_stat(alg);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		atomic64_inc(&istat->encrypt_cnt);
+		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
+	}
+
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else if (req->cryptlen < crypto_aead_authsize(aead))
 		ret = -EINVAL;
 	else
-		ret = crypto_aead_alg(aead)->decrypt(req);
-	crypto_stats_aead_decrypt(cryptlen, alg, ret);
-	return ret;
+		ret = alg->decrypt(req);
+
+	return crypto_aead_errstat(istat, ret);
 }
 EXPORT_SYMBOL_GPL(crypto_aead_decrypt);
 
@@ -188,6 +221,26 @@ static void crypto_aead_free_instance(struct crypto_instance *inst)
 	aead->free(aead);
 }
 
+static int __maybe_unused crypto_aead_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
+	struct crypto_istat_aead *istat = aead_get_stat(aead);
+	struct crypto_stat_aead raead;
+
+	memset(&raead, 0, sizeof(raead));
+
+	strscpy(raead.type, "aead", sizeof(raead.type));
+
+	raead.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
+	raead.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
+	raead.stat_decrypt_cnt = atomic64_read(&istat->decrypt_cnt);
+	raead.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
+	raead.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
+}
+
 static const struct crypto_type crypto_aead_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_aead_init_tfm,
@@ -196,6 +249,9 @@ static const struct crypto_type crypto_aead_type = {
 	.show = crypto_aead_show,
 #endif
 	.report = crypto_aead_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_aead_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_AEAD,
@@ -219,6 +275,7 @@ EXPORT_SYMBOL_GPL(crypto_alloc_aead);
 
 static int aead_prepare_alg(struct aead_alg *alg)
 {
+	struct crypto_istat_aead *istat = aead_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (max3(alg->maxauthsize, alg->ivsize, alg->chunksize) >
@@ -232,6 +289,9 @@ static int aead_prepare_alg(struct aead_alg *alg)
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AEAD;
 
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
+
 	return 0;
 }
 
diff --git a/crypto/algapi.c b/crypto/algapi.c
index d08f864f08be..f7f7c61d456a 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1051,32 +1051,6 @@ void crypto_stats_get(struct crypto_alg *alg)
 }
 EXPORT_SYMBOL_GPL(crypto_stats_get);
 
-void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
-			       int ret)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.aead.encrypt_cnt);
-		atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_aead_encrypt);
-
-void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
-			       int ret)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.aead.decrypt_cnt);
-		atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_aead_decrypt);
-
 void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 2369814029fa..50ec076507a1 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -28,23 +28,6 @@ struct crypto_dump_info {
 	u16 nlmsg_flags;
 };
 
-static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_aead raead;
-
-	memset(&raead, 0, sizeof(raead));
-
-	strscpy(raead.type, "aead", sizeof(raead.type));
-
-	raead.stat_encrypt_cnt = atomic64_read(&alg->stats.aead.encrypt_cnt);
-	raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen);
-	raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt);
-	raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen);
-	raead.stat_err_cnt = atomic64_read(&alg->stats.aead.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
-}
-
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_cipher rcipher;
@@ -211,10 +194,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 	}
 
 	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
-	case CRYPTO_ALG_TYPE_AEAD:
-		if (crypto_report_aead(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_SKCIPHER:
 		if (crypto_report_cipher(skb, alg))
 			goto nla_put_failure;
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 4a2b7e6e0c1f..35e45b854a6f 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -8,6 +8,7 @@
 #ifndef _CRYPTO_AEAD_H
 #define _CRYPTO_AEAD_H
 
+#include <linux/atomic.h>
 #include <linux/container_of.h>
 #include <linux/crypto.h>
 #include <linux/slab.h>
@@ -100,6 +101,22 @@ struct aead_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
+/*
+ * struct crypto_istat_aead - statistics for AEAD algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @err_cnt:		number of error for AEAD requests
+ */
+struct crypto_istat_aead {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t err_cnt;
+};
+
 /**
  * struct aead_alg - AEAD cipher definition
  * @maxauthsize: Set the maximum authentication tag size supported by the
@@ -118,6 +135,7 @@ struct aead_request {
  * @setkey: see struct skcipher_alg
  * @encrypt: see struct skcipher_alg
  * @decrypt: see struct skcipher_alg
+ * @stat: statistics for AEAD algorithm
  * @ivsize: see struct skcipher_alg
  * @chunksize: see struct skcipher_alg
  * @init: Initialize the cryptographic transformation object. This function
@@ -144,6 +162,10 @@ struct aead_alg {
 	int (*init)(struct crypto_aead *tfm);
 	void (*exit)(struct crypto_aead *tfm);
 
+#ifdef CONFIG_CRYPTO_STATS
+	struct crypto_istat_aead stat;
+#endif
+
 	unsigned int ivsize;
 	unsigned int maxauthsize;
 	unsigned int chunksize;
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index bb1d9b0e1647..9eb6fc8ab69c 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -276,22 +276,6 @@ struct compress_alg {
 };
 
 #ifdef CONFIG_CRYPTO_STATS
-/*
- * struct crypto_istat_aead - statistics for AEAD algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @err_cnt:		number of error for AEAD requests
- */
-struct crypto_istat_aead {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_akcipher - statistics for akcipher algorithm
  * @encrypt_cnt:	number of encrypt requests
@@ -463,7 +447,6 @@ struct crypto_istat_rng {
  * @cra_destroy: internally used
  *
  * @stats: union of all possible crypto_istat_xxx structures
- * @stats.aead:		statistics for AEAD algorithm
  * @stats.akcipher:	statistics for akcipher algorithm
  * @stats.cipher:	statistics for cipher algorithm
  * @stats.compress:	statistics for compress algorithm
@@ -505,7 +488,6 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		struct crypto_istat_aead aead;
 		struct crypto_istat_akcipher akcipher;
 		struct crypto_istat_cipher cipher;
 		struct crypto_istat_compress compress;
@@ -520,8 +502,6 @@ struct crypto_alg {
 #ifdef CONFIG_CRYPTO_STATS
 void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
-void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
-void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
 void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
 void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
 void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
@@ -542,10 +522,6 @@ static inline void crypto_stats_init(struct crypto_alg *alg)
 {}
 static inline void crypto_stats_get(struct crypto_alg *alg)
 {}
-static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
-{}
-static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
-{}
 static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
 {}
 static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
-- 
cgit v1.2.3


From 035d78a11c56828bb4923fa87eeb9ed2546d52bd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:13 +0800
Subject: crypto: akcipher - Count error stats differently

Move all stat code specific to akcipher into the akcipher code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/akcipher.c         |  40 +++++++++++++++---
 crypto/algapi.c           |  46 ---------------------
 crypto/crypto_user_stat.c |  24 -----------
 include/crypto/akcipher.h | 102 +++++++++++++++++++++++++++++++++-------------
 include/linux/crypto.h    |  34 ----------------
 5 files changed, 108 insertions(+), 138 deletions(-)

(limited to 'include')

diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index ab975a420e1e..61d7c8b2d76e 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -5,19 +5,16 @@
  * Copyright (c) 2015, Intel Corporation
  * Authors: Tadeusz Struk <tadeusz.struk@intel.com>
  */
+#include <crypto/internal/akcipher.h>
+#include <linux/cryptouser.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/crypto.h>
-#include <linux/compiler.h>
-#include <crypto/algapi.h>
-#include <linux/cryptouser.h>
 #include <net/netlink.h>
-#include <crypto/akcipher.h>
-#include <crypto/internal/akcipher.h>
+
 #include "internal.h"
 
 #ifdef CONFIG_NET
@@ -76,6 +73,30 @@ static void crypto_akcipher_free_instance(struct crypto_instance *inst)
 	akcipher->free(akcipher);
 }
 
+static int __maybe_unused crypto_akcipher_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct akcipher_alg *akcipher = __crypto_akcipher_alg(alg);
+	struct crypto_istat_akcipher *istat;
+	struct crypto_stat_akcipher rakcipher;
+
+	istat = akcipher_get_stat(akcipher);
+
+	memset(&rakcipher, 0, sizeof(rakcipher));
+
+	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+	rakcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
+	rakcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
+	rakcipher.stat_decrypt_cnt = atomic64_read(&istat->decrypt_cnt);
+	rakcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
+	rakcipher.stat_sign_cnt = atomic64_read(&istat->sign_cnt);
+	rakcipher.stat_verify_cnt = atomic64_read(&istat->verify_cnt);
+	rakcipher.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
+		       sizeof(rakcipher), &rakcipher);
+}
+
 static const struct crypto_type crypto_akcipher_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_akcipher_init_tfm,
@@ -84,6 +105,9 @@ static const struct crypto_type crypto_akcipher_type = {
 	.show = crypto_akcipher_show,
 #endif
 	.report = crypto_akcipher_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_akcipher_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_AKCIPHER,
@@ -108,11 +132,15 @@ EXPORT_SYMBOL_GPL(crypto_alloc_akcipher);
 
 static void akcipher_prepare_alg(struct akcipher_alg *alg)
 {
+	struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	base->cra_type = &crypto_akcipher_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AKCIPHER;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
 }
 
 static int akcipher_default_op(struct akcipher_request *req)
diff --git a/crypto/algapi.c b/crypto/algapi.c
index f7f7c61d456a..33dc82ffe20a 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1051,52 +1051,6 @@ void crypto_stats_get(struct crypto_alg *alg)
 }
 EXPORT_SYMBOL_GPL(crypto_stats_get);
 
-void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
-				   struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.akcipher.encrypt_cnt);
-		atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_akcipher_encrypt);
-
-void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
-				   struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.akcipher.decrypt_cnt);
-		atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
-
-void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.err_cnt);
-	else
-		atomic64_inc(&alg->stats.akcipher.sign_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
-
-void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.err_cnt);
-	else
-		atomic64_inc(&alg->stats.akcipher.verify_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
-
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 50ec076507a1..7a5a2591c95f 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -13,7 +13,6 @@
 #include <net/sock.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/internal/rng.h>
-#include <crypto/akcipher.h>
 #include <crypto/kpp.h>
 #include <crypto/internal/cryptouser.h>
 
@@ -77,25 +76,6 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
 
-static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_akcipher rakcipher;
-
-	memset(&rakcipher, 0, sizeof(rakcipher));
-
-	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	rakcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.akcipher.encrypt_cnt);
-	rakcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.akcipher.encrypt_tlen);
-	rakcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.akcipher.decrypt_cnt);
-	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen);
-	rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt);
-	rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt);
-	rakcipher.stat_err_cnt = atomic64_read(&alg->stats.akcipher.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
-		       sizeof(rakcipher), &rakcipher);
-}
-
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_kpp rkpp;
@@ -214,10 +194,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		if (crypto_report_acomp(skb, alg))
 			goto nla_put_failure;
 		break;
-	case CRYPTO_ALG_TYPE_AKCIPHER:
-		if (crypto_report_akcipher(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_KPP:
 		if (crypto_report_kpp(skb, alg))
 			goto nla_put_failure;
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index 734c213918bd..f35fd653e4e5 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -7,6 +7,8 @@
  */
 #ifndef _CRYPTO_AKCIPHER_H
 #define _CRYPTO_AKCIPHER_H
+
+#include <linux/atomic.h>
 #include <linux/crypto.h>
 
 /**
@@ -52,6 +54,26 @@ struct crypto_akcipher {
 	struct crypto_tfm base;
 };
 
+/*
+ * struct crypto_istat_akcipher - statistics for akcipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @verify_cnt:		number of verify operation
+ * @sign_cnt:		number of sign requests
+ * @err_cnt:		number of error for akcipher requests
+ */
+struct crypto_istat_akcipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t verify_cnt;
+	atomic64_t sign_cnt;
+	atomic64_t err_cnt;
+};
+
 /**
  * struct akcipher_alg - generic public key algorithm
  *
@@ -88,6 +110,7 @@ struct crypto_akcipher {
  * @exit:	Deinitialize the cryptographic transformation object. This is a
  *		counterpart to @init, used to remove various changes set in
  *		@init.
+ * @stat:	Statistics for akcipher algorithm
  *
  * @base:	Common crypto API algorithm data structure
  */
@@ -104,6 +127,10 @@ struct akcipher_alg {
 	int (*init)(struct crypto_akcipher *tfm);
 	void (*exit)(struct crypto_akcipher *tfm);
 
+#ifdef CONFIG_CRYPTO_STATS
+	struct crypto_istat_akcipher stat;
+#endif
+
 	struct crypto_alg base;
 };
 
@@ -275,6 +302,27 @@ static inline unsigned int crypto_akcipher_maxsize(struct crypto_akcipher *tfm)
 	return alg->max_size(tfm);
 }
 
+static inline struct crypto_istat_akcipher *akcipher_get_stat(
+	struct akcipher_alg *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_akcipher_errstat(struct akcipher_alg *alg, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&akcipher_get_stat(alg)->err_cnt);
+
+	return err;
+}
+
 /**
  * crypto_akcipher_encrypt() - Invoke public key encrypt operation
  *
@@ -289,14 +337,15 @@ static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	unsigned int src_len = req->src_len;
-	int ret;
-
-	crypto_stats_get(calg);
-	ret = alg->encrypt(req);
-	crypto_stats_akcipher_encrypt(src_len, ret, calg);
-	return ret;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
+
+		atomic64_inc(&istat->encrypt_cnt);
+		atomic64_add(req->src_len, &istat->encrypt_tlen);
+	}
+
+	return crypto_akcipher_errstat(alg, alg->encrypt(req));
 }
 
 /**
@@ -313,14 +362,15 @@ static inline int crypto_akcipher_decrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	unsigned int src_len = req->src_len;
-	int ret;
-
-	crypto_stats_get(calg);
-	ret = alg->decrypt(req);
-	crypto_stats_akcipher_decrypt(src_len, ret, calg);
-	return ret;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
+
+		atomic64_inc(&istat->decrypt_cnt);
+		atomic64_add(req->src_len, &istat->decrypt_tlen);
+	}
+
+	return crypto_akcipher_errstat(alg, alg->decrypt(req));
 }
 
 /**
@@ -337,13 +387,11 @@ static inline int crypto_akcipher_sign(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	int ret;
 
-	crypto_stats_get(calg);
-	ret = alg->sign(req);
-	crypto_stats_akcipher_sign(ret, calg);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&akcipher_get_stat(alg)->sign_cnt);
+
+	return crypto_akcipher_errstat(alg, alg->sign(req));
 }
 
 /**
@@ -364,13 +412,11 @@ static inline int crypto_akcipher_verify(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	int ret;
 
-	crypto_stats_get(calg);
-	ret = alg->verify(req);
-	crypto_stats_akcipher_verify(ret, calg);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&akcipher_get_stat(alg)->verify_cnt);
+
+	return crypto_akcipher_errstat(alg, alg->verify(req));
 }
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 9eb6fc8ab69c..778cc05f76a8 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -276,26 +276,6 @@ struct compress_alg {
 };
 
 #ifdef CONFIG_CRYPTO_STATS
-/*
- * struct crypto_istat_akcipher - statistics for akcipher algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @verify_cnt:		number of verify operation
- * @sign_cnt:		number of sign requests
- * @err_cnt:		number of error for akcipher requests
- */
-struct crypto_istat_akcipher {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t verify_cnt;
-	atomic64_t sign_cnt;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_cipher - statistics for cipher algorithm
  * @encrypt_cnt:	number of encrypt requests
@@ -447,7 +427,6 @@ struct crypto_istat_rng {
  * @cra_destroy: internally used
  *
  * @stats: union of all possible crypto_istat_xxx structures
- * @stats.akcipher:	statistics for akcipher algorithm
  * @stats.cipher:	statistics for cipher algorithm
  * @stats.compress:	statistics for compress algorithm
  * @stats.hash:		statistics for hash algorithm
@@ -488,7 +467,6 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		struct crypto_istat_akcipher akcipher;
 		struct crypto_istat_cipher cipher;
 		struct crypto_istat_compress compress;
 		struct crypto_istat_hash hash;
@@ -504,10 +482,6 @@ void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
 void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
 void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
-void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
-void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
-void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg);
-void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg);
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
 void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
@@ -526,14 +500,6 @@ static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struc
 {}
 static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
 {}
-static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
-{}
 static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {}
 static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
-- 
cgit v1.2.3


From 42808e5dc602c12ef3eb42cf96cb416b55205fa4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:15 +0800
Subject: crypto: hash - Count error stats differently

Move all stat code specific to hash into the hash code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c            |  81 ++++++++++++++++++--------------
 crypto/algapi.c           |  24 ----------
 crypto/crypto_user_stat.c |  38 ----------------
 crypto/hash.h             |  36 +++++++++++++++
 crypto/shash.c            | 114 +++++++++++++++++++++++++++++++++++++---------
 include/crypto/hash.h     |  84 ++++++++++++++++++++++++++--------
 include/linux/crypto.h    |  20 --------
 7 files changed, 240 insertions(+), 157 deletions(-)
 create mode 100644 crypto/hash.h

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index ff8c79d975c1..c886cec64c23 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -8,19 +8,18 @@
  * Copyright (c) 2008 Loc Ho <lho@amcc.com>
  */
 
-#include <crypto/internal/hash.h>
 #include <crypto/scatterwalk.h>
+#include <linux/cryptouser.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/cryptouser.h>
-#include <linux/compiler.h>
+#include <linux/string.h>
 #include <net/netlink.h>
 
-#include "internal.h"
+#include "hash.h"
 
 static const struct crypto_type crypto_ahash_type;
 
@@ -296,55 +295,60 @@ static int crypto_ahash_op(struct ahash_request *req,
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	unsigned long alignmask = crypto_ahash_alignmask(tfm);
+	int err;
 
 	if ((unsigned long)req->result & alignmask)
-		return ahash_op_unaligned(req, op, has_state);
+		err = ahash_op_unaligned(req, op, has_state);
+	else
+		err = op(req);
 
-	return op(req);
+	return crypto_hash_errstat(crypto_hash_alg_common(tfm), err);
 }
 
 int crypto_ahash_final(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int nbytes = req->nbytes;
-	int ret;
+	struct hash_alg_common *alg = crypto_hash_alg_common(tfm);
 
-	crypto_stats_get(alg);
-	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final, true);
-	crypto_stats_ahash_final(nbytes, ret, alg);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&hash_get_stat(alg)->hash_cnt);
+
+	return crypto_ahash_op(req, tfm->final, true);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_final);
 
 int crypto_ahash_finup(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int nbytes = req->nbytes;
-	int ret;
+	struct hash_alg_common *alg = crypto_hash_alg_common(tfm);
 
-	crypto_stats_get(alg);
-	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup, true);
-	crypto_stats_ahash_final(nbytes, ret, alg);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_hash *istat = hash_get_stat(alg);
+
+		atomic64_inc(&istat->hash_cnt);
+		atomic64_add(req->nbytes, &istat->hash_tlen);
+	}
+
+	return crypto_ahash_op(req, tfm->finup, true);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_finup);
 
 int crypto_ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int nbytes = req->nbytes;
-	int ret;
+	struct hash_alg_common *alg = crypto_hash_alg_common(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_hash *istat = hash_get_stat(alg);
+
+		atomic64_inc(&istat->hash_cnt);
+		atomic64_add(req->nbytes, &istat->hash_tlen);
+	}
 
-	crypto_stats_get(alg);
 	if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		ret = -ENOKEY;
-	else
-		ret = crypto_ahash_op(req, tfm->digest, false);
-	crypto_stats_ahash_final(nbytes, ret, alg);
-	return ret;
+		return crypto_hash_errstat(alg, -ENOKEY);
+
+	return crypto_ahash_op(req, tfm->digest, false);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
 
@@ -498,6 +502,12 @@ static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
 		   __crypto_hash_alg_common(alg)->digestsize);
 }
 
+static int __maybe_unused crypto_ahash_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	return crypto_hash_report_stat(skb, alg, "ahash");
+}
+
 static const struct crypto_type crypto_ahash_type = {
 	.extsize = crypto_ahash_extsize,
 	.init_tfm = crypto_ahash_init_tfm,
@@ -506,6 +516,9 @@ static const struct crypto_type crypto_ahash_type = {
 	.show = crypto_ahash_show,
 #endif
 	.report = crypto_ahash_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_ahash_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_AHASH_MASK,
 	.type = CRYPTO_ALG_TYPE_AHASH,
@@ -537,14 +550,16 @@ EXPORT_SYMBOL_GPL(crypto_has_ahash);
 static int ahash_prepare_alg(struct ahash_alg *alg)
 {
 	struct crypto_alg *base = &alg->halg.base;
+	int err;
 
-	if (alg->halg.digestsize > HASH_MAX_DIGESTSIZE ||
-	    alg->halg.statesize > HASH_MAX_STATESIZE ||
-	    alg->halg.statesize == 0)
+	if (alg->halg.statesize == 0)
 		return -EINVAL;
 
+	err = hash_prepare_alg(&alg->halg);
+	if (err)
+		return err;
+
 	base->cra_type = &crypto_ahash_type;
-	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AHASH;
 
 	return 0;
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 33dc82ffe20a..deabd2d42216 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1075,30 +1075,6 @@ void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
 }
 EXPORT_SYMBOL_GPL(crypto_stats_decompress);
 
-void crypto_stats_ahash_update(unsigned int nbytes, int ret,
-			       struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.hash.err_cnt);
-	else
-		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_ahash_update);
-
-void crypto_stats_ahash_final(unsigned int nbytes, int ret,
-			      struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.hash.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.hash.hash_cnt);
-		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
-
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 7a5a2591c95f..d65f10f71f11 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -92,36 +92,6 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
 
-static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_hash rhash;
-
-	memset(&rhash, 0, sizeof(rhash));
-
-	strscpy(rhash.type, "ahash", sizeof(rhash.type));
-
-	rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt);
-	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
-}
-
-static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_hash rhash;
-
-	memset(&rhash, 0, sizeof(rhash));
-
-	strscpy(rhash.type, "shash", sizeof(rhash.type));
-
-	rhash.stat_hash_cnt =  atomic64_read(&alg->stats.hash.hash_cnt);
-	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
-}
-
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_rng rrng;
@@ -198,14 +168,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		if (crypto_report_kpp(skb, alg))
 			goto nla_put_failure;
 		break;
-	case CRYPTO_ALG_TYPE_AHASH:
-		if (crypto_report_ahash(skb, alg))
-			goto nla_put_failure;
-		break;
-	case CRYPTO_ALG_TYPE_HASH:
-		if (crypto_report_shash(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_RNG:
 		if (crypto_report_rng(skb, alg))
 			goto nla_put_failure;
diff --git a/crypto/hash.h b/crypto/hash.h
new file mode 100644
index 000000000000..57b28a986d69
--- /dev/null
+++ b/crypto/hash.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#ifndef _LOCAL_CRYPTO_HASH_H
+#define _LOCAL_CRYPTO_HASH_H
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptouser.h>
+
+#include "internal.h"
+
+static inline int crypto_hash_report_stat(struct sk_buff *skb,
+					  struct crypto_alg *alg,
+					  const char *type)
+{
+	struct hash_alg_common *halg = __crypto_hash_alg_common(alg);
+	struct crypto_istat_hash *istat = hash_get_stat(halg);
+	struct crypto_stat_hash rhash;
+
+	memset(&rhash, 0, sizeof(rhash));
+
+	strscpy(rhash.type, type, sizeof(rhash.type));
+
+	rhash.stat_hash_cnt = atomic64_read(&istat->hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&istat->hash_tlen);
+	rhash.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
+}
+
+int hash_prepare_alg(struct hash_alg_common *alg);
+
+#endif	/* _LOCAL_CRYPTO_HASH_H */
diff --git a/crypto/shash.c b/crypto/shash.c
index 58b46f198449..1f3454736f6e 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -6,22 +6,31 @@
  */
 
 #include <crypto/scatterwalk.h>
-#include <crypto/internal/hash.h>
+#include <linux/cryptouser.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/cryptouser.h>
+#include <linux/string.h>
 #include <net/netlink.h>
-#include <linux/compiler.h>
 
-#include "internal.h"
+#include "hash.h"
 
 #define MAX_SHASH_ALIGNMASK 63
 
 static const struct crypto_type crypto_shash_type;
 
+static inline struct crypto_istat_hash *shash_get_stat(struct shash_alg *alg)
+{
+	return hash_get_stat(&alg->halg);
+}
+
+static inline int crypto_shash_errstat(struct shash_alg *alg, int err)
+{
+	return crypto_hash_errstat(&alg->halg, err);
+}
+
 int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
 		    unsigned int keylen)
 {
@@ -114,11 +123,17 @@ int crypto_shash_update(struct shash_desc *desc, const u8 *data,
 	struct crypto_shash *tfm = desc->tfm;
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	int err;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_add(len, &shash_get_stat(shash)->hash_tlen);
 
 	if ((unsigned long)data & alignmask)
-		return shash_update_unaligned(desc, data, len);
+		err = shash_update_unaligned(desc, data, len);
+	else
+		err = shash->update(desc, data, len);
 
-	return shash->update(desc, data, len);
+	return crypto_shash_errstat(shash, err);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_update);
 
@@ -155,19 +170,25 @@ int crypto_shash_final(struct shash_desc *desc, u8 *out)
 	struct crypto_shash *tfm = desc->tfm;
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	int err;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&shash_get_stat(shash)->hash_cnt);
 
 	if ((unsigned long)out & alignmask)
-		return shash_final_unaligned(desc, out);
+		err = shash_final_unaligned(desc, out);
+	else
+		err = shash->final(desc, out);
 
-	return shash->final(desc, out);
+	return crypto_shash_errstat(shash, err);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_final);
 
 static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data,
 				 unsigned int len, u8 *out)
 {
-	return crypto_shash_update(desc, data, len) ?:
-	       crypto_shash_final(desc, out);
+	return shash_update_unaligned(desc, data, len) ?:
+	       shash_final_unaligned(desc, out);
 }
 
 int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
@@ -176,11 +197,22 @@ int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
 	struct crypto_shash *tfm = desc->tfm;
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	int err;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_hash *istat = shash_get_stat(shash);
+
+		atomic64_inc(&istat->hash_cnt);
+		atomic64_add(len, &istat->hash_tlen);
+	}
 
 	if (((unsigned long)data | (unsigned long)out) & alignmask)
-		return shash_finup_unaligned(desc, data, len, out);
+		err = shash_finup_unaligned(desc, data, len, out);
+	else
+		err = shash->finup(desc, data, len, out);
+
 
-	return shash->finup(desc, data, len, out);
+	return crypto_shash_errstat(shash, err);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_finup);
 
@@ -188,7 +220,8 @@ static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data,
 				  unsigned int len, u8 *out)
 {
 	return crypto_shash_init(desc) ?:
-	       crypto_shash_finup(desc, data, len, out);
+	       shash_update_unaligned(desc, data, len) ?:
+	       shash_final_unaligned(desc, out);
 }
 
 int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
@@ -197,14 +230,23 @@ int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
 	struct crypto_shash *tfm = desc->tfm;
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	int err;
 
-	if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		return -ENOKEY;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_hash *istat = shash_get_stat(shash);
 
-	if (((unsigned long)data | (unsigned long)out) & alignmask)
-		return shash_digest_unaligned(desc, data, len, out);
+		atomic64_inc(&istat->hash_cnt);
+		atomic64_add(len, &istat->hash_tlen);
+	}
 
-	return shash->digest(desc, data, len, out);
+	if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
+		err = -ENOKEY;
+	else if (((unsigned long)data | (unsigned long)out) & alignmask)
+		err = shash_digest_unaligned(desc, data, len, out);
+	else
+		err = shash->digest(desc, data, len, out);
+
+	return crypto_shash_errstat(shash, err);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_digest);
 
@@ -481,6 +523,12 @@ static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "digestsize   : %u\n", salg->digestsize);
 }
 
+static int __maybe_unused crypto_shash_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	return crypto_hash_report_stat(skb, alg, "shash");
+}
+
 static const struct crypto_type crypto_shash_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_shash_init_tfm,
@@ -489,6 +537,9 @@ static const struct crypto_type crypto_shash_type = {
 	.show = crypto_shash_show,
 #endif
 	.report = crypto_shash_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_shash_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_SHASH,
@@ -517,23 +568,42 @@ int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_has_shash);
 
-static int shash_prepare_alg(struct shash_alg *alg)
+int hash_prepare_alg(struct hash_alg_common *alg)
 {
+	struct crypto_istat_hash *istat = hash_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->digestsize > HASH_MAX_DIGESTSIZE ||
-	    alg->descsize > HASH_MAX_DESCSIZE ||
 	    alg->statesize > HASH_MAX_STATESIZE)
 		return -EINVAL;
 
+	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
+
+	return 0;
+}
+
+static int shash_prepare_alg(struct shash_alg *alg)
+{
+	struct crypto_alg *base = &alg->halg.base;
+	int err;
+
+	if (alg->descsize > HASH_MAX_DESCSIZE)
+		return -EINVAL;
+
 	if (base->cra_alignmask > MAX_SHASH_ALIGNMASK)
 		return -EINVAL;
 
 	if ((alg->export && !alg->import) || (alg->import && !alg->export))
 		return -EINVAL;
 
+	err = hash_prepare_alg(&alg->halg);
+	if (err)
+		return err;
+
 	base->cra_type = &crypto_shash_type;
-	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;
 
 	if (!alg->finup)
@@ -543,7 +613,7 @@ static int shash_prepare_alg(struct shash_alg *alg)
 	if (!alg->export) {
 		alg->export = shash_default_export;
 		alg->import = shash_default_import;
-		alg->statesize = alg->descsize;
+		alg->halg.statesize = alg->descsize;
 	}
 	if (!alg->setkey)
 		alg->setkey = shash_no_setkey;
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index f5841992dc9b..2aa61e7679db 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -8,6 +8,7 @@
 #ifndef _CRYPTO_HASH_H
 #define _CRYPTO_HASH_H
 
+#include <linux/atomic.h>
 #include <linux/crypto.h>
 #include <linux/string.h>
 
@@ -22,8 +23,27 @@ struct crypto_ahash;
  * crypto_unregister_shash().
  */
 
+/*
+ * struct crypto_istat_hash - statistics for has algorithm
+ * @hash_cnt:		number of hash requests
+ * @hash_tlen:		total data size hashed
+ * @err_cnt:		number of error for hash requests
+ */
+struct crypto_istat_hash {
+	atomic64_t hash_cnt;
+	atomic64_t hash_tlen;
+	atomic64_t err_cnt;
+};
+
+#ifdef CONFIG_CRYPTO_STATS
+#define HASH_ALG_COMMON_STAT struct crypto_istat_hash stat;
+#else
+#define HASH_ALG_COMMON_STAT
+#endif
+
 /**
  * struct hash_alg_common - define properties of message digest
+ * @stat: Statistics for hash algorithm.
  * @digestsize: Size of the result of the transformation. A buffer of this size
  *	        must be available to the @final and @finup calls, so they can
  *	        store the resulting hash into it. For various predefined sizes,
@@ -39,12 +59,15 @@ struct crypto_ahash;
  *	  The hash_alg_common data structure now adds the hash-specific
  *	  information.
  */
-struct hash_alg_common {
-	unsigned int digestsize;
-	unsigned int statesize;
-
-	struct crypto_alg base;
-};
+#define HASH_ALG_COMMON {		\
+	HASH_ALG_COMMON_STAT		\
+					\
+	unsigned int digestsize;	\
+	unsigned int statesize;		\
+					\
+	struct crypto_alg base;		\
+}
+struct hash_alg_common HASH_ALG_COMMON;
 
 struct ahash_request {
 	struct crypto_async_request base;
@@ -193,7 +216,9 @@ struct shash_desc {
  * @descsize: Size of the operational state for the message digest. This state
  * 	      size is the memory size that needs to be allocated for
  *	      shash_desc.__ctx
+ * @stat: Statistics for hash algorithm.
  * @base: internally used
+ * @halg: see struct hash_alg_common
  */
 struct shash_alg {
 	int (*init)(struct shash_desc *desc);
@@ -213,13 +238,13 @@ struct shash_alg {
 
 	unsigned int descsize;
 
-	/* These fields must match hash_alg_common. */
-	unsigned int digestsize
-		__attribute__ ((aligned(__alignof__(struct hash_alg_common))));
-	unsigned int statesize;
-
-	struct crypto_alg base;
+	union {
+		struct HASH_ALG_COMMON;
+		struct hash_alg_common halg;
+	};
 };
+#undef HASH_ALG_COMMON
+#undef HASH_ALG_COMMON_STAT
 
 struct crypto_ahash {
 	int (*init)(struct ahash_request *req);
@@ -535,6 +560,27 @@ static inline int crypto_ahash_init(struct ahash_request *req)
 	return tfm->init(req);
 }
 
+static inline struct crypto_istat_hash *hash_get_stat(
+	struct hash_alg_common *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_hash_errstat(struct hash_alg_common *alg, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&hash_get_stat(alg)->err_cnt);
+
+	return err;
+}
+
 /**
  * crypto_ahash_update() - add data to message digest for processing
  * @req: ahash_request handle that was previously initialized with the
@@ -549,14 +595,12 @@ static inline int crypto_ahash_init(struct ahash_request *req)
 static inline int crypto_ahash_update(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int nbytes = req->nbytes;
-	int ret;
-
-	crypto_stats_get(alg);
-	ret = crypto_ahash_reqtfm(req)->update(req);
-	crypto_stats_ahash_update(nbytes, ret, alg);
-	return ret;
+	struct hash_alg_common *alg = crypto_hash_alg_common(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_add(req->nbytes, &hash_get_stat(alg)->hash_tlen);
+
+	return crypto_hash_errstat(alg, tfm->update(req));
 }
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 778cc05f76a8..caf759e4201c 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -308,18 +308,6 @@ struct crypto_istat_compress {
 	atomic64_t err_cnt;
 };
 
-/*
- * struct crypto_istat_hash - statistics for has algorithm
- * @hash_cnt:		number of hash requests
- * @hash_tlen:		total data size hashed
- * @err_cnt:		number of error for hash requests
- */
-struct crypto_istat_hash {
-	atomic64_t hash_cnt;
-	atomic64_t hash_tlen;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_kpp - statistics for KPP algorithm
  * @setsecret_cnt:		number of setsecrey operation
@@ -429,7 +417,6 @@ struct crypto_istat_rng {
  * @stats: union of all possible crypto_istat_xxx structures
  * @stats.cipher:	statistics for cipher algorithm
  * @stats.compress:	statistics for compress algorithm
- * @stats.hash:		statistics for hash algorithm
  * @stats.rng:		statistics for rng algorithm
  * @stats.kpp:		statistics for KPP algorithm
  *
@@ -469,7 +456,6 @@ struct crypto_alg {
 	union {
 		struct crypto_istat_cipher cipher;
 		struct crypto_istat_compress compress;
-		struct crypto_istat_hash hash;
 		struct crypto_istat_rng rng;
 		struct crypto_istat_kpp kpp;
 	} stats;
@@ -480,8 +466,6 @@ struct crypto_alg {
 #ifdef CONFIG_CRYPTO_STATS
 void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
-void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
-void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
 void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
@@ -496,10 +480,6 @@ static inline void crypto_stats_init(struct crypto_alg *alg)
 {}
 static inline void crypto_stats_get(struct crypto_alg *alg)
 {}
-static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
-{}
 static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {}
 static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
-- 
cgit v1.2.3


From 0a742389bcc00053d63b5271fefb00d3a338d512 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:17 +0800
Subject: crypto: acomp - Count error stats differently

Move all stat code specific to acomp into the acomp code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/acompress.c                  |  69 ++++++++++++++++---
 crypto/algapi.c                     |  24 -------
 crypto/compress.h                   |  26 ++++++++
 crypto/crypto_user_stat.c           |  29 --------
 crypto/scompress.c                  |  27 ++++----
 include/crypto/acompress.h          | 128 ++++++++++++++++++++++--------------
 include/crypto/internal/acompress.h |  43 ++++++++++--
 include/crypto/internal/scompress.h |  15 +++--
 include/linux/crypto.h              |  24 -------
 9 files changed, 229 insertions(+), 156 deletions(-)
 create mode 100644 crypto/compress.h

(limited to 'include')

diff --git a/crypto/acompress.c b/crypto/acompress.c
index c32c72048a1c..022839ab457a 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -6,23 +6,33 @@
  * Authors: Weigang Li <weigang.li@intel.com>
  *          Giovanni Cabiddu <giovanni.cabiddu@intel.com>
  */
+
+#include <crypto/internal/acompress.h>
+#include <linux/cryptouser.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/crypto.h>
-#include <crypto/algapi.h>
-#include <linux/cryptouser.h>
-#include <linux/compiler.h>
 #include <net/netlink.h>
-#include <crypto/internal/acompress.h>
-#include <crypto/internal/scompress.h>
-#include "internal.h"
+
+#include "compress.h"
+
+struct crypto_scomp;
 
 static const struct crypto_type crypto_acomp_type;
 
+static inline struct acomp_alg *__crypto_acomp_alg(struct crypto_alg *alg)
+{
+	return container_of(alg, struct acomp_alg, calg.base);
+}
+
+static inline struct acomp_alg *crypto_acomp_alg(struct crypto_acomp *tfm)
+{
+	return __crypto_acomp_alg(crypto_acomp_tfm(tfm)->__crt_alg);
+}
+
 #ifdef CONFIG_NET
 static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
@@ -89,6 +99,32 @@ static unsigned int crypto_acomp_extsize(struct crypto_alg *alg)
 	return extsize;
 }
 
+static inline int __crypto_acomp_report_stat(struct sk_buff *skb,
+					     struct crypto_alg *alg)
+{
+	struct comp_alg_common *calg = __crypto_comp_alg_common(alg);
+	struct crypto_istat_compress *istat = comp_get_stat(calg);
+	struct crypto_stat_compress racomp;
+
+	memset(&racomp, 0, sizeof(racomp));
+
+	strscpy(racomp.type, "acomp", sizeof(racomp.type));
+	racomp.stat_compress_cnt = atomic64_read(&istat->compress_cnt);
+	racomp.stat_compress_tlen = atomic64_read(&istat->compress_tlen);
+	racomp.stat_decompress_cnt =  atomic64_read(&istat->decompress_cnt);
+	racomp.stat_decompress_tlen = atomic64_read(&istat->decompress_tlen);
+	racomp.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
+}
+
+#ifdef CONFIG_CRYPTO_STATS
+int crypto_acomp_report_stat(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	return __crypto_acomp_report_stat(skb, alg);
+}
+#endif
+
 static const struct crypto_type crypto_acomp_type = {
 	.extsize = crypto_acomp_extsize,
 	.init_tfm = crypto_acomp_init_tfm,
@@ -96,6 +132,9 @@ static const struct crypto_type crypto_acomp_type = {
 	.show = crypto_acomp_show,
 #endif
 	.report = crypto_acomp_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_acomp_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_ACOMPRESS_MASK,
 	.type = CRYPTO_ALG_TYPE_ACOMPRESS,
@@ -147,12 +186,24 @@ void acomp_request_free(struct acomp_req *req)
 }
 EXPORT_SYMBOL_GPL(acomp_request_free);
 
-int crypto_register_acomp(struct acomp_alg *alg)
+void comp_prepare_alg(struct comp_alg_common *alg)
 {
+	struct crypto_istat_compress *istat = comp_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
-	base->cra_type = &crypto_acomp_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
+}
+
+int crypto_register_acomp(struct acomp_alg *alg)
+{
+	struct crypto_alg *base = &alg->calg.base;
+
+	comp_prepare_alg(&alg->calg);
+
+	base->cra_type = &crypto_acomp_type;
 	base->cra_flags |= CRYPTO_ALG_TYPE_ACOMPRESS;
 
 	return crypto_register_alg(base);
diff --git a/crypto/algapi.c b/crypto/algapi.c
index deabd2d42216..fe48ce1957e1 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1051,30 +1051,6 @@ void crypto_stats_get(struct crypto_alg *alg)
 }
 EXPORT_SYMBOL_GPL(crypto_stats_get);
 
-void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.compress.compress_cnt);
-		atomic64_add(slen, &alg->stats.compress.compress_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_compress);
-
-void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.compress.decompress_cnt);
-		atomic64_add(slen, &alg->stats.compress.decompress_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_decompress);
-
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
diff --git a/crypto/compress.h b/crypto/compress.h
new file mode 100644
index 000000000000..19f65516d699
--- /dev/null
+++ b/crypto/compress.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * Copyright 2015 LG Electronics Inc.
+ * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#ifndef _LOCAL_CRYPTO_COMPRESS_H
+#define _LOCAL_CRYPTO_COMPRESS_H
+
+#include "internal.h"
+
+struct acomp_req;
+struct comp_alg_common;
+struct sk_buff;
+
+int crypto_init_scomp_ops_async(struct crypto_tfm *tfm);
+struct acomp_req *crypto_acomp_scomp_alloc_ctx(struct acomp_req *req);
+void crypto_acomp_scomp_free_ctx(struct acomp_req *req);
+
+int crypto_acomp_report_stat(struct sk_buff *skb, struct crypto_alg *alg);
+
+void comp_prepare_alg(struct comp_alg_common *alg);
+
+#endif	/* _LOCAL_CRYPTO_COMPRESS_H */
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index d65f10f71f11..ad616e19a3e1 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -51,31 +51,10 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&rcomp, 0, sizeof(rcomp));
 
 	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-	rcomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
-	rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
-	rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt);
-	rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	rcomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
-static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_compress racomp;
-
-	memset(&racomp, 0, sizeof(racomp));
-
-	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	racomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
-	racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
-	racomp.stat_decompress_cnt =  atomic64_read(&alg->stats.compress.decompress_cnt);
-	racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	racomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
-}
-
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_kpp rkpp;
@@ -156,14 +135,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		if (crypto_report_comp(skb, alg))
 			goto nla_put_failure;
 		break;
-	case CRYPTO_ALG_TYPE_ACOMPRESS:
-		if (crypto_report_acomp(skb, alg))
-			goto nla_put_failure;
-		break;
-	case CRYPTO_ALG_TYPE_SCOMPRESS:
-		if (crypto_report_acomp(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_KPP:
 		if (crypto_report_kpp(skb, alg))
 			goto nla_put_failure;
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 738f4f8f0f41..214283f7730a 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -6,23 +6,22 @@
  * Copyright (c) 2016, Intel Corporation
  * Author: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
  */
-#include <linux/errno.h>
+
+#include <crypto/internal/acompress.h>
+#include <crypto/internal/scompress.h>
+#include <crypto/scatterwalk.h>
+#include <linux/cryptouser.h>
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/scatterlist.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/crypto.h>
-#include <linux/compiler.h>
 #include <linux/vmalloc.h>
-#include <crypto/algapi.h>
-#include <linux/cryptouser.h>
 #include <net/netlink.h>
-#include <linux/scatterlist.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/internal/acompress.h>
-#include <crypto/internal/scompress.h>
-#include "internal.h"
+
+#include "compress.h"
 
 struct scomp_scratch {
 	spinlock_t	lock;
@@ -248,6 +247,9 @@ static const struct crypto_type crypto_scomp_type = {
 	.show = crypto_scomp_show,
 #endif
 	.report = crypto_scomp_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_acomp_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_SCOMPRESS,
@@ -256,10 +258,11 @@ static const struct crypto_type crypto_scomp_type = {
 
 int crypto_register_scomp(struct scomp_alg *alg)
 {
-	struct crypto_alg *base = &alg->base;
+	struct crypto_alg *base = &alg->calg.base;
+
+	comp_prepare_alg(&alg->calg);
 
 	base->cra_type = &crypto_scomp_type;
-	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_SCOMPRESS;
 
 	return crypto_register_alg(base);
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index c14cfc9a3b79..574cffc90730 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -8,6 +8,9 @@
  */
 #ifndef _CRYPTO_ACOMP_H
 #define _CRYPTO_ACOMP_H
+
+#include <linux/atomic.h>
+#include <linux/container_of.h>
 #include <linux/crypto.h>
 
 #define CRYPTO_ACOMP_ALLOC_OUTPUT	0x00000001
@@ -53,37 +56,35 @@ struct crypto_acomp {
 	struct crypto_tfm base;
 };
 
-/**
- * struct acomp_alg - asynchronous compression algorithm
- *
- * @compress:	Function performs a compress operation
- * @decompress:	Function performs a de-compress operation
- * @dst_free:	Frees destination buffer if allocated inside the algorithm
- * @init:	Initialize the cryptographic transformation object.
- *		This function is used to initialize the cryptographic
- *		transformation object. This function is called only once at
- *		the instantiation time, right after the transformation context
- *		was allocated. In case the cryptographic hardware has some
- *		special requirements which need to be handled by software, this
- *		function shall check for the precise requirement of the
- *		transformation and put any software fallbacks in place.
- * @exit:	Deinitialize the cryptographic transformation object. This is a
- *		counterpart to @init, used to remove various changes set in
- *		@init.
- *
- * @reqsize:	Context size for (de)compression requests
- * @base:	Common crypto API algorithm data structure
+/*
+ * struct crypto_istat_compress - statistics for compress algorithm
+ * @compress_cnt:	number of compress requests
+ * @compress_tlen:	total data size handled by compress requests
+ * @decompress_cnt:	number of decompress requests
+ * @decompress_tlen:	total data size handled by decompress requests
+ * @err_cnt:		number of error for compress requests
  */
-struct acomp_alg {
-	int (*compress)(struct acomp_req *req);
-	int (*decompress)(struct acomp_req *req);
-	void (*dst_free)(struct scatterlist *dst);
-	int (*init)(struct crypto_acomp *tfm);
-	void (*exit)(struct crypto_acomp *tfm);
-	unsigned int reqsize;
-	struct crypto_alg base;
+struct crypto_istat_compress {
+	atomic64_t compress_cnt;
+	atomic64_t compress_tlen;
+	atomic64_t decompress_cnt;
+	atomic64_t decompress_tlen;
+	atomic64_t err_cnt;
 };
 
+#ifdef CONFIG_CRYPTO_STATS
+#define COMP_ALG_COMMON_STATS struct crypto_istat_compress stat;
+#else
+#define COMP_ALG_COMMON_STATS
+#endif
+
+#define COMP_ALG_COMMON {			\
+	COMP_ALG_COMMON_STATS			\
+						\
+	struct crypto_alg base;			\
+}
+struct comp_alg_common COMP_ALG_COMMON;
+
 /**
  * DOC: Asynchronous Compression API
  *
@@ -131,9 +132,10 @@ static inline struct crypto_tfm *crypto_acomp_tfm(struct crypto_acomp *tfm)
 	return &tfm->base;
 }
 
-static inline struct acomp_alg *__crypto_acomp_alg(struct crypto_alg *alg)
+static inline struct comp_alg_common *__crypto_comp_alg_common(
+	struct crypto_alg *alg)
 {
-	return container_of(alg, struct acomp_alg, base);
+	return container_of(alg, struct comp_alg_common, base);
 }
 
 static inline struct crypto_acomp *__crypto_acomp_tfm(struct crypto_tfm *tfm)
@@ -141,9 +143,10 @@ static inline struct crypto_acomp *__crypto_acomp_tfm(struct crypto_tfm *tfm)
 	return container_of(tfm, struct crypto_acomp, base);
 }
 
-static inline struct acomp_alg *crypto_acomp_alg(struct crypto_acomp *tfm)
+static inline struct comp_alg_common *crypto_comp_alg_common(
+	struct crypto_acomp *tfm)
 {
-	return __crypto_acomp_alg(crypto_acomp_tfm(tfm)->__crt_alg);
+	return __crypto_comp_alg_common(crypto_acomp_tfm(tfm)->__crt_alg);
 }
 
 static inline unsigned int crypto_acomp_reqsize(struct crypto_acomp *tfm)
@@ -252,6 +255,27 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 		req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
 
+static inline struct crypto_istat_compress *comp_get_stat(
+	struct comp_alg_common *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_comp_errstat(struct comp_alg_common *alg, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&comp_get_stat(alg)->err_cnt);
+
+	return err;
+}
+
 /**
  * crypto_acomp_compress() -- Invoke asynchronous compress operation
  *
@@ -264,14 +288,18 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 static inline int crypto_acomp_compress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int slen = req->slen;
-	int ret;
-
-	crypto_stats_get(alg);
-	ret = tfm->compress(req);
-	crypto_stats_compress(slen, ret, alg);
-	return ret;
+	struct comp_alg_common *alg;
+
+	alg = crypto_comp_alg_common(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_compress *istat = comp_get_stat(alg);
+
+		atomic64_inc(&istat->compress_cnt);
+		atomic64_add(req->slen, &istat->compress_tlen);
+	}
+
+	return crypto_comp_errstat(alg, tfm->compress(req));
 }
 
 /**
@@ -286,14 +314,18 @@ static inline int crypto_acomp_compress(struct acomp_req *req)
 static inline int crypto_acomp_decompress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int slen = req->slen;
-	int ret;
-
-	crypto_stats_get(alg);
-	ret = tfm->decompress(req);
-	crypto_stats_decompress(slen, ret, alg);
-	return ret;
+	struct comp_alg_common *alg;
+
+	alg = crypto_comp_alg_common(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_compress *istat = comp_get_stat(alg);
+
+		atomic64_inc(&istat->decompress_cnt);
+		atomic64_add(req->slen, &istat->decompress_tlen);
+	}
+
+	return crypto_comp_errstat(alg, tfm->decompress(req));
 }
 
 #endif
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 978b57a3f4f0..4ac46bafba9d 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -12,6 +12,44 @@
 #include <crypto/acompress.h>
 #include <crypto/algapi.h>
 
+/**
+ * struct acomp_alg - asynchronous compression algorithm
+ *
+ * @compress:	Function performs a compress operation
+ * @decompress:	Function performs a de-compress operation
+ * @dst_free:	Frees destination buffer if allocated inside the algorithm
+ * @init:	Initialize the cryptographic transformation object.
+ *		This function is used to initialize the cryptographic
+ *		transformation object. This function is called only once at
+ *		the instantiation time, right after the transformation context
+ *		was allocated. In case the cryptographic hardware has some
+ *		special requirements which need to be handled by software, this
+ *		function shall check for the precise requirement of the
+ *		transformation and put any software fallbacks in place.
+ * @exit:	Deinitialize the cryptographic transformation object. This is a
+ *		counterpart to @init, used to remove various changes set in
+ *		@init.
+ *
+ * @reqsize:	Context size for (de)compression requests
+ * @stat:	Statistics for compress algorithm
+ * @base:	Common crypto API algorithm data structure
+ * @calg:	Cmonn algorithm data structure shared with scomp
+ */
+struct acomp_alg {
+	int (*compress)(struct acomp_req *req);
+	int (*decompress)(struct acomp_req *req);
+	void (*dst_free)(struct scatterlist *dst);
+	int (*init)(struct crypto_acomp *tfm);
+	void (*exit)(struct crypto_acomp *tfm);
+
+	unsigned int reqsize;
+
+	union {
+		struct COMP_ALG_COMMON;
+		struct comp_alg_common calg;
+	};
+};
+
 /*
  * Transform internal helpers.
  */
@@ -31,11 +69,6 @@ static inline void acomp_request_complete(struct acomp_req *req,
 	crypto_request_complete(&req->base, err);
 }
 
-static inline const char *acomp_alg_name(struct crypto_acomp *tfm)
-{
-	return crypto_acomp_tfm(tfm)->__crt_alg->cra_name;
-}
-
 static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm)
 {
 	struct acomp_req *req;
diff --git a/include/crypto/internal/scompress.h b/include/crypto/internal/scompress.h
index 252cc949d4ee..858fe3965ae3 100644
--- a/include/crypto/internal/scompress.h
+++ b/include/crypto/internal/scompress.h
@@ -9,10 +9,13 @@
 #ifndef _CRYPTO_SCOMP_INT_H
 #define _CRYPTO_SCOMP_INT_H
 
+#include <crypto/acompress.h>
 #include <crypto/algapi.h>
 
 #define SCOMP_SCRATCH_SIZE	131072
 
+struct acomp_req;
+
 struct crypto_scomp {
 	struct crypto_tfm base;
 };
@@ -24,7 +27,9 @@ struct crypto_scomp {
  * @free_ctx:	Function frees context allocated with alloc_ctx
  * @compress:	Function performs a compress operation
  * @decompress:	Function performs a de-compress operation
+ * @stat:	Statistics for compress algorithm
  * @base:	Common crypto API algorithm data structure
+ * @calg:	Cmonn algorithm data structure shared with acomp
  */
 struct scomp_alg {
 	void *(*alloc_ctx)(struct crypto_scomp *tfm);
@@ -35,7 +40,11 @@ struct scomp_alg {
 	int (*decompress)(struct crypto_scomp *tfm, const u8 *src,
 			  unsigned int slen, u8 *dst, unsigned int *dlen,
 			  void *ctx);
-	struct crypto_alg base;
+
+	union {
+		struct COMP_ALG_COMMON;
+		struct comp_alg_common calg;
+	};
 };
 
 static inline struct scomp_alg *__crypto_scomp_alg(struct crypto_alg *alg)
@@ -90,10 +99,6 @@ static inline int crypto_scomp_decompress(struct crypto_scomp *tfm,
 						 ctx);
 }
 
-int crypto_init_scomp_ops_async(struct crypto_tfm *tfm);
-struct acomp_req *crypto_acomp_scomp_alloc_ctx(struct acomp_req *req);
-void crypto_acomp_scomp_free_ctx(struct acomp_req *req);
-
 /**
  * crypto_register_scomp() -- Register synchronous compression algorithm
  *
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index caf759e4201c..42bc55b642a0 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -292,22 +292,6 @@ struct crypto_istat_cipher {
 	atomic64_t err_cnt;
 };
 
-/*
- * struct crypto_istat_compress - statistics for compress algorithm
- * @compress_cnt:	number of compress requests
- * @compress_tlen:	total data size handled by compress requests
- * @decompress_cnt:	number of decompress requests
- * @decompress_tlen:	total data size handled by decompress requests
- * @err_cnt:		number of error for compress requests
- */
-struct crypto_istat_compress {
-	atomic64_t compress_cnt;
-	atomic64_t compress_tlen;
-	atomic64_t decompress_cnt;
-	atomic64_t decompress_tlen;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_kpp - statistics for KPP algorithm
  * @setsecret_cnt:		number of setsecrey operation
@@ -416,7 +400,6 @@ struct crypto_istat_rng {
  *
  * @stats: union of all possible crypto_istat_xxx structures
  * @stats.cipher:	statistics for cipher algorithm
- * @stats.compress:	statistics for compress algorithm
  * @stats.rng:		statistics for rng algorithm
  * @stats.kpp:		statistics for KPP algorithm
  *
@@ -455,7 +438,6 @@ struct crypto_alg {
 #ifdef CONFIG_CRYPTO_STATS
 	union {
 		struct crypto_istat_cipher cipher;
-		struct crypto_istat_compress compress;
 		struct crypto_istat_rng rng;
 		struct crypto_istat_kpp kpp;
 	} stats;
@@ -466,8 +448,6 @@ struct crypto_alg {
 #ifdef CONFIG_CRYPTO_STATS
 void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
-void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
-void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
 void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
 void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
@@ -480,10 +460,6 @@ static inline void crypto_stats_init(struct crypto_alg *alg)
 {}
 static inline void crypto_stats_get(struct crypto_alg *alg)
 {}
-static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
-{}
 static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {}
 static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
-- 
cgit v1.2.3


From e2950bf166ef71ed5588437b7ee94f65ceaa6cd0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:19 +0800
Subject: crypto: kpp - Count error stats differently

Move all stat code specific to kpp into the kpp code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c           | 30 -------------------
 crypto/crypto_user_stat.c | 21 --------------
 crypto/kpp.c              | 41 +++++++++++++++++++++-----
 include/crypto/kpp.h      | 73 +++++++++++++++++++++++++++++++++++------------
 include/linux/crypto.h    | 25 ----------------
 5 files changed, 89 insertions(+), 101 deletions(-)

(limited to 'include')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index fe48ce1957e1..6fcb6192a3d7 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1051,36 +1051,6 @@ void crypto_stats_get(struct crypto_alg *alg)
 }
 EXPORT_SYMBOL_GPL(crypto_stats_get);
 
-void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
-{
-	if (ret)
-		atomic64_inc(&alg->stats.kpp.err_cnt);
-	else
-		atomic64_inc(&alg->stats.kpp.setsecret_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
-
-void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
-{
-	if (ret)
-		atomic64_inc(&alg->stats.kpp.err_cnt);
-	else
-		atomic64_inc(&alg->stats.kpp.generate_public_key_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
-
-void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
-{
-	if (ret)
-		atomic64_inc(&alg->stats.kpp.err_cnt);
-	else
-		atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
-
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index ad616e19a3e1..6ace8b70866f 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -13,7 +13,6 @@
 #include <net/sock.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/internal/rng.h>
-#include <crypto/kpp.h>
 #include <crypto/internal/cryptouser.h>
 
 #include "internal.h"
@@ -55,22 +54,6 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
-static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_kpp rkpp;
-
-	memset(&rkpp, 0, sizeof(rkpp));
-
-	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
-
-	rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt);
-	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt);
-	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt);
-	rkpp.stat_err_cnt = atomic64_read(&alg->stats.kpp.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
-}
-
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_rng rrng;
@@ -135,10 +118,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		if (crypto_report_comp(skb, alg))
 			goto nla_put_failure;
 		break;
-	case CRYPTO_ALG_TYPE_KPP:
-		if (crypto_report_kpp(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_RNG:
 		if (crypto_report_rng(skb, alg))
 			goto nla_put_failure;
diff --git a/crypto/kpp.c b/crypto/kpp.c
index 678e871ce418..3e19c2f2cf94 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -5,19 +5,16 @@
  * Copyright (c) 2016, Intel Corporation
  * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com>
  */
+
+#include <crypto/internal/kpp.h>
+#include <linux/cryptouser.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/crypto.h>
-#include <crypto/algapi.h>
-#include <linux/cryptouser.h>
-#include <linux/compiler.h>
 #include <net/netlink.h>
-#include <crypto/kpp.h>
-#include <crypto/internal/kpp.h>
+
 #include "internal.h"
 
 #ifdef CONFIG_NET
@@ -75,6 +72,29 @@ static void crypto_kpp_free_instance(struct crypto_instance *inst)
 	kpp->free(kpp);
 }
 
+static int __maybe_unused crypto_kpp_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct kpp_alg *kpp = __crypto_kpp_alg(alg);
+	struct crypto_istat_kpp *istat;
+	struct crypto_stat_kpp rkpp;
+
+	istat = kpp_get_stat(kpp);
+
+	memset(&rkpp, 0, sizeof(rkpp));
+
+	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
+
+	rkpp.stat_setsecret_cnt = atomic64_read(&istat->setsecret_cnt);
+	rkpp.stat_generate_public_key_cnt =
+		atomic64_read(&istat->generate_public_key_cnt);
+	rkpp.stat_compute_shared_secret_cnt =
+		atomic64_read(&istat->compute_shared_secret_cnt);
+	rkpp.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
+}
+
 static const struct crypto_type crypto_kpp_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_kpp_init_tfm,
@@ -83,6 +103,9 @@ static const struct crypto_type crypto_kpp_type = {
 	.show = crypto_kpp_show,
 #endif
 	.report = crypto_kpp_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_kpp_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_KPP,
@@ -112,11 +135,15 @@ EXPORT_SYMBOL_GPL(crypto_has_kpp);
 
 static void kpp_prepare_alg(struct kpp_alg *alg)
 {
+	struct crypto_istat_kpp *istat = kpp_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	base->cra_type = &crypto_kpp_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_KPP;
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
 }
 
 int crypto_register_kpp(struct kpp_alg *alg)
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index 33ff32878802..1988e24a0d1d 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -8,7 +8,11 @@
 
 #ifndef _CRYPTO_KPP_
 #define _CRYPTO_KPP_
+
+#include <linux/atomic.h>
+#include <linux/container_of.h>
 #include <linux/crypto.h>
+#include <linux/slab.h>
 
 /**
  * struct kpp_request
@@ -47,6 +51,20 @@ struct crypto_kpp {
 	struct crypto_tfm base;
 };
 
+/*
+ * struct crypto_istat_kpp - statistics for KPP algorithm
+ * @setsecret_cnt:		number of setsecrey operation
+ * @generate_public_key_cnt:	number of generate_public_key operation
+ * @compute_shared_secret_cnt:	number of compute_shared_secret operation
+ * @err_cnt:			number of error for KPP requests
+ */
+struct crypto_istat_kpp {
+	atomic64_t setsecret_cnt;
+	atomic64_t generate_public_key_cnt;
+	atomic64_t compute_shared_secret_cnt;
+	atomic64_t err_cnt;
+};
+
 /**
  * struct kpp_alg - generic key-agreement protocol primitives
  *
@@ -69,6 +87,7 @@ struct crypto_kpp {
  * @exit:		Undo everything @init did.
  *
  * @base:		Common crypto API algorithm data structure
+ * @stat:		Statistics for KPP algorithm
  */
 struct kpp_alg {
 	int (*set_secret)(struct crypto_kpp *tfm, const void *buffer,
@@ -81,6 +100,10 @@ struct kpp_alg {
 	int (*init)(struct crypto_kpp *tfm);
 	void (*exit)(struct crypto_kpp *tfm);
 
+#ifdef CONFIG_CRYPTO_STATS
+	struct crypto_istat_kpp stat;
+#endif
+
 	struct crypto_alg base;
 };
 
@@ -268,6 +291,26 @@ struct kpp_secret {
 	unsigned short len;
 };
 
+static inline struct crypto_istat_kpp *kpp_get_stat(struct kpp_alg *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_kpp_errstat(struct kpp_alg *alg, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&kpp_get_stat(alg)->err_cnt);
+
+	return err;
+}
+
 /**
  * crypto_kpp_set_secret() - Invoke kpp operation
  *
@@ -287,13 +330,11 @@ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm,
 					const void *buffer, unsigned int len)
 {
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	int ret;
 
-	crypto_stats_get(calg);
-	ret = alg->set_secret(tfm, buffer, len);
-	crypto_stats_kpp_set_secret(calg, ret);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&kpp_get_stat(alg)->setsecret_cnt);
+
+	return crypto_kpp_errstat(alg, alg->set_secret(tfm, buffer, len));
 }
 
 /**
@@ -313,13 +354,11 @@ static inline int crypto_kpp_generate_public_key(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	int ret;
 
-	crypto_stats_get(calg);
-	ret = alg->generate_public_key(req);
-	crypto_stats_kpp_generate_public_key(calg, ret);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&kpp_get_stat(alg)->generate_public_key_cnt);
+
+	return crypto_kpp_errstat(alg, alg->generate_public_key(req));
 }
 
 /**
@@ -336,13 +375,11 @@ static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-	struct crypto_alg *calg = tfm->base.__crt_alg;
-	int ret;
 
-	crypto_stats_get(calg);
-	ret = alg->compute_shared_secret(req);
-	crypto_stats_kpp_compute_shared_secret(calg, ret);
-	return ret;
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&kpp_get_stat(alg)->compute_shared_secret_cnt);
+
+	return crypto_kpp_errstat(alg, alg->compute_shared_secret(req));
 }
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 42bc55b642a0..c66f7dc21cbb 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -292,20 +292,6 @@ struct crypto_istat_cipher {
 	atomic64_t err_cnt;
 };
 
-/*
- * struct crypto_istat_kpp - statistics for KPP algorithm
- * @setsecret_cnt:		number of setsecrey operation
- * @generate_public_key_cnt:	number of generate_public_key operation
- * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @err_cnt:			number of error for KPP requests
- */
-struct crypto_istat_kpp {
-	atomic64_t setsecret_cnt;
-	atomic64_t generate_public_key_cnt;
-	atomic64_t compute_shared_secret_cnt;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_rng: statistics for RNG algorithm
  * @generate_cnt:	number of RNG generate requests
@@ -401,7 +387,6 @@ struct crypto_istat_rng {
  * @stats: union of all possible crypto_istat_xxx structures
  * @stats.cipher:	statistics for cipher algorithm
  * @stats.rng:		statistics for rng algorithm
- * @stats.kpp:		statistics for KPP algorithm
  *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
  * for all of the transformations. Any variable not documented here shall not
@@ -439,7 +424,6 @@ struct crypto_alg {
 	union {
 		struct crypto_istat_cipher cipher;
 		struct crypto_istat_rng rng;
-		struct crypto_istat_kpp kpp;
 	} stats;
 #endif /* CONFIG_CRYPTO_STATS */
 
@@ -448,9 +432,6 @@ struct crypto_alg {
 #ifdef CONFIG_CRYPTO_STATS
 void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
-void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
-void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
-void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
 void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
 void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
@@ -460,12 +441,6 @@ static inline void crypto_stats_init(struct crypto_alg *alg)
 {}
 static inline void crypto_stats_get(struct crypto_alg *alg)
 {}
-static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
-{}
-static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
-{}
-static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
-{}
 static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {}
 static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
-- 
cgit v1.2.3


From 1085680bbb7a5235351937bea938c7051b443103 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:21 +0800
Subject: crypto: skcipher - Count error stats differently

Move all stat code specific to skcipher into the skcipher code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c           |  26 ------------
 crypto/crypto_user_stat.c |  11 -----
 crypto/skcipher.c         | 105 ++++++++++++++++++++++++++++++++++++++--------
 include/crypto/skcipher.h |  22 ++++++++++
 include/linux/crypto.h    |  24 -----------
 5 files changed, 109 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index 6fcb6192a3d7..3259be84169b 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1073,32 +1073,6 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_rng_generate);
-
-void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
-				   struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
-		atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_skcipher_encrypt);
-
-void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
-				   struct crypto_alg *alg)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
-		atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt);
 #endif
 
 static void __init crypto_start_tests(void)
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 6ace8b70866f..b57e43278ee1 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -11,7 +11,6 @@
 #include <linux/sched.h>
 #include <net/netlink.h>
 #include <net/sock.h>
-#include <crypto/internal/skcipher.h>
 #include <crypto/internal/rng.h>
 #include <crypto/internal/cryptouser.h>
 
@@ -34,12 +33,6 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-	rcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.cipher.encrypt_cnt);
-	rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen);
-	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->stats.cipher.decrypt_cnt);
-	rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen);
-	rcipher.stat_err_cnt =  atomic64_read(&alg->stats.cipher.err_cnt);
-
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
 
@@ -106,10 +99,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 	}
 
 	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
-	case CRYPTO_ALG_TYPE_SKCIPHER:
-		if (crypto_report_cipher(skb, alg))
-			goto nla_put_failure;
-		break;
 	case CRYPTO_ALG_TYPE_CIPHER:
 		if (crypto_report_cipher(skb, alg))
 			goto nla_put_failure;
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 7bf4871fec80..0139f3416339 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -15,11 +15,14 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
-#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/rtnetlink.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -77,6 +80,35 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len)
 	return max(start, end_page);
 }
 
+static inline struct skcipher_alg *__crypto_skcipher_alg(
+	struct crypto_alg *alg)
+{
+	return container_of(alg, struct skcipher_alg, base);
+}
+
+static inline struct crypto_istat_cipher *skcipher_get_stat(
+	struct skcipher_alg *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_skcipher_errstat(struct skcipher_alg *alg, int err)
+{
+	struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
+
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&istat->err_cnt);
+
+	return err;
+}
+
 static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
 {
 	u8 *addr;
@@ -605,34 +637,44 @@ EXPORT_SYMBOL_GPL(crypto_skcipher_setkey);
 int crypto_skcipher_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int cryptlen = req->cryptlen;
+	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
 	int ret;
 
-	crypto_stats_get(alg);
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
+
+		atomic64_inc(&istat->encrypt_cnt);
+		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
+	}
+
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
-		ret = crypto_skcipher_alg(tfm)->encrypt(req);
-	crypto_stats_skcipher_encrypt(cryptlen, ret, alg);
-	return ret;
+		ret = alg->encrypt(req);
+
+	return crypto_skcipher_errstat(alg, ret);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt);
 
 int crypto_skcipher_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	unsigned int cryptlen = req->cryptlen;
+	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
 	int ret;
 
-	crypto_stats_get(alg);
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
+
+		atomic64_inc(&istat->decrypt_cnt);
+		atomic64_add(req->cryptlen, &istat->decrypt_tlen);
+	}
+
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
-		ret = crypto_skcipher_alg(tfm)->decrypt(req);
-	crypto_stats_skcipher_decrypt(cryptlen, ret, alg);
-	return ret;
+		ret = alg->decrypt(req);
+
+	return crypto_skcipher_errstat(alg, ret);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt);
 
@@ -672,8 +714,7 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	__maybe_unused;
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
-	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
-						     base);
+	struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);
 
 	seq_printf(m, "type         : skcipher\n");
 	seq_printf(m, "async        : %s\n",
@@ -689,9 +730,8 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 #ifdef CONFIG_NET
 static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
+	struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);
 	struct crypto_report_blkcipher rblkcipher;
-	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
-						     base);
 
 	memset(&rblkcipher, 0, sizeof(rblkcipher));
 
@@ -713,6 +753,28 @@ static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 }
 #endif
 
+static int __maybe_unused crypto_skcipher_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);
+	struct crypto_istat_cipher *istat;
+	struct crypto_stat_cipher rcipher;
+
+	istat = skcipher_get_stat(skcipher);
+
+	memset(&rcipher, 0, sizeof(rcipher));
+
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
+
+	rcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
+	rcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
+	rcipher.stat_decrypt_cnt =  atomic64_read(&istat->decrypt_cnt);
+	rcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
+	rcipher.stat_err_cnt =  atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
+}
+
 static const struct crypto_type crypto_skcipher_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_skcipher_init_tfm,
@@ -721,6 +783,9 @@ static const struct crypto_type crypto_skcipher_type = {
 	.show = crypto_skcipher_show,
 #endif
 	.report = crypto_skcipher_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_skcipher_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_SKCIPHER,
@@ -775,6 +840,7 @@ EXPORT_SYMBOL_GPL(crypto_has_skcipher);
 
 static int skcipher_prepare_alg(struct skcipher_alg *alg)
 {
+	struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
@@ -790,6 +856,9 @@ static int skcipher_prepare_alg(struct skcipher_alg *alg)
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER;
 
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
+
 	return 0;
 }
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 39f5b67c3069..080d1ba3611d 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -8,6 +8,7 @@
 #ifndef _CRYPTO_SKCIPHER_H
 #define _CRYPTO_SKCIPHER_H
 
+#include <linux/atomic.h>
 #include <linux/container_of.h>
 #include <linux/crypto.h>
 #include <linux/slab.h>
@@ -48,6 +49,22 @@ struct crypto_sync_skcipher {
 	struct crypto_skcipher base;
 };
 
+/*
+ * struct crypto_istat_cipher - statistics for cipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @err_cnt:		number of error for cipher requests
+ */
+struct crypto_istat_cipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t err_cnt;
+};
+
 /**
  * struct skcipher_alg - symmetric key cipher definition
  * @min_keysize: Minimum key size supported by the transformation. This is the
@@ -101,6 +118,7 @@ struct crypto_sync_skcipher {
  * @walksize: Equal to the chunk size except in cases where the algorithm is
  * 	      considerably more efficient if it can operate on multiple chunks
  * 	      in parallel. Should be a multiple of chunksize.
+ * @stat: Statistics for cipher algorithm
  * @base: Definition of a generic crypto algorithm.
  *
  * All fields except @ivsize are mandatory and must be filled.
@@ -119,6 +137,10 @@ struct skcipher_alg {
 	unsigned int chunksize;
 	unsigned int walksize;
 
+#ifdef CONFIG_CRYPTO_STATS
+	struct crypto_istat_cipher stat;
+#endif
+
 	struct crypto_alg base;
 };
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index c66f7dc21cbb..e2db56160d5c 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -276,22 +276,6 @@ struct compress_alg {
 };
 
 #ifdef CONFIG_CRYPTO_STATS
-/*
- * struct crypto_istat_cipher - statistics for cipher algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @err_cnt:		number of error for cipher requests
- */
-struct crypto_istat_cipher {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t err_cnt;
-};
-
 /*
  * struct crypto_istat_rng: statistics for RNG algorithm
  * @generate_cnt:	number of RNG generate requests
@@ -385,7 +369,6 @@ struct crypto_istat_rng {
  * @cra_destroy: internally used
  *
  * @stats: union of all possible crypto_istat_xxx structures
- * @stats.cipher:	statistics for cipher algorithm
  * @stats.rng:		statistics for rng algorithm
  *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
@@ -422,7 +405,6 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		struct crypto_istat_cipher cipher;
 		struct crypto_istat_rng rng;
 	} stats;
 #endif /* CONFIG_CRYPTO_STATS */
@@ -434,8 +416,6 @@ void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
 void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
-void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
-void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
 #else
 static inline void crypto_stats_init(struct crypto_alg *alg)
 {}
@@ -445,10 +425,6 @@ static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {}
 static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
 {}
-static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
-{}
-static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
-{}
 #endif
 /*
  * A helper struct for waiting for completion of async crypto ops
-- 
cgit v1.2.3


From 9807e49b6aab3451b00a99ced42acb4a535e8e22 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:23 +0800
Subject: crypto: rng - Count error stats differently

Move all stat code specific to rng into the rng code.

While we're at it, change the stats so that bytes and counts
are always incremented even in case of error.  This allows the
reference counting to be removed as we can now increment the
counters prior to the operation.

After the operation we simply increase the error count if necessary.
This is safe as errors can only occur synchronously (or rather,
the existing code already ignored asynchronous errors which are
only visible to the callback function).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c           | 39 ----------------------------
 crypto/crypto_user_stat.c | 33 +++++-------------------
 crypto/rng.c              | 53 ++++++++++++++++++++++++++++++--------
 include/crypto/rng.h      | 65 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/crypto.h    | 41 ------------------------------
 5 files changed, 105 insertions(+), 126 deletions(-)

(limited to 'include')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index 3259be84169b..9b7e263ed469 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -339,8 +339,6 @@ __crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put)
 
 	list_add(&alg->cra_list, &crypto_alg_list);
 
-	crypto_stats_init(alg);
-
 	if (larval) {
 		/* No cheating! */
 		alg->cra_flags &= ~CRYPTO_ALG_TESTED;
@@ -1038,43 +1036,6 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
 }
 EXPORT_SYMBOL_GPL(crypto_type_has_alg);
 
-#ifdef CONFIG_CRYPTO_STATS
-void crypto_stats_init(struct crypto_alg *alg)
-{
-	memset(&alg->stats, 0, sizeof(alg->stats));
-}
-EXPORT_SYMBOL_GPL(crypto_stats_init);
-
-void crypto_stats_get(struct crypto_alg *alg)
-{
-	crypto_alg_get(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_get);
-
-void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.rng.err_cnt);
-	else
-		atomic64_inc(&alg->stats.rng.seed_cnt);
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_rng_seed);
-
-void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
-			       int ret)
-{
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.rng.err_cnt);
-	} else {
-		atomic64_inc(&alg->stats.rng.generate_cnt);
-		atomic64_add(dlen, &alg->stats.rng.generate_tlen);
-	}
-	crypto_alg_put(alg);
-}
-EXPORT_SYMBOL_GPL(crypto_stats_rng_generate);
-#endif
-
 static void __init crypto_start_tests(void)
 {
 	if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index b57e43278ee1..d4f3d39b5137 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -6,15 +6,14 @@
  *
  */
 
-#include <linux/crypto.h>
-#include <linux/cryptouser.h>
-#include <linux/sched.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/cryptouser.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 #include <net/netlink.h>
 #include <net/sock.h>
-#include <crypto/internal/rng.h>
-#include <crypto/internal/cryptouser.h>
-
-#include "internal.h"
 
 #define null_terminated(x)	(strnlen(x, sizeof(x)) < sizeof(x))
 
@@ -47,22 +46,6 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
-static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_rng rrng;
-
-	memset(&rrng, 0, sizeof(rrng));
-
-	strscpy(rrng.type, "rng", sizeof(rrng.type));
-
-	rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt);
-	rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen);
-	rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt);
-	rrng.stat_err_cnt = atomic64_read(&alg->stats.rng.err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
-}
-
 static int crypto_reportstat_one(struct crypto_alg *alg,
 				 struct crypto_user_alg *ualg,
 				 struct sk_buff *skb)
@@ -107,10 +90,6 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		if (crypto_report_comp(skb, alg))
 			goto nla_put_failure;
 		break;
-	case CRYPTO_ALG_TYPE_RNG:
-		if (crypto_report_rng(skb, alg))
-			goto nla_put_failure;
-		break;
 	default:
 		pr_err("ERROR: Unhandled alg %d in %s\n",
 		       alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL),
diff --git a/crypto/rng.c b/crypto/rng.c
index fea082b25fe4..ef56c71bda50 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -8,17 +8,17 @@
  * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
  */
 
-#include <linux/atomic.h>
 #include <crypto/internal/rng.h>
+#include <linux/atomic.h>
+#include <linux/cryptouser.h>
 #include <linux/err.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/cryptouser.h>
-#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -30,27 +30,30 @@ static int crypto_default_rng_refcnt;
 
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
-	struct crypto_alg *alg = tfm->base.__crt_alg;
+	struct rng_alg *alg = crypto_rng_alg(tfm);
 	u8 *buf = NULL;
 	int err;
 
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		atomic64_inc(&rng_get_stat(alg)->seed_cnt);
+
 	if (!seed && slen) {
 		buf = kmalloc(slen, GFP_KERNEL);
+		err = -ENOMEM;
 		if (!buf)
-			return -ENOMEM;
+			goto out;
 
 		err = get_random_bytes_wait(buf, slen);
 		if (err)
-			goto out;
+			goto free_buf;
 		seed = buf;
 	}
 
-	crypto_stats_get(alg);
-	err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
-	crypto_stats_rng_seed(alg, err);
-out:
+	err = alg->seed(tfm, seed, slen);
+free_buf:
 	kfree_sensitive(buf);
-	return err;
+out:
+	return crypto_rng_errstat(alg, err);
 }
 EXPORT_SYMBOL_GPL(crypto_rng_reset);
 
@@ -94,6 +97,27 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "seedsize     : %u\n", seedsize(alg));
 }
 
+static int __maybe_unused crypto_rng_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct rng_alg *rng = __crypto_rng_alg(alg);
+	struct crypto_istat_rng *istat;
+	struct crypto_stat_rng rrng;
+
+	istat = rng_get_stat(rng);
+
+	memset(&rrng, 0, sizeof(rrng));
+
+	strscpy(rrng.type, "rng", sizeof(rrng.type));
+
+	rrng.stat_generate_cnt = atomic64_read(&istat->generate_cnt);
+	rrng.stat_generate_tlen = atomic64_read(&istat->generate_tlen);
+	rrng.stat_seed_cnt = atomic64_read(&istat->seed_cnt);
+	rrng.stat_err_cnt = atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
+}
+
 static const struct crypto_type crypto_rng_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_rng_init_tfm,
@@ -101,6 +125,9 @@ static const struct crypto_type crypto_rng_type = {
 	.show = crypto_rng_show,
 #endif
 	.report = crypto_rng_report,
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_rng_report_stat,
+#endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
 	.type = CRYPTO_ALG_TYPE_RNG,
@@ -176,6 +203,7 @@ EXPORT_SYMBOL_GPL(crypto_del_default_rng);
 
 int crypto_register_rng(struct rng_alg *alg)
 {
+	struct crypto_istat_rng *istat = rng_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->seedsize > PAGE_SIZE / 8)
@@ -185,6 +213,9 @@ int crypto_register_rng(struct rng_alg *alg)
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_RNG;
 
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
+		memset(istat, 0, sizeof(*istat));
+
 	return crypto_register_alg(base);
 }
 EXPORT_SYMBOL_GPL(crypto_register_rng);
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 17bb3673d3c1..6abe5102e5fb 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -9,10 +9,26 @@
 #ifndef _CRYPTO_RNG_H
 #define _CRYPTO_RNG_H
 
+#include <linux/atomic.h>
+#include <linux/container_of.h>
 #include <linux/crypto.h>
 
 struct crypto_rng;
 
+/*
+ * struct crypto_istat_rng: statistics for RNG algorithm
+ * @generate_cnt:	number of RNG generate requests
+ * @generate_tlen:	total data size of generated data by the RNG
+ * @seed_cnt:		number of times the RNG was seeded
+ * @err_cnt:		number of error for RNG requests
+ */
+struct crypto_istat_rng {
+	atomic64_t generate_cnt;
+	atomic64_t generate_tlen;
+	atomic64_t seed_cnt;
+	atomic64_t err_cnt;
+};
+
 /**
  * struct rng_alg - random number generator definition
  *
@@ -30,6 +46,7 @@ struct crypto_rng;
  *		size of the seed is defined with @seedsize .
  * @set_ent:	Set entropy that would otherwise be obtained from
  *		entropy source.  Internal use only.
+ * @stat:	Statistics for rng algorithm
  * @seedsize:	The seed size required for a random number generator
  *		initialization defined with this variable. Some
  *		random number generators does not require a seed
@@ -46,6 +63,10 @@ struct rng_alg {
 	void (*set_ent)(struct crypto_rng *tfm, const u8 *data,
 			unsigned int len);
 
+#ifdef CONFIG_CRYPTO_STATS
+	struct crypto_istat_rng stat;
+#endif
+
 	unsigned int seedsize;
 
 	struct crypto_alg base;
@@ -94,6 +115,11 @@ static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm)
 	return &tfm->base;
 }
 
+static inline struct rng_alg *__crypto_rng_alg(struct crypto_alg *alg)
+{
+	return container_of(alg, struct rng_alg, base);
+}
+
 /**
  * crypto_rng_alg - obtain name of RNG
  * @tfm: cipher handle
@@ -104,8 +130,7 @@ static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm)
  */
 static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm)
 {
-	return container_of(crypto_rng_tfm(tfm)->__crt_alg,
-			    struct rng_alg, base);
+	return __crypto_rng_alg(crypto_rng_tfm(tfm)->__crt_alg);
 }
 
 /**
@@ -119,6 +144,26 @@ static inline void crypto_free_rng(struct crypto_rng *tfm)
 	crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm));
 }
 
+static inline struct crypto_istat_rng *rng_get_stat(struct rng_alg *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+static inline int crypto_rng_errstat(struct rng_alg *alg, int err)
+{
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err && err != -EINPROGRESS && err != -EBUSY)
+		atomic64_inc(&rng_get_stat(alg)->err_cnt);
+
+	return err;
+}
+
 /**
  * crypto_rng_generate() - get random number
  * @tfm: cipher handle
@@ -137,13 +182,17 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm,
 				      const u8 *src, unsigned int slen,
 				      u8 *dst, unsigned int dlen)
 {
-	struct crypto_alg *alg = tfm->base.__crt_alg;
-	int ret;
+	struct rng_alg *alg = crypto_rng_alg(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_rng *istat = rng_get_stat(alg);
+
+		atomic64_inc(&istat->generate_cnt);
+		atomic64_add(dlen, &istat->generate_tlen);
+	}
 
-	crypto_stats_get(alg);
-	ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen);
-	crypto_stats_rng_generate(alg, dlen, ret);
-	return ret;
+	return crypto_rng_errstat(alg,
+				  alg->generate(tfm, src, slen, dst, dlen));
 }
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index e2db56160d5c..c26e59bb7bca 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -275,22 +275,6 @@ struct compress_alg {
 			      unsigned int slen, u8 *dst, unsigned int *dlen);
 };
 
-#ifdef CONFIG_CRYPTO_STATS
-/*
- * struct crypto_istat_rng: statistics for RNG algorithm
- * @generate_cnt:	number of RNG generate requests
- * @generate_tlen:	total data size of generated data by the RNG
- * @seed_cnt:		number of times the RNG was seeded
- * @err_cnt:		number of error for RNG requests
- */
-struct crypto_istat_rng {
-	atomic64_t generate_cnt;
-	atomic64_t generate_tlen;
-	atomic64_t seed_cnt;
-	atomic64_t err_cnt;
-};
-#endif /* CONFIG_CRYPTO_STATS */
-
 #define cra_cipher	cra_u.cipher
 #define cra_compress	cra_u.compress
 
@@ -368,9 +352,6 @@ struct crypto_istat_rng {
  * @cra_refcnt: internally used
  * @cra_destroy: internally used
  *
- * @stats: union of all possible crypto_istat_xxx structures
- * @stats.rng:		statistics for rng algorithm
- *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
  * for all of the transformations. Any variable not documented here shall not
  * be used by a cipher implementation as it is internal to the Crypto API.
@@ -402,30 +383,8 @@ struct crypto_alg {
 	void (*cra_destroy)(struct crypto_alg *alg);
 	
 	struct module *cra_module;
-
-#ifdef CONFIG_CRYPTO_STATS
-	union {
-		struct crypto_istat_rng rng;
-	} stats;
-#endif /* CONFIG_CRYPTO_STATS */
-
 } CRYPTO_MINALIGN_ATTR;
 
-#ifdef CONFIG_CRYPTO_STATS
-void crypto_stats_init(struct crypto_alg *alg);
-void crypto_stats_get(struct crypto_alg *alg);
-void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
-void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
-#else
-static inline void crypto_stats_init(struct crypto_alg *alg)
-{}
-static inline void crypto_stats_get(struct crypto_alg *alg)
-{}
-static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
-{}
-static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
-{}
-#endif
 /*
  * A helper struct for waiting for completion of async crypto ops
  */
-- 
cgit v1.2.3


From 0c0edf6168ce1e02518ba44400b9269a13c3b9e6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Feb 2023 18:35:25 +0800
Subject: crypto: api - Move MODULE_ALIAS_CRYPTO to algapi.h

This is part of the low-level API and should not be exposed to
top-level Crypto API users.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/algapi.h | 13 +++++++++++++
 include/linux/crypto.h  | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index dcc1fd4ef1b4..e28957993b56 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -34,6 +34,19 @@
 
 #define CRYPTO_DMA_PADDING ((CRYPTO_DMA_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
 
+/*
+ * Autoloaded crypto modules should only use a prefixed name to avoid allowing
+ * arbitrary modules to be loaded. Loading from userspace may still need the
+ * unprefixed names, so retains those aliases as well.
+ * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
+ * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
+ * expands twice on the same line. Instead, use a separate base name for the
+ * alias.
+ */
+#define MODULE_ALIAS_CRYPTO(name)	\
+		__MODULE_INFO(alias, alias_userspace, name);	\
+		__MODULE_INFO(alias, alias_crypto, "crypto-" name)
+
 struct crypto_aead;
 struct crypto_instance;
 struct module;
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index c26e59bb7bca..d57597ebef6e 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -20,19 +20,6 @@
 #include <linux/slab.h>
 #include <linux/completion.h>
 
-/*
- * Autoloaded crypto modules should only use a prefixed name to avoid allowing
- * arbitrary modules to be loaded. Loading from userspace may still need the
- * unprefixed names, so retains those aliases as well.
- * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
- * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
- * expands twice on the same line. Instead, use a separate base name for the
- * alias.
- */
-#define MODULE_ALIAS_CRYPTO(name)	\
-		__MODULE_INFO(alias, alias_userspace, name);	\
-		__MODULE_INFO(alias, alias_crypto, "crypto-" name)
-
 /*
  * Algorithm masks and types.
  */
-- 
cgit v1.2.3


From 0d3a5178c2994eaf91ad135816a79138055b394a Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Mon, 6 Mar 2023 01:43:54 +0000
Subject: ASoC: soc-pcm.c: remove indirect runtime copy

substream->runtime will be attached when substream was opened
at snd_pcm_attach_substream(). When it uses DPCM,
FE substream->runtime is attached, but BE substream->runtime is not.
Thus, we are copying FE substream->runtime to BE.

But, we are copyig FE substream->runtime to FE dpcm->runtime first (A),
and copy it to BE dpcm->runtime (B), and copy it to
BE substream->runtime (C).

	static int dpcm_fe_dai_open(...) {
		...
(A)		fe->dpcm[stream].runtime = fe_substream->runtime;
		...
	}

	static int dpcm_be_connect(...) {
		...
(B)		be->dpcm[stream].runtime = fe->dpcm[stream].runtime;
		...
	}

	int dpcm_be_dai_startup(...) {
		...
(C)		be_substream->runtime = be->dpcm[stream].runtime;
		...
	}

It is too roundabout and troublesome.
OTOH, it is directly copying fe_substream->runtime at dpcm_be_reparent()
without using be->dpcm[stream].runtime.

	static void dpcm_be_reparent(...)
	{
		...
		for_each_dpcm_fe(be, stream, dpcm) {
			...
=>			be_substream->runtime = fe_substream->runtime;
			break;
		}
	}

This patch removes indirect copying.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87v8je64dh.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dpcm.h |  1 -
 sound/soc/soc-compress.c |  7 -------
 sound/soc/soc-pcm.c      | 10 ++++------
 3 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dpcm.h b/include/sound/soc-dpcm.h
index 1e7d09556fe3..4d6ac7699833 100644
--- a/include/sound/soc-dpcm.h
+++ b/include/sound/soc-dpcm.h
@@ -91,7 +91,6 @@ struct snd_soc_dpcm_runtime {
 	struct list_head fe_clients;
 
 	int users;
-	struct snd_pcm_runtime *runtime;
 	struct snd_pcm_hw_params hw_params;
 
 	/* state and update */
diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c
index e7aa6f360cab..554c329ec87f 100644
--- a/sound/soc/soc-compress.c
+++ b/sound/soc/soc-compress.c
@@ -134,8 +134,6 @@ err_no_lock:
 static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 {
 	struct snd_soc_pcm_runtime *fe = cstream->private_data;
-	struct snd_pcm_substream *fe_substream =
-		 fe->pcm->streams[cstream->direction].substream;
 	struct snd_soc_dai *cpu_dai = asoc_rtd_to_cpu(fe, 0);
 	struct snd_soc_dpcm *dpcm;
 	struct snd_soc_dapm_widget_list *list;
@@ -143,7 +141,6 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 	int ret;
 
 	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
-	fe->dpcm[stream].runtime = fe_substream->runtime;
 
 	ret = dpcm_path_get(fe, stream, &list);
 	if (ret < 0)
@@ -153,7 +150,6 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 
 	/* calculate valid and active FE <-> BE dpcms */
 	dpcm_process_paths(fe, stream, &list, 1);
-	fe->dpcm[stream].runtime = fe_substream->runtime;
 
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_FE;
 
@@ -164,7 +160,6 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 			dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE;
 
 		dpcm_be_disconnect(fe, stream);
-		fe->dpcm[stream].runtime = NULL;
 		goto out;
 	}
 
@@ -236,8 +231,6 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream)
 
 	mutex_unlock(&fe->card->pcm_mutex);
 
-	fe->dpcm[stream].runtime = NULL;
-
 	snd_soc_link_compr_shutdown(cstream, 0);
 
 	snd_soc_compr_components_free(cstream, 0);
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index 5eb056b942ce..b7f8a5bcfbc6 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -1230,7 +1230,6 @@ static int dpcm_be_connect(struct snd_soc_pcm_runtime *fe,
 
 	dpcm->be = be;
 	dpcm->fe = fe;
-	be->dpcm[stream].runtime = fe->dpcm[stream].runtime;
 	dpcm->state = SND_SOC_DPCM_LINK_STATE_NEW;
 	snd_soc_dpcm_stream_lock_irq(fe, stream);
 	list_add(&dpcm->list_be, &fe->dpcm[stream].be_clients);
@@ -1465,10 +1464,11 @@ static int dpcm_add_paths(struct snd_soc_pcm_runtime *fe, int stream,
 	struct snd_soc_dapm_widget_list *list = *list_;
 	struct snd_soc_pcm_runtime *be;
 	struct snd_soc_dapm_widget *widget;
+	struct snd_pcm_substream *fe_substream = snd_soc_dpcm_get_substream(fe, stream);
 	int i, new = 0, err;
 
 	/* don't connect if FE is not running */
-	if (!fe->dpcm[stream].runtime && !fe->fe_compr)
+	if (!fe_substream->runtime && !fe->fe_compr)
 		return new;
 
 	/* Create any new FE <--> BE connections */
@@ -1590,6 +1590,7 @@ void dpcm_be_dai_stop(struct snd_soc_pcm_runtime *fe, int stream,
 
 int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream)
 {
+	struct snd_pcm_substream *fe_substream = snd_soc_dpcm_get_substream(fe, stream);
 	struct snd_soc_pcm_runtime *be;
 	struct snd_soc_dpcm *dpcm;
 	int err, count = 0;
@@ -1629,7 +1630,7 @@ int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream)
 		dev_dbg(be->dev, "ASoC: open %s BE %s\n",
 			stream ? "capture" : "playback", be->dai_link->name);
 
-		be_substream->runtime = be->dpcm[stream].runtime;
+		be_substream->runtime = fe_substream->runtime;
 		err = __soc_pcm_open(be, be_substream);
 		if (err < 0) {
 			be->dpcm[stream].users--;
@@ -2693,8 +2694,6 @@ static void dpcm_fe_dai_cleanup(struct snd_pcm_substream *fe_substream)
 		dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE;
 
 	dpcm_be_disconnect(fe, stream);
-
-	fe->dpcm[stream].runtime = NULL;
 }
 
 static int dpcm_fe_dai_close(struct snd_pcm_substream *fe_substream)
@@ -2719,7 +2718,6 @@ static int dpcm_fe_dai_open(struct snd_pcm_substream *fe_substream)
 	int stream = fe_substream->stream;
 
 	snd_soc_dpcm_mutex_lock(fe);
-	fe->dpcm[stream].runtime = fe_substream->runtime;
 
 	ret = dpcm_path_get(fe, stream, &list);
 	if (ret < 0)
-- 
cgit v1.2.3


From 79963fbfc233759bd8a43462f120d15a1bd4f4fa Mon Sep 17 00:00:00 2001
From: Tanmay Shah <tanmay.shah@amd.com>
Date: Fri, 10 Mar 2023 17:24:06 -0800
Subject: mailbox: zynqmp: Fix typo in IPI documentation

Xilinx IPI message buffers allows 32-byte data transfer.
Fix documentation that says 12 bytes

Fixes: 4981b82ba2ff ("mailbox: ZynqMP IPI mailbox controller")
Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
Acked-by: Michal Simek <michal.simek@amd.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230311012407.1292118-4-tanmay.shah@amd.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/mailbox/zynqmp-ipi-message.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mailbox/zynqmp-ipi-message.h b/include/linux/mailbox/zynqmp-ipi-message.h
index 35ce84c8ca02..31d8046d945e 100644
--- a/include/linux/mailbox/zynqmp-ipi-message.h
+++ b/include/linux/mailbox/zynqmp-ipi-message.h
@@ -9,7 +9,7 @@
  * @data: message payload
  *
  * This is the structure for data used in mbox_send_message
- * the maximum length of data buffer is fixed to 12 bytes.
+ * the maximum length of data buffer is fixed to 32 bytes.
  * Client is supposed to be aware of this.
  */
 struct zynqmp_ipi_message {
-- 
cgit v1.2.3


From 2c854e5fcd7e243f5a7cf6a6afa0ef83060c903c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Mon, 13 Mar 2023 22:55:51 +0100
Subject: net: page_pool, skbuff: make skb_mark_for_recycle() always available

skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, this creates
unneeded complication when using it in the generic code. For now, it's
only used in the drivers always selecting Page Pool, so this works.
Move the guards so that preprocessor will cut out only the operation
itself and the function will still be a noop on !PAGE_POOL systems,
but available there as well.
No functional changes.

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230313215553.1045175-3-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..3f3a2a82a86b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 #endif
 }
 
-#ifdef CONFIG_PAGE_POOL
 static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
+#ifdef CONFIG_PAGE_POOL
 	skb->pp_recycle = 1;
-}
 #endif
+}
 
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
-- 
cgit v1.2.3


From d4e492338d11937c55841b1279287280d6e35894 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Mon, 13 Mar 2023 22:55:53 +0100
Subject: xdp: remove unused {__,}xdp_release_frame()

__xdp_build_skb_from_frame() was the last user of
{__,}xdp_release_frame(), which detaches pages from the page_pool.
All the consumers now recycle Page Pool skbs and page, except mlx5,
stmmac and tsnep drivers, which use page_pool_release_page() directly
(might change one day). It's safe to assume this functionality is not
needed anymore and can be removed (in favor of recycling).

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230313215553.1045175-5-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 29 -----------------------------
 net/core/xdp.c    | 15 ---------------
 2 files changed, 44 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index d517bfac937b..5393b3ebe56e 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -317,35 +317,6 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
 void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 			   struct xdp_frame_bulk *bq);
 
-/* When sending xdp_frame into the network stack, then there is no
- * return point callback, which is needed to release e.g. DMA-mapping
- * resources with page_pool.  Thus, have explicit function to release
- * frame resources.
- */
-void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
-static inline void xdp_release_frame(struct xdp_frame *xdpf)
-{
-	struct xdp_mem_info *mem = &xdpf->mem;
-	struct skb_shared_info *sinfo;
-	int i;
-
-	/* Curr only page_pool needs this */
-	if (mem->type != MEM_TYPE_PAGE_POOL)
-		return;
-
-	if (likely(!xdp_frame_has_frags(xdpf)))
-		goto out;
-
-	sinfo = xdp_get_shared_info_from_frame(xdpf);
-	for (i = 0; i < sinfo->nr_frags; i++) {
-		struct page *page = skb_frag_page(&sinfo->frags[i]);
-
-		__xdp_release_frame(page_address(page), mem);
-	}
-out:
-	__xdp_release_frame(xdpf->data, mem);
-}
-
 static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
 {
 	struct skb_shared_info *sinfo;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a2237cfca8e9..8d3ad315f18d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -531,21 +531,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
 
-/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
-void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
-{
-	struct xdp_mem_allocator *xa;
-	struct page *page;
-
-	rcu_read_lock();
-	xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-	page = virt_to_head_page(data);
-	if (xa)
-		page_pool_release_page(xa->page_pool, page);
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(__xdp_release_frame);
-
 void xdp_attachment_setup(struct xdp_attachment_info *info,
 			  struct netdev_bpf *bpf)
 {
-- 
cgit v1.2.3


From b071af523579df7341cabf0f16fc661125e9a13f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Mar 2023 20:17:31 +0000
Subject: neighbour: annotate lockless accesses to n->nud_state

We have many lockless accesses to n->nud_state.

Before adding another one in the following patch,
add annotations to readers and writers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_core.c  |  4 ++--
 include/net/neighbour.h         |  2 +-
 net/bridge/br_arp_nd_proxy.c    |  4 ++--
 net/bridge/br_netfilter_hooks.c |  3 ++-
 net/core/filter.c               |  4 ++--
 net/core/neighbour.c            | 28 ++++++++++++++--------------
 net/ipv4/arp.c                  |  8 ++++----
 net/ipv4/fib_semantics.c        |  4 ++--
 net/ipv4/nexthop.c              |  4 ++--
 net/ipv4/route.c                |  2 +-
 net/ipv6/ip6_output.c           |  2 +-
 net/ipv6/ndisc.c                |  4 ++--
 net/ipv6/route.c                |  2 +-
 13 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index b1b179effe2a..f2c30214cae8 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1863,7 +1863,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		struct vxlan_fdb *f;
 		struct sk_buff	*reply;
 
-		if (!(n->nud_state & NUD_CONNECTED)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
 			neigh_release(n);
 			goto out;
 		}
@@ -2027,7 +2027,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		struct vxlan_fdb *f;
 		struct sk_buff *reply;
 
-		if (!(n->nud_state & NUD_CONNECTED)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
 			neigh_release(n);
 			goto out;
 		}
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 234799ca527e..c8d39bba2a0d 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -464,7 +464,7 @@ static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
 
 	if (READ_ONCE(neigh->used) != now)
 		WRITE_ONCE(neigh->used, now);
-	if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
+	if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
 		return __neigh_event_send(neigh, skb, immediate_ok);
 	return 0;
 }
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index e5e48c6e35d7..b45c00c01dea 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -192,7 +192,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
 	if (n) {
 		struct net_bridge_fdb_entry *f;
 
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_release(n);
 			return;
 		}
@@ -452,7 +452,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
 	if (n) {
 		struct net_bridge_fdb_entry *f;
 
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_release(n);
 			return;
 		}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 638a4d5359db..3e3065bc0465 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -277,7 +277,8 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
 		struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 		int ret;
 
-		if ((neigh->nud_state & NUD_CONNECTED) && neigh->hh.hh_len) {
+		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
+		    READ_ONCE(neigh->hh.hh_len)) {
 			neigh_hh_bridge(&neigh->hh, skb);
 			skb->dev = nf_bridge->physindev;
 			ret = br_handle_frame_finish(net, sk, skb);
diff --git a/net/core/filter.c b/net/core/filter.c
index 50f649f1b4a9..d052fac28d02 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5871,7 +5871,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	else
 		neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
 
-	if (!neigh || !(neigh->nud_state & NUD_VALID))
+	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
 	memcpy(params->dmac, neigh->ha, ETH_ALEN);
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
@@ -5992,7 +5992,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 * not needed here.
 	 */
 	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
-	if (!neigh || !(neigh->nud_state & NUD_VALID))
+	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
 	memcpy(params->dmac, neigh->ha, ETH_ALEN);
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0116b0ff91a7..90d399b3f980 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1093,13 +1093,13 @@ static void neigh_timer_handler(struct timer_list *t)
 					  neigh->used +
 					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
 			neigh_dbg(2, "neigh %p is delayed\n", neigh);
-			neigh->nud_state = NUD_DELAY;
+			WRITE_ONCE(neigh->nud_state, NUD_DELAY);
 			neigh->updated = jiffies;
 			neigh_suspect(neigh);
 			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
 		} else {
 			neigh_dbg(2, "neigh %p is suspected\n", neigh);
-			neigh->nud_state = NUD_STALE;
+			WRITE_ONCE(neigh->nud_state, NUD_STALE);
 			neigh->updated = jiffies;
 			neigh_suspect(neigh);
 			notify = 1;
@@ -1109,14 +1109,14 @@ static void neigh_timer_handler(struct timer_list *t)
 				   neigh->confirmed +
 				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
 			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
-			neigh->nud_state = NUD_REACHABLE;
+			WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
 			neigh->updated = jiffies;
 			neigh_connect(neigh);
 			notify = 1;
 			next = neigh->confirmed + neigh->parms->reachable_time;
 		} else {
 			neigh_dbg(2, "neigh %p is probed\n", neigh);
-			neigh->nud_state = NUD_PROBE;
+			WRITE_ONCE(neigh->nud_state, NUD_PROBE);
 			neigh->updated = jiffies;
 			atomic_set(&neigh->probes, 0);
 			notify = 1;
@@ -1130,7 +1130,7 @@ static void neigh_timer_handler(struct timer_list *t)
 
 	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
 	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
-		neigh->nud_state = NUD_FAILED;
+		WRITE_ONCE(neigh->nud_state, NUD_FAILED);
 		notify = 1;
 		neigh_invalidate(neigh);
 		goto out;
@@ -1179,7 +1179,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 			atomic_set(&neigh->probes,
 				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
 			neigh_del_timer(neigh);
-			neigh->nud_state = NUD_INCOMPLETE;
+			WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
 			neigh->updated = now;
 			if (!immediate_ok) {
 				next = now + 1;
@@ -1191,7 +1191,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 			}
 			neigh_add_timer(neigh, next);
 		} else {
-			neigh->nud_state = NUD_FAILED;
+			WRITE_ONCE(neigh->nud_state, NUD_FAILED);
 			neigh->updated = jiffies;
 			write_unlock_bh(&neigh->lock);
 
@@ -1201,7 +1201,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 	} else if (neigh->nud_state & NUD_STALE) {
 		neigh_dbg(2, "neigh %p is delayed\n", neigh);
 		neigh_del_timer(neigh);
-		neigh->nud_state = NUD_DELAY;
+		WRITE_ONCE(neigh->nud_state, NUD_DELAY);
 		neigh->updated = jiffies;
 		neigh_add_timer(neigh, jiffies +
 				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
@@ -1313,7 +1313,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 	neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
 	if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
 		new = old & ~NUD_PERMANENT;
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		err = 0;
 		goto out;
 	}
@@ -1322,7 +1322,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 		neigh_del_timer(neigh);
 		if (old & NUD_CONNECTED)
 			neigh_suspect(neigh);
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		err = 0;
 		notify = old & NUD_VALID;
 		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1401,7 +1401,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 						((new & NUD_REACHABLE) ?
 						 neigh->parms->reachable_time :
 						 0)));
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		notify = 1;
 	}
 
@@ -1488,7 +1488,7 @@ void __neigh_set_probe_once(struct neighbour *neigh)
 	neigh->updated = jiffies;
 	if (!(neigh->nud_state & NUD_FAILED))
 		return;
-	neigh->nud_state = NUD_INCOMPLETE;
+	WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
 	atomic_set(&neigh->probes, neigh_max_probes(neigh));
 	neigh_add_timer(neigh,
 			jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
@@ -3198,7 +3198,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 			}
 			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
 				break;
-			if (n->nud_state & ~NUD_NOARP)
+			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
 			n = rcu_dereference_bh(n->next);
@@ -3240,7 +3240,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
 				break;
 
-			if (n->nud_state & ~NUD_NOARP)
+			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
 			n = rcu_dereference_bh(n->next);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4f7237661afb..9456f5bb35e5 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -375,7 +375,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
-		if (!(neigh->nud_state & NUD_VALID))
+		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
 			pr_debug("trying to ucast probe in NUD_INVALID\n");
 		neigh_ha_snapshot(dst_ha, neigh, dev);
 		dst_hw = dst_ha;
@@ -1123,7 +1123,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		if (!(neigh->nud_state & NUD_NOARP)) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) {
 			read_lock_bh(&neigh->lock);
 			memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
 			r->arp_flags = arp_state_to_flags(neigh);
@@ -1144,12 +1144,12 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
 	struct neigh_table *tbl = &arp_tbl;
 
 	if (neigh) {
-		if ((neigh->nud_state & NUD_VALID) && !force) {
+		if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
 			neigh_release(neigh);
 			return 0;
 		}
 
-		if (neigh->nud_state & ~NUD_NOARP)
+		if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN, 0);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3bb890a40ed7..574ff450c4d2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -563,7 +563,7 @@ static int fib_detect_death(struct fib_info *fi, int order,
 		n = NULL;
 
 	if (n) {
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 		neigh_release(n);
 	} else {
 		return 0;
@@ -2202,7 +2202,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 		else
 			n = NULL;
 		if (n)
-			state = n->nud_state;
+			state = READ_ONCE(n->nud_state);
 
 		rcu_read_unlock_bh();
 	}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d8ef05347fd9..e28a99f1996b 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1128,7 +1128,7 @@ static bool ipv6_good_nh(const struct fib6_nh *nh)
 
 	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 	if (n)
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 
 	rcu_read_unlock_bh();
 
@@ -1145,7 +1145,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
 	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 				      (__force u32)nh->fib_nh_gw4);
 	if (n)
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 
 	rcu_read_unlock_bh();
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index de6e3515ab4f..232009d216c4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,7 +784,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 	if (!n)
 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 	if (!IS_ERR(n)) {
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_event_send(n, NULL);
 		} else {
 			if (fib_lookup(net, fl4, &res, 0) == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4ce3f9d3bc8a..e5ed39a3c65f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1153,7 +1153,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	rcu_read_lock_bh();
 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 				      rt6_nexthop(rt, &fl6->daddr));
-	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
+	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
 	rcu_read_unlock_bh();
 
 	if (err) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c4be62c99f73..18634ebd20a4 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -745,7 +745,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 		saddr = &ipv6_hdr(skb)->saddr;
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
-		if (!(neigh->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
 			ND_PRINTK(1, dbg,
 				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
 				  __func__, target);
@@ -1090,7 +1090,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		u8 old_flags = neigh->flags;
 		struct net *net = dev_net(dev);
 
-		if (neigh->nud_state & NUD_FAILED)
+		if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
 			goto out;
 
 		/*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fdb03df2287..25c00c6f5131 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -638,7 +638,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 	idev = __in6_dev_get(dev);
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
-		if (neigh->nud_state & NUD_VALID)
+		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
 			goto out;
 
 		write_lock(&neigh->lock);
-- 
cgit v1.2.3


From f947568e258038d3c2f8f38a9a7dabaca36643ec Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.ibm.com>
Date: Mon, 13 Mar 2023 11:10:31 +0100
Subject: net/smc: Introduce explicit check for v2 support

Previously, v2 support was derived from a very specific format of the SEID
as part of the SMC-D codebase. Make this part of the SMC-D device API, so
implementers do not need to adhere to a specific SEID format.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Reviewed-and-tested-by: Jan Karcher <jaka@linux.ibm.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Signed-off-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism_drv.c | 7 +++++++
 include/net/smc.h          | 1 +
 net/smc/smc_ism.c          | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index eb7e13486087..1c73d32966f1 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -842,6 +842,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
 	return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size);
 }
 
+static int smcd_supports_v2(void)
+{
+	return SYSTEM_EID.serial_number[0] != '0' ||
+		SYSTEM_EID.type[0] != '0';
+}
+
 static u64 smcd_get_local_gid(struct smcd_dev *smcd)
 {
 	return ism_get_local_gid(smcd->priv);
@@ -869,6 +875,7 @@ static const struct smcd_ops ism_ops = {
 	.reset_vlan_required = smcd_reset_vlan_required,
 	.signal_event = smcd_signal_ieq,
 	.move_data = smcd_move,
+	.supports_v2 = smcd_supports_v2,
 	.get_system_eid = ism_get_seid,
 	.get_local_gid = smcd_get_local_gid,
 	.get_chid = smcd_get_chid,
diff --git a/include/net/smc.h b/include/net/smc.h
index 597cb9381182..a002552be29c 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -67,6 +67,7 @@ struct smcd_ops {
 	int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
 			 bool sf, unsigned int offset, void *data,
 			 unsigned int size);
+	int (*supports_v2)(void);
 	u8* (*get_system_eid)(void);
 	u64 (*get_local_gid)(struct smcd_dev *dev);
 	u16 (*get_chid)(struct smcd_dev *dev);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 3b0b7710c6b0..fbee2493091f 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -429,7 +429,7 @@ static void smcd_register_dev(struct ism_dev *ism)
 		u8 *system_eid = NULL;
 
 		system_eid = smcd->ops->get_system_eid();
-		if (system_eid[24] != '0' || system_eid[28] != '0') {
+		if (smcd->ops->supports_v2()) {
 			smc_ism_v2_capable = true;
 			memcpy(smc_ism_v2_system_eid, system_eid,
 			       SMC_MAX_EID_LEN);
-- 
cgit v1.2.3


From a02d83f9947d8f71904eda4de046630c3eb6802c Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Mon, 13 Mar 2023 12:32:11 +0100
Subject: scm: fix MSG_CTRUNC setting condition for SO_PASSSEC

Currently, kernel would set MSG_CTRUNC flag if msg_control buffer
wasn't provided and SO_PASSCRED was set or if there was pending SCM_RIGHTS.

For some reason we have no corresponding check for SO_PASSSEC.

In the recvmsg(2) doc we have:
       MSG_CTRUNC
              indicates that some control data was discarded due to lack
              of space in the buffer for ancillary data.

So, we need to set MSG_CTRUNC flag for all types of SCM.

This change can break applications those don't check MSG_CTRUNC flag.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Leon Romanovsky <leon@kernel.org>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>

v2:
- commit message was rewritten according to Eric's suggestion
Acked-by: Paul Moore <paul@paul-moore.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/scm.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/scm.h b/include/net/scm.h
index 1ce365f4c256..585adc1346bd 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -105,16 +105,27 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
 		}
 	}
 }
+
+static inline bool scm_has_secdata(struct socket *sock)
+{
+	return test_bit(SOCK_PASSSEC, &sock->flags);
+}
 #else
 static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
 { }
+
+static inline bool scm_has_secdata(struct socket *sock)
+{
+	return false;
+}
 #endif /* CONFIG_SECURITY_NETWORK */
 
 static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 				struct scm_cookie *scm, int flags)
 {
 	if (!msg->msg_control) {
-		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp)
+		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp ||
+		    scm_has_secdata(sock))
 			msg->msg_flags |= MSG_CTRUNC;
 		scm_destroy(scm);
 		return;
-- 
cgit v1.2.3


From 84706e9a75ffc3c950424bdfea06cabb4101a6b4 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 14 Mar 2023 09:54:00 +0800
Subject: soundwire: intel: add sync_arm/sync_go to ops

The bus start/stop sequences can be reused between platforms if we add
a couple of new callbacks. In following patches the code will be moved to
a shared file.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20230314015410.487311-7-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/intel.c           | 16 ++++++++++------
 drivers/soundwire/intel.h           | 20 ++++++++++++++++++++
 include/linux/soundwire/sdw_intel.h |  8 ++++++++
 3 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 6fdb10117e59..902934cbb27b 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -686,7 +686,7 @@ static int intel_pre_bank_switch(struct sdw_intel *sdw)
 	if (!bus->multi_link)
 		return 0;
 
-	intel_shim_sync_arm(sdw);
+	sdw_intel_sync_arm(sdw);
 
 	return 0;
 }
@@ -720,7 +720,7 @@ static int intel_post_bank_switch(struct sdw_intel *sdw)
 		goto unlock;
 	}
 
-	ret = intel_shim_sync_go_unlocked(sdw);
+	ret = sdw_intel_sync_go_unlocked(sdw);
 unlock:
 	mutex_unlock(sdw->link_res->shim_lock);
 
@@ -1140,7 +1140,7 @@ static int intel_start_bus(struct sdw_intel *sdw)
 	 * gsync is enabled
 	 */
 	if (bus->multi_link)
-		intel_shim_sync_arm(sdw);
+		sdw_intel_sync_arm(sdw);
 
 	ret = sdw_cdns_init(cdns);
 	if (ret < 0) {
@@ -1155,7 +1155,7 @@ static int intel_start_bus(struct sdw_intel *sdw)
 	}
 
 	if (bus->multi_link) {
-		ret = intel_shim_sync_go(sdw);
+		ret = sdw_intel_sync_go(sdw);
 		if (ret < 0) {
 			dev_err(dev, "%s: sync go failed: %d\n", __func__, ret);
 			goto err_interrupt;
@@ -1210,7 +1210,7 @@ static int intel_start_bus_after_reset(struct sdw_intel *sdw)
 		 * timeouts when gsync is enabled
 		 */
 		if (bus->multi_link)
-			intel_shim_sync_arm(sdw);
+			sdw_intel_sync_arm(sdw);
 
 		/*
 		 * Re-initialize the IP since it was powered-off
@@ -1239,7 +1239,7 @@ static int intel_start_bus_after_reset(struct sdw_intel *sdw)
 		}
 
 		if (bus->multi_link) {
-			ret = intel_shim_sync_go(sdw);
+			ret = sdw_intel_sync_go(sdw);
 			if (ret < 0) {
 				dev_err(sdw->cdns.dev, "sync go failed during resume\n");
 				goto err_interrupt;
@@ -1342,6 +1342,10 @@ const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops = {
 
 	.pre_bank_switch = intel_pre_bank_switch,
 	.post_bank_switch = intel_post_bank_switch,
+
+	.sync_arm = intel_shim_sync_arm,
+	.sync_go_unlocked = intel_shim_sync_go_unlocked,
+	.sync_go = intel_shim_sync_go,
 };
 EXPORT_SYMBOL_NS(sdw_intel_cnl_hw_ops, SOUNDWIRE_INTEL);
 
diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index 089c41babfc1..28b21a92e28b 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -167,4 +167,24 @@ static inline void sdw_intel_shim_wake(struct sdw_intel *sdw, bool wake_enable)
 		SDW_INTEL_OPS(sdw, shim_wake)(sdw, wake_enable);
 }
 
+static inline void sdw_intel_sync_arm(struct sdw_intel *sdw)
+{
+	if (SDW_INTEL_CHECK_OPS(sdw, sync_arm))
+		SDW_INTEL_OPS(sdw, sync_arm)(sdw);
+}
+
+static inline int sdw_intel_sync_go_unlocked(struct sdw_intel *sdw)
+{
+	if (SDW_INTEL_CHECK_OPS(sdw, sync_go_unlocked))
+		return SDW_INTEL_OPS(sdw, sync_go_unlocked)(sdw);
+	return -ENOTSUPP;
+}
+
+static inline int sdw_intel_sync_go(struct sdw_intel *sdw)
+{
+	if (SDW_INTEL_CHECK_OPS(sdw, sync_go))
+		return SDW_INTEL_OPS(sdw, sync_go)(sdw);
+	return -ENOTSUPP;
+}
+
 #endif /* __SDW_INTEL_LOCAL_H */
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 91f0dc564fe5..06fa30929ebd 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -309,6 +309,10 @@ struct sdw_intel;
  * @shim_wake: enable/disable in-band wake management
  * @pre_bank_switch: helper for bus management
  * @post_bank_switch: helper for bus management
+ * @sync_arm: helper for multi-link synchronization
+ * @sync_go_unlocked: helper for multi-link synchronization -
+ * shim_lock is assumed to be locked at higher level
+ * @sync_go: helper for multi-link synchronization
  */
 struct sdw_intel_hw_ops {
 	void (*debugfs_init)(struct sdw_intel *sdw);
@@ -330,6 +334,10 @@ struct sdw_intel_hw_ops {
 
 	int (*pre_bank_switch)(struct sdw_intel *sdw);
 	int (*post_bank_switch)(struct sdw_intel *sdw);
+
+	void (*sync_arm)(struct sdw_intel *sdw);
+	int (*sync_go_unlocked)(struct sdw_intel *sdw);
+	int (*sync_go)(struct sdw_intel *sdw);
 };
 
 extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops;
-- 
cgit v1.2.3


From 1e76de2e5dfeff17f13afe8146449fec3f5b69f7 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 14 Mar 2023 09:54:03 +0800
Subject: soundwire: intel: add abstraction for cmdsync check

If we add one more callback, we can have common bank switch sequences
between old and new hardware: the only difference is where the CMDSYNC
register is located.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20230314015410.487311-10-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/intel.c           | 24 +++++++++++++-----------
 drivers/soundwire/intel.h           |  7 +++++++
 include/linux/soundwire/sdw_intel.h |  3 +++
 3 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 77d698908595..1131ecb4b5e7 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -325,6 +325,15 @@ static void intel_shim_wake(struct sdw_intel *sdw, bool wake_enable)
 	mutex_unlock(sdw->link_res->shim_lock);
 }
 
+static bool intel_check_cmdsync_unlocked(struct sdw_intel *sdw)
+{
+	void __iomem *shim = sdw->link_res->shim;
+	int sync_reg;
+
+	sync_reg = intel_readl(shim, SDW_SHIM_SYNC);
+	return !!(sync_reg & SDW_SHIM_SYNC_CMDSYNC_MASK);
+}
+
 static int intel_link_power_up(struct sdw_intel *sdw)
 {
 	unsigned int link_id = sdw->instance;
@@ -695,8 +704,7 @@ static int intel_post_bank_switch(struct sdw_intel *sdw)
 {
 	struct sdw_cdns *cdns = &sdw->cdns;
 	struct sdw_bus *bus = &cdns->bus;
-	void __iomem *shim = sdw->link_res->shim;
-	int sync_reg, ret;
+	int ret = 0;
 
 	/* Write to register only for multi-link */
 	if (!bus->multi_link)
@@ -704,9 +712,6 @@ static int intel_post_bank_switch(struct sdw_intel *sdw)
 
 	mutex_lock(sdw->link_res->shim_lock);
 
-	/* Read SYNC register */
-	sync_reg = intel_readl(shim, SDW_SHIM_SYNC);
-
 	/*
 	 * post_bank_switch() ops is called from the bus in loop for
 	 * all the Masters in the steam with the expectation that
@@ -715,13 +720,9 @@ static int intel_post_bank_switch(struct sdw_intel *sdw)
 	 *
 	 * So, set the SYNCGO bit only if CMDSYNC bit is set for any Master.
 	 */
-	if (!(sync_reg & SDW_SHIM_SYNC_CMDSYNC_MASK)) {
-		ret = 0;
-		goto unlock;
-	}
+	if (sdw_intel_sync_check_cmdsync_unlocked(sdw))
+		ret = sdw_intel_sync_go_unlocked(sdw);
 
-	ret = sdw_intel_sync_go_unlocked(sdw);
-unlock:
 	mutex_unlock(sdw->link_res->shim_lock);
 
 	if (ret < 0)
@@ -1147,6 +1148,7 @@ const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops = {
 	.sync_arm = intel_shim_sync_arm,
 	.sync_go_unlocked = intel_shim_sync_go_unlocked,
 	.sync_go = intel_shim_sync_go,
+	.sync_check_cmdsync_unlocked = intel_check_cmdsync_unlocked,
 };
 EXPORT_SYMBOL_NS(sdw_intel_cnl_hw_ops, SOUNDWIRE_INTEL);
 
diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index abd1a500defa..7a69cf755954 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -187,6 +187,13 @@ static inline int sdw_intel_sync_go(struct sdw_intel *sdw)
 	return -ENOTSUPP;
 }
 
+static inline bool sdw_intel_sync_check_cmdsync_unlocked(struct sdw_intel *sdw)
+{
+	if (SDW_INTEL_CHECK_OPS(sdw, sync_check_cmdsync_unlocked))
+		return SDW_INTEL_OPS(sdw, sync_check_cmdsync_unlocked)(sdw);
+	return false;
+}
+
 /* common bus management */
 int intel_start_bus(struct sdw_intel *sdw);
 int intel_start_bus_after_reset(struct sdw_intel *sdw);
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 06fa30929ebd..207701aeeb47 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -313,6 +313,8 @@ struct sdw_intel;
  * @sync_go_unlocked: helper for multi-link synchronization -
  * shim_lock is assumed to be locked at higher level
  * @sync_go: helper for multi-link synchronization
+ * @sync_check_cmdsync_unlocked: helper for multi-link synchronization
+ * and bank switch - shim_lock is assumed to be locked at higher level
  */
 struct sdw_intel_hw_ops {
 	void (*debugfs_init)(struct sdw_intel *sdw);
@@ -338,6 +340,7 @@ struct sdw_intel_hw_ops {
 	void (*sync_arm)(struct sdw_intel *sdw);
 	int (*sync_go_unlocked)(struct sdw_intel *sdw);
 	int (*sync_go)(struct sdw_intel *sdw);
+	bool (*sync_check_cmdsync_unlocked)(struct sdw_intel *sdw);
 };
 
 extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops;
-- 
cgit v1.2.3


From f26e18bda9e3b069fbb07a2827597952a6d0afe0 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 15 Mar 2023 11:17:26 +0100
Subject: dt-bindings: arm: qcom,ids: Add IDs for QCM2290/QRB2210

Add the missing IDs for scuba and its QRB variant.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230314-topic-scuba_socinfo-v2-1-44fa1256aa6d@linaro.org
---
 include/dt-bindings/arm/qcom,ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index c3c078a8c0da..197873ed3a17 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -213,6 +213,7 @@
 #define QCOM_ID_QCM2150			436
 #define QCOM_ID_SDA429W			437
 #define QCOM_ID_SM8350			439
+#define QCOM_ID_QCM2290			441
 #define QCOM_ID_SM6115			444
 #define QCOM_ID_SC8280XP		449
 #define QCOM_ID_IPQ6005			453
@@ -229,6 +230,7 @@
 #define QCOM_ID_SC7180P			495
 #define QCOM_ID_SM6375			507
 #define QCOM_ID_SM8550			519
+#define QCOM_ID_QRB2210			524
 #define QCOM_ID_SA8775P			534
 #define QCOM_ID_QRU1000			539
 #define QCOM_ID_QDU1000			545
-- 
cgit v1.2.3


From ee13b5008707948d3052c1b5aab485c6cd53658e Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Tue, 14 Mar 2023 13:34:41 +0530
Subject: qcom: llcc/edac: Fix the base address used for accessing LLCC banks

The Qualcomm LLCC/EDAC drivers were using a fixed register stride for
accessing the (Control and Status Registers) CSRs of each LLCC bank.
This stride only works for some SoCs like SDM845 for which driver
support was initially added.

But the later SoCs use different register stride that vary between the
banks with holes in-between. So it is not possible to use a single register
stride for accessing the CSRs of each bank. By doing so could result in a
crash.

For fixing this issue, let's obtain the base address of each LLCC bank from
devicetree and get rid of the fixed stride. This also means, there is no
need to rely on reg-names property and the base addresses can be obtained
using the index.

First index is LLCC bank 0 and last index is LLCC broadcast. If the SoC
supports more than one bank, then those need to be defined in devicetree
for index from 1..N-1.

Reported-by: Parikshit Pareek <quic_ppareek@quicinc.com>
Tested-by: Luca Weiss <luca.weiss@fairphone.com>
Tested-by: Steev Klimaszewski <steev@kali.org> # Thinkpad X13s
Tested-by: Andrew Halaney <ahalaney@redhat.com> # sa8540p-ride
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230314080443.64635-13-manivannan.sadhasivam@linaro.org
---
 drivers/edac/qcom_edac.c           | 14 +++-----
 drivers/soc/qcom/llcc-qcom.c       | 72 ++++++++++++++++++++++----------------
 include/linux/soc/qcom/llcc-qcom.h |  6 ++--
 3 files changed, 48 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c
index 3256254c3722..1d3cc1930a74 100644
--- a/drivers/edac/qcom_edac.c
+++ b/drivers/edac/qcom_edac.c
@@ -213,7 +213,7 @@ dump_syn_reg_values(struct llcc_drv_data *drv, u32 bank, int err_type)
 
 	for (i = 0; i < reg_data.reg_cnt; i++) {
 		synd_reg = reg_data.synd_reg + (i * 4);
-		ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg,
+		ret = regmap_read(drv->regmaps[bank], synd_reg,
 				  &synd_val);
 		if (ret)
 			goto clear;
@@ -222,8 +222,7 @@ dump_syn_reg_values(struct llcc_drv_data *drv, u32 bank, int err_type)
 			    reg_data.name, i, synd_val);
 	}
 
-	ret = regmap_read(drv->regmap,
-			  drv->offsets[bank] + reg_data.count_status_reg,
+	ret = regmap_read(drv->regmaps[bank], reg_data.count_status_reg,
 			  &err_cnt);
 	if (ret)
 		goto clear;
@@ -233,8 +232,7 @@ dump_syn_reg_values(struct llcc_drv_data *drv, u32 bank, int err_type)
 	edac_printk(KERN_CRIT, EDAC_LLCC, "%s: Error count: 0x%4x\n",
 		    reg_data.name, err_cnt);
 
-	ret = regmap_read(drv->regmap,
-			  drv->offsets[bank] + reg_data.ways_status_reg,
+	ret = regmap_read(drv->regmaps[bank], reg_data.ways_status_reg,
 			  &err_ways);
 	if (ret)
 		goto clear;
@@ -296,8 +294,7 @@ llcc_ecc_irq_handler(int irq, void *edev_ctl)
 
 	/* Iterate over the banks and look for Tag RAM or Data RAM errors */
 	for (i = 0; i < drv->num_banks; i++) {
-		ret = regmap_read(drv->regmap,
-				  drv->offsets[i] + DRP_INTERRUPT_STATUS,
+		ret = regmap_read(drv->regmaps[i], DRP_INTERRUPT_STATUS,
 				  &drp_error);
 
 		if (!ret && (drp_error & SB_ECC_ERROR)) {
@@ -312,8 +309,7 @@ llcc_ecc_irq_handler(int irq, void *edev_ctl)
 		if (!ret)
 			irq_rc = IRQ_HANDLED;
 
-		ret = regmap_read(drv->regmap,
-				  drv->offsets[i] + TRP_INTERRUPT_0_STATUS,
+		ret = regmap_read(drv->regmaps[i], TRP_INTERRUPT_0_STATUS,
 				  &trp_error);
 
 		if (!ret && (trp_error & SB_ECC_ERROR)) {
diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index 23ce2f78c4ed..72f3f2a9aaa0 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -62,8 +62,6 @@
 #define LLCC_TRP_WRSC_CACHEABLE_EN    0x21f2c
 #define LLCC_TRP_ALGO_CFG8	      0x21f30
 
-#define BANK_OFFSET_STRIDE	      0x80000
-
 #define LLCC_VERSION_2_0_0_0          0x02000000
 #define LLCC_VERSION_2_1_0_0          0x02010000
 #define LLCC_VERSION_4_1_0_0          0x04010000
@@ -898,8 +896,8 @@ static int qcom_llcc_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static struct regmap *qcom_llcc_init_mmio(struct platform_device *pdev,
-		const char *name)
+static struct regmap *qcom_llcc_init_mmio(struct platform_device *pdev, u8 index,
+					  const char *name)
 {
 	void __iomem *base;
 	struct regmap_config llcc_regmap_config = {
@@ -909,7 +907,7 @@ static struct regmap *qcom_llcc_init_mmio(struct platform_device *pdev,
 		.fast_io = true,
 	};
 
-	base = devm_platform_ioremap_resource_byname(pdev, name);
+	base = devm_platform_ioremap_resource(pdev, index);
 	if (IS_ERR(base))
 		return ERR_CAST(base);
 
@@ -927,6 +925,7 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 	const struct llcc_slice_config *llcc_cfg;
 	u32 sz;
 	u32 version;
+	struct regmap *regmap;
 
 	drv_data = devm_kzalloc(dev, sizeof(*drv_data), GFP_KERNEL);
 	if (!drv_data) {
@@ -934,21 +933,51 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	drv_data->regmap = qcom_llcc_init_mmio(pdev, "llcc_base");
-	if (IS_ERR(drv_data->regmap)) {
-		ret = PTR_ERR(drv_data->regmap);
+	/* Initialize the first LLCC bank regmap */
+	regmap = qcom_llcc_init_mmio(pdev, 0, "llcc0_base");
+	if (IS_ERR(regmap)) {
+		ret = PTR_ERR(regmap);
 		goto err;
 	}
 
-	drv_data->bcast_regmap =
-		qcom_llcc_init_mmio(pdev, "llcc_broadcast_base");
+	cfg = of_device_get_match_data(&pdev->dev);
+
+	ret = regmap_read(regmap, cfg->reg_offset[LLCC_COMMON_STATUS0], &num_banks);
+	if (ret)
+		goto err;
+
+	num_banks &= LLCC_LB_CNT_MASK;
+	num_banks >>= LLCC_LB_CNT_SHIFT;
+	drv_data->num_banks = num_banks;
+
+	drv_data->regmaps = devm_kcalloc(dev, num_banks, sizeof(*drv_data->regmaps), GFP_KERNEL);
+	if (!drv_data->regmaps) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	drv_data->regmaps[0] = regmap;
+
+	/* Initialize rest of LLCC bank regmaps */
+	for (i = 1; i < num_banks; i++) {
+		char *base = kasprintf(GFP_KERNEL, "llcc%d_base", i);
+
+		drv_data->regmaps[i] = qcom_llcc_init_mmio(pdev, i, base);
+		if (IS_ERR(drv_data->regmaps[i])) {
+			ret = PTR_ERR(drv_data->regmaps[i]);
+			kfree(base);
+			goto err;
+		}
+
+		kfree(base);
+	}
+
+	drv_data->bcast_regmap = qcom_llcc_init_mmio(pdev, i, "llcc_broadcast_base");
 	if (IS_ERR(drv_data->bcast_regmap)) {
 		ret = PTR_ERR(drv_data->bcast_regmap);
 		goto err;
 	}
 
-	cfg = of_device_get_match_data(&pdev->dev);
-
 	/* Extract version of the IP */
 	ret = regmap_read(drv_data->bcast_regmap, cfg->reg_offset[LLCC_COMMON_HW_INFO],
 			  &version);
@@ -957,15 +986,6 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 
 	drv_data->version = version;
 
-	ret = regmap_read(drv_data->regmap, cfg->reg_offset[LLCC_COMMON_STATUS0],
-			  &num_banks);
-	if (ret)
-		goto err;
-
-	num_banks &= LLCC_LB_CNT_MASK;
-	num_banks >>= LLCC_LB_CNT_SHIFT;
-	drv_data->num_banks = num_banks;
-
 	llcc_cfg = cfg->sct_data;
 	sz = cfg->size;
 
@@ -973,16 +993,6 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 		if (llcc_cfg[i].slice_id > drv_data->max_slices)
 			drv_data->max_slices = llcc_cfg[i].slice_id;
 
-	drv_data->offsets = devm_kcalloc(dev, num_banks, sizeof(u32),
-							GFP_KERNEL);
-	if (!drv_data->offsets) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < num_banks; i++)
-		drv_data->offsets[i] = i * BANK_OFFSET_STRIDE;
-
 	drv_data->bitmap = devm_bitmap_zalloc(dev, drv_data->max_slices,
 					      GFP_KERNEL);
 	if (!drv_data->bitmap) {
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index ad1fd718169d..423220e66026 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -120,7 +120,7 @@ struct llcc_edac_reg_offset {
 
 /**
  * struct llcc_drv_data - Data associated with the llcc driver
- * @regmap: regmap associated with the llcc device
+ * @regmaps: regmaps associated with the llcc device
  * @bcast_regmap: regmap associated with llcc broadcast offset
  * @cfg: pointer to the data structure for slice configuration
  * @edac_reg_offset: Offset of the LLCC EDAC registers
@@ -129,12 +129,11 @@ struct llcc_edac_reg_offset {
  * @max_slices: max slices as read from device tree
  * @num_banks: Number of llcc banks
  * @bitmap: Bit map to track the active slice ids
- * @offsets: Pointer to the bank offsets array
  * @ecc_irq: interrupt for llcc cache error detection and reporting
  * @version: Indicates the LLCC version
  */
 struct llcc_drv_data {
-	struct regmap *regmap;
+	struct regmap **regmaps;
 	struct regmap *bcast_regmap;
 	const struct llcc_slice_config *cfg;
 	const struct llcc_edac_reg_offset *edac_reg_offset;
@@ -143,7 +142,6 @@ struct llcc_drv_data {
 	u32 max_slices;
 	u32 num_banks;
 	unsigned long *bitmap;
-	u32 *offsets;
 	int ecc_irq;
 	u32 version;
 };
-- 
cgit v1.2.3


From fd972da1b228974f788c115d37abe209828ca5a9 Mon Sep 17 00:00:00 2001
From: Varadarajan Narayanan <quic_varada@quicinc.com>
Date: Tue, 14 Mar 2023 11:43:33 +0530
Subject: dt-bindings: arm: qcom,ids: Add IDs for IPQ9574 and its variants

Add SOC ID for Qualcomm IPQ9574, IPQ9570, IPQ9554, IPQ9550,
IPQ9514 and IPQ9510

Signed-off-by: Varadarajan Narayanan <quic_varada@quicinc.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Kathiravan T <quic_kathirav@quicinc.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/1678774414-14414-2-git-send-email-quic_varada@quicinc.com
---
 include/dt-bindings/arm/qcom,ids.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index 197873ed3a17..c3cc440dec33 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -229,7 +229,13 @@
 #define QCOM_ID_SC7280			487
 #define QCOM_ID_SC7180P			495
 #define QCOM_ID_SM6375			507
+#define QCOM_ID_IPQ9514			510
+#define QCOM_ID_IPQ9550			511
+#define QCOM_ID_IPQ9554			512
+#define QCOM_ID_IPQ9570			513
+#define QCOM_ID_IPQ9574			514
 #define QCOM_ID_SM8550			519
+#define QCOM_ID_IPQ9510			521
 #define QCOM_ID_QRB2210			524
 #define QCOM_ID_SA8775P			534
 #define QCOM_ID_QRU1000			539
-- 
cgit v1.2.3


From ee6ae544ddfabe60245d4eaee41a7330acd46d94 Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Date: Wed, 15 Mar 2023 21:31:50 +0530
Subject: dt-bindings: arm: qcom,ids: Add IDs for QRB4210

Add the ID for QRB4210 variant.

Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230315160151.2166861-2-bhupesh.sharma@linaro.org
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index c3cc440dec33..cd7a95756ea1 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -236,6 +236,7 @@
 #define QCOM_ID_IPQ9574			514
 #define QCOM_ID_SM8550			519
 #define QCOM_ID_IPQ9510			521
+#define QCOM_ID_QRB4210			523
 #define QCOM_ID_QRB2210			524
 #define QCOM_ID_SA8775P			534
 #define QCOM_ID_QRU1000			539
-- 
cgit v1.2.3


From de7aeee0d942d2e1d85b3db3652e037af38e24d7 Mon Sep 17 00:00:00 2001
From: David Wronek <davidwronek@gmail.com>
Date: Sun, 5 Mar 2023 22:17:44 +0300
Subject: dt-bindings: arm: qcom,ids: Add Soc ID for SM7150

Add the ID for the Qualcomm SM7150 SoC.

Signed-off-by: David Wronek <davidwronek@gmail.com>
Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230305191745.386862-2-danila@jiaxyga.com
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index cd7a95756ea1..802495b20276 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -192,6 +192,7 @@
 #define QCOM_ID_SA8155			362
 #define QCOM_ID_SDA439			363
 #define QCOM_ID_SDA429			364
+#define QCOM_ID_SM7150			365
 #define QCOM_ID_IPQ8070			375
 #define QCOM_ID_IPQ8071			376
 #define QCOM_ID_QM215			386
-- 
cgit v1.2.3


From f99cdbd858df8457bcffc3e90fa6939d517193c5 Mon Sep 17 00:00:00 2001
From: Kathiravan T <quic_kathirav@quicinc.com>
Date: Tue, 7 Mar 2023 11:52:26 +0530
Subject: dt-bindings: clock: Add Qualcomm IPQ5332 GCC

Add binding for the Qualcomm IPQ5332 Global Clock Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Kathiravan T <quic_kathirav@quicinc.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230307062232.4889-4-quic_kathirav@quicinc.com
---
 .../bindings/clock/qcom,ipq5332-gcc.yaml           |  53 +++
 include/dt-bindings/clock/qcom,ipq5332-gcc.h       | 356 +++++++++++++++++++++
 2 files changed, 409 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,ipq5332-gcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml
new file mode 100644
index 000000000000..718fe0625424
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,ipq5332-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ5332
+
+maintainers:
+  - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+  Qualcomm global clock control module provides the clocks, resets and power
+  domains on IPQ5332.
+
+  See also:: include/dt-bindings/clock/qcom,gcc-ipq5332.h
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+properties:
+  compatible:
+    const: qcom,ipq5332-gcc
+
+  clocks:
+    items:
+      - description: Board XO clock source
+      - description: Sleep clock source
+      - description: PCIE 2lane PHY pipe clock source
+      - description: PCIE 2lane x1 PHY pipe clock source (For second lane)
+      - description: USB PCIE wrapper pipe clock source
+
+required:
+  - compatible
+  - clocks
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    clock-controller@1800000 {
+      compatible = "qcom,ipq5332-gcc";
+      reg = <0x01800000 0x80000>;
+      clocks = <&xo_board>,
+               <&sleep_clk>,
+               <&pcie_2lane_phy_pipe_clk>,
+               <&pcie_2lane_phy_pipe_clk_x1>,
+               <&usb_pcie_wrapper_pipe_clk>;
+      #clock-cells = <1>;
+      #power-domain-cells = <1>;
+      #reset-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,ipq5332-gcc.h b/include/dt-bindings/clock/qcom,ipq5332-gcc.h
new file mode 100644
index 000000000000..8a405a0a96d0
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,ipq5332-gcc.h
@@ -0,0 +1,356 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GCC_IPQ5332_H
+#define _DT_BINDINGS_CLK_QCOM_GCC_IPQ5332_H
+
+#define GPLL0_MAIN					0
+#define GPLL0						1
+#define GPLL2_MAIN					2
+#define GPLL2						3
+#define GPLL4_MAIN					4
+#define GPLL4						5
+#define GCC_ADSS_PWM_CLK				6
+#define GCC_ADSS_PWM_CLK_SRC				7
+#define GCC_AHB_CLK					8
+#define GCC_APSS_AXI_CLK_SRC				9
+#define GCC_BLSP1_AHB_CLK				10
+#define GCC_BLSP1_QUP1_I2C_APPS_CLK			11
+#define GCC_BLSP1_QUP1_SPI_APPS_CLK			12
+#define GCC_BLSP1_QUP1_SPI_APPS_CLK_SRC			13
+#define GCC_BLSP1_QUP2_I2C_APPS_CLK			14
+#define GCC_BLSP1_QUP2_SPI_APPS_CLK			15
+#define GCC_BLSP1_QUP2_SPI_APPS_CLK_SRC			16
+#define GCC_BLSP1_QUP3_I2C_APPS_CLK			17
+#define GCC_BLSP1_QUP3_SPI_APPS_CLK			18
+#define GCC_BLSP1_QUP3_SPI_APPS_CLK_SRC			19
+#define GCC_BLSP1_SLEEP_CLK				20
+#define GCC_BLSP1_UART1_APPS_CLK			21
+#define GCC_BLSP1_UART1_APPS_CLK_SRC			22
+#define GCC_BLSP1_UART2_APPS_CLK			23
+#define GCC_BLSP1_UART2_APPS_CLK_SRC			24
+#define GCC_BLSP1_UART3_APPS_CLK			25
+#define GCC_BLSP1_UART3_APPS_CLK_SRC			26
+#define GCC_CE_AHB_CLK					27
+#define GCC_CE_AXI_CLK					28
+#define GCC_CE_PCNOC_AHB_CLK				29
+#define GCC_CMN_12GPLL_AHB_CLK				30
+#define GCC_CMN_12GPLL_APU_CLK				31
+#define GCC_CMN_12GPLL_SYS_CLK				32
+#define GCC_GP1_CLK					33
+#define GCC_GP1_CLK_SRC					34
+#define GCC_GP2_CLK					35
+#define GCC_GP2_CLK_SRC					36
+#define GCC_LPASS_CORE_AXIM_CLK				37
+#define GCC_LPASS_SWAY_CLK				38
+#define GCC_LPASS_SWAY_CLK_SRC				39
+#define GCC_MDIO_AHB_CLK				40
+#define GCC_MDIO_SLAVE_AHB_CLK				41
+#define GCC_MEM_NOC_Q6_AXI_CLK				42
+#define GCC_MEM_NOC_TS_CLK				43
+#define GCC_NSS_TS_CLK					44
+#define GCC_NSS_TS_CLK_SRC				45
+#define GCC_NSSCC_CLK					46
+#define GCC_NSSCFG_CLK					47
+#define GCC_NSSNOC_ATB_CLK				48
+#define GCC_NSSNOC_NSSCC_CLK				49
+#define GCC_NSSNOC_QOSGEN_REF_CLK			50
+#define GCC_NSSNOC_SNOC_1_CLK				51
+#define GCC_NSSNOC_SNOC_CLK				52
+#define GCC_NSSNOC_TIMEOUT_REF_CLK			53
+#define GCC_NSSNOC_XO_DCD_CLK				54
+#define GCC_PCIE3X1_0_AHB_CLK				55
+#define GCC_PCIE3X1_0_AUX_CLK				56
+#define GCC_PCIE3X1_0_AXI_CLK_SRC			57
+#define GCC_PCIE3X1_0_AXI_M_CLK				58
+#define GCC_PCIE3X1_0_AXI_S_BRIDGE_CLK			59
+#define GCC_PCIE3X1_0_AXI_S_CLK				60
+#define GCC_PCIE3X1_0_PIPE_CLK				61
+#define GCC_PCIE3X1_0_RCHG_CLK				62
+#define GCC_PCIE3X1_0_RCHG_CLK_SRC			63
+#define GCC_PCIE3X1_1_AHB_CLK				64
+#define GCC_PCIE3X1_1_AUX_CLK				65
+#define GCC_PCIE3X1_1_AXI_CLK_SRC			66
+#define GCC_PCIE3X1_1_AXI_M_CLK				67
+#define GCC_PCIE3X1_1_AXI_S_BRIDGE_CLK			68
+#define GCC_PCIE3X1_1_AXI_S_CLK				69
+#define GCC_PCIE3X1_1_PIPE_CLK				70
+#define GCC_PCIE3X1_1_RCHG_CLK				71
+#define GCC_PCIE3X1_1_RCHG_CLK_SRC			72
+#define GCC_PCIE3X1_PHY_AHB_CLK				73
+#define GCC_PCIE3X2_AHB_CLK				74
+#define GCC_PCIE3X2_AUX_CLK				75
+#define GCC_PCIE3X2_AXI_M_CLK				76
+#define GCC_PCIE3X2_AXI_M_CLK_SRC			77
+#define GCC_PCIE3X2_AXI_S_BRIDGE_CLK			78
+#define GCC_PCIE3X2_AXI_S_CLK				79
+#define GCC_PCIE3X2_AXI_S_CLK_SRC			80
+#define GCC_PCIE3X2_PHY_AHB_CLK				81
+#define GCC_PCIE3X2_PIPE_CLK				82
+#define GCC_PCIE3X2_RCHG_CLK				83
+#define GCC_PCIE3X2_RCHG_CLK_SRC			84
+#define GCC_PCIE_AUX_CLK_SRC				85
+#define GCC_PCNOC_AT_CLK				86
+#define GCC_PCNOC_BFDCD_CLK_SRC				87
+#define GCC_PCNOC_LPASS_CLK				88
+#define GCC_PRNG_AHB_CLK				89
+#define GCC_Q6_AHB_CLK					90
+#define GCC_Q6_AHB_S_CLK				91
+#define GCC_Q6_AXIM_CLK					92
+#define GCC_Q6_AXIM_CLK_SRC				93
+#define GCC_Q6_AXIS_CLK					94
+#define GCC_Q6_TSCTR_1TO2_CLK				95
+#define GCC_Q6SS_ATBM_CLK				96
+#define GCC_Q6SS_PCLKDBG_CLK				97
+#define GCC_Q6SS_TRIG_CLK				98
+#define GCC_QDSS_AT_CLK					99
+#define GCC_QDSS_AT_CLK_SRC				100
+#define GCC_QDSS_CFG_AHB_CLK				101
+#define GCC_QDSS_DAP_AHB_CLK				102
+#define GCC_QDSS_DAP_CLK				103
+#define GCC_QDSS_DAP_DIV_CLK_SRC			104
+#define GCC_QDSS_ETR_USB_CLK				105
+#define GCC_QDSS_EUD_AT_CLK				106
+#define GCC_QDSS_TSCTR_CLK_SRC				107
+#define GCC_QPIC_AHB_CLK				108
+#define GCC_QPIC_CLK					109
+#define GCC_QPIC_IO_MACRO_CLK				110
+#define GCC_QPIC_IO_MACRO_CLK_SRC			111
+#define GCC_QPIC_SLEEP_CLK				112
+#define GCC_SDCC1_AHB_CLK				113
+#define GCC_SDCC1_APPS_CLK				114
+#define GCC_SDCC1_APPS_CLK_SRC				115
+#define GCC_SLEEP_CLK_SRC				116
+#define GCC_SNOC_LPASS_CFG_CLK				117
+#define GCC_SNOC_NSSNOC_1_CLK				118
+#define GCC_SNOC_NSSNOC_CLK				119
+#define GCC_SNOC_PCIE3_1LANE_1_M_CLK			120
+#define GCC_SNOC_PCIE3_1LANE_1_S_CLK			121
+#define GCC_SNOC_PCIE3_1LANE_M_CLK			122
+#define GCC_SNOC_PCIE3_1LANE_S_CLK			123
+#define GCC_SNOC_PCIE3_2LANE_M_CLK			124
+#define GCC_SNOC_PCIE3_2LANE_S_CLK			125
+#define GCC_SNOC_USB_CLK				126
+#define GCC_SYS_NOC_AT_CLK				127
+#define GCC_SYS_NOC_WCSS_AHB_CLK			128
+#define GCC_SYSTEM_NOC_BFDCD_CLK_SRC			129
+#define GCC_UNIPHY0_AHB_CLK				130
+#define GCC_UNIPHY0_SYS_CLK				131
+#define GCC_UNIPHY1_AHB_CLK				132
+#define GCC_UNIPHY1_SYS_CLK				133
+#define GCC_UNIPHY_SYS_CLK_SRC				134
+#define GCC_USB0_AUX_CLK				135
+#define GCC_USB0_AUX_CLK_SRC				136
+#define GCC_USB0_EUD_AT_CLK				137
+#define GCC_USB0_LFPS_CLK				138
+#define GCC_USB0_LFPS_CLK_SRC				139
+#define GCC_USB0_MASTER_CLK				140
+#define GCC_USB0_MASTER_CLK_SRC				141
+#define GCC_USB0_MOCK_UTMI_CLK				142
+#define GCC_USB0_MOCK_UTMI_CLK_SRC			143
+#define GCC_USB0_MOCK_UTMI_DIV_CLK_SRC			144
+#define GCC_USB0_PHY_CFG_AHB_CLK			145
+#define GCC_USB0_PIPE_CLK				146
+#define GCC_USB0_SLEEP_CLK				147
+#define GCC_WCSS_AHB_CLK_SRC				148
+#define GCC_WCSS_AXIM_CLK				149
+#define GCC_WCSS_AXIS_CLK				150
+#define GCC_WCSS_DBG_IFC_APB_BDG_CLK			151
+#define GCC_WCSS_DBG_IFC_APB_CLK			152
+#define GCC_WCSS_DBG_IFC_ATB_BDG_CLK			153
+#define GCC_WCSS_DBG_IFC_ATB_CLK			154
+#define GCC_WCSS_DBG_IFC_NTS_BDG_CLK			155
+#define GCC_WCSS_DBG_IFC_NTS_CLK			156
+#define GCC_WCSS_ECAHB_CLK				157
+#define GCC_WCSS_MST_ASYNC_BDG_CLK			158
+#define GCC_WCSS_SLV_ASYNC_BDG_CLK			159
+#define GCC_XO_CLK					160
+#define GCC_XO_CLK_SRC					161
+#define GCC_XO_DIV4_CLK					162
+#define GCC_IM_SLEEP_CLK				163
+#define GCC_NSSNOC_PCNOC_1_CLK				164
+#define GCC_MEM_NOC_AHB_CLK				165
+#define GCC_MEM_NOC_APSS_AXI_CLK			166
+#define GCC_SNOC_QOSGEN_EXTREF_DIV_CLK_SRC		167
+#define GCC_MEM_NOC_QOSGEN_EXTREF_CLK			168
+#define GCC_PCIE3X2_PIPE_CLK_SRC			169
+#define GCC_PCIE3X1_0_PIPE_CLK_SRC			170
+#define GCC_PCIE3X1_1_PIPE_CLK_SRC			171
+#define GCC_USB0_PIPE_CLK_SRC				172
+
+#define GCC_ADSS_BCR					0
+#define GCC_ADSS_PWM_CLK_ARES				1
+#define GCC_AHB_CLK_ARES				2
+#define GCC_APC0_VOLTAGE_DROOP_DETECTOR_BCR		3
+#define GCC_APC0_VOLTAGE_DROOP_DETECTOR_GPLL0_CLK_ARES	4
+#define GCC_APSS_AHB_CLK_ARES				5
+#define GCC_APSS_AXI_CLK_ARES				6
+#define GCC_BLSP1_AHB_CLK_ARES				7
+#define GCC_BLSP1_BCR					8
+#define GCC_BLSP1_QUP1_BCR				9
+#define GCC_BLSP1_QUP1_I2C_APPS_CLK_ARES		10
+#define GCC_BLSP1_QUP1_SPI_APPS_CLK_ARES		11
+#define GCC_BLSP1_QUP2_BCR				12
+#define GCC_BLSP1_QUP2_I2C_APPS_CLK_ARES		13
+#define GCC_BLSP1_QUP2_SPI_APPS_CLK_ARES		14
+#define GCC_BLSP1_QUP3_BCR				15
+#define GCC_BLSP1_QUP3_I2C_APPS_CLK_ARES		16
+#define GCC_BLSP1_QUP3_SPI_APPS_CLK_ARES		17
+#define GCC_BLSP1_SLEEP_CLK_ARES			18
+#define GCC_BLSP1_UART1_APPS_CLK_ARES			19
+#define GCC_BLSP1_UART1_BCR				20
+#define GCC_BLSP1_UART2_APPS_CLK_ARES			21
+#define GCC_BLSP1_UART2_BCR				22
+#define GCC_BLSP1_UART3_APPS_CLK_ARES			23
+#define GCC_BLSP1_UART3_BCR				24
+#define GCC_CE_BCR					25
+#define GCC_CMN_BLK_BCR					26
+#define GCC_CMN_LDO0_BCR				27
+#define GCC_CMN_LDO1_BCR				28
+#define GCC_DCC_BCR					29
+#define GCC_GP1_CLK_ARES				30
+#define GCC_GP2_CLK_ARES				31
+#define GCC_LPASS_BCR					32
+#define GCC_LPASS_CORE_AXIM_CLK_ARES			33
+#define GCC_LPASS_SWAY_CLK_ARES				34
+#define GCC_MDIOM_BCR					35
+#define GCC_MDIOS_BCR					36
+#define GCC_NSS_BCR					37
+#define GCC_NSS_TS_CLK_ARES				38
+#define GCC_NSSCC_CLK_ARES				39
+#define GCC_NSSCFG_CLK_ARES				40
+#define GCC_NSSNOC_ATB_CLK_ARES				41
+#define GCC_NSSNOC_NSSCC_CLK_ARES			42
+#define GCC_NSSNOC_QOSGEN_REF_CLK_ARES			43
+#define GCC_NSSNOC_SNOC_1_CLK_ARES			44
+#define GCC_NSSNOC_SNOC_CLK_ARES			45
+#define GCC_NSSNOC_TIMEOUT_REF_CLK_ARES			46
+#define GCC_NSSNOC_XO_DCD_CLK_ARES			47
+#define GCC_PCIE3X1_0_AHB_CLK_ARES			48
+#define GCC_PCIE3X1_0_AUX_CLK_ARES			49
+#define GCC_PCIE3X1_0_AXI_M_CLK_ARES			50
+#define GCC_PCIE3X1_0_AXI_S_BRIDGE_CLK_ARES		51
+#define GCC_PCIE3X1_0_AXI_S_CLK_ARES			52
+#define GCC_PCIE3X1_0_BCR				53
+#define GCC_PCIE3X1_0_LINK_DOWN_BCR			54
+#define GCC_PCIE3X1_0_PHY_BCR				55
+#define GCC_PCIE3X1_0_PHY_PHY_BCR			56
+#define GCC_PCIE3X1_1_AHB_CLK_ARES			57
+#define GCC_PCIE3X1_1_AUX_CLK_ARES			58
+#define GCC_PCIE3X1_1_AXI_M_CLK_ARES			59
+#define GCC_PCIE3X1_1_AXI_S_BRIDGE_CLK_ARES		60
+#define GCC_PCIE3X1_1_AXI_S_CLK_ARES			61
+#define GCC_PCIE3X1_1_BCR				62
+#define GCC_PCIE3X1_1_LINK_DOWN_BCR			63
+#define GCC_PCIE3X1_1_PHY_BCR				64
+#define GCC_PCIE3X1_1_PHY_PHY_BCR			65
+#define GCC_PCIE3X1_PHY_AHB_CLK_ARES			66
+#define GCC_PCIE3X2_AHB_CLK_ARES			67
+#define GCC_PCIE3X2_AUX_CLK_ARES			68
+#define GCC_PCIE3X2_AXI_M_CLK_ARES			69
+#define GCC_PCIE3X2_AXI_S_BRIDGE_CLK_ARES		70
+#define GCC_PCIE3X2_AXI_S_CLK_ARES			71
+#define GCC_PCIE3X2_BCR					72
+#define GCC_PCIE3X2_LINK_DOWN_BCR			73
+#define GCC_PCIE3X2_PHY_AHB_CLK_ARES			74
+#define GCC_PCIE3X2_PHY_BCR				75
+#define GCC_PCIE3X2PHY_PHY_BCR				76
+#define GCC_PCNOC_BCR					77
+#define GCC_PCNOC_LPASS_CLK_ARES			78
+#define GCC_PRNG_AHB_CLK_ARES				79
+#define GCC_PRNG_BCR					80
+#define GCC_Q6_AHB_CLK_ARES				81
+#define GCC_Q6_AHB_S_CLK_ARES				82
+#define GCC_Q6_AXIM_CLK_ARES				83
+#define GCC_Q6_AXIS_CLK_ARES				84
+#define GCC_Q6_TSCTR_1TO2_CLK_ARES			85
+#define GCC_Q6SS_ATBM_CLK_ARES				86
+#define GCC_Q6SS_PCLKDBG_CLK_ARES			87
+#define GCC_Q6SS_TRIG_CLK_ARES				88
+#define GCC_QDSS_APB2JTAG_CLK_ARES			89
+#define GCC_QDSS_AT_CLK_ARES				90
+#define GCC_QDSS_BCR					91
+#define GCC_QDSS_CFG_AHB_CLK_ARES			92
+#define GCC_QDSS_DAP_AHB_CLK_ARES			93
+#define GCC_QDSS_DAP_CLK_ARES				94
+#define GCC_QDSS_ETR_USB_CLK_ARES			95
+#define GCC_QDSS_EUD_AT_CLK_ARES			96
+#define GCC_QDSS_STM_CLK_ARES				97
+#define GCC_QDSS_TRACECLKIN_CLK_ARES			98
+#define GCC_QDSS_TS_CLK_ARES				99
+#define GCC_QDSS_TSCTR_DIV16_CLK_ARES			100
+#define GCC_QDSS_TSCTR_DIV2_CLK_ARES			101
+#define GCC_QDSS_TSCTR_DIV3_CLK_ARES			102
+#define GCC_QDSS_TSCTR_DIV4_CLK_ARES			103
+#define GCC_QDSS_TSCTR_DIV8_CLK_ARES			104
+#define GCC_QPIC_AHB_CLK_ARES				105
+#define GCC_QPIC_CLK_ARES				106
+#define GCC_QPIC_BCR					107
+#define GCC_QPIC_IO_MACRO_CLK_ARES			108
+#define GCC_QPIC_SLEEP_CLK_ARES				109
+#define GCC_QUSB2_0_PHY_BCR				110
+#define GCC_SDCC1_AHB_CLK_ARES				111
+#define GCC_SDCC1_APPS_CLK_ARES				112
+#define GCC_SDCC_BCR					113
+#define GCC_SNOC_BCR					114
+#define GCC_SNOC_LPASS_CFG_CLK_ARES			115
+#define GCC_SNOC_NSSNOC_1_CLK_ARES			116
+#define GCC_SNOC_NSSNOC_CLK_ARES			117
+#define GCC_SYS_NOC_QDSS_STM_AXI_CLK_ARES		118
+#define GCC_SYS_NOC_WCSS_AHB_CLK_ARES			119
+#define GCC_UNIPHY0_AHB_CLK_ARES			120
+#define GCC_UNIPHY0_BCR					121
+#define GCC_UNIPHY0_SYS_CLK_ARES			122
+#define GCC_UNIPHY1_AHB_CLK_ARES			123
+#define GCC_UNIPHY1_BCR					124
+#define GCC_UNIPHY1_SYS_CLK_ARES			125
+#define GCC_USB0_AUX_CLK_ARES				126
+#define GCC_USB0_EUD_AT_CLK_ARES			127
+#define GCC_USB0_LFPS_CLK_ARES				128
+#define GCC_USB0_MASTER_CLK_ARES			129
+#define GCC_USB0_MOCK_UTMI_CLK_ARES			130
+#define GCC_USB0_PHY_BCR				131
+#define GCC_USB0_PHY_CFG_AHB_CLK_ARES			132
+#define GCC_USB0_SLEEP_CLK_ARES				133
+#define GCC_USB3PHY_0_PHY_BCR				134
+#define GCC_USB_BCR					135
+#define GCC_WCSS_AXIM_CLK_ARES				136
+#define GCC_WCSS_AXIS_CLK_ARES				137
+#define GCC_WCSS_BCR					138
+#define GCC_WCSS_DBG_IFC_APB_BDG_CLK_ARES		139
+#define GCC_WCSS_DBG_IFC_APB_CLK_ARES			140
+#define GCC_WCSS_DBG_IFC_ATB_BDG_CLK_ARES		141
+#define GCC_WCSS_DBG_IFC_ATB_CLK_ARES			142
+#define GCC_WCSS_DBG_IFC_NTS_BDG_CLK_ARES		143
+#define GCC_WCSS_DBG_IFC_NTS_CLK_ARES			144
+#define GCC_WCSS_ECAHB_CLK_ARES				145
+#define GCC_WCSS_MST_ASYNC_BDG_CLK_ARES			146
+#define GCC_WCSS_Q6_BCR					147
+#define GCC_WCSS_SLV_ASYNC_BDG_CLK_ARES			148
+#define GCC_XO_CLK_ARES					149
+#define GCC_XO_DIV4_CLK_ARES				150
+#define GCC_Q6SS_DBG_ARES				151
+#define GCC_WCSS_DBG_BDG_ARES				152
+#define GCC_WCSS_DBG_ARES				153
+#define GCC_WCSS_AXI_S_ARES				154
+#define GCC_WCSS_AXI_M_ARES				155
+#define GCC_WCSSAON_ARES				156
+#define GCC_PCIE3X2_PIPE_ARES				157
+#define GCC_PCIE3X2_CORE_STICKY_ARES			158
+#define GCC_PCIE3X2_AXI_S_STICKY_ARES			159
+#define GCC_PCIE3X2_AXI_M_STICKY_ARES			160
+#define GCC_PCIE3X1_0_PIPE_ARES				161
+#define GCC_PCIE3X1_0_CORE_STICKY_ARES			162
+#define GCC_PCIE3X1_0_AXI_S_STICKY_ARES			163
+#define GCC_PCIE3X1_0_AXI_M_STICKY_ARES			164
+#define GCC_PCIE3X1_1_PIPE_ARES				165
+#define GCC_PCIE3X1_1_CORE_STICKY_ARES			166
+#define GCC_PCIE3X1_1_AXI_S_STICKY_ARES			167
+#define GCC_PCIE3X1_1_AXI_M_STICKY_ARES			168
+#define GCC_IM_SLEEP_CLK_ARES				169
+#define GCC_NSSNOC_PCNOC_1_CLK_ARES			170
+#define GCC_UNIPHY0_XPCS_ARES				171
+#define GCC_UNIPHY1_XPCS_ARES				172
+#endif
-- 
cgit v1.2.3


From 968a26a07f75377afbd4f7bb18ef587a1443c244 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Mon, 13 Feb 2023 10:18:29 -0800
Subject: firmware: qcom_scm: Use fixed width src vm bitmap

The maximum VMID for assign_mem is 63. Use a u64 to represent this
bitmap instead of architecture-dependent "unsigned int" which varies in
size on 32-bit and 64-bit platforms.

Acked-by: Kalle Valo <kvalo@kernel.org> (ath10k)
Tested-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230213181832.3489174-1-quic_eberman@quicinc.com
---
 drivers/firmware/qcom_scm.c            | 12 +++++++-----
 drivers/misc/fastrpc.c                 |  2 +-
 drivers/net/wireless/ath/ath10k/qmi.c  |  4 ++--
 drivers/remoteproc/qcom_q6v5_mss.c     |  8 ++++----
 drivers/remoteproc/qcom_q6v5_pas.c     |  2 +-
 drivers/soc/qcom/rmtfs_mem.c           |  2 +-
 include/linux/firmware/qcom/qcom_scm.h |  2 +-
 7 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c
index 468d4d5ab550..b95616b35bff 100644
--- a/drivers/firmware/qcom_scm.c
+++ b/drivers/firmware/qcom_scm.c
@@ -905,7 +905,7 @@ static int __qcom_scm_assign_mem(struct device *dev, phys_addr_t mem_region,
  * Return negative errno on failure or 0 on success with @srcvm updated.
  */
 int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
-			unsigned int *srcvm,
+			u64 *srcvm,
 			const struct qcom_scm_vmperm *newvm,
 			unsigned int dest_cnt)
 {
@@ -922,9 +922,9 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
 	__le32 *src;
 	void *ptr;
 	int ret, i, b;
-	unsigned long srcvm_bits = *srcvm;
+	u64 srcvm_bits = *srcvm;
 
-	src_sz = hweight_long(srcvm_bits) * sizeof(*src);
+	src_sz = hweight64(srcvm_bits) * sizeof(*src);
 	mem_to_map_sz = sizeof(*mem_to_map);
 	dest_sz = dest_cnt * sizeof(*destvm);
 	ptr_sz = ALIGN(src_sz, SZ_64) + ALIGN(mem_to_map_sz, SZ_64) +
@@ -937,8 +937,10 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
 	/* Fill source vmid detail */
 	src = ptr;
 	i = 0;
-	for_each_set_bit(b, &srcvm_bits, BITS_PER_LONG)
-		src[i++] = cpu_to_le32(b);
+	for (b = 0; b < BITS_PER_TYPE(u64); b++) {
+		if (srcvm_bits & BIT(b))
+			src[i++] = cpu_to_le32(b);
+	}
 
 	/* Fill details of mem buff to map */
 	mem_to_map = ptr + ALIGN(src_sz, SZ_64);
diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index a701132638cf..f48466960f1b 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -262,7 +262,7 @@ struct fastrpc_channel_ctx {
 	int domain_id;
 	int sesscount;
 	int vmcount;
-	u32 perms;
+	u64 perms;
 	struct qcom_scm_vmperm vmperms[FASTRPC_MAX_VMIDS];
 	struct rpmsg_device *rpdev;
 	struct fastrpc_session_ctx session[FASTRPC_MAX_SESSIONS];
diff --git a/drivers/net/wireless/ath/ath10k/qmi.c b/drivers/net/wireless/ath/ath10k/qmi.c
index 90f457b8e1fe..038c5903c0dc 100644
--- a/drivers/net/wireless/ath/ath10k/qmi.c
+++ b/drivers/net/wireless/ath/ath10k/qmi.c
@@ -33,7 +33,7 @@ static int ath10k_qmi_map_msa_permission(struct ath10k_qmi *qmi,
 {
 	struct qcom_scm_vmperm dst_perms[3];
 	struct ath10k *ar = qmi->ar;
-	unsigned int src_perms;
+	u64 src_perms;
 	u32 perm_count;
 	int ret;
 
@@ -65,7 +65,7 @@ static int ath10k_qmi_unmap_msa_permission(struct ath10k_qmi *qmi,
 {
 	struct qcom_scm_vmperm dst_perms;
 	struct ath10k *ar = qmi->ar;
-	unsigned int src_perms;
+	u64 src_perms;
 	int ret;
 
 	src_perms = BIT(QCOM_SCM_VMID_MSS_MSA) | BIT(QCOM_SCM_VMID_WLAN);
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index ab053084f7a2..1ba711bc0100 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -235,8 +235,8 @@ struct q6v5 {
 	bool has_qaccept_regs;
 	bool has_ext_cntl_regs;
 	bool has_vq6;
-	int mpss_perm;
-	int mba_perm;
+	u64 mpss_perm;
+	u64 mba_perm;
 	const char *hexagon_mdt_image;
 	int version;
 };
@@ -414,7 +414,7 @@ static void q6v5_pds_disable(struct q6v5 *qproc, struct device **pds,
 	}
 }
 
-static int q6v5_xfer_mem_ownership(struct q6v5 *qproc, int *current_perm,
+static int q6v5_xfer_mem_ownership(struct q6v5 *qproc, u64 *current_perm,
 				   bool local, bool remote, phys_addr_t addr,
 				   size_t size)
 {
@@ -967,7 +967,7 @@ static int q6v5_mpss_init_image(struct q6v5 *qproc, const struct firmware *fw,
 	unsigned long dma_attrs = DMA_ATTR_FORCE_CONTIGUOUS;
 	dma_addr_t phys;
 	void *metadata;
-	int mdata_perm;
+	u64 mdata_perm;
 	int xferop_ret;
 	size_t size;
 	void *ptr;
diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
index 0871108fb4dc..c99a20542685 100644
--- a/drivers/remoteproc/qcom_q6v5_pas.c
+++ b/drivers/remoteproc/qcom_q6v5_pas.c
@@ -94,7 +94,7 @@ struct qcom_adsp {
 	size_t region_assign_size;
 
 	int region_assign_idx;
-	int region_assign_perms;
+	u64 region_assign_perms;
 
 	struct qcom_rproc_glink glink_subdev;
 	struct qcom_rproc_subdev smd_subdev;
diff --git a/drivers/soc/qcom/rmtfs_mem.c b/drivers/soc/qcom/rmtfs_mem.c
index 2d3ee22b9249..2657c6105bb7 100644
--- a/drivers/soc/qcom/rmtfs_mem.c
+++ b/drivers/soc/qcom/rmtfs_mem.c
@@ -31,7 +31,7 @@ struct qcom_rmtfs_mem {
 
 	unsigned int client_id;
 
-	unsigned int perms;
+	u64 perms;
 };
 
 static ssize_t qcom_rmtfs_mem_show(struct device *dev,
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 1e449a5d7f5c..250ea4efb7cb 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -94,7 +94,7 @@ extern int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
 					  u32 cp_nonpixel_start,
 					  u32 cp_nonpixel_size);
 extern int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
-			       unsigned int *src,
+			       u64 *src,
 			       const struct qcom_scm_vmperm *newvm,
 			       unsigned int dest_cnt);
 
-- 
cgit v1.2.3


From 2d1fc2d804bc1503f4e36fb97b0f6698e7769796 Mon Sep 17 00:00:00 2001
From: Otto Pflüger <otto.pflueger@abscue.de>
Date: Thu, 23 Feb 2023 19:09:32 +0100
Subject: dt-bindings: clock: Add MSM8917 global clock controller
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a device tree binding to describe clocks, resets and power domains
provided by the global clock controller on MSM8917 SoCs and the very
similar QM215 SoCs.

Add the new compatibles to qcom,gcc-msm8909.yaml. There is
no need to create another YAML file because the bindings are identical
(MSM8917 GCC requires the same parent clocks as the MSM8909 GCC).

Signed-off-by: Otto Pflüger <otto.pflueger@abscue.de>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230223180935.60546-2-otto.pflueger@abscue.de
---
 .../bindings/clock/qcom,gcc-msm8909.yaml           |  13 +-
 include/dt-bindings/clock/qcom,gcc-msm8917.h       | 190 +++++++++++++++++++++
 2 files changed, 199 insertions(+), 4 deletions(-)
 create mode 100644 include/dt-bindings/clock/qcom,gcc-msm8917.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml
index 6279a59c2e20..b91462587df5 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml
@@ -4,20 +4,25 @@
 $id: http://devicetree.org/schemas/clock/qcom,gcc-msm8909.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
-title: Qualcomm Global Clock & Reset Controller on MSM8909
+title: Qualcomm Global Clock & Reset Controller on MSM8909, MSM8917 and QM215
 
 maintainers:
   - Stephan Gerhold <stephan@gerhold.net>
 
 description: |
   Qualcomm global clock control module provides the clocks, resets and power
-  domains on MSM8909.
+  domains on MSM8909, MSM8917 or QM215.
 
-  See also:: include/dt-bindings/clock/qcom,gcc-msm8909.h
+  See also::
+    include/dt-bindings/clock/qcom,gcc-msm8909.h
+    include/dt-bindings/clock/qcom,gcc-msm8917.h
 
 properties:
   compatible:
-    const: qcom,gcc-msm8909
+    enum:
+      - qcom,gcc-msm8909
+      - qcom,gcc-msm8917
+      - qcom,gcc-qm215
 
   clocks:
     items:
diff --git a/include/dt-bindings/clock/qcom,gcc-msm8917.h b/include/dt-bindings/clock/qcom,gcc-msm8917.h
new file mode 100644
index 000000000000..a371b1adc896
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,gcc-msm8917.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef _DT_BINDINGS_CLK_MSM_GCC_8917_H
+#define _DT_BINDINGS_CLK_MSM_GCC_8917_H
+
+/* Clocks */
+#define APSS_AHB_CLK_SRC			0
+#define BLSP1_QUP2_I2C_APPS_CLK_SRC		1
+#define BLSP1_QUP2_SPI_APPS_CLK_SRC		2
+#define BLSP1_QUP3_I2C_APPS_CLK_SRC		3
+#define BLSP1_QUP3_SPI_APPS_CLK_SRC		4
+#define BLSP1_QUP4_I2C_APPS_CLK_SRC		5
+#define BLSP1_QUP4_SPI_APPS_CLK_SRC		6
+#define BLSP1_UART1_APPS_CLK_SRC		7
+#define BLSP1_UART2_APPS_CLK_SRC		8
+#define BLSP2_QUP1_I2C_APPS_CLK_SRC		9
+#define BLSP2_QUP1_SPI_APPS_CLK_SRC		10
+#define BLSP2_QUP2_I2C_APPS_CLK_SRC		11
+#define BLSP2_QUP2_SPI_APPS_CLK_SRC		12
+#define BLSP2_QUP3_I2C_APPS_CLK_SRC		13
+#define BLSP2_QUP3_SPI_APPS_CLK_SRC		14
+#define BLSP2_UART1_APPS_CLK_SRC		15
+#define BLSP2_UART2_APPS_CLK_SRC		16
+#define BYTE0_CLK_SRC				17
+#define CAMSS_GP0_CLK_SRC			18
+#define CAMSS_GP1_CLK_SRC			19
+#define CAMSS_TOP_AHB_CLK_SRC			20
+#define CCI_CLK_SRC				21
+#define CPP_CLK_SRC				22
+#define CRYPTO_CLK_SRC				23
+#define CSI0PHYTIMER_CLK_SRC			24
+#define CSI0_CLK_SRC				25
+#define CSI1PHYTIMER_CLK_SRC			26
+#define CSI1_CLK_SRC				27
+#define CSI2_CLK_SRC				28
+#define ESC0_CLK_SRC				29
+#define GCC_APSS_TCU_CLK			30
+#define GCC_BIMC_GFX_CLK			31
+#define GCC_BIMC_GPU_CLK			32
+#define GCC_BLSP1_AHB_CLK			33
+#define GCC_BLSP1_QUP2_I2C_APPS_CLK		34
+#define GCC_BLSP1_QUP2_SPI_APPS_CLK		35
+#define GCC_BLSP1_QUP3_I2C_APPS_CLK		36
+#define GCC_BLSP1_QUP3_SPI_APPS_CLK		37
+#define GCC_BLSP1_QUP4_I2C_APPS_CLK		38
+#define GCC_BLSP1_QUP4_SPI_APPS_CLK		39
+#define GCC_BLSP1_UART1_APPS_CLK		40
+#define GCC_BLSP1_UART2_APPS_CLK		41
+#define GCC_BLSP2_AHB_CLK			42
+#define GCC_BLSP2_QUP1_I2C_APPS_CLK		43
+#define GCC_BLSP2_QUP1_SPI_APPS_CLK		44
+#define GCC_BLSP2_QUP2_I2C_APPS_CLK		45
+#define GCC_BLSP2_QUP2_SPI_APPS_CLK		46
+#define GCC_BLSP2_QUP3_I2C_APPS_CLK		47
+#define GCC_BLSP2_QUP3_SPI_APPS_CLK		48
+#define GCC_BLSP2_UART1_APPS_CLK		49
+#define GCC_BLSP2_UART2_APPS_CLK		50
+#define GCC_BOOT_ROM_AHB_CLK			51
+#define GCC_CAMSS_AHB_CLK			52
+#define GCC_CAMSS_CCI_AHB_CLK			53
+#define GCC_CAMSS_CCI_CLK			54
+#define GCC_CAMSS_CPP_AHB_CLK			55
+#define GCC_CAMSS_CPP_CLK			56
+#define GCC_CAMSS_CSI0PHYTIMER_CLK		57
+#define GCC_CAMSS_CSI0PHY_CLK			58
+#define GCC_CAMSS_CSI0PIX_CLK			59
+#define GCC_CAMSS_CSI0RDI_CLK			60
+#define GCC_CAMSS_CSI0_AHB_CLK			61
+#define GCC_CAMSS_CSI0_CLK			62
+#define GCC_CAMSS_CSI1PHYTIMER_CLK		63
+#define GCC_CAMSS_CSI1PHY_CLK			64
+#define GCC_CAMSS_CSI1PIX_CLK			65
+#define GCC_CAMSS_CSI1RDI_CLK			66
+#define GCC_CAMSS_CSI1_AHB_CLK			67
+#define GCC_CAMSS_CSI1_CLK			68
+#define GCC_CAMSS_CSI2PHY_CLK			69
+#define GCC_CAMSS_CSI2PIX_CLK			70
+#define GCC_CAMSS_CSI2RDI_CLK			71
+#define GCC_CAMSS_CSI2_AHB_CLK			72
+#define GCC_CAMSS_CSI2_CLK			73
+#define GCC_CAMSS_CSI_VFE0_CLK			74
+#define GCC_CAMSS_CSI_VFE1_CLK			75
+#define GCC_CAMSS_GP0_CLK			76
+#define GCC_CAMSS_GP1_CLK			77
+#define GCC_CAMSS_ISPIF_AHB_CLK			78
+#define GCC_CAMSS_JPEG0_CLK			79
+#define GCC_CAMSS_JPEG_AHB_CLK			80
+#define GCC_CAMSS_JPEG_AXI_CLK			81
+#define GCC_CAMSS_MCLK0_CLK			82
+#define GCC_CAMSS_MCLK1_CLK			83
+#define GCC_CAMSS_MCLK2_CLK			84
+#define GCC_CAMSS_MICRO_AHB_CLK			85
+#define GCC_CAMSS_TOP_AHB_CLK			86
+#define GCC_CAMSS_VFE0_AHB_CLK			87
+#define GCC_CAMSS_VFE0_AXI_CLK			88
+#define GCC_CAMSS_VFE0_CLK			89
+#define GCC_CAMSS_VFE1_AHB_CLK			90
+#define GCC_CAMSS_VFE1_AXI_CLK			91
+#define GCC_CAMSS_VFE1_CLK			92
+#define GCC_CPP_TBU_CLK				93
+#define GCC_CRYPTO_AHB_CLK			94
+#define GCC_CRYPTO_AXI_CLK			95
+#define GCC_CRYPTO_CLK				96
+#define GCC_DCC_CLK				97
+#define GCC_GFX_TBU_CLK				98
+#define GCC_GFX_TCU_CLK				99
+#define GCC_GP1_CLK				100
+#define GCC_GP2_CLK				101
+#define GCC_GP3_CLK				102
+#define GCC_GTCU_AHB_CLK			103
+#define GCC_JPEG_TBU_CLK			104
+#define GCC_MDP_TBU_CLK				105
+#define GCC_MDSS_AHB_CLK			106
+#define GCC_MDSS_AXI_CLK			107
+#define GCC_MDSS_BYTE0_CLK			108
+#define GCC_MDSS_ESC0_CLK			109
+#define GCC_MDSS_MDP_CLK			110
+#define GCC_MDSS_PCLK0_CLK			111
+#define GCC_MDSS_VSYNC_CLK			112
+#define GCC_MSS_CFG_AHB_CLK			113
+#define GCC_MSS_Q6_BIMC_AXI_CLK			114
+#define GCC_OXILI_AHB_CLK			115
+#define GCC_OXILI_GFX3D_CLK			116
+#define GCC_PDM2_CLK				117
+#define GCC_PDM_AHB_CLK				118
+#define GCC_PRNG_AHB_CLK			119
+#define GCC_QDSS_DAP_CLK			120
+#define GCC_SDCC1_AHB_CLK			121
+#define GCC_SDCC1_APPS_CLK			122
+#define GCC_SDCC1_ICE_CORE_CLK			123
+#define GCC_SDCC2_AHB_CLK			124
+#define GCC_SDCC2_APPS_CLK			125
+#define GCC_SMMU_CFG_CLK			126
+#define GCC_USB2A_PHY_SLEEP_CLK			127
+#define GCC_USB_HS_AHB_CLK			128
+#define GCC_USB_HS_PHY_CFG_AHB_CLK		129
+#define GCC_USB_HS_SYSTEM_CLK			130
+#define GCC_VENUS0_AHB_CLK			131
+#define GCC_VENUS0_AXI_CLK			132
+#define GCC_VENUS0_CORE0_VCODEC0_CLK		133
+#define GCC_VENUS0_VCODEC0_CLK			134
+#define GCC_VENUS_TBU_CLK			135
+#define GCC_VFE1_TBU_CLK			136
+#define GCC_VFE_TBU_CLK				137
+#define GFX3D_CLK_SRC				138
+#define GP1_CLK_SRC				139
+#define GP2_CLK_SRC				140
+#define GP3_CLK_SRC				141
+#define GPLL0					142
+#define GPLL0_EARLY				143
+#define GPLL3					144
+#define GPLL3_EARLY				145
+#define GPLL4					146
+#define GPLL4_EARLY				147
+#define GPLL6					148
+#define GPLL6_EARLY				149
+#define JPEG0_CLK_SRC				150
+#define MCLK0_CLK_SRC				151
+#define MCLK1_CLK_SRC				152
+#define MCLK2_CLK_SRC				153
+#define MDP_CLK_SRC				154
+#define PCLK0_CLK_SRC				155
+#define PDM2_CLK_SRC				156
+#define SDCC1_APPS_CLK_SRC			157
+#define SDCC1_ICE_CORE_CLK_SRC			158
+#define SDCC2_APPS_CLK_SRC			159
+#define USB_HS_SYSTEM_CLK_SRC			160
+#define VCODEC0_CLK_SRC				161
+#define VFE0_CLK_SRC				162
+#define VFE1_CLK_SRC				163
+#define VSYNC_CLK_SRC				164
+
+/* GCC block resets */
+#define GCC_CAMSS_MICRO_BCR			0
+#define GCC_MSS_BCR				1
+#define GCC_QUSB2_PHY_BCR			2
+#define GCC_USB_HS_BCR				3
+#define GCC_USB2_HS_PHY_ONLY_BCR		4
+
+/* GDSCs */
+#define CPP_GDSC				0
+#define JPEG_GDSC				1
+#define MDSS_GDSC				2
+#define OXILI_GX_GDSC				3
+#define VENUS_CORE0_GDSC			4
+#define VENUS_GDSC				5
+#define VFE0_GDSC				6
+#define VFE1_GDSC				7
+
+#endif
-- 
cgit v1.2.3


From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 10 Mar 2023 08:40:59 +0100
Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms. This
   is always done by searching the entire kallsyms, not respecting the
   module in which the function is located. Such approach causes an
   incorrect attachment address to be computed if the function to attach
   to is shadowed by a function of the same name located earlier in
   kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address which may
   lead to potential errors.

Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in the
correct module (resolving problem no. 1). Then, the module reference is
taken directly in bpf_check_attach_target and stored in the bpf program
(in bpf_prog_aux). The reference is only released when the program is
unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/syscall.c     |  6 ++++++
 kernel/bpf/trampoline.c  | 28 ----------------------------
 kernel/bpf/verifier.c    | 18 +++++++++++++++++-
 kernel/module/internal.h |  5 +++++
 5 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 71cc92a4ba48..3ef98fb92987 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,6 +1103,7 @@ struct bpf_trampoline {
 struct bpf_attach_target_info {
 	struct btf_func_model fmodel;
 	long tgt_addr;
+	struct module *tgt_mod;
 	const char *tgt_name;
 	const struct btf_type *tgt_type;
 };
@@ -1406,6 +1407,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5b88301a2ae0..099e9068bcdd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2067,6 +2067,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 {
 	bpf_prog_kallsyms_del_all(prog);
 	btf_put(prog->aux->btf);
+	module_put(prog->aux->mod);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
 	kfree(prog->aux->kfunc_tab);
@@ -3113,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		if (err)
 			goto out_unlock;
 
+		if (tgt_info.tgt_mod) {
+			module_put(prog->aux->mod);
+			prog->aux->mod = tgt_info.tgt_mod;
+		}
+
 		tr = bpf_trampoline_get(key, &tgt_info);
 		if (!tr) {
 			err = -ENOMEM;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..f61d5138b12b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
-#include <linux/module.h>
 #include <linux/static_call.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
@@ -172,26 +171,6 @@ out:
 	return tr;
 }
 
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
-	struct module *mod;
-	int err = 0;
-
-	preempt_disable();
-	mod = __module_text_address((unsigned long) tr->func.addr);
-	if (mod && !try_module_get(mod))
-		err = -ENOENT;
-	preempt_enable();
-	tr->mod = mod;
-	return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
-	module_put(tr->mod);
-	tr->mod = NULL;
-}
-
 static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 {
 	void *ip = tr->func.addr;
@@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
-	if (!ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
@@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		tr->func.ftrace_managed = true;
 	}
 
-	if (bpf_trampoline_module_get(tr))
-		return -ENOENT;
-
 	if (tr->func.ftrace_managed) {
 		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
 		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
@@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 	}
 
-	if (ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bbd89279070..60793f793ca6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
+#include "../module/internal.h"
 
 #include "disasm.h"
 
@@ -18307,6 +18308,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	const char *tname;
 	struct btf *btf;
 	long addr = 0;
+	struct module *mod = NULL;
 
 	if (!btf_id) {
 		bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -18480,8 +18482,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			else
 				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
 		} else {
-			addr = kallsyms_lookup_name(tname);
+			if (btf_is_module(btf)) {
+				mod = btf_try_get_module(btf);
+				if (mod)
+					addr = find_kallsyms_symbol_value(mod, tname);
+				else
+					addr = 0;
+			} else {
+				addr = kallsyms_lookup_name(tname);
+			}
 			if (!addr) {
+				module_put(mod);
 				bpf_log(log,
 					"The address of function %s cannot be found\n",
 					tname);
@@ -18521,11 +18532,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 				break;
 			}
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s is not sleepable\n", tname);
 				return ret;
 			}
 		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
 			if (tgt_prog) {
+				module_put(mod);
 				bpf_log(log, "can't modify return codes of BPF programs\n");
 				return -EINVAL;
 			}
@@ -18534,6 +18547,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    !check_attach_modify_return(addr, tname))
 				ret = 0;
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s() is not modifiable\n", tname);
 				return ret;
 			}
@@ -18544,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	tgt_info->tgt_addr = addr;
 	tgt_info->tgt_name = tname;
 	tgt_info->tgt_type = t;
+	tgt_info->tgt_mod = mod;
 	return 0;
 }
 
@@ -18623,6 +18638,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	/* store info about the attachment target that will be used later */
 	prog->aux->attach_func_proto = tgt_info.tgt_type;
 	prog->aux->attach_func_name = tgt_info.tgt_name;
+	prog->aux->mod = tgt_info.tgt_mod;
 
 	if (tgt_prog) {
 		prog->aux->saved_dst_prog_type = tgt_prog->type;
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..5c9170f9135c 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From c1fef618d611b31964ab397aa0bf0611da94bade Mon Sep 17 00:00:00 2001
From: Sandipan Patra <spatra@nvidia.com>
Date: Mon, 13 Mar 2023 22:42:23 -0700
Subject: net/mlx5: Implement thermal zone

Implement thermal zone support for mlx5 based HW. The NIC
uses temperature sensor provided by ASIC to report current temperature
to thermal core.

Signed-off-by: Sandipan Patra <spatra@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Link: https://lore.kernel.org/r/20230314054234.267365-5-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |   6 ++
 drivers/net/ethernet/mellanox/mlx5/core/thermal.c | 108 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/thermal.h |  20 ++++
 include/linux/mlx5/driver.h                       |   3 +
 include/linux/mlx5/mlx5_ifc.h                     |  26 ++++++
 6 files changed, 164 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/thermal.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/thermal.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8d4e25cc54ea..6c2f1d4a58ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -77,6 +77,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += esw/acl/helper.o \
 
 mlx5_core-$(CONFIG_MLX5_BRIDGE)    += esw/bridge.o en/rep/bridge.o
 
+mlx5_core-$(CONFIG_THERMAL)        += thermal.o
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0ff0eb660495..644c889f9a32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -52,6 +52,7 @@
 #include <linux/version.h>
 #include <net/devlink.h>
 #include "mlx5_core.h"
+#include "thermal.h"
 #include "lib/eq.h"
 #include "fs_core.h"
 #include "lib/mpfs.h"
@@ -1768,6 +1769,10 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
 
+	err = mlx5_thermal_init(dev);
+	if (err)
+		dev_err(&pdev->dev, "mlx5_thermal_init failed with error code %d\n", err);
+
 	pci_save_state(pdev);
 	devlink_register(devlink);
 	return 0;
@@ -1796,6 +1801,7 @@ static void remove_one(struct pci_dev *pdev)
 	set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state);
 	devlink_unregister(devlink);
 	mlx5_sriov_disable(pdev);
+	mlx5_thermal_uninit(dev);
 	mlx5_crdump_disable(dev);
 	mlx5_drain_health_wq(dev);
 	mlx5_uninit_one(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/thermal.c b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c
new file mode 100644
index 000000000000..e47fa6fb836f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/thermal.h>
+#include <linux/err.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "thermal.h"
+
+#define MLX5_THERMAL_POLL_INT_MSEC	1000
+#define MLX5_THERMAL_NUM_TRIPS		0
+#define MLX5_THERMAL_ASIC_SENSOR_INDEX	0
+
+/* Bit string indicating the writeablility of trip points if any */
+#define MLX5_THERMAL_TRIP_MASK	(BIT(MLX5_THERMAL_NUM_TRIPS) - 1)
+
+struct mlx5_thermal {
+	struct mlx5_core_dev *mdev;
+	struct thermal_zone_device *tzdev;
+};
+
+static int mlx5_thermal_get_mtmp_temp(struct mlx5_core_dev *mdev, u32 id, int *p_temp)
+{
+	u32 mtmp_out[MLX5_ST_SZ_DW(mtmp_reg)] = {};
+	u32 mtmp_in[MLX5_ST_SZ_DW(mtmp_reg)] = {};
+	int err;
+
+	MLX5_SET(mtmp_reg, mtmp_in, sensor_index, id);
+
+	err = mlx5_core_access_reg(mdev, mtmp_in,  sizeof(mtmp_in),
+				   mtmp_out, sizeof(mtmp_out),
+				   MLX5_REG_MTMP, 0, 0);
+
+	if (err)
+		return err;
+
+	*p_temp = MLX5_GET(mtmp_reg, mtmp_out, temperature);
+
+	return 0;
+}
+
+static int mlx5_thermal_get_temp(struct thermal_zone_device *tzdev,
+				 int *p_temp)
+{
+	struct mlx5_thermal *thermal = tzdev->devdata;
+	struct mlx5_core_dev *mdev = thermal->mdev;
+	int err;
+
+	err = mlx5_thermal_get_mtmp_temp(mdev, MLX5_THERMAL_ASIC_SENSOR_INDEX, p_temp);
+
+	if (err)
+		return err;
+
+	/* The unit of temp returned is in 0.125 C. The thermal
+	 * framework expects the value in 0.001 C.
+	 */
+	*p_temp *= 125;
+
+	return 0;
+}
+
+static struct thermal_zone_device_ops mlx5_thermal_ops = {
+	.get_temp = mlx5_thermal_get_temp,
+};
+
+int mlx5_thermal_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_thermal *thermal;
+	struct thermal_zone_device *tzd;
+	const char *data = "mlx5";
+
+	tzd = thermal_zone_get_zone_by_name(data);
+	if (!IS_ERR(tzd))
+		return 0;
+
+	thermal = kzalloc(sizeof(*thermal), GFP_KERNEL);
+	if (!thermal)
+		return -ENOMEM;
+
+	thermal->mdev = mdev;
+	thermal->tzdev = thermal_zone_device_register(data,
+						      MLX5_THERMAL_NUM_TRIPS,
+						      MLX5_THERMAL_TRIP_MASK,
+						      thermal,
+						      &mlx5_thermal_ops,
+						      NULL, 0, MLX5_THERMAL_POLL_INT_MSEC);
+	if (IS_ERR(thermal->tzdev)) {
+		dev_err(mdev->device, "Failed to register thermal zone device (%s) %ld\n",
+			data, PTR_ERR(thermal->tzdev));
+		kfree(thermal);
+		return -EINVAL;
+	}
+
+	mdev->thermal = thermal;
+	return 0;
+}
+
+void mlx5_thermal_uninit(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->thermal)
+		return;
+
+	thermal_zone_device_unregister(mdev->thermal->tzdev);
+	kfree(mdev->thermal);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/thermal.h b/drivers/net/ethernet/mellanox/mlx5/core/thermal.h
new file mode 100644
index 000000000000..7d752c122192
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/thermal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __MLX5_THERMAL_DRIVER_H
+#define __MLX5_THERMAL_DRIVER_H
+
+#if IS_ENABLED(CONFIG_THERMAL)
+int mlx5_thermal_init(struct mlx5_core_dev *mdev);
+void mlx5_thermal_uninit(struct mlx5_core_dev *mdev);
+#else
+static inline int mlx5_thermal_init(struct mlx5_core_dev *mdev)
+{
+	mdev->thermal = NULL;
+	return 0;
+}
+
+static inline void mlx5_thermal_uninit(struct mlx5_core_dev *mdev) { }
+#endif
+
+#endif /* __MLX5_THERMAL_DRIVER_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209..7a898113b6b7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -134,6 +134,7 @@ enum {
 	MLX5_REG_PCAM		 = 0x507f,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MTMP		 = 0x900A,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MFRL		 = 0x9028,
 	MLX5_REG_MLCR		 = 0x902b,
@@ -731,6 +732,7 @@ struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 struct mlx5_geneve;
 struct mlx5_hv_vhca;
+struct mlx5_thermal;
 
 #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
 #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
@@ -808,6 +810,7 @@ struct mlx5_core_dev {
 	struct mlx5_rsc_dump    *rsc_dump;
 	u32                      vsc_addr;
 	struct mlx5_hv_vhca	*hv_vhca;
+	struct mlx5_thermal	*thermal;
 };
 
 struct mlx5_db {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 66d76e97a087..d2c164f0778c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10869,6 +10869,31 @@ struct mlx5_ifc_mrtc_reg_bits {
 	u8         time_l[0x20];
 };
 
+struct mlx5_ifc_mtmp_reg_bits {
+	u8         reserved_at_0[0x14];
+	u8         sensor_index[0xc];
+
+	u8         reserved_at_20[0x10];
+	u8         temperature[0x10];
+
+	u8         mte[0x1];
+	u8         mtr[0x1];
+	u8         reserved_at_42[0xe];
+	u8         max_temperature[0x10];
+
+	u8         tee[0x2];
+	u8         reserved_at_62[0xe];
+	u8         temp_threshold_hi[0x10];
+
+	u8         reserved_at_80[0x10];
+	u8         temp_threshold_lo[0x10];
+
+	u8         reserved_at_a0[0x20];
+
+	u8         sensor_name_hi[0x20];
+	u8         sensor_name_lo[0x20];
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -10931,6 +10956,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_mfrl_reg_bits mfrl_reg;
 	struct mlx5_ifc_mtutc_reg_bits mtutc_reg;
 	struct mlx5_ifc_mrtc_reg_bits mrtc_reg;
+	struct mlx5_ifc_mtmp_reg_bits mtmp_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From 028522e2844393abc44f7bd7477eb4a455f01579 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 13 Mar 2023 22:42:29 -0700
Subject: net/mlx5: Move needed PTYS functions to core layer

Downstream patches require devlink params to access the PTYS register,
move the needed functions from mlx5e to the core layer.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Link: https://lore.kernel.org/r/20230314054234.267365-11-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/params.c    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c  | 157 +--------------------
 drivers/net/ethernet/mellanox/mlx5/core/en/port.h  |  14 --
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  12 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 151 ++++++++++++++++++++
 include/linux/mlx5/port.h                          |  16 +++
 8 files changed, 179 insertions(+), 177 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index a21bd1179477..561da78d3b5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -553,7 +553,7 @@ bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 	u32 link_speed = 0;
 	u32 pci_bw = 0;
 
-	mlx5e_port_max_linkspeed(mdev, &link_speed);
+	mlx5_port_max_linkspeed(mdev, &link_speed);
 	pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);
 	mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n",
 			   link_speed, pci_bw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
index 505ba41195b9..dbe2b19a9570 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -32,101 +32,6 @@
 
 #include "port.h"
 
-/* speed in units of 1Mb */
-static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
-	[MLX5E_1000BASE_CX_SGMII] = 1000,
-	[MLX5E_1000BASE_KX]       = 1000,
-	[MLX5E_10GBASE_CX4]       = 10000,
-	[MLX5E_10GBASE_KX4]       = 10000,
-	[MLX5E_10GBASE_KR]        = 10000,
-	[MLX5E_20GBASE_KR2]       = 20000,
-	[MLX5E_40GBASE_CR4]       = 40000,
-	[MLX5E_40GBASE_KR4]       = 40000,
-	[MLX5E_56GBASE_R4]        = 56000,
-	[MLX5E_10GBASE_CR]        = 10000,
-	[MLX5E_10GBASE_SR]        = 10000,
-	[MLX5E_10GBASE_ER]        = 10000,
-	[MLX5E_40GBASE_SR4]       = 40000,
-	[MLX5E_40GBASE_LR4]       = 40000,
-	[MLX5E_50GBASE_SR2]       = 50000,
-	[MLX5E_100GBASE_CR4]      = 100000,
-	[MLX5E_100GBASE_SR4]      = 100000,
-	[MLX5E_100GBASE_KR4]      = 100000,
-	[MLX5E_100GBASE_LR4]      = 100000,
-	[MLX5E_100BASE_TX]        = 100,
-	[MLX5E_1000BASE_T]        = 1000,
-	[MLX5E_10GBASE_T]         = 10000,
-	[MLX5E_25GBASE_CR]        = 25000,
-	[MLX5E_25GBASE_KR]        = 25000,
-	[MLX5E_25GBASE_SR]        = 25000,
-	[MLX5E_50GBASE_CR2]       = 50000,
-	[MLX5E_50GBASE_KR2]       = 50000,
-};
-
-static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
-	[MLX5E_SGMII_100M]			= 100,
-	[MLX5E_1000BASE_X_SGMII]		= 1000,
-	[MLX5E_5GBASE_R]			= 5000,
-	[MLX5E_10GBASE_XFI_XAUI_1]		= 10000,
-	[MLX5E_40GBASE_XLAUI_4_XLPPI_4]		= 40000,
-	[MLX5E_25GAUI_1_25GBASE_CR_KR]		= 25000,
-	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2]	= 50000,
-	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR]	= 50000,
-	[MLX5E_CAUI_4_100GBASE_CR4_KR4]		= 100000,
-	[MLX5E_100GAUI_2_100GBASE_CR2_KR2]	= 100000,
-	[MLX5E_200GAUI_4_200GBASE_CR4_KR4]	= 200000,
-	[MLX5E_400GAUI_8]			= 400000,
-	[MLX5E_100GAUI_1_100GBASE_CR_KR]	= 100000,
-	[MLX5E_200GAUI_2_200GBASE_CR2_KR2]	= 200000,
-	[MLX5E_400GAUI_4_400GBASE_CR4_KR4]	= 400000,
-};
-
-bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev)
-{
-	struct mlx5e_port_eth_proto eproto;
-	int err;
-
-	if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet))
-		return true;
-
-	err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto);
-	if (err)
-		return false;
-
-	return !!eproto.cap;
-}
-
-static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev,
-				     const u32 **arr, u32 *size,
-				     bool force_legacy)
-{
-	bool ext = force_legacy ? false : mlx5e_ptys_ext_supported(mdev);
-
-	*size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) :
-		      ARRAY_SIZE(mlx5e_link_speed);
-	*arr  = ext ? mlx5e_ext_link_speed : mlx5e_link_speed;
-}
-
-int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
-			      struct mlx5e_port_eth_proto *eproto)
-{
-	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
-	int err;
-
-	if (!eproto)
-		return -EINVAL;
-
-	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port);
-	if (err)
-		return err;
-
-	eproto->cap   = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
-					   eth_proto_capability);
-	eproto->admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_admin);
-	eproto->oper  = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
-	return 0;
-}
-
 void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status,
 				 u8 *an_disable_cap, u8 *an_disable_admin)
 {
@@ -172,30 +77,14 @@ int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			    sizeof(out), MLX5_REG_PTYS, 0, 1);
 }
 
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
-			  bool force_legacy)
-{
-	unsigned long temp = eth_proto_oper;
-	const u32 *table;
-	u32 speed = 0;
-	u32 max_size;
-	int i;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
-	i = find_first_bit(&temp, max_size);
-	if (i < max_size)
-		speed = table[i];
-	return speed;
-}
-
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
-	struct mlx5e_port_eth_proto eproto;
+	struct mlx5_port_eth_proto eproto;
 	bool force_legacy = false;
 	bool ext;
 	int err;
 
-	ext = mlx5e_ptys_ext_supported(mdev);
+	ext = mlx5_ptys_ext_supported(mdev);
 	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
 	if (err)
 		goto out;
@@ -205,7 +94,7 @@ int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 		if (err)
 			goto out;
 	}
-	*speed = mlx5e_port_ptys2speed(mdev, eproto.oper, force_legacy);
+	*speed = mlx5_port_ptys2speed(mdev, eproto.oper, force_legacy);
 	if (!(*speed))
 		err = -EINVAL;
 
@@ -213,46 +102,6 @@ out:
 	return err;
 }
 
-int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
-{
-	struct mlx5e_port_eth_proto eproto;
-	u32 max_speed = 0;
-	const u32 *table;
-	u32 max_size;
-	bool ext;
-	int err;
-	int i;
-
-	ext = mlx5e_ptys_ext_supported(mdev);
-	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
-	if (err)
-		return err;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, false);
-	for (i = 0; i < max_size; ++i)
-		if (eproto.cap & MLX5E_PROT_MASK(i))
-			max_speed = max(max_speed, table[i]);
-
-	*speed = max_speed;
-	return 0;
-}
-
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
-			       bool force_legacy)
-{
-	u32 link_modes = 0;
-	const u32 *table;
-	u32 max_size;
-	int i;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
-	for (i = 0; i < max_size; ++i) {
-		if (table[i] == speed)
-			link_modes |= MLX5E_PROT_MASK(i);
-	}
-	return link_modes;
-}
-
 int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out)
 {
 	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
index 3f474e370828..d1da225f35da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -36,25 +36,11 @@
 #include <linux/mlx5/driver.h>
 #include "en.h"
 
-struct mlx5e_port_eth_proto {
-	u32 cap;
-	u32 admin;
-	u32 oper;
-};
-
-int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
-			      struct mlx5e_port_eth_proto *eproto);
 void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status,
 				 u8 *an_disable_cap, u8 *an_disable_admin);
 int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			   u32 proto_admin, bool ext);
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
-			  bool force_legacy);
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
-			       bool force_legacy);
-bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev);
 int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out);
 int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in);
 int mlx5e_port_query_sbpr(struct mlx5_core_dev *mdev, u32 desc, u8 dir,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 7708acc9b2ab..53c35147f29b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -220,7 +220,7 @@ static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev,
 					struct ptys2ethtool_config **arr,
 					u32 *size)
 {
-	bool ext = mlx5e_ptys_ext_supported(mdev);
+	bool ext = mlx5_ptys_ext_supported(mdev);
 
 	*arr = ext ? ptys2ext_ethtool_table : ptys2legacy_ethtool_table;
 	*size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) :
@@ -895,7 +895,7 @@ static void get_speed_duplex(struct net_device *netdev,
 	if (!netif_carrier_ok(netdev))
 		goto out;
 
-	speed = mlx5e_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy);
+	speed = mlx5_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy);
 	if (!speed) {
 		if (data_rate_oper)
 			speed = 100 * data_rate_oper;
@@ -980,7 +980,7 @@ static void get_lp_advertising(struct mlx5_core_dev *mdev, u32 eth_proto_lp,
 			       struct ethtool_link_ksettings *link_ksettings)
 {
 	unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising;
-	bool ext = mlx5e_ptys_ext_supported(mdev);
+	bool ext = mlx5_ptys_ext_supported(mdev);
 
 	ptys2ethtool_adver_link(lp_advertising, eth_proto_lp, ext);
 }
@@ -1160,7 +1160,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 				     const struct ethtool_link_ksettings *link_ksettings)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_port_eth_proto eproto;
+	struct mlx5_port_eth_proto eproto;
 	const unsigned long *adver;
 	bool an_changes = false;
 	u8 an_disable_admin;
@@ -1180,7 +1180,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 	autoneg = link_ksettings->base.autoneg;
 	speed = link_ksettings->base.speed;
 
-	ext_supported = mlx5e_ptys_ext_supported(mdev);
+	ext_supported = mlx5_ptys_ext_supported(mdev);
 	ext = ext_requested(autoneg, adver, ext_supported);
 	if (!ext_supported && ext)
 		return -EOPNOTSUPP;
@@ -1194,7 +1194,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 		goto out;
 	}
 	link_modes = autoneg == AUTONEG_ENABLE ? ethtool2ptys_adver_func(adver) :
-		mlx5e_port_speed2linkmodes(mdev, speed, !ext);
+		mlx5_port_speed2linkmodes(mdev, speed, !ext);
 
 	err = mlx5e_speed_validate(priv->netdev, ext, link_modes, autoneg);
 	if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 70b8d2dfa751..79dd8ad5ede7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1108,7 +1108,7 @@ mlx5e_hairpin_params_init(struct mlx5e_hairpin_params *hairpin_params,
 
 	hairpin_params->mdev = mdev;
 	/* set hairpin pair per each 50Gbs share of the link */
-	mlx5e_port_max_linkspeed(mdev, &link_speed);
+	mlx5_port_max_linkspeed(mdev, &link_speed);
 	link_speed = max_t(u32, link_speed, 50000);
 	link_speed64 = link_speed;
 	do_div(link_speed64, 50000);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 75015d370922..7c79476cc5f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -744,7 +744,7 @@ static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char *
 	u64 value;
 	int err;
 
-	err = mlx5e_port_max_linkspeed(mdev, &link_speed_max);
+	err = mlx5_port_max_linkspeed(mdev, &link_speed_max);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed to get link maximum speed");
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index a1548e6bfb35..0daeb4b72cca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1054,3 +1054,154 @@ out:
 	kfree(out);
 	return err;
 }
+
+/* speed in units of 1Mb */
+static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
+	[MLX5E_1000BASE_CX_SGMII] = 1000,
+	[MLX5E_1000BASE_KX]       = 1000,
+	[MLX5E_10GBASE_CX4]       = 10000,
+	[MLX5E_10GBASE_KX4]       = 10000,
+	[MLX5E_10GBASE_KR]        = 10000,
+	[MLX5E_20GBASE_KR2]       = 20000,
+	[MLX5E_40GBASE_CR4]       = 40000,
+	[MLX5E_40GBASE_KR4]       = 40000,
+	[MLX5E_56GBASE_R4]        = 56000,
+	[MLX5E_10GBASE_CR]        = 10000,
+	[MLX5E_10GBASE_SR]        = 10000,
+	[MLX5E_10GBASE_ER]        = 10000,
+	[MLX5E_40GBASE_SR4]       = 40000,
+	[MLX5E_40GBASE_LR4]       = 40000,
+	[MLX5E_50GBASE_SR2]       = 50000,
+	[MLX5E_100GBASE_CR4]      = 100000,
+	[MLX5E_100GBASE_SR4]      = 100000,
+	[MLX5E_100GBASE_KR4]      = 100000,
+	[MLX5E_100GBASE_LR4]      = 100000,
+	[MLX5E_100BASE_TX]        = 100,
+	[MLX5E_1000BASE_T]        = 1000,
+	[MLX5E_10GBASE_T]         = 10000,
+	[MLX5E_25GBASE_CR]        = 25000,
+	[MLX5E_25GBASE_KR]        = 25000,
+	[MLX5E_25GBASE_SR]        = 25000,
+	[MLX5E_50GBASE_CR2]       = 50000,
+	[MLX5E_50GBASE_KR2]       = 50000,
+};
+
+static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
+	[MLX5E_SGMII_100M] = 100,
+	[MLX5E_1000BASE_X_SGMII] = 1000,
+	[MLX5E_5GBASE_R] = 5000,
+	[MLX5E_10GBASE_XFI_XAUI_1] = 10000,
+	[MLX5E_40GBASE_XLAUI_4_XLPPI_4] = 40000,
+	[MLX5E_25GAUI_1_25GBASE_CR_KR] = 25000,
+	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2] = 50000,
+	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR] = 50000,
+	[MLX5E_CAUI_4_100GBASE_CR4_KR4] = 100000,
+	[MLX5E_100GAUI_2_100GBASE_CR2_KR2] = 100000,
+	[MLX5E_200GAUI_4_200GBASE_CR4_KR4] = 200000,
+	[MLX5E_400GAUI_8] = 400000,
+	[MLX5E_100GAUI_1_100GBASE_CR_KR] = 100000,
+	[MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000,
+	[MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000,
+};
+
+int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
+			      struct mlx5_port_eth_proto *eproto)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	if (!eproto)
+		return -EINVAL;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port);
+	if (err)
+		return err;
+
+	eproto->cap   = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
+					   eth_proto_capability);
+	eproto->admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_admin);
+	eproto->oper  = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
+	return 0;
+}
+
+bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_port_eth_proto eproto;
+	int err;
+
+	if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet))
+		return true;
+
+	err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto);
+	if (err)
+		return false;
+
+	return !!eproto.cap;
+}
+
+static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev,
+				     const u32 **arr, u32 *size,
+				     bool force_legacy)
+{
+	bool ext = force_legacy ? false : mlx5_ptys_ext_supported(mdev);
+
+	*size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) :
+		      ARRAY_SIZE(mlx5e_link_speed);
+	*arr  = ext ? mlx5e_ext_link_speed : mlx5e_link_speed;
+}
+
+u32 mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			 bool force_legacy)
+{
+	unsigned long temp = eth_proto_oper;
+	const u32 *table;
+	u32 speed = 0;
+	u32 max_size;
+	int i;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
+	i = find_first_bit(&temp, max_size);
+	if (i < max_size)
+		speed = table[i];
+	return speed;
+}
+
+u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			      bool force_legacy)
+{
+	u32 link_modes = 0;
+	const u32 *table;
+	u32 max_size;
+	int i;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
+	for (i = 0; i < max_size; ++i) {
+		if (table[i] == speed)
+			link_modes |= MLX5E_PROT_MASK(i);
+	}
+	return link_modes;
+}
+
+int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	struct mlx5_port_eth_proto eproto;
+	u32 max_speed = 0;
+	const u32 *table;
+	u32 max_size;
+	bool ext;
+	int err;
+	int i;
+
+	ext = mlx5_ptys_ext_supported(mdev);
+	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+	if (err)
+		return err;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, false);
+	for (i = 0; i < max_size; ++i)
+		if (eproto.cap & MLX5E_PROT_MASK(i))
+			max_speed = max(max_speed, table[i]);
+
+	*speed = max_speed;
+	return 0;
+}
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index e96ee1e348cb..98b2e1e149f9 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -141,6 +141,12 @@ enum mlx5_ptys_width {
 	MLX5_PTYS_WIDTH_12X	= 1 << 4,
 };
 
+struct mlx5_port_eth_proto {
+	u32 cap;
+	u32 admin;
+	u32 oper;
+};
+
 #define MLX5E_PROT_MASK(link_mode) (1U << link_mode)
 #define MLX5_GET_ETH_PROTO(reg, out, ext, field)	\
 	(ext ? MLX5_GET(reg, out, ext_##field) :	\
@@ -218,4 +224,14 @@ int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state);
 int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state);
 int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio);
 int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio);
+
+int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
+			      struct mlx5_port_eth_proto *eproto);
+bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev);
+u32 mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			 bool force_legacy);
+u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			      bool force_legacy);
+int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 96ec2d9868c4afd3cea17f4ee18b84ccc1cde20f Mon Sep 17 00:00:00 2001
From: Orlando Chamberlain <orlandoch.dev@gmail.com>
Date: Fri, 3 Mar 2023 22:28:41 +1100
Subject: platform/x86: apple-gmux: refactor gmux types

Add apple_gmux_config struct containing operations and data specific to
each mux type.

This is in preparation for adding a third, MMIO based, gmux type.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Orlando Chamberlain <orlandoch.dev@gmail.com>
Link: https://lore.kernel.org/r/20230303112842.3094-3-orlandoch.dev@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/apple-gmux.c | 93 ++++++++++++++++++++++++++-------------
 include/linux/apple-gmux.h        | 18 +++++---
 2 files changed, 74 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index ec99e05e532c..36208e93d745 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -5,6 +5,7 @@
  *  Copyright (C) Canonical Ltd. <seth.forshee@canonical.com>
  *  Copyright (C) 2010-2012 Andreas Heider <andreas@meetr.de>
  *  Copyright (C) 2015 Lukas Wunner <lukas@wunner.de>
+ *  Copyright (C) 2023 Orlando Chamberlain <orlandoch.dev@gmail.com>
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -43,10 +44,12 @@
  *     http://www.renesas.com/products/mpumcu/h8s/h8s2100/h8s2113/index.jsp
  */
 
+struct apple_gmux_config;
+
 struct apple_gmux_data {
 	unsigned long iostart;
 	unsigned long iolen;
-	bool indexed;
+	const struct apple_gmux_config *config;
 	struct mutex index_lock;
 
 	struct backlight_device *bdev;
@@ -64,6 +67,18 @@ struct apple_gmux_data {
 
 static struct apple_gmux_data *apple_gmux_data;
 
+struct apple_gmux_config {
+	u8 (*read8)(struct apple_gmux_data *gmux_data, int port);
+	void (*write8)(struct apple_gmux_data *gmux_data, int port, u8 val);
+	u32 (*read32)(struct apple_gmux_data *gmux_data, int port);
+	void (*write32)(struct apple_gmux_data *gmux_data, int port, u32 val);
+	const struct vga_switcheroo_handler *gmux_handler;
+	enum vga_switcheroo_handler_flags_t handler_flags;
+	unsigned long resource_type;
+	bool read_version_as_u32;
+	char *name;
+};
+
 #define GMUX_INTERRUPT_ENABLE		0xff
 #define GMUX_INTERRUPT_DISABLE		0x00
 
@@ -195,35 +210,23 @@ static void gmux_index_write32(struct apple_gmux_data *gmux_data, int port,
 
 static u8 gmux_read8(struct apple_gmux_data *gmux_data, int port)
 {
-	if (gmux_data->indexed)
-		return gmux_index_read8(gmux_data, port);
-	else
-		return gmux_pio_read8(gmux_data, port);
+	return gmux_data->config->read8(gmux_data, port);
 }
 
 static void gmux_write8(struct apple_gmux_data *gmux_data, int port, u8 val)
 {
-	if (gmux_data->indexed)
-		gmux_index_write8(gmux_data, port, val);
-	else
-		gmux_pio_write8(gmux_data, port, val);
+	return gmux_data->config->write8(gmux_data, port, val);
 }
 
 static u32 gmux_read32(struct apple_gmux_data *gmux_data, int port)
 {
-	if (gmux_data->indexed)
-		return gmux_index_read32(gmux_data, port);
-	else
-		return gmux_pio_read32(gmux_data, port);
+	return gmux_data->config->read32(gmux_data, port);
 }
 
 static void gmux_write32(struct apple_gmux_data *gmux_data, int port,
 			     u32 val)
 {
-	if (gmux_data->indexed)
-		gmux_index_write32(gmux_data, port, val);
-	else
-		gmux_pio_write32(gmux_data, port, val);
+	return gmux_data->config->write32(gmux_data, port, val);
 }
 
 /**
@@ -463,19 +466,43 @@ static enum vga_switcheroo_client_id gmux_get_client_id(struct pci_dev *pdev)
 		return VGA_SWITCHEROO_DIS;
 }
 
-static const struct vga_switcheroo_handler gmux_handler_indexed = {
+static const struct vga_switcheroo_handler gmux_handler_no_ddc = {
 	.switchto = gmux_switchto,
 	.power_state = gmux_set_power_state,
 	.get_client_id = gmux_get_client_id,
 };
 
-static const struct vga_switcheroo_handler gmux_handler_classic = {
+static const struct vga_switcheroo_handler gmux_handler_ddc = {
 	.switchto = gmux_switchto,
 	.switch_ddc = gmux_switch_ddc,
 	.power_state = gmux_set_power_state,
 	.get_client_id = gmux_get_client_id,
 };
 
+static const struct apple_gmux_config apple_gmux_pio = {
+	.read8 = &gmux_pio_read8,
+	.write8 = &gmux_pio_write8,
+	.read32 = &gmux_pio_read32,
+	.write32 = &gmux_pio_write32,
+	.gmux_handler = &gmux_handler_ddc,
+	.handler_flags = VGA_SWITCHEROO_CAN_SWITCH_DDC,
+	.resource_type = IORESOURCE_IO,
+	.read_version_as_u32 = false,
+	.name = "classic"
+};
+
+static const struct apple_gmux_config apple_gmux_index = {
+	.read8 = &gmux_index_read8,
+	.write8 = &gmux_index_write8,
+	.read32 = &gmux_index_read32,
+	.write32 = &gmux_index_write32,
+	.gmux_handler = &gmux_handler_no_ddc,
+	.handler_flags = VGA_SWITCHEROO_NEEDS_EDP_CONFIG,
+	.resource_type = IORESOURCE_IO,
+	.read_version_as_u32 = true,
+	.name = "indexed"
+};
+
 /**
  * DOC: Interrupt
  *
@@ -565,13 +592,13 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 	int ret = -ENXIO;
 	acpi_status status;
 	unsigned long long gpe;
-	bool indexed = false;
+	enum apple_gmux_type type;
 	u32 version;
 
 	if (apple_gmux_data)
 		return -EBUSY;
 
-	if (!apple_gmux_detect(pnp, &indexed)) {
+	if (!apple_gmux_detect(pnp, &type)) {
 		pr_info("gmux device not present\n");
 		return -ENODEV;
 	}
@@ -581,6 +608,16 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 		return -ENOMEM;
 	pnp_set_drvdata(pnp, gmux_data);
 
+	switch (type) {
+	case APPLE_GMUX_TYPE_INDEXED:
+		gmux_data->config = &apple_gmux_index;
+		mutex_init(&gmux_data->index_lock);
+		break;
+	case APPLE_GMUX_TYPE_PIO:
+		gmux_data->config = &apple_gmux_pio;
+		break;
+	}
+
 	res = pnp_get_resource(pnp, IORESOURCE_IO, 0);
 	gmux_data->iostart = res->start;
 	gmux_data->iolen = resource_size(res);
@@ -591,9 +628,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 		goto err_free;
 	}
 
-	if (indexed) {
-		mutex_init(&gmux_data->index_lock);
-		gmux_data->indexed = true;
+	if (gmux_data->config->read_version_as_u32) {
 		version = gmux_read32(gmux_data, GMUX_PORT_VERSION_MAJOR);
 		ver_major = (version >> 24) & 0xff;
 		ver_minor = (version >> 16) & 0xff;
@@ -604,7 +639,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 		ver_release = gmux_read8(gmux_data, GMUX_PORT_VERSION_RELEASE);
 	}
 	pr_info("Found gmux version %d.%d.%d [%s]\n", ver_major, ver_minor,
-		ver_release, (gmux_data->indexed ? "indexed" : "classic"));
+		ver_release, gmux_data->config->name);
 
 	memset(&props, 0, sizeof(props));
 	props.type = BACKLIGHT_PLATFORM;
@@ -694,12 +729,8 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 	 *
 	 * Pre-retina MacBook Pros can switch the panel's DDC separately.
 	 */
-	if (gmux_data->indexed)
-		ret = vga_switcheroo_register_handler(&gmux_handler_indexed,
-					      VGA_SWITCHEROO_NEEDS_EDP_CONFIG);
-	else
-		ret = vga_switcheroo_register_handler(&gmux_handler_classic,
-					      VGA_SWITCHEROO_CAN_SWITCH_DDC);
+	ret = vga_switcheroo_register_handler(gmux_data->config->gmux_handler,
+			gmux_data->config->handler_flags);
 	if (ret) {
 		pr_err("Failed to register vga_switcheroo handler\n");
 		goto err_register_handler;
diff --git a/include/linux/apple-gmux.h b/include/linux/apple-gmux.h
index 1f68b49bcd68..147dc1c52e08 100644
--- a/include/linux/apple-gmux.h
+++ b/include/linux/apple-gmux.h
@@ -36,6 +36,11 @@
 
 #define GMUX_MIN_IO_LEN			(GMUX_PORT_BRIGHTNESS + 4)
 
+enum apple_gmux_type {
+	APPLE_GMUX_TYPE_PIO,
+	APPLE_GMUX_TYPE_INDEXED,
+};
+
 #if IS_ENABLED(CONFIG_APPLE_GMUX)
 static inline bool apple_gmux_is_indexed(unsigned long iostart)
 {
@@ -65,13 +70,13 @@ static inline bool apple_gmux_is_indexed(unsigned long iostart)
  * Return: %true if a supported gmux ACPI device is detected and the kernel
  * was configured with CONFIG_APPLE_GMUX, %false otherwise.
  */
-static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, bool *indexed_ret)
+static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, enum apple_gmux_type *type_ret)
 {
 	u8 ver_major, ver_minor, ver_release;
 	struct device *dev = NULL;
 	struct acpi_device *adev;
 	struct resource *res;
-	bool indexed = false;
+	enum apple_gmux_type type = APPLE_GMUX_TYPE_PIO;
 	bool ret = false;
 
 	if (!pnp_dev) {
@@ -99,13 +104,14 @@ static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, bool *indexed_ret)
 	ver_minor = inb(res->start + GMUX_PORT_VERSION_MINOR);
 	ver_release = inb(res->start + GMUX_PORT_VERSION_RELEASE);
 	if (ver_major == 0xff && ver_minor == 0xff && ver_release == 0xff) {
-		indexed = apple_gmux_is_indexed(res->start);
-		if (!indexed)
+		if (apple_gmux_is_indexed(res->start))
+			type = APPLE_GMUX_TYPE_INDEXED;
+		else
 			goto out;
 	}
 
-	if (indexed_ret)
-		*indexed_ret = indexed;
+	if (type_ret)
+		*type_ret = type;
 
 	ret = true;
 out:
-- 
cgit v1.2.3


From 0c18184de990e63f708b090bcb9fc6c0fbc427cd Mon Sep 17 00:00:00 2001
From: Orlando Chamberlain <orlandoch.dev@gmail.com>
Date: Fri, 3 Mar 2023 22:28:42 +1100
Subject: platform/x86: apple-gmux: support MMIO gmux on T2 Macs

In some newer dual gpu MacBooks, the T2 Coprocessor functions as the
gmux, and the Intel side can interract with this new gmux type through
MMIO. Add support for these gmux controllers to the apple-gmux driver.

We start using the GMSP(0) acpi method on these gmux's when clearing
interrupts, as this prevents a flood of status=0 interrupts that can't
be cleared. It's unknown if this helps or hinders older gmux types, so
it isn't enabled for those.

Interestingly, the ACPI table only allocates 8 bytes for GMUX, but we
actually need 16, and as such we request 16 with request_mem_region.

Reading and writing from ports:
    16 bytes from 0xfe0b0200 are used. 0x0 to 0x4 are where data
    to read appears, and where data to write goes. Writing to 0xe
    sets the gmux port being accessed, and writing to 0xf sends commands.

    These commands are 0x40 & data_length for write, and data_length for
    read, where data_length is 1, 2 or 4. Once byte base+0xf is 0, the
    command is done.

Issues:
    As with other retina models, we can't switch DDC lines so
    switching at runtime doesn't work if the inactive gpu driver
    already disabled eDP due to it not being connected when that
    driver loaded.

    Additionally, turning on the dgpu back on on the MacBookPro16,1 does
    not work.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Orlando Chamberlain <orlandoch.dev@gmail.com>
Link: https://lore.kernel.org/r/20230303112842.3094-4-orlandoch.dev@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/apple-gmux.c | 170 +++++++++++++++++++++++++++++++++++---
 include/linux/apple-gmux.h        |  56 ++++++++++---
 2 files changed, 200 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index 36208e93d745..59c4ee8109dd 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -28,25 +28,35 @@
  * DOC: Overview
  *
  * gmux is a microcontroller built into the MacBook Pro to support dual GPUs:
- * A `Lattice XP2`_ on pre-retinas, a `Renesas R4F2113`_ on retinas.
+ * A `Lattice XP2`_ on pre-retinas, a `Renesas R4F2113`_ on pre-T2 retinas.
+ *
+ * On T2 Macbooks, the gmux is part of the T2 Coprocessor's SMC. The SMC has
+ * an I2C connection to a `NXP PCAL6524` GPIO expander, which enables/disables
+ * the voltage regulators of the discrete GPU, drives the display panel power,
+ * and has a GPIO to switch the eDP mux. The Intel CPU can interact with
+ * gmux through MMIO, similar to how the main SMC interface is controlled.
  *
  * (The MacPro6,1 2013 also has a gmux, however it is unclear why since it has
  * dual GPUs but no built-in display.)
  *
  * gmux is connected to the LPC bus of the southbridge. Its I/O ports are
  * accessed differently depending on the microcontroller: Driver functions
- * to access a pre-retina gmux are infixed ``_pio_``, those for a retina gmux
- * are infixed ``_index_``.
+ * to access a pre-retina gmux are infixed ``_pio_``, those for a pre-T2
+ * retina gmux are infixed ``_index_``, and those on T2 Macs are infixed
+ * with ``_mmio_``.
  *
  * .. _Lattice XP2:
  *     http://www.latticesemi.com/en/Products/FPGAandCPLD/LatticeXP2.aspx
  * .. _Renesas R4F2113:
  *     http://www.renesas.com/products/mpumcu/h8s/h8s2100/h8s2113/index.jsp
+ * .. _NXP PCAL6524:
+ *     https://www.nxp.com/docs/en/data-sheet/PCAL6524.pdf
  */
 
 struct apple_gmux_config;
 
 struct apple_gmux_data {
+	u8 *__iomem iomem_base;
 	unsigned long iostart;
 	unsigned long iolen;
 	const struct apple_gmux_config *config;
@@ -208,6 +218,79 @@ static void gmux_index_write32(struct apple_gmux_data *gmux_data, int port,
 	mutex_unlock(&gmux_data->index_lock);
 }
 
+static int gmux_mmio_wait(struct apple_gmux_data *gmux_data)
+{
+	int i = 200;
+	u8 gwr = ioread8(gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+
+	while (i && gwr) {
+		gwr = ioread8(gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+		udelay(100);
+		i--;
+	}
+
+	return !!i;
+}
+
+static u8 gmux_mmio_read8(struct apple_gmux_data *gmux_data, int port)
+{
+	u8 val;
+
+	mutex_lock(&gmux_data->index_lock);
+	gmux_mmio_wait(gmux_data);
+	iowrite8((port & 0xff), gmux_data->iomem_base + GMUX_MMIO_PORT_SELECT);
+	iowrite8(GMUX_MMIO_READ | sizeof(val),
+		gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+	gmux_mmio_wait(gmux_data);
+	val = ioread8(gmux_data->iomem_base);
+	mutex_unlock(&gmux_data->index_lock);
+
+	return val;
+}
+
+static void gmux_mmio_write8(struct apple_gmux_data *gmux_data, int port,
+			      u8 val)
+{
+	mutex_lock(&gmux_data->index_lock);
+	gmux_mmio_wait(gmux_data);
+	iowrite8(val, gmux_data->iomem_base);
+
+	iowrite8(port & 0xff, gmux_data->iomem_base + GMUX_MMIO_PORT_SELECT);
+	iowrite8(GMUX_MMIO_WRITE | sizeof(val),
+		gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+
+	gmux_mmio_wait(gmux_data);
+	mutex_unlock(&gmux_data->index_lock);
+}
+
+static u32 gmux_mmio_read32(struct apple_gmux_data *gmux_data, int port)
+{
+	u32 val;
+
+	mutex_lock(&gmux_data->index_lock);
+	gmux_mmio_wait(gmux_data);
+	iowrite8((port & 0xff), gmux_data->iomem_base + GMUX_MMIO_PORT_SELECT);
+	iowrite8(GMUX_MMIO_READ | sizeof(val),
+		gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+	gmux_mmio_wait(gmux_data);
+	val = be32_to_cpu(ioread32(gmux_data->iomem_base));
+	mutex_unlock(&gmux_data->index_lock);
+
+	return val;
+}
+
+static void gmux_mmio_write32(struct apple_gmux_data *gmux_data, int port,
+			       u32 val)
+{
+	mutex_lock(&gmux_data->index_lock);
+	iowrite32(cpu_to_be32(val), gmux_data->iomem_base);
+	iowrite8(port & 0xff, gmux_data->iomem_base + GMUX_MMIO_PORT_SELECT);
+	iowrite8(GMUX_MMIO_WRITE | sizeof(val),
+		gmux_data->iomem_base + GMUX_MMIO_COMMAND_SEND);
+	gmux_mmio_wait(gmux_data);
+	mutex_unlock(&gmux_data->index_lock);
+}
+
 static u8 gmux_read8(struct apple_gmux_data *gmux_data, int port)
 {
 	return gmux_data->config->read8(gmux_data, port);
@@ -236,8 +319,8 @@ static void gmux_write32(struct apple_gmux_data *gmux_data, int port,
  * the GPU. On dual GPU MacBook Pros by contrast, either GPU may be suspended
  * to conserve energy. Hence the PWM signal needs to be generated by a separate
  * backlight driver which is controlled by gmux. The earliest generation
- * MBP5 2008/09 uses a `TI LP8543`_ backlight driver. All newer models
- * use a `TI LP8545`_.
+ * MBP5 2008/09 uses a `TI LP8543`_ backlight driver. Newer models
+ * use a `TI LP8545`_ or a TI LP8548.
  *
  * .. _TI LP8543: https://www.ti.com/lit/ds/symlink/lp8543.pdf
  * .. _TI LP8545: https://www.ti.com/lit/ds/symlink/lp8545.pdf
@@ -301,8 +384,8 @@ static const struct backlight_ops gmux_bl_ops = {
  * connecting it either to the discrete GPU or the Thunderbolt controller.
  * Oddly enough, while the full port is no longer switchable, AUX and HPD
  * are still switchable by way of an `NXP CBTL03062`_ (on pre-retinas
- * MBP8 2011 and MBP9 2012) or two `TI TS3DS10224`_ (on retinas) under the
- * control of gmux. Since the integrated GPU is missing the main link,
+ * MBP8 2011 and MBP9 2012) or two `TI TS3DS10224`_ (on pre-t2 retinas) under
+ * the control of gmux. Since the integrated GPU is missing the main link,
  * external displays appear to it as phantoms which fail to link-train.
  *
  * gmux receives the HPD signal of all display connectors and sends an
@@ -503,14 +586,42 @@ static const struct apple_gmux_config apple_gmux_index = {
 	.name = "indexed"
 };
 
+static const struct apple_gmux_config apple_gmux_mmio = {
+	.read8 = &gmux_mmio_read8,
+	.write8 = &gmux_mmio_write8,
+	.read32 = &gmux_mmio_read32,
+	.write32 = &gmux_mmio_write32,
+	.gmux_handler = &gmux_handler_no_ddc,
+	.handler_flags = VGA_SWITCHEROO_NEEDS_EDP_CONFIG,
+	.resource_type = IORESOURCE_MEM,
+	.read_version_as_u32 = true,
+	.name = "T2"
+};
+
+
 /**
  * DOC: Interrupt
  *
  * gmux is also connected to a GPIO pin of the southbridge and thereby is able
- * to trigger an ACPI GPE. On the MBP5 2008/09 it's GPIO pin 22 of the Nvidia
- * MCP79, on all following generations it's GPIO pin 6 of the Intel PCH.
+ * to trigger an ACPI GPE. ACPI name GMGP holds this GPIO pin's number. On the
+ * MBP5 2008/09 it's GPIO pin 22 of the Nvidia MCP79, on following generations
+ * it's GPIO pin 6 of the Intel PCH, on MMIO gmux's it's pin 21.
+ *
  * The GPE merely signals that an interrupt occurred, the actual type of event
  * is identified by reading a gmux register.
+ *
+ * In addition to the GMGP name, gmux's ACPI device also has two methods GMSP
+ * and GMLV. GMLV likely means "GMUX Level", and reads the value of the GPIO,
+ * while GMSP likely means "GMUX Set Polarity", and seems to write to the GPIO's
+ * value. On newer Macbooks (This was introduced with or sometime before the
+ * MacBookPro14,3), the ACPI GPE method differentiates between the OS type: On
+ * Darwin, only a notification is signaled, whereas on other OSes, the GPIO's
+ * value is read and then inverted.
+ *
+ * Because Linux masquerades as Darwin, it ends up in the notification-only code
+ * path. On MMIO gmux's, this seems to lead to us being unable to clear interrupts,
+ * unless we call GMSP(0). Without this, there is a flood of status=0 interrupts
+ * that can't be cleared. This issue seems to be unique to MMIO gmux's.
  */
 
 static inline void gmux_disable_interrupts(struct apple_gmux_data *gmux_data)
@@ -537,6 +648,9 @@ static void gmux_clear_interrupts(struct apple_gmux_data *gmux_data)
 	/* to clear interrupts write back current status */
 	status = gmux_interrupt_get_status(gmux_data);
 	gmux_write8(gmux_data, GMUX_PORT_INTERRUPT_STATUS, status);
+	/* Prevent flood of status=0 interrupts */
+	if (gmux_data->config == &apple_gmux_mmio)
+		acpi_execute_simple_method(gmux_data->dhandle, "GMSP", 0);
 }
 
 static void gmux_notify_handler(acpi_handle device, u32 value, void *context)
@@ -609,6 +723,25 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 	pnp_set_drvdata(pnp, gmux_data);
 
 	switch (type) {
+	case APPLE_GMUX_TYPE_MMIO:
+		gmux_data->config = &apple_gmux_mmio;
+		mutex_init(&gmux_data->index_lock);
+
+		res = pnp_get_resource(pnp, IORESOURCE_MEM, 0);
+		gmux_data->iostart = res->start;
+		/* Although the ACPI table only allocates 8 bytes, we need 16. */
+		gmux_data->iolen = 16;
+		if (!request_mem_region(gmux_data->iostart, gmux_data->iolen,
+					"Apple gmux")) {
+			pr_err("gmux I/O already in use\n");
+			goto err_free;
+		}
+		gmux_data->iomem_base = ioremap(gmux_data->iostart, gmux_data->iolen);
+		if (!gmux_data->iomem_base) {
+			pr_err("couldn't remap gmux mmio region");
+			goto err_release;
+		}
+		goto get_version;
 	case APPLE_GMUX_TYPE_INDEXED:
 		gmux_data->config = &apple_gmux_index;
 		mutex_init(&gmux_data->index_lock);
@@ -628,6 +761,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 		goto err_free;
 	}
 
+get_version:
 	if (gmux_data->config->read_version_as_u32) {
 		version = gmux_read32(gmux_data, GMUX_PORT_VERSION_MAJOR);
 		ver_major = (version >> 24) & 0xff;
@@ -658,7 +792,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 					 gmux_data, &gmux_bl_ops, &props);
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
-		goto err_release;
+		goto err_unmap;
 	}
 
 	gmux_data->bdev = bdev;
@@ -725,7 +859,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 	/*
 	 * Retina MacBook Pros cannot switch the panel's AUX separately
 	 * and need eDP pre-calibration. They are distinguishable from
-	 * pre-retinas by having an "indexed" gmux.
+	 * pre-retinas by having an "indexed" or "T2" gmux.
 	 *
 	 * Pre-retina MacBook Pros can switch the panel's DDC separately.
 	 */
@@ -750,8 +884,14 @@ err_enable_gpe:
 					   &gmux_notify_handler);
 err_notify:
 	backlight_device_unregister(bdev);
+err_unmap:
+	if (gmux_data->iomem_base)
+		iounmap(gmux_data->iomem_base);
 err_release:
-	release_region(gmux_data->iostart, gmux_data->iolen);
+	if (gmux_data->config->resource_type == IORESOURCE_MEM)
+		release_mem_region(gmux_data->iostart, gmux_data->iolen);
+	else
+		release_region(gmux_data->iostart, gmux_data->iolen);
 err_free:
 	kfree(gmux_data);
 	return ret;
@@ -772,7 +912,11 @@ static void gmux_remove(struct pnp_dev *pnp)
 
 	backlight_device_unregister(gmux_data->bdev);
 
-	release_region(gmux_data->iostart, gmux_data->iolen);
+	if (gmux_data->iomem_base) {
+		iounmap(gmux_data->iomem_base);
+		release_mem_region(gmux_data->iostart, gmux_data->iolen);
+	} else
+		release_region(gmux_data->iostart, gmux_data->iolen);
 	apple_gmux_data = NULL;
 	kfree(gmux_data);
 
diff --git a/include/linux/apple-gmux.h b/include/linux/apple-gmux.h
index 147dc1c52e08..272f63f8fd7c 100644
--- a/include/linux/apple-gmux.h
+++ b/include/linux/apple-gmux.h
@@ -34,11 +34,18 @@
 #define GMUX_PORT_READ			0xd0
 #define GMUX_PORT_WRITE			0xd4
 
+#define GMUX_MMIO_PORT_SELECT		0x0e
+#define GMUX_MMIO_COMMAND_SEND		0x0f
+
+#define GMUX_MMIO_READ			0x00
+#define GMUX_MMIO_WRITE			0x40
+
 #define GMUX_MIN_IO_LEN			(GMUX_PORT_BRIGHTNESS + 4)
 
 enum apple_gmux_type {
 	APPLE_GMUX_TYPE_PIO,
 	APPLE_GMUX_TYPE_INDEXED,
+	APPLE_GMUX_TYPE_MMIO,
 };
 
 #if IS_ENABLED(CONFIG_APPLE_GMUX)
@@ -57,6 +64,24 @@ static inline bool apple_gmux_is_indexed(unsigned long iostart)
 	return false;
 }
 
+static inline bool apple_gmux_is_mmio(unsigned long iostart)
+{
+	u8 *__iomem iomem_base = ioremap(iostart, 16);
+	u8 val;
+
+	if (!iomem_base)
+		return false;
+
+	/*
+	 * If this is 0xff, then gmux must not be present, as the gmux would
+	 * reset it to 0x00, or it would be one of 0x1, 0x4, 0x41, 0x44 if a
+	 * command is currently being processed.
+	 */
+	val = ioread8(iomem_base + GMUX_MMIO_COMMAND_SEND);
+	iounmap(iomem_base);
+	return (val != 0xff);
+}
+
 /**
  * apple_gmux_detect() - detect if gmux is built into the machine
  *
@@ -93,19 +118,24 @@ static inline bool apple_gmux_detect(struct pnp_dev *pnp_dev, enum apple_gmux_ty
 	}
 
 	res = pnp_get_resource(pnp_dev, IORESOURCE_IO, 0);
-	if (!res || resource_size(res) < GMUX_MIN_IO_LEN)
-		goto out;
-
-	/*
-	 * Invalid version information may indicate either that the gmux
-	 * device isn't present or that it's a new one that uses indexed io.
-	 */
-	ver_major = inb(res->start + GMUX_PORT_VERSION_MAJOR);
-	ver_minor = inb(res->start + GMUX_PORT_VERSION_MINOR);
-	ver_release = inb(res->start + GMUX_PORT_VERSION_RELEASE);
-	if (ver_major == 0xff && ver_minor == 0xff && ver_release == 0xff) {
-		if (apple_gmux_is_indexed(res->start))
-			type = APPLE_GMUX_TYPE_INDEXED;
+	if (res && resource_size(res) >= GMUX_MIN_IO_LEN) {
+		/*
+		 * Invalid version information may indicate either that the gmux
+		 * device isn't present or that it's a new one that uses indexed io.
+		 */
+		ver_major = inb(res->start + GMUX_PORT_VERSION_MAJOR);
+		ver_minor = inb(res->start + GMUX_PORT_VERSION_MINOR);
+		ver_release = inb(res->start + GMUX_PORT_VERSION_RELEASE);
+		if (ver_major == 0xff && ver_minor == 0xff && ver_release == 0xff) {
+			if (apple_gmux_is_indexed(res->start))
+				type = APPLE_GMUX_TYPE_INDEXED;
+			else
+				goto out;
+		}
+	} else {
+		res = pnp_get_resource(pnp_dev, IORESOURCE_MEM, 0);
+		if (res && apple_gmux_is_mmio(res->start))
+			type = APPLE_GMUX_TYPE_MMIO;
 		else
 			goto out;
 	}
-- 
cgit v1.2.3


From b91e6107119f62be8b51b9955294be52c0b4fc80 Mon Sep 17 00:00:00 2001
From: Emanuele Ghidoli <emanuele.ghidoli@toradex.com>
Date: Mon, 13 Mar 2023 17:50:39 +0100
Subject: usb: misc: usb3503: support usb3803 and bypass mode

Add support for USB3803 and bypass mode, with this change
is also possible to move the component out of bypass mode.

In bypass mode the downstream port 3 is connected to the
upstream port with low switch resistance R_on.

Controlling mode of operations:

| RESET_N | BYPASS_N | Mode    |
--------------------------------
|    0    |    0     | standby |
|    1    |    0     | bypass  |
|    1    |    1     | hub     |

Datasheet: https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/DataSheets/00001691D.pdf
Signed-off-by: Emanuele Ghidoli <emanuele.ghidoli@toradex.com>
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Link: https://lore.kernel.org/r/20230313165039.255579-4-francesco@dolcini.it
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/misc/usb3503.c            | 22 +++++++++++++++++++++-
 include/linux/platform_data/usb3503.h |  1 +
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/misc/usb3503.c b/drivers/usb/misc/usb3503.c
index 3044db9fd8aa..c6cfd1edaf76 100644
--- a/drivers/usb/misc/usb3503.c
+++ b/drivers/usb/misc/usb3503.c
@@ -46,6 +46,7 @@ struct usb3503 {
 	struct device		*dev;
 	struct clk		*clk;
 	u8	port_off_mask;
+	struct gpio_desc	*bypass;
 	struct gpio_desc	*intn;
 	struct gpio_desc 	*reset;
 	struct gpio_desc 	*connect;
@@ -109,18 +110,25 @@ static int usb3503_connect(struct usb3503 *hub)
 static int usb3503_switch_mode(struct usb3503 *hub, enum usb3503_mode mode)
 {
 	struct device *dev = hub->dev;
-	int rst, conn;
+	int rst, bypass, conn;
 
 	switch (mode) {
 	case USB3503_MODE_HUB:
 		conn = 1;
 		rst = 0;
+		bypass = 0;
 		break;
 	case USB3503_MODE_STANDBY:
 		conn = 0;
 		rst = 1;
+		bypass = 1;
 		dev_info(dev, "switched to STANDBY mode\n");
 		break;
+	case USB3503_MODE_BYPASS:
+		conn = 0;
+		rst = 0;
+		bypass = 1;
+		break;
 	default:
 		dev_err(dev, "unknown mode is requested\n");
 		return -EINVAL;
@@ -132,6 +140,9 @@ static int usb3503_switch_mode(struct usb3503 *hub, enum usb3503_mode mode)
 	if (hub->reset)
 		gpiod_set_value_cansleep(hub->reset, rst);
 
+	if (hub->bypass)
+		gpiod_set_value_cansleep(hub->bypass, bypass);
+
 	if (conn) {
 		/* Wait T_HUBINIT == 4ms for hub logic to stabilize */
 		usleep_range(4000, 10000);
@@ -247,6 +258,14 @@ static int usb3503_probe(struct usb3503 *hub)
 	if (hub->connect)
 		gpiod_set_consumer_name(hub->connect, "usb3503 connect");
 
+	hub->bypass = devm_gpiod_get_optional(dev, "bypass", GPIOD_OUT_HIGH);
+	if (IS_ERR(hub->bypass)) {
+		err = PTR_ERR(hub->bypass);
+		goto err_clk;
+	}
+	if (hub->bypass)
+		gpiod_set_consumer_name(hub->bypass, "usb3503 bypass");
+
 	hub->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
 	if (IS_ERR(hub->reset)) {
 		err = PTR_ERR(hub->reset);
@@ -382,6 +401,7 @@ MODULE_DEVICE_TABLE(i2c, usb3503_id);
 static const struct of_device_id usb3503_of_match[] = {
 	{ .compatible = "smsc,usb3503", },
 	{ .compatible = "smsc,usb3503a", },
+	{ .compatible = "smsc,usb3803", },
 	{},
 };
 MODULE_DEVICE_TABLE(of, usb3503_of_match);
diff --git a/include/linux/platform_data/usb3503.h b/include/linux/platform_data/usb3503.h
index d01ef97ddf36..f3c942f396f8 100644
--- a/include/linux/platform_data/usb3503.h
+++ b/include/linux/platform_data/usb3503.h
@@ -12,6 +12,7 @@ enum usb3503_mode {
 	USB3503_MODE_UNKNOWN,
 	USB3503_MODE_HUB,
 	USB3503_MODE_STANDBY,
+	USB3503_MODE_BYPASS,
 };
 
 struct usb3503_platform_data {
-- 
cgit v1.2.3


From ad3d0ee8dca5fa4a656057e20f9605bbe7ff067c Mon Sep 17 00:00:00 2001
From: Orlando Chamberlain <orlandoch.dev@gmail.com>
Date: Thu, 9 Mar 2023 21:43:54 +1100
Subject: platform/x86: apple-gmux: Update apple_gmux_detect documentation

Commit fc83fbc80e1a ("platform/x86: apple-gmux: refactor gmux types")
neglected to update the documentation of apple_gmux_detect's arguments.
Update the documentation to reflect that commit's changes.

include/linux/apple-gmux.h:99: warning:
	Function parameter or member 'type_ret' not described in 'apple_gmux_detect'
include/linux/apple-gmux.h:99: warning:
	Excess function parameter 'indexed_ret' description in 'apple_gmux_detect'

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/lkml/20230309122822.77435e33@canb.auug.org.au/
Fixes: fc83fbc80e1a ("platform/x86: apple-gmux: refactor gmux types")
Signed-off-by: Orlando Chamberlain <orlandoch.dev@gmail.com>
Link: https://lore.kernel.org/r/20230309104353.10905-1-orlandoch.dev@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/apple-gmux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/apple-gmux.h b/include/linux/apple-gmux.h
index 272f63f8fd7c..a7a7d430024b 100644
--- a/include/linux/apple-gmux.h
+++ b/include/linux/apple-gmux.h
@@ -86,7 +86,7 @@ static inline bool apple_gmux_is_mmio(unsigned long iostart)
  * apple_gmux_detect() - detect if gmux is built into the machine
  *
  * @pnp_dev:     Device to probe or NULL to use the first matching device
- * @indexed_ret: Returns (by reference) if the gmux is indexed or not
+ * @type_ret: Returns (by reference) the apple_gmux_type of the device
  *
  * Detect if a supported gmux device is present by actually probing it.
  * This avoids the false positives returned on some models by
-- 
cgit v1.2.3


From 3608a2cd818a8d97bd3d4dc32f6c0afaf09b15ec Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 7 Mar 2023 13:05:40 +0100
Subject: backlight: apple_bl: Use acpi_video_get_backlight_type()

On some MacBooks both the apple_bl and the apple-gmux backlight drivers
may be able to export a /sys/class/backlight device.

To avoid having 2 backlight devices for one LCD panel until now
the apple-gmux driver has been calling apple_bl_unregister() to move
the apple_bl backlight device out of the way when it loads.

Similar problems exist on other x86 laptops and all backlight drivers
which may be used on x86 laptops have moved to using
acpi_video_get_backlight_type() to determine whether they should load
or not.

Switch apple_bl to this model too, so that it is consistent with all
the other x86 backlight drivers.

Besides code-simplification and consistency this has 2 other benefits:

1) It removes a race during boot where userspace will briefly see
   an apple_bl backlight and then have it disappear again, leading to e.g.:
   https://bbs.archlinux.org/viewtopic.php?id=269920

2) This allows user to switch between the drivers by passing
   acpi_backlight=apple_gmux or acpi_backlight=vendor on the kernel
   commandline.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230307120540.389920-1-hdegoede@redhat.com
---
 drivers/platform/x86/Kconfig       |  1 -
 drivers/platform/x86/apple-gmux.c  | 11 -----------
 drivers/video/backlight/Kconfig    |  1 +
 drivers/video/backlight/apple_bl.c | 31 ++++++++++---------------------
 include/linux/apple_bl.h           | 27 ---------------------------
 5 files changed, 11 insertions(+), 60 deletions(-)
 delete mode 100644 include/linux/apple_bl.h

(limited to 'include')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index ec7c2b4e1721..1b3189d6e8d2 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -213,7 +213,6 @@ config APPLE_GMUX
 	depends on ACPI && PCI
 	depends on PNP
 	depends on BACKLIGHT_CLASS_DEVICE
-	depends on BACKLIGHT_APPLE=n || BACKLIGHT_APPLE
 	help
 	  This driver provides support for the gmux device found on many
 	  Apple laptops, which controls the display mux for the hybrid
diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index 8aa81a3517b1..4c311e1dedad 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -16,7 +16,6 @@
 #include <linux/backlight.h>
 #include <linux/acpi.h>
 #include <linux/pnp.h>
-#include <linux/apple_bl.h>
 #include <linux/apple-gmux.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
@@ -883,14 +882,6 @@ get_version:
 		gmux_data->bdev = bdev;
 		bdev->props.brightness = gmux_get_brightness(bdev);
 		backlight_update_status(bdev);
-
-		/*
-		 * The backlight situation on Macs is complicated. If the gmux is
-		 * present it's the best choice, because it always works for
-		 * backlight control and supports more levels than other options.
-		 * Disable the other backlight choices.
-		 */
-		apple_bl_unregister();
 	}
 
 	gmux_data->power_state = VGA_SWITCHEROO_ON;
@@ -1007,8 +998,6 @@ static void gmux_remove(struct pnp_dev *pnp)
 		release_region(gmux_data->iostart, gmux_data->iolen);
 	apple_gmux_data = NULL;
 	kfree(gmux_data);
-
-	apple_bl_register();
 }
 
 static const struct pnp_device_id gmux_device_ids[] = {
diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
index 4c33e971c0f0..51387b1ef012 100644
--- a/drivers/video/backlight/Kconfig
+++ b/drivers/video/backlight/Kconfig
@@ -285,6 +285,7 @@ config BACKLIGHT_MT6370
 config BACKLIGHT_APPLE
 	tristate "Apple Backlight Driver"
 	depends on X86 && ACPI
+	depends on ACPI_VIDEO=n || ACPI_VIDEO
 	help
 	  If you have an Intel-based Apple say Y to enable a driver for its
 	  backlight.
diff --git a/drivers/video/backlight/apple_bl.c b/drivers/video/backlight/apple_bl.c
index e9e7acb577bf..aaa824437a2a 100644
--- a/drivers/video/backlight/apple_bl.c
+++ b/drivers/video/backlight/apple_bl.c
@@ -24,7 +24,7 @@
 #include <linux/pci.h>
 #include <linux/acpi.h>
 #include <linux/atomic.h>
-#include <linux/apple_bl.h>
+#include <acpi/video.h>
 
 static struct backlight_device *apple_backlight_device;
 
@@ -215,32 +215,21 @@ static struct acpi_driver apple_bl_driver = {
 	},
 };
 
-static atomic_t apple_bl_registered = ATOMIC_INIT(0);
-
-int apple_bl_register(void)
-{
-	if (atomic_xchg(&apple_bl_registered, 1) == 0)
-		return acpi_bus_register_driver(&apple_bl_driver);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(apple_bl_register);
-
-void apple_bl_unregister(void)
-{
-	if (atomic_xchg(&apple_bl_registered, 0) == 1)
-		acpi_bus_unregister_driver(&apple_bl_driver);
-}
-EXPORT_SYMBOL_GPL(apple_bl_unregister);
-
 static int __init apple_bl_init(void)
 {
-	return apple_bl_register();
+	/*
+	 * Use ACPI video detection code to see if this driver should register
+	 * or if another driver, e.g. the apple-gmux driver should be used.
+	 */
+	if (acpi_video_get_backlight_type() != acpi_backlight_vendor)
+		return -ENODEV;
+
+	return acpi_bus_register_driver(&apple_bl_driver);
 }
 
 static void __exit apple_bl_exit(void)
 {
-	apple_bl_unregister();
+	acpi_bus_unregister_driver(&apple_bl_driver);
 }
 
 module_init(apple_bl_init);
diff --git a/include/linux/apple_bl.h b/include/linux/apple_bl.h
deleted file mode 100644
index 445af2e3cc21..000000000000
--- a/include/linux/apple_bl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * apple_bl exported symbols
- */
-
-#ifndef _LINUX_APPLE_BL_H
-#define _LINUX_APPLE_BL_H
-
-#if defined(CONFIG_BACKLIGHT_APPLE) || defined(CONFIG_BACKLIGHT_APPLE_MODULE)
-
-extern int apple_bl_register(void);
-extern void apple_bl_unregister(void);
-
-#else /* !CONFIG_BACKLIGHT_APPLE */
-
-static inline int apple_bl_register(void)
-{
-	return 0;
-}
-
-static inline void apple_bl_unregister(void)
-{
-}
-
-#endif /* !CONFIG_BACKLIGHT_APPLE */
-
-#endif /* _LINUX_APPLE_BL_H */
-- 
cgit v1.2.3


From 027781f3920ad16f40133890fc87247b8baa2d8d Mon Sep 17 00:00:00 2001
From: Leonard Göhrs <l.goehrs@pengutronix.de>
Date: Fri, 10 Mar 2023 10:20:52 +0100
Subject: spi: core: add spi_split_transfers_maxwords
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add spi_split_transfers_maxwords() function that splits
spi_transfers transparently into multiple transfers
that are below a given number of SPI words.

This function reuses most of its code from
spi_split_transfers_maxsize() and for transfers with
eight or less bits per word actually behaves the same.

Signed-off-by: Leonard Göhrs <l.goehrs@pengutronix.de>
Link: https://lore.kernel.org/r/20230310092053.1006459-1-l.goehrs@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi.h |  4 ++++
 2 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index c725b4bab7af..9036d7a50674 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3622,6 +3622,55 @@ int spi_split_transfers_maxsize(struct spi_controller *ctlr,
 }
 EXPORT_SYMBOL_GPL(spi_split_transfers_maxsize);
 
+
+/**
+ * spi_split_transfers_maxwords - split spi transfers into multiple transfers
+ *                                when an individual transfer exceeds a
+ *                                certain number of SPI words
+ * @ctlr:     the @spi_controller for this transfer
+ * @msg:      the @spi_message to transform
+ * @maxwords: the number of words to limit each transfer to
+ * @gfp:      GFP allocation flags
+ *
+ * Return: status of transformation
+ */
+int spi_split_transfers_maxwords(struct spi_controller *ctlr,
+				 struct spi_message *msg,
+				 size_t maxwords,
+				 gfp_t gfp)
+{
+	struct spi_transfer *xfer;
+
+	/*
+	 * Iterate over the transfer_list,
+	 * but note that xfer is advanced to the last transfer inserted
+	 * to avoid checking sizes again unnecessarily (also xfer does
+	 * potentially belong to a different list by the time the
+	 * replacement has happened).
+	 */
+	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+		size_t maxsize;
+		int ret;
+
+		if (xfer->bits_per_word <= 8)
+			maxsize = maxwords;
+		else if (xfer->bits_per_word <= 16)
+			maxsize = 2 * maxwords;
+		else
+			maxsize = 4 * maxwords;
+
+		if (xfer->len > maxsize) {
+			ret = __spi_split_transfer_maxsize(ctlr, msg, &xfer,
+							   maxsize, gfp);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_split_transfers_maxwords);
+
 /*-------------------------------------------------------------------------*/
 
 /* Core methods for SPI controller protocol drivers.  Some of the
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 74bff5a2f53d..873ced6ae4ca 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1295,6 +1295,10 @@ extern int spi_split_transfers_maxsize(struct spi_controller *ctlr,
 				       struct spi_message *msg,
 				       size_t maxsize,
 				       gfp_t gfp);
+extern int spi_split_transfers_maxwords(struct spi_controller *ctlr,
+					struct spi_message *msg,
+					size_t maxwords,
+					gfp_t gfp);
 
 /*---------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3


From 4e04143c869c5b6d499fbd5083caa860d5c942c3 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Thu, 16 Mar 2023 14:07:51 +0100
Subject: fs_context: drop the unused lsm_flags member

This isn't ever used by VFS now, and it couldn't even work. Any FS that
uses the SECURITY_LSM_NATIVE_LABELS flag needs to also process the
value returned back from the LSM, so it needs to do its
security_sb_set_mnt_opts() call on its own anyway.

Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 Documentation/filesystems/mount_api.rst | 1 -
 fs/nfs/super.c                          | 3 ---
 include/linux/fs_context.h              | 1 -
 include/linux/security.h                | 2 +-
 4 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
index 63204d2094fd..9aaf6ef75eb5 100644
--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@@ -79,7 +79,6 @@ context.  This is represented by the fs_context structure::
 		unsigned int		sb_flags;
 		unsigned int		sb_flags_mask;
 		unsigned int		s_iflags;
-		unsigned int		lsm_flags;
 		enum fs_context_purpose	purpose:8;
 		...
 	};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 05ae23657527..397c096d874e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1274,9 +1274,6 @@ int nfs_get_tree_common(struct fs_context *fc)
 		if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS)
 			fc->sb_flags |= SB_SYNCHRONOUS;
 
-	if (server->caps & NFS_CAP_SECURITY_LABEL)
-		fc->lsm_flags |= SECURITY_LSM_NATIVE_LABELS;
-
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	fc->s_fs_info = server;
 	s = sget_fc(fc, compare_super, nfs_set_super);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 5469ffee21c7..ff6341e09925 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -104,7 +104,6 @@ struct fs_context {
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
 	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
 	unsigned int		s_iflags;	/* OR'd with sb->s_iflags */
-	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
 	enum fs_context_purpose	purpose:8;
 	enum fs_context_phase	phase:8;	/* The phase the context is in */
 	bool			need_free:1;	/* Need to call ops->free() */
diff --git a/include/linux/security.h b/include/linux/security.h
index 5984d0d550b4..db9b659b02b9 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -68,7 +68,7 @@ struct watch_notification;
 /* If capable is being called by a setid function */
 #define CAP_OPT_INSETID BIT(2)
 
-/* LSM Agnostic defines for fs_context::lsm_flags */
+/* LSM Agnostic defines for security_sb_set_mnt_opts() flags */
 #define SECURITY_LSM_NATIVE_LABELS	1
 
 struct ctl_table;
-- 
cgit v1.2.3


From d805456c712f93ba8a012430f2a93bec133b6ff4 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 7 Mar 2023 23:06:36 -0800
Subject: platform/x86: ISST: Enumerate TPMI SST and create framework

Enumerate TPMI SST driver and create basic framework to add more
features.

The basic user space interface is still same as the legacy using
/dev/isst_interface. Users of "intel-speed-select" utility should
be able to use same commands as prior gens without being aware
of new underlying hardware interface.

TPMI SST driver enumerates on device "intel_vsec.tpmi-sst". Since there
can be multiple instances and there is one common SST core, split
implementation into two parts: A common core part and an enumeration
part. The enumeration driver is loaded for each device instance and
register with the TPMI SST core driver.

On very first enumeration the TPMI SST core driver register with SST
core driver to get IOCTL callbacks. The api_version is incremented
for IOCTL ISST_IF_GET_PLATFORM_INFO, so that user space can issue
new IOCTLs.

Each TPMI package contains multiple power domains. Each power domain
has its own set of SST controls. For each domain map the MMIO memory
and update per domain struct tpmi_per_power_domain_info. This information
will be used to implement other SST interfaces.

Implement first IOCTL commands to get number of TPMI SST instances
and instance mask as some of the power domains may not have any
SST controls.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Pragya Tanwar <pragya.tanwar@intel.com>
Link: https://lore.kernel.org/r/20230308070642.1727167-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/intel/speed_select_if/Kconfig |   4 +
 .../platform/x86/intel/speed_select_if/Makefile    |   2 +
 .../platform/x86/intel/speed_select_if/isst_tpmi.c |  53 ++++
 .../x86/intel/speed_select_if/isst_tpmi_core.c     | 274 +++++++++++++++++++++
 .../x86/intel/speed_select_if/isst_tpmi_core.h     |  16 ++
 include/uapi/linux/isst_if.h                       |  18 ++
 6 files changed, 367 insertions(+)
 create mode 100644 drivers/platform/x86/intel/speed_select_if/isst_tpmi.c
 create mode 100644 drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
 create mode 100644 drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.h

(limited to 'include')

diff --git a/drivers/platform/x86/intel/speed_select_if/Kconfig b/drivers/platform/x86/intel/speed_select_if/Kconfig
index ce3e3dc076d2..4eb3ad299db0 100644
--- a/drivers/platform/x86/intel/speed_select_if/Kconfig
+++ b/drivers/platform/x86/intel/speed_select_if/Kconfig
@@ -2,8 +2,12 @@ menu "Intel Speed Select Technology interface support"
 	depends on PCI
 	depends on X86_64 || COMPILE_TEST
 
+config INTEL_SPEED_SELECT_TPMI
+	tristate
+
 config INTEL_SPEED_SELECT_INTERFACE
 	tristate "Intel(R) Speed Select Technology interface drivers"
+	select INTEL_SPEED_SELECT_TPMI if INTEL_TPMI
 	help
 	  This config enables the Intel(R) Speed Select Technology interface
 	  drivers. The Intel(R) speed select technology features are non
diff --git a/drivers/platform/x86/intel/speed_select_if/Makefile b/drivers/platform/x86/intel/speed_select_if/Makefile
index 856076206f35..1d878a36d0ab 100644
--- a/drivers/platform/x86/intel/speed_select_if/Makefile
+++ b/drivers/platform/x86/intel/speed_select_if/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mmio.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mbox_pci.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mbox_msr.o
+obj-$(CONFIG_INTEL_SPEED_SELECT_TPMI) += isst_tpmi_core.o
+obj-$(CONFIG_INTEL_SPEED_SELECT_TPMI) += isst_tpmi.o
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi.c
new file mode 100644
index 000000000000..7b4bdeefb8bc
--- /dev/null
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * isst_tpmi.c: SST TPMI interface
+ *
+ * Copyright (c) 2023, Intel Corporation.
+ * All Rights Reserved.
+ *
+ */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/module.h>
+#include <linux/intel_tpmi.h>
+
+#include "isst_tpmi_core.h"
+
+static int intel_sst_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id)
+{
+	int ret;
+
+	ret = tpmi_sst_init();
+	if (ret)
+		return ret;
+
+	ret = tpmi_sst_dev_add(auxdev);
+	if (ret)
+		tpmi_sst_exit();
+
+	return ret;
+}
+
+static void intel_sst_remove(struct auxiliary_device *auxdev)
+{
+	tpmi_sst_dev_remove(auxdev);
+	tpmi_sst_exit();
+}
+
+static const struct auxiliary_device_id intel_sst_id_table[] = {
+	{ .name = "intel_vsec.tpmi-sst" },
+	{}
+};
+MODULE_DEVICE_TABLE(auxiliary, intel_sst_id_table);
+
+static struct auxiliary_driver intel_sst_aux_driver = {
+	.id_table       = intel_sst_id_table,
+	.remove         = intel_sst_remove,
+	.probe          = intel_sst_probe,
+};
+
+module_auxiliary_driver(intel_sst_aux_driver);
+
+MODULE_IMPORT_NS(INTEL_TPMI_SST);
+MODULE_DESCRIPTION("Intel TPMI SST Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
new file mode 100644
index 000000000000..6b37016c0417
--- /dev/null
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * isst_tpmi.c: SST TPMI interface core
+ *
+ * Copyright (c) 2023, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This information will be useful to understand flows:
+ * In the current generation of platforms, TPMI is supported via OOB
+ * PCI device. This PCI device has one instance per CPU package.
+ * There is a unique TPMI ID for SST. Each TPMI ID also has multiple
+ * entries, representing per power domain information.
+ *
+ * There is one dev file for complete SST information and control same as the
+ * prior generation of hardware. User spaces don't need to know how the
+ * information is presented by the hardware. The TPMI core module implements
+ * the hardware mapping.
+ */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/intel_tpmi.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <uapi/linux/isst_if.h>
+
+#include "isst_tpmi_core.h"
+#include "isst_if_common.h"
+
+/**
+ * struct tpmi_per_power_domain_info -	Store per power_domain SST info
+ * @package_id:		Package id for this power_domain
+ * @power_domain_id:	Power domain id, Each entry from the SST-TPMI instance is a power_domain.
+ * @sst_base:		Mapped SST base IO memory
+ * @auxdev:		Auxiliary device instance enumerated this instance
+ *
+ * This structure is used store complete SST information for a power_domain. This information
+ * is used to read/write request for any SST IOCTL. Each physical CPU package can have multiple
+ * power_domains. Each power domain describes its own SST information and has its own controls.
+ */
+struct tpmi_per_power_domain_info {
+	int package_id;
+	int power_domain_id;
+	void __iomem *sst_base;
+	struct auxiliary_device *auxdev;
+};
+
+/**
+ * struct tpmi_sst_struct -	Store sst info for a package
+ * @package_id:			Package id for this aux device instance
+ * @number_of_power_domains:	Number of power_domains pointed by power_domain_info pointer
+ * @power_domain_info:		Pointer to power domains information
+ *
+ * This structure is used store full SST information for a package.
+ * Each package has a unique OOB PCI device, which enumerates TPMI.
+ * Each Package will have multiple power_domains.
+ */
+struct tpmi_sst_struct {
+	int package_id;
+	int number_of_power_domains;
+	struct tpmi_per_power_domain_info *power_domain_info;
+};
+
+/**
+ * struct tpmi_sst_common_struct -	Store all SST instances
+ * @max_index:		Maximum instances currently present
+ * @sst_inst:		Pointer to per package instance
+ *
+ * Stores every SST Package instance.
+ */
+struct tpmi_sst_common_struct {
+	int max_index;
+	struct tpmi_sst_struct **sst_inst;
+};
+
+/*
+ * Each IOCTL request is processed under this lock. Also used to protect
+ * registration functions and common data structures.
+ */
+static DEFINE_MUTEX(isst_tpmi_dev_lock);
+
+/* Usage count to track, number of TPMI SST instances registered to this core. */
+static int isst_core_usage_count;
+
+/* Stores complete SST information for every package and power_domain */
+static struct tpmi_sst_common_struct isst_common;
+
+static int isst_if_get_tpmi_instance_count(void __user *argp)
+{
+	struct isst_tpmi_instance_count tpmi_inst;
+	struct tpmi_sst_struct *sst_inst;
+	int i;
+
+	if (copy_from_user(&tpmi_inst, argp, sizeof(tpmi_inst)))
+		return -EFAULT;
+
+	if (tpmi_inst.socket_id >= topology_max_packages())
+		return -EINVAL;
+
+	tpmi_inst.count = isst_common.sst_inst[tpmi_inst.socket_id]->number_of_power_domains;
+
+	sst_inst = isst_common.sst_inst[tpmi_inst.socket_id];
+	tpmi_inst.valid_mask = 0;
+	for (i = 0; i < sst_inst->number_of_power_domains; ++i) {
+		struct tpmi_per_power_domain_info *power_domain_info;
+
+		power_domain_info = &sst_inst->power_domain_info[i];
+		if (power_domain_info->sst_base)
+			tpmi_inst.valid_mask |= BIT(i);
+	}
+
+	if (copy_to_user(argp, &tpmi_inst, sizeof(tpmi_inst)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long ret = -ENOTTY;
+
+	mutex_lock(&isst_tpmi_dev_lock);
+	switch (cmd) {
+	case ISST_IF_COUNT_TPMI_INSTANCES:
+		ret = isst_if_get_tpmi_instance_count(argp);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&isst_tpmi_dev_lock);
+
+	return ret;
+}
+
+int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
+{
+	struct intel_tpmi_plat_info *plat_info;
+	struct tpmi_sst_struct *tpmi_sst;
+	int i, pkg = 0, inst = 0;
+	int num_resources;
+
+	plat_info = tpmi_get_platform_data(auxdev);
+	if (!plat_info) {
+		dev_err(&auxdev->dev, "No platform info\n");
+		return -EINVAL;
+	}
+
+	pkg = plat_info->package_id;
+	if (pkg >= topology_max_packages()) {
+		dev_err(&auxdev->dev, "Invalid package id :%x\n", pkg);
+		return -EINVAL;
+	}
+
+	if (isst_common.sst_inst[pkg])
+		return -EEXIST;
+
+	num_resources = tpmi_get_resource_count(auxdev);
+
+	if (!num_resources)
+		return -EINVAL;
+
+	tpmi_sst = devm_kzalloc(&auxdev->dev, sizeof(*tpmi_sst), GFP_KERNEL);
+	if (!tpmi_sst)
+		return -ENOMEM;
+
+	tpmi_sst->power_domain_info = devm_kcalloc(&auxdev->dev, num_resources,
+						   sizeof(*tpmi_sst->power_domain_info),
+						   GFP_KERNEL);
+	if (!tpmi_sst->power_domain_info)
+		return -ENOMEM;
+
+	tpmi_sst->number_of_power_domains = num_resources;
+
+	for (i = 0; i < num_resources; ++i) {
+		struct resource *res;
+
+		res = tpmi_get_resource_at_index(auxdev, i);
+		if (!res) {
+			tpmi_sst->power_domain_info[i].sst_base = NULL;
+			continue;
+		}
+
+		tpmi_sst->power_domain_info[i].package_id = pkg;
+		tpmi_sst->power_domain_info[i].power_domain_id = i;
+		tpmi_sst->power_domain_info[i].auxdev = auxdev;
+		tpmi_sst->power_domain_info[i].sst_base = devm_ioremap_resource(&auxdev->dev, res);
+		if (IS_ERR(tpmi_sst->power_domain_info[i].sst_base))
+			return PTR_ERR(tpmi_sst->power_domain_info[i].sst_base);
+
+		++inst;
+	}
+
+	if (!inst)
+		return -ENODEV;
+
+	tpmi_sst->package_id = pkg;
+	auxiliary_set_drvdata(auxdev, tpmi_sst);
+
+	mutex_lock(&isst_tpmi_dev_lock);
+	if (isst_common.max_index < pkg)
+		isst_common.max_index = pkg;
+	isst_common.sst_inst[pkg] = tpmi_sst;
+	mutex_unlock(&isst_tpmi_dev_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_add, INTEL_TPMI_SST);
+
+void tpmi_sst_dev_remove(struct auxiliary_device *auxdev)
+{
+	struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
+
+	mutex_lock(&isst_tpmi_dev_lock);
+	isst_common.sst_inst[tpmi_sst->package_id] = NULL;
+	mutex_unlock(&isst_tpmi_dev_lock);
+}
+EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_remove, INTEL_TPMI_SST);
+
+#define ISST_TPMI_API_VERSION	0x02
+
+int tpmi_sst_init(void)
+{
+	struct isst_if_cmd_cb cb;
+	int ret = 0;
+
+	mutex_lock(&isst_tpmi_dev_lock);
+
+	if (isst_core_usage_count) {
+		++isst_core_usage_count;
+		goto init_done;
+	}
+
+	isst_common.sst_inst = kcalloc(topology_max_packages(),
+				       sizeof(*isst_common.sst_inst),
+				       GFP_KERNEL);
+	if (!isst_common.sst_inst)
+		return -ENOMEM;
+
+	memset(&cb, 0, sizeof(cb));
+	cb.cmd_size = sizeof(struct isst_if_io_reg);
+	cb.offset = offsetof(struct isst_if_io_regs, io_reg);
+	cb.cmd_callback = NULL;
+	cb.api_version = ISST_TPMI_API_VERSION;
+	cb.def_ioctl = isst_if_def_ioctl;
+	cb.owner = THIS_MODULE;
+	ret = isst_if_cdev_register(ISST_IF_DEV_TPMI, &cb);
+	if (ret)
+		kfree(isst_common.sst_inst);
+init_done:
+	mutex_unlock(&isst_tpmi_dev_lock);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(tpmi_sst_init, INTEL_TPMI_SST);
+
+void tpmi_sst_exit(void)
+{
+	mutex_lock(&isst_tpmi_dev_lock);
+	if (isst_core_usage_count)
+		--isst_core_usage_count;
+
+	if (!isst_core_usage_count) {
+		isst_if_cdev_unregister(ISST_IF_DEV_TPMI);
+		kfree(isst_common.sst_inst);
+	}
+	mutex_unlock(&isst_tpmi_dev_lock);
+}
+EXPORT_SYMBOL_NS_GPL(tpmi_sst_exit, INTEL_TPMI_SST);
+
+MODULE_IMPORT_NS(INTEL_TPMI);
+MODULE_IMPORT_NS(INTEL_TPMI_POWER_DOMAIN);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.h b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.h
new file mode 100644
index 000000000000..356cb02273b1
--- /dev/null
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel Speed Select Interface: Drivers Internal defines
+ * Copyright (c) 2023, Intel Corporation.
+ * All rights reserved.
+ *
+ */
+
+#ifndef _ISST_TPMI_CORE_H
+#define _ISST_TPMI_CORE_H
+
+int tpmi_sst_init(void);
+void tpmi_sst_exit(void);
+int tpmi_sst_dev_add(struct auxiliary_device *auxdev);
+void tpmi_sst_dev_remove(struct auxiliary_device *auxdev);
+#endif
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index ba078f8e9add..bf32d959f6e8 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -163,10 +163,28 @@ struct isst_if_msr_cmds {
 	struct isst_if_msr_cmd msr_cmd[1];
 };
 
+/**
+ * struct isst_tpmi_instance_count - Get number of TPMI instances per socket
+ * @socket_id:	Socket/package id
+ * @count:	Number of instances
+ * @valid_mask: Mask of instances as there can be holes
+ *
+ * Structure used to get TPMI instances information using
+ * IOCTL ISST_IF_COUNT_TPMI_INSTANCES.
+ */
+struct isst_tpmi_instance_count {
+	__u8 socket_id;
+	__u8 count;
+	__u16 valid_mask;
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
 #define ISST_IF_IO_CMD		_IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *)
 #define ISST_IF_MBOX_COMMAND	_IOWR(ISST_IF_MAGIC, 3, struct isst_if_mbox_cmds *)
 #define ISST_IF_MSR_COMMAND	_IOWR(ISST_IF_MAGIC, 4, struct isst_if_msr_cmds *)
+
+#define ISST_IF_COUNT_TPMI_INSTANCES	_IOR(ISST_IF_MAGIC, 5, struct isst_tpmi_instance_count *)
+
 #endif
-- 
cgit v1.2.3


From 12a7d2cb811dd8a884dea088a2701fcb8d00136e Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 7 Mar 2023 23:06:38 -0800
Subject: platform/x86: ISST: Add SST-CP support via TPMI

Intel Speed Select Technology Core Power (SST-CP) is an interface that
allows users to define per core priority. This defines a mechanism to
distribute power among cores when there is a power constrained
scenario. This defines a class of service (CLOS) configuration.

Three new IOCTLs are added:
ISST_IF_CORE_POWER_STATE : Enable/Disable SST-CP
ISST_IF_CLOS_PARAM : Configure CLOS parameters
ISST_IF_CLOS_ASSOC : Associate CPUs to a CLOS

To associate CPUs to CLOS, either Linux CPU numbering or PUNIT numbering
scheme can be used, using parameter punit_cpu_map (1: for PUNIT numbering
0 for Linux CPU number).

There is no change to IOCTL to get PUNIT CPU number for a CPU.

Introduce get_instance() function, which is used by majority of IOCTLs
processing to convert a socket and power domain to
tpmi_per_power_domain_info * instance. This instance has all the MMIO
offsets stored to read a particular field.

Once an instance is identified, read or write from correct MMIO
offset for a given field as defined in the specification.

For details on SST CP operations using intel-speed-selet utility,
refer to:
Documentation/admin-guide/pm/intel-speed-select.rst
under the kernel documentation

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Pragya Tanwar <pragya.tanwar@intel.com>
Link: https://lore.kernel.org/r/20230308070642.1727167-5-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../x86/intel/speed_select_if/isst_tpmi_core.c     | 264 +++++++++++++++++++++
 include/uapi/linux/isst_if.h                       |  79 ++++++
 2 files changed, 343 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
index 3453708c2dd0..bc1c1f26fbf9 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -31,6 +31,18 @@
 /* Supported SST hardware version by this driver */
 #define ISST_HEADER_VERSION		1
 
+/*
+ * Used to indicate if value read from MMIO needs to get multiplied
+ * to get to a standard unit or not.
+ */
+#define SST_MUL_FACTOR_NONE    1
+
+/* Define 100 as a scaling factor frequency ratio to frequency conversion */
+#define SST_MUL_FACTOR_FREQ    100
+
+/* All SST regs are 64 bit size */
+#define SST_REG_SIZE   8
+
 /**
  * struct sst_header -	SST main header
  * @interface_version:	Version number for this interface
@@ -359,6 +371,249 @@ static int sst_main(struct auxiliary_device *auxdev, struct tpmi_per_power_domai
 	return 0;
 }
 
+/*
+ * Map a package and power_domain id to SST information structure unique for a power_domain.
+ * The caller should call under isst_tpmi_dev_lock.
+ */
+static struct tpmi_per_power_domain_info *get_instance(int pkg_id, int power_domain_id)
+{
+	struct tpmi_per_power_domain_info *power_domain_info;
+	struct tpmi_sst_struct *sst_inst;
+
+	if (pkg_id < 0 || pkg_id > isst_common.max_index ||
+	    pkg_id >= topology_max_packages())
+		return NULL;
+
+	sst_inst = isst_common.sst_inst[pkg_id];
+	if (!sst_inst)
+		return NULL;
+
+	if (power_domain_id < 0 || power_domain_id >= sst_inst->number_of_power_domains)
+		return NULL;
+
+	power_domain_info = &sst_inst->power_domain_info[power_domain_id];
+
+	if (power_domain_info && !power_domain_info->sst_base)
+		return NULL;
+
+	return power_domain_info;
+}
+
+static bool disable_dynamic_sst_features(void)
+{
+	u64 value;
+
+	rdmsrl(MSR_PM_ENABLE, value);
+	return !(value & 0x1);
+}
+
+#define _read_cp_info(name_str, name, offset, start, width, mult_factor)\
+{\
+	u64 val, mask;\
+	\
+	val = readq(power_domain_info->sst_base + power_domain_info->sst_header.cp_offset +\
+			(offset));\
+	mask = GENMASK_ULL((start + width - 1), start);\
+	val &= mask; \
+	val >>= start;\
+	name = (val * mult_factor);\
+}
+
+#define _write_cp_info(name_str, name, offset, start, width, div_factor)\
+{\
+	u64 val, mask;\
+	\
+	val = readq(power_domain_info->sst_base +\
+		    power_domain_info->sst_header.cp_offset + (offset));\
+	mask = GENMASK_ULL((start + width - 1), start);\
+	val &= ~mask;\
+	val |= (name / div_factor) << start;\
+	writeq(val, power_domain_info->sst_base + power_domain_info->sst_header.cp_offset +\
+		(offset));\
+}
+
+#define	SST_CP_CONTROL_OFFSET	8
+#define	SST_CP_STATUS_OFFSET	16
+
+#define SST_CP_ENABLE_START		0
+#define SST_CP_ENABLE_WIDTH		1
+
+#define SST_CP_PRIORITY_TYPE_START	1
+#define SST_CP_PRIORITY_TYPE_WIDTH	1
+
+static long isst_if_core_power_state(void __user *argp)
+{
+	struct tpmi_per_power_domain_info *power_domain_info;
+	struct isst_core_power core_power;
+
+	if (disable_dynamic_sst_features())
+		return -EFAULT;
+
+	if (copy_from_user(&core_power, argp, sizeof(core_power)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(core_power.socket_id, core_power.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	if (core_power.get_set) {
+		_write_cp_info("cp_enable", core_power.enable, SST_CP_CONTROL_OFFSET,
+			       SST_CP_ENABLE_START, SST_CP_ENABLE_WIDTH, SST_MUL_FACTOR_NONE)
+		_write_cp_info("cp_prio_type", core_power.priority_type, SST_CP_CONTROL_OFFSET,
+			       SST_CP_PRIORITY_TYPE_START, SST_CP_PRIORITY_TYPE_WIDTH,
+			       SST_MUL_FACTOR_NONE)
+	} else {
+		/* get */
+		_read_cp_info("cp_enable", core_power.enable, SST_CP_STATUS_OFFSET,
+			      SST_CP_ENABLE_START, SST_CP_ENABLE_WIDTH, SST_MUL_FACTOR_NONE)
+		_read_cp_info("cp_prio_type", core_power.priority_type, SST_CP_STATUS_OFFSET,
+			      SST_CP_PRIORITY_TYPE_START, SST_CP_PRIORITY_TYPE_WIDTH,
+			      SST_MUL_FACTOR_NONE)
+		core_power.supported = !!(power_domain_info->sst_header.cap_mask & BIT(0));
+		if (copy_to_user(argp, &core_power, sizeof(core_power)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+#define SST_CLOS_CONFIG_0_OFFSET	24
+
+#define SST_CLOS_CONFIG_PRIO_START	4
+#define SST_CLOS_CONFIG_PRIO_WIDTH	4
+
+#define SST_CLOS_CONFIG_MIN_START	8
+#define SST_CLOS_CONFIG_MIN_WIDTH	8
+
+#define SST_CLOS_CONFIG_MAX_START	16
+#define SST_CLOS_CONFIG_MAX_WIDTH	8
+
+static long isst_if_clos_param(void __user *argp)
+{
+	struct tpmi_per_power_domain_info *power_domain_info;
+	struct isst_clos_param clos_param;
+
+	if (copy_from_user(&clos_param, argp, sizeof(clos_param)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(clos_param.socket_id, clos_param.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	if (clos_param.get_set) {
+		_write_cp_info("clos.min_freq", clos_param.min_freq_mhz,
+			       (SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+			       SST_CLOS_CONFIG_MIN_START, SST_CLOS_CONFIG_MIN_WIDTH,
+			       SST_MUL_FACTOR_FREQ);
+		_write_cp_info("clos.max_freq", clos_param.max_freq_mhz,
+			       (SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+			       SST_CLOS_CONFIG_MAX_START, SST_CLOS_CONFIG_MAX_WIDTH,
+			       SST_MUL_FACTOR_FREQ);
+		_write_cp_info("clos.prio", clos_param.prop_prio,
+			       (SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+			       SST_CLOS_CONFIG_PRIO_START, SST_CLOS_CONFIG_PRIO_WIDTH,
+			       SST_MUL_FACTOR_NONE);
+	} else {
+		/* get */
+		_read_cp_info("clos.min_freq", clos_param.min_freq_mhz,
+				(SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+				SST_CLOS_CONFIG_MIN_START, SST_CLOS_CONFIG_MIN_WIDTH,
+				SST_MUL_FACTOR_FREQ)
+		_read_cp_info("clos.max_freq", clos_param.max_freq_mhz,
+				(SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+				SST_CLOS_CONFIG_MAX_START, SST_CLOS_CONFIG_MAX_WIDTH,
+				SST_MUL_FACTOR_FREQ)
+		_read_cp_info("clos.prio", clos_param.prop_prio,
+				(SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE),
+				SST_CLOS_CONFIG_PRIO_START, SST_CLOS_CONFIG_PRIO_WIDTH,
+				SST_MUL_FACTOR_NONE)
+
+		if (copy_to_user(argp, &clos_param, sizeof(clos_param)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+#define SST_CLOS_ASSOC_0_OFFSET		56
+#define SST_CLOS_ASSOC_CPUS_PER_REG	16
+#define SST_CLOS_ASSOC_BITS_PER_CPU	4
+
+static long isst_if_clos_assoc(void __user *argp)
+{
+	struct isst_if_clos_assoc_cmds assoc_cmds;
+	unsigned char __user *ptr;
+	int i;
+
+	/* Each multi command has u16 command count as the first field */
+	if (copy_from_user(&assoc_cmds, argp, sizeof(assoc_cmds)))
+		return -EFAULT;
+
+	if (!assoc_cmds.cmd_count || assoc_cmds.cmd_count > ISST_IF_CMD_LIMIT)
+		return -EINVAL;
+
+	ptr = argp + offsetof(struct isst_if_clos_assoc_cmds, assoc_info);
+	for (i = 0; i < assoc_cmds.cmd_count; ++i) {
+		struct tpmi_per_power_domain_info *power_domain_info;
+		struct isst_if_clos_assoc clos_assoc;
+		int punit_id, punit_cpu_no, pkg_id;
+		struct tpmi_sst_struct *sst_inst;
+		int offset, shift, cpu;
+		u64 val, mask, clos;
+
+		if (copy_from_user(&clos_assoc, ptr, sizeof(clos_assoc)))
+			return -EFAULT;
+
+		if (clos_assoc.socket_id > topology_max_packages())
+			return -EINVAL;
+
+		cpu = clos_assoc.logical_cpu;
+		clos = clos_assoc.clos;
+
+		if (assoc_cmds.punit_cpu_map)
+			punit_cpu_no = cpu;
+		else
+			return -EOPNOTSUPP;
+
+		if (punit_cpu_no < 0)
+			return -EINVAL;
+
+		punit_id = clos_assoc.power_domain_id;
+		pkg_id = clos_assoc.socket_id;
+
+		sst_inst = isst_common.sst_inst[pkg_id];
+
+		if (clos_assoc.power_domain_id > sst_inst->number_of_power_domains)
+			return -EINVAL;
+
+		power_domain_info = &sst_inst->power_domain_info[punit_id];
+
+		offset = SST_CLOS_ASSOC_0_OFFSET +
+				(punit_cpu_no / SST_CLOS_ASSOC_CPUS_PER_REG) * SST_REG_SIZE;
+		shift = punit_cpu_no % SST_CLOS_ASSOC_CPUS_PER_REG;
+		shift *= SST_CLOS_ASSOC_BITS_PER_CPU;
+
+		val = readq(power_domain_info->sst_base +
+				power_domain_info->sst_header.cp_offset + offset);
+		if (assoc_cmds.get_set) {
+			mask = GENMASK_ULL((shift + SST_CLOS_ASSOC_BITS_PER_CPU - 1), shift);
+			val &= ~mask;
+			val |= (clos << shift);
+			writeq(val, power_domain_info->sst_base +
+					power_domain_info->sst_header.cp_offset + offset);
+		} else {
+			val >>= shift;
+			clos_assoc.clos = val & GENMASK(SST_CLOS_ASSOC_BITS_PER_CPU - 1, 0);
+			if (copy_to_user(ptr, &clos_assoc, sizeof(clos_assoc)))
+				return -EFAULT;
+		}
+
+		ptr += sizeof(clos_assoc);
+	}
+
+	return 0;
+}
+
 static int isst_if_get_tpmi_instance_count(void __user *argp)
 {
 	struct isst_tpmi_instance_count tpmi_inst;
@@ -400,6 +655,15 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 	case ISST_IF_COUNT_TPMI_INSTANCES:
 		ret = isst_if_get_tpmi_instance_count(argp);
 		break;
+	case ISST_IF_CORE_POWER_STATE:
+		ret = isst_if_core_power_state(argp);
+		break;
+	case ISST_IF_CLOS_PARAM:
+		ret = isst_if_clos_param(argp);
+		break;
+	case ISST_IF_CLOS_ASSOC:
+		ret = isst_if_clos_assoc(argp);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index bf32d959f6e8..32687d8023ef 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -163,6 +163,82 @@ struct isst_if_msr_cmds {
 	struct isst_if_msr_cmd msr_cmd[1];
 };
 
+/**
+ * struct isst_core_power - Structure to get/set core_power feature
+ * @get_set:	0: Get, 1: Set
+ * @socket_id:	Socket/package id
+ * @power_domain: Power Domain id
+ * @enable:	Feature enable status
+ * @priority_type: Priority type for the feature (ordered/proportional)
+ *
+ * Structure to get/set core_power feature state using IOCTL
+ * ISST_IF_CORE_POWER_STATE.
+ */
+struct isst_core_power {
+	__u8 get_set;
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 enable;
+	__u8 supported;
+	__u8 priority_type;
+};
+
+/**
+ * struct isst_clos_param - Structure to get/set clos praram
+ * @get_set:	0: Get, 1: Set
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * clos:	Clos ID for the parameters
+ * min_freq_mhz: Minimum frequency in MHz
+ * max_freq_mhz: Maximum frequency in MHz
+ * prop_prio:	Proportional priority from 0-15
+ *
+ * Structure to get/set per clos property using IOCTL
+ * ISST_IF_CLOS_PARAM.
+ */
+struct isst_clos_param {
+	__u8 get_set;
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 clos;
+	__u16 min_freq_mhz;
+	__u16 max_freq_mhz;
+	__u8 prop_prio;
+};
+
+/**
+ * struct isst_if_clos_assoc - Structure to assign clos to a CPU
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @logical_cpu: CPU number
+ * @clos:	Clos ID to assign to the logical CPU
+ *
+ * Structure to get/set core_power feature.
+ */
+struct isst_if_clos_assoc {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u16 logical_cpu;
+	__u16 clos;
+};
+
+/**
+ * struct isst_if_clos_assoc_cmds - Structure to assign clos to CPUs
+ * @cmd_count:	Number of cmds (cpus) in this request
+ * @get_set:	Request is for get or set
+ * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
+ *		   Linux CPU number
+ *
+ * Structure used to get/set associate CPUs to clos using IOCTL
+ * ISST_IF_CLOS_ASSOC.
+ */
+struct isst_if_clos_assoc_cmds {
+	__u16 cmd_count;
+	__u16 get_set;
+	__u16 punit_cpu_map;
+	struct isst_if_clos_assoc assoc_info[1];
+};
+
 /**
  * struct isst_tpmi_instance_count - Get number of TPMI instances per socket
  * @socket_id:	Socket/package id
@@ -186,5 +262,8 @@ struct isst_tpmi_instance_count {
 #define ISST_IF_MSR_COMMAND	_IOWR(ISST_IF_MAGIC, 4, struct isst_if_msr_cmds *)
 
 #define ISST_IF_COUNT_TPMI_INSTANCES	_IOR(ISST_IF_MAGIC, 5, struct isst_tpmi_instance_count *)
+#define ISST_IF_CORE_POWER_STATE _IOWR(ISST_IF_MAGIC, 6, struct isst_core_power *)
+#define ISST_IF_CLOS_PARAM	_IOWR(ISST_IF_MAGIC, 7, struct isst_clos_param *)
+#define ISST_IF_CLOS_ASSOC	_IOWR(ISST_IF_MAGIC, 8, struct isst_if_clos_assoc_cmds *)
 
 #endif
-- 
cgit v1.2.3


From ea009e4769fa3bd05d4c111c3b6865eb3a9be829 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 7 Mar 2023 23:06:39 -0800
Subject: platform/x86: ISST: Add SST-PP support via TPMI

This Intel Speed Select Technology - Performance Profile (SST-PP) feature
introduces a mechanism that allows multiple optimized performance profiles
per system. Each profile defines a set of CPUs that need to be online and
rest offline to sustain a guaranteed base frequency.

Five new IOCTLs are added:
ISST_IF_PERF_LEVELS : Get number of performance levels
ISST_IF_PERF_SET_LEVEL : Set to a new performance level
ISST_IF_PERF_SET_FEATURE : Activate SST-BF/SST-TF for a performance level
ISST_IF_GET_PERF_LEVEL_INFO : Get parameters for a performance level
ISST_IF_GET_PERF_LEVEL_CPU_MASK : Get CPU mask for a performance level

Once an instance is identified, read or write from correct MMIO
offset for a given field as defined in the specification.

For details on SST PP operations using intel-speed-selet utility,
refer to:
Documentation/admin-guide/pm/intel-speed-select.rst
under the kernel documentation

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Pragya Tanwar <pragya.tanwar@intel.com>
Link: https://lore.kernel.org/r/20230308070642.1727167-6-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../x86/intel/speed_select_if/isst_tpmi_core.c     | 417 ++++++++++++++++++++-
 include/uapi/linux/isst_if.h                       | 180 +++++++++
 2 files changed, 596 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
index bc1c1f26fbf9..c9b1321dfd1b 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/auxiliary_bus.h>
+#include <linux/delay.h>
 #include <linux/intel_tpmi.h>
 #include <linux/fs.h>
 #include <linux/io.h>
@@ -325,7 +326,7 @@ static int sst_add_perf_profiles(struct auxiliary_device *auxdev,
 	for (i = 0; i < levels; ++i) {
 		u64 offset;
 
-		offset = perf_level_offsets & (0xff << (i * SST_PP_OFFSET_SIZE));
+		offset = perf_level_offsets & (0xffULL << (i * SST_PP_OFFSET_SIZE));
 		offset >>= (i * 8);
 		offset &= 0xff;
 		offset *= 8; /* Convert to byte from QWORD offset */
@@ -614,6 +615,405 @@ static long isst_if_clos_assoc(void __user *argp)
 	return 0;
 }
 
+#define _read_pp_info(name_str, name, offset, start, width, mult_factor)\
+{\
+	u64 val, _mask;\
+	\
+	val = readq(power_domain_info->sst_base + power_domain_info->sst_header.pp_offset +\
+		    (offset));\
+	_mask = GENMASK_ULL((start + width - 1), start);\
+	val &= _mask;\
+	val >>= start;\
+	name = (val * mult_factor);\
+}
+
+#define _write_pp_info(name_str, name, offset, start, width, div_factor)\
+{\
+	u64 val, _mask;\
+	\
+	val = readq(power_domain_info->sst_base + power_domain_info->sst_header.pp_offset +\
+		    (offset));\
+	_mask = GENMASK((start + width - 1), start);\
+	val &= ~_mask;\
+	val |= (name / div_factor) << start;\
+	writeq(val, power_domain_info->sst_base + power_domain_info->sst_header.pp_offset +\
+	      (offset));\
+}
+
+#define _read_bf_level_info(name_str, name, level, offset, start, width, mult_factor)\
+{\
+	u64 val, _mask;\
+	\
+	val = readq(power_domain_info->sst_base +\
+		    power_domain_info->perf_levels[level].mmio_offset +\
+		(power_domain_info->feature_offsets.bf_offset * 8) + (offset));\
+	_mask = GENMASK_ULL((start + width - 1), start);\
+	val &= _mask; \
+	val >>= start;\
+	name = (val * mult_factor);\
+}
+
+#define _read_tf_level_info(name_str, name, level, offset, start, width, mult_factor)\
+{\
+	u64 val, _mask;\
+	\
+	val = readq(power_domain_info->sst_base +\
+		    power_domain_info->perf_levels[level].mmio_offset +\
+		(power_domain_info->feature_offsets.tf_offset * 8) + (offset));\
+	_mask = GENMASK_ULL((start + width - 1), start);\
+	val &= _mask; \
+	val >>= start;\
+	name = (val * mult_factor);\
+}
+
+#define SST_PP_STATUS_OFFSET	32
+
+#define SST_PP_LEVEL_START	0
+#define SST_PP_LEVEL_WIDTH	3
+
+#define SST_PP_LOCK_START	3
+#define SST_PP_LOCK_WIDTH	1
+
+#define SST_PP_FEATURE_STATE_START	8
+#define SST_PP_FEATURE_STATE_WIDTH	8
+
+#define SST_BF_FEATURE_SUPPORTED_START	12
+#define SST_BF_FEATURE_SUPPORTED_WIDTH	1
+
+#define SST_TF_FEATURE_SUPPORTED_START	12
+#define SST_TF_FEATURE_SUPPORTED_WIDTH	1
+
+static int isst_if_get_perf_level(void __user *argp)
+{
+	struct isst_perf_level_info perf_level;
+	struct tpmi_per_power_domain_info *power_domain_info;
+
+	if (copy_from_user(&perf_level, argp, sizeof(perf_level)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(perf_level.socket_id, perf_level.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	perf_level.max_level = power_domain_info->max_level;
+	perf_level.level_mask = power_domain_info->pp_header.allowed_level_mask;
+	perf_level.feature_rev = power_domain_info->pp_header.feature_rev;
+	_read_pp_info("current_level", perf_level.current_level, SST_PP_STATUS_OFFSET,
+		      SST_PP_LEVEL_START, SST_PP_LEVEL_WIDTH, SST_MUL_FACTOR_NONE)
+	_read_pp_info("locked", perf_level.locked, SST_PP_STATUS_OFFSET,
+		      SST_PP_LOCK_START, SST_PP_LEVEL_WIDTH, SST_MUL_FACTOR_NONE)
+	_read_pp_info("feature_state", perf_level.feature_state, SST_PP_STATUS_OFFSET,
+		      SST_PP_FEATURE_STATE_START, SST_PP_FEATURE_STATE_WIDTH, SST_MUL_FACTOR_NONE)
+	perf_level.enabled = !!(power_domain_info->sst_header.cap_mask & BIT(1));
+
+	_read_bf_level_info("bf_support", perf_level.sst_bf_support, 0, 0,
+			    SST_BF_FEATURE_SUPPORTED_START, SST_BF_FEATURE_SUPPORTED_WIDTH,
+			    SST_MUL_FACTOR_NONE);
+	_read_tf_level_info("tf_support", perf_level.sst_tf_support, 0, 0,
+			    SST_TF_FEATURE_SUPPORTED_START, SST_TF_FEATURE_SUPPORTED_WIDTH,
+			    SST_MUL_FACTOR_NONE);
+
+	if (copy_to_user(argp, &perf_level, sizeof(perf_level)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define SST_PP_CONTROL_OFFSET		24
+#define SST_PP_LEVEL_CHANGE_TIME_MS	5
+#define SST_PP_LEVEL_CHANGE_RETRY_COUNT	3
+
+static int isst_if_set_perf_level(void __user *argp)
+{
+	struct isst_perf_level_control perf_level;
+	struct tpmi_per_power_domain_info *power_domain_info;
+	int level, retry = 0;
+
+	if (disable_dynamic_sst_features())
+		return -EFAULT;
+
+	if (copy_from_user(&perf_level, argp, sizeof(perf_level)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(perf_level.socket_id, perf_level.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	if (!(power_domain_info->pp_header.allowed_level_mask & BIT(perf_level.level)))
+		return -EINVAL;
+
+	_read_pp_info("current_level", level, SST_PP_STATUS_OFFSET,
+		      SST_PP_LEVEL_START, SST_PP_LEVEL_WIDTH, SST_MUL_FACTOR_NONE)
+
+	/* If the requested new level is same as the current level, reject */
+	if (perf_level.level == level)
+		return -EINVAL;
+
+	_write_pp_info("perf_level", perf_level.level, SST_PP_CONTROL_OFFSET,
+		       SST_PP_LEVEL_START, SST_PP_LEVEL_WIDTH, SST_MUL_FACTOR_NONE)
+
+	/* It is possible that firmware is busy (although unlikely), so retry */
+	do {
+		/* Give time to FW to process */
+		msleep(SST_PP_LEVEL_CHANGE_TIME_MS);
+
+		_read_pp_info("current_level", level, SST_PP_STATUS_OFFSET,
+			      SST_PP_LEVEL_START, SST_PP_LEVEL_WIDTH, SST_MUL_FACTOR_NONE)
+
+		/* Check if the new level is active */
+		if (perf_level.level == level)
+			break;
+
+	} while (retry++ < SST_PP_LEVEL_CHANGE_RETRY_COUNT);
+
+	/* If the level change didn't happen, return fault */
+	if (perf_level.level != level)
+		return -EFAULT;
+
+	/* Reset the feature state on level change */
+	_write_pp_info("perf_feature", 0, SST_PP_CONTROL_OFFSET,
+		       SST_PP_FEATURE_STATE_START, SST_PP_FEATURE_STATE_WIDTH,
+		       SST_MUL_FACTOR_NONE)
+
+	/* Give time to FW to process */
+	msleep(SST_PP_LEVEL_CHANGE_TIME_MS);
+
+	return 0;
+}
+
+static int isst_if_set_perf_feature(void __user *argp)
+{
+	struct isst_perf_feature_control perf_feature;
+	struct tpmi_per_power_domain_info *power_domain_info;
+
+	if (disable_dynamic_sst_features())
+		return -EFAULT;
+
+	if (copy_from_user(&perf_feature, argp, sizeof(perf_feature)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(perf_feature.socket_id, perf_feature.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	_write_pp_info("perf_feature", perf_feature.feature, SST_PP_CONTROL_OFFSET,
+		       SST_PP_FEATURE_STATE_START, SST_PP_FEATURE_STATE_WIDTH,
+		       SST_MUL_FACTOR_NONE)
+
+	return 0;
+}
+
+#define _read_pp_level_info(name_str, name, level, offset, start, width, mult_factor)\
+{\
+	u64 val, _mask;\
+	\
+	val = readq(power_domain_info->sst_base +\
+		    power_domain_info->perf_levels[level].mmio_offset +\
+		(power_domain_info->feature_offsets.pp_offset * 8) + (offset));\
+	_mask = GENMASK_ULL((start + width - 1), start);\
+	val &= _mask; \
+	val >>= start;\
+	name = (val * mult_factor);\
+}
+
+#define SST_PP_INFO_0_OFFSET	0
+#define SST_PP_INFO_1_OFFSET	8
+#define SST_PP_INFO_2_OFFSET	16
+#define SST_PP_INFO_3_OFFSET	24
+
+/* SST_PP_INFO_4_OFFSET to SST_PP_INFO_9_OFFSET are trl levels */
+#define SST_PP_INFO_4_OFFSET	32
+
+#define SST_PP_INFO_10_OFFSET	80
+#define SST_PP_INFO_11_OFFSET	88
+
+#define SST_PP_P1_SSE_START	0
+#define SST_PP_P1_SSE_WIDTH	8
+
+#define SST_PP_P1_AVX2_START	8
+#define SST_PP_P1_AVX2_WIDTH	8
+
+#define SST_PP_P1_AVX512_START	16
+#define SST_PP_P1_AVX512_WIDTH	8
+
+#define SST_PP_P1_AMX_START	24
+#define SST_PP_P1_AMX_WIDTH	8
+
+#define SST_PP_TDP_START	32
+#define SST_PP_TDP_WIDTH	15
+
+#define SST_PP_T_PROCHOT_START	47
+#define SST_PP_T_PROCHOT_WIDTH	8
+
+#define SST_PP_MAX_MEMORY_FREQ_START	55
+#define SST_PP_MAX_MEMORY_FREQ_WIDTH	7
+
+#define SST_PP_COOLING_TYPE_START	62
+#define SST_PP_COOLING_TYPE_WIDTH	2
+
+#define SST_PP_TRL_0_RATIO_0_START	0
+#define SST_PP_TRL_0_RATIO_0_WIDTH	8
+
+#define SST_PP_TRL_CORES_BUCKET_0_START	0
+#define SST_PP_TRL_CORES_BUCKET_0_WIDTH	8
+
+#define SST_PP_CORE_RATIO_P0_START	0
+#define SST_PP_CORE_RATIO_P0_WIDTH	8
+
+#define SST_PP_CORE_RATIO_P1_START	8
+#define SST_PP_CORE_RATIO_P1_WIDTH	8
+
+#define SST_PP_CORE_RATIO_PN_START	16
+#define SST_PP_CORE_RATIO_PN_WIDTH	8
+
+#define SST_PP_CORE_RATIO_PM_START	24
+#define SST_PP_CORE_RATIO_PM_WIDTH	8
+
+#define SST_PP_CORE_RATIO_P0_FABRIC_START	32
+#define SST_PP_CORE_RATIO_P0_FABRIC_WIDTH	8
+
+#define SST_PP_CORE_RATIO_P1_FABRIC_START	40
+#define SST_PP_CORE_RATIO_P1_FABRIC_WIDTH	8
+
+#define SST_PP_CORE_RATIO_PM_FABRIC_START	48
+#define SST_PP_CORE_RATIO_PM_FABRIC_WIDTH	8
+
+static int isst_if_get_perf_level_info(void __user *argp)
+{
+	struct isst_perf_level_data_info perf_level;
+	struct tpmi_per_power_domain_info *power_domain_info;
+	int i, j;
+
+	if (copy_from_user(&perf_level, argp, sizeof(perf_level)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(perf_level.socket_id, perf_level.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	if (perf_level.level > power_domain_info->max_level)
+		return -EINVAL;
+
+	if (!(power_domain_info->pp_header.level_en_mask & BIT(perf_level.level)))
+		return -EINVAL;
+
+	_read_pp_level_info("tdp_ratio", perf_level.tdp_ratio, perf_level.level,
+			    SST_PP_INFO_0_OFFSET, SST_PP_P1_SSE_START, SST_PP_P1_SSE_WIDTH,
+			    SST_MUL_FACTOR_NONE)
+	_read_pp_level_info("base_freq_mhz", perf_level.base_freq_mhz, perf_level.level,
+			    SST_PP_INFO_0_OFFSET, SST_PP_P1_SSE_START, SST_PP_P1_SSE_WIDTH,
+			    SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("base_freq_avx2_mhz", perf_level.base_freq_avx2_mhz, perf_level.level,
+			    SST_PP_INFO_0_OFFSET, SST_PP_P1_AVX2_START, SST_PP_P1_AVX2_WIDTH,
+			    SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("base_freq_avx512_mhz", perf_level.base_freq_avx512_mhz,
+			    perf_level.level, SST_PP_INFO_0_OFFSET, SST_PP_P1_AVX512_START,
+			    SST_PP_P1_AVX512_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("base_freq_amx_mhz", perf_level.base_freq_amx_mhz, perf_level.level,
+			    SST_PP_INFO_0_OFFSET, SST_PP_P1_AMX_START, SST_PP_P1_AMX_WIDTH,
+			    SST_MUL_FACTOR_FREQ)
+
+	_read_pp_level_info("thermal_design_power_w", perf_level.thermal_design_power_w,
+			    perf_level.level, SST_PP_INFO_1_OFFSET, SST_PP_TDP_START,
+			    SST_PP_TDP_WIDTH, SST_MUL_FACTOR_NONE)
+	perf_level.thermal_design_power_w /= 8; /* units are in 1/8th watt */
+	_read_pp_level_info("tjunction_max_c", perf_level.tjunction_max_c, perf_level.level,
+			    SST_PP_INFO_1_OFFSET, SST_PP_T_PROCHOT_START, SST_PP_T_PROCHOT_WIDTH,
+			    SST_MUL_FACTOR_NONE)
+	_read_pp_level_info("max_memory_freq_mhz", perf_level.max_memory_freq_mhz,
+			    perf_level.level, SST_PP_INFO_1_OFFSET, SST_PP_MAX_MEMORY_FREQ_START,
+			    SST_PP_MAX_MEMORY_FREQ_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("cooling_type", perf_level.cooling_type, perf_level.level,
+			    SST_PP_INFO_1_OFFSET, SST_PP_COOLING_TYPE_START,
+			    SST_PP_COOLING_TYPE_WIDTH, SST_MUL_FACTOR_NONE)
+
+	for (i = 0; i < TRL_MAX_LEVELS; ++i) {
+		for (j = 0; j < TRL_MAX_BUCKETS; ++j)
+			_read_pp_level_info("trl*_bucket*_freq_mhz",
+					    perf_level.trl_freq_mhz[i][j], perf_level.level,
+					    SST_PP_INFO_4_OFFSET + (i * SST_PP_TRL_0_RATIO_0_WIDTH),
+					    j * SST_PP_TRL_0_RATIO_0_WIDTH,
+					    SST_PP_TRL_0_RATIO_0_WIDTH,
+					    SST_MUL_FACTOR_FREQ);
+	}
+
+	for (i = 0; i < TRL_MAX_BUCKETS; ++i)
+		_read_pp_level_info("bucket*_core_count", perf_level.bucket_core_counts[i],
+				    perf_level.level, SST_PP_INFO_10_OFFSET,
+				    SST_PP_TRL_CORES_BUCKET_0_WIDTH * i,
+				    SST_PP_TRL_CORES_BUCKET_0_WIDTH, SST_MUL_FACTOR_NONE)
+
+	perf_level.max_buckets = TRL_MAX_BUCKETS;
+	perf_level.max_trl_levels = TRL_MAX_LEVELS;
+
+	_read_pp_level_info("p0_freq_mhz", perf_level.p0_freq_mhz, perf_level.level,
+			    SST_PP_INFO_11_OFFSET, SST_PP_CORE_RATIO_P0_START,
+			    SST_PP_CORE_RATIO_P0_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("p1_freq_mhz", perf_level.p1_freq_mhz, perf_level.level,
+			    SST_PP_INFO_11_OFFSET, SST_PP_CORE_RATIO_P1_START,
+			    SST_PP_CORE_RATIO_P1_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("pn_freq_mhz", perf_level.pn_freq_mhz, perf_level.level,
+			    SST_PP_INFO_11_OFFSET, SST_PP_CORE_RATIO_PN_START,
+			    SST_PP_CORE_RATIO_PN_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("pm_freq_mhz", perf_level.pm_freq_mhz, perf_level.level,
+			    SST_PP_INFO_11_OFFSET, SST_PP_CORE_RATIO_PM_START,
+			    SST_PP_CORE_RATIO_PM_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("p0_fabric_freq_mhz", perf_level.p0_fabric_freq_mhz,
+			    perf_level.level, SST_PP_INFO_11_OFFSET,
+			    SST_PP_CORE_RATIO_P0_FABRIC_START,
+			    SST_PP_CORE_RATIO_P0_FABRIC_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("p1_fabric_freq_mhz", perf_level.p1_fabric_freq_mhz,
+			    perf_level.level, SST_PP_INFO_11_OFFSET,
+			    SST_PP_CORE_RATIO_P1_FABRIC_START,
+			    SST_PP_CORE_RATIO_P1_FABRIC_WIDTH, SST_MUL_FACTOR_FREQ)
+	_read_pp_level_info("pm_fabric_freq_mhz", perf_level.pm_fabric_freq_mhz,
+			    perf_level.level, SST_PP_INFO_11_OFFSET,
+			    SST_PP_CORE_RATIO_PM_FABRIC_START,
+			    SST_PP_CORE_RATIO_PM_FABRIC_WIDTH, SST_MUL_FACTOR_FREQ)
+
+	if (copy_to_user(argp, &perf_level, sizeof(perf_level)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define SST_PP_FUSED_CORE_COUNT_START	0
+#define SST_PP_FUSED_CORE_COUNT_WIDTH	8
+
+#define SST_PP_RSLVD_CORE_COUNT_START	8
+#define SST_PP_RSLVD_CORE_COUNT_WIDTH	8
+
+#define SST_PP_RSLVD_CORE_MASK_START	0
+#define SST_PP_RSLVD_CORE_MASK_WIDTH	64
+
+static int isst_if_get_perf_level_mask(void __user *argp)
+{
+	static struct isst_perf_level_cpu_mask cpumask;
+	struct tpmi_per_power_domain_info *power_domain_info;
+	u64 mask;
+
+	if (copy_from_user(&cpumask, argp, sizeof(cpumask)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(cpumask.socket_id, cpumask.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	_read_pp_level_info("mask", mask, cpumask.level, SST_PP_INFO_2_OFFSET,
+			    SST_PP_RSLVD_CORE_MASK_START, SST_PP_RSLVD_CORE_MASK_WIDTH,
+			    SST_MUL_FACTOR_NONE)
+
+	cpumask.mask = mask;
+
+	if (!cpumask.punit_cpu_map)
+		return -EOPNOTSUPP;
+
+	if (copy_to_user(argp, &cpumask, sizeof(cpumask)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int isst_if_get_tpmi_instance_count(void __user *argp)
 {
 	struct isst_tpmi_instance_count tpmi_inst;
@@ -664,6 +1064,21 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 	case ISST_IF_CLOS_ASSOC:
 		ret = isst_if_clos_assoc(argp);
 		break;
+	case ISST_IF_PERF_LEVELS:
+		ret = isst_if_get_perf_level(argp);
+		break;
+	case ISST_IF_PERF_SET_LEVEL:
+		ret = isst_if_set_perf_level(argp);
+		break;
+	case ISST_IF_PERF_SET_FEATURE:
+		ret = isst_if_set_perf_feature(argp);
+		break;
+	case ISST_IF_GET_PERF_LEVEL_INFO:
+		ret = isst_if_get_perf_level_info(argp);
+		break;
+	case ISST_IF_GET_PERF_LEVEL_CPU_MASK:
+		ret = isst_if_get_perf_level_mask(argp);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index 32687d8023ef..c4b350ea5cbe 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -254,6 +254,178 @@ struct isst_tpmi_instance_count {
 	__u16 valid_mask;
 };
 
+/**
+ * struct isst_perf_level_info - Structure to get information on SST-PP levels
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @logical_cpu: CPU number
+ * @clos:	Clos ID to assign to the logical CPU
+ * @max_level: Maximum performance level supported by the platform
+ * @feature_rev: The feature revision for SST-PP supported by the platform
+ * @level_mask: Mask of supported performance levels
+ * @current_level: Current performance level
+ * @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level
+ * @locked: SST-PP performance level change is locked/unlocked
+ * @enabled: SST-PP feature is enabled or not
+ * @sst-tf_support: SST-TF support status at this level
+ * @sst-bf_support: SST-BF support status at this level
+ *
+ * Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS.
+ */
+struct isst_perf_level_info {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 max_level;
+	__u8 feature_rev;
+	__u8 level_mask;
+	__u8 current_level;
+	__u8 feature_state;
+	__u8 locked;
+	__u8 enabled;
+	__u8 sst_tf_support;
+	__u8 sst_bf_support;
+};
+
+/**
+ * struct isst_perf_level_control - Structure to set SST-PP level
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @level:	level to set
+ *
+ * Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL.
+ */
+struct isst_perf_level_control {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 level;
+};
+
+/**
+ * struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @feature:	bit 0 = SST-BF state, bit 1 = SST-TF state
+ *
+ * Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE.
+ */
+struct isst_perf_feature_control {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 feature;
+};
+
+#define TRL_MAX_BUCKETS	8
+#define TRL_MAX_LEVELS		6
+
+/**
+ * struct isst_perf_level_data_info - Structure to get SST-PP level details
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @level:	SST-PP level for which caller wants to get information
+ * @tdp_ratio: TDP Ratio
+ * @base_freq_mhz: Base frequency in MHz
+ * @base_freq_avx2_mhz: AVX2 Base frequency in MHz
+ * @base_freq_avx512_mhz: AVX512 base frequency in MHz
+ * @base_freq_amx_mhz: AMX base frequency in MHz
+ * @thermal_design_power_w: Thermal design (TDP) power
+ * @tjunction_max_c: Max junction temperature
+ * @max_memory_freq_mhz: Max memory frequency in MHz
+ * @cooling_type: Type of cooling is used
+ * @p0_freq_mhz: core maximum frequency
+ * @p1_freq_mhz: Core TDP frequency
+ * @pn_freq_mhz: Core maximum efficiency frequency
+ * @pm_freq_mhz: Core minimum frequency
+ * @p0_fabric_freq_mhz: Fabric (Uncore) maximum frequency
+ * @p1_fabric_freq_mhz: Fabric (Uncore) TDP frequency
+ * @pn_fabric_freq_mhz: Fabric (Uncore) minimum efficiency frequency
+ * @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency
+ * @max_buckets: Maximum trl buckets
+ * @max_trl_levels: Maximum trl levels
+ * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket
+ * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency
+ * for a bucket and trl level
+ *
+ * Structure used to get information on frequencies and TDP for a SST-PP
+ * level using ISST_IF_GET_PERF_LEVEL_INFO.
+ */
+struct isst_perf_level_data_info {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u16 level;
+	__u16 tdp_ratio;
+	__u16 base_freq_mhz;
+	__u16 base_freq_avx2_mhz;
+	__u16 base_freq_avx512_mhz;
+	__u16 base_freq_amx_mhz;
+	__u16 thermal_design_power_w;
+	__u16 tjunction_max_c;
+	__u16 max_memory_freq_mhz;
+	__u16 cooling_type;
+	__u16 p0_freq_mhz;
+	__u16 p1_freq_mhz;
+	__u16 pn_freq_mhz;
+	__u16 pm_freq_mhz;
+	__u16 p0_fabric_freq_mhz;
+	__u16 p1_fabric_freq_mhz;
+	__u16 pn_fabric_freq_mhz;
+	__u16 pm_fabric_freq_mhz;
+	__u16 max_buckets;
+	__u16 max_trl_levels;
+	__u16 bucket_core_counts[TRL_MAX_BUCKETS];
+	__u16 trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS];
+};
+
+/**
+ * struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @level:	SST-PP level for which caller wants to get information
+ * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
+ *		   Linux CPU number. If 0 CPU buffer is copied to user space
+ *		   supplied cpu_buffer of size cpu_buffer_size. Punit
+ *		   cpu mask is copied to "mask" field.
+ * @mask:	cpu mask for this PP level (punit CPU numbering)
+ * @cpu_buffer_size: size of cpu_buffer also used to return the copied CPU
+ *		buffer size.
+ * @cpu_buffer:	Buffer to copy CPU mask when punit_cpu_map is 0
+ *
+ * Structure used to get cpumask for a SST-PP level using
+ * IOCTL ISST_IF_GET_PERF_LEVEL_CPU_MASK. Also used to get CPU mask for
+ * IOCTL ISST_IF_GET_BASE_FREQ_CPU_MASK for SST-BF.
+ */
+struct isst_perf_level_cpu_mask {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u8 level;
+	__u8 punit_cpu_map;
+	__u64 mask;
+	__u16 cpu_buffer_size;
+	__s8 cpu_buffer[1];
+};
+
+/**
+ * struct isst_base_freq_info - Structure to get SST-BF frequencies
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @level:	SST-PP level for which caller wants to get information
+ * @high_base_freq_mhz: High priority CPU base frequency
+ * @low_base_freq_mhz: Low priority CPU base frequency
+ * @tjunction_max_c: Max junction temperature
+ * @thermal_design_power_w: Thermal design power in watts
+ *
+ * Structure used to get SST-BF information using
+ * IOCTL ISST_IF_GET_BASE_FREQ_INFO.
+ */
+struct isst_base_freq_info {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u16 level;
+	__u16 high_base_freq_mhz;
+	__u16 low_base_freq_mhz;
+	__u16 tjunction_max_c;
+	__u16 thermal_design_power_w;
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
@@ -266,4 +438,12 @@ struct isst_tpmi_instance_count {
 #define ISST_IF_CLOS_PARAM	_IOWR(ISST_IF_MAGIC, 7, struct isst_clos_param *)
 #define ISST_IF_CLOS_ASSOC	_IOWR(ISST_IF_MAGIC, 8, struct isst_if_clos_assoc_cmds *)
 
+#define ISST_IF_PERF_LEVELS	_IOWR(ISST_IF_MAGIC, 9, struct isst_perf_level_info *)
+#define ISST_IF_PERF_SET_LEVEL	_IOW(ISST_IF_MAGIC, 10, struct isst_perf_level_control *)
+#define ISST_IF_PERF_SET_FEATURE _IOW(ISST_IF_MAGIC, 11, struct isst_perf_feature_control *)
+#define ISST_IF_GET_PERF_LEVEL_INFO	_IOR(ISST_IF_MAGIC, 12, struct isst_perf_level_data_info *)
+#define ISST_IF_GET_PERF_LEVEL_CPU_MASK	_IOR(ISST_IF_MAGIC, 13, struct isst_perf_level_cpu_mask *)
+#define ISST_IF_GET_BASE_FREQ_INFO	_IOR(ISST_IF_MAGIC, 14, struct isst_base_freq_info *)
+#define ISST_IF_GET_BASE_FREQ_CPU_MASK	_IOR(ISST_IF_MAGIC, 15, struct isst_perf_level_cpu_mask *)
+
 #endif
-- 
cgit v1.2.3


From f8e0077a9d526cac7abbc682d83e98f834ffa909 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 7 Mar 2023 23:06:41 -0800
Subject: platform/x86: ISST: Add SST-TF support via TPMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The support of Intel Speed Select Technology - Turbo Frequency (SST-TF)
feature enables the ability to set different “All core turbo ratio
limits” to cores based on the priority. By using this feature, some cores
can be configured to get higher turbo frequency by designating them as
high priority at the cost of lower or no turbo frequency on the low
priority cores.

One new IOCTLs are added:
ISST_IF_GET_TURBO_FREQ_INFO : Get information about turbo frequency
				buckets

Once an instance is identified, read or write from correct MMIO
offset for a given field as defined in the specification.

For details on SST-TF operations using intel-speed-selet utility,
refer to:
Documentation/admin-guide/pm/intel-speed-select.rst
under the kernel documentation

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Pragya Tanwar <pragya.tanwar@intel.com>
Link: https://lore.kernel.org/r/20230308070642.1727167-8-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../x86/intel/speed_select_if/isst_tpmi_core.c     | 66 ++++++++++++++++++++++
 include/uapi/linux/isst_if.h                       | 26 +++++++++
 2 files changed, 92 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
index 0566c8d30181..5104717afe0e 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -1125,6 +1125,69 @@ static int isst_if_get_tpmi_instance_count(void __user *argp)
 	return 0;
 }
 
+#define SST_TF_INFO_0_OFFSET	0
+#define SST_TF_INFO_1_OFFSET	8
+#define SST_TF_INFO_2_OFFSET	16
+
+#define SST_TF_MAX_LP_CLIP_RATIOS	TRL_MAX_LEVELS
+
+#define SST_TF_LP_CLIP_RATIO_0_START	16
+#define SST_TF_LP_CLIP_RATIO_0_WIDTH	8
+
+#define SST_TF_RATIO_0_START	0
+#define SST_TF_RATIO_0_WIDTH	8
+
+#define SST_TF_NUM_CORE_0_START 0
+#define SST_TF_NUM_CORE_0_WIDTH 8
+
+static int isst_if_get_turbo_freq_info(void __user *argp)
+{
+	static struct isst_turbo_freq_info turbo_freq;
+	struct tpmi_per_power_domain_info *power_domain_info;
+	int i, j;
+
+	if (copy_from_user(&turbo_freq, argp, sizeof(turbo_freq)))
+		return -EFAULT;
+
+	power_domain_info = get_instance(turbo_freq.socket_id, turbo_freq.power_domain_id);
+	if (!power_domain_info)
+		return -EINVAL;
+
+	if (turbo_freq.level > power_domain_info->max_level)
+		return -EINVAL;
+
+	turbo_freq.max_buckets = TRL_MAX_BUCKETS;
+	turbo_freq.max_trl_levels = TRL_MAX_LEVELS;
+	turbo_freq.max_clip_freqs = SST_TF_MAX_LP_CLIP_RATIOS;
+
+	for (i = 0; i < turbo_freq.max_clip_freqs; ++i)
+		_read_tf_level_info("lp_clip*", turbo_freq.lp_clip_freq_mhz[i],
+				    turbo_freq.level, SST_TF_INFO_0_OFFSET,
+				    SST_TF_LP_CLIP_RATIO_0_START +
+				    (i * SST_TF_LP_CLIP_RATIO_0_WIDTH),
+				    SST_TF_LP_CLIP_RATIO_0_WIDTH, SST_MUL_FACTOR_FREQ)
+
+	for (i = 0; i < TRL_MAX_LEVELS; ++i) {
+		for (j = 0; j < TRL_MAX_BUCKETS; ++j)
+			_read_tf_level_info("cydn*_bucket_*_trl",
+					    turbo_freq.trl_freq_mhz[i][j], turbo_freq.level,
+					    SST_TF_INFO_2_OFFSET + (i * SST_TF_RATIO_0_WIDTH),
+					    j * SST_TF_RATIO_0_WIDTH, SST_TF_RATIO_0_WIDTH,
+					    SST_MUL_FACTOR_FREQ)
+	}
+
+	for (i = 0; i < TRL_MAX_BUCKETS; ++i)
+		_read_tf_level_info("bucket_*_core_count", turbo_freq.bucket_core_counts[i],
+				    turbo_freq.level, SST_TF_INFO_1_OFFSET,
+				    SST_TF_NUM_CORE_0_WIDTH * i, SST_TF_NUM_CORE_0_WIDTH,
+				    SST_MUL_FACTOR_NONE)
+
+	if (copy_to_user(argp, &turbo_freq, sizeof(turbo_freq)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 			      unsigned long arg)
 {
@@ -1166,6 +1229,9 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 	case ISST_IF_GET_BASE_FREQ_CPU_MASK:
 		ret = isst_if_get_base_freq_mask(argp);
 		break;
+	case ISST_IF_GET_TURBO_FREQ_INFO:
+		ret = isst_if_get_turbo_freq_info(argp);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index c4b350ea5cbe..0df1a1c3caf4 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -426,6 +426,31 @@ struct isst_base_freq_info {
 	__u16 thermal_design_power_w;
 };
 
+/**
+ * struct isst_turbo_freq_info - Structure to get SST-TF frequencies
+ * @socket_id:	Socket/package id
+ * @power_domain:	Power Domain id
+ * @level:	SST-PP level for which caller wants to get information
+ * @max_clip_freqs: Maximum number of low priority core clipping frequencies
+ * @lp_clip_freq_mhz: Clip frequencies per trl level
+ * @bucket_core_counts: Maximum number of cores for a bucket
+ * @trl_freq_mhz: Frequencies per trl level for each bucket
+ *
+ * Structure used to get SST-TF information using
+ * IOCTL ISST_IF_GET_TURBO_FREQ_INFO.
+ */
+struct isst_turbo_freq_info {
+	__u8 socket_id;
+	__u8 power_domain_id;
+	__u16 level;
+	__u16 max_clip_freqs;
+	__u16 max_buckets;
+	__u16 max_trl_levels;
+	__u16 lp_clip_freq_mhz[TRL_MAX_LEVELS];
+	__u16 bucket_core_counts[TRL_MAX_BUCKETS];
+	__u16 trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS];
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
@@ -445,5 +470,6 @@ struct isst_base_freq_info {
 #define ISST_IF_GET_PERF_LEVEL_CPU_MASK	_IOR(ISST_IF_MAGIC, 13, struct isst_perf_level_cpu_mask *)
 #define ISST_IF_GET_BASE_FREQ_INFO	_IOR(ISST_IF_MAGIC, 14, struct isst_base_freq_info *)
 #define ISST_IF_GET_BASE_FREQ_CPU_MASK	_IOR(ISST_IF_MAGIC, 15, struct isst_perf_level_cpu_mask *)
+#define ISST_IF_GET_TURBO_FREQ_INFO	_IOR(ISST_IF_MAGIC, 16, struct isst_turbo_freq_info *)
 
 #endif
-- 
cgit v1.2.3


From c5edd753a0bd6243a597f5199c227a50457ee179 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 8 Feb 2023 15:01:02 +0100
Subject: KVM: x86: Remove the KVM_GET_NR_MMU_PAGES ioctl

The KVM_GET_NR_MMU_PAGES ioctl is quite questionable on 64-bit hosts
since it fails to return the full 64 bits of the value that can be
set with the corresponding KVM_SET_NR_MMU_PAGES call. Its "long" return
value is truncated into an "int" in the kvm_arch_vm_ioctl() function.

Since this ioctl also never has been used by userspace applications
(QEMU, Google's internal VMM, kvmtool and CrosVM have been checked),
it's likely the best if we remove this badly designed ioctl before
anybody really tries to use it.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20230208140105.655814-4-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c             | 8 --------
 include/uapi/linux/kvm.h       | 2 +-
 tools/include/uapi/linux/kvm.h | 2 +-
 3 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff7f398a0c6a..4282daeaee84 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6021,11 +6021,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 	return 0;
 }
 
-static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
-	return kvm->arch.n_max_mmu_pages;
-}
-
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
 	struct kvm_pic *pic = kvm->arch.vpic;
@@ -6711,9 +6706,6 @@ set_identity_unlock:
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		break;
-	case KVM_GET_NR_MMU_PAGES:
-		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
-		break;
 	case KVM_CREATE_IRQCHIP: {
 		mutex_lock(&kvm->lock);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d77aef872a0a..4003a166328c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce {
 #define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
 #define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
-#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)  /* deprecated */
 #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index d77aef872a0a..4003a166328c 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce {
 #define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
 #define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
-#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)  /* deprecated */
 #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
-- 
cgit v1.2.3


From d8708b80fa0e6e21bc0c9e7276ad0bccef73b6e7 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 8 Feb 2023 15:01:05 +0100
Subject: KVM: Change return type of kvm_arch_vm_ioctl() to "int"

All kvm_arch_vm_ioctl() implementations now only deal with "int"
types as return values, so we can change the return type of these
functions to use "int" instead of "long".

Signed-off-by: Thomas Huth <thuth@redhat.com>
Acked-by: Anup Patel <anup@brainfault.org>
Message-Id: <20230208140105.655814-7-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/arm.c       | 3 +--
 arch/mips/kvm/mips.c       | 4 ++--
 arch/powerpc/kvm/powerpc.c | 5 ++---
 arch/riscv/kvm/vm.c        | 3 +--
 arch/s390/kvm/kvm-s390.c   | 3 +--
 arch/x86/kvm/x86.c         | 3 +--
 include/linux/kvm_host.h   | 3 +--
 7 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 3bd732eaf087..a43e1cb3b7e9 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1439,8 +1439,7 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 	}
 }
 
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 36c8991b5d39..884be4ef99dc 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -993,9 +993,9 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 	kvm_flush_remote_tlbs(kvm);
 }
 
-long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
-	long r;
+	int r;
 
 	switch (ioctl) {
 	default:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c5405fc5538..c0bac9cf2d87 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -2371,12 +2371,11 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
 }
 #endif
 
-long kvm_arch_vm_ioctl(struct file *filp,
-                       unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm __maybe_unused = filp->private_data;
 	void __user *argp = (void __user *)arg;
-	long r;
+	int r;
 
 	switch (ioctl) {
 	case KVM_PPC_GET_PVINFO: {
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 65a964d7e70d..c13130ab459a 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -87,8 +87,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	return r;
 }
 
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	return -EINVAL;
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7039abda6d49..4c3edccb9911 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2898,8 +2898,7 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 	}
 }
 
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4282daeaee84..237c483b1230 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6667,8 +6667,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
 	return 0;
 }
 
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8ada23756b0e..90edc16d37e5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1397,8 +1397,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 			bool line_status);
 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			    struct kvm_enable_cap *cap);
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg);
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
 long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
 			      unsigned long arg);
 
-- 
cgit v1.2.3


From 8af70e202ac4ea5dd2d5561cc2675bc09b30412e Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 18 Feb 2023 17:21:21 +0000
Subject: leds: Fix reference to led_set_brightness() in doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The referenced function led_classdev_brightness_set() never existed.

Fixes: 5ada28bf7675 ("led-class: always implement blinking")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230218-typo-led-set-v1-1-3c35362a2f2d@weissschuh.net
---
 include/linux/leds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index d71201a968b6..aedf34165354 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -256,7 +256,7 @@ struct led_classdev *__must_check devm_of_led_get(struct device *dev,
  *
  * Note that if software blinking is active, simply calling
  * led_cdev->brightness_set() will not stop the blinking,
- * use led_classdev_brightness_set() instead.
+ * use led_set_brightness() instead.
  */
 void led_blink_set(struct led_classdev *led_cdev, unsigned long *delay_on,
 		   unsigned long *delay_off);
-- 
cgit v1.2.3


From 70493a63ba04f754f7a7dd53a4fcc82700181490 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 15 Mar 2023 11:39:03 -0700
Subject: blk-crypto: make blk_crypto_evict_key() return void

blk_crypto_evict_key() is only called in contexts such as inode eviction
where failure is not an option.  So there is nothing the caller can do
with errors except log them.  (dm-table.c does "use" the error code, but
only to pass on to upper layers, so it doesn't really count.)

Just make blk_crypto_evict_key() return void and log errors itself.

Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230315183907.53675-2-ebiggers@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-crypto.c         | 20 +++++++++-----------
 drivers/md/dm-table.c      | 19 +++++--------------
 include/linux/blk-crypto.h |  4 ++--
 3 files changed, 16 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index d0c7feb447e9..e800f305e9ed 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -13,6 +13,7 @@
 #include <linux/blkdev.h>
 #include <linux/blk-crypto-profile.h>
 #include <linux/module.h>
+#include <linux/ratelimit.h>
 #include <linux/slab.h>
 
 #include "blk-crypto-internal.h"
@@ -408,21 +409,18 @@ int blk_crypto_start_using_key(struct block_device *bdev,
  * Upper layers (filesystems) must call this function to ensure that a key is
  * evicted from any hardware that it might have been programmed into.  The key
  * must not be in use by any in-flight IO when this function is called.
- *
- * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
  */
-int blk_crypto_evict_key(struct block_device *bdev,
-			 const struct blk_crypto_key *key)
+void blk_crypto_evict_key(struct block_device *bdev,
+			  const struct blk_crypto_key *key)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
+	int err;
 
 	if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
-		return __blk_crypto_evict_key(q->crypto_profile, key);
-
-	/*
-	 * If the block_device didn't support the key, then blk-crypto-fallback
-	 * may have been used, so try to evict the key from blk-crypto-fallback.
-	 */
-	return blk_crypto_fallback_evict_key(key);
+		err = __blk_crypto_evict_key(q->crypto_profile, key);
+	else
+		err = blk_crypto_fallback_evict_key(key);
+	if (err)
+		pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
 }
 EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2055a758541d..7899f5fb4c13 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1202,21 +1202,12 @@ struct dm_crypto_profile {
 	struct mapped_device *md;
 };
 
-struct dm_keyslot_evict_args {
-	const struct blk_crypto_key *key;
-	int err;
-};
-
 static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
 				     sector_t start, sector_t len, void *data)
 {
-	struct dm_keyslot_evict_args *args = data;
-	int err;
+	const struct blk_crypto_key *key = data;
 
-	err = blk_crypto_evict_key(dev->bdev, args->key);
-	if (!args->err)
-		args->err = err;
-	/* Always try to evict the key from all devices. */
+	blk_crypto_evict_key(dev->bdev, key);
 	return 0;
 }
 
@@ -1229,7 +1220,6 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
 {
 	struct mapped_device *md =
 		container_of(profile, struct dm_crypto_profile, profile)->md;
-	struct dm_keyslot_evict_args args = { key };
 	struct dm_table *t;
 	int srcu_idx;
 
@@ -1242,11 +1232,12 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
 
 		if (!ti->type->iterate_devices)
 			continue;
-		ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args);
+		ti->type->iterate_devices(ti, dm_keyslot_evict_callback,
+					  (void *)key);
 	}
 
 	dm_put_live_table(md, srcu_idx);
-	return args.err;
+	return 0;
 }
 
 static int
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index 1e3e5d0adf12..5e5822c18ee4 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -95,8 +95,8 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
 int blk_crypto_start_using_key(struct block_device *bdev,
 			       const struct blk_crypto_key *key);
 
-int blk_crypto_evict_key(struct block_device *bdev,
-			 const struct blk_crypto_key *key);
+void blk_crypto_evict_key(struct block_device *bdev,
+			  const struct blk_crypto_key *key);
 
 bool blk_crypto_config_supported_natively(struct block_device *bdev,
 					  const struct blk_crypto_config *cfg);
-- 
cgit v1.2.3


From 053fdaa841bd1af9fe9c2c30bba81119059aac95 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 14 Mar 2023 15:13:08 -0500
Subject: nfc: mrvl: Move platform_data struct into driver

There are no users of nfcmrvl platform_data struct outside of the
driver and none will be added, so move it into the driver.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/nfcmrvl/nfcmrvl.h         | 30 ++++++++++++++++++++--
 include/linux/platform_data/nfcmrvl.h | 48 -----------------------------------
 2 files changed, 28 insertions(+), 50 deletions(-)
 delete mode 100644 include/linux/platform_data/nfcmrvl.h

(limited to 'include')

diff --git a/drivers/nfc/nfcmrvl/nfcmrvl.h b/drivers/nfc/nfcmrvl/nfcmrvl.h
index 165bd0a95190..0f22b3233f73 100644
--- a/drivers/nfc/nfcmrvl/nfcmrvl.h
+++ b/drivers/nfc/nfcmrvl/nfcmrvl.h
@@ -8,8 +8,6 @@
 #ifndef _NFCMRVL_H_
 #define _NFCMRVL_H_
 
-#include <linux/platform_data/nfcmrvl.h>
-
 #include "fw_dnld.h"
 
 /* Define private flags: */
@@ -50,6 +48,34 @@ enum nfcmrvl_phy {
 	NFCMRVL_PHY_SPI		= 3,
 };
 
+struct nfcmrvl_platform_data {
+	/*
+	 * Generic
+	 */
+
+	/* GPIO that is wired to RESET_N signal */
+	int reset_n_io;
+	/* Tell if transport is muxed in HCI one */
+	unsigned int hci_muxed;
+
+	/*
+	 * UART specific
+	 */
+
+	/* Tell if UART needs flow control at init */
+	unsigned int flow_control;
+	/* Tell if firmware supports break control for power management */
+	unsigned int break_control;
+
+
+	/*
+	 * I2C specific
+	 */
+
+	unsigned int irq;
+	unsigned int irq_polarity;
+};
+
 struct nfcmrvl_private {
 
 	unsigned long flags;
diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h
deleted file mode 100644
index 9e75ac8d19be..000000000000
--- a/include/linux/platform_data/nfcmrvl.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2015, Marvell International Ltd.
- *
- * This software file (the "File") is distributed by Marvell International
- * Ltd. under the terms of the GNU General Public License Version 2, June 1991
- * (the "License").  You may use, redistribute and/or modify this File in
- * accordance with the terms and conditions of the License, a copy of which
- * is available on the worldwide web at
- * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
- *
- * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
- * ARE EXPRESSLY DISCLAIMED.  The License provides additional details about
- * this warranty disclaimer.
- */
-
-#ifndef _NFCMRVL_PTF_H_
-#define _NFCMRVL_PTF_H_
-
-struct nfcmrvl_platform_data {
-	/*
-	 * Generic
-	 */
-
-	/* GPIO that is wired to RESET_N signal */
-	int reset_n_io;
-	/* Tell if transport is muxed in HCI one */
-	unsigned int hci_muxed;
-
-	/*
-	 * UART specific
-	 */
-
-	/* Tell if UART needs flow control at init */
-	unsigned int flow_control;
-	/* Tell if firmware supports break control for power management */
-	unsigned int break_control;
-
-
-	/*
-	 * I2C specific
-	 */
-
-	unsigned int irq;
-	unsigned int irq_polarity;
-};
-
-#endif /* _NFCMRVL_PTF_H_ */
-- 
cgit v1.2.3


From 428e106ae1ad4e45d3fd6978a753db475d0d0ec9 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sun, 12 Mar 2023 14:26:00 +0300
Subject: mm: Introduce untagged_addr_remote()

untagged_addr() removes tags/metadata from the address and brings it to
the canonical form. The helper is implemented on arm64 and sparc. Both of
them do untagging based on global rules.

However, Linear Address Masking (LAM) on x86 introduces per-process
settings for untagging. As a result, untagged_addr() is now only
suitable for untagging addresses for the current proccess.

The new helper untagged_addr_remote() has to be used when the address
targets remote process. It requires the mmap lock for target mm to be
taken.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Alexander Potapenko <glider@google.com>
Link: https://lore.kernel.org/all/20230312112612.31869-6-kirill.shutemov%40linux.intel.com
---
 arch/sparc/include/asm/uaccess_64.h |  2 ++
 drivers/vfio/vfio_iommu_type1.c     |  2 +-
 fs/proc/task_mmu.c                  |  9 +++++++--
 include/linux/mm.h                  | 11 -----------
 include/linux/uaccess.h             | 22 ++++++++++++++++++++++
 mm/gup.c                            |  4 ++--
 mm/madvise.c                        |  5 +++--
 mm/migrate.c                        | 11 ++++++-----
 8 files changed, 43 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 94266a5c5b04..b825a5dd0210 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -8,8 +8,10 @@
 
 #include <linux/compiler.h>
 #include <linux/string.h>
+#include <linux/mm_types.h>
 #include <asm/asi.h>
 #include <asm/spitfire.h>
+#include <asm/pgtable.h>
 
 #include <asm/processor.h>
 #include <asm-generic/access_ok.h>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 493c31de0edb..3d4dd9420c30 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -580,7 +580,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 		goto done;
 	}
 
-	vaddr = untagged_addr(vaddr);
+	vaddr = untagged_addr_remote(mm, vaddr);
 
 retry:
 	vma = vma_lookup(mm, vaddr);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..29fd6b1f4058 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1689,8 +1689,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	/* watch out for wraparound */
 	start_vaddr = end_vaddr;
-	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
-		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+		ret = mmap_read_lock_killable(mm);
+		if (ret)
+			goto out_free;
+		start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+		mmap_read_unlock(mm);
+	}
 
 	/* Ensure the address is inside the task */
 	if (start_vaddr > mm->task_size)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..289ae4caf878 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -96,17 +96,6 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #include <asm/page.h>
 #include <asm/processor.h>
 
-/*
- * Architectures that support memory tagging (assigning tags to memory regions,
- * embedding these tags into addresses that point to these memory regions, and
- * checking that the memory and the pointer tags match on memory accesses)
- * redefine this macro to strip tags from pointers.
- * It's defined as noop for architectures that don't support memory tagging.
- */
-#ifndef untagged_addr
-#define untagged_addr(addr) (addr)
-#endif
-
 #ifndef __pa_symbol
 #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
 #endif
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ab9728138ad6..3064314f4832 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -10,6 +10,28 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * Architectures that support memory tagging (assigning tags to memory regions,
+ * embedding these tags into addresses that point to these memory regions, and
+ * checking that the memory and the pointer tags match on memory accesses)
+ * redefine this macro to strip tags from pointers.
+ *
+ * Passing down mm_struct allows to define untagging rules on per-process
+ * basis.
+ *
+ * It's defined as noop for architectures that don't support memory tagging.
+ */
+#ifndef untagged_addr
+#define untagged_addr(addr) (addr)
+#endif
+
+#ifndef untagged_addr_remote
+#define untagged_addr_remote(mm, addr)	({		\
+	mmap_assert_locked(mm);				\
+	untagged_addr(addr);				\
+})
+#endif
+
 /*
  * Architectures should provide two primitives (raw_copy_{to,from}_user())
  * and get rid of their private instances of copy_{to,from}_user() and
diff --git a/mm/gup.c b/mm/gup.c
index eab18ba045db..5ee8b682a0fe 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1085,7 +1085,7 @@ static long __get_user_pages(struct mm_struct *mm,
 	if (!nr_pages)
 		return 0;
 
-	start = untagged_addr(start);
+	start = untagged_addr_remote(mm, start);
 
 	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
 
@@ -1259,7 +1259,7 @@ int fixup_user_fault(struct mm_struct *mm,
 	struct vm_area_struct *vma;
 	vm_fault_t ret;
 
-	address = untagged_addr(address);
+	address = untagged_addr_remote(mm, address);
 
 	if (unlocked)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/mm/madvise.c b/mm/madvise.c
index 340125d08c03..d4b67f36f70f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1402,8 +1402,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	size_t len;
 	struct blk_plug plug;
 
-	start = untagged_addr(start);
-
 	if (!madvise_behavior_valid(behavior))
 		return -EINVAL;
 
@@ -1435,6 +1433,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 		mmap_read_lock(mm);
 	}
 
+	start = untagged_addr_remote(mm, start);
+	end = start + len;
+
 	blk_start_plug(&plug);
 	error = madvise_walk_vmas(mm, start, end, behavior,
 			madvise_vma_behavior);
diff --git a/mm/migrate.c b/mm/migrate.c
index 98f1c11197a8..8cd11bc9208f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2097,15 +2097,18 @@ static int do_move_pages_to_node(struct mm_struct *mm,
  *         target node
  *     1 - when it has been queued
  */
-static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
 		int node, struct list_head *pagelist, bool migrate_all)
 {
 	struct vm_area_struct *vma;
+	unsigned long addr;
 	struct page *page;
 	int err;
 	bool isolated;
 
 	mmap_read_lock(mm);
+	addr = (unsigned long)untagged_addr_remote(mm, p);
+
 	err = -EFAULT;
 	vma = vma_lookup(mm, addr);
 	if (!vma || !vma_migratable(vma))
@@ -2211,7 +2214,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 
 	for (i = start = 0; i < nr_pages; i++) {
 		const void __user *p;
-		unsigned long addr;
 		int node;
 
 		err = -EFAULT;
@@ -2219,7 +2221,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 			goto out_flush;
 		if (get_user(node, nodes + i))
 			goto out_flush;
-		addr = (unsigned long)untagged_addr(p);
 
 		err = -ENODEV;
 		if (node < 0 || node >= MAX_NUMNODES)
@@ -2247,8 +2248,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 		 * Errors in the page lookup or isolation are not fatal and we simply
 		 * report them via status
 		 */
-		err = add_page_for_migration(mm, addr, current_node,
-				&pagelist, flags & MPOL_MF_MOVE_ALL);
+		err = add_page_for_migration(mm, p, current_node, &pagelist,
+					     flags & MPOL_MF_MOVE_ALL);
 
 		if (err > 0) {
 			/* The page is successfully queued for migration */
-- 
cgit v1.2.3


From f7d304343b9d2456ffba23b99d2345408251ea45 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sun, 12 Mar 2023 14:26:04 +0300
Subject: mm: Expose untagging mask in /proc/$PID/status

Add a line in /proc/$PID/status to report untag_mask. It can be
used to find out LAM status of the process from the outside. It is
useful for debuggers.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Alexander Potapenko <glider@google.com>
Link: https://lore.kernel.org/all/20230312112612.31869-10-kirill.shutemov%40linux.intel.com
---
 arch/arm64/include/asm/mmu_context.h    | 6 ++++++
 arch/sparc/include/asm/mmu_context_64.h | 6 ++++++
 arch/x86/include/asm/mmu_context.h      | 6 ++++++
 fs/proc/array.c                         | 7 +++++++
 include/linux/mmu_context.h             | 7 +++++++
 5 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 72dbd6400549..56911691bef0 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -288,6 +288,12 @@ void post_ttbr_update_workaround(void);
 unsigned long arm64_mm_context_get(struct mm_struct *mm);
 void arm64_mm_context_put(struct mm_struct *mm);
 
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+	return -1UL >> 8;
+}
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 7a8380c63aab..799e797c5cdd 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -185,6 +185,12 @@ static inline void finish_arch_post_lock_switch(void)
 	}
 }
 
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+       return -1UL >> adi_nbits();
+}
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* !(__ASSEMBLY__) */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index eb1387ac40fa..06eaaf75d572 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -104,6 +104,12 @@ static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
 	mm->context.untag_mask = oldmm->context.untag_mask;
 }
 
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+	return mm->context.untag_mask;
+}
+
 static inline void mm_reset_untag_mask(struct mm_struct *mm)
 {
 	mm->context.untag_mask = -1UL;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b0315d34c58..6daea628bc76 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,6 +91,7 @@
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/kthread.h>
+#include <linux/mmu_context.h>
 
 #include <asm/processor.h>
 #include "internal.h"
@@ -423,6 +424,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
 	seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
 }
 
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+	seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -438,6 +444,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 		task_mem(m, mm);
 		task_core_dumping(m, task);
 		task_thp_status(m, mm);
+		task_untag_mask(m, mm);
 		mmput(mm);
 	}
 	task_sig(m, task);
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index b9b970f7ab45..14b9c1fa05c4 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -28,4 +28,11 @@ static inline void leave_mm(int cpu) { }
 # define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
 #endif
 
+#ifndef mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+	return -1UL;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 400b9b93441cd4e2fe824a70140f3d5a2a9c802b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sun, 12 Mar 2023 14:26:05 +0300
Subject: iommu/sva: Replace pasid_valid() helper with mm_valid_pasid()

Kernel has few users of pasid_valid() and all but one checks if the
process has PASID allocated. The helper takes ioasid_t as the input.

Replace the helper with mm_valid_pasid() that takes mm_struct as the
argument. The only call that checks PASID that is not tied to mm_struct
is open-codded now.

This is preparatory patch. It helps avoid ifdeffery: no need to
dereference mm->pasid in generic code to check if the process has PASID.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20230312112612.31869-11-kirill.shutemov%40linux.intel.com
---
 arch/x86/kernel/traps.c   | 6 +++---
 drivers/iommu/iommu-sva.c | 4 ++--
 include/linux/ioasid.h    | 9 ---------
 include/linux/sched/mm.h  | 8 +++++++-
 4 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d317dc3d06a3..8b83d8fbce71 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -671,15 +671,15 @@ static bool try_fixup_enqcmd_gp(void)
 	if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
 		return false;
 
-	pasid = current->mm->pasid;
-
 	/*
 	 * If the mm has not been allocated a
 	 * PASID, the #GP can not be fixed up.
 	 */
-	if (!pasid_valid(pasid))
+	if (!mm_valid_pasid(current->mm))
 		return false;
 
+	pasid = current->mm->pasid;
+
 	/*
 	 * Did this thread already have its PASID activated?
 	 * If so, the #GP must be from something else.
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 24bf9b2b58aa..4ee2929f0d7a 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -34,14 +34,14 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
 
 	mutex_lock(&iommu_sva_lock);
 	/* Is a PASID already associated with this mm? */
-	if (pasid_valid(mm->pasid)) {
+	if (mm_valid_pasid(mm)) {
 		if (mm->pasid < min || mm->pasid >= max)
 			ret = -EOVERFLOW;
 		goto out;
 	}
 
 	pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
-	if (!pasid_valid(pasid))
+	if (pasid == INVALID_IOASID)
 		ret = -ENOMEM;
 	else
 		mm_pasid_set(mm, pasid);
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index af1c9d62e642..836ae09e92c2 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -40,10 +40,6 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_set_data(ioasid_t ioasid, void *data);
-static inline bool pasid_valid(ioasid_t ioasid)
-{
-	return ioasid != INVALID_IOASID;
-}
 
 #else /* !CONFIG_IOASID */
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
@@ -74,10 +70,5 @@ static inline int ioasid_set_data(ioasid_t ioasid, void *data)
 	return -ENOTSUPP;
 }
 
-static inline bool pasid_valid(ioasid_t ioasid)
-{
-	return false;
-}
-
 #endif /* CONFIG_IOASID */
 #endif /* __LINUX_IOASID_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..b69fe7e8c0ac 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -457,6 +457,11 @@ static inline void mm_pasid_init(struct mm_struct *mm)
 	mm->pasid = INVALID_IOASID;
 }
 
+static inline bool mm_valid_pasid(struct mm_struct *mm)
+{
+	return mm->pasid != INVALID_IOASID;
+}
+
 /* Associate a PASID with an mm_struct: */
 static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
 {
@@ -465,13 +470,14 @@ static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
 
 static inline void mm_pasid_drop(struct mm_struct *mm)
 {
-	if (pasid_valid(mm->pasid)) {
+	if (mm_valid_pasid(mm)) {
 		ioasid_free(mm->pasid);
 		mm->pasid = INVALID_IOASID;
 	}
 }
 #else
 static inline void mm_pasid_init(struct mm_struct *mm) {}
+static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
 static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
 static inline void mm_pasid_drop(struct mm_struct *mm) {}
 #endif
-- 
cgit v1.2.3


From 23e5d9ec2bab53c4e5fbac675304e699726c1ac5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sun, 12 Mar 2023 14:26:06 +0300
Subject: x86/mm/iommu/sva: Make LAM and SVA mutually exclusive

IOMMU and SVA-capable devices know nothing about LAM and only expect
canonical addresses. An attempt to pass down tagged pointer will lead
to address translation failure.

By default do not allow to enable both LAM and use SVA in the same
process.

The new ARCH_FORCE_TAGGED_SVA arch_prctl() overrides the limitation.
By using the arch_prctl() userspace takes responsibility to never pass
tagged address to the device.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
Reviewed-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20230312112612.31869-12-kirill.shutemov%40linux.intel.com
---
 arch/x86/include/asm/mmu.h         | 2 ++
 arch/x86/include/asm/mmu_context.h | 6 ++++++
 arch/x86/include/uapi/asm/prctl.h  | 1 +
 arch/x86/kernel/process_64.c       | 7 +++++++
 drivers/iommu/iommu-sva.c          | 4 ++++
 include/linux/mmu_context.h        | 7 +++++++
 6 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index e80762e998ce..0da5c227f490 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -14,6 +14,8 @@
 #define MM_CONTEXT_HAS_VSYSCALL		1
 /* Do not allow changing LAM mode */
 #define MM_CONTEXT_LOCK_LAM		2
+/* Allow LAM and SVA coexisting */
+#define MM_CONTEXT_FORCE_TAGGED_SVA	3
 
 /*
  * x86 has arch-specific MMU state beyond what lives in mm_struct.
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 06eaaf75d572..4c396e9a384f 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -115,6 +115,12 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
 	mm->context.untag_mask = -1UL;
 }
 
+#define arch_pgtable_dma_compat arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+	return !mm_lam_cr3_mask(mm) ||
+		test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+}
 #else
 
 static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index a31e27b95b19..eb290d89cb32 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -23,5 +23,6 @@
 #define ARCH_GET_UNTAG_MASK		0x4001
 #define ARCH_ENABLE_TAGGED_ADDR		0x4002
 #define ARCH_GET_MAX_TAG_BITS		0x4003
+#define ARCH_FORCE_TAGGED_SVA		0x4004
 
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 186f34add865..b46924c9e46d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -756,6 +756,10 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
 	if (current->mm != mm)
 		return -EINVAL;
 
+	if (mm_valid_pasid(mm) &&
+	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
+		return -EINTR;
+
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
 
@@ -878,6 +882,9 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 				(unsigned long __user *)arg2);
 	case ARCH_ENABLE_TAGGED_ADDR:
 		return prctl_enable_tagged_addr(task->mm, arg2);
+	case ARCH_FORCE_TAGGED_SVA:
+		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
+		return 0;
 	case ARCH_GET_MAX_TAG_BITS:
 		if (!cpu_feature_enabled(X86_FEATURE_LAM))
 			return put_user(0, (unsigned long __user *)arg2);
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 4ee2929f0d7a..dd76a1a09cf7 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -2,6 +2,7 @@
 /*
  * Helpers for IOMMU drivers implementing SVA
  */
+#include <linux/mmu_context.h>
 #include <linux/mutex.h>
 #include <linux/sched/mm.h>
 #include <linux/iommu.h>
@@ -32,6 +33,9 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
 	    min == 0 || max < min)
 		return -EINVAL;
 
+	if (!arch_pgtable_dma_compat(mm))
+		return -EBUSY;
+
 	mutex_lock(&iommu_sva_lock);
 	/* Is a PASID already associated with this mm? */
 	if (mm_valid_pasid(mm)) {
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 14b9c1fa05c4..f2b7a3f04099 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -35,4 +35,11 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
 }
 #endif
 
+#ifndef arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+	return true;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 4a52338bf288c95a46f20af1f5df77b80b74c9ad Mon Sep 17 00:00:00 2001
From: Ziqi Chen <quic_ziqichen@quicinc.com>
Date: Wed, 15 Mar 2023 15:44:25 +0800
Subject: scsi: ufs: core: Add trace event for MCQ

Add MCQ hardware queue ID in the existing trace event ufshcd_command().

Signed-off-by: Ziqi Chen <quic_ziqichen@quicinc.com>
Link: https://lore.kernel.org/r/1678866271-49601-1-git-send-email-quic_ziqichen@quicinc.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c  | 15 ++++++++++++---
 include/trace/events/ufs.h | 22 ++++++++++++----------
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index b960b516c4ae..8e7dfaadc691 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -422,7 +422,9 @@ static void ufshcd_add_command_trace(struct ufs_hba *hba, unsigned int tag,
 {
 	u64 lba = 0;
 	u8 opcode = 0, group_id = 0;
-	u32 intr, doorbell;
+	u32 doorbell = 0;
+	u32 intr;
+	int hwq_id = -1;
 	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
 	struct scsi_cmnd *cmd = lrbp->cmd;
 	struct request *rq = scsi_cmd_to_rq(cmd);
@@ -456,9 +458,16 @@ static void ufshcd_add_command_trace(struct ufs_hba *hba, unsigned int tag,
 	}
 
 	intr = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
-	doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
+
+	if (is_mcq_enabled(hba)) {
+		struct ufs_hw_queue *hwq = ufshcd_mcq_req_to_hwq(hba, rq);
+
+		hwq_id = hwq->id;
+	} else {
+		doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
+	}
 	trace_ufshcd_command(dev_name(hba->dev), str_t, tag,
-			doorbell, transfer_len, intr, lba, opcode, group_id);
+			doorbell, hwq_id, transfer_len, intr, lba, opcode, group_id);
 }
 
 static void ufshcd_print_clk_freqs(struct ufs_hba *hba)
diff --git a/include/trace/events/ufs.h b/include/trace/events/ufs.h
index 599739ee7b20..992517ac3292 100644
--- a/include/trace/events/ufs.h
+++ b/include/trace/events/ufs.h
@@ -268,20 +268,21 @@ DEFINE_EVENT(ufshcd_template, ufshcd_wl_runtime_resume,
 
 TRACE_EVENT(ufshcd_command,
 	TP_PROTO(const char *dev_name, enum ufs_trace_str_t str_t,
-		 unsigned int tag, u32 doorbell, int transfer_len, u32 intr,
-		 u64 lba, u8 opcode, u8 group_id),
+		 unsigned int tag, u32 doorbell, u32 hwq_id, int transfer_len,
+		 u32 intr, u64 lba, u8 opcode, u8 group_id),
 
-	TP_ARGS(dev_name, str_t, tag, doorbell, transfer_len,
-				intr, lba, opcode, group_id),
+	TP_ARGS(dev_name, str_t, tag, doorbell, hwq_id, transfer_len,
+			intr, lba, opcode, group_id),
 
 	TP_STRUCT__entry(
 		__string(dev_name, dev_name)
 		__field(enum ufs_trace_str_t, str_t)
 		__field(unsigned int, tag)
 		__field(u32, doorbell)
-		__field(int, transfer_len)
+		__field(u32, hwq_id)
 		__field(u32, intr)
 		__field(u64, lba)
+		__field(int, transfer_len)
 		__field(u8, opcode)
 		__field(u8, group_id)
 	),
@@ -291,19 +292,20 @@ TRACE_EVENT(ufshcd_command,
 		__entry->str_t = str_t;
 		__entry->tag = tag;
 		__entry->doorbell = doorbell;
-		__entry->transfer_len = transfer_len;
+		__entry->hwq_id = hwq_id;
 		__entry->intr = intr;
 		__entry->lba = lba;
+		__entry->transfer_len = transfer_len;
 		__entry->opcode = opcode;
 		__entry->group_id = group_id;
 	),
 
 	TP_printk(
-		"%s: %s: tag: %u, DB: 0x%x, size: %d, IS: %u, LBA: %llu, opcode: 0x%x (%s), group_id: 0x%x",
+		"%s: %s: tag: %u, DB: 0x%x, size: %d, IS: %u, LBA: %llu, opcode: 0x%x (%s), group_id: 0x%x, hwq_id: %d",
 		show_ufs_cmd_trace_str(__entry->str_t), __get_str(dev_name),
-		__entry->tag, __entry->doorbell, __entry->transfer_len,
-		__entry->intr, __entry->lba, (u32)__entry->opcode,
-		str_opcode(__entry->opcode), (u32)__entry->group_id
+		__entry->tag, __entry->doorbell, __entry->transfer_len, __entry->intr,
+		__entry->lba, (u32)__entry->opcode, str_opcode(__entry->opcode),
+		(u32)__entry->group_id, __entry->hwq_id
 	)
 );
 
-- 
cgit v1.2.3


From ae7d45fb7ca75e94b478e2404709ba3024774334 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 10 Mar 2023 15:19:45 -0600
Subject: crypto: ccp - Add a header for multiple drivers to use `__psp_pa`

The TEE subdriver for CCP, the amdtee driver and the i2c-designware-amdpsp
drivers all include `psp-sev.h` even though they don't use SEV
functionality.

Move the definition of `__psp_pa` into a common header to be included
by all of these drivers.

Reviewed-by: Jan Dabros <jsd@semihalf.com>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com> # For the drivers/i2c/busses/i2c-designware-amdpsp.c
Acked-by: Sumit Garg <sumit.garg@linaro.org> # For TEE subsystem bits
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Sean Christopherson <seanjc@google.com> # KVM
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/kvm/svm/sev.c                     |  1 +
 drivers/crypto/ccp/sev-dev.c               |  1 +
 drivers/crypto/ccp/tee-dev.c               |  2 +-
 drivers/i2c/busses/i2c-designware-amdpsp.c |  2 +-
 drivers/tee/amdtee/call.c                  |  2 +-
 drivers/tee/amdtee/shm_pool.c              |  2 +-
 include/linux/psp-sev.h                    |  8 --------
 include/linux/psp.h                        | 14 ++++++++++++++
 8 files changed, 20 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/psp.h

(limited to 'include')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c25aeb550cd9..ec18a756b7c9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/kernel.h>
 #include <linux/highmem.h>
+#include <linux/psp.h>
 #include <linux/psp-sev.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e2f25926eb51..28945ca7c856 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -24,6 +24,7 @@
 #include <linux/cpufeature.h>
 #include <linux/fs.h>
 #include <linux/fs_struct.h>
+#include <linux/psp.h>
 
 #include <asm/smp.h>
 #include <asm/cacheflush.h>
diff --git a/drivers/crypto/ccp/tee-dev.c b/drivers/crypto/ccp/tee-dev.c
index 5c9d47f3be37..f24fc953718a 100644
--- a/drivers/crypto/ccp/tee-dev.c
+++ b/drivers/crypto/ccp/tee-dev.c
@@ -13,7 +13,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/gfp.h>
-#include <linux/psp-sev.h>
+#include <linux/psp.h>
 #include <linux/psp-tee.h>
 
 #include "psp-dev.h"
diff --git a/drivers/i2c/busses/i2c-designware-amdpsp.c b/drivers/i2c/busses/i2c-designware-amdpsp.c
index 8f36167bce62..80f28a1bbbef 100644
--- a/drivers/i2c/busses/i2c-designware-amdpsp.c
+++ b/drivers/i2c/busses/i2c-designware-amdpsp.c
@@ -4,7 +4,7 @@
 #include <linux/bits.h>
 #include <linux/i2c.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
-#include <linux/psp-sev.h>
+#include <linux/psp.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c
index cec6e70f0ac9..e8cd9aaa3467 100644
--- a/drivers/tee/amdtee/call.c
+++ b/drivers/tee/amdtee/call.c
@@ -8,7 +8,7 @@
 #include <linux/tee_drv.h>
 #include <linux/psp-tee.h>
 #include <linux/slab.h>
-#include <linux/psp-sev.h>
+#include <linux/psp.h>
 #include "amdtee_if.h"
 #include "amdtee_private.h"
 
diff --git a/drivers/tee/amdtee/shm_pool.c b/drivers/tee/amdtee/shm_pool.c
index f87f96a291c9..f0303126f199 100644
--- a/drivers/tee/amdtee/shm_pool.c
+++ b/drivers/tee/amdtee/shm_pool.c
@@ -5,7 +5,7 @@
 
 #include <linux/slab.h>
 #include <linux/tee_drv.h>
-#include <linux/psp-sev.h>
+#include <linux/psp.h>
 #include "amdtee_private.h"
 
 static int pool_op_alloc(struct tee_shm_pool *pool, struct tee_shm *shm,
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 1595088c428b..7fd17e82bab4 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -14,14 +14,6 @@
 
 #include <uapi/linux/psp-sev.h>
 
-#ifdef CONFIG_X86
-#include <linux/mem_encrypt.h>
-
-#define __psp_pa(x)	__sme_pa(x)
-#else
-#define __psp_pa(x)	__pa(x)
-#endif
-
 #define SEV_FW_BLOB_MAX_SIZE	0x4000	/* 16KB */
 
 /**
diff --git a/include/linux/psp.h b/include/linux/psp.h
new file mode 100644
index 000000000000..202162487ec3
--- /dev/null
+++ b/include/linux/psp.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __PSP_H
+#define __PSP_H
+
+#ifdef CONFIG_X86
+#include <linux/mem_encrypt.h>
+
+#define __psp_pa(x)	__sme_pa(x)
+#else
+#define __psp_pa(x)	__pa(x)
+#endif
+
+#endif /* __PSP_H */
-- 
cgit v1.2.3


From 1c5c1daf04d13916867ef68c6ba7ae4f5e73801f Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 10 Mar 2023 15:19:46 -0600
Subject: crypto: ccp - Move some PSP mailbox bit definitions into common
 header

Some of the bits and fields used for mailboxes communicating with the
PSP are common across all mailbox implementations (SEV, TEE, etc).

Move these bits into the common `linux/psp.h` so they don't need to
be re-defined for each implementation.

Acked-by: Rijo Thomas <Rijo-john.Thomas@amd.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccp/psp-dev.h               |  3 ---
 drivers/crypto/ccp/sev-dev.c               | 15 +++++++--------
 drivers/crypto/ccp/sev-dev.h               |  2 +-
 drivers/crypto/ccp/tee-dev.c               | 15 ++++++++-------
 drivers/i2c/busses/i2c-designware-amdpsp.c | 16 +++++-----------
 include/linux/psp.h                        | 12 ++++++++++++
 6 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h
index 06e1f317216d..55f54bb2b3fb 100644
--- a/drivers/crypto/ccp/psp-dev.h
+++ b/drivers/crypto/ccp/psp-dev.h
@@ -17,9 +17,6 @@
 
 #include "sp-dev.h"
 
-#define PSP_CMDRESP_RESP		BIT(31)
-#define PSP_CMDRESP_ERR_MASK		0xffff
-
 #define MAX_PSP_NAME_LEN		16
 
 extern struct psp_device *psp_master;
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 28945ca7c856..6440d35dfa4e 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -7,6 +7,7 @@
  * Author: Brijesh Singh <brijesh.singh@amd.com>
  */
 
+#include <linux/bitfield.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
@@ -103,7 +104,7 @@ static void sev_irq_handler(int irq, void *data, unsigned int status)
 
 	/* Check if it is SEV command completion: */
 	reg = ioread32(sev->io_regs + sev->vdata->cmdresp_reg);
-	if (reg & PSP_CMDRESP_RESP) {
+	if (FIELD_GET(PSP_CMDRESP_RESP, reg)) {
 		sev->int_rcvd = 1;
 		wake_up(&sev->int_queue);
 	}
@@ -347,9 +348,7 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 
 	sev->int_rcvd = 0;
 
-	reg = cmd;
-	reg <<= SEV_CMDRESP_CMD_SHIFT;
-	reg |= SEV_CMDRESP_IOC;
+	reg = FIELD_PREP(SEV_CMDRESP_CMD, cmd) | SEV_CMDRESP_IOC;
 	iowrite32(reg, sev->io_regs + sev->vdata->cmdresp_reg);
 
 	/* wait for command completion */
@@ -367,11 +366,11 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 	psp_timeout = psp_cmd_timeout;
 
 	if (psp_ret)
-		*psp_ret = reg & PSP_CMDRESP_ERR_MASK;
+		*psp_ret = FIELD_GET(PSP_CMDRESP_STS, reg);
 
-	if (reg & PSP_CMDRESP_ERR_MASK) {
-		dev_dbg(sev->dev, "sev command %#x failed (%#010x)\n",
-			cmd, reg & PSP_CMDRESP_ERR_MASK);
+	if (FIELD_GET(PSP_CMDRESP_STS, reg)) {
+		dev_dbg(sev->dev, "sev command %#x failed (%#010lx)\n",
+			cmd, FIELD_GET(PSP_CMDRESP_STS, reg));
 		ret = -EIO;
 	} else {
 		ret = sev_write_init_ex_file_if_required(cmd);
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index 666c21eb81ab..778c95155e74 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -25,8 +25,8 @@
 #include <linux/miscdevice.h>
 #include <linux/capability.h>
 
+#define SEV_CMDRESP_CMD			GENMASK(26, 16)
 #define SEV_CMD_COMPLETE		BIT(1)
-#define SEV_CMDRESP_CMD_SHIFT		16
 #define SEV_CMDRESP_IOC			BIT(0)
 
 struct sev_misc_dev {
diff --git a/drivers/crypto/ccp/tee-dev.c b/drivers/crypto/ccp/tee-dev.c
index f24fc953718a..5560bf8329a1 100644
--- a/drivers/crypto/ccp/tee-dev.c
+++ b/drivers/crypto/ccp/tee-dev.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2019,2021 Advanced Micro Devices, Inc.
  */
 
+#include <linux/bitfield.h>
 #include <linux/types.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
@@ -69,7 +70,7 @@ static int tee_wait_cmd_poll(struct psp_tee_device *tee, unsigned int timeout,
 
 	while (--nloop) {
 		*reg = ioread32(tee->io_regs + tee->vdata->cmdresp_reg);
-		if (*reg & PSP_CMDRESP_RESP)
+		if (FIELD_GET(PSP_CMDRESP_RESP, *reg))
 			return 0;
 
 		usleep_range(10000, 10100);
@@ -149,9 +150,9 @@ static int tee_init_ring(struct psp_tee_device *tee)
 		goto free_buf;
 	}
 
-	if (reg & PSP_CMDRESP_ERR_MASK) {
-		dev_err(tee->dev, "tee: ring init command failed (%#010x)\n",
-			reg & PSP_CMDRESP_ERR_MASK);
+	if (FIELD_GET(PSP_CMDRESP_STS, reg)) {
+		dev_err(tee->dev, "tee: ring init command failed (%#010lx)\n",
+			FIELD_GET(PSP_CMDRESP_STS, reg));
 		tee_free_ring(tee);
 		ret = -EIO;
 	}
@@ -179,9 +180,9 @@ static void tee_destroy_ring(struct psp_tee_device *tee)
 	ret = tee_wait_cmd_poll(tee, TEE_DEFAULT_TIMEOUT, &reg);
 	if (ret) {
 		dev_err(tee->dev, "tee: ring destroy command timed out\n");
-	} else if (reg & PSP_CMDRESP_ERR_MASK) {
-		dev_err(tee->dev, "tee: ring destroy command failed (%#010x)\n",
-			reg & PSP_CMDRESP_ERR_MASK);
+	} else if (FIELD_GET(PSP_CMDRESP_STS, reg)) {
+		dev_err(tee->dev, "tee: ring destroy command failed (%#010lx)\n",
+			FIELD_GET(PSP_CMDRESP_STS, reg));
 	}
 
 free_ring:
diff --git a/drivers/i2c/busses/i2c-designware-amdpsp.c b/drivers/i2c/busses/i2c-designware-amdpsp.c
index 80f28a1bbbef..652e6b64bd5f 100644
--- a/drivers/i2c/busses/i2c-designware-amdpsp.c
+++ b/drivers/i2c/busses/i2c-designware-amdpsp.c
@@ -25,12 +25,6 @@
 #define PSP_I2C_REQ_STS_BUS_BUSY	0x1
 #define PSP_I2C_REQ_STS_INV_PARAM	0x3
 
-#define PSP_MBOX_FIELDS_STS		GENMASK(15, 0)
-#define PSP_MBOX_FIELDS_CMD		GENMASK(23, 16)
-#define PSP_MBOX_FIELDS_RESERVED	GENMASK(29, 24)
-#define PSP_MBOX_FIELDS_RECOVERY	BIT(30)
-#define PSP_MBOX_FIELDS_READY		BIT(31)
-
 struct psp_req_buffer_hdr {
 	u32 total_size;
 	u32 status;
@@ -99,15 +93,15 @@ static int psp_check_mbox_recovery(struct psp_mbox __iomem *mbox)
 
 	tmp = readl(&mbox->cmd_fields);
 
-	return FIELD_GET(PSP_MBOX_FIELDS_RECOVERY, tmp);
+	return FIELD_GET(PSP_CMDRESP_RECOVERY, tmp);
 }
 
 static int psp_wait_cmd(struct psp_mbox __iomem *mbox)
 {
 	u32 tmp, expected;
 
-	/* Expect mbox_cmd to be cleared and ready bit to be set by PSP */
-	expected = FIELD_PREP(PSP_MBOX_FIELDS_READY, 1);
+	/* Expect mbox_cmd to be cleared and the response bit to be set by PSP */
+	expected = FIELD_PREP(PSP_CMDRESP_RESP, 1);
 
 	/*
 	 * Check for readiness of PSP mailbox in a tight loop in order to
@@ -124,7 +118,7 @@ static u32 psp_check_mbox_sts(struct psp_mbox __iomem *mbox)
 
 	cmd_reg = readl(&mbox->cmd_fields);
 
-	return FIELD_GET(PSP_MBOX_FIELDS_STS, cmd_reg);
+	return FIELD_GET(PSP_CMDRESP_STS, cmd_reg);
 }
 
 static int psp_send_cmd(struct psp_i2c_req *req)
@@ -148,7 +142,7 @@ static int psp_send_cmd(struct psp_i2c_req *req)
 	writeq(req_addr, &mbox->i2c_req_addr);
 
 	/* Write command register to trigger processing */
-	cmd_reg = FIELD_PREP(PSP_MBOX_FIELDS_CMD, PSP_I2C_REQ_BUS_CMD);
+	cmd_reg = FIELD_PREP(PSP_CMDRESP_CMD, PSP_I2C_REQ_BUS_CMD);
 	writel(cmd_reg, &mbox->cmd_fields);
 
 	if (psp_wait_cmd(mbox))
diff --git a/include/linux/psp.h b/include/linux/psp.h
index 202162487ec3..d3424790a70e 100644
--- a/include/linux/psp.h
+++ b/include/linux/psp.h
@@ -11,4 +11,16 @@
 #define __psp_pa(x)	__pa(x)
 #endif
 
+/*
+ * Fields and bits used by most PSP mailboxes
+ *
+ * Note: Some mailboxes (such as SEV) have extra bits or different meanings
+ * and should include an appropriate local definition in their source file.
+ */
+#define PSP_CMDRESP_STS		GENMASK(15, 0)
+#define PSP_CMDRESP_CMD		GENMASK(23, 16)
+#define PSP_CMDRESP_RESERVED	GENMASK(29, 24)
+#define PSP_CMDRESP_RECOVERY	BIT(30)
+#define PSP_CMDRESP_RESP	BIT(31)
+
 #endif /* __PSP_H */
-- 
cgit v1.2.3


From 7ccc4f4e2e50e4a29f9ee8f5c9e187f8491bb6e7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 10 Mar 2023 15:19:47 -0600
Subject: crypto: ccp - Add support for an interface for platform features

Some platforms with a PSP support an interface for features that
interact directly with the PSP instead of through a SEV or TEE
environment.

Initialize this interface so that other drivers can consume it.
These drivers may either be subdrivers for the ccp module or
external modules.  For external modules, export a symbol for them
to utilize.

Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccp/Makefile          |   3 +-
 drivers/crypto/ccp/platform-access.c | 166 +++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/platform-access.h |  34 +++++++
 drivers/crypto/ccp/psp-dev.c         |  17 ++++
 drivers/crypto/ccp/psp-dev.h         |   1 +
 drivers/crypto/ccp/sp-dev.h          |   7 ++
 include/linux/psp-platform-access.h  |  49 +++++++++++
 7 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 drivers/crypto/ccp/platform-access.c
 create mode 100644 drivers/crypto/ccp/platform-access.h
 create mode 100644 include/linux/psp-platform-access.h

(limited to 'include')

diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile
index db362fe472ea..f6196495e862 100644
--- a/drivers/crypto/ccp/Makefile
+++ b/drivers/crypto/ccp/Makefile
@@ -10,7 +10,8 @@ ccp-$(CONFIG_CRYPTO_DEV_CCP_DEBUGFS) += ccp-debugfs.o
 ccp-$(CONFIG_PCI) += sp-pci.o
 ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \
                                    sev-dev.o \
-                                   tee-dev.o
+                                   tee-dev.o \
+                                   platform-access.o
 
 obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o
 ccp-crypto-objs := ccp-crypto-main.o \
diff --git a/drivers/crypto/ccp/platform-access.c b/drivers/crypto/ccp/platform-access.c
new file mode 100644
index 000000000000..9cc0c60bbf7b
--- /dev/null
+++ b/drivers/crypto/ccp/platform-access.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Platform Security Processor (PSP) Platform Access interface
+ *
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ *
+ * Author: Mario Limonciello <mario.limonciello@amd.com>
+ *
+ * Some of this code is adapted from drivers/i2c/busses/i2c-designware-amdpsp.c
+ * developed by Jan Dabros <jsd@semihalf.com> and Copyright (C) 2022 Google Inc.
+ *
+ */
+
+#include <linux/bitfield.h>
+#include <linux/errno.h>
+#include <linux/iopoll.h>
+#include <linux/mutex.h>
+
+#include "platform-access.h"
+
+#define PSP_CMD_TIMEOUT_US	(500 * USEC_PER_MSEC)
+
+/* Recovery field should be equal 0 to start sending commands */
+static int check_recovery(u32 __iomem *cmd)
+{
+	return FIELD_GET(PSP_CMDRESP_RECOVERY, ioread32(cmd));
+}
+
+static int wait_cmd(u32 __iomem *cmd)
+{
+	u32 tmp, expected;
+
+	/* Expect mbox_cmd to be cleared and ready bit to be set by PSP */
+	expected = FIELD_PREP(PSP_CMDRESP_RESP, 1);
+
+	/*
+	 * Check for readiness of PSP mailbox in a tight loop in order to
+	 * process further as soon as command was consumed.
+	 */
+	return readl_poll_timeout(cmd, tmp, (tmp & expected), 0,
+				  PSP_CMD_TIMEOUT_US);
+}
+
+int psp_check_platform_access_status(void)
+{
+	struct psp_device *psp = psp_get_master_device();
+
+	if (!psp || !psp->platform_access_data)
+		return -ENODEV;
+
+	return 0;
+}
+EXPORT_SYMBOL(psp_check_platform_access_status);
+
+int psp_send_platform_access_msg(enum psp_platform_access_msg msg,
+				 struct psp_request *req)
+{
+	struct psp_device *psp = psp_get_master_device();
+	u32 __iomem *cmd, *lo, *hi;
+	struct psp_platform_access_device *pa_dev;
+	phys_addr_t req_addr;
+	u32 cmd_reg;
+	int ret;
+
+	if (!psp || !psp->platform_access_data)
+		return -ENODEV;
+
+	pa_dev = psp->platform_access_data;
+	cmd = psp->io_regs + pa_dev->vdata->cmdresp_reg;
+	lo = psp->io_regs + pa_dev->vdata->cmdbuff_addr_lo_reg;
+	hi = psp->io_regs + pa_dev->vdata->cmdbuff_addr_hi_reg;
+
+	mutex_lock(&pa_dev->mailbox_mutex);
+
+	if (check_recovery(cmd)) {
+		dev_dbg(psp->dev, "platform mailbox is in recovery\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (wait_cmd(cmd)) {
+		dev_dbg(psp->dev, "platform mailbox is not done processing command\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	/*
+	 * Fill mailbox with address of command-response buffer, which will be
+	 * used for sending i2c requests as well as reading status returned by
+	 * PSP. Use physical address of buffer, since PSP will map this region.
+	 */
+	req_addr = __psp_pa(req);
+	iowrite32(lower_32_bits(req_addr), lo);
+	iowrite32(upper_32_bits(req_addr), hi);
+
+	print_hex_dump_debug("->psp ", DUMP_PREFIX_OFFSET, 16, 2, req,
+			     req->header.payload_size, false);
+
+	/* Write command register to trigger processing */
+	cmd_reg = FIELD_PREP(PSP_CMDRESP_CMD, msg);
+	iowrite32(cmd_reg, cmd);
+
+	if (wait_cmd(cmd)) {
+		ret = -ETIMEDOUT;
+		goto unlock;
+	}
+
+	/* Ensure it was triggered by this driver */
+	if (ioread32(lo) != lower_32_bits(req_addr) ||
+	    ioread32(hi) != upper_32_bits(req_addr)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	/* Store the status in request header for caller to investigate */
+	cmd_reg = ioread32(cmd);
+	req->header.status = FIELD_GET(PSP_CMDRESP_STS, cmd_reg);
+	if (req->header.status) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	print_hex_dump_debug("<-psp ", DUMP_PREFIX_OFFSET, 16, 2, req,
+			     req->header.payload_size, false);
+
+	ret = 0;
+
+unlock:
+	mutex_unlock(&pa_dev->mailbox_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(psp_send_platform_access_msg);
+
+void platform_access_dev_destroy(struct psp_device *psp)
+{
+	struct psp_platform_access_device *pa_dev = psp->platform_access_data;
+
+	if (!pa_dev)
+		return;
+
+	mutex_destroy(&pa_dev->mailbox_mutex);
+	psp->platform_access_data = NULL;
+}
+
+int platform_access_dev_init(struct psp_device *psp)
+{
+	struct device *dev = psp->dev;
+	struct psp_platform_access_device *pa_dev;
+
+	pa_dev = devm_kzalloc(dev, sizeof(*pa_dev), GFP_KERNEL);
+	if (!pa_dev)
+		return -ENOMEM;
+
+	psp->platform_access_data = pa_dev;
+	pa_dev->psp = psp;
+	pa_dev->dev = dev;
+
+	pa_dev->vdata = (struct platform_access_vdata *)psp->vdata->platform_access;
+
+	mutex_init(&pa_dev->mailbox_mutex);
+
+	dev_dbg(dev, "platform access enabled\n");
+
+	return 0;
+}
diff --git a/drivers/crypto/ccp/platform-access.h b/drivers/crypto/ccp/platform-access.h
new file mode 100644
index 000000000000..c3a97893320d
--- /dev/null
+++ b/drivers/crypto/ccp/platform-access.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AMD Platform Security Processor (PSP) Platform Access interface
+ *
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ *
+ * Author: Mario Limonciello <mario.limonciello@amd.com>
+ */
+
+#ifndef __PSP_PLATFORM_ACCESS_H__
+#define __PSP_PLATFORM_ACCESS_H__
+
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/psp-platform-access.h>
+
+#include "psp-dev.h"
+
+struct psp_platform_access_device {
+	struct device *dev;
+	struct psp_device *psp;
+
+	struct platform_access_vdata *vdata;
+
+	struct mutex mailbox_mutex;
+
+	void *platform_access_data;
+};
+
+void platform_access_dev_destroy(struct psp_device *psp);
+int platform_access_dev_init(struct psp_device *psp);
+
+#endif /* __PSP_PLATFORM_ACCESS_H__ */
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index cd8d1974726a..ec98f19800de 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -14,6 +14,7 @@
 #include "psp-dev.h"
 #include "sev-dev.h"
 #include "tee-dev.h"
+#include "platform-access.h"
 
 struct psp_device *psp_master;
 
@@ -102,6 +103,17 @@ static int psp_check_tee_support(struct psp_device *psp)
 	return 0;
 }
 
+static void psp_init_platform_access(struct psp_device *psp)
+{
+	int ret;
+
+	ret = platform_access_dev_init(psp);
+	if (ret) {
+		dev_warn(psp->dev, "platform access init failed: %d\n", ret);
+		return;
+	}
+}
+
 static int psp_init(struct psp_device *psp)
 {
 	int ret;
@@ -118,6 +130,9 @@ static int psp_init(struct psp_device *psp)
 			return ret;
 	}
 
+	if (psp->vdata->platform_access)
+		psp_init_platform_access(psp);
+
 	return 0;
 }
 
@@ -198,6 +213,8 @@ void psp_dev_destroy(struct sp_device *sp)
 
 	tee_dev_destroy(psp);
 
+	platform_access_dev_destroy(psp);
+
 	sp_free_psp_irq(sp, psp);
 
 	if (sp->clear_psp_master_device)
diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h
index 55f54bb2b3fb..505e4bdeaca8 100644
--- a/drivers/crypto/ccp/psp-dev.h
+++ b/drivers/crypto/ccp/psp-dev.h
@@ -39,6 +39,7 @@ struct psp_device {
 
 	void *sev_data;
 	void *tee_data;
+	void *platform_access_data;
 
 	unsigned int capability;
 };
diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h
index 20377e67f65d..5ec6c219a731 100644
--- a/drivers/crypto/ccp/sp-dev.h
+++ b/drivers/crypto/ccp/sp-dev.h
@@ -53,9 +53,16 @@ struct tee_vdata {
 	const unsigned int ring_rptr_reg;
 };
 
+struct platform_access_vdata {
+	const unsigned int cmdresp_reg;
+	const unsigned int cmdbuff_addr_lo_reg;
+	const unsigned int cmdbuff_addr_hi_reg;
+};
+
 struct psp_vdata {
 	const struct sev_vdata *sev;
 	const struct tee_vdata *tee;
+	const struct platform_access_vdata *platform_access;
 	const unsigned int feature_reg;
 	const unsigned int inten_reg;
 	const unsigned int intsts_reg;
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
new file mode 100644
index 000000000000..977df5cfd494
--- /dev/null
+++ b/include/linux/psp-platform-access.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __PSP_PLATFORM_ACCESS_H
+#define __PSP_PLATFORM_ACCESS_H
+
+#include <linux/psp.h>
+
+enum psp_platform_access_msg {
+	PSP_CMD_NONE = 0x0,
+};
+
+struct psp_req_buffer_hdr {
+	u32 payload_size;
+	u32 status;
+} __packed;
+
+struct psp_request {
+	struct psp_req_buffer_hdr header;
+	void *buf;
+} __packed;
+
+/**
+ * psp_send_platform_access_msg() - Send a message to control platform features
+ *
+ * This function is intended to be used by drivers outside of ccp to communicate
+ * with the platform.
+ *
+ * Returns:
+ *  0:           success
+ *  -%EBUSY:     mailbox in recovery or in use
+ *  -%ENODEV:    driver not bound with PSP device
+ *  -%ETIMEDOUT: request timed out
+ *  -%EIO:       unknown error (see kernel log)
+ */
+int psp_send_platform_access_msg(enum psp_platform_access_msg, struct psp_request *req);
+
+/**
+ * psp_check_platform_access_status() - Checks whether platform features is ready
+ *
+ * This function is intended to be used by drivers outside of ccp to determine
+ * if platform features has initialized.
+ *
+ * Returns:
+ * 0          platform features is ready
+ * -%ENODEV   platform features is not ready or present
+ */
+int psp_check_platform_access_status(void);
+
+#endif /* __PSP_PLATFORM_ACCESS_H */
-- 
cgit v1.2.3


From d5812571f594b03438d0d7acd0dc044f73c1719e Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 10 Mar 2023 15:19:50 -0600
Subject: crypto: ccp - Add support for ringing a platform doorbell

Some platforms support using a doorbell to communicate. Export
this feature for other drivers to utilize as well.

Link: https://lore.kernel.org/linux-i2c/20220916131854.687371-3-jsd@semihalf.com/
Suggested-by: Jan Dabros <jsd@semihalf.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccp/platform-access.c | 66 ++++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/platform-access.h |  1 +
 drivers/crypto/ccp/sp-dev.h          |  3 ++
 drivers/crypto/ccp/sp-pci.c          |  2 ++
 include/linux/psp-platform-access.h  | 15 ++++++++
 include/linux/psp.h                  |  3 ++
 6 files changed, 90 insertions(+)

(limited to 'include')

diff --git a/drivers/crypto/ccp/platform-access.c b/drivers/crypto/ccp/platform-access.c
index 9cc0c60bbf7b..b51fb1196932 100644
--- a/drivers/crypto/ccp/platform-access.c
+++ b/drivers/crypto/ccp/platform-access.c
@@ -20,6 +20,14 @@
 
 #define PSP_CMD_TIMEOUT_US	(500 * USEC_PER_MSEC)
 
+/* Doorbell shouldn't be ringing */
+static int check_doorbell(u32 __iomem *doorbell)
+{
+	u32 tmp;
+
+	return readl_poll_timeout(doorbell, tmp, tmp != 0, 0, PSP_CMD_TIMEOUT_US);
+}
+
 /* Recovery field should be equal 0 to start sending commands */
 static int check_recovery(u32 __iomem *cmd)
 {
@@ -132,6 +140,62 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(psp_send_platform_access_msg);
 
+int psp_ring_platform_doorbell(int msg)
+{
+	struct psp_device *psp = psp_get_master_device();
+	struct psp_platform_access_device *pa_dev;
+	u32 __iomem *button, *cmd;
+	int ret, val;
+
+	if (!psp || !psp->platform_access_data)
+		return -ENODEV;
+
+	pa_dev = psp->platform_access_data;
+	button = psp->io_regs + pa_dev->vdata->doorbell_button_reg;
+	cmd = psp->io_regs + pa_dev->vdata->doorbell_cmd_reg;
+
+	mutex_lock(&pa_dev->doorbell_mutex);
+
+	if (check_doorbell(button)) {
+		dev_dbg(psp->dev, "doorbell is not ready\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (check_recovery(cmd)) {
+		dev_dbg(psp->dev, "doorbell command in recovery\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (wait_cmd(cmd)) {
+		dev_dbg(psp->dev, "doorbell command not done processing\n");
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	iowrite32(FIELD_PREP(PSP_DRBL_MSG, msg), cmd);
+	iowrite32(PSP_DRBL_RING, button);
+
+	if (wait_cmd(cmd)) {
+		ret = -ETIMEDOUT;
+		goto unlock;
+	}
+
+	val = FIELD_GET(PSP_CMDRESP_STS, ioread32(cmd));
+	if (val) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	ret = 0;
+unlock:
+	mutex_unlock(&pa_dev->doorbell_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(psp_ring_platform_doorbell);
+
 void platform_access_dev_destroy(struct psp_device *psp)
 {
 	struct psp_platform_access_device *pa_dev = psp->platform_access_data;
@@ -140,6 +204,7 @@ void platform_access_dev_destroy(struct psp_device *psp)
 		return;
 
 	mutex_destroy(&pa_dev->mailbox_mutex);
+	mutex_destroy(&pa_dev->doorbell_mutex);
 	psp->platform_access_data = NULL;
 }
 
@@ -159,6 +224,7 @@ int platform_access_dev_init(struct psp_device *psp)
 	pa_dev->vdata = (struct platform_access_vdata *)psp->vdata->platform_access;
 
 	mutex_init(&pa_dev->mailbox_mutex);
+	mutex_init(&pa_dev->doorbell_mutex);
 
 	dev_dbg(dev, "platform access enabled\n");
 
diff --git a/drivers/crypto/ccp/platform-access.h b/drivers/crypto/ccp/platform-access.h
index c3a97893320d..a83f03beb869 100644
--- a/drivers/crypto/ccp/platform-access.h
+++ b/drivers/crypto/ccp/platform-access.h
@@ -24,6 +24,7 @@ struct psp_platform_access_device {
 	struct platform_access_vdata *vdata;
 
 	struct mutex mailbox_mutex;
+	struct mutex doorbell_mutex;
 
 	void *platform_access_data;
 };
diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h
index 5ec6c219a731..1253a0217985 100644
--- a/drivers/crypto/ccp/sp-dev.h
+++ b/drivers/crypto/ccp/sp-dev.h
@@ -57,6 +57,9 @@ struct platform_access_vdata {
 	const unsigned int cmdresp_reg;
 	const unsigned int cmdbuff_addr_lo_reg;
 	const unsigned int cmdbuff_addr_hi_reg;
+	const unsigned int doorbell_button_reg;
+	const unsigned int doorbell_cmd_reg;
+
 };
 
 struct psp_vdata {
diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c
index 18aa902eb5ce..b5896f7af7ab 100644
--- a/drivers/crypto/ccp/sp-pci.c
+++ b/drivers/crypto/ccp/sp-pci.c
@@ -365,6 +365,8 @@ static const struct platform_access_vdata pa_v1 = {
 	.cmdresp_reg		= 0x10570,	/* C2PMSG_28 */
 	.cmdbuff_addr_lo_reg	= 0x10574,	/* C2PMSG_29 */
 	.cmdbuff_addr_hi_reg	= 0x10578,	/* C2PMSG_30 */
+	.doorbell_button_reg	= 0x10a24,	/* C2PMSG_73 */
+	.doorbell_cmd_reg	= 0x10a40,	/* C2PMSG_80 */
 };
 
 static const struct psp_vdata pspv1 = {
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
index 977df5cfd494..89df4549fada 100644
--- a/include/linux/psp-platform-access.h
+++ b/include/linux/psp-platform-access.h
@@ -34,6 +34,21 @@ struct psp_request {
  */
 int psp_send_platform_access_msg(enum psp_platform_access_msg, struct psp_request *req);
 
+/**
+ * psp_ring_platform_doorbell() - Ring platform doorbell
+ *
+ * This function is intended to be used by drivers outside of ccp to ring the
+ * platform doorbell with a message.
+ *
+ * Returns:
+ *  0:           success
+ *  -%EBUSY:     mailbox in recovery or in use
+ *  -%ENODEV:    driver not bound with PSP device
+ *  -%ETIMEDOUT: request timed out
+ *  -%EIO:       unknown error (see kernel log)
+ */
+int psp_ring_platform_doorbell(int msg);
+
 /**
  * psp_check_platform_access_status() - Checks whether platform features is ready
  *
diff --git a/include/linux/psp.h b/include/linux/psp.h
index d3424790a70e..92e60aeef21e 100644
--- a/include/linux/psp.h
+++ b/include/linux/psp.h
@@ -23,4 +23,7 @@
 #define PSP_CMDRESP_RECOVERY	BIT(30)
 #define PSP_CMDRESP_RESP	BIT(31)
 
+#define PSP_DRBL_MSG		PSP_CMDRESP_CMD
+#define PSP_DRBL_RING		BIT(0)
+
 #endif /* __PSP_H */
-- 
cgit v1.2.3


From bd7fc6e1957c2102866f9e464c1f2302e891b7e9 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta@linux.microsoft.com>
Date: Wed, 15 Mar 2023 04:55:13 -0700
Subject: net: mana: Add new MANA VF performance counters for easier
 troubleshooting

Extended performance counter stats in 'ethtool -S <interface>' output
for MANA VF to facilitate troubleshooting.

Tested-on: Ubuntu22
Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 62 ++++++++++++++++++++--
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 52 +++++++++++++++++-
 include/net/mana/mana.h                            | 18 +++++++
 3 files changed, 128 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 6120f2b6684f..492474b4d8aa 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -156,6 +156,7 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	struct mana_txq *txq;
 	struct mana_cq *cq;
 	int err, len;
+	u16 ihs;
 
 	if (unlikely(!apc->port_is_up))
 		goto tx_drop;
@@ -166,6 +167,7 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	txq = &apc->tx_qp[txq_idx].txq;
 	gdma_sq = txq->gdma_sq;
 	cq = &apc->tx_qp[txq_idx].tx_cq;
+	tx_stats = &txq->stats;
 
 	pkg.tx_oob.s_oob.vcq_num = cq->gdma_id;
 	pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame;
@@ -179,10 +181,17 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
 	pkg.tx_oob.s_oob.pkt_fmt = pkt_fmt;
 
-	if (pkt_fmt == MANA_SHORT_PKT_FMT)
+	if (pkt_fmt == MANA_SHORT_PKT_FMT) {
 		pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_short_oob);
-	else
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->short_pkt_fmt++;
+		u64_stats_update_end(&tx_stats->syncp);
+	} else {
 		pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_oob);
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->long_pkt_fmt++;
+		u64_stats_update_end(&tx_stats->syncp);
+	}
 
 	pkg.wqe_req.inline_oob_data = &pkg.tx_oob;
 	pkg.wqe_req.flags = 0;
@@ -232,9 +241,35 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 						 &ipv6_hdr(skb)->daddr, 0,
 						 IPPROTO_TCP, 0);
 		}
+
+		if (skb->encapsulation) {
+			ihs = skb_inner_tcp_all_headers(skb);
+			u64_stats_update_begin(&tx_stats->syncp);
+			tx_stats->tso_inner_packets++;
+			tx_stats->tso_inner_bytes += skb->len - ihs;
+			u64_stats_update_end(&tx_stats->syncp);
+		} else {
+			if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+				ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
+			} else {
+				ihs = skb_tcp_all_headers(skb);
+				if (ipv6_has_hopopt_jumbo(skb))
+					ihs -= sizeof(struct hop_jumbo_hdr);
+			}
+
+			u64_stats_update_begin(&tx_stats->syncp);
+			tx_stats->tso_packets++;
+			tx_stats->tso_bytes += skb->len - ihs;
+			u64_stats_update_end(&tx_stats->syncp);
+		}
+
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		csum_type = mana_checksum_info(skb);
 
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->csum_partial++;
+		u64_stats_update_end(&tx_stats->syncp);
+
 		if (csum_type == IPPROTO_TCP) {
 			pkg.tx_oob.s_oob.is_outer_ipv4 = ipv4;
 			pkg.tx_oob.s_oob.is_outer_ipv6 = ipv6;
@@ -254,8 +289,12 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 		}
 	}
 
-	if (mana_map_skb(skb, apc, &pkg))
+	if (mana_map_skb(skb, apc, &pkg)) {
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->mana_map_err++;
+		u64_stats_update_end(&tx_stats->syncp);
 		goto free_sgl_ptr;
+	}
 
 	skb_queue_tail(&txq->pending_skbs, skb);
 
@@ -1038,6 +1077,8 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 	if (comp_read < 1)
 		return;
 
+	apc->eth_stats.tx_cqes = comp_read;
+
 	for (i = 0; i < comp_read; i++) {
 		struct mana_tx_comp_oob *cqe_oob;
 
@@ -1064,6 +1105,7 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 		case CQE_TX_VLAN_TAGGING_VIOLATION:
 			WARN_ONCE(1, "TX: CQE error %d: ignored.\n",
 				  cqe_oob->cqe_hdr.cqe_type);
+			apc->eth_stats.tx_cqe_err++;
 			break;
 
 		default:
@@ -1072,6 +1114,7 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 			 */
 			WARN_ONCE(1, "TX: Unexpected CQE type %d: HW BUG?\n",
 				  cqe_oob->cqe_hdr.cqe_type);
+			apc->eth_stats.tx_cqe_unknown_type++;
 			return;
 		}
 
@@ -1118,6 +1161,8 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 		WARN_ON_ONCE(1);
 
 	cq->work_done = pkt_transmitted;
+
+	apc->eth_stats.tx_cqes -= pkt_transmitted;
 }
 
 static void mana_post_pkt_rxq(struct mana_rxq *rxq)
@@ -1252,12 +1297,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct gdma_context *gc = rxq->gdma_rq->gdma_dev->gdma_context;
 	struct net_device *ndev = rxq->ndev;
 	struct mana_recv_buf_oob *rxbuf_oob;
+	struct mana_port_context *apc;
 	struct device *dev = gc->dev;
 	void *new_buf, *old_buf;
 	struct page *new_page;
 	u32 curr, pktlen;
 	dma_addr_t da;
 
+	apc = netdev_priv(ndev);
+
 	switch (oob->cqe_hdr.cqe_type) {
 	case CQE_RX_OKAY:
 		break;
@@ -1270,6 +1318,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 
 	case CQE_RX_COALESCED_4:
 		netdev_err(ndev, "RX coalescing is unsupported\n");
+		apc->eth_stats.rx_coalesced_err++;
 		return;
 
 	case CQE_RX_OBJECT_FENCE:
@@ -1279,6 +1328,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	default:
 		netdev_err(ndev, "Unknown RX CQE type = %d\n",
 			   oob->cqe_hdr.cqe_type);
+		apc->eth_stats.rx_cqe_unknown_type++;
 		return;
 	}
 
@@ -1341,11 +1391,15 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
 {
 	struct gdma_comp *comp = cq->gdma_comp_buf;
 	struct mana_rxq *rxq = cq->rxq;
+	struct mana_port_context *apc;
 	int comp_read, i;
 
+	apc = netdev_priv(rxq->ndev);
+
 	comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER);
 	WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER);
 
+	apc->eth_stats.rx_cqes = comp_read;
 	rxq->xdp_flush = false;
 
 	for (i = 0; i < comp_read; i++) {
@@ -1357,6 +1411,8 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
 			return;
 
 		mana_process_rx_cqe(rxq, cq, &comp[i]);
+
+		apc->eth_stats.rx_cqes--;
 	}
 
 	if (rxq->xdp_flush)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 5b776a33a817..a64c81410dc1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -13,6 +13,15 @@ static const struct {
 } mana_eth_stats[] = {
 	{"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)},
 	{"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)},
+	{"tx_cqes", offsetof(struct mana_ethtool_stats, tx_cqes)},
+	{"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
+	{"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					tx_cqe_unknown_type)},
+	{"rx_cqes", offsetof(struct mana_ethtool_stats, rx_cqes)},
+	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
+					rx_coalesced_err)},
+	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					rx_cqe_unknown_type)},
 };
 
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
@@ -23,7 +32,8 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	if (stringset != ETH_SS_STATS)
 		return -EINVAL;
 
-	return ARRAY_SIZE(mana_eth_stats) + num_queues * 8;
+	return ARRAY_SIZE(mana_eth_stats) + num_queues *
+				(MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
 }
 
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
@@ -61,6 +71,22 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 		p += ETH_GSTRING_LEN;
 		sprintf(p, "tx_%d_xdp_xmit", i);
 		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_packets", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_bytes", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_inner_packets", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_inner_bytes", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_long_pkt_fmt", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_short_pkt_fmt", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_csum_partial", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_mana_map_err", i);
+		p += ETH_GSTRING_LEN;
 	}
 }
 
@@ -78,6 +104,14 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	u64 xdp_xmit;
 	u64 xdp_drop;
 	u64 xdp_tx;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 long_pkt_fmt;
+	u64 short_pkt_fmt;
+	u64 csum_partial;
+	u64 mana_map_err;
 	int q, i = 0;
 
 	if (!apc->port_is_up)
@@ -113,11 +147,27 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 			packets = tx_stats->packets;
 			bytes = tx_stats->bytes;
 			xdp_xmit = tx_stats->xdp_xmit;
+			tso_packets = tx_stats->tso_packets;
+			tso_bytes = tx_stats->tso_bytes;
+			tso_inner_packets = tx_stats->tso_inner_packets;
+			tso_inner_bytes = tx_stats->tso_inner_bytes;
+			long_pkt_fmt = tx_stats->long_pkt_fmt;
+			short_pkt_fmt = tx_stats->short_pkt_fmt;
+			csum_partial = tx_stats->csum_partial;
+			mana_map_err = tx_stats->mana_map_err;
 		} while (u64_stats_fetch_retry(&tx_stats->syncp, start));
 
 		data[i++] = packets;
 		data[i++] = bytes;
 		data[i++] = xdp_xmit;
+		data[i++] = tso_packets;
+		data[i++] = tso_bytes;
+		data[i++] = tso_inner_packets;
+		data[i++] = tso_inner_bytes;
+		data[i++] = long_pkt_fmt;
+		data[i++] = short_pkt_fmt;
+		data[i++] = csum_partial;
+		data[i++] = mana_map_err;
 	}
 }
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 3bb579962a14..bb11a6535d80 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -48,6 +48,10 @@ enum TRI_STATE {
 
 #define MAX_PORTS_IN_MANA_DEV 256
 
+/* Update this count whenever the respective structures are changed */
+#define MANA_STATS_RX_COUNT 5
+#define MANA_STATS_TX_COUNT 11
+
 struct mana_stats_rx {
 	u64 packets;
 	u64 bytes;
@@ -61,6 +65,14 @@ struct mana_stats_tx {
 	u64 packets;
 	u64 bytes;
 	u64 xdp_xmit;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 short_pkt_fmt;
+	u64 long_pkt_fmt;
+	u64 csum_partial;
+	u64 mana_map_err;
 	struct u64_stats_sync syncp;
 };
 
@@ -331,6 +343,12 @@ struct mana_tx_qp {
 struct mana_ethtool_stats {
 	u64 stop_queue;
 	u64 wake_queue;
+	u64 tx_cqes;
+	u64 tx_cqe_err;
+	u64 tx_cqe_unknown_type;
+	u64 rx_cqes;
+	u64 rx_coalesced_err;
+	u64 rx_cqe_unknown_type;
 };
 
 struct mana_context {
-- 
cgit v1.2.3


From 8c44fa12c8fa09c6c12f0dc25129a6d13ee0a1ea Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:45 +0200
Subject: net: Add MDB net device operations

Add MDB net device operations that will be invoked by rtnetlink code in
response to received RTM_{NEW,DEL,GET}MDB messages. Subsequent patches
will implement these operations in the bridge and VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ee483071cf59..23b0d7eaaadd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1307,6 +1307,17 @@ struct netdev_net_notifier {
  *	Used to add FDB entries to dump requests. Implementers should add
  *	entries to skb and update idx with the number of entries.
  *
+ * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
+ *		      u16 nlmsg_flags, struct netlink_ext_ack *extack);
+ *	Adds an MDB entry to dev.
+ * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
+ *		      struct netlink_ext_ack *extack);
+ *	Deletes the MDB entry from dev.
+ * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
+ *		       struct netlink_callback *cb);
+ *	Dumps MDB entries from dev. The first argument (marker) in the netlink
+ *	callback is used by core rtnetlink code.
+ *
  * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
  *			     u16 flags, struct netlink_ext_ack *extack)
  * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
@@ -1569,6 +1580,16 @@ struct net_device_ops {
 					       const unsigned char *addr,
 					       u16 vid, u32 portid, u32 seq,
 					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_add)(struct net_device *dev,
+					       struct nlattr *tb[],
+					       u16 nlmsg_flags,
+					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_del)(struct net_device *dev,
+					       struct nlattr *tb[],
+					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_dump)(struct net_device *dev,
+						struct sk_buff *skb,
+						struct netlink_callback *cb);
 	int			(*ndo_bridge_setlink)(struct net_device *dev,
 						      struct nlmsghdr *nlh,
 						      u16 flags,
-- 
cgit v1.2.3


From a3a48de5eade770e911d35291217bdd69ce04ef1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:51 +0200
Subject: vxlan: mdb: Add MDB control path support

Implement MDB control path support, enabling the creation, deletion,
replacement and dumping of MDB entries in a similar fashion to the
bridge driver. Unlike the bridge driver, each entry stores a list of
remote VTEPs to which matched packets need to be replicated to and not a
list of bridge ports.

The motivating use case is the installation of MDB entries by a user
space control plane in response to received EVPN routes. As such, only
allow permanent MDB entries to be installed and do not implement
snooping functionality, avoiding a lot of unnecessary complexity.

Since entries can only be modified by user space under RTNL, use RTNL as
the write lock. Use RCU to ensure that MDB entries and remotes are not
freed while being accessed from the data path during transmission.

In terms of uAPI, reuse the existing MDB netlink interface, but add a
few new attributes to request and response messages:

* IP address of the destination VXLAN tunnel endpoint where the
  multicast receivers reside.

* UDP destination port number to use to connect to the remote VXLAN
  tunnel endpoint.

* VXLAN VNI Network Identifier to use to connect to the remote VXLAN
  tunnel endpoint. Required when Ingress Replication (IR) is used and
  the remote VTEP is not a member of originating broadcast domain
  (VLAN/VNI) [1].

* Source VNI Network Identifier the MDB entry belongs to. Used only when
  the VXLAN device is in external mode.

* Interface index of the outgoing interface to reach the remote VXLAN
  tunnel endpoint. This is required when the underlay destination IP is
  multicast (P2MP), as the multicast routing tables are not consulted.

All the new attributes are added under the 'MDBA_SET_ENTRY_ATTRS' nest
which is strictly validated by the bridge driver, thereby automatically
rejecting the new attributes.

[1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-3.2.2

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan/Makefile        |    2 +-
 drivers/net/vxlan/vxlan_core.c    |    8 +
 drivers/net/vxlan/vxlan_mdb.c     | 1341 +++++++++++++++++++++++++++++++++++++
 drivers/net/vxlan/vxlan_private.h |   31 +
 include/net/vxlan.h               |    5 +
 include/uapi/linux/if_bridge.h    |   10 +
 6 files changed, 1396 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/vxlan/vxlan_mdb.c

(limited to 'include')

diff --git a/drivers/net/vxlan/Makefile b/drivers/net/vxlan/Makefile
index d4c255499b72..91b8fec8b6cf 100644
--- a/drivers/net/vxlan/Makefile
+++ b/drivers/net/vxlan/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_VXLAN) += vxlan.o
 
-vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o
+vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o vxlan_mdb.o
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 5de1a20497a6..a8b26d4f76de 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2878,8 +2878,14 @@ static int vxlan_init(struct net_device *dev)
 	if (err)
 		goto err_free_percpu;
 
+	err = vxlan_mdb_init(vxlan);
+	if (err)
+		goto err_gro_cells_destroy;
+
 	return 0;
 
+err_gro_cells_destroy:
+	gro_cells_destroy(&vxlan->gro_cells);
 err_free_percpu:
 	free_percpu(dev->tstats);
 err_vnigroup_uninit:
@@ -2904,6 +2910,8 @@ static void vxlan_uninit(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
+	vxlan_mdb_fini(vxlan);
+
 	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
 		vxlan_vnigroup_uninit(vxlan);
 
diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
new file mode 100644
index 000000000000..129692b3663f
--- /dev/null
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -0,0 +1,1341 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/if_bridge.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/vxlan.h>
+
+#include "vxlan_private.h"
+
+struct vxlan_mdb_entry_key {
+	union vxlan_addr src;
+	union vxlan_addr dst;
+	__be32 vni;
+};
+
+struct vxlan_mdb_entry {
+	struct rhash_head rhnode;
+	struct list_head remotes;
+	struct vxlan_mdb_entry_key key;
+	struct hlist_node mdb_node;
+	struct rcu_head rcu;
+};
+
+#define VXLAN_MDB_REMOTE_F_BLOCKED	BIT(0)
+
+struct vxlan_mdb_remote {
+	struct list_head list;
+	struct vxlan_rdst __rcu *rd;
+	u8 flags;
+	u8 filter_mode;
+	u8 rt_protocol;
+	struct hlist_head src_list;
+	struct rcu_head rcu;
+};
+
+#define VXLAN_SGRP_F_DELETE	BIT(0)
+
+struct vxlan_mdb_src_entry {
+	struct hlist_node node;
+	union vxlan_addr addr;
+	u8 flags;
+};
+
+struct vxlan_mdb_dump_ctx {
+	long reserved;
+	long entry_idx;
+	long remote_idx;
+};
+
+struct vxlan_mdb_config_src_entry {
+	union vxlan_addr addr;
+	struct list_head node;
+};
+
+struct vxlan_mdb_config {
+	struct vxlan_dev *vxlan;
+	struct vxlan_mdb_entry_key group;
+	struct list_head src_list;
+	union vxlan_addr remote_ip;
+	u32 remote_ifindex;
+	__be32 remote_vni;
+	__be16 remote_port;
+	u16 nlflags;
+	u8 flags;
+	u8 filter_mode;
+	u8 rt_protocol;
+};
+
+static const struct rhashtable_params vxlan_mdb_rht_params = {
+	.head_offset = offsetof(struct vxlan_mdb_entry, rhnode),
+	.key_offset = offsetof(struct vxlan_mdb_entry, key),
+	.key_len = sizeof(struct vxlan_mdb_entry_key),
+	.automatic_shrinking = true,
+};
+
+static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack);
+static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack);
+
+static void vxlan_br_mdb_entry_fill(const struct vxlan_dev *vxlan,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    const struct vxlan_mdb_remote *remote,
+				    struct br_mdb_entry *e)
+{
+	const union vxlan_addr *dst = &mdb_entry->key.dst;
+
+	memset(e, 0, sizeof(*e));
+	e->ifindex = vxlan->dev->ifindex;
+	e->state = MDB_PERMANENT;
+
+	if (remote->flags & VXLAN_MDB_REMOTE_F_BLOCKED)
+		e->flags |= MDB_FLAGS_BLOCKED;
+
+	switch (dst->sa.sa_family) {
+	case AF_INET:
+		e->addr.u.ip4 = dst->sin.sin_addr.s_addr;
+		e->addr.proto = htons(ETH_P_IP);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		e->addr.u.ip6 = dst->sin6.sin6_addr;
+		e->addr.proto = htons(ETH_P_IPV6);
+		break;
+#endif
+	}
+}
+
+static int vxlan_mdb_entry_info_fill_srcs(struct sk_buff *skb,
+					  const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	struct nlattr *nest;
+
+	if (hlist_empty(&remote->src_list))
+		return 0;
+
+	nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+	if (!nest)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+		struct nlattr *nest_ent;
+
+		nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+		if (!nest_ent)
+			goto out_cancel_err;
+
+		if (vxlan_nla_put_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+				       &ent->addr) ||
+		    nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, 0))
+			goto out_cancel_err;
+
+		nla_nest_end(skb, nest_ent);
+	}
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_cancel_err:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int vxlan_mdb_entry_info_fill(const struct vxlan_dev *vxlan,
+				     struct sk_buff *skb,
+				     const struct vxlan_mdb_entry *mdb_entry,
+				     const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	struct br_mdb_entry e;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO);
+	if (!nest)
+		return -EMSGSIZE;
+
+	vxlan_br_mdb_entry_fill(vxlan, mdb_entry, remote, &e);
+
+	if (nla_put_nohdr(skb, sizeof(e), &e) ||
+	    nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, 0))
+		goto nest_err;
+
+	if (!vxlan_addr_any(&mdb_entry->key.src) &&
+	    vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_SOURCE, &mdb_entry->key.src))
+		goto nest_err;
+
+	if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, remote->rt_protocol) ||
+	    nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, remote->filter_mode) ||
+	    vxlan_mdb_entry_info_fill_srcs(skb, remote) ||
+	    vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_DST, &rd->remote_ip))
+		goto nest_err;
+
+	if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port &&
+	    nla_put_u16(skb, MDBA_MDB_EATTR_DST_PORT,
+			be16_to_cpu(rd->remote_port)))
+		goto nest_err;
+
+	if (rd->remote_vni != vxlan->default_dst.remote_vni &&
+	    nla_put_u32(skb, MDBA_MDB_EATTR_VNI, be32_to_cpu(rd->remote_vni)))
+		goto nest_err;
+
+	if (rd->remote_ifindex &&
+	    nla_put_u32(skb, MDBA_MDB_EATTR_IFINDEX, rd->remote_ifindex))
+		goto nest_err;
+
+	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) &&
+	    mdb_entry->key.vni && nla_put_u32(skb, MDBA_MDB_EATTR_SRC_VNI,
+					      be32_to_cpu(mdb_entry->key.vni)))
+		goto nest_err;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+nest_err:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int vxlan_mdb_entry_fill(const struct vxlan_dev *vxlan,
+				struct sk_buff *skb,
+				struct vxlan_mdb_dump_ctx *ctx,
+				const struct vxlan_mdb_entry *mdb_entry)
+{
+	int remote_idx = 0, s_remote_idx = ctx->remote_idx;
+	struct vxlan_mdb_remote *remote;
+	struct nlattr *nest;
+	int err = 0;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+	if (!nest)
+		return -EMSGSIZE;
+
+	list_for_each_entry(remote, &mdb_entry->remotes, list) {
+		if (remote_idx < s_remote_idx)
+			goto skip;
+
+		err = vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote);
+		if (err)
+			break;
+skip:
+		remote_idx++;
+	}
+
+	ctx->remote_idx = err ? remote_idx : 0;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+static int vxlan_mdb_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb,
+			  struct vxlan_mdb_dump_ctx *ctx)
+{
+	int entry_idx = 0, s_entry_idx = ctx->entry_idx;
+	struct vxlan_mdb_entry *mdb_entry;
+	struct nlattr *nest;
+	int err = 0;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB);
+	if (!nest)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry(mdb_entry, &vxlan->mdb_list, mdb_node) {
+		if (entry_idx < s_entry_idx)
+			goto skip;
+
+		err = vxlan_mdb_entry_fill(vxlan, skb, ctx, mdb_entry);
+		if (err)
+			break;
+skip:
+		entry_idx++;
+	}
+
+	ctx->entry_idx = err ? entry_idx : 0;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+		   struct netlink_callback *cb)
+{
+	struct vxlan_mdb_dump_ctx *ctx = (void *)cb->ctx;
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct br_port_msg *bpm;
+	struct nlmsghdr *nlh;
+	int err;
+
+	ASSERT_RTNL();
+
+	NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx);
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm),
+			NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	bpm = nlmsg_data(nlh);
+	memset(bpm, 0, sizeof(*bpm));
+	bpm->family = AF_BRIDGE;
+	bpm->ifindex = dev->ifindex;
+
+	err = vxlan_mdb_fill(vxlan, skb, ctx);
+
+	nlmsg_end(skb, nlh);
+
+	cb->seq = vxlan->mdb_seq;
+	nl_dump_check_consistent(cb, nlh);
+
+	return err;
+}
+
+static const struct nla_policy
+vxlan_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = {
+	[MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY,
+						  sizeof(struct in_addr),
+						  sizeof(struct in6_addr)),
+};
+
+static const struct nla_policy
+vxlan_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
+	[MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_entry_pol),
+};
+
+static struct netlink_range_validation vni_range = {
+	.max = VXLAN_N_VID - 1,
+};
+
+static const struct nla_policy vxlan_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+	[MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+					      sizeof(struct in_addr),
+					      sizeof(struct in6_addr)),
+	[MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE,
+						  MCAST_INCLUDE),
+	[MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_pol),
+	[MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+	[MDBE_ATTR_DST] = NLA_POLICY_RANGE(NLA_BINARY,
+					   sizeof(struct in_addr),
+					   sizeof(struct in6_addr)),
+	[MDBE_ATTR_DST_PORT] = { .type = NLA_U16 },
+	[MDBE_ATTR_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range),
+	[MDBE_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+	[MDBE_ATTR_SRC_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range),
+};
+
+static bool vxlan_mdb_is_valid_source(const struct nlattr *attr, __be16 proto,
+				      struct netlink_ext_ack *extack)
+{
+	switch (proto) {
+	case htons(ETH_P_IP):
+		if (nla_len(attr) != sizeof(struct in_addr)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length");
+			return false;
+		}
+		if (ipv4_is_multicast(nla_get_in_addr(attr))) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed");
+			return false;
+		}
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case htons(ETH_P_IPV6): {
+		struct in6_addr src;
+
+		if (nla_len(attr) != sizeof(struct in6_addr)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length");
+			return false;
+		}
+		src = nla_get_in6_addr(attr);
+		if (ipv6_addr_is_multicast(&src)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed");
+			return false;
+		}
+		break;
+	}
+#endif
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address");
+		return false;
+	}
+
+	return true;
+}
+
+static void vxlan_mdb_config_group_set(struct vxlan_mdb_config *cfg,
+				       const struct br_mdb_entry *entry,
+				       const struct nlattr *source_attr)
+{
+	struct vxlan_mdb_entry_key *group = &cfg->group;
+
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
+		group->dst.sa.sa_family = AF_INET;
+		group->dst.sin.sin_addr.s_addr = entry->addr.u.ip4;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		group->dst.sa.sa_family = AF_INET6;
+		group->dst.sin6.sin6_addr = entry->addr.u.ip6;
+		break;
+#endif
+	}
+
+	if (source_attr)
+		vxlan_nla_get_addr(&group->src, source_attr);
+}
+
+static bool vxlan_mdb_is_star_g(const struct vxlan_mdb_entry_key *group)
+{
+	return !vxlan_addr_any(&group->dst) && vxlan_addr_any(&group->src);
+}
+
+static bool vxlan_mdb_is_sg(const struct vxlan_mdb_entry_key *group)
+{
+	return !vxlan_addr_any(&group->dst) && !vxlan_addr_any(&group->src);
+}
+
+static int vxlan_mdb_config_src_entry_init(struct vxlan_mdb_config *cfg,
+					   __be16 proto,
+					   const struct nlattr *src_entry,
+					   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[MDBE_SRCATTR_MAX + 1];
+	struct vxlan_mdb_config_src_entry *src;
+	int err;
+
+	err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry,
+			       vxlan_mdbe_src_list_entry_pol, extack);
+	if (err)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS))
+		return -EINVAL;
+
+	if (!vxlan_mdb_is_valid_source(tb[MDBE_SRCATTR_ADDRESS], proto,
+				       extack))
+		return -EINVAL;
+
+	src = kzalloc(sizeof(*src), GFP_KERNEL);
+	if (!src)
+		return -ENOMEM;
+
+	err = vxlan_nla_get_addr(&src->addr, tb[MDBE_SRCATTR_ADDRESS]);
+	if (err)
+		goto err_free_src;
+
+	list_add_tail(&src->node, &cfg->src_list);
+
+	return 0;
+
+err_free_src:
+	kfree(src);
+	return err;
+}
+
+static void
+vxlan_mdb_config_src_entry_fini(struct vxlan_mdb_config_src_entry *src)
+{
+	list_del(&src->node);
+	kfree(src);
+}
+
+static int vxlan_mdb_config_src_list_init(struct vxlan_mdb_config *cfg,
+					  __be16 proto,
+					  const struct nlattr *src_list,
+					  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config_src_entry *src, *tmp;
+	struct nlattr *src_entry;
+	int rem, err;
+
+	nla_for_each_nested(src_entry, src_list, rem) {
+		err = vxlan_mdb_config_src_entry_init(cfg, proto, src_entry,
+						      extack);
+		if (err)
+			goto err_src_entry_init;
+	}
+
+	return 0;
+
+err_src_entry_init:
+	list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node)
+		vxlan_mdb_config_src_entry_fini(src);
+	return err;
+}
+
+static void vxlan_mdb_config_src_list_fini(struct vxlan_mdb_config *cfg)
+{
+	struct vxlan_mdb_config_src_entry *src, *tmp;
+
+	list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node)
+		vxlan_mdb_config_src_entry_fini(src);
+}
+
+static int vxlan_mdb_config_attrs_init(struct vxlan_mdb_config *cfg,
+				       const struct br_mdb_entry *entry,
+				       const struct nlattr *set_attrs,
+				       struct netlink_ext_ack *extack)
+{
+	struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, set_attrs,
+			       vxlan_mdbe_attrs_pol, extack);
+	if (err)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, set_attrs, mdbe_attrs, MDBE_ATTR_DST)) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing remote destination IP address");
+		return -EINVAL;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SOURCE] &&
+	    !vxlan_mdb_is_valid_source(mdbe_attrs[MDBE_ATTR_SOURCE],
+				       entry->addr.proto, extack))
+		return -EINVAL;
+
+	vxlan_mdb_config_group_set(cfg, entry, mdbe_attrs[MDBE_ATTR_SOURCE]);
+
+	/* rtnetlink code only validates that IPv4 group address is
+	 * multicast.
+	 */
+	if (!vxlan_addr_is_multicast(&cfg->group.dst) &&
+	    !vxlan_addr_any(&cfg->group.dst)) {
+		NL_SET_ERR_MSG_MOD(extack, "Group address is not multicast");
+		return -EINVAL;
+	}
+
+	if (vxlan_addr_any(&cfg->group.dst) &&
+	    mdbe_attrs[MDBE_ATTR_SOURCE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Source cannot be specified for the all-zeros entry");
+		return -EINVAL;
+	}
+
+	if (vxlan_mdb_is_sg(&cfg->group))
+		cfg->filter_mode = MCAST_INCLUDE;
+
+	if (mdbe_attrs[MDBE_ATTR_GROUP_MODE]) {
+		if (!vxlan_mdb_is_star_g(&cfg->group)) {
+			NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries");
+			return -EINVAL;
+		}
+		cfg->filter_mode = nla_get_u8(mdbe_attrs[MDBE_ATTR_GROUP_MODE]);
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SRC_LIST]) {
+		if (!vxlan_mdb_is_star_g(&cfg->group)) {
+			NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries");
+			return -EINVAL;
+		}
+		if (!mdbe_attrs[MDBE_ATTR_GROUP_MODE]) {
+			NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode");
+			return -EINVAL;
+		}
+		err = vxlan_mdb_config_src_list_init(cfg, entry->addr.proto,
+						     mdbe_attrs[MDBE_ATTR_SRC_LIST],
+						     extack);
+		if (err)
+			return err;
+	}
+
+	if (vxlan_mdb_is_star_g(&cfg->group) && list_empty(&cfg->src_list) &&
+	    cfg->filter_mode == MCAST_INCLUDE) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list");
+		return -EINVAL;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_RTPROT])
+		cfg->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]);
+
+	err = vxlan_nla_get_addr(&cfg->remote_ip, mdbe_attrs[MDBE_ATTR_DST]);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid remote destination address");
+		goto err_src_list_fini;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_DST_PORT])
+		cfg->remote_port =
+			cpu_to_be16(nla_get_u16(mdbe_attrs[MDBE_ATTR_DST_PORT]));
+
+	if (mdbe_attrs[MDBE_ATTR_VNI])
+		cfg->remote_vni =
+			cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_VNI]));
+
+	if (mdbe_attrs[MDBE_ATTR_IFINDEX]) {
+		cfg->remote_ifindex =
+			nla_get_s32(mdbe_attrs[MDBE_ATTR_IFINDEX]);
+		if (!__dev_get_by_index(cfg->vxlan->net, cfg->remote_ifindex)) {
+			NL_SET_ERR_MSG_MOD(extack, "Outgoing interface not found");
+			err = -EINVAL;
+			goto err_src_list_fini;
+		}
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SRC_VNI])
+		cfg->group.vni =
+			cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_SRC_VNI]));
+
+	return 0;
+
+err_src_list_fini:
+	vxlan_mdb_config_src_list_fini(cfg);
+	return err;
+}
+
+static int vxlan_mdb_config_init(struct vxlan_mdb_config *cfg,
+				 struct net_device *dev, struct nlattr *tb[],
+				 u16 nlmsg_flags,
+				 struct netlink_ext_ack *extack)
+{
+	struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]);
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->vxlan = vxlan;
+	cfg->group.vni = vxlan->default_dst.remote_vni;
+	INIT_LIST_HEAD(&cfg->src_list);
+	cfg->nlflags = nlmsg_flags;
+	cfg->filter_mode = MCAST_EXCLUDE;
+	cfg->rt_protocol = RTPROT_STATIC;
+	cfg->remote_vni = vxlan->default_dst.remote_vni;
+	cfg->remote_port = vxlan->cfg.dst_port;
+
+	if (entry->ifindex != dev->ifindex) {
+		NL_SET_ERR_MSG_MOD(extack, "Port net device must be the VXLAN net device");
+		return -EINVAL;
+	}
+
+	/* State is not part of the entry key and can be ignored on deletion
+	 * requests.
+	 */
+	if ((nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) &&
+	    entry->state != MDB_PERMANENT) {
+		NL_SET_ERR_MSG_MOD(extack, "MDB entry must be permanent");
+		return -EINVAL;
+	}
+
+	if (entry->flags) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid MDB entry flags");
+		return -EINVAL;
+	}
+
+	if (entry->vid) {
+		NL_SET_ERR_MSG_MOD(extack, "VID must not be specified");
+		return -EINVAL;
+	}
+
+	if (entry->addr.proto != htons(ETH_P_IP) &&
+	    entry->addr.proto != htons(ETH_P_IPV6)) {
+		NL_SET_ERR_MSG_MOD(extack, "Group address must be an IPv4 / IPv6 address");
+		return -EINVAL;
+	}
+
+	if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY_ATTRS)) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY_ATTRS attribute");
+		return -EINVAL;
+	}
+
+	return vxlan_mdb_config_attrs_init(cfg, entry, tb[MDBA_SET_ENTRY_ATTRS],
+					   extack);
+}
+
+static void vxlan_mdb_config_fini(struct vxlan_mdb_config *cfg)
+{
+	vxlan_mdb_config_src_list_fini(cfg);
+}
+
+static struct vxlan_mdb_entry *
+vxlan_mdb_entry_lookup(struct vxlan_dev *vxlan,
+		       const struct vxlan_mdb_entry_key *group)
+{
+	return rhashtable_lookup_fast(&vxlan->mdb_tbl, group,
+				      vxlan_mdb_rht_params);
+}
+
+static struct vxlan_mdb_remote *
+vxlan_mdb_remote_lookup(const struct vxlan_mdb_entry *mdb_entry,
+			const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_remote *remote;
+
+	list_for_each_entry(remote, &mdb_entry->remotes, list) {
+		struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+
+		if (vxlan_addr_equal(addr, &rd->remote_ip))
+			return remote;
+	}
+
+	return NULL;
+}
+
+static void vxlan_mdb_rdst_free(struct rcu_head *head)
+{
+	struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);
+
+	dst_cache_destroy(&rd->dst_cache);
+	kfree(rd);
+}
+
+static int vxlan_mdb_remote_rdst_init(const struct vxlan_mdb_config *cfg,
+				      struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_rdst *rd;
+	int err;
+
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	err = dst_cache_init(&rd->dst_cache, GFP_KERNEL);
+	if (err)
+		goto err_free_rdst;
+
+	rd->remote_ip = cfg->remote_ip;
+	rd->remote_port = cfg->remote_port;
+	rd->remote_vni = cfg->remote_vni;
+	rd->remote_ifindex = cfg->remote_ifindex;
+	rcu_assign_pointer(remote->rd, rd);
+
+	return 0;
+
+err_free_rdst:
+	kfree(rd);
+	return err;
+}
+
+static void vxlan_mdb_remote_rdst_fini(struct vxlan_rdst *rd)
+{
+	call_rcu(&rd->rcu, vxlan_mdb_rdst_free);
+}
+
+static int vxlan_mdb_remote_init(const struct vxlan_mdb_config *cfg,
+				 struct vxlan_mdb_remote *remote)
+{
+	int err;
+
+	err = vxlan_mdb_remote_rdst_init(cfg, remote);
+	if (err)
+		return err;
+
+	remote->flags = cfg->flags;
+	remote->filter_mode = cfg->filter_mode;
+	remote->rt_protocol = cfg->rt_protocol;
+	INIT_HLIST_HEAD(&remote->src_list);
+
+	return 0;
+}
+
+static void vxlan_mdb_remote_fini(struct vxlan_dev *vxlan,
+				  struct vxlan_mdb_remote *remote)
+{
+	WARN_ON_ONCE(!hlist_empty(&remote->src_list));
+	vxlan_mdb_remote_rdst_fini(rtnl_dereference(remote->rd));
+}
+
+static struct vxlan_mdb_src_entry *
+vxlan_mdb_remote_src_entry_lookup(const struct vxlan_mdb_remote *remote,
+				  const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_src_entry *ent;
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+		if (vxlan_addr_equal(&ent->addr, addr))
+			return ent;
+	}
+
+	return NULL;
+}
+
+static struct vxlan_mdb_src_entry *
+vxlan_mdb_remote_src_entry_add(struct vxlan_mdb_remote *remote,
+			       const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_src_entry *ent;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return NULL;
+
+	ent->addr = *addr;
+	hlist_add_head(&ent->node, &remote->src_list);
+
+	return ent;
+}
+
+static void
+vxlan_mdb_remote_src_entry_del(struct vxlan_mdb_src_entry *ent)
+{
+	hlist_del(&ent->node);
+	kfree(ent);
+}
+
+static int
+vxlan_mdb_remote_src_fwd_add(const struct vxlan_mdb_config *cfg,
+			     const union vxlan_addr *addr,
+			     struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config sg_cfg;
+
+	memset(&sg_cfg, 0, sizeof(sg_cfg));
+	sg_cfg.vxlan = cfg->vxlan;
+	sg_cfg.group.src = *addr;
+	sg_cfg.group.dst = cfg->group.dst;
+	sg_cfg.group.vni = cfg->group.vni;
+	INIT_LIST_HEAD(&sg_cfg.src_list);
+	sg_cfg.remote_ip = cfg->remote_ip;
+	sg_cfg.remote_ifindex = cfg->remote_ifindex;
+	sg_cfg.remote_vni = cfg->remote_vni;
+	sg_cfg.remote_port = cfg->remote_port;
+	sg_cfg.nlflags = cfg->nlflags;
+	sg_cfg.filter_mode = MCAST_INCLUDE;
+	if (cfg->filter_mode == MCAST_EXCLUDE)
+		sg_cfg.flags = VXLAN_MDB_REMOTE_F_BLOCKED;
+	sg_cfg.rt_protocol = cfg->rt_protocol;
+
+	return __vxlan_mdb_add(&sg_cfg, extack);
+}
+
+static void
+vxlan_mdb_remote_src_fwd_del(struct vxlan_dev *vxlan,
+			     const struct vxlan_mdb_entry_key *group,
+			     const struct vxlan_mdb_remote *remote,
+			     const union vxlan_addr *addr)
+{
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	struct vxlan_mdb_config sg_cfg;
+
+	memset(&sg_cfg, 0, sizeof(sg_cfg));
+	sg_cfg.vxlan = vxlan;
+	sg_cfg.group.src = *addr;
+	sg_cfg.group.dst = group->dst;
+	sg_cfg.group.vni = group->vni;
+	INIT_LIST_HEAD(&sg_cfg.src_list);
+	sg_cfg.remote_ip = rd->remote_ip;
+
+	__vxlan_mdb_del(&sg_cfg, NULL);
+}
+
+static int
+vxlan_mdb_remote_src_add(const struct vxlan_mdb_config *cfg,
+			 struct vxlan_mdb_remote *remote,
+			 const struct vxlan_mdb_config_src_entry *src,
+			 struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_src_entry *ent;
+	int err;
+
+	ent = vxlan_mdb_remote_src_entry_lookup(remote, &src->addr);
+	if (!ent) {
+		ent = vxlan_mdb_remote_src_entry_add(remote, &src->addr);
+		if (!ent)
+			return -ENOMEM;
+	} else if (!(cfg->nlflags & NLM_F_REPLACE)) {
+		NL_SET_ERR_MSG_MOD(extack, "Source entry already exists");
+		return -EEXIST;
+	}
+
+	err = vxlan_mdb_remote_src_fwd_add(cfg, &ent->addr, extack);
+	if (err)
+		goto err_src_del;
+
+	/* Clear flags in case source entry was marked for deletion as part of
+	 * replace flow.
+	 */
+	ent->flags = 0;
+
+	return 0;
+
+err_src_del:
+	vxlan_mdb_remote_src_entry_del(ent);
+	return err;
+}
+
+static void vxlan_mdb_remote_src_del(struct vxlan_dev *vxlan,
+				     const struct vxlan_mdb_entry_key *group,
+				     const struct vxlan_mdb_remote *remote,
+				     struct vxlan_mdb_src_entry *ent)
+{
+	vxlan_mdb_remote_src_fwd_del(vxlan, group, remote, &ent->addr);
+	vxlan_mdb_remote_src_entry_del(ent);
+}
+
+static int vxlan_mdb_remote_srcs_add(const struct vxlan_mdb_config *cfg,
+				     struct vxlan_mdb_remote *remote,
+				     struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config_src_entry *src;
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+	int err;
+
+	list_for_each_entry(src, &cfg->src_list, node) {
+		err = vxlan_mdb_remote_src_add(cfg, remote, src, extack);
+		if (err)
+			goto err_src_del;
+	}
+
+	return 0;
+
+err_src_del:
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node)
+		vxlan_mdb_remote_src_del(cfg->vxlan, &cfg->group, remote, ent);
+	return err;
+}
+
+static void vxlan_mdb_remote_srcs_del(struct vxlan_dev *vxlan,
+				      const struct vxlan_mdb_entry_key *group,
+				      struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node)
+		vxlan_mdb_remote_src_del(vxlan, group, remote, ent);
+}
+
+static size_t
+vxlan_mdb_nlmsg_src_list_size(const struct vxlan_mdb_entry_key *group,
+			      const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	size_t nlmsg_size;
+
+	if (hlist_empty(&remote->src_list))
+		return 0;
+
+	/* MDBA_MDB_EATTR_SRC_LIST */
+	nlmsg_size = nla_total_size(0);
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+			      /* MDBA_MDB_SRCLIST_ENTRY */
+		nlmsg_size += nla_total_size(0) +
+			      /* MDBA_MDB_SRCATTR_ADDRESS */
+			      nla_total_size(vxlan_addr_size(&group->dst)) +
+			      /* MDBA_MDB_SRCATTR_TIMER */
+			      nla_total_size(sizeof(u8));
+	}
+
+	return nlmsg_size;
+}
+
+static size_t vxlan_mdb_nlmsg_size(const struct vxlan_dev *vxlan,
+				   const struct vxlan_mdb_entry *mdb_entry,
+				   const struct vxlan_mdb_remote *remote)
+{
+	const struct vxlan_mdb_entry_key *group = &mdb_entry->key;
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	size_t nlmsg_size;
+
+	nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+		     /* MDBA_MDB */
+		     nla_total_size(0) +
+		     /* MDBA_MDB_ENTRY */
+		     nla_total_size(0) +
+		     /* MDBA_MDB_ENTRY_INFO */
+		     nla_total_size(sizeof(struct br_mdb_entry)) +
+		     /* MDBA_MDB_EATTR_TIMER */
+		     nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_SOURCE */
+	if (vxlan_mdb_is_sg(group))
+		nlmsg_size += nla_total_size(vxlan_addr_size(&group->dst));
+	/* MDBA_MDB_EATTR_RTPROT */
+	nlmsg_size += nla_total_size(sizeof(u8));
+	/* MDBA_MDB_EATTR_SRC_LIST */
+	nlmsg_size += vxlan_mdb_nlmsg_src_list_size(group, remote);
+	/* MDBA_MDB_EATTR_GROUP_MODE */
+	nlmsg_size += nla_total_size(sizeof(u8));
+	/* MDBA_MDB_EATTR_DST */
+	nlmsg_size += nla_total_size(vxlan_addr_size(&rd->remote_ip));
+	/* MDBA_MDB_EATTR_DST_PORT */
+	if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port)
+		nlmsg_size += nla_total_size(sizeof(u16));
+	/* MDBA_MDB_EATTR_VNI */
+	if (rd->remote_vni != vxlan->default_dst.remote_vni)
+		nlmsg_size += nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_IFINDEX */
+	if (rd->remote_ifindex)
+		nlmsg_size += nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_SRC_VNI */
+	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && group->vni)
+		nlmsg_size += nla_total_size(sizeof(u32));
+
+	return nlmsg_size;
+}
+
+static int vxlan_mdb_nlmsg_fill(const struct vxlan_dev *vxlan,
+				struct sk_buff *skb,
+				const struct vxlan_mdb_entry *mdb_entry,
+				const struct vxlan_mdb_remote *remote,
+				int type)
+{
+	struct nlattr *mdb_nest, *mdb_entry_nest;
+	struct br_port_msg *bpm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	bpm = nlmsg_data(nlh);
+	memset(bpm, 0, sizeof(*bpm));
+	bpm->family  = AF_BRIDGE;
+	bpm->ifindex = vxlan->dev->ifindex;
+
+	mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB);
+	if (!mdb_nest)
+		goto cancel;
+	mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+	if (!mdb_entry_nest)
+		goto cancel;
+
+	if (vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote))
+		goto cancel;
+
+	nla_nest_end(skb, mdb_entry_nest);
+	nla_nest_end(skb, mdb_nest);
+	nlmsg_end(skb, nlh);
+
+	return 0;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static void vxlan_mdb_remote_notify(const struct vxlan_dev *vxlan,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    const struct vxlan_mdb_remote *remote,
+				    int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_mdb_nlmsg_size(vxlan, mdb_entry, remote),
+			GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = vxlan_mdb_nlmsg_fill(vxlan, skb, mdb_entry, remote, type);
+	if (err) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_KERNEL);
+	return;
+errout:
+	rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+static int
+vxlan_mdb_remote_srcs_replace(const struct vxlan_mdb_config *cfg,
+			      const struct vxlan_mdb_entry *mdb_entry,
+			      struct vxlan_mdb_remote *remote,
+			      struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+	int err;
+
+	hlist_for_each_entry(ent, &remote->src_list, node)
+		ent->flags |= VXLAN_SGRP_F_DELETE;
+
+	err = vxlan_mdb_remote_srcs_add(cfg, remote, extack);
+	if (err)
+		goto err_clear_delete;
+
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node) {
+		if (ent->flags & VXLAN_SGRP_F_DELETE)
+			vxlan_mdb_remote_src_del(vxlan, &mdb_entry->key, remote,
+						 ent);
+	}
+
+	return 0;
+
+err_clear_delete:
+	hlist_for_each_entry(ent, &remote->src_list, node)
+		ent->flags &= ~VXLAN_SGRP_F_DELETE;
+	return err;
+}
+
+static int vxlan_mdb_remote_replace(const struct vxlan_mdb_config *cfg,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    struct vxlan_mdb_remote *remote,
+				    struct netlink_ext_ack *extack)
+{
+	struct vxlan_rdst *new_rd, *old_rd = rtnl_dereference(remote->rd);
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	int err;
+
+	err = vxlan_mdb_remote_rdst_init(cfg, remote);
+	if (err)
+		return err;
+	new_rd = rtnl_dereference(remote->rd);
+
+	err = vxlan_mdb_remote_srcs_replace(cfg, mdb_entry, remote, extack);
+	if (err)
+		goto err_rdst_reset;
+
+	WRITE_ONCE(remote->flags, cfg->flags);
+	WRITE_ONCE(remote->filter_mode, cfg->filter_mode);
+	remote->rt_protocol = cfg->rt_protocol;
+	vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_NEWMDB);
+
+	vxlan_mdb_remote_rdst_fini(old_rd);
+
+	return 0;
+
+err_rdst_reset:
+	rcu_assign_pointer(remote->rd, old_rd);
+	vxlan_mdb_remote_rdst_fini(new_rd);
+	return err;
+}
+
+static int vxlan_mdb_remote_add(const struct vxlan_mdb_config *cfg,
+				struct vxlan_mdb_entry *mdb_entry,
+				struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_remote *remote;
+	int err;
+
+	remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip);
+	if (remote) {
+		if (!(cfg->nlflags & NLM_F_REPLACE)) {
+			NL_SET_ERR_MSG_MOD(extack, "Replace not specified and MDB remote entry already exists");
+			return -EEXIST;
+		}
+		return vxlan_mdb_remote_replace(cfg, mdb_entry, remote, extack);
+	}
+
+	if (!(cfg->nlflags & NLM_F_CREATE)) {
+		NL_SET_ERR_MSG_MOD(extack, "Create not specified and entry does not exist");
+		return -ENOENT;
+	}
+
+	remote = kzalloc(sizeof(*remote), GFP_KERNEL);
+	if (!remote)
+		return -ENOMEM;
+
+	err = vxlan_mdb_remote_init(cfg, remote);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to initialize remote MDB entry");
+		goto err_free_remote;
+	}
+
+	err = vxlan_mdb_remote_srcs_add(cfg, remote, extack);
+	if (err)
+		goto err_remote_fini;
+
+	list_add_rcu(&remote->list, &mdb_entry->remotes);
+	vxlan_mdb_remote_notify(cfg->vxlan, mdb_entry, remote, RTM_NEWMDB);
+
+	return 0;
+
+err_remote_fini:
+	vxlan_mdb_remote_fini(cfg->vxlan, remote);
+err_free_remote:
+	kfree(remote);
+	return err;
+}
+
+static void vxlan_mdb_remote_del(struct vxlan_dev *vxlan,
+				 struct vxlan_mdb_entry *mdb_entry,
+				 struct vxlan_mdb_remote *remote)
+{
+	vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_DELMDB);
+	list_del_rcu(&remote->list);
+	vxlan_mdb_remote_srcs_del(vxlan, &mdb_entry->key, remote);
+	vxlan_mdb_remote_fini(vxlan, remote);
+	kfree_rcu(remote, rcu);
+}
+
+static struct vxlan_mdb_entry *
+vxlan_mdb_entry_get(struct vxlan_dev *vxlan,
+		    const struct vxlan_mdb_entry_key *group)
+{
+	struct vxlan_mdb_entry *mdb_entry;
+	int err;
+
+	mdb_entry = vxlan_mdb_entry_lookup(vxlan, group);
+	if (mdb_entry)
+		return mdb_entry;
+
+	mdb_entry = kzalloc(sizeof(*mdb_entry), GFP_KERNEL);
+	if (!mdb_entry)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&mdb_entry->remotes);
+	memcpy(&mdb_entry->key, group, sizeof(mdb_entry->key));
+	hlist_add_head(&mdb_entry->mdb_node, &vxlan->mdb_list);
+
+	err = rhashtable_lookup_insert_fast(&vxlan->mdb_tbl,
+					    &mdb_entry->rhnode,
+					    vxlan_mdb_rht_params);
+	if (err)
+		goto err_free_entry;
+
+	return mdb_entry;
+
+err_free_entry:
+	hlist_del(&mdb_entry->mdb_node);
+	kfree(mdb_entry);
+	return ERR_PTR(err);
+}
+
+static void vxlan_mdb_entry_put(struct vxlan_dev *vxlan,
+				struct vxlan_mdb_entry *mdb_entry)
+{
+	if (!list_empty(&mdb_entry->remotes))
+		return;
+
+	rhashtable_remove_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode,
+			       vxlan_mdb_rht_params);
+	hlist_del(&mdb_entry->mdb_node);
+	kfree_rcu(mdb_entry, rcu);
+}
+
+static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_entry *mdb_entry;
+	int err;
+
+	mdb_entry = vxlan_mdb_entry_get(vxlan, &cfg->group);
+	if (IS_ERR(mdb_entry))
+		return PTR_ERR(mdb_entry);
+
+	err = vxlan_mdb_remote_add(cfg, mdb_entry, extack);
+	if (err)
+		goto err_entry_put;
+
+	vxlan->mdb_seq++;
+
+	return 0;
+
+err_entry_put:
+	vxlan_mdb_entry_put(vxlan, mdb_entry);
+	return err;
+}
+
+static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_entry *mdb_entry;
+	struct vxlan_mdb_remote *remote;
+
+	mdb_entry = vxlan_mdb_entry_lookup(vxlan, &cfg->group);
+	if (!mdb_entry) {
+		NL_SET_ERR_MSG_MOD(extack, "Did not find MDB entry");
+		return -ENOENT;
+	}
+
+	remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip);
+	if (!remote) {
+		NL_SET_ERR_MSG_MOD(extack, "Did not find MDB remote entry");
+		return -ENOENT;
+	}
+
+	vxlan_mdb_remote_del(vxlan, mdb_entry, remote);
+	vxlan_mdb_entry_put(vxlan, mdb_entry);
+
+	vxlan->mdb_seq++;
+
+	return 0;
+}
+
+int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+		  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config cfg;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = vxlan_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack);
+	if (err)
+		return err;
+
+	err = __vxlan_mdb_add(&cfg, extack);
+
+	vxlan_mdb_config_fini(&cfg);
+	return err;
+}
+
+int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[],
+		  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config cfg;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = vxlan_mdb_config_init(&cfg, dev, tb, 0, extack);
+	if (err)
+		return err;
+
+	err = __vxlan_mdb_del(&cfg, extack);
+
+	vxlan_mdb_config_fini(&cfg);
+	return err;
+}
+
+static void vxlan_mdb_check_empty(void *ptr, void *arg)
+{
+	WARN_ON_ONCE(1);
+}
+
+static void vxlan_mdb_remotes_flush(struct vxlan_dev *vxlan,
+				    struct vxlan_mdb_entry *mdb_entry)
+{
+	struct vxlan_mdb_remote *remote, *tmp;
+
+	list_for_each_entry_safe(remote, tmp, &mdb_entry->remotes, list)
+		vxlan_mdb_remote_del(vxlan, mdb_entry, remote);
+}
+
+static void vxlan_mdb_entries_flush(struct vxlan_dev *vxlan)
+{
+	struct vxlan_mdb_entry *mdb_entry;
+	struct hlist_node *tmp;
+
+	/* The removal of an entry cannot trigger the removal of another entry
+	 * since entries are always added to the head of the list.
+	 */
+	hlist_for_each_entry_safe(mdb_entry, tmp, &vxlan->mdb_list, mdb_node) {
+		vxlan_mdb_remotes_flush(vxlan, mdb_entry);
+		vxlan_mdb_entry_put(vxlan, mdb_entry);
+	}
+}
+
+int vxlan_mdb_init(struct vxlan_dev *vxlan)
+{
+	int err;
+
+	err = rhashtable_init(&vxlan->mdb_tbl, &vxlan_mdb_rht_params);
+	if (err)
+		return err;
+
+	INIT_HLIST_HEAD(&vxlan->mdb_list);
+
+	return 0;
+}
+
+void vxlan_mdb_fini(struct vxlan_dev *vxlan)
+{
+	vxlan_mdb_entries_flush(vxlan);
+	rhashtable_free_and_destroy(&vxlan->mdb_tbl, vxlan_mdb_check_empty,
+				    NULL);
+}
diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h
index f4977925cb8a..7bcc38faae27 100644
--- a/drivers/net/vxlan/vxlan_private.h
+++ b/drivers/net/vxlan/vxlan_private.h
@@ -110,6 +110,14 @@ static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
 }
 
+static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip)
+{
+	if (ip->sa.sa_family == AF_INET6)
+		return ipv6_addr_is_multicast(&ip->sin6.sin6_addr);
+	else
+		return ipv4_is_multicast(ip->sin.sin_addr.s_addr);
+}
+
 #else /* !CONFIG_IPV6 */
 
 static inline
@@ -138,8 +146,21 @@ static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
 }
 
+static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip)
+{
+	return ipv4_is_multicast(ip->sin.sin_addr.s_addr);
+}
+
 #endif
 
+static inline size_t vxlan_addr_size(const union vxlan_addr *ip)
+{
+	if (ip->sa.sa_family == AF_INET6)
+		return sizeof(struct in6_addr);
+	else
+		return sizeof(__be32);
+}
+
 static inline struct vxlan_vni_node *
 vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni)
 {
@@ -206,4 +227,14 @@ int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
 		    int rifindex);
 int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
 		     int rifindex);
+
+/* vxlan_mdb.c */
+int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+		   struct netlink_callback *cb);
+int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+		  struct netlink_ext_ack *extack);
+int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[],
+		  struct netlink_ext_ack *extack);
+int vxlan_mdb_init(struct vxlan_dev *vxlan);
+void vxlan_mdb_fini(struct vxlan_dev *vxlan);
 #endif
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index bca5b01af247..110b703d8978 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -3,6 +3,7 @@
 #define __NET_VXLAN_H 1
 
 #include <linux/if_vlan.h>
+#include <linux/rhashtable-types.h>
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
@@ -302,6 +303,10 @@ struct vxlan_dev {
 	struct vxlan_vni_group  __rcu *vnigrp;
 
 	struct hlist_head fdb_head[FDB_HASH_SIZE];
+
+	struct rhashtable mdb_tbl;
+	struct hlist_head mdb_list;
+	unsigned int mdb_seq;
 };
 
 #define VXLAN_F_LEARN			0x01
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index d60c456710b3..c9d624f528c5 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -633,6 +633,11 @@ enum {
 	MDBA_MDB_EATTR_GROUP_MODE,
 	MDBA_MDB_EATTR_SOURCE,
 	MDBA_MDB_EATTR_RTPROT,
+	MDBA_MDB_EATTR_DST,
+	MDBA_MDB_EATTR_DST_PORT,
+	MDBA_MDB_EATTR_VNI,
+	MDBA_MDB_EATTR_IFINDEX,
+	MDBA_MDB_EATTR_SRC_VNI,
 	__MDBA_MDB_EATTR_MAX
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
@@ -728,6 +733,11 @@ enum {
 	MDBE_ATTR_SRC_LIST,
 	MDBE_ATTR_GROUP_MODE,
 	MDBE_ATTR_RTPROT,
+	MDBE_ATTR_DST,
+	MDBE_ATTR_DST_PORT,
+	MDBE_ATTR_VNI,
+	MDBE_ATTR_IFINDEX,
+	MDBE_ATTR_SRC_VNI,
 	__MDBE_ATTR_MAX,
 };
 #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1)
-- 
cgit v1.2.3


From bc6c6b013ffee36eb555cc0a68aa3d9608e1fad2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:52 +0200
Subject: vxlan: mdb: Add an internal flag to indicate MDB usage

Add an internal flag to indicate whether MDB entries are configured or
not. Set the flag after installing the first MDB entry and clear it
before deleting the last one.

The flag will be consulted by the data path which will only perform an
MDB lookup if the flag is set, thereby keeping the MDB overhead to a
minimum when the MDB is not used.

Another option would have been to use a static key, but it is global and
not per-device, unlike the current approach.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan/vxlan_mdb.c | 7 +++++++
 include/net/vxlan.h           | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
index 129692b3663f..b32b1fb4a74a 100644
--- a/drivers/net/vxlan/vxlan_mdb.c
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -1185,6 +1185,9 @@ vxlan_mdb_entry_get(struct vxlan_dev *vxlan,
 	if (err)
 		goto err_free_entry;
 
+	if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list))
+		vxlan->cfg.flags |= VXLAN_F_MDB;
+
 	return mdb_entry;
 
 err_free_entry:
@@ -1199,6 +1202,9 @@ static void vxlan_mdb_entry_put(struct vxlan_dev *vxlan,
 	if (!list_empty(&mdb_entry->remotes))
 		return;
 
+	if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list))
+		vxlan->cfg.flags &= ~VXLAN_F_MDB;
+
 	rhashtable_remove_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode,
 			       vxlan_mdb_rht_params);
 	hlist_del(&mdb_entry->mdb_node);
@@ -1336,6 +1342,7 @@ int vxlan_mdb_init(struct vxlan_dev *vxlan)
 void vxlan_mdb_fini(struct vxlan_dev *vxlan)
 {
 	vxlan_mdb_entries_flush(vxlan);
+	WARN_ON_ONCE(vxlan->cfg.flags & VXLAN_F_MDB);
 	rhashtable_free_and_destroy(&vxlan->mdb_tbl, vxlan_mdb_check_empty,
 				    NULL);
 }
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 110b703d8978..b7b2e9abfb37 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -327,6 +327,7 @@ struct vxlan_dev {
 #define VXLAN_F_IPV6_LINKLOCAL		0x8000
 #define VXLAN_F_TTL_INHERIT		0x10000
 #define VXLAN_F_VNIFILTER               0x20000
+#define VXLAN_F_MDB			0x40000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
-- 
cgit v1.2.3


From abc17a11ed29b0471e428d86189acca8d1a213c6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:55 +0000
Subject: inet: preserve const qualifier in inet_sk()

We can change inet_sk() to propagate const qualifier of its argument.

This should avoid some potential errors caused by accidental
(const -> not_const) promotion.

Other helpers like tcp_sk(), udp_sk(), raw_sk() will be handled
in separate patch series.

v2: use container_of_const() as advised by Jakub and Linus

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/20230315142841.3a2ac99a@kernel.org/
Link: https://lore.kernel.org/netdev/CAHk-=wiOf12nrYEF2vJMcucKjWPN-Ns_SW9fA7LwST_2Dzp7rw@mail.gmail.com/
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h     | 5 +----
 include/trace/events/sock.h | 4 ++--
 include/trace/events/tcp.h  | 2 +-
 net/ipv4/ip_output.c        | 5 +++--
 net/ipv6/ping.c             | 2 +-
 net/ipv6/udp.c              | 2 +-
 net/mptcp/sockopt.c         | 2 +-
 security/lsm_audit.c        | 4 ++--
 8 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 51857117ac09..caa20a905531 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -305,10 +305,7 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
 	return sk_to_full_sk(skb->sk);
 }
 
-static inline struct inet_sock *inet_sk(const struct sock *sk)
-{
-	return (struct inet_sock *)sk;
-}
+#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)
 
 static inline void __inet_sk_copy_descendant(struct sock *sk_to,
 					     const struct sock *sk_from,
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 03d19fc562f8..fd206a6ab5b8 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -158,7 +158,7 @@ TRACE_EVENT(inet_sock_set_state,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		struct in6_addr *pin6;
 		__be32 *p32;
 
@@ -222,7 +222,7 @@ TRACE_EVENT(inet_sk_error_report,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		struct in6_addr *pin6;
 		__be32 *p32;
 
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 901b440238d5..bf06db8d2046 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -67,7 +67,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		__be32 *p32;
 
 		__entry->skbaddr = skb;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e7bef36ce26f..cb04dbad9ea4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -129,7 +129,8 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 
-static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+static inline int ip_select_ttl(const struct inet_sock *inet,
+				const struct dst_entry *dst)
 {
 	int ttl = inet->uc_ttl;
 
@@ -146,7 +147,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
 			  u8 tos)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = skb_rtable(skb);
 	struct net *net = sock_net(sk);
 	struct iphdr *iph;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 808983bc2ec9..c4835dbdfcff 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -237,7 +237,7 @@ static int ping_v6_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
 	} else {
 		int bucket = ((struct ping_iter_state *) seq->private)->bucket;
-		struct inet_sock *inet = inet_sk(v);
+		struct inet_sock *inet = inet_sk((struct sock *)v);
 		__u16 srcp = ntohs(inet->inet_sport);
 		__u16 destp = ntohs(inet->inet_dport);
 		ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9fb2f33ee3a7..ab4ae886235a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1708,7 +1708,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
 	} else {
 		int bucket = ((struct udp_iter_state *)seq->private)->bucket;
-		struct inet_sock *inet = inet_sk(v);
+		const struct inet_sock *inet = inet_sk((const struct sock *)v);
 		__u16 srcp = ntohs(inet->inet_sport);
 		__u16 destp = ntohs(inet->inet_dport);
 		__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8a9656248b0f..5cef4d3d21ac 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1046,7 +1046,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
 
 static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 
 	memset(a, 0, sizeof(*a));
 
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index a7355b4b9bb8..00d3bdd386e2 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -317,7 +317,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 
 			switch (sk->sk_family) {
 			case AF_INET: {
-				struct inet_sock *inet = inet_sk(sk);
+				const struct inet_sock *inet = inet_sk(sk);
 
 				print_ipv4_addr(ab, inet->inet_rcv_saddr,
 						inet->inet_sport,
@@ -329,7 +329,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			}
 #if IS_ENABLED(CONFIG_IPV6)
 			case AF_INET6: {
-				struct inet_sock *inet = inet_sk(sk);
+				const struct inet_sock *inet = inet_sk(sk);
 
 				print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr,
 						inet->inet_sport,
-- 
cgit v1.2.3


From 33e972bdf0b0aa208b67164c64eef3c307e4b303 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:56 +0000
Subject: ipv4: constify ip_mc_sf_allow() socket argument

This clarifies ip_mc_sf_allow() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 net/ipv4/igmp.c      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index b19d3284551f..ebf4349a53af 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -122,7 +122,7 @@ extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 			sockptr_t optval, sockptr_t optlen);
 extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 			sockptr_t optval, size_t offset);
-extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt,
+extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt,
 			  int dif, int sdif);
 extern void ip_mc_init_dev(struct in_device *);
 extern void ip_mc_destroy_dev(struct in_device *);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c920aa9a62a9..48ff5f13e797 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2638,10 +2638,10 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 /*
  * check if a multicast source filter allows delivery for a given <src,dst,intf>
  */
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
 		   int dif, int sdif)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 	struct ip_mc_socklist *pmc;
 	struct ip_sf_socklist *psl;
 	int i;
-- 
cgit v1.2.3


From 66eb554c6449cef8c1e1b814f74d13f264582591 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:58 +0000
Subject: ipv6: constify inet6_mc_check()

inet6_mc_check() is essentially a read-only function.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 2 +-
 net/ipv6/mcast.c       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c04f359655b8..82da55101b5a 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,7 +223,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 void __ipv6_sock_mc_close(struct sock *sk);
 void ipv6_sock_mc_close(struct sock *sk);
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr);
 
 int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 1c02160cf7a4..714cdc9e2b8e 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -627,12 +627,12 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 	return 0;
 }
 
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct ipv6_mc_socklist *mc;
-	struct ip6_sf_socklist *psl;
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct ipv6_mc_socklist *mc;
+	const struct ip6_sf_socklist *psl;
 	bool rv = true;
 
 	rcu_read_lock();
-- 
cgit v1.2.3


From db6af4fdb150b45e1ba6b295ccfd3df482e022d2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:32:00 +0000
Subject: ipv6: raw: constify raw_v6_match() socket argument

This clarifies raw_v6_match() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rawv6.h | 2 +-
 net/ipv6/raw.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index bc70909625f6..82810cbe3798 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -6,7 +6,7 @@
 #include <net/raw.h>
 
 extern struct raw_hashinfo raw_v6_hashinfo;
-bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
 		  const struct in6_addr *loc_addr,
 		  const struct in6_addr *rmt_addr, int dif, int sdif);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index bac9ba747bde..6ac2f2690c44 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -64,7 +64,7 @@
 struct raw_hashinfo raw_v6_hashinfo;
 EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
-bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
 		  const struct in6_addr *loc_addr,
 		  const struct in6_addr *rmt_addr, int dif, int sdif)
 {
-- 
cgit v1.2.3


From 0a8c2568209ee0c3392593c7c5c7fe41c625a383 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:32:01 +0000
Subject: ipv4: raw: constify raw_v4_match() socket argument

This clarifies raw_v4_match() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 2 +-
 net/ipv4/raw.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 2c004c20ed99..7ad15830cf38 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -22,7 +22,7 @@
 extern struct proto raw_prot;
 
 extern struct raw_hashinfo raw_v4_hashinfo;
-bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
 		  __be32 raddr, __be32 laddr, int dif, int sdif);
 
 int raw_abort(struct sock *sk, int err);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 94df935ee0c5..3cf68695b40d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -116,10 +116,10 @@ void raw_unhash_sk(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
-bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
 		  __be32 raddr, __be32 laddr, int dif, int sdif)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 
 	if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&
 	    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&
-- 
cgit v1.2.3


From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 17 Mar 2023 10:56:01 +0100
Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header

Moving find_kallsyms_symbol_value from kernel/module/internal.h to
include/linux/module.h. The reason is that internal.h is not prepared to
be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is used by
kernel/bpf/verifier.c and including internal.h from it (without modules)
leads into a compilation error:

  In file included from ../include/linux/container_of.h:5,
                   from ../include/linux/list.h:5,
                   from ../include/linux/timer.h:5,
                   from ../include/linux/workqueue.h:9,
                   from ../include/linux/bpf.h:10,
                   from ../include/linux/bpf-cgroup.h:5,
                   from ../kernel/bpf/verifier.c:7:
  ../kernel/bpf/../module/internal.h: In function 'mod_find':
  ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module'
     20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
        |                                                      ^~
  [...]

This patch fixes the above error.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/
Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com
---
 include/linux/module.h   | 8 ++++++++
 kernel/bpf/verifier.c    | 2 +-
 kernel/module/internal.h | 6 ------
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..41cfd3be57e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d62b7127ff2a..99394a2f7ee4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,7 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
-#include "../module/internal.h"
+#include <linux/module.h>
 
 #include "disasm.h"
 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 5c9170f9135c..1c877561a7d2 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod,
 void init_build_id(struct module *mod, const struct load_info *info);
 void layout_symtab(struct module *mod, struct load_info *info);
 void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
 
 static inline bool sect_empty(const Elf_Shdr *sect)
 {
@@ -256,11 +255,6 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
-static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
-						       const char *name)
-{
-	return 0;
-}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 6229ad9913ac794233fdb3ed20f1e899add0af6b Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 9 Mar 2023 10:09:18 +0200
Subject: serial: Remove extern from func prototypes in headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove unnecessary externs from function prototypes in serial_8250.h
and serial_core.h.

Suggested-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20230309080923.11778-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_8250.h | 41 ++++++++++++++++++-----------------------
 include/linux/serial_core.h | 13 ++++++-------
 2 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 741ed4807a9c..6f78f302d272 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -151,26 +151,22 @@ void serial8250_unregister_port(int line);
 void serial8250_suspend_port(int line);
 void serial8250_resume_port(int line);
 
-extern int early_serial_setup(struct uart_port *port);
-
-extern int early_serial8250_setup(struct earlycon_device *device,
-					 const char *options);
-extern void serial8250_update_uartclk(struct uart_port *port,
-				      unsigned int uartclk);
-extern void serial8250_do_set_termios(struct uart_port *port,
-		struct ktermios *termios, const struct ktermios *old);
-extern void serial8250_do_set_ldisc(struct uart_port *port,
-				    struct ktermios *termios);
-extern unsigned int serial8250_do_get_mctrl(struct uart_port *port);
-extern int serial8250_do_startup(struct uart_port *port);
-extern void serial8250_do_shutdown(struct uart_port *port);
-extern void serial8250_do_pm(struct uart_port *port, unsigned int state,
-			     unsigned int oldstate);
-extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl);
-extern void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
-				      unsigned int quot,
-				      unsigned int quot_frac);
-extern int fsl8250_handle_irq(struct uart_port *port);
+int early_serial_setup(struct uart_port *port);
+int early_serial8250_setup(struct earlycon_device *device, const char *options);
+
+void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk);
+void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
+			       const struct ktermios *old);
+void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios);
+unsigned int serial8250_do_get_mctrl(struct uart_port *port);
+int serial8250_do_startup(struct uart_port *port);
+void serial8250_do_shutdown(struct uart_port *port);
+void serial8250_do_pm(struct uart_port *port, unsigned int state,
+		      unsigned int oldstate);
+void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl);
+void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
+			       unsigned int quot, unsigned int quot_frac);
+int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr);
 void serial8250_read_char(struct uart_8250_port *up, u16 lsr);
@@ -183,9 +179,8 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
 int serial8250_console_exit(struct uart_port *port);
 
-extern void serial8250_set_isa_configurator(void (*v)
-					(int port, struct uart_port *up,
-						u32 *capabilities));
+void serial8250_set_isa_configurator(void (*v)(int port, struct uart_port *up,
+					       u32 *capabilities));
 
 #ifdef CONFIG_SERIAL_8250_RT288X
 unsigned int au_serial_in(struct uart_port *p, int offset);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 9e3e5e0d11b2..05d18a145b3a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -812,9 +812,8 @@ extern const struct earlycon_id __earlycon_table_end[];
 
 #define EARLYCON_DECLARE(_name, fn)	OF_EARLYCON_DECLARE(_name, "", fn)
 
-extern int of_setup_earlycon(const struct earlycon_id *match,
-			     unsigned long node,
-			     const char *options);
+int of_setup_earlycon(const struct earlycon_id *match, unsigned long node,
+		      const char *options);
 
 #ifdef CONFIG_SERIAL_EARLYCON
 extern bool earlycon_acpi_spcr_enable __initdata;
@@ -897,11 +896,11 @@ static inline bool uart_softcts_mode(struct uart_port *uport)
  * The following are helper functions for the low level drivers.
  */
 
-extern void uart_handle_dcd_change(struct uart_port *uport, bool active);
-extern void uart_handle_cts_change(struct uart_port *uport, bool active);
+void uart_handle_dcd_change(struct uart_port *uport, bool active);
+void uart_handle_cts_change(struct uart_port *uport, bool active);
 
-extern void uart_insert_char(struct uart_port *port, unsigned int status,
-		 unsigned int overrun, unsigned int ch, unsigned int flag);
+void uart_insert_char(struct uart_port *port, unsigned int status,
+		      unsigned int overrun, unsigned int ch, unsigned int flag);
 
 void uart_xchar_out(struct uart_port *uport, int offset);
 
-- 
cgit v1.2.3


From b5def43a7b3e7e68d6b49a23166f8b886a26bb54 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 9 Mar 2023 10:09:21 +0200
Subject: serial: Make hw_stopped bool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert hw_stopped in uart_port to bool because its more appropriate
type for how it is used.

Also convert the local variable in uart_change_line_settings() caching
old hw_stopped to bool.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20230309080923.11778-7-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/serial_core.c | 6 +++---
 include/linux/serial_core.h      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index ecdc5d9cdb53..31b69e61e71d 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -182,7 +182,7 @@ static void uart_change_line_settings(struct tty_struct *tty, struct uart_state
 {
 	struct uart_port *uport = uart_port_check(state);
 	struct ktermios *termios;
-	int hw_stopped;
+	bool hw_stopped;
 
 	/*
 	 * If we have no tty, termios, or the port does not exist,
@@ -3304,13 +3304,13 @@ void uart_handle_cts_change(struct uart_port *uport, bool active)
 	if (uart_softcts_mode(uport)) {
 		if (uport->hw_stopped) {
 			if (active) {
-				uport->hw_stopped = 0;
+				uport->hw_stopped = false;
 				uport->ops->start_tx(uport);
 				uart_write_wakeup(uport);
 			}
 		} else {
 			if (!active) {
-				uport->hw_stopped = 1;
+				uport->hw_stopped = true;
 				uport->ops->stop_tx(uport);
 			}
 		}
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 05d18a145b3a..66ecec15a1bf 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -553,7 +553,7 @@ struct uart_port {
 #define UPSTAT_AUTOXOFF		((__force upstat_t) (1 << 4))
 #define UPSTAT_SYNC_FIFO	((__force upstat_t) (1 << 5))
 
-	int			hw_stopped;		/* sw-assisted CTS flow state */
+	bool			hw_stopped;		/* sw-assisted CTS flow state */
 	unsigned int		mctrl;			/* current modem ctrl settings */
 	unsigned int		frame_time;		/* frame timing in ns */
 	unsigned int		type;			/* port type */
-- 
cgit v1.2.3


From 035173c91c6b53144295a5b546c6bad3f40fb8a9 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 9 Mar 2023 10:20:35 +0200
Subject: tty: Convert hw_stopped in tty_struct to bool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hw_stopped in tty_struct is used like bool, convert the variable type
to bool.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Link: https://lore.kernel.org/r/20230309082035.14880-9-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c |  6 +++---
 drivers/mmc/core/sdio_uart.c      | 10 +++++-----
 drivers/tty/amiserial.c           |  6 +++---
 drivers/tty/mxser.c               |  6 +++---
 drivers/tty/synclink_gt.c         |  6 +++---
 include/linux/tty.h               |  2 +-
 6 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 6ddfeb2fe98f..97c5bfb9d58a 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1060,7 +1060,7 @@ static void cts_change(MGSLPC_INFO *info, struct tty_struct *tty)
 			if (info->serial_signals & SerialSignal_CTS) {
 				if (debug_level >= DEBUG_LEVEL_ISR)
 					printk("CTS tx start...");
-				tty->hw_stopped = 0;
+				tty->hw_stopped = false;
 				tx_start(info, tty);
 				info->pending_bh |= BH_TRANSMIT;
 				return;
@@ -1069,7 +1069,7 @@ static void cts_change(MGSLPC_INFO *info, struct tty_struct *tty)
 			if (!(info->serial_signals & SerialSignal_CTS)) {
 				if (debug_level >= DEBUG_LEVEL_ISR)
 					printk("CTS tx stop...");
-				tty->hw_stopped = 1;
+				tty->hw_stopped = true;
 				tx_stop(info);
 			}
 		}
@@ -2312,7 +2312,7 @@ static void mgslpc_set_termios(struct tty_struct *tty,
 
 	/* Handle turning off CRTSCTS */
 	if (old_termios->c_cflag & CRTSCTS && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
+		tty->hw_stopped = false;
 		tx_release(tty);
 	}
 }
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index 50536fe59f1a..aa659758563f 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -478,13 +478,13 @@ static void sdio_uart_check_modem_status(struct sdio_uart_port *port)
 			int cts = (status & UART_MSR_CTS);
 			if (tty->hw_stopped) {
 				if (cts) {
-					tty->hw_stopped = 0;
+					tty->hw_stopped = false;
 					sdio_uart_start_tx(port);
 					tty_wakeup(tty);
 				}
 			} else {
 				if (!cts) {
-					tty->hw_stopped = 1;
+					tty->hw_stopped = true;
 					sdio_uart_stop_tx(port);
 				}
 			}
@@ -633,7 +633,7 @@ static int sdio_uart_activate(struct tty_port *tport, struct tty_struct *tty)
 
 	if (C_CRTSCTS(tty))
 		if (!(sdio_uart_get_mctrl(port) & TIOCM_CTS))
-			tty->hw_stopped = 1;
+			tty->hw_stopped = true;
 
 	clear_bit(TTY_IO_ERROR, &tty->flags);
 
@@ -882,14 +882,14 @@ static void sdio_uart_set_termios(struct tty_struct *tty,
 
 	/* Handle turning off CRTSCTS */
 	if ((old_termios->c_cflag & CRTSCTS) && !(cflag & CRTSCTS)) {
-		tty->hw_stopped = 0;
+		tty->hw_stopped = false;
 		sdio_uart_start_tx(port);
 	}
 
 	/* Handle turning on CRTSCTS */
 	if (!(old_termios->c_cflag & CRTSCTS) && (cflag & CRTSCTS)) {
 		if (!(sdio_uart_get_mctrl(port) & TIOCM_CTS)) {
-			tty->hw_stopped = 1;
+			tty->hw_stopped = true;
 			sdio_uart_stop_tx(port);
 		}
 	}
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index d7515d61659e..c06ad0a0744b 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -347,7 +347,7 @@ static void check_modem_status(struct serial_state *info)
 #if (defined(SERIAL_DEBUG_INTR) || defined(SERIAL_DEBUG_FLOW))
 				printk("CTS tx start...");
 #endif
-				port->tty->hw_stopped = 0;
+				port->tty->hw_stopped = false;
 				info->IER |= UART_IER_THRI;
 				amiga_custom.intena = IF_SETCLR | IF_TBE;
 				mb();
@@ -362,7 +362,7 @@ static void check_modem_status(struct serial_state *info)
 #if (defined(SERIAL_DEBUG_INTR) || defined(SERIAL_DEBUG_FLOW))
 				printk("CTS tx stop...");
 #endif
-				port->tty->hw_stopped = 1;
+				port->tty->hw_stopped = true;
 				info->IER &= ~UART_IER_THRI;
 				/* disable Tx interrupt and remove any pending interrupts */
 				amiga_custom.intena = IF_TBE;
@@ -1197,7 +1197,7 @@ static void rs_set_termios(struct tty_struct *tty, const struct ktermios *old_te
 
 	/* Handle turning off CRTSCTS */
 	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
+		tty->hw_stopped = false;
 		rs_start(tty);
 	}
 
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index ef3116e87975..10855e66fda1 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -553,7 +553,7 @@ static void mxser_handle_cts(struct tty_struct *tty, struct mxser_port *info,
 
 	if (tty->hw_stopped) {
 		if (cts) {
-			tty->hw_stopped = 0;
+			tty->hw_stopped = false;
 
 			if (!mxser_16550A_or_MUST(info))
 				__mxser_start_tx(info);
@@ -563,7 +563,7 @@ static void mxser_handle_cts(struct tty_struct *tty, struct mxser_port *info,
 	} else if (cts)
 		return;
 
-	tty->hw_stopped = 1;
+	tty->hw_stopped = true;
 	if (!mxser_16550A_or_MUST(info))
 		__mxser_stop_tx(info);
 }
@@ -1361,7 +1361,7 @@ static void mxser_set_termios(struct tty_struct *tty,
 	spin_unlock_irqrestore(&info->slock, flags);
 
 	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
+		tty->hw_stopped = false;
 		mxser_start(tty);
 	}
 
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 33f258d6fef9..543b3224dce9 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -730,7 +730,7 @@ static void set_termios(struct tty_struct *tty,
 
 	/* Handle turning off CRTSCTS */
 	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
+		tty->hw_stopped = false;
 		tx_release(tty);
 	}
 }
@@ -1953,13 +1953,13 @@ static void cts_change(struct slgt_info *info, unsigned short status)
 		if (info->port.tty) {
 			if (info->port.tty->hw_stopped) {
 				if (info->signals & SerialSignal_CTS) {
-		 			info->port.tty->hw_stopped = 0;
+					info->port.tty->hw_stopped = false;
 					info->pending_bh |= BH_TRANSMIT;
 					return;
 				}
 			} else {
 				if (!(info->signals & SerialSignal_CTS))
-		 			info->port.tty->hw_stopped = 1;
+					info->port.tty->hw_stopped = true;
 			}
 		}
 	}
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 093935e97f42..60871a9d3212 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -227,7 +227,7 @@ struct tty_struct {
 		unsigned long unused[0];
 	} __aligned(sizeof(unsigned long)) ctrl;
 
-	int hw_stopped;
+	bool hw_stopped;
 	unsigned int receive_room;
 	int flow_change;
 
-- 
cgit v1.2.3


From 410e7088e971ad656170fe9768b072b267a95310 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 13 Mar 2023 13:31:00 +0200
Subject: devres: Pass unique name of the resource to
 devm_add_action_or_reset()

All the same as it's done in the commit e32c80bbd2f9 ("devres:
Pass unique name of the resource to devm_add_action()") applies
to the devm_add_action_or_reset(), which this change makes real.
This helps with debug resource management.

Reported-and-tested-by: Mirsad Goran Todorovac <mirsad.todorovac@alu.unizg.hr>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230313113100.59643-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 0f128520f6e5..12dc08aa5c0f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -250,17 +250,19 @@ int __devm_add_action(struct device *dev, void (*action)(void *), void *data, co
 #define devm_add_action(release, action, data) \
 	__devm_add_action(release, action, data, #action)
 
-static inline int devm_add_action_or_reset(struct device *dev,
-					   void (*action)(void *), void *data)
+static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *),
+					     void *data, const char *name)
 {
 	int ret;
 
-	ret = devm_add_action(dev, action, data);
+	ret = __devm_add_action(dev, action, data, name);
 	if (ret)
 		action(data);
 
 	return ret;
 }
+#define devm_add_action_or_reset(release, action, data) \
+	__devm_add_action_or_reset(release, action, data, #action)
 
 /**
  * devm_alloc_percpu - Resource-managed alloc_percpu
-- 
cgit v1.2.3


From 4a46ac9d64030d9f6f18baaf115a3558880c1ae2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:32 +0100
Subject: driver core: class: specify the module owner in __class_register()

There's no need to manually have to set the module owner of a class, the
compiler should do it automatically for you, so add a module * to the
__class_register() function and allow it to set the module owner
automatically.

This will let us move the module pointer out of struct class eventually,
as it should not be embedded in there if we wish for it to be a
read-only structure eventually.

And, funny story, this module pointer isn't even being used for
anything, so while we will keep it around for now, it's not like it even
matters.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 6 +++---
 include/linux/device/class.h | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 5983eead8391..90dc5788957a 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -154,7 +154,7 @@ static void class_remove_groups(struct class *cls,
 	return sysfs_remove_groups(&cls->p->subsys.kobj, groups);
 }
 
-int __class_register(struct class *cls, struct lock_class_key *key)
+int __class_register(struct class *cls, struct module *owner, struct lock_class_key *key)
 {
 	struct subsys_private *cp;
 	int error;
@@ -187,6 +187,7 @@ int __class_register(struct class *cls, struct lock_class_key *key)
 	if (error)
 		goto err_out;
 
+	cls->owner = owner;
 	error = class_add_groups(class_get(cls), cls->class_groups);
 	class_put(cls);
 	if (error) {
@@ -244,10 +245,9 @@ struct class *__class_create(struct module *owner, const char *name,
 	}
 
 	cls->name = name;
-	cls->owner = owner;
 	cls->class_release = class_create_release;
 
-	retval = __class_register(cls, key);
+	retval = __class_register(cls, owner, key);
 	if (retval)
 		goto error;
 
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 42cc3fb44a84..d1ba4ee235dc 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -85,15 +85,16 @@ struct class_dev_iter {
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
 extern int __must_check __class_register(struct class *class,
+					 struct module *owner,
 					 struct lock_class_key *key);
 extern void class_unregister(struct class *class);
 
 /* This is a #define to keep the compiler from merging different
  * instances of the __key variable */
-#define class_register(class)			\
-({						\
-	static struct lock_class_key __key;	\
-	__class_register(class, &__key);	\
+#define class_register(class)				\
+({							\
+	static struct lock_class_key __key;		\
+	__class_register(class, THIS_MODULE, &__key);	\
 })
 
 struct class_compat;
-- 
cgit v1.2.3


From 6e30a66433afee90e902ced95d7136e8f7edcc7e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:34 +0100
Subject: driver core: class: remove struct module owner out of struct class

The module owner field for a struct class was never actually used, so
remove it as it is not doing anything at all.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-3-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         |  9 +++------
 include/linux/device/class.h | 18 +++++++-----------
 2 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 90dc5788957a..9439c6c7466f 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -154,7 +154,7 @@ static void class_remove_groups(struct class *cls,
 	return sysfs_remove_groups(&cls->p->subsys.kobj, groups);
 }
 
-int __class_register(struct class *cls, struct module *owner, struct lock_class_key *key)
+int __class_register(struct class *cls, struct lock_class_key *key)
 {
 	struct subsys_private *cp;
 	int error;
@@ -187,7 +187,6 @@ int __class_register(struct class *cls, struct module *owner, struct lock_class_
 	if (error)
 		goto err_out;
 
-	cls->owner = owner;
 	error = class_add_groups(class_get(cls), cls->class_groups);
 	class_put(cls);
 	if (error) {
@@ -220,7 +219,6 @@ static void class_create_release(struct class *cls)
 
 /**
  * __class_create - create a struct class structure
- * @owner: pointer to the module that is to "own" this struct class
  * @name: pointer to a string for the name of this class.
  * @key: the lock_class_key for this class; used by mutex lock debugging
  *
@@ -232,8 +230,7 @@ static void class_create_release(struct class *cls)
  * Note, the pointer created here is to be destroyed when finished by
  * making a call to class_destroy().
  */
-struct class *__class_create(struct module *owner, const char *name,
-			     struct lock_class_key *key)
+struct class *__class_create(const char *name, struct lock_class_key *key)
 {
 	struct class *cls;
 	int retval;
@@ -247,7 +244,7 @@ struct class *__class_create(struct module *owner, const char *name,
 	cls->name = name;
 	cls->class_release = class_create_release;
 
-	retval = __class_register(cls, owner, key);
+	retval = __class_register(cls, key);
 	if (retval)
 		goto error;
 
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index d1ba4ee235dc..bf736f14f0c1 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -25,7 +25,6 @@ struct fwnode_handle;
 /**
  * struct class - device classes
  * @name:	Name of the class.
- * @owner:	The module owner.
  * @class_groups: Default attributes of this class.
  * @dev_groups:	Default attributes of the devices that belong to the class.
  * @dev_kobj:	The kobject that represents this class and links it into the hierarchy.
@@ -53,7 +52,6 @@ struct fwnode_handle;
  */
 struct class {
 	const char		*name;
-	struct module		*owner;
 
 	const struct attribute_group	**class_groups;
 	const struct attribute_group	**dev_groups;
@@ -85,16 +83,15 @@ struct class_dev_iter {
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
 extern int __must_check __class_register(struct class *class,
-					 struct module *owner,
 					 struct lock_class_key *key);
 extern void class_unregister(struct class *class);
 
 /* This is a #define to keep the compiler from merging different
  * instances of the __key variable */
-#define class_register(class)				\
-({							\
-	static struct lock_class_key __key;		\
-	__class_register(class, THIS_MODULE, &__key);	\
+#define class_register(class)			\
+({						\
+	static struct lock_class_key __key;	\
+	__class_register(class, &__key);	\
 })
 
 struct class_compat;
@@ -250,8 +247,7 @@ struct class_interface {
 extern int __must_check class_interface_register(struct class_interface *);
 extern void class_interface_unregister(struct class_interface *);
 
-extern struct class * __must_check __class_create(struct module *owner,
-						  const char *name,
+extern struct class * __must_check __class_create(const char *name,
 						  struct lock_class_key *key);
 extern void class_destroy(struct class *cls);
 
@@ -260,7 +256,7 @@ extern void class_destroy(struct class *cls);
 
 /**
  * class_create - create a struct class structure
- * @owner: pointer to the module that is to "own" this struct class
+ * @owner: dummy pointer, does nothing, will be removed soon.
  * @name: pointer to a string for the name of this class.
  *
  * This is used to create a struct class pointer that can then be used
@@ -274,7 +270,7 @@ extern void class_destroy(struct class *cls);
 #define class_create(owner, name)		\
 ({						\
 	static struct lock_class_key __key;	\
-	__class_create(owner, name, &__key);	\
+	__class_create(name, &__key);		\
 })
 
 
-- 
cgit v1.2.3


From 1aaba11da9aa7d7d6b52a74d45b31cac118295a1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:35 +0100
Subject: driver core: class: remove module * from class_create()

The module pointer in class_create() never actually did anything, and it
shouldn't have been requred to be set as a parameter even if it did
something.  So just remove it and fix up all callers of the function in
the kernel tree at the same time.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20230313181843.1207845-4-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/mips/kernel/mips-mt.c                       | 2 +-
 arch/mips/sibyte/common/sb_tbprof.c              | 2 +-
 arch/powerpc/platforms/book3s/vas-api.c          | 2 +-
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c        | 2 +-
 arch/x86/kernel/cpuid.c                          | 2 +-
 arch/x86/kernel/msr.c                            | 2 +-
 block/bsg.c                                      | 2 +-
 drivers/accel/drm_accel.c                        | 2 +-
 drivers/accel/habanalabs/common/habanalabs_drv.c | 2 +-
 drivers/base/power/wakeup_stats.c                | 2 +-
 drivers/block/aoe/aoechr.c                       | 2 +-
 drivers/block/rnbd/rnbd-clt-sysfs.c              | 2 +-
 drivers/block/rnbd/rnbd-srv-sysfs.c              | 2 +-
 drivers/block/ublk_drv.c                         | 2 +-
 drivers/char/bsr.c                               | 2 +-
 drivers/char/dsp56k.c                            | 2 +-
 drivers/char/ipmi/ipmi_devintf.c                 | 2 +-
 drivers/char/lp.c                                | 2 +-
 drivers/char/mem.c                               | 2 +-
 drivers/char/misc.c                              | 2 +-
 drivers/char/pcmcia/cm4000_cs.c                  | 2 +-
 drivers/char/pcmcia/cm4040_cs.c                  | 2 +-
 drivers/char/pcmcia/scr24x_cs.c                  | 2 +-
 drivers/char/ppdev.c                             | 2 +-
 drivers/char/tpm/tpm-interface.c                 | 4 ++--
 drivers/char/virtio_console.c                    | 2 +-
 drivers/char/xilinx_hwicap/xilinx_hwicap.c       | 2 +-
 drivers/char/xillybus/xillybus_class.c           | 2 +-
 drivers/comedi/comedi_fops.c                     | 2 +-
 drivers/comedi/drivers/comedi_test.c             | 2 +-
 drivers/crypto/qat/qat_common/adf_ctl_drv.c      | 2 +-
 drivers/dca/dca-sysfs.c                          | 2 +-
 drivers/devfreq/devfreq-event.c                  | 2 +-
 drivers/devfreq/devfreq.c                        | 2 +-
 drivers/dma-buf/dma-heap.c                       | 2 +-
 drivers/extcon/extcon.c                          | 2 +-
 drivers/fpga/fpga-bridge.c                       | 2 +-
 drivers/fpga/fpga-mgr.c                          | 2 +-
 drivers/fpga/fpga-region.c                       | 2 +-
 drivers/gnss/core.c                              | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         | 2 +-
 drivers/gpu/drm/display/drm_dp_aux_dev.c         | 2 +-
 drivers/gpu/drm/drm_sysfs.c                      | 2 +-
 drivers/hid/hid-roccat-arvo.c                    | 2 +-
 drivers/hid/hid-roccat-isku.c                    | 2 +-
 drivers/hid/hid-roccat-kone.c                    | 2 +-
 drivers/hid/hid-roccat-koneplus.c                | 2 +-
 drivers/hid/hid-roccat-konepure.c                | 2 +-
 drivers/hid/hid-roccat-kovaplus.c                | 2 +-
 drivers/hid/hid-roccat-pyra.c                    | 2 +-
 drivers/hid/hid-roccat-ryos.c                    | 2 +-
 drivers/hid/hid-roccat-savu.c                    | 2 +-
 drivers/hid/hidraw.c                             | 2 +-
 drivers/i2c/i2c-dev.c                            | 2 +-
 drivers/infiniband/core/uverbs_main.c            | 2 +-
 drivers/infiniband/hw/hfi1/device.c              | 4 ++--
 drivers/infiniband/hw/qib/qib_file_ops.c         | 2 +-
 drivers/infiniband/ulp/rtrs/rtrs-clt.c           | 2 +-
 drivers/infiniband/ulp/rtrs/rtrs-srv.c           | 2 +-
 drivers/isdn/capi/capi.c                         | 2 +-
 drivers/isdn/mISDN/dsp_pipeline.c                | 2 +-
 drivers/leds/led-class.c                         | 2 +-
 drivers/macintosh/adb.c                          | 2 +-
 drivers/media/dvb-core/dvbdev.c                  | 2 +-
 drivers/media/rc/lirc_dev.c                      | 2 +-
 drivers/misc/c2port/core.c                       | 2 +-
 drivers/misc/cxl/file.c                          | 2 +-
 drivers/misc/genwqe/card_base.c                  | 2 +-
 drivers/misc/hpilo.c                             | 2 +-
 drivers/misc/mei/main.c                          | 2 +-
 drivers/misc/ocxl/file.c                         | 2 +-
 drivers/misc/phantom.c                           | 2 +-
 drivers/misc/uacce/uacce.c                       | 2 +-
 drivers/most/most_cdev.c                         | 2 +-
 drivers/net/ethernet/hisilicon/hns/hnae.c        | 2 +-
 drivers/net/ppp/ppp_generic.c                    | 2 +-
 drivers/net/wireless/mac80211_hwsim.c            | 2 +-
 drivers/net/wwan/wwan_core.c                     | 2 +-
 drivers/net/wwan/wwan_hwsim.c                    | 2 +-
 drivers/nvdimm/bus.c                             | 2 +-
 drivers/nvme/host/core.c                         | 6 +++---
 drivers/nvme/host/fabrics.c                      | 2 +-
 drivers/nvme/target/fcloop.c                     | 2 +-
 drivers/pci/endpoint/pci-epc-core.c              | 2 +-
 drivers/pci/switch/switchtec.c                   | 2 +-
 drivers/phy/phy-core.c                           | 2 +-
 drivers/power/supply/power_supply_core.c         | 2 +-
 drivers/pps/pps.c                                | 2 +-
 drivers/ptp/ptp_clock.c                          | 2 +-
 drivers/rapidio/devices/rio_mport_cdev.c         | 2 +-
 drivers/rapidio/rio_cm.c                         | 2 +-
 drivers/rpmsg/rpmsg_core.c                       | 2 +-
 drivers/rtc/class.c                              | 2 +-
 drivers/s390/char/hmcdrv_dev.c                   | 2 +-
 drivers/s390/char/raw3270.c                      | 2 +-
 drivers/s390/char/tape_class.c                   | 2 +-
 drivers/s390/char/vmlogrdr.c                     | 2 +-
 drivers/s390/char/vmur.c                         | 2 +-
 drivers/s390/crypto/zcrypt_api.c                 | 2 +-
 drivers/sbus/char/oradax.c                       | 2 +-
 drivers/scsi/ch.c                                | 2 +-
 drivers/scsi/cxlflash/main.c                     | 2 +-
 drivers/scsi/pmcraid.c                           | 2 +-
 drivers/scsi/sg.c                                | 2 +-
 drivers/spi/spidev.c                             | 2 +-
 drivers/staging/fieldbus/anybuss/arcx-anybus.c   | 2 +-
 drivers/staging/greybus/authentication.c         | 2 +-
 drivers/staging/greybus/fw-management.c          | 2 +-
 drivers/staging/greybus/raw.c                    | 2 +-
 drivers/staging/pi433/pi433_if.c                 | 2 +-
 drivers/staging/vme_user/vme_user.c              | 2 +-
 drivers/tee/tee_core.c                           | 2 +-
 drivers/tty/tty_io.c                             | 2 +-
 drivers/tty/vt/vc_screen.c                       | 2 +-
 drivers/tty/vt/vt.c                              | 2 +-
 drivers/usb/core/file.c                          | 2 +-
 drivers/usb/gadget/function/f_hid.c              | 2 +-
 drivers/usb/gadget/function/f_printer.c          | 2 +-
 drivers/usb/gadget/udc/core.c                    | 2 +-
 drivers/usb/mon/mon_bin.c                        | 2 +-
 drivers/usb/roles/class.c                        | 2 +-
 drivers/vdpa/vdpa_user/vduse_dev.c               | 2 +-
 drivers/vfio/group.c                             | 2 +-
 drivers/vfio/vfio_main.c                         | 2 +-
 drivers/video/backlight/backlight.c              | 2 +-
 drivers/video/backlight/lcd.c                    | 2 +-
 drivers/video/fbdev/core/fbmem.c                 | 2 +-
 fs/coda/psdev.c                                  | 2 +-
 fs/fuse/cuse.c                                   | 2 +-
 fs/pstore/pmsg.c                                 | 2 +-
 include/linux/device/class.h                     | 3 +--
 mm/backing-dev.c                                 | 2 +-
 net/bluetooth/hci_sysfs.c                        | 2 +-
 net/netfilter/xt_IDLETIMER.c                     | 2 +-
 samples/vfio-mdev/mbochs.c                       | 2 +-
 samples/vfio-mdev/mdpy.c                         | 2 +-
 samples/vfio-mdev/mtty.c                         | 2 +-
 sound/sound_core.c                               | 2 +-
 tools/testing/nvdimm/test/ndtest.c               | 2 +-
 tools/testing/nvdimm/test/nfit.c                 | 2 +-
 140 files changed, 144 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c
index dc023a979803..f88b7919f11f 100644
--- a/arch/mips/kernel/mips-mt.c
+++ b/arch/mips/kernel/mips-mt.c
@@ -234,7 +234,7 @@ static int __init mips_mt_init(void)
 {
 	struct class *mtc;
 
-	mtc = class_create(THIS_MODULE, "mt");
+	mtc = class_create("mt");
 	if (IS_ERR(mtc))
 		return PTR_ERR(mtc);
 
diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c
index bc47681e825a..ac376dfb4e7c 100644
--- a/arch/mips/sibyte/common/sb_tbprof.c
+++ b/arch/mips/sibyte/common/sb_tbprof.c
@@ -550,7 +550,7 @@ static int __init sbprof_tb_init(void)
 		return -EIO;
 	}
 
-	tbc = class_create(THIS_MODULE, "sb_tracebuffer");
+	tbc = class_create("sb_tracebuffer");
 	if (IS_ERR(tbc)) {
 		err = PTR_ERR(tbc);
 		goto out_chrdev;
diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c
index 36c21648d19a..77ea9335fd04 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -581,7 +581,7 @@ int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type,
 	pr_devel("%s device allocated, dev [%i,%i]\n", name,
 			MAJOR(coproc_device.devt), MINOR(coproc_device.devt));
 
-	coproc_device.class = class_create(mod, name);
+	coproc_device.class = class_create(name);
 	if (IS_ERR(coproc_device.class)) {
 		rc = PTR_ERR(coproc_device.class);
 		pr_err("Unable to create %s class %d\n", name, rc);
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 524f8ff3e69c..458cb7419502 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1580,7 +1580,7 @@ int rdt_pseudo_lock_init(void)
 
 	pseudo_lock_major = ret;
 
-	pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
+	pseudo_lock_class = class_create("pseudo_lock");
 	if (IS_ERR(pseudo_lock_class)) {
 		ret = PTR_ERR(pseudo_lock_class);
 		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 621ba9c0f17a..bdc0d5539b57 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -154,7 +154,7 @@ static int __init cpuid_init(void)
 		       CPUID_MAJOR);
 		return -EBUSY;
 	}
-	cpuid_class = class_create(THIS_MODULE, "cpuid");
+	cpuid_class = class_create("cpuid");
 	if (IS_ERR(cpuid_class)) {
 		err = PTR_ERR(cpuid_class);
 		goto out_chrdev;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 708751311786..7bb17d37db01 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -263,7 +263,7 @@ static int __init msr_init(void)
 		pr_err("unable to get major %d for msr\n", MSR_MAJOR);
 		return -EBUSY;
 	}
-	msr_class = class_create(THIS_MODULE, "msr");
+	msr_class = class_create("msr");
 	if (IS_ERR(msr_class)) {
 		err = PTR_ERR(msr_class);
 		goto out_chrdev;
diff --git a/block/bsg.c b/block/bsg.c
index 30fcc865ef4f..7eca43f33d7f 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -245,7 +245,7 @@ static int __init bsg_init(void)
 	dev_t devid;
 	int ret;
 
-	bsg_class = class_create(THIS_MODULE, "bsg");
+	bsg_class = class_create("bsg");
 	if (IS_ERR(bsg_class))
 		return PTR_ERR(bsg_class);
 	bsg_class->devnode = bsg_devnode;
diff --git a/drivers/accel/drm_accel.c b/drivers/accel/drm_accel.c
index 1b69824286fd..4a9baf02439e 100644
--- a/drivers/accel/drm_accel.c
+++ b/drivers/accel/drm_accel.c
@@ -34,7 +34,7 @@ static char *accel_devnode(const struct device *dev, umode_t *mode)
 
 static int accel_sysfs_init(void)
 {
-	accel_class = class_create(THIS_MODULE, "accel");
+	accel_class = class_create("accel");
 	if (IS_ERR(accel_class))
 		return PTR_ERR(accel_class);
 
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index 03dae57dc838..3538e89e4e5e 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -702,7 +702,7 @@ static int __init hl_init(void)
 
 	hl_major = MAJOR(dev);
 
-	hl_class = class_create(THIS_MODULE, HL_NAME);
+	hl_class = class_create(HL_NAME);
 	if (IS_ERR(hl_class)) {
 		pr_err("failed to allocate class\n");
 		rc = PTR_ERR(hl_class);
diff --git a/drivers/base/power/wakeup_stats.c b/drivers/base/power/wakeup_stats.c
index 924fac493c4f..6732ed2869f9 100644
--- a/drivers/base/power/wakeup_stats.c
+++ b/drivers/base/power/wakeup_stats.c
@@ -210,7 +210,7 @@ void wakeup_source_sysfs_remove(struct wakeup_source *ws)
 
 static int __init wakeup_sources_sysfs_init(void)
 {
-	wakeup_class = class_create(THIS_MODULE, "wakeup");
+	wakeup_class = class_create("wakeup");
 
 	return PTR_ERR_OR_ZERO(wakeup_class);
 }
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 7a368c90467d..4c666f72203f 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -290,7 +290,7 @@ aoechr_init(void)
 	}
 	init_completion(&emsgs_comp);
 	spin_lock_init(&emsgs_lock);
-	aoe_class = class_create(THIS_MODULE, "aoe");
+	aoe_class = class_create("aoe");
 	if (IS_ERR(aoe_class)) {
 		unregister_chrdev(AOE_MAJOR, "aoechr");
 		return PTR_ERR(aoe_class);
diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c
index e7c7d9a68168..8c6087949794 100644
--- a/drivers/block/rnbd/rnbd-clt-sysfs.c
+++ b/drivers/block/rnbd/rnbd-clt-sysfs.c
@@ -646,7 +646,7 @@ int rnbd_clt_create_sysfs_files(void)
 {
 	int err;
 
-	rnbd_dev_class = class_create(THIS_MODULE, "rnbd-client");
+	rnbd_dev_class = class_create("rnbd-client");
 	if (IS_ERR(rnbd_dev_class))
 		return PTR_ERR(rnbd_dev_class);
 
diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c
index 297a6924ff4e..d5d9267e1fa5 100644
--- a/drivers/block/rnbd/rnbd-srv-sysfs.c
+++ b/drivers/block/rnbd/rnbd-srv-sysfs.c
@@ -215,7 +215,7 @@ int rnbd_srv_create_sysfs_files(void)
 {
 	int err;
 
-	rnbd_dev_class = class_create(THIS_MODULE, "rnbd-server");
+	rnbd_dev_class = class_create("rnbd-server");
 	if (IS_ERR(rnbd_dev_class))
 		return PTR_ERR(rnbd_dev_class);
 
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index d1d1c8d606c8..2eb46419562b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2266,7 +2266,7 @@ static int __init ublk_init(void)
 	if (ret)
 		goto unregister_mis;
 
-	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
+	ublk_chr_class = class_create("ublk-char");
 	if (IS_ERR(ublk_chr_class)) {
 		ret = PTR_ERR(ublk_chr_class);
 		goto free_chrdev_region;
diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c
index d5f943938427..ff429ba02fa4 100644
--- a/drivers/char/bsr.c
+++ b/drivers/char/bsr.c
@@ -293,7 +293,7 @@ static int __init bsr_init(void)
 	if (!np)
 		goto out_err;
 
-	bsr_class = class_create(THIS_MODULE, "bsr");
+	bsr_class = class_create("bsr");
 	if (IS_ERR(bsr_class)) {
 		printk(KERN_ERR "class_create() failed for bsr_class\n");
 		ret = PTR_ERR(bsr_class);
diff --git a/drivers/char/dsp56k.c b/drivers/char/dsp56k.c
index 06749e295ada..b3eaf3e5ef2e 100644
--- a/drivers/char/dsp56k.c
+++ b/drivers/char/dsp56k.c
@@ -504,7 +504,7 @@ static int __init dsp56k_init_driver(void)
 		printk("DSP56k driver: Unable to register driver\n");
 		return -ENODEV;
 	}
-	dsp56k_class = class_create(THIS_MODULE, "dsp56k");
+	dsp56k_class = class_create("dsp56k");
 	if (IS_ERR(dsp56k_class)) {
 		err = PTR_ERR(dsp56k_class);
 		goto out_chrdev;
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index d160fa4c73fe..73e5a9e28f85 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -860,7 +860,7 @@ static int __init init_ipmi_devintf(void)
 
 	pr_info("ipmi device interface\n");
 
-	ipmi_class = class_create(THIS_MODULE, "ipmi");
+	ipmi_class = class_create("ipmi");
 	if (IS_ERR(ipmi_class)) {
 		pr_err("ipmi: can't register device class\n");
 		return PTR_ERR(ipmi_class);
diff --git a/drivers/char/lp.c b/drivers/char/lp.c
index 38aad99ebb61..70cfc5140c2c 100644
--- a/drivers/char/lp.c
+++ b/drivers/char/lp.c
@@ -1049,7 +1049,7 @@ static int __init lp_init(void)
 		return -EIO;
 	}
 
-	lp_class = class_create(THIS_MODULE, "printer");
+	lp_class = class_create("printer");
 	if (IS_ERR(lp_class)) {
 		err = PTR_ERR(lp_class);
 		goto out_reg;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index ffb101d349f0..f494d31f2b98 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -762,7 +762,7 @@ static int __init chr_dev_init(void)
 	if (register_chrdev(MEM_MAJOR, "mem", &memory_fops))
 		printk("unable to get major %d for memory devs\n", MEM_MAJOR);
 
-	mem_class = class_create(THIS_MODULE, "mem");
+	mem_class = class_create("mem");
 	if (IS_ERR(mem_class))
 		return PTR_ERR(mem_class);
 
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 7a1388b0572b..1c44c29a666e 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -286,7 +286,7 @@ static int __init misc_init(void)
 	struct proc_dir_entry *ret;
 
 	ret = proc_create_seq("misc", 0, NULL, &misc_seq_ops);
-	misc_class = class_create(THIS_MODULE, "misc");
+	misc_class = class_create("misc");
 	err = PTR_ERR(misc_class);
 	if (IS_ERR(misc_class))
 		goto fail_remove;
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index e656f42a28ac..7f96d8571a53 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1878,7 +1878,7 @@ static int __init cmm_init(void)
 {
 	int rc;
 
-	cmm_class = class_create(THIS_MODULE, "cardman_4000");
+	cmm_class = class_create("cardman_4000");
 	if (IS_ERR(cmm_class))
 		return PTR_ERR(cmm_class);
 
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index 827711911da4..11ff59e2b963 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -650,7 +650,7 @@ static int __init cm4040_init(void)
 {
 	int rc;
 
-	cmx_class = class_create(THIS_MODULE, "cardman_4040");
+	cmx_class = class_create("cardman_4040");
 	if (IS_ERR(cmx_class))
 		return PTR_ERR(cmx_class);
 
diff --git a/drivers/char/pcmcia/scr24x_cs.c b/drivers/char/pcmcia/scr24x_cs.c
index 1bdce08fae3d..870781f5a08c 100644
--- a/drivers/char/pcmcia/scr24x_cs.c
+++ b/drivers/char/pcmcia/scr24x_cs.c
@@ -325,7 +325,7 @@ static int __init scr24x_init(void)
 {
 	int ret;
 
-	scr24x_class = class_create(THIS_MODULE, "scr24x");
+	scr24x_class = class_create("scr24x");
 	if (IS_ERR(scr24x_class))
 		return PTR_ERR(scr24x_class);
 
diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c
index 38b46c7d1737..81ed58157b15 100644
--- a/drivers/char/ppdev.c
+++ b/drivers/char/ppdev.c
@@ -841,7 +841,7 @@ static int __init ppdev_init(void)
 		pr_warn(CHRDEV ": unable to get major %d\n", PP_MAJOR);
 		return -EIO;
 	}
-	ppdev_class = class_create(THIS_MODULE, CHRDEV);
+	ppdev_class = class_create(CHRDEV);
 	if (IS_ERR(ppdev_class)) {
 		err = PTR_ERR(ppdev_class);
 		goto out_chrdev;
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 7e513b771832..8763c820d1f8 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -466,13 +466,13 @@ static int __init tpm_init(void)
 {
 	int rc;
 
-	tpm_class = class_create(THIS_MODULE, "tpm");
+	tpm_class = class_create("tpm");
 	if (IS_ERR(tpm_class)) {
 		pr_err("couldn't create tpm class\n");
 		return PTR_ERR(tpm_class);
 	}
 
-	tpmrm_class = class_create(THIS_MODULE, "tpmrm");
+	tpmrm_class = class_create("tpmrm");
 	if (IS_ERR(tpmrm_class)) {
 		pr_err("couldn't create tpmrm class\n");
 		rc = PTR_ERR(tpmrm_class);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index d5ac4d955bc8..b65c809a4e97 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -2244,7 +2244,7 @@ static int __init virtio_console_init(void)
 {
 	int err;
 
-	pdrvdata.class = class_create(THIS_MODULE, "virtio-ports");
+	pdrvdata.class = class_create("virtio-ports");
 	if (IS_ERR(pdrvdata.class)) {
 		err = PTR_ERR(pdrvdata.class);
 		pr_err("Error %d creating virtio-ports class\n", err);
diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
index 74a4928aea1d..a46f637da959 100644
--- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c
+++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
@@ -856,7 +856,7 @@ static int __init hwicap_module_init(void)
 	dev_t devt;
 	int retval;
 
-	icap_class = class_create(THIS_MODULE, "xilinx_config");
+	icap_class = class_create("xilinx_config");
 	mutex_init(&icap_sem);
 
 	devt = MKDEV(XHWICAP_MAJOR, XHWICAP_MINOR);
diff --git a/drivers/char/xillybus/xillybus_class.c b/drivers/char/xillybus/xillybus_class.c
index e9a288e61c15..89926fe9d813 100644
--- a/drivers/char/xillybus/xillybus_class.c
+++ b/drivers/char/xillybus/xillybus_class.c
@@ -242,7 +242,7 @@ EXPORT_SYMBOL(xillybus_find_inode);
 
 static int __init xillybus_class_init(void)
 {
-	xillybus_class = class_create(THIS_MODULE, "xillybus");
+	xillybus_class = class_create("xillybus");
 
 	if (IS_ERR(xillybus_class)) {
 		pr_warn("Failed to register xillybus class\n");
diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index b982903aaa46..8e43918d38c4 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -3383,7 +3383,7 @@ static int __init comedi_init(void)
 	if (retval)
 		goto out_unregister_chrdev_region;
 
-	comedi_class = class_create(THIS_MODULE, "comedi");
+	comedi_class = class_create("comedi");
 	if (IS_ERR(comedi_class)) {
 		retval = PTR_ERR(comedi_class);
 		pr_err("failed to create class\n");
diff --git a/drivers/comedi/drivers/comedi_test.c b/drivers/comedi/drivers/comedi_test.c
index 0b5c0af1cebf..c02dc19a679b 100644
--- a/drivers/comedi/drivers/comedi_test.c
+++ b/drivers/comedi/drivers/comedi_test.c
@@ -795,7 +795,7 @@ static int __init comedi_test_init(void)
 	}
 
 	if (!config_mode) {
-		ctcls = class_create(THIS_MODULE, CLASS_NAME);
+		ctcls = class_create(CLASS_NAME);
 		if (IS_ERR(ctcls)) {
 			pr_warn("comedi_test: unable to create class\n");
 			goto clean3;
diff --git a/drivers/crypto/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/qat/qat_common/adf_ctl_drv.c
index 9190532b27eb..75a50fde5644 100644
--- a/drivers/crypto/qat/qat_common/adf_ctl_drv.c
+++ b/drivers/crypto/qat/qat_common/adf_ctl_drv.c
@@ -56,7 +56,7 @@ static int adf_chr_drv_create(void)
 		return -EFAULT;
 	}
 
-	adf_ctl_drv.drv_class = class_create(THIS_MODULE, DEVICE_NAME);
+	adf_ctl_drv.drv_class = class_create(DEVICE_NAME);
 	if (IS_ERR(adf_ctl_drv.drv_class)) {
 		pr_err("QAT: class_create failed for adf_ctl\n");
 		goto err_chrdev_unreg;
diff --git a/drivers/dca/dca-sysfs.c b/drivers/dca/dca-sysfs.c
index 21ebd0af268b..fcc83ede0909 100644
--- a/drivers/dca/dca-sysfs.c
+++ b/drivers/dca/dca-sysfs.c
@@ -74,7 +74,7 @@ int __init dca_sysfs_init(void)
 	idr_init(&dca_idr);
 	spin_lock_init(&dca_idr_lock);
 
-	dca_class = class_create(THIS_MODULE, "dca");
+	dca_class = class_create("dca");
 	if (IS_ERR(dca_class)) {
 		idr_destroy(&dca_idr);
 		return PTR_ERR(dca_class);
diff --git a/drivers/devfreq/devfreq-event.c b/drivers/devfreq/devfreq-event.c
index f041edccd107..3ebac2496679 100644
--- a/drivers/devfreq/devfreq-event.c
+++ b/drivers/devfreq/devfreq-event.c
@@ -469,7 +469,7 @@ ATTRIBUTE_GROUPS(devfreq_event);
 
 static int __init devfreq_event_init(void)
 {
-	devfreq_event_class = class_create(THIS_MODULE, "devfreq-event");
+	devfreq_event_class = class_create("devfreq-event");
 	if (IS_ERR(devfreq_event_class)) {
 		pr_err("%s: couldn't create class\n", __FILE__);
 		return PTR_ERR(devfreq_event_class);
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 817c71da391a..e36cbb920ec8 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -1988,7 +1988,7 @@ DEFINE_SHOW_ATTRIBUTE(devfreq_summary);
 
 static int __init devfreq_init(void)
 {
-	devfreq_class = class_create(THIS_MODULE, "devfreq");
+	devfreq_class = class_create("devfreq");
 	if (IS_ERR(devfreq_class)) {
 		pr_err("%s: couldn't create class\n", __FILE__);
 		return PTR_ERR(devfreq_class);
diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index c9e41e8a9e27..84ae708fafe7 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -314,7 +314,7 @@ static int dma_heap_init(void)
 	if (ret)
 		return ret;
 
-	dma_heap_class = class_create(THIS_MODULE, DEVNAME);
+	dma_heap_class = class_create(DEVNAME);
 	if (IS_ERR(dma_heap_class)) {
 		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
 		return PTR_ERR(dma_heap_class);
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index e1c71359b605..d43ba8e7260d 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -1013,7 +1013,7 @@ ATTRIBUTE_GROUPS(extcon);
 static int create_extcon_class(void)
 {
 	if (!extcon_class) {
-		extcon_class = class_create(THIS_MODULE, "extcon");
+		extcon_class = class_create("extcon");
 		if (IS_ERR(extcon_class))
 			return PTR_ERR(extcon_class);
 		extcon_class->dev_groups = extcon_groups;
diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c
index 5cd40acab5bf..6a521c83a1ed 100644
--- a/drivers/fpga/fpga-bridge.c
+++ b/drivers/fpga/fpga-bridge.c
@@ -416,7 +416,7 @@ static void fpga_bridge_dev_release(struct device *dev)
 
 static int __init fpga_bridge_dev_init(void)
 {
-	fpga_bridge_class = class_create(THIS_MODULE, "fpga_bridge");
+	fpga_bridge_class = class_create("fpga_bridge");
 	if (IS_ERR(fpga_bridge_class))
 		return PTR_ERR(fpga_bridge_class);
 
diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
index 8efa67620e21..eb583f86a0b9 100644
--- a/drivers/fpga/fpga-mgr.c
+++ b/drivers/fpga/fpga-mgr.c
@@ -971,7 +971,7 @@ static int __init fpga_mgr_class_init(void)
 {
 	pr_info("FPGA manager framework\n");
 
-	fpga_mgr_class = class_create(THIS_MODULE, "fpga_manager");
+	fpga_mgr_class = class_create("fpga_manager");
 	if (IS_ERR(fpga_mgr_class))
 		return PTR_ERR(fpga_mgr_class);
 
diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
index 27ff9dea04ae..ccf6fdab1360 100644
--- a/drivers/fpga/fpga-region.c
+++ b/drivers/fpga/fpga-region.c
@@ -293,7 +293,7 @@ static void fpga_region_dev_release(struct device *dev)
  */
 static int __init fpga_region_init(void)
 {
-	fpga_region_class = class_create(THIS_MODULE, "fpga_region");
+	fpga_region_class = class_create("fpga_region");
 	if (IS_ERR(fpga_region_class))
 		return PTR_ERR(fpga_region_class);
 
diff --git a/drivers/gnss/core.c b/drivers/gnss/core.c
index 77a4b280c552..48f2ee0f78c4 100644
--- a/drivers/gnss/core.c
+++ b/drivers/gnss/core.c
@@ -387,7 +387,7 @@ static int __init gnss_module_init(void)
 		return ret;
 	}
 
-	gnss_class = class_create(THIS_MODULE, "gnss");
+	gnss_class = class_create("gnss");
 	if (IS_ERR(gnss_class)) {
 		ret = PTR_ERR(gnss_class);
 		pr_err("failed to create class: %d\n", ret);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a0e30f21e12e..20e75bd74bed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -93,7 +93,7 @@ int kfd_chardev_init(void)
 	if (err < 0)
 		goto err_register_chrdev;
 
-	kfd_class = class_create(THIS_MODULE, kfd_dev_name);
+	kfd_class = class_create(kfd_dev_name);
 	err = PTR_ERR(kfd_class);
 	if (IS_ERR(kfd_class))
 		goto err_class_create;
diff --git a/drivers/gpu/drm/display/drm_dp_aux_dev.c b/drivers/gpu/drm/display/drm_dp_aux_dev.c
index 098e482e65a2..29555b9f03c8 100644
--- a/drivers/gpu/drm/display/drm_dp_aux_dev.c
+++ b/drivers/gpu/drm/display/drm_dp_aux_dev.c
@@ -330,7 +330,7 @@ int drm_dp_aux_dev_init(void)
 {
 	int res;
 
-	drm_dp_aux_dev_class = class_create(THIS_MODULE, "drm_dp_aux_dev");
+	drm_dp_aux_dev_class = class_create("drm_dp_aux_dev");
 	if (IS_ERR(drm_dp_aux_dev_class)) {
 		return PTR_ERR(drm_dp_aux_dev_class);
 	}
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 183130355997..3c22a803201d 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -112,7 +112,7 @@ int drm_sysfs_init(void)
 {
 	int err;
 
-	drm_class = class_create(THIS_MODULE, "drm");
+	drm_class = class_create("drm");
 	if (IS_ERR(drm_class))
 		return PTR_ERR(drm_class);
 
diff --git a/drivers/hid/hid-roccat-arvo.c b/drivers/hid/hid-roccat-arvo.c
index d94ee0539421..ea6b79b3aeeb 100644
--- a/drivers/hid/hid-roccat-arvo.c
+++ b/drivers/hid/hid-roccat-arvo.c
@@ -433,7 +433,7 @@ static int __init arvo_init(void)
 {
 	int retval;
 
-	arvo_class = class_create(THIS_MODULE, "arvo");
+	arvo_class = class_create("arvo");
 	if (IS_ERR(arvo_class))
 		return PTR_ERR(arvo_class);
 	arvo_class->dev_groups = arvo_groups;
diff --git a/drivers/hid/hid-roccat-isku.c b/drivers/hid/hid-roccat-isku.c
index e95d59cd8d07..3903a2cea00c 100644
--- a/drivers/hid/hid-roccat-isku.c
+++ b/drivers/hid/hid-roccat-isku.c
@@ -435,7 +435,7 @@ static struct hid_driver isku_driver = {
 static int __init isku_init(void)
 {
 	int retval;
-	isku_class = class_create(THIS_MODULE, "isku");
+	isku_class = class_create("isku");
 	if (IS_ERR(isku_class))
 		return PTR_ERR(isku_class);
 	isku_class->dev_groups = isku_groups;
diff --git a/drivers/hid/hid-roccat-kone.c b/drivers/hid/hid-roccat-kone.c
index 76da04801ca9..945ae236fb45 100644
--- a/drivers/hid/hid-roccat-kone.c
+++ b/drivers/hid/hid-roccat-kone.c
@@ -890,7 +890,7 @@ static int __init kone_init(void)
 	int retval;
 
 	/* class name has to be same as driver name */
-	kone_class = class_create(THIS_MODULE, "kone");
+	kone_class = class_create("kone");
 	if (IS_ERR(kone_class))
 		return PTR_ERR(kone_class);
 	kone_class->dev_groups = kone_groups;
diff --git a/drivers/hid/hid-roccat-koneplus.c b/drivers/hid/hid-roccat-koneplus.c
index 1896c69ea512..97b83b6f53dd 100644
--- a/drivers/hid/hid-roccat-koneplus.c
+++ b/drivers/hid/hid-roccat-koneplus.c
@@ -549,7 +549,7 @@ static int __init koneplus_init(void)
 	int retval;
 
 	/* class name has to be same as driver name */
-	koneplus_class = class_create(THIS_MODULE, "koneplus");
+	koneplus_class = class_create("koneplus");
 	if (IS_ERR(koneplus_class))
 		return PTR_ERR(koneplus_class);
 	koneplus_class->dev_groups = koneplus_groups;
diff --git a/drivers/hid/hid-roccat-konepure.c b/drivers/hid/hid-roccat-konepure.c
index cf8eeb33a125..a297756f2410 100644
--- a/drivers/hid/hid-roccat-konepure.c
+++ b/drivers/hid/hid-roccat-konepure.c
@@ -207,7 +207,7 @@ static int __init konepure_init(void)
 {
 	int retval;
 
-	konepure_class = class_create(THIS_MODULE, "konepure");
+	konepure_class = class_create("konepure");
 	if (IS_ERR(konepure_class))
 		return PTR_ERR(konepure_class);
 	konepure_class->dev_groups = konepure_groups;
diff --git a/drivers/hid/hid-roccat-kovaplus.c b/drivers/hid/hid-roccat-kovaplus.c
index 6fb9b9563769..1a1d96e11683 100644
--- a/drivers/hid/hid-roccat-kovaplus.c
+++ b/drivers/hid/hid-roccat-kovaplus.c
@@ -638,7 +638,7 @@ static int __init kovaplus_init(void)
 {
 	int retval;
 
-	kovaplus_class = class_create(THIS_MODULE, "kovaplus");
+	kovaplus_class = class_create("kovaplus");
 	if (IS_ERR(kovaplus_class))
 		return PTR_ERR(kovaplus_class);
 	kovaplus_class->dev_groups = kovaplus_groups;
diff --git a/drivers/hid/hid-roccat-pyra.c b/drivers/hid/hid-roccat-pyra.c
index 4fcc8e7d276f..15528c3b013c 100644
--- a/drivers/hid/hid-roccat-pyra.c
+++ b/drivers/hid/hid-roccat-pyra.c
@@ -585,7 +585,7 @@ static int __init pyra_init(void)
 	int retval;
 
 	/* class name has to be same as driver name */
-	pyra_class = class_create(THIS_MODULE, "pyra");
+	pyra_class = class_create("pyra");
 	if (IS_ERR(pyra_class))
 		return PTR_ERR(pyra_class);
 	pyra_class->dev_groups = pyra_groups;
diff --git a/drivers/hid/hid-roccat-ryos.c b/drivers/hid/hid-roccat-ryos.c
index 5bf1971a2b14..0eb17a3b925d 100644
--- a/drivers/hid/hid-roccat-ryos.c
+++ b/drivers/hid/hid-roccat-ryos.c
@@ -216,7 +216,7 @@ static int __init ryos_init(void)
 {
 	int retval;
 
-	ryos_class = class_create(THIS_MODULE, "ryos");
+	ryos_class = class_create("ryos");
 	if (IS_ERR(ryos_class))
 		return PTR_ERR(ryos_class);
 	ryos_class->dev_groups = ryos_groups;
diff --git a/drivers/hid/hid-roccat-savu.c b/drivers/hid/hid-roccat-savu.c
index a784bb4ee651..93be7acef673 100644
--- a/drivers/hid/hid-roccat-savu.c
+++ b/drivers/hid/hid-roccat-savu.c
@@ -204,7 +204,7 @@ static int __init savu_init(void)
 {
 	int retval;
 
-	savu_class = class_create(THIS_MODULE, "savu");
+	savu_class = class_create("savu");
 	if (IS_ERR(savu_class))
 		return PTR_ERR(savu_class);
 	savu_class->dev_groups = savu_groups;
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index 197b1e7bf029..93e62b161501 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -618,7 +618,7 @@ int __init hidraw_init(void)
 
 	hidraw_major = MAJOR(dev_id);
 
-	hidraw_class = class_create(THIS_MODULE, "hidraw");
+	hidraw_class = class_create("hidraw");
 	if (IS_ERR(hidraw_class)) {
 		result = PTR_ERR(hidraw_class);
 		goto error_cdev;
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 107623c4cc14..ef93064cfdb3 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -739,7 +739,7 @@ static int __init i2c_dev_init(void)
 	if (res)
 		goto out;
 
-	i2c_dev_class = class_create(THIS_MODULE, "i2c-dev");
+	i2c_dev_class = class_create("i2c-dev");
 	if (IS_ERR(i2c_dev_class)) {
 		res = PTR_ERR(i2c_dev_class);
 		goto out_unreg_chrdev;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index bdb179a09d77..fbace69672ca 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1264,7 +1264,7 @@ static int __init ib_uverbs_init(void)
 		goto out_alloc;
 	}
 
-	uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+	uverbs_class = class_create("infiniband_verbs");
 	if (IS_ERR(uverbs_class)) {
 		ret = PTR_ERR(uverbs_class);
 		pr_err("user_verbs: couldn't create class infiniband_verbs\n");
diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c
index 1f4496032170..05be0d119f79 100644
--- a/drivers/infiniband/hw/hfi1/device.c
+++ b/drivers/infiniband/hw/hfi1/device.c
@@ -102,7 +102,7 @@ int __init dev_init(void)
 		goto done;
 	}
 
-	class = class_create(THIS_MODULE, class_name());
+	class = class_create(class_name());
 	if (IS_ERR(class)) {
 		ret = PTR_ERR(class);
 		pr_err("Could not create device class (err %d)\n", -ret);
@@ -111,7 +111,7 @@ int __init dev_init(void)
 	}
 	class->devnode = hfi1_devnode;
 
-	user_class = class_create(THIS_MODULE, class_name_user());
+	user_class = class_create(class_name_user());
 	if (IS_ERR(user_class)) {
 		ret = PTR_ERR(user_class);
 		pr_err("Could not create device class for user accessible files (err %d)\n",
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 80fe92a21f96..c07c95fd3cfa 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -2326,7 +2326,7 @@ int __init qib_dev_init(void)
 		goto done;
 	}
 
-	qib_class = class_create(THIS_MODULE, "ipath");
+	qib_class = class_create("ipath");
 	if (IS_ERR(qib_class)) {
 		ret = PTR_ERR(qib_class);
 		pr_err("Could not create device class (err %d)\n", -ret);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index 80abf45a197a..edb2e3a25880 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -3163,7 +3163,7 @@ static int __init rtrs_client_init(void)
 {
 	rtrs_rdma_dev_pd_init(0, &dev_pd);
 
-	rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client");
+	rtrs_clt_dev_class = class_create("rtrs-client");
 	if (IS_ERR(rtrs_clt_dev_class)) {
 		pr_err("Failed to create rtrs-client dev class\n");
 		return PTR_ERR(rtrs_clt_dev_class);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index d1703e2c0b82..c38901e2c8f4 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -2253,7 +2253,7 @@ static int __init rtrs_server_init(void)
 		       err);
 		return err;
 	}
-	rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server");
+	rtrs_dev_class = class_create("rtrs-server");
 	if (IS_ERR(rtrs_dev_class)) {
 		err = PTR_ERR(rtrs_dev_class);
 		goto out_err;
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 0f00be62438d..45a4043c5042 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1393,7 +1393,7 @@ static int __init capi_init(void)
 		kcapi_exit();
 		return major_ret;
 	}
-	capi_class = class_create(THIS_MODULE, "capi");
+	capi_class = class_create("capi");
 	if (IS_ERR(capi_class)) {
 		unregister_chrdev(capi_major, "capi20");
 		kcapi_exit();
diff --git a/drivers/isdn/mISDN/dsp_pipeline.c b/drivers/isdn/mISDN/dsp_pipeline.c
index cfbcd9e973c2..09b72f14d4b7 100644
--- a/drivers/isdn/mISDN/dsp_pipeline.c
+++ b/drivers/isdn/mISDN/dsp_pipeline.c
@@ -131,7 +131,7 @@ EXPORT_SYMBOL(mISDN_dsp_element_unregister);
 
 int dsp_pipeline_module_init(void)
 {
-	elements_class = class_create(THIS_MODULE, "dsp_pipeline");
+	elements_class = class_create("dsp_pipeline");
 	if (IS_ERR(elements_class))
 		return PTR_ERR(elements_class);
 
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index a6b3adcd044a..9255bc11f99d 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -626,7 +626,7 @@ EXPORT_SYMBOL_GPL(devm_led_classdev_unregister);
 
 static int __init leds_init(void)
 {
-	leds_class = class_create(THIS_MODULE, "leds");
+	leds_class = class_create("leds");
 	if (IS_ERR(leds_class))
 		return PTR_ERR(leds_class);
 	leds_class->pm = &leds_class_dev_pm_ops;
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 23bd0c77ac1a..57e987cf84b2 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -888,7 +888,7 @@ adbdev_init(void)
 		return;
 	}
 
-	adb_dev_class = class_create(THIS_MODULE, "adb");
+	adb_dev_class = class_create("adb");
 	if (IS_ERR(adb_dev_class))
 		return;
 	device_create(adb_dev_class, NULL, MKDEV(ADB_MAJOR, 0), NULL, "adb");
diff --git a/drivers/media/dvb-core/dvbdev.c b/drivers/media/dvb-core/dvbdev.c
index 0ed087caf7f3..e9b3ce09e534 100644
--- a/drivers/media/dvb-core/dvbdev.c
+++ b/drivers/media/dvb-core/dvbdev.c
@@ -1063,7 +1063,7 @@ static int __init init_dvbdev(void)
 		goto error;
 	}
 
-	dvb_class = class_create(THIS_MODULE, "dvb");
+	dvb_class = class_create("dvb");
 	if (IS_ERR(dvb_class)) {
 		retval = PTR_ERR(dvb_class);
 		goto error;
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 25ab61dae126..043d23aaa3cb 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -785,7 +785,7 @@ int __init lirc_dev_init(void)
 {
 	int retval;
 
-	lirc_class = class_create(THIS_MODULE, "lirc");
+	lirc_class = class_create("lirc");
 	if (IS_ERR(lirc_class)) {
 		pr_err("class_create failed\n");
 		return PTR_ERR(lirc_class);
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index fb9a1b49ff6d..f574c83b82cf 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -977,7 +977,7 @@ static int __init c2port_init(void)
 	printk(KERN_INFO "Silicon Labs C2 port support v. " DRIVER_VERSION
 		" - (C) 2007 Rodolfo Giometti\n");
 
-	c2port_class = class_create(THIS_MODULE, "c2port");
+	c2port_class = class_create("c2port");
 	if (IS_ERR(c2port_class)) {
 		printk(KERN_ERR "c2port: failed to allocate class\n");
 		return PTR_ERR(c2port_class);
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 5878329b011a..144d1f2d78ce 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -678,7 +678,7 @@ int __init cxl_file_init(void)
 
 	pr_devel("CXL device allocated, MAJOR %i\n", MAJOR(cxl_dev));
 
-	cxl_class = class_create(THIS_MODULE, "cxl");
+	cxl_class = class_create("cxl");
 	if (IS_ERR(cxl_class)) {
 		pr_err("Unable to create CXL class\n");
 		rc = PTR_ERR(cxl_class);
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index 5b63d179b24e..02628288cd0f 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -1363,7 +1363,7 @@ static int __init genwqe_init_module(void)
 {
 	int rc;
 
-	class_genwqe = class_create(THIS_MODULE, GENWQE_DEVNAME);
+	class_genwqe = class_create(GENWQE_DEVNAME);
 	if (IS_ERR(class_genwqe)) {
 		pr_err("[%s] create class failed\n", __func__);
 		return -ENOMEM;
diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index 8d00df9243c4..2c3a991d6e88 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -888,7 +888,7 @@ static int __init ilo_init(void)
 	int error;
 	dev_t dev;
 
-	ilo_class = class_create(THIS_MODULE, "iLO");
+	ilo_class = class_create("iLO");
 	if (IS_ERR(ilo_class)) {
 		error = PTR_ERR(ilo_class);
 		goto out;
diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c
index 632d4ae21e46..76c771a424f7 100644
--- a/drivers/misc/mei/main.c
+++ b/drivers/misc/mei/main.c
@@ -1275,7 +1275,7 @@ static int __init mei_init(void)
 {
 	int ret;
 
-	mei_class = class_create(THIS_MODULE, "mei");
+	mei_class = class_create("mei");
 	if (IS_ERR(mei_class)) {
 		pr_err("couldn't create class\n");
 		ret = PTR_ERR(mei_class);
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index 3b058654b45b..6e63f060e4cc 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -601,7 +601,7 @@ int ocxl_file_init(void)
 		return rc;
 	}
 
-	ocxl_class = class_create(THIS_MODULE, "ocxl");
+	ocxl_class = class_create("ocxl");
 	if (IS_ERR(ocxl_class)) {
 		pr_err("Unable to create ocxl class\n");
 		unregister_chrdev_region(ocxl_dev, OCXL_NUM_MINORS);
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index ce72e46a2e73..7966a6b8b5b3 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -503,7 +503,7 @@ static int __init phantom_init(void)
 	int retval;
 	dev_t dev;
 
-	phantom_class = class_create(THIS_MODULE, "phantom");
+	phantom_class = class_create("phantom");
 	if (IS_ERR(phantom_class)) {
 		retval = PTR_ERR(phantom_class);
 		printk(KERN_ERR "phantom: can't register phantom class\n");
diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index 07023397afc7..346bd7cf2e94 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -620,7 +620,7 @@ static int __init uacce_init(void)
 {
 	int ret;
 
-	uacce_class = class_create(THIS_MODULE, UACCE_NAME);
+	uacce_class = class_create(UACCE_NAME);
 	if (IS_ERR(uacce_class))
 		return PTR_ERR(uacce_class);
 
diff --git a/drivers/most/most_cdev.c b/drivers/most/most_cdev.c
index 4ee536980f71..3ed8f461e01e 100644
--- a/drivers/most/most_cdev.c
+++ b/drivers/most/most_cdev.c
@@ -491,7 +491,7 @@ static int __init most_cdev_init(void)
 {
 	int err;
 
-	comp.class = class_create(THIS_MODULE, "most_cdev");
+	comp.class = class_create("most_cdev");
 	if (IS_ERR(comp.class))
 		return PTR_ERR(comp.class);
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c
index 9b26f0f2c748..8a1027ad340d 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.c
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.c
@@ -448,7 +448,7 @@ EXPORT_SYMBOL(hnae_ae_unregister);
 
 static int __init hnae_init(void)
 {
-	hnae_class = class_create(THIS_MODULE, "hnae");
+	hnae_class = class_create("hnae");
 	return PTR_ERR_OR_ZERO(hnae_class);
 }
 
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 1d71f5276241..a9beacd552cf 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1394,7 +1394,7 @@ static int __init ppp_init(void)
 		goto out_net;
 	}
 
-	ppp_class = class_create(THIS_MODULE, "ppp");
+	ppp_class = class_create("ppp");
 	if (IS_ERR(ppp_class)) {
 		err = PTR_ERR(ppp_class);
 		goto out_chrdev;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 4cc4eaf80b14..68a7e09a195e 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -5748,7 +5748,7 @@ static int __init init_mac80211_hwsim(void)
 	if (err)
 		goto out_exit_netlink;
 
-	hwsim_class = class_create(THIS_MODULE, "mac80211_hwsim");
+	hwsim_class = class_create("mac80211_hwsim");
 	if (IS_ERR(hwsim_class)) {
 		err = PTR_ERR(hwsim_class);
 		goto out_exit_virtio;
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 966d0ccd2276..51bbd6bb74b5 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -1174,7 +1174,7 @@ static int __init wwan_init(void)
 	if (err)
 		return err;
 
-	wwan_class = class_create(THIS_MODULE, "wwan");
+	wwan_class = class_create("wwan");
 	if (IS_ERR(wwan_class)) {
 		err = PTR_ERR(wwan_class);
 		goto unregister;
diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c
index 2397a903d8f5..edb5fc803211 100644
--- a/drivers/net/wwan/wwan_hwsim.c
+++ b/drivers/net/wwan/wwan_hwsim.c
@@ -511,7 +511,7 @@ static int __init wwan_hwsim_init(void)
 	if (!wwan_wq)
 		return -ENOMEM;
 
-	wwan_hwsim_class = class_create(THIS_MODULE, "wwan_hwsim");
+	wwan_hwsim_class = class_create("wwan_hwsim");
 	if (IS_ERR(wwan_hwsim_class)) {
 		err = PTR_ERR(wwan_hwsim_class);
 		goto err_wq_destroy;
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 4976a0069e9c..954dbc105fc8 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1320,7 +1320,7 @@ int __init nvdimm_bus_init(void)
 		goto err_dimm_chrdev;
 	nvdimm_major = rc;
 
-	nd_class = class_create(THIS_MODULE, "nd");
+	nd_class = class_create("nd");
 	if (IS_ERR(nd_class)) {
 		rc = PTR_ERR(nd_class);
 		goto err_class;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2730b116dc6..877a61fbd78b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5381,14 +5381,14 @@ static int __init nvme_core_init(void)
 	if (result < 0)
 		goto destroy_delete_wq;
 
-	nvme_class = class_create(THIS_MODULE, "nvme");
+	nvme_class = class_create("nvme");
 	if (IS_ERR(nvme_class)) {
 		result = PTR_ERR(nvme_class);
 		goto unregister_chrdev;
 	}
 	nvme_class->dev_uevent = nvme_class_uevent;
 
-	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
+	nvme_subsys_class = class_create("nvme-subsystem");
 	if (IS_ERR(nvme_subsys_class)) {
 		result = PTR_ERR(nvme_subsys_class);
 		goto destroy_class;
@@ -5399,7 +5399,7 @@ static int __init nvme_core_init(void)
 	if (result < 0)
 		goto destroy_subsys_class;
 
-	nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
+	nvme_ns_chr_class = class_create("nvme-generic");
 	if (IS_ERR(nvme_ns_chr_class)) {
 		result = PTR_ERR(nvme_ns_chr_class);
 		goto unregister_generic_ns;
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bbaa04a0c502..0069ebff85df 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1254,7 +1254,7 @@ static int __init nvmf_init(void)
 	if (!nvmf_default_host)
 		return -ENOMEM;
 
-	nvmf_class = class_create(THIS_MODULE, "nvme-fabrics");
+	nvmf_class = class_create("nvme-fabrics");
 	if (IS_ERR(nvmf_class)) {
 		pr_err("couldn't register class nvme-fabrics\n");
 		ret = PTR_ERR(nvmf_class);
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 5c16372f3b53..af0d01797e5e 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1562,7 +1562,7 @@ static int __init fcloop_init(void)
 {
 	int ret;
 
-	fcloop_class = class_create(THIS_MODULE, "fcloop");
+	fcloop_class = class_create("fcloop");
 	if (IS_ERR(fcloop_class)) {
 		pr_err("couldn't register class fcloop\n");
 		ret = PTR_ERR(fcloop_class);
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 9440d9811eea..46c9a5c3ca14 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -860,7 +860,7 @@ EXPORT_SYMBOL_GPL(__devm_pci_epc_create);
 
 static int __init pci_epc_init(void)
 {
-	pci_epc_class = class_create(THIS_MODULE, "pci_epc");
+	pci_epc_class = class_create("pci_epc");
 	if (IS_ERR(pci_epc_class)) {
 		pr_err("failed to create pci epc class --> %ld\n",
 		       PTR_ERR(pci_epc_class));
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 3d6f17ff2429..d837da055921 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -1804,7 +1804,7 @@ static int __init switchtec_init(void)
 	if (rc)
 		return rc;
 
-	switchtec_class = class_create(THIS_MODULE, "switchtec");
+	switchtec_class = class_create("switchtec");
 	if (IS_ERR(switchtec_class)) {
 		rc = PTR_ERR(switchtec_class);
 		goto err_create_class;
diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index 9951efc03eaa..6464dcb56d56 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -1233,7 +1233,7 @@ static void phy_release(struct device *dev)
 
 static int __init phy_core_init(void)
 {
-	phy_class = class_create(THIS_MODULE, "phy");
+	phy_class = class_create("phy");
 	if (IS_ERR(phy_class)) {
 		pr_err("failed to create phy class --> %ld\n",
 			PTR_ERR(phy_class));
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f3d7c1da299f..1bf393239d40 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1462,7 +1462,7 @@ EXPORT_SYMBOL_GPL(power_supply_get_drvdata);
 
 static int __init power_supply_class_init(void)
 {
-	power_supply_class = class_create(THIS_MODULE, "power_supply");
+	power_supply_class = class_create("power_supply");
 
 	if (IS_ERR(power_supply_class))
 		return PTR_ERR(power_supply_class);
diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c
index 22a65ad4e46e..5d19baae6a38 100644
--- a/drivers/pps/pps.c
+++ b/drivers/pps/pps.c
@@ -456,7 +456,7 @@ static int __init pps_init(void)
 {
 	int err;
 
-	pps_class = class_create(THIS_MODULE, "pps");
+	pps_class = class_create("pps");
 	if (IS_ERR(pps_class)) {
 		pr_err("failed to allocate class\n");
 		return PTR_ERR(pps_class);
diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 62d4d29e7c05..790f9250b381 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -460,7 +460,7 @@ static int __init ptp_init(void)
 {
 	int err;
 
-	ptp_class = class_create(THIS_MODULE, "ptp");
+	ptp_class = class_create("ptp");
 	if (IS_ERR(ptp_class)) {
 		pr_err("ptp: failed to allocate class\n");
 		return PTR_ERR(ptp_class);
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 43db495f1986..deb96c3160a7 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -2603,7 +2603,7 @@ static int __init mport_init(void)
 	int ret;
 
 	/* Create device class needed by udev */
-	dev_class = class_create(THIS_MODULE, DRV_NAME);
+	dev_class = class_create(DRV_NAME);
 	if (IS_ERR(dev_class)) {
 		rmcd_error("Unable to create " DRV_NAME " class");
 		return PTR_ERR(dev_class);
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index db4c265287ae..acaf9cda014c 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -2297,7 +2297,7 @@ static int __init riocm_init(void)
 	int ret;
 
 	/* Create device class needed by udev */
-	dev_class = class_create(THIS_MODULE, DRV_NAME);
+	dev_class = class_create(DRV_NAME);
 	if (IS_ERR(dev_class)) {
 		riocm_error("Cannot create " DRV_NAME " class");
 		return PTR_ERR(dev_class);
diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index a2207c0cf432..5039df757127 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -694,7 +694,7 @@ static int __init rpmsg_init(void)
 {
 	int ret;
 
-	rpmsg_class = class_create(THIS_MODULE, "rpmsg");
+	rpmsg_class = class_create("rpmsg");
 	if (IS_ERR(rpmsg_class)) {
 		pr_err("failed to create rpmsg class\n");
 		return PTR_ERR(rpmsg_class);
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index e5b7b48cffac..edfd942f8c54 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(devm_rtc_device_register);
 
 static int __init rtc_init(void)
 {
-	rtc_class = class_create(THIS_MODULE, "rtc");
+	rtc_class = class_create("rtc");
 	if (IS_ERR(rtc_class)) {
 		pr_err("couldn't create class\n");
 		return PTR_ERR(rtc_class);
diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c
index cb8fdf057eca..8d50c894711f 100644
--- a/drivers/s390/char/hmcdrv_dev.c
+++ b/drivers/s390/char/hmcdrv_dev.c
@@ -308,7 +308,7 @@ int hmcdrv_dev_init(void)
 	 * /proc/devices), but not under /dev nor /sys/devices/virtual. So
 	 * we have to create an associated class (see /sys/class).
 	 */
-	hmcdrv_dev_class = class_create(THIS_MODULE, HMCDRV_DEV_CLASS);
+	hmcdrv_dev_class = class_create(HMCDRV_DEV_CLASS);
 
 	if (IS_ERR(hmcdrv_dev_class)) {
 		rc = PTR_ERR(hmcdrv_dev_class);
diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c
index 09d7570d3b7d..7115c0f85650 100644
--- a/drivers/s390/char/raw3270.c
+++ b/drivers/s390/char/raw3270.c
@@ -1319,7 +1319,7 @@ static int raw3270_init(void)
 	if (rc == 0) {
 		/* Create attributes for early (= console) device. */
 		mutex_lock(&raw3270_mutex);
-		class3270 = class_create(THIS_MODULE, "3270");
+		class3270 = class_create("3270");
 		list_for_each_entry(rp, &raw3270_devices, list) {
 			get_device(&rp->cdev->dev);
 			raw3270_create_attributes(rp);
diff --git a/drivers/s390/char/tape_class.c b/drivers/s390/char/tape_class.c
index c21dc68e05a0..277a0f903d11 100644
--- a/drivers/s390/char/tape_class.c
+++ b/drivers/s390/char/tape_class.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(unregister_tape_dev);
 
 static int __init tape_init(void)
 {
-	tape_class = class_create(THIS_MODULE, "tape390");
+	tape_class = class_create("tape390");
 
 	return 0;
 }
diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c
index ed970ecfafdf..6946ba9a9de2 100644
--- a/drivers/s390/char/vmlogrdr.c
+++ b/drivers/s390/char/vmlogrdr.c
@@ -699,7 +699,7 @@ static int vmlogrdr_register_driver(void)
 	if (ret)
 		goto out_iucv;
 
-	vmlogrdr_class = class_create(THIS_MODULE, "vmlogrdr");
+	vmlogrdr_class = class_create("vmlogrdr");
 	if (IS_ERR(vmlogrdr_class)) {
 		ret = PTR_ERR(vmlogrdr_class);
 		vmlogrdr_class = NULL;
diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c
index 131293f7f152..82efdd20ad01 100644
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@ -1022,7 +1022,7 @@ static int __init ur_init(void)
 
 	debug_set_level(vmur_dbf, 6);
 
-	vmur_class = class_create(THIS_MODULE, "vmur");
+	vmur_class = class_create("vmur");
 	if (IS_ERR(vmur_class)) {
 		rc = PTR_ERR(vmur_class);
 		goto fail_free_dbf;
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 6fe05bb82c77..582ac301d315 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -2171,7 +2171,7 @@ static int __init zcdn_init(void)
 	int rc;
 
 	/* create a new class 'zcrypt' */
-	zcrypt_class = class_create(THIS_MODULE, ZCRYPT_NAME);
+	zcrypt_class = class_create(ZCRYPT_NAME);
 	if (IS_ERR(zcrypt_class)) {
 		rc = PTR_ERR(zcrypt_class);
 		goto out_class_create_failed;
diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c
index e300cf26bc2a..75701b165e91 100644
--- a/drivers/sbus/char/oradax.c
+++ b/drivers/sbus/char/oradax.c
@@ -323,7 +323,7 @@ static int __init dax_attach(void)
 		goto done;
 	}
 
-	cl = class_create(THIS_MODULE, DAX_NAME);
+	cl = class_create(DAX_NAME);
 	if (IS_ERR(cl)) {
 		dax_err("class_create failed");
 		ret = PTR_ERR(cl);
diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c
index 72fe6df78bc5..ac648bb8f7e7 100644
--- a/drivers/scsi/ch.c
+++ b/drivers/scsi/ch.c
@@ -995,7 +995,7 @@ static int __init init_ch_module(void)
 	int rc;
 
 	printk(KERN_INFO "SCSI Media Changer driver v" VERSION " \n");
-        ch_sysfs_class = class_create(THIS_MODULE, "scsi_changer");
+        ch_sysfs_class = class_create("scsi_changer");
         if (IS_ERR(ch_sysfs_class)) {
 		rc = PTR_ERR(ch_sysfs_class);
 		return rc;
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 395b00b942f7..debd36974119 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3880,7 +3880,7 @@ static int cxlflash_class_init(void)
 
 	cxlflash_major = MAJOR(devno);
 
-	cxlflash_class = class_create(THIS_MODULE, "cxlflash");
+	cxlflash_class = class_create("cxlflash");
 	if (IS_ERR(cxlflash_class)) {
 		rc = PTR_ERR(cxlflash_class);
 		pr_err("%s: class_create failed rc=%d\n", __func__, rc);
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 836ddc476764..7d57cc92532c 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -5346,7 +5346,7 @@ static int __init pmcraid_init(void)
 	}
 
 	pmcraid_major = MAJOR(dev);
-	pmcraid_class = class_create(THIS_MODULE, PMCRAID_DEVFILE);
+	pmcraid_class = class_create(PMCRAID_DEVFILE);
 
 	if (IS_ERR(pmcraid_class)) {
 		error = PTR_ERR(pmcraid_class);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index a91049213203..4997f880d4a4 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1677,7 +1677,7 @@ init_sg(void)
 				    SG_MAX_DEVS, "sg");
 	if (rc)
 		return rc;
-        sg_sysfs_class = class_create(THIS_MODULE, "scsi_generic");
+        sg_sysfs_class = class_create("scsi_generic");
         if ( IS_ERR(sg_sysfs_class) ) {
 		rc = PTR_ERR(sg_sysfs_class);
 		goto err_out;
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index 5a038c667401..539e70c7daed 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -877,7 +877,7 @@ static int __init spidev_init(void)
 	if (status < 0)
 		return status;
 
-	spidev_class = class_create(THIS_MODULE, "spidev");
+	spidev_class = class_create("spidev");
 	if (IS_ERR(spidev_class)) {
 		unregister_chrdev(SPIDEV_MAJOR, spidev_spi_driver.driver.name);
 		return PTR_ERR(spidev_class);
diff --git a/drivers/staging/fieldbus/anybuss/arcx-anybus.c b/drivers/staging/fieldbus/anybuss/arcx-anybus.c
index 9af2e63050d1..369400ec30d8 100644
--- a/drivers/staging/fieldbus/anybuss/arcx-anybus.c
+++ b/drivers/staging/fieldbus/anybuss/arcx-anybus.c
@@ -352,7 +352,7 @@ static int __init controller_init(void)
 {
 	int err;
 
-	controller_class = class_create(THIS_MODULE, "arcx_anybus_controller");
+	controller_class = class_create("arcx_anybus_controller");
 	if (IS_ERR(controller_class))
 		return PTR_ERR(controller_class);
 	err = platform_driver_register(&controller_driver);
diff --git a/drivers/staging/greybus/authentication.c b/drivers/staging/greybus/authentication.c
index 297e69f011c7..7e01790a4659 100644
--- a/drivers/staging/greybus/authentication.c
+++ b/drivers/staging/greybus/authentication.c
@@ -402,7 +402,7 @@ int cap_init(void)
 {
 	int ret;
 
-	cap_class = class_create(THIS_MODULE, "gb_authenticate");
+	cap_class = class_create("gb_authenticate");
 	if (IS_ERR(cap_class))
 		return PTR_ERR(cap_class);
 
diff --git a/drivers/staging/greybus/fw-management.c b/drivers/staging/greybus/fw-management.c
index 3342b84597da..cd9141e4b794 100644
--- a/drivers/staging/greybus/fw-management.c
+++ b/drivers/staging/greybus/fw-management.c
@@ -696,7 +696,7 @@ int fw_mgmt_init(void)
 {
 	int ret;
 
-	fw_mgmt_class = class_create(THIS_MODULE, "gb_fw_mgmt");
+	fw_mgmt_class = class_create("gb_fw_mgmt");
 	if (IS_ERR(fw_mgmt_class))
 		return PTR_ERR(fw_mgmt_class);
 
diff --git a/drivers/staging/greybus/raw.c b/drivers/staging/greybus/raw.c
index 2a375f407d38..8bca8cb12cc6 100644
--- a/drivers/staging/greybus/raw.c
+++ b/drivers/staging/greybus/raw.c
@@ -340,7 +340,7 @@ static int raw_init(void)
 	dev_t dev;
 	int retval;
 
-	raw_class = class_create(THIS_MODULE, "gb_raw");
+	raw_class = class_create("gb_raw");
 	if (IS_ERR(raw_class)) {
 		retval = PTR_ERR(raw_class);
 		goto error_class;
diff --git a/drivers/staging/pi433/pi433_if.c b/drivers/staging/pi433/pi433_if.c
index b59f6a4cb611..f08fdf06d566 100644
--- a/drivers/staging/pi433/pi433_if.c
+++ b/drivers/staging/pi433/pi433_if.c
@@ -1400,7 +1400,7 @@ static int __init pi433_init(void)
 	if (status < 0)
 		return status;
 
-	pi433_class = class_create(THIS_MODULE, "pi433");
+	pi433_class = class_create("pi433");
 	if (IS_ERR(pi433_class)) {
 		unregister_chrdev(MAJOR(pi433_dev),
 				  pi433_spi_driver.driver.name);
diff --git a/drivers/staging/vme_user/vme_user.c b/drivers/staging/vme_user/vme_user.c
index 4e533c0bfe6d..b9367b575d00 100644
--- a/drivers/staging/vme_user/vme_user.c
+++ b/drivers/staging/vme_user/vme_user.c
@@ -614,7 +614,7 @@ static int vme_user_probe(struct vme_dev *vdev)
 	}
 
 	/* Create sysfs entries - on udev systems this creates the dev files */
-	vme_user_sysfs_class = class_create(THIS_MODULE, driver_name);
+	vme_user_sysfs_class = class_create(driver_name);
 	if (IS_ERR(vme_user_sysfs_class)) {
 		dev_err(&vdev->dev, "Error creating vme_user class.\n");
 		err = PTR_ERR(vme_user_sysfs_class);
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index 452cbb8ad484..0eb342de0b00 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -1226,7 +1226,7 @@ static int __init tee_init(void)
 {
 	int rc;
 
-	tee_class = class_create(THIS_MODULE, "tee");
+	tee_class = class_create("tee");
 	if (IS_ERR(tee_class)) {
 		pr_err("couldn't create class\n");
 		return PTR_ERR(tee_class);
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 36fb945fdad4..1382d9050ce8 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3512,7 +3512,7 @@ static char *tty_devnode(const struct device *dev, umode_t *mode)
 
 static int __init tty_class_init(void)
 {
-	tty_class = class_create(THIS_MODULE, "tty");
+	tty_class = class_create("tty");
 	if (IS_ERR(tty_class))
 		return PTR_ERR(tty_class);
 	tty_class->devnode = tty_devnode;
diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c
index 1dc07f9214d5..498ba9c0ee93 100644
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -804,7 +804,7 @@ int __init vcs_init(void)
 
 	if (register_chrdev(VCS_MAJOR, "vcs", &vcs_fops))
 		panic("unable to get major %d for vcs device", VCS_MAJOR);
-	vc_class = class_create(THIS_MODULE, "vc");
+	vc_class = class_create("vc");
 
 	device_create(vc_class, NULL, MKDEV(VCS_MAJOR, 0), NULL, "vcs");
 	device_create(vc_class, NULL, MKDEV(VCS_MAJOR, 64), NULL, "vcsu");
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 57a5c23b51d4..5496bf4b76ec 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4241,7 +4241,7 @@ static int __init vtconsole_class_init(void)
 {
 	int i;
 
-	vtconsole_class = class_create(THIS_MODULE, "vtconsole");
+	vtconsole_class = class_create("vtconsole");
 	if (IS_ERR(vtconsole_class)) {
 		pr_warn("Unable to create vt console class; errno = %ld\n",
 			PTR_ERR(vtconsole_class));
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index da7d88e069e6..c4ed3310e069 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -88,7 +88,7 @@ static int init_usb_class(void)
 	}
 
 	kref_init(&usb_class->kref);
-	usb_class->class = class_create(THIS_MODULE, "usbmisc");
+	usb_class->class = class_create("usbmisc");
 	if (IS_ERR(usb_class->class)) {
 		result = PTR_ERR(usb_class->class);
 		printk(KERN_ERR "class_create failed for usb devices\n");
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index a8da3b4a2855..9f6b10134121 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -1325,7 +1325,7 @@ int ghid_setup(struct usb_gadget *g, int count)
 	int status;
 	dev_t dev;
 
-	hidg_class = class_create(THIS_MODULE, "hidg");
+	hidg_class = class_create("hidg");
 	if (IS_ERR(hidg_class)) {
 		status = PTR_ERR(hidg_class);
 		hidg_class = NULL;
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c
index 4903d761a872..28db3e336e7d 100644
--- a/drivers/usb/gadget/function/f_printer.c
+++ b/drivers/usb/gadget/function/f_printer.c
@@ -1512,7 +1512,7 @@ static int gprinter_setup(int count)
 	int status;
 	dev_t devt;
 
-	usb_gadget_class = class_create(THIS_MODULE, "usb_printer_gadget");
+	usb_gadget_class = class_create("usb_printer_gadget");
 	if (IS_ERR(usb_gadget_class)) {
 		status = PTR_ERR(usb_gadget_class);
 		usb_gadget_class = NULL;
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 23b0629a8774..df930b041c33 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1758,7 +1758,7 @@ static int __init usb_udc_init(void)
 {
 	int rc;
 
-	udc_class = class_create(THIS_MODULE, "udc");
+	udc_class = class_create("udc");
 	if (IS_ERR(udc_class)) {
 		pr_err("failed to create udc class --> %ld\n",
 				PTR_ERR(udc_class));
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index abb1cd35d8a6..952c56789258 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -1379,7 +1379,7 @@ int __init mon_bin_init(void)
 {
 	int rc;
 
-	mon_bin_class = class_create(THIS_MODULE, "usbmon");
+	mon_bin_class = class_create("usbmon");
 	if (IS_ERR(mon_bin_class)) {
 		rc = PTR_ERR(mon_bin_class);
 		goto err_class;
diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c
index 56814ef80c24..0395bd5dbd3e 100644
--- a/drivers/usb/roles/class.c
+++ b/drivers/usb/roles/class.c
@@ -392,7 +392,7 @@ EXPORT_SYMBOL_GPL(usb_role_switch_get_drvdata);
 
 static int __init usb_roles_init(void)
 {
-	role_class = class_create(THIS_MODULE, "usb_role");
+	role_class = class_create("usb_role");
 	return PTR_ERR_OR_ZERO(role_class);
 }
 subsys_initcall(usb_roles_init);
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0c3b48616a9f..c421b83f6fa2 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1793,7 +1793,7 @@ static int vduse_init(void)
 	int ret;
 	struct device *dev;
 
-	vduse_class = class_create(THIS_MODULE, "vduse");
+	vduse_class = class_create("vduse");
 	if (IS_ERR(vduse_class))
 		return PTR_ERR(vduse_class);
 
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 27d5ba7cf9dc..fc75c1000d74 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -878,7 +878,7 @@ int __init vfio_group_init(void)
 		return ret;
 
 	/* /dev/vfio/$GROUP */
-	vfio.class = class_create(THIS_MODULE, "vfio");
+	vfio.class = class_create("vfio");
 	if (IS_ERR(vfio.class)) {
 		ret = PTR_ERR(vfio.class);
 		goto err_group_class;
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 43bd6b76e2b6..9ae671b98592 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1408,7 +1408,7 @@ static int __init vfio_init(void)
 		goto err_virqfd;
 
 	/* /sys/class/vfio-dev/vfioX */
-	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
+	vfio.device_class = class_create("vfio-dev");
 	if (IS_ERR(vfio.device_class)) {
 		ret = PTR_ERR(vfio.device_class);
 		goto err_dev_class;
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 6eea72aa8dbf..9a885d398c22 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -751,7 +751,7 @@ static void __exit backlight_class_exit(void)
 
 static int __init backlight_class_init(void)
 {
-	backlight_class = class_create(THIS_MODULE, "backlight");
+	backlight_class = class_create("backlight");
 	if (IS_ERR(backlight_class)) {
 		pr_warn("Unable to create backlight class; errno = %ld\n",
 			PTR_ERR(backlight_class));
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index db56e465aaff..77c5cb2a44e2 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -323,7 +323,7 @@ static void __exit lcd_class_exit(void)
 
 static int __init lcd_class_init(void)
 {
-	lcd_class = class_create(THIS_MODULE, "lcd");
+	lcd_class = class_create("lcd");
 	if (IS_ERR(lcd_class)) {
 		pr_warn("Unable to create backlight class; errno = %ld\n",
 			PTR_ERR(lcd_class));
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 875541ff185b..b073795d99d0 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1749,7 +1749,7 @@ fbmem_init(void)
 		goto err_chrdev;
 	}
 
-	fb_class = class_create(THIS_MODULE, "graphics");
+	fb_class = class_create("graphics");
 	if (IS_ERR(fb_class)) {
 		ret = PTR_ERR(fb_class);
 		pr_warn("Unable to create fb class; errno = %d\n", ret);
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index b39580ad4ce5..3c3148588491 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -361,7 +361,7 @@ static int __init init_coda_psdev(void)
 		       __func__, CODA_PSDEV_MAJOR);
 		return -EIO;
 	}
-	coda_psdev_class = class_create(THIS_MODULE, "coda");
+	coda_psdev_class = class_create("coda");
 	if (IS_ERR(coda_psdev_class)) {
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 179a5c5e28fd..91e89e68177e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -623,7 +623,7 @@ static int __init cuse_init(void)
 	/* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
 	cuse_channel_fops.unlocked_ioctl	= NULL;
 
-	cuse_class = class_create(THIS_MODULE, "cuse");
+	cuse_class = class_create("cuse");
 	if (IS_ERR(cuse_class))
 		return PTR_ERR(cuse_class);
 
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index ab82e5f05346..aa178304bc03 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -64,7 +64,7 @@ void pstore_register_pmsg(void)
 		goto err;
 	}
 
-	pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+	pmsg_class = class_create(PMSG_NAME);
 	if (IS_ERR(pmsg_class)) {
 		pr_err("device class file already in use\n");
 		goto err_class;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index bf736f14f0c1..cda598fc5fa0 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -256,7 +256,6 @@ extern void class_destroy(struct class *cls);
 
 /**
  * class_create - create a struct class structure
- * @owner: dummy pointer, does nothing, will be removed soon.
  * @name: pointer to a string for the name of this class.
  *
  * This is used to create a struct class pointer that can then be used
@@ -267,7 +266,7 @@ extern void class_destroy(struct class *cls);
  * Note, the pointer created here is to be destroyed when finished by
  * making a call to class_destroy().
  */
-#define class_create(owner, name)		\
+#define class_create(name)			\
 ({						\
 	static struct lock_class_key __key;	\
 	__class_create(name, &__key);		\
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a53b9360b72e..ad011308cebe 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -347,7 +347,7 @@ ATTRIBUTE_GROUPS(bdi_dev);
 
 static __init int bdi_class_init(void)
 {
-	bdi_class = class_create(THIS_MODULE, "bdi");
+	bdi_class = class_create("bdi");
 	if (IS_ERR(bdi_class))
 		return PTR_ERR(bdi_class);
 
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 08542dfc2dc5..2934d7f4d564 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -112,7 +112,7 @@ void hci_init_sysfs(struct hci_dev *hdev)
 
 int __init bt_sysfs_init(void)
 {
-	bt_class = class_create(THIS_MODULE, "bluetooth");
+	bt_class = class_create("bluetooth");
 
 	return PTR_ERR_OR_ZERO(bt_class);
 }
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 8d36303f3935..db720efa811d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -490,7 +490,7 @@ static int __init idletimer_tg_init(void)
 {
 	int err;
 
-	idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+	idletimer_tg_class = class_create("xt_idletimer");
 	err = PTR_ERR(idletimer_tg_class);
 	if (IS_ERR(idletimer_tg_class)) {
 		pr_debug("couldn't register device class\n");
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index e54eb752e1ba..a3d3249639d0 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -1418,7 +1418,7 @@ static int __init mbochs_dev_init(void)
 	if (ret)
 		goto err_cdev;
 
-	mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
+	mbochs_class = class_create(MBOCHS_CLASS_NAME);
 	if (IS_ERR(mbochs_class)) {
 		pr_err("Error: failed to register mbochs_dev class\n");
 		ret = PTR_ERR(mbochs_class);
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index e8400fdab71d..ef1630fc5a91 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -705,7 +705,7 @@ static int __init mdpy_dev_init(void)
 	if (ret)
 		goto err_cdev;
 
-	mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
+	mdpy_class = class_create(MDPY_CLASS_NAME);
 	if (IS_ERR(mdpy_class)) {
 		pr_err("Error: failed to register mdpy_dev class\n");
 		ret = PTR_ERR(mdpy_class);
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index e887de672c52..0b6c386e620d 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1316,7 +1316,7 @@ static int __init mtty_dev_init(void)
 	if (ret)
 		goto err_cdev;
 
-	mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME);
+	mtty_dev.vd_class = class_create(MTTY_CLASS_NAME);
 
 	if (IS_ERR(mtty_dev.vd_class)) {
 		pr_err("Error: failed to register mtty_dev class\n");
diff --git a/sound/sound_core.c b/sound/sound_core.c
index 3e7dd6fcb7cf..4f6911274d56 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -45,7 +45,7 @@ static int __init init_soundcore(void)
 	if (rc)
 		return rc;
 
-	sound_class = class_create(THIS_MODULE, "sound");
+	sound_class = class_create("sound");
 	if (IS_ERR(sound_class)) {
 		cleanup_oss_soundcore();
 		return PTR_ERR(sound_class);
diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index 01ceb98c15a0..3eba10c1e3e8 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -921,7 +921,7 @@ static __init int ndtest_init(void)
 
 	nfit_test_setup(ndtest_resource_lookup, NULL);
 
-	ndtest_dimm_class = class_create(THIS_MODULE, "nfit_test_dimm");
+	ndtest_dimm_class = class_create("nfit_test_dimm");
 	if (IS_ERR(ndtest_dimm_class)) {
 		rc = PTR_ERR(ndtest_dimm_class);
 		goto err_register;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index c75abb497a1a..45b1c072e349 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3282,7 +3282,7 @@ static __init int nfit_test_init(void)
 	if (!nfit_wq)
 		return -ENOMEM;
 
-	nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm");
+	nfit_test_dimm = class_create("nfit_test_dimm");
 	if (IS_ERR(nfit_test_dimm)) {
 		rc = PTR_ERR(nfit_test_dimm);
 		goto err_register;
-- 
cgit v1.2.3


From a2fd6e42e4fb661e183977072022b7dd8dd65d90 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:36 +0100
Subject: driver core: class: make class_dev_iter_init() options const

class_dev_iter_init() does not modify the struct class or the struct
device passed into it, so mark them as const * to enforce that.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-5-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 4 ++--
 include/linux/device/class.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 9439c6c7466f..5a60e8895165 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -284,8 +284,8 @@ EXPORT_SYMBOL_GPL(class_destroy);
  * otherwise if it is NULL, the iteration starts at the beginning of
  * the list.
  */
-void class_dev_iter_init(struct class_dev_iter *iter, struct class *class,
-			 struct device *start, const struct device_type *type)
+void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class,
+			 const struct device *start, const struct device_type *type)
 {
 	struct klist_node *start_knode = NULL;
 
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index cda598fc5fa0..1f5cfae8db88 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -103,8 +103,8 @@ void class_compat_remove_link(struct class_compat *cls, struct device *dev,
 			      struct device *device_link);
 
 extern void class_dev_iter_init(struct class_dev_iter *iter,
-				struct class *class,
-				struct device *start,
+				const struct class *class,
+				const struct device *start,
 				const struct device_type *type);
 extern struct device *class_dev_iter_next(struct class_dev_iter *iter);
 extern void class_dev_iter_exit(struct class_dev_iter *iter);
-- 
cgit v1.2.3


From 69df024ebbf8b1d4d8f0808b5700688fed9e45e9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:37 +0100
Subject: driver core: class: make class_for_each_device() options const

class_for_each_device() does not modify the struct class or the struct
device passed into it, so mark them as const * to enforce that.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-6-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 2 +-
 include/linux/device/class.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 5a60e8895165..4937d660c571 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -355,7 +355,7 @@ EXPORT_SYMBOL_GPL(class_dev_iter_exit);
  * @fn is allowed to do anything including calling back into class
  * code.  There's no locking restriction.
  */
-int class_for_each_device(struct class *class, struct device *start,
+int class_for_each_device(const struct class *class, const struct device *start,
 			  void *data, int (*fn)(struct device *, void *))
 {
 	struct class_dev_iter iter;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 1f5cfae8db88..fdbcd487e508 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -109,7 +109,7 @@ extern void class_dev_iter_init(struct class_dev_iter *iter,
 extern struct device *class_dev_iter_next(struct class_dev_iter *iter);
 extern void class_dev_iter_exit(struct class_dev_iter *iter);
 
-extern int class_for_each_device(struct class *class, struct device *start,
+extern int class_for_each_device(const struct class *class, const struct device *start,
 				 void *data,
 				 int (*fn)(struct device *dev, void *data));
 extern struct device *class_find_device(struct class *class,
-- 
cgit v1.2.3


From cf41015ea8d3f2ae451d9b7e39ef42569caeb875 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:38 +0100
Subject: driver core: class: make class_find_device*() options const

The class_find_device*() functions do not modify the struct class or the
struct device passed into it, so mark them as const * to enforce that.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-7-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         |  2 +-
 include/linux/device/class.h | 25 ++++++++++++-------------
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 4937d660c571..52ba0187e66d 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -402,7 +402,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
  * @match is allowed to do anything including calling back into class
  * code.  There's no locking restriction.
  */
-struct device *class_find_device(struct class *class, struct device *start,
+struct device *class_find_device(const struct class *class, const struct device *start,
 				 const void *data,
 				 int (*match)(struct device *, const void *))
 {
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index fdbcd487e508..dfa8958105e7 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -112,8 +112,8 @@ extern void class_dev_iter_exit(struct class_dev_iter *iter);
 extern int class_for_each_device(const struct class *class, const struct device *start,
 				 void *data,
 				 int (*fn)(struct device *dev, void *data));
-extern struct device *class_find_device(struct class *class,
-					struct device *start, const void *data,
+extern struct device *class_find_device(const struct class *class,
+					const struct device *start, const void *data,
 					int (*match)(struct device *, const void *));
 
 /**
@@ -122,7 +122,7 @@ extern struct device *class_find_device(struct class *class,
  * @class: class type
  * @name: name of the device to match
  */
-static inline struct device *class_find_device_by_name(struct class *class,
+static inline struct device *class_find_device_by_name(const struct class *class,
 						       const char *name)
 {
 	return class_find_device(class, NULL, name, device_match_name);
@@ -134,8 +134,8 @@ static inline struct device *class_find_device_by_name(struct class *class,
  * @class: class type
  * @np: of_node of the device to match.
  */
-static inline struct device *
-class_find_device_by_of_node(struct class *class, const struct device_node *np)
+static inline struct device *class_find_device_by_of_node(const struct class *class,
+							  const struct device_node *np)
 {
 	return class_find_device(class, NULL, np, device_match_of_node);
 }
@@ -146,9 +146,8 @@ class_find_device_by_of_node(struct class *class, const struct device_node *np)
  * @class: class type
  * @fwnode: fwnode of the device to match.
  */
-static inline struct device *
-class_find_device_by_fwnode(struct class *class,
-			    const struct fwnode_handle *fwnode)
+static inline struct device *class_find_device_by_fwnode(const struct class *class,
+							 const struct fwnode_handle *fwnode)
 {
 	return class_find_device(class, NULL, fwnode, device_match_fwnode);
 }
@@ -159,7 +158,7 @@ class_find_device_by_fwnode(struct class *class,
  * @class: class type
  * @devt: device type of the device to match.
  */
-static inline struct device *class_find_device_by_devt(struct class *class,
+static inline struct device *class_find_device_by_devt(const struct class *class,
 						       dev_t devt)
 {
 	return class_find_device(class, NULL, &devt, device_match_devt);
@@ -173,14 +172,14 @@ struct acpi_device;
  * @class: class type
  * @adev: ACPI_COMPANION device to match.
  */
-static inline struct device *
-class_find_device_by_acpi_dev(struct class *class, const struct acpi_device *adev)
+static inline struct device *class_find_device_by_acpi_dev(const struct class *class,
+							   const struct acpi_device *adev)
 {
 	return class_find_device(class, NULL, adev, device_match_acpi_dev);
 }
 #else
-static inline struct device *
-class_find_device_by_acpi_dev(struct class *class, const void *adev)
+static inline struct device *class_find_device_by_acpi_dev(const struct class *class,
+							   const void *adev)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 80842a92907bd68454591f9c7a73778f130ac38f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:39 +0100
Subject: driver core: class: make class_create/remove_file*() options const

The class_create_file*() and class_remove_file*() functions do not
modify the struct class at all, so mark them as const * to enforce that.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-8-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         |  4 ++--
 include/linux/device/class.h | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 52ba0187e66d..3d65221b0dcb 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -87,7 +87,7 @@ static const struct kobj_type class_ktype = {
 static struct kset *class_kset;
 
 
-int class_create_file_ns(struct class *cls, const struct class_attribute *attr,
+int class_create_file_ns(const struct class *cls, const struct class_attribute *attr,
 			 const void *ns)
 {
 	int error;
@@ -101,7 +101,7 @@ int class_create_file_ns(struct class *cls, const struct class_attribute *attr,
 }
 EXPORT_SYMBOL_GPL(class_create_file_ns);
 
-void class_remove_file_ns(struct class *cls, const struct class_attribute *attr,
+void class_remove_file_ns(const struct class *cls, const struct class_attribute *attr,
 			  const void *ns)
 {
 	if (cls)
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index dfa8958105e7..75c1451fcc63 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -200,20 +200,20 @@ struct class_attribute {
 #define CLASS_ATTR_WO(_name) \
 	struct class_attribute class_attr_##_name = __ATTR_WO(_name)
 
-extern int __must_check class_create_file_ns(struct class *class,
+extern int __must_check class_create_file_ns(const struct class *class,
 					     const struct class_attribute *attr,
 					     const void *ns);
-extern void class_remove_file_ns(struct class *class,
+extern void class_remove_file_ns(const struct class *class,
 				 const struct class_attribute *attr,
 				 const void *ns);
 
-static inline int __must_check class_create_file(struct class *class,
-					const struct class_attribute *attr)
+static inline int __must_check class_create_file(const struct class *class,
+						 const struct class_attribute *attr)
 {
 	return class_create_file_ns(class, attr, NULL);
 }
 
-static inline void class_remove_file(struct class *class,
+static inline void class_remove_file(const struct class *class,
 				     const struct class_attribute *attr)
 {
 	return class_remove_file_ns(class, attr, NULL);
-- 
cgit v1.2.3


From d2fff09656369d5465585ad2c72d8ba8ada758b5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:40 +0100
Subject: driver core: device: make device_destroy() take a const class *

device_destroy() does not modify the struct class passed into it, so
mark it as const to enforce this rule.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-9-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 2 +-
 include/linux/device.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 57076837b33e..f3b7040ef9b6 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4383,7 +4383,7 @@ EXPORT_SYMBOL_GPL(device_create_with_groups);
  * This call unregisters and cleans up a device that was created with a
  * call to device_create().
  */
-void device_destroy(struct class *class, dev_t devt)
+void device_destroy(const struct class *class, dev_t devt)
 {
 	struct device *dev;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 12dc08aa5c0f..2562451b875b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1021,7 +1021,7 @@ __printf(6, 7) struct device *
 device_create_with_groups(struct class *cls, struct device *parent, dev_t devt,
 			  void *drvdata, const struct attribute_group **groups,
 			  const char *fmt, ...);
-void device_destroy(struct class *cls, dev_t devt);
+void device_destroy(const struct class *cls, dev_t devt);
 
 int __must_check device_add_groups(struct device *dev,
 				   const struct attribute_group **groups);
-- 
cgit v1.2.3


From 9fa120fbd507e58af3494b8765427fdc8181e1eb Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:42 +0100
Subject: driver core: device: mark struct class in struct device as constant

The pointer to a struct class in a struct device should never be used to
change anything in that class.  So mark it as constant to enforce this
requirement.

This requires a few minor changes to some internal driver core functions
to enforce the const * being used here now.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-11-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 10 +++++-----
 include/linux/device.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f3b7040ef9b6..0970db630839 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2820,7 +2820,7 @@ EXPORT_SYMBOL_GPL(devm_device_add_groups);
 
 static int device_add_attrs(struct device *dev)
 {
-	struct class *class = dev->class;
+	const struct class *class = dev->class;
 	const struct device_type *type = dev->type;
 	int error;
 
@@ -2887,7 +2887,7 @@ static int device_add_attrs(struct device *dev)
 
 static void device_remove_attrs(struct device *dev)
 {
-	struct class *class = dev->class;
+	const struct class *class = dev->class;
 	const struct device_type *type = dev->type;
 
 	if (dev->physical_location) {
@@ -3120,7 +3120,7 @@ struct kobject *virtual_device_parent(struct device *dev)
 
 struct class_dir {
 	struct kobject kobj;
-	struct class *class;
+	const struct class *class;
 };
 
 #define to_class_dir(obj) container_of(obj, struct class_dir, kobj)
@@ -3145,7 +3145,7 @@ static const struct kobj_type class_dir_ktype = {
 };
 
 static struct kobject *
-class_dir_create_and_add(struct class *class, struct kobject *parent_kobj)
+class_dir_create_and_add(const struct class *class, struct kobject *parent_kobj)
 {
 	struct class_dir *dir;
 	int retval;
@@ -4580,7 +4580,7 @@ static int device_attrs_change_owner(struct device *dev, kuid_t kuid,
 				     kgid_t kgid)
 {
 	struct kobject *kobj = &dev->kobj;
-	struct class *class = dev->class;
+	const struct class *class = dev->class;
 	const struct device_type *type = dev->type;
 	int error;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 2562451b875b..ea07b95b06e5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -631,7 +631,7 @@ struct device {
 	spinlock_t		devres_lock;
 	struct list_head	devres_head;
 
-	struct class		*class;
+	const struct class	*class;
 	const struct attribute_group **groups;	/* optional groups */
 
 	void	(*release)(struct device *dev);
-- 
cgit v1.2.3


From 2bd5c63978b771462da10d8771f56a1e656501d9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:18:43 +0100
Subject: driver core: device: make device_create*() take a const struct class
 *

The functions device_create() and device_create_with_groups() do not
modify the struct class passed into it, so enforce this by changing the
function parameters to be struct const class.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313181843.1207845-12-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 6 +++---
 include/linux/device.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 0970db630839..f1889b9cab45 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4253,7 +4253,7 @@ static void device_create_release(struct device *dev)
 }
 
 static __printf(6, 0) struct device *
-device_create_groups_vargs(struct class *class, struct device *parent,
+device_create_groups_vargs(const struct class *class, struct device *parent,
 			   dev_t devt, void *drvdata,
 			   const struct attribute_group **groups,
 			   const char *fmt, va_list args)
@@ -4317,7 +4317,7 @@ error:
  * Note: the struct class passed to this function must have previously
  * been created with a call to class_create().
  */
-struct device *device_create(struct class *class, struct device *parent,
+struct device *device_create(const struct class *class, struct device *parent,
 			     dev_t devt, void *drvdata, const char *fmt, ...)
 {
 	va_list vargs;
@@ -4358,7 +4358,7 @@ EXPORT_SYMBOL_GPL(device_create);
  * Note: the struct class passed to this function must have previously
  * been created with a call to class_create().
  */
-struct device *device_create_with_groups(struct class *class,
+struct device *device_create_with_groups(const struct class *class,
 					 struct device *parent, dev_t devt,
 					 void *drvdata,
 					 const struct attribute_group **groups,
diff --git a/include/linux/device.h b/include/linux/device.h
index ea07b95b06e5..862397ae520d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1015,10 +1015,10 @@ bool device_is_bound(struct device *dev);
  * Easy functions for dynamically creating devices on the fly
  */
 __printf(5, 6) struct device *
-device_create(struct class *cls, struct device *parent, dev_t devt,
+device_create(const struct class *cls, struct device *parent, dev_t devt,
 	      void *drvdata, const char *fmt, ...);
 __printf(6, 7) struct device *
-device_create_with_groups(struct class *cls, struct device *parent, dev_t devt,
+device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt,
 			  void *drvdata, const struct attribute_group **groups,
 			  const char *fmt, ...);
 void device_destroy(const struct class *cls, dev_t devt);
-- 
cgit v1.2.3


From c984f5d5d45bd5f80d6a9d8541e809300c963aca Mon Sep 17 00:00:00 2001
From: Wyes Karny <wyes.karny@amd.com>
Date: Tue, 7 Mar 2023 11:27:36 +0000
Subject: ACPI: CPPC: Add auto select register read/write support

For some AMD shared memory based systems, the autonomous selection bit
needed to be set explicitly. Add autonomous selection register related
APIs to acpi driver, which amd_pstate driver uses later.

Acked-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Wyes Karny <wyes.karny@amd.com>
[ rjw: Fixed up kerneldoc comments, white space adjustment, subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/acpi/cppc_acpi.h | 11 ++++++
 2 files changed, 107 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 523bad4e3a0c..7ff269a78c20 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1433,6 +1433,102 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
 }
 EXPORT_SYMBOL_GPL(cppc_set_epp_perf);
 
+/**
+ * cppc_get_auto_sel_caps - Read autonomous selection register.
+ * @cpunum : CPU from which to read register.
+ * @perf_caps : struct where autonomous selection register value is updated.
+ */
+int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
+	struct cpc_register_resource *auto_sel_reg;
+	u64  auto_sel;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpunum);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (!CPC_SUPPORTED(auto_sel_reg))
+		pr_warn_once("Autonomous mode is not unsupported!\n");
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+		struct cppc_pcc_data *pcc_ss_data = NULL;
+		int ret = 0;
+
+		if (pcc_ss_id < 0)
+			return -ENODEV;
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+
+		if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) {
+			cpc_read(cpunum, auto_sel_reg, &auto_sel);
+			perf_caps->auto_sel = (bool)auto_sel;
+		} else {
+			ret = -EIO;
+		}
+
+		up_write(&pcc_ss_data->pcc_lock);
+
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps);
+
+/**
+ * cppc_set_auto_sel - Write autonomous selection register.
+ * @cpu    : CPU to which to write register.
+ * @enable : the desired value of autonomous selection resiter to be updated.
+ */
+int cppc_set_auto_sel(int cpu, bool enable)
+{
+	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
+	struct cpc_register_resource *auto_sel_reg;
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+	struct cppc_pcc_data *pcc_ss_data = NULL;
+	int ret = -EINVAL;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpu);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		if (pcc_ss_id < 0) {
+			pr_debug("Invalid pcc_ss_id\n");
+			return -ENODEV;
+		}
+
+		if (CPC_SUPPORTED(auto_sel_reg)) {
+			ret = cpc_write(cpu, auto_sel_reg, enable);
+			if (ret)
+				return ret;
+		}
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+		/* after writing CPC, transfer the ownership of PCC to platform */
+		ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE);
+		up_write(&pcc_ss_data->pcc_lock);
+	} else {
+		ret = -ENOTSUPP;
+		pr_debug("_CPC in PCC is not supported\n");
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cppc_set_auto_sel);
+
 /**
  * cppc_set_enable - Set to enable CPPC on the processor by writing the
  * Continuous Performance Control package EnableRegister field.
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 6b487a5bd638..6126c977ece0 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -109,6 +109,7 @@ struct cppc_perf_caps {
 	u32 lowest_freq;
 	u32 nominal_freq;
 	u32 energy_perf;
+	bool auto_sel;
 };
 
 struct cppc_perf_ctrls {
@@ -153,6 +154,8 @@ extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val);
 extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val);
 extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
 extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
+extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
+extern int cppc_set_auto_sel(int cpu, bool enable);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -214,6 +217,14 @@ static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
 {
 	return -ENOTSUPP;
 }
+static inline int cppc_set_auto_sel(int cpu, bool enable)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	return -ENOTSUPP;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 
 #endif /* _CPPC_ACPI_H*/
-- 
cgit v1.2.3


From 2dd6d0ebf74049256160a3d03dabbd92fe0b8599 Mon Sep 17 00:00:00 2001
From: Wyes Karny <wyes.karny@amd.com>
Date: Tue, 7 Mar 2023 11:27:38 +0000
Subject: cpufreq: amd-pstate: Add guided autonomous mode

From ACPI spec below 3 modes for CPPC can be defined:

 1. Non autonomous: OS scaling governor specifies operating frequency/
    performance level through `Desired Performance` register and platform
    follows that.

 2. Guided autonomous: OS scaling governor specifies min and max
    frequencies/ performance levels through `Minimum Performance` and
    `Maximum Performance` register, and platform can autonomously select an
    operating frequency in this range.

 3. Fully autonomous: OS only hints (via EPP) to platform for the required
    energy performance preference for the workload and platform autonomously
    scales the frequency.

Currently (1) is supported by amd_pstate as passive mode, and (3) is
implemented by EPP support. This change is to support (2).

In guided autonomous mode the min_perf is based on the input from the
scaling governor. For example, in case of schedutil this value depends
on the current utilization. And max_perf is set to max capacity.

To activate guided auto mode ``amd_pstate=guided`` command line
parameter has to be passed in the kernel.

Acked-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Wyes Karny <wyes.karny@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 15 +++++++----
 drivers/cpufreq/amd-pstate.c                    | 34 ++++++++++++++++++++-----
 include/linux/amd-pstate.h                      |  2 ++
 3 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1e38f3ae0e62..1e5fe4891dd7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -344,11 +344,11 @@
 			  Do not enable amd_pstate as the default
 			  scaling driver for the supported processors
 			passive
-			  Use amd_pstate as a scaling driver, driver requests a
-			  desired performance on this abstract scale and the power
-			  management firmware translates the requests into actual
-			  hardware states (core frequency, data fabric and memory
-			  clocks etc.)
+			  Use amd_pstate with passive mode as a scaling driver.
+			  In this mode autonomous selection is disabled.
+			  Driver requests a desired performance level and platform
+			  tries to match the same performance level if it is
+			  satisfied by guaranteed performance level.
 			active
 			  Use amd_pstate_epp driver instance as the scaling driver,
 			  driver provides a hint to the hardware if software wants
@@ -356,6 +356,11 @@
 			  to the CPPC firmware. then CPPC power algorithm will
 			  calculate the runtime workload and adjust the realtime cores
 			  frequency.
+			guided
+			  Activate guided autonomous mode. Driver requests minimum and
+			  maximum performance level and the platform autonomously
+			  selects a performance level in this range and appropriate
+			  to the current workload.
 
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 73c7643b2697..61dcd7b89b26 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -308,7 +308,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata)
 		   cppc_perf.lowest_nonlinear_perf);
 	WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
 
-	return 0;
+	if (cppc_state == AMD_PSTATE_ACTIVE)
+		return 0;
+
+	ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf);
+	if (ret) {
+		pr_warn("failed to get auto_sel, ret: %d\n", ret);
+		return 0;
+	}
+
+	ret = cppc_set_auto_sel(cpudata->cpu,
+			(cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1);
+
+	if (ret)
+		pr_warn("failed to set auto_sel, ret: %d\n", ret);
+
+	return ret;
 }
 
 DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
@@ -385,12 +400,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
 }
 
 static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
-			      u32 des_perf, u32 max_perf, bool fast_switch)
+			      u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags)
 {
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
 	u64 value = prev;
 
 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
+
+	if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
+		min_perf = des_perf;
+		des_perf = 0;
+	}
+
 	value &= ~AMD_CPPC_MIN_PERF(~0L);
 	value |= AMD_CPPC_MIN_PERF(min_perf);
 
@@ -445,7 +466,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy,
 
 	cpufreq_freq_transition_begin(policy, &freqs);
 	amd_pstate_update(cpudata, min_perf, des_perf,
-			  max_perf, false);
+			  max_perf, false, policy->governor->flags);
 	cpufreq_freq_transition_end(policy, &freqs, false);
 
 	return 0;
@@ -479,7 +500,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	if (max_perf < min_perf)
 		max_perf = min_perf;
 
-	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true);
+	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
+			policy->governor->flags);
 	cpufreq_cpu_put(policy);
 }
 
@@ -1279,7 +1301,7 @@ static int __init amd_pstate_init(void)
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
-		if (cppc_state == AMD_PSTATE_PASSIVE)
+		if (cppc_state != AMD_PSTATE_ACTIVE)
 			current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
 	} else {
 		pr_debug("AMD CPPC shared memory based functionality is supported\n");
@@ -1341,7 +1363,7 @@ static int __init amd_pstate_param(char *str)
 		if (cppc_state == AMD_PSTATE_ACTIVE)
 			current_pstate_driver = &amd_pstate_epp_driver;
 
-		if (cppc_state == AMD_PSTATE_PASSIVE)
+		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
 			current_pstate_driver = &amd_pstate_driver;
 
 		return 0;
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index f5f22418e64b..c10ebf8c42e6 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -97,6 +97,7 @@ enum amd_pstate_mode {
 	AMD_PSTATE_DISABLE = 0,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
+	AMD_PSTATE_GUIDED,
 	AMD_PSTATE_MAX,
 };
 
@@ -104,6 +105,7 @@ static const char * const amd_pstate_mode_string[] = {
 	[AMD_PSTATE_DISABLE]     = "disable",
 	[AMD_PSTATE_PASSIVE]     = "passive",
 	[AMD_PSTATE_ACTIVE]      = "active",
+	[AMD_PSTATE_GUIDED]      = "guided",
 	NULL,
 };
 #endif /* _LINUX_AMD_PSTATE_H */
-- 
cgit v1.2.3


From 99be6588118bedc522fa989146f35efceff07c1e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Wed, 15 Mar 2023 16:39:25 -0700
Subject: kunit: Use gfp in kunit_alloc_resource() kernel-doc

Copy/pasting the code from the kernel-doc here doesn't compile because
kunit_alloc_resource() takes a gfp flags argument. Pass the gfp
argument from the caller to complete the example.

Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/resource.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/kunit/resource.h b/include/kunit/resource.h
index cf6fb8f2ac1b..c0d88b318e90 100644
--- a/include/kunit/resource.h
+++ b/include/kunit/resource.h
@@ -72,7 +72,7 @@ typedef void (*kunit_resource_free_t)(struct kunit_resource *);
  *		params.gfp = gfp;
  *
  *		return kunit_alloc_resource(test, kunit_kmalloc_init,
- *			kunit_kmalloc_free, &params);
+ *			kunit_kmalloc_free, gfp, &params);
  *	}
  *
  * Resources can also be named, with lookup/removal done on a name
-- 
cgit v1.2.3


From 6365ba64b4dbe8b59ddaeaa724b281f3787715d5 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Date: Wed, 8 Mar 2023 15:05:31 +0000
Subject: ptp: kvm: Use decrypted memory in confidential guest on x86

KVM_HC_CLOCK_PAIRING currently fails inside SEV-SNP guests because the
guest passes an address to static data to the host. In confidential
computing the host can't access arbitrary guest memory so handling the
hypercall runs into an "rmpfault". To make the hypercall work, the guest
needs to explicitly mark the memory as decrypted. Do that in
kvm_arch_ptp_init(), but retain the previous behavior for
non-confidential guests to save us from having to allocate memory.

Add a new arch-specific function (kvm_arch_ptp_exit()) to free the
allocation and mark the memory as encrypted again.

Signed-off-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Link: https://lore.kernel.org/r/20230308150531.477741-1-jpiotrowski@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/ptp/ptp_kvm_arm.c    |  4 +++
 drivers/ptp/ptp_kvm_common.c |  1 +
 drivers/ptp/ptp_kvm_x86.c    | 59 +++++++++++++++++++++++++++++++++++---------
 include/linux/ptp_kvm.h      |  1 +
 4 files changed, 54 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c
index b7d28c8dfb84..e68e6943167b 100644
--- a/drivers/ptp/ptp_kvm_arm.c
+++ b/drivers/ptp/ptp_kvm_arm.c
@@ -22,6 +22,10 @@ int kvm_arch_ptp_init(void)
 	return 0;
 }
 
+void kvm_arch_ptp_exit(void)
+{
+}
+
 int kvm_arch_ptp_get_clock(struct timespec64 *ts)
 {
 	return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL);
diff --git a/drivers/ptp/ptp_kvm_common.c b/drivers/ptp/ptp_kvm_common.c
index 9141162c4237..2418977989be 100644
--- a/drivers/ptp/ptp_kvm_common.c
+++ b/drivers/ptp/ptp_kvm_common.c
@@ -130,6 +130,7 @@ static struct kvm_ptp_clock kvm_ptp_clock;
 static void __exit ptp_kvm_exit(void)
 {
 	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
+	kvm_arch_ptp_exit();
 }
 
 static int __init ptp_kvm_init(void)
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
index 4991054a2135..902844cc1a17 100644
--- a/drivers/ptp/ptp_kvm_x86.c
+++ b/drivers/ptp/ptp_kvm_x86.c
@@ -14,27 +14,64 @@
 #include <uapi/linux/kvm_para.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_kvm.h>
+#include <linux/set_memory.h>
 
 static phys_addr_t clock_pair_gpa;
-static struct kvm_clock_pairing clock_pair;
+static struct kvm_clock_pairing clock_pair_glbl;
+static struct kvm_clock_pairing *clock_pair;
 
 int kvm_arch_ptp_init(void)
 {
+	struct page *p;
 	long ret;
 
 	if (!kvm_para_available())
 		return -ENODEV;
 
-	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	if (!pvclock_get_pvti_cpu0_va())
-		return -ENODEV;
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+		p = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!p)
+			return -ENOMEM;
+
+		clock_pair = page_address(p);
+		ret = set_memory_decrypted((unsigned long)clock_pair, 1);
+		if (ret) {
+			__free_page(p);
+			clock_pair = NULL;
+			goto nofree;
+		}
+	} else {
+		clock_pair = &clock_pair_glbl;
+	}
+
+	clock_pair_gpa = slow_virt_to_phys(clock_pair);
+	if (!pvclock_get_pvti_cpu0_va()) {
+		ret = -ENODEV;
+		goto err;
+	}
 
 	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
 			     KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret == -KVM_ENOSYS)
-		return -ENODEV;
+	if (ret == -KVM_ENOSYS) {
+		ret = -ENODEV;
+		goto err;
+	}
 
 	return ret;
+
+err:
+	kvm_arch_ptp_exit();
+nofree:
+	return ret;
+}
+
+void kvm_arch_ptp_exit(void)
+{
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+		WARN_ON(set_memory_encrypted((unsigned long)clock_pair, 1));
+		free_page((unsigned long)clock_pair);
+		clock_pair = NULL;
+	}
 }
 
 int kvm_arch_ptp_get_clock(struct timespec64 *ts)
@@ -49,8 +86,8 @@ int kvm_arch_ptp_get_clock(struct timespec64 *ts)
 		return -EOPNOTSUPP;
 	}
 
-	ts->tv_sec = clock_pair.sec;
-	ts->tv_nsec = clock_pair.nsec;
+	ts->tv_sec = clock_pair->sec;
+	ts->tv_nsec = clock_pair->nsec;
 
 	return 0;
 }
@@ -81,9 +118,9 @@ int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
 			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
 			return -EOPNOTSUPP;
 		}
-		tspec->tv_sec = clock_pair.sec;
-		tspec->tv_nsec = clock_pair.nsec;
-		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+		tspec->tv_sec = clock_pair->sec;
+		tspec->tv_nsec = clock_pair->nsec;
+		*cycle = __pvclock_read_cycles(src, clock_pair->tsc);
 	} while (pvclock_read_retry(src, version));
 
 	*cs = &kvm_clock;
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
index c2e28deef33a..746fd67c3480 100644
--- a/include/linux/ptp_kvm.h
+++ b/include/linux/ptp_kvm.h
@@ -14,6 +14,7 @@ struct timespec64;
 struct clocksource;
 
 int kvm_arch_ptp_init(void);
+void kvm_arch_ptp_exit(void);
 int kvm_arch_ptp_get_clock(struct timespec64 *ts);
 int kvm_arch_ptp_get_crosststamp(u64 *cycle,
 		struct timespec64 *tspec, struct clocksource **cs);
-- 
cgit v1.2.3


From 36bd28c1cb0dbf48645cfe43159907fb3253b33a Mon Sep 17 00:00:00 2001
From: haozhe chang <haozhe.chang@mediatek.com>
Date: Thu, 16 Mar 2023 17:58:20 +0800
Subject: wwan: core: Support slicing in port TX flow of WWAN subsystem

wwan_port_fops_write inputs the SKB parameter to the TX callback of
the WWAN device driver. However, the WWAN device (e.g., t7xx) may
have an MTU less than the size of SKB, causing the TX buffer to be
sliced and copied once more in the WWAN device driver.

This patch implements the slicing in the WWAN subsystem and gives
the WWAN devices driver the option to slice(by frag_len) or not. By
doing so, the additional memory copy is reduced.

Meanwhile, this patch gives WWAN devices driver the option to reserve
headroom in fragments for the device-specific metadata.

Signed-off-by: haozhe chang <haozhe.chang@mediatek.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Link: https://lore.kernel.org/r/20230316095826.181904-1-haozhe.chang@mediatek.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wwan/iosm/iosm_ipc_port.c  |  3 +-
 drivers/net/wwan/mhi_wwan_ctrl.c       |  2 +-
 drivers/net/wwan/rpmsg_wwan_ctrl.c     |  2 +-
 drivers/net/wwan/t7xx/t7xx_port_wwan.c | 36 +++++++++++----------
 drivers/net/wwan/wwan_core.c           | 58 ++++++++++++++++++++++++++--------
 drivers/net/wwan/wwan_hwsim.c          |  2 +-
 drivers/usb/class/cdc-wdm.c            |  3 +-
 include/linux/wwan.h                   | 11 +++++++
 8 files changed, 81 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wwan/iosm/iosm_ipc_port.c b/drivers/net/wwan/iosm/iosm_ipc_port.c
index b6d81c627277..5d5b4183e14a 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_port.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_port.c
@@ -63,7 +63,8 @@ struct iosm_cdev *ipc_port_init(struct iosm_imem *ipc_imem,
 	ipc_port->ipc_imem = ipc_imem;
 
 	ipc_port->iosm_port = wwan_create_port(ipc_port->dev, port_type,
-					       &ipc_wwan_ctrl_ops, ipc_port);
+					       &ipc_wwan_ctrl_ops, NULL,
+					       ipc_port);
 
 	return ipc_port;
 }
diff --git a/drivers/net/wwan/mhi_wwan_ctrl.c b/drivers/net/wwan/mhi_wwan_ctrl.c
index f7ca52353f40..e9f979d2d851 100644
--- a/drivers/net/wwan/mhi_wwan_ctrl.c
+++ b/drivers/net/wwan/mhi_wwan_ctrl.c
@@ -237,7 +237,7 @@ static int mhi_wwan_ctrl_probe(struct mhi_device *mhi_dev,
 
 	/* Register as a wwan port, id->driver_data contains wwan port type */
 	port = wwan_create_port(&cntrl->mhi_dev->dev, id->driver_data,
-				&wwan_pops, mhiwwan);
+				&wwan_pops, NULL, mhiwwan);
 	if (IS_ERR(port)) {
 		kfree(mhiwwan);
 		return PTR_ERR(port);
diff --git a/drivers/net/wwan/rpmsg_wwan_ctrl.c b/drivers/net/wwan/rpmsg_wwan_ctrl.c
index 31c24420ab2e..06f4b02f1552 100644
--- a/drivers/net/wwan/rpmsg_wwan_ctrl.c
+++ b/drivers/net/wwan/rpmsg_wwan_ctrl.c
@@ -129,7 +129,7 @@ static int rpmsg_wwan_ctrl_probe(struct rpmsg_device *rpdev)
 
 	/* Register as a wwan port, id.driver_data contains wwan port type */
 	port = wwan_create_port(parent, rpdev->id.driver_data,
-				&rpmsg_wwan_pops, rpwwan);
+				&rpmsg_wwan_pops, NULL, rpwwan);
 	if (IS_ERR(port))
 		return PTR_ERR(port);
 
diff --git a/drivers/net/wwan/t7xx/t7xx_port_wwan.c b/drivers/net/wwan/t7xx/t7xx_port_wwan.c
index 24bd21942403..17389c8f6600 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_wwan.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_wwan.c
@@ -54,13 +54,13 @@ static void t7xx_port_ctrl_stop(struct wwan_port *port)
 static int t7xx_port_ctrl_tx(struct wwan_port *port, struct sk_buff *skb)
 {
 	struct t7xx_port *port_private = wwan_port_get_drvdata(port);
-	size_t len, offset, chunk_len = 0, txq_mtu = CLDMA_MTU;
 	const struct t7xx_port_conf *port_conf;
+	struct sk_buff *cur = skb, *cloned;
 	struct t7xx_fsm_ctl *ctl;
 	enum md_state md_state;
+	int cnt = 0, ret;
 
-	len = skb->len;
-	if (!len || !port_private->chan_enable)
+	if (!port_private->chan_enable)
 		return -EINVAL;
 
 	port_conf = port_private->port_conf;
@@ -72,23 +72,21 @@ static int t7xx_port_ctrl_tx(struct wwan_port *port, struct sk_buff *skb)
 		return -ENODEV;
 	}
 
-	for (offset = 0; offset < len; offset += chunk_len) {
-		struct sk_buff *skb_ccci;
-		int ret;
-
-		chunk_len = min(len - offset, txq_mtu - sizeof(struct ccci_header));
-		skb_ccci = t7xx_port_alloc_skb(chunk_len);
-		if (!skb_ccci)
-			return -ENOMEM;
-
-		skb_put_data(skb_ccci, skb->data + offset, chunk_len);
-		ret = t7xx_port_send_skb(port_private, skb_ccci, 0, 0);
+	while (cur) {
+		cloned = skb_clone(cur, GFP_KERNEL);
+		cloned->len = skb_headlen(cur);
+		ret = t7xx_port_send_skb(port_private, cloned, 0, 0);
 		if (ret) {
-			dev_kfree_skb_any(skb_ccci);
+			dev_kfree_skb(cloned);
 			dev_err(port_private->dev, "Write error on %s port, %d\n",
 				port_conf->name, ret);
-			return ret;
+			return cnt ? cnt + ret : ret;
 		}
+		cnt += cur->len;
+		if (cur == skb)
+			cur = skb_shinfo(skb)->frag_list;
+		else
+			cur = cur->next;
 	}
 
 	dev_kfree_skb(skb);
@@ -154,13 +152,17 @@ static int t7xx_port_wwan_disable_chl(struct t7xx_port *port)
 static void t7xx_port_wwan_md_state_notify(struct t7xx_port *port, unsigned int state)
 {
 	const struct t7xx_port_conf *port_conf = port->port_conf;
+	unsigned int header_len = sizeof(struct ccci_header);
+	struct wwan_port_caps caps;
 
 	if (state != MD_STATE_READY)
 		return;
 
 	if (!port->wwan.wwan_port) {
+		caps.frag_len = CLDMA_MTU - header_len;
+		caps.headroom_len = header_len;
 		port->wwan.wwan_port = wwan_create_port(port->dev, port_conf->port_type,
-							&wwan_ops, port);
+							&wwan_ops, &caps, port);
 		if (IS_ERR(port->wwan.wwan_port))
 			dev_err(port->dev, "Unable to create WWWAN port %s", port_conf->name);
 	}
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 966d0ccd2276..2e1c01cf00a9 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -67,6 +67,8 @@ struct wwan_device {
  * @rxq: Buffer inbound queue
  * @waitqueue: The waitqueue for port fops (read/write/poll)
  * @data_lock: Port specific data access serialization
+ * @headroom_len: SKB reserved headroom size
+ * @frag_len: Length to fragment packet
  * @at_data: AT port specific data
  */
 struct wwan_port {
@@ -79,6 +81,8 @@ struct wwan_port {
 	struct sk_buff_head rxq;
 	wait_queue_head_t waitqueue;
 	struct mutex data_lock;	/* Port specific data access serialization */
+	size_t headroom_len;
+	size_t frag_len;
 	union {
 		struct {
 			struct ktermios termios;
@@ -426,6 +430,7 @@ static int __wwan_port_dev_assign_name(struct wwan_port *port, const char *fmt)
 struct wwan_port *wwan_create_port(struct device *parent,
 				   enum wwan_port_type type,
 				   const struct wwan_port_ops *ops,
+				   struct wwan_port_caps *caps,
 				   void *drvdata)
 {
 	struct wwan_device *wwandev;
@@ -459,6 +464,8 @@ struct wwan_port *wwan_create_port(struct device *parent,
 
 	port->type = type;
 	port->ops = ops;
+	port->frag_len = caps ? caps->frag_len : SIZE_MAX;
+	port->headroom_len = caps ? caps->headroom_len : 0;
 	mutex_init(&port->ops_lock);
 	skb_queue_head_init(&port->rxq);
 	init_waitqueue_head(&port->waitqueue);
@@ -702,30 +709,53 @@ static ssize_t wwan_port_fops_read(struct file *filp, char __user *buf,
 static ssize_t wwan_port_fops_write(struct file *filp, const char __user *buf,
 				    size_t count, loff_t *offp)
 {
+	struct sk_buff *skb, *head = NULL, *tail = NULL;
 	struct wwan_port *port = filp->private_data;
-	struct sk_buff *skb;
+	size_t frag_len, remain = count;
 	int ret;
 
 	ret = wwan_wait_tx(port, !!(filp->f_flags & O_NONBLOCK));
 	if (ret)
 		return ret;
 
-	skb = alloc_skb(count, GFP_KERNEL);
-	if (!skb)
-		return -ENOMEM;
+	do {
+		frag_len = min(remain, port->frag_len);
+		skb = alloc_skb(frag_len + port->headroom_len, GFP_KERNEL);
+		if (!skb) {
+			ret = -ENOMEM;
+			goto freeskb;
+		}
+		skb_reserve(skb, port->headroom_len);
+
+		if (!head) {
+			head = skb;
+		} else if (!tail) {
+			skb_shinfo(head)->frag_list = skb;
+			tail = skb;
+		} else {
+			tail->next = skb;
+			tail = skb;
+		}
 
-	if (copy_from_user(skb_put(skb, count), buf, count)) {
-		kfree_skb(skb);
-		return -EFAULT;
-	}
+		if (copy_from_user(skb_put(skb, frag_len), buf + count - remain, frag_len)) {
+			ret = -EFAULT;
+			goto freeskb;
+		}
 
-	ret = wwan_port_op_tx(port, skb, !!(filp->f_flags & O_NONBLOCK));
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
+		if (skb != head) {
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+	} while (remain -= frag_len);
+
+	ret = wwan_port_op_tx(port, head, !!(filp->f_flags & O_NONBLOCK));
+	if (!ret)
+		return count;
 
-	return count;
+freeskb:
+	kfree_skb(head);
+	return ret;
 }
 
 static __poll_t wwan_port_fops_poll(struct file *filp, poll_table *wait)
diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c
index 2397a903d8f5..dfbdaa259a3f 100644
--- a/drivers/net/wwan/wwan_hwsim.c
+++ b/drivers/net/wwan/wwan_hwsim.c
@@ -205,7 +205,7 @@ static struct wwan_hwsim_port *wwan_hwsim_port_new(struct wwan_hwsim_dev *dev)
 
 	port->wwan = wwan_create_port(&dev->dev, WWAN_PORT_AT,
 				      &wwan_hwsim_port_ops,
-				      port);
+				      NULL, port);
 	if (IS_ERR(port->wwan)) {
 		err = PTR_ERR(port->wwan);
 		goto err_free_port;
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 1f0951be15ab..c553decb5461 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -929,7 +929,8 @@ static void wdm_wwan_init(struct wdm_device *desc)
 		return;
 	}
 
-	port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops, desc);
+	port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops,
+				NULL, desc);
 	if (IS_ERR(port)) {
 		dev_err(&intf->dev, "%s: Unable to create WWAN port\n",
 			dev_name(intf->usb_dev));
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 24d76500b1cc..01fa15506286 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -64,11 +64,21 @@ struct wwan_port_ops {
 			    poll_table *wait);
 };
 
+/** struct wwan_port_caps - The WWAN port capbilities
+ * @frag_len: WWAN port TX fragments length
+ * @headroom_len: WWAN port TX fragments reserved headroom length
+ */
+struct wwan_port_caps {
+	size_t frag_len;
+	unsigned int headroom_len;
+};
+
 /**
  * wwan_create_port - Add a new WWAN port
  * @parent: Device to use as parent and shared by all WWAN ports
  * @type: WWAN port type
  * @ops: WWAN port operations
+ * @caps: WWAN port capabilities
  * @drvdata: Pointer to caller driver data
  *
  * Allocate and register a new WWAN port. The port will be automatically exposed
@@ -86,6 +96,7 @@ struct wwan_port_ops {
 struct wwan_port *wwan_create_port(struct device *parent,
 				   enum wwan_port_type type,
 				   const struct wwan_port_ops *ops,
+				   struct wwan_port_caps *caps,
 				   void *drvdata);
 
 /**
-- 
cgit v1.2.3


From c641e9279f3530aa2fe4bcb250477b555b75104a Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:55 +0200
Subject: vxlan: Expose helper vxlan_build_gbp_hdr

The function vxlan_build_gbp_hdr will be used by other modules to build
gbp option in vxlan header according to gbp flags.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Gavi Teitz <gavi@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_core.c | 19 -------------------
 include/net/vxlan.h            | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 8cc147c0d069..561fe1b314f5 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2093,25 +2093,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 	return false;
 }
 
-static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, struct vxlan_metadata *md)
-{
-	struct vxlanhdr_gbp *gbp;
-
-	if (!md->gbp)
-		return;
-
-	gbp = (struct vxlanhdr_gbp *)vxh;
-	vxh->vx_flags |= VXLAN_HF_GBP;
-
-	if (md->gbp & VXLAN_GBP_DONT_LEARN)
-		gbp->dont_learn = 1;
-
-	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
-		gbp->policy_applied = 1;
-
-	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
-}
-
 static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol)
 {
 	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b7b2e9abfb37..20bd7d893e10 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -572,4 +572,23 @@ static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh,
 	return true;
 }
 
+static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const struct vxlan_metadata *md)
+{
+	struct vxlanhdr_gbp *gbp;
+
+	if (!md->gbp)
+		return;
+
+	gbp = (struct vxlanhdr_gbp *)vxh;
+	vxh->vx_flags |= VXLAN_HF_GBP;
+
+	if (md->gbp & VXLAN_GBP_DONT_LEARN)
+		gbp->dont_learn = 1;
+
+	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
+		gbp->policy_applied = 1;
+
+	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
+}
+
 #endif
-- 
cgit v1.2.3


From bc9d003dc48c381763c3a6309bfc5eecf9962b9c Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:57 +0200
Subject: ip_tunnel: Preserve pointer const in ip_tunnel_info_opts

Change ip_tunnel_info_opts( ) from static function to macro to cast return
value and preserve the const-ness of the pointer.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fca357679816..255b32a90850 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -67,6 +67,12 @@ struct ip_tunnel_key {
 	GENMASK((sizeof_field(struct ip_tunnel_info,		\
 			      options_len) * BITS_PER_BYTE) - 1, 0)
 
+#define ip_tunnel_info_opts(info)				\
+	_Generic(info,						\
+		 const struct ip_tunnel_info * : ((const void *)((info) + 1)),\
+		 struct ip_tunnel_info * : ((void *)((info) + 1))\
+	)
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 #ifdef CONFIG_DST_CACHE
@@ -485,11 +491,6 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 	}
 }
 
-static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
-{
-	return info + 1;
-}
-
 static inline void ip_tunnel_info_opts_get(void *to,
 					   const struct ip_tunnel_info *info)
 {
-- 
cgit v1.2.3


From 6ee44c518159c364e5d30ed85d357fe1d8e2c141 Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:58 +0200
Subject: net/mlx5e: TC, Add support for VxLAN GBP encap/decap flows offload

Add HW offloading support for TC flows with VxLAN GBP encap/decap.

Example of encap rule:
tc filter add dev eth0 protocol ip ingress flower \
    action tunnel_key set id 42 vxlan_opts 512 \
    action mirred egress redirect dev vxlan1

Example of decap rule:
tc filter add dev vxlan1 protocol ip ingress flower \
    enc_key_id 42 enc_dst_port 4789 vxlan_opts 1024 \
    action tunnel_key unset action mirred egress redirect dev eth0

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Gavi Teitz <gavi@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c  | 72 +++++++++++++++++++++-
 include/linux/mlx5/device.h                        |  6 ++
 include/linux/mlx5/mlx5_ifc.h                      | 13 +++-
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
index 1f62c702b625..a184d739d5f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2018 Mellanox Technologies. */
 
+#include <net/ip_tunnels.h>
 #include <net/vxlan.h>
 #include "lib/vxlan.h"
 #include "en/tc_tun.h"
@@ -86,9 +87,11 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
 	__be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
 	struct udphdr *udp = (struct udphdr *)(buf);
+	const struct vxlan_metadata *md;
 	struct vxlanhdr *vxh;
 
-	if (tun_key->tun_flags & TUNNEL_VXLAN_OPT)
+	if ((tun_key->tun_flags & TUNNEL_VXLAN_OPT) &&
+	    e->tun_info->options_len != sizeof(*md))
 		return -EOPNOTSUPP;
 	vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr));
 	*ip_proto = IPPROTO_UDP;
@@ -96,6 +99,57 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	udp->dest = tun_key->tp_dst;
 	vxh->vx_flags = VXLAN_HF_VNI;
 	vxh->vx_vni = vxlan_vni_field(tun_id);
+	if (tun_key->tun_flags & TUNNEL_VXLAN_OPT) {
+		md = ip_tunnel_info_opts(e->tun_info);
+		vxlan_build_gbp_hdr(vxh, md);
+	}
+
+	return 0;
+}
+
+static int mlx5e_tc_tun_parse_vxlan_gbp_option(struct mlx5e_priv *priv,
+					       struct mlx5_flow_spec *spec,
+					       struct flow_cls_offload *f)
+{
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+	struct netlink_ext_ack *extack = f->common.extack;
+	struct flow_match_enc_opts enc_opts;
+	void *misc5_c, *misc5_v;
+	u32 *gbp, *gbp_mask;
+
+	flow_rule_match_enc_opts(rule, &enc_opts);
+
+	if (memchr_inv(&enc_opts.mask->data, 0, sizeof(enc_opts.mask->data)) &&
+	    !MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(priv->mdev, tunnel_header_0_1)) {
+		NL_SET_ERR_MSG_MOD(extack, "Matching on VxLAN GBP is not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (enc_opts.key->dst_opt_type != TUNNEL_VXLAN_OPT) {
+		NL_SET_ERR_MSG_MOD(extack, "Wrong VxLAN option type: not GBP");
+		return -EOPNOTSUPP;
+	}
+
+	if (enc_opts.key->len != sizeof(*gbp) ||
+	    enc_opts.mask->len != sizeof(*gbp_mask)) {
+		NL_SET_ERR_MSG_MOD(extack, "VxLAN GBP option/mask len is not 32 bits");
+		return -EINVAL;
+	}
+
+	gbp = (u32 *)&enc_opts.key->data[0];
+	gbp_mask = (u32 *)&enc_opts.mask->data[0];
+
+	if (*gbp_mask & ~VXLAN_GBP_MASK) {
+		NL_SET_ERR_MSG_FMT_MOD(extack, "Wrong VxLAN GBP mask(0x%08X)\n", *gbp_mask);
+		return -EINVAL;
+	}
+
+	misc5_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_5);
+	misc5_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_5);
+	MLX5_SET(fte_match_set_misc5, misc5_c, tunnel_header_0, *gbp_mask);
+	MLX5_SET(fte_match_set_misc5, misc5_v, tunnel_header_0, *gbp);
+
+	spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5;
 
 	return 0;
 }
@@ -122,6 +176,14 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 	if (!enc_keyid.mask->keyid)
 		return 0;
 
+	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
+		int err;
+
+		err = mlx5e_tc_tun_parse_vxlan_gbp_option(priv, spec, f);
+		if (err)
+			return err;
+	}
+
 	/* match on VNI is required */
 
 	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev,
@@ -143,6 +205,12 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 	return 0;
 }
 
+static bool mlx5e_tc_tun_encap_info_equal_vxlan(struct mlx5e_encap_key *a,
+						struct mlx5e_encap_key *b)
+{
+	return mlx5e_tc_tun_encap_info_equal_options(a, b, TUNNEL_VXLAN_OPT);
+}
+
 static int mlx5e_tc_tun_get_remote_ifindex(struct net_device *mirred_dev)
 {
 	const struct vxlan_dev *vxlan = netdev_priv(mirred_dev);
@@ -160,6 +228,6 @@ struct mlx5e_tc_tunnel vxlan_tunnel = {
 	.generate_ip_tun_hdr  = mlx5e_gen_ip_tunnel_header_vxlan,
 	.parse_udp_ports      = mlx5e_tc_tun_parse_udp_ports_vxlan,
 	.parse_tunnel         = mlx5e_tc_tun_parse_vxlan,
-	.encap_info_equal     = mlx5e_tc_tun_encap_info_equal_generic,
+	.encap_info_equal     = mlx5e_tc_tun_encap_info_equal_vxlan,
 	.get_remote_ifindex   = mlx5e_tc_tun_get_remote_ifindex,
 };
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 71b06ebad402..af4dd536a52c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1357,6 +1357,12 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap)
 
+#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, ft_field_support_2_esw_fdb.cap)
+
+#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, ft_field_support_2_esw_fdb.cap)
+
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
 		 mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d2c164f0778c..e47d6c58da35 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -404,10 +404,13 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         metadata_reg_c_0[0x1];
 };
 
+/* Table 2170 - Flow Table Fields Supported 2 Format */
 struct mlx5_ifc_flow_table_fields_supported_2_bits {
 	u8         reserved_at_0[0xe];
 	u8         bth_opcode[0x1];
-	u8         reserved_at_f[0x11];
+	u8         reserved_at_f[0x1];
+	u8         tunnel_header_0_1[0x1];
+	u8         reserved_at_11[0xf];
 
 	u8         reserved_at_20[0x60];
 };
@@ -895,7 +898,13 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress;
 
-	u8      reserved_at_800[0x1000];
+	u8      reserved_at_800[0xC00];
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_esw_fdb;
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_esw_fdb;
+
+	u8      reserved_at_1500[0x300];
 
 	u8      sw_steering_fdb_action_drop_icm_address_rx[0x40];
 
-- 
cgit v1.2.3


From 94c540fbfc80ef95ec398fc77da1920de2946edb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:30 +0000
Subject: udp: preserve const qualifier in udp_sk()

We can change udp_sk() to propagate const qualifier of its argument,
thanks to container_of_const()

This should avoid some potential errors caused by accidental
(const -> not_const) promotion.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a2892e151644..43c1fb2d2c21 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -97,10 +97,7 @@ struct udp_sock {
 
 #define UDP_MAX_SEGMENTS	(1 << 6UL)
 
-static inline struct udp_sock *udp_sk(const struct sock *sk)
-{
-	return (struct udp_sock *)sk;
-}
+#define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
 
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
-- 
cgit v1.2.3


From 0a2db4630b72486ec2f207ae433c2156b7fd3837 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:32 +0000
Subject: raw: preserve const qualifier in raw_sk()

We can change raw_sk() to propagate const qualifier of its argument,
thanks to container_of_const()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 7ad15830cf38..c215af02f758 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -83,10 +83,7 @@ struct raw_sock {
 	u32		   ipmr_table;
 };
 
-static inline struct raw_sock *raw_sk(const struct sock *sk)
-{
-	return (struct raw_sock *)sk;
-}
+#define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk)
 
 static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
 				       int dif, int sdif)
-- 
cgit v1.2.3


From 47fcae28b9ec409423ba7f67f93e8345acce8a36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:33 +0000
Subject: ipv6: raw: preserve const qualifier in raw6_sk()

We can change raw6_sk() to propagate its argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 37dfdcfcdd54..839247a4f48e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -336,10 +336,7 @@ static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
 	return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL;
 }
 
-static inline struct raw6_sock *raw6_sk(const struct sock *sk)
-{
-	return (struct raw6_sock *)sk;
-}
+#define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk)
 
 #define ipv6_only_sock(sk)	(sk->sk_ipv6only)
 #define ipv6_sk_rxinfo(sk)	((sk)->sk_family == PF_INET6 && \
-- 
cgit v1.2.3


From ae6084b739925d84ef4a481d8eaa2187455f2e2f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:34 +0000
Subject: dccp: preserve const qualifier in dccp_sk()

We can change dccp_sk() to propagate its argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 07e547c02fd8..325af611909f 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -305,10 +305,8 @@ struct dccp_sock {
 	struct timer_list		dccps_xmit_timer;
 };
 
-static inline struct dccp_sock *dccp_sk(const struct sock *sk)
-{
-	return (struct dccp_sock *)sk;
-}
+#define dccp_sk(ptr)	container_of_const(ptr, struct dccp_sock, \
+					   dccps_inet_connection.icsk_inet.sk)
 
 static inline const char *dccp_role(const struct sock *sk)
 {
-- 
cgit v1.2.3


From b064ba9c3cfaf3d22d6153ec3c245eaa4d913674 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:35 +0000
Subject: af_unix: preserve const qualifier in unix_sk()

We can change unix_sk() to propagate its argument const qualifier,
thanks to container_of_const().

We need to change dump_common_audit_data() 'struct unix_sock *u'
local var to get a const attribute.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h | 5 +----
 security/lsm_audit.c  | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 45ebde587138..824c258143a3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -74,10 +74,7 @@ struct unix_sock {
 #endif
 };
 
-static inline struct unix_sock *unix_sk(const struct sock *sk)
-{
-	return (struct unix_sock *)sk;
-}
+#define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk)
 
 #define peer_wait peer_wq.wait
 
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 00d3bdd386e2..368e77ca43c4 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -310,7 +310,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 	case LSM_AUDIT_DATA_NET:
 		if (a->u.net->sk) {
 			const struct sock *sk = a->u.net->sk;
-			struct unix_sock *u;
+			const struct unix_sock *u;
 			struct unix_address *addr;
 			int len = 0;
 			char *p = NULL;
-- 
cgit v1.2.3


From c7154ca8e075cc456fe773879263b33ae307a59e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:37 +0000
Subject: x25: preserve const qualifier in [a]x25_sk()

We can change [a]x25_sk() to propagate their argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ax25.h | 5 +----
 include/net/x25.h  | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index f8cf3629a419..0d939e5aee4e 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -260,10 +260,7 @@ struct ax25_sock {
 	struct ax25_cb		*cb;
 };
 
-static inline struct ax25_sock *ax25_sk(const struct sock *sk)
-{
-	return (struct ax25_sock *) sk;
-}
+#define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk)
 
 static inline struct ax25_cb *sk_to_ax25(const struct sock *sk)
 {
diff --git a/include/net/x25.h b/include/net/x25.h
index d7d6c2b4ffa7..597eb53c471e 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -177,10 +177,7 @@ struct x25_forward {
 	atomic_t		refcnt;
 };
 
-static inline struct x25_sock *x25_sk(const struct sock *sk)
-{
-	return (struct x25_sock *)sk;
-}
+#define x25_sk(ptr) container_of_const(ptr, struct x25_sock, sk)
 
 /* af_x25.c */
 extern int  sysctl_x25_restart_request_timeout;
-- 
cgit v1.2.3


From e9d9da91548b21e189fcd0259a0f2d26d1afc509 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:39 +0000
Subject: tcp: preserve const qualifier in tcp_sk()

We can change tcp_sk() to propagate its argument const qualifier,
thanks to container_of_const().

We have two places where a const sock pointer has to be upgraded
to a write one. We have been using const qualifier for lockless
listeners to clearly identify points where writes could happen.

Add tcp_sk_rw() helper to better document these.

tcp_inbound_md5_hash(), __tcp_grow_window(), tcp_reset_check()
and tcp_rack_reo_wnd() get an additional const qualififer
for their @tp local variables.

smc_check_reset_syn_req() also needs a similar change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 10 ++++++----
 include/net/tcp.h        |  2 +-
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_input.c     |  4 ++--
 net/ipv4/tcp_minisocks.c |  5 +++--
 net/ipv4/tcp_output.c    |  9 +++++++--
 net/ipv4/tcp_recovery.c  |  2 +-
 7 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca7f05a130d2..b4c08ac86983 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -472,10 +472,12 @@ enum tsq_flags {
 	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
 };
 
-static inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
+#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+
+/* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket.
+ * Used in context of (lockless) tcp listeners.
+ */
+#define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
 
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..a0a91a988272 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -529,7 +529,7 @@ static inline void tcp_synq_overflow(const struct sock *sk)
 
 	last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
 	if (!time_between32(now, last_overflow, last_overflow + HZ))
-		WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
+		WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now);
 }
 
 /* syncookies: no recent synqueue overflow on this listening socket? */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 01569de651b6..fd68d49490f2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4570,7 +4570,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	const __u8 *hash_location = NULL;
 	struct tcp_md5sig_key *hash_expected;
 	const struct tcphdr *th = tcp_hdr(skb);
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 	int genhash, l3index;
 	u8 newhash[16];
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 754ddbe0577f..2b75cd9e2e92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -458,7 +458,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
 			     unsigned int skbtruesize)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
 	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
@@ -5693,7 +5693,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
  */
 static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
 			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9a7ef7732c24..dac0d62120e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -463,7 +463,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
 
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
 				    struct request_sock *req,
 				    struct tcp_sock *newtp)
 {
@@ -492,7 +492,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_request_sock *treq = tcp_rsk(req);
 	struct inet_connection_sock *newicsk;
-	struct tcp_sock *oldtp, *newtp;
+	const struct tcp_sock *oldtp;
+	struct tcp_sock *newtp;
 	u32 seq;
 
 	if (!newsk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1e743b39a83..cfe128b81a01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4127,8 +4127,13 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	if (!res) {
 		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		if (unlikely(tcp_passive_fastopen(sk)))
-			tcp_sk(sk)->total_retrans++;
+		if (unlikely(tcp_passive_fastopen(sk))) {
+			/* sk has const attribute because listeners are lockless.
+			 * However in this case, we are dealing with a passive fastopen
+			 * socket thus we can change total_retrans value.
+			 */
+			tcp_sk_rw(sk)->total_retrans++;
+		}
 		trace_tcp_retransmit_synack(sk, req);
 	}
 	return res;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 50abaa941387..acf4869c5d3b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -4,7 +4,7 @@
 
 static u32 tcp_rack_reo_wnd(const struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!tp->reord_seen) {
 		/* If reordering has not been observed, be aggressive during
-- 
cgit v1.2.3


From 7ce93729091dff24e6a1c3578b58b36f4b1cf10a Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 10 Mar 2023 16:27:28 -0500
Subject: dyndbg: cleanup dynamic usage in ib_srp.c

Currently, in dynamic_debug.h we only provide
DEFINE_DYNAMIC_DEBUG_METADATA() and DYNAMIC_DEBUG_BRANCH()
definitions if CONFIG_DYNAMIC_CORE is enabled. Thus, drivers
such as infiniband srp (see: drivers/infiniband/ulp/srp/ib_srp.c)
must provide their own definitions for !CONFIG_DYNAMIC_CORE.

Thus, let's move this !CONFIG_DYNAMIC_CORE case into dynamic_debug.h.
However, the dynamic debug interfaces should really only be defined
if CONFIG_DYNAMIC_DEBUG is set or CONFIG_DYNAMIC_CORE is set along
with DYNAMIC_DEBUG_MODULE, (see:
Documentation/admin-guide/dynamic-debug-howto.rst). Thus, the
undefined case becomes: !((CONFIG_DYNAMIC_DEBUG ||
(CONFIG_DYNAMIC_CORE && DYNAMIC_DEBUG_MODULE)).
With those changes in place, we can remove the !CONFIG_DYNAMIC_CORE
case from ib_srp.c

This change was prompted by a build breakeage in ib_srp.c stemming
from the inclusion of dynamic_debug.h unconditionally in module.h, due
to commit 7deabd674988 ("dyndbg: use the module notifier callbacks").
In that case, if we have CONFIG_DYNAMIC_CORE=y and
CONFIG_DYNAMIC_DEBUG=n then the definitions for
DEFINE_DYNAMIC_DEBUG_METADATA() and DYNAMIC_DEBUG_BRANCH() are defined
once in ib_srp.c and then again in the dynamic_debug.h. This had been
working prior to the above referenced commit because dynamic_debug.h
was only pulled into ib_srp.c conditinally via printk.h if
CONFIG_DYNAMIC_DEBUG was set.

Also, the exported functions in lib/dynamic_debug.c itself may
not have a prototype if CONFIG_DYNAMIC_DEBUG=n and
CONFIG_DYNAMIC_CORE=y. This would trigger the -Wmissing-prototypes
warning.

The exported functions are behind (include/linux/dynamic_debug.h):

if defined(CONFIG_DYNAMIC_DEBUG) || \
 (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))

Thus, by adding -DDYNAMIC_CONFIG_MODULE to the lib/Makefile we
can ensure that the exported functions have a prototype in all cases,
since lib/dynamic_debug.c is built whenever
CONFIG_DYNAMIC_DEBUG_CORE=y.

Fixes: 7deabd674988 ("dyndbg: use the module notifier callbacks")
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202303071444.sIbZTDCy-lkp@intel.com/
Signed-off-by: Jason Baron <jbaron@akamai.com>
[mcgrof: adjust commit log, and remove urldefense from URL]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 drivers/infiniband/ulp/srp/ib_srp.c |  5 ----
 include/linux/dynamic_debug.h       | 59 +++++++++++++++++++++++--------------
 lib/Makefile                        |  3 ++
 3 files changed, 40 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index df21b30b7735..40ba2c6927c6 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -62,11 +62,6 @@ MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator");
 MODULE_LICENSE("Dual BSD/GPL");
 
-#if !defined(CONFIG_DYNAMIC_DEBUG)
-#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)
-#define DYNAMIC_DEBUG_BRANCH(descriptor) false
-#endif
-
 static unsigned int srp_sg_tablesize;
 static unsigned int cmd_sg_entries;
 static unsigned int indirect_sg_entries;
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index f401414341fb..061dd84d09f3 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -128,14 +128,16 @@ struct ddebug_class_param {
 	const struct ddebug_class_map *map;
 };
 
-#if defined(CONFIG_DYNAMIC_DEBUG_CORE)
+/*
+ * pr_debug() and friends are globally enabled or modules have selectively
+ * enabled them.
+ */
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
 
 extern __printf(2, 3)
 void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...);
 
-extern int ddebug_dyndbg_module_param_cb(char *param, char *val,
-					const char *modname);
-
 struct device;
 
 extern __printf(3, 4)
@@ -284,10 +286,6 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 				   KERN_DEBUG, prefix_str, prefix_type,	\
 				   rowsize, groupsize, buf, len, ascii)
 
-struct kernel_param;
-int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp);
-int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp);
-
 /* for test only, generally expect drm.debug style macro wrappers */
 #define __pr_debug_cls(cls, fmt, ...) do {			\
 	BUILD_BUG_ON_MSG(!__builtin_constant_p(cls),		\
@@ -295,23 +293,14 @@ int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp);
 	dynamic_pr_debug_cls(cls, fmt, ##__VA_ARGS__);		\
 	} while (0)
 
-#else /* !CONFIG_DYNAMIC_DEBUG_CORE */
+#else /* !(CONFIG_DYNAMIC_DEBUG || (CONFIG_DYNAMIC_DEBUG_CORE && DYNAMIC_DEBUG_MODULE)) */
 
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/printk.h>
 
-static inline int ddebug_dyndbg_module_param_cb(char *param, char *val,
-						const char *modname)
-{
-	if (!strcmp(param, "dyndbg")) {
-		/* avoid pr_warn(), which wants pr_fmt() fully defined */
-		printk(KERN_WARNING "dyndbg param is supported only in "
-			"CONFIG_DYNAMIC_DEBUG builds\n");
-		return 0; /* allow and ignore */
-	}
-	return -EINVAL;
-}
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)
+#define DYNAMIC_DEBUG_BRANCH(descriptor) false
 
 #define dynamic_pr_debug(fmt, ...)					\
 	do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0)
@@ -324,14 +313,40 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val,
 				rowsize, groupsize, buf, len, ascii);	\
 	} while (0)
 
+#endif /* CONFIG_DYNAMIC_DEBUG || (CONFIG_DYNAMIC_DEBUG_CORE && DYNAMIC_DEBUG_MODULE) */
+
+
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+
+extern int ddebug_dyndbg_module_param_cb(char *param, char *val,
+					const char *modname);
+struct kernel_param;
+int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp);
+int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp);
+
+#else
+
+static inline int ddebug_dyndbg_module_param_cb(char *param, char *val,
+						const char *modname)
+{
+	if (!strcmp(param, "dyndbg")) {
+		/* avoid pr_warn(), which wants pr_fmt() fully defined */
+		printk(KERN_WARNING "dyndbg param is supported only in "
+			"CONFIG_DYNAMIC_DEBUG builds\n");
+		return 0; /* allow and ignore */
+	}
+	return -EINVAL;
+}
+
 struct kernel_param;
 static inline int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp)
 { return 0; }
 static inline int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp)
 { return 0; }
 
-#endif /* !CONFIG_DYNAMIC_DEBUG_CORE */
+#endif
+
 
 extern const struct kernel_param_ops param_ops_dyndbg_classes;
 
-#endif
+#endif /* _DYNAMIC_DEBUG_H */
diff --git a/lib/Makefile b/lib/Makefile
index baf2821f7a00..7afcd85f78f6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -231,6 +231,9 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
 
 obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
+#ensure exported functions have prototypes
+CFLAGS_dynamic_debug.o := -DDYNAMIC_DEBUG_MODULE
+
 obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o
 
 obj-$(CONFIG_NLATTR) += nlattr.o
-- 
cgit v1.2.3


From 3703bd54cd37e7875f51ece8df8c85c184e40bba Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 8 Mar 2023 15:38:46 +0800
Subject: kallsyms: Delete an unused parameter related to
 {module_}kallsyms_on_each_symbol()

The parameter 'struct module *' in the hook function associated with
{module_}kallsyms_on_each_symbol() is no longer used. Delete it.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/kallsyms.h   | 7 +++----
 include/linux/module.h     | 6 ++----
 kernel/kallsyms.c          | 5 ++---
 kernel/kallsyms_selftest.c | 6 +++---
 kernel/livepatch/core.c    | 3 +--
 kernel/module/kallsyms.c   | 5 ++---
 kernel/trace/ftrace.c      | 3 +--
 7 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 0065209cc004..fe3c9993b5bf 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -67,8 +67,7 @@ static inline void *dereference_symbol_descriptor(void *ptr)
 
 #ifdef CONFIG_KALLSYMS
 unsigned long kallsyms_sym_address(int idx);
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
-				      unsigned long),
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
 			    void *data);
 int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
 				  const char *name, void *data);
@@ -166,8 +165,8 @@ static inline bool kallsyms_show_value(const struct cred *cred)
 	return false;
 }
 
-static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
-					  unsigned long), void *data)
+static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
+					  void *data)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/include/linux/module.h b/include/linux/module.h
index c3b357196470..d2d900077bde 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -965,13 +965,11 @@ static inline bool module_sig_ok(struct module *module)
 
 #if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS)
 int module_kallsyms_on_each_symbol(const char *modname,
-				   int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
+				   int (*fn)(void *, const char *, unsigned long),
 				   void *data);
 #else
 static inline int module_kallsyms_on_each_symbol(const char *modname,
-						 int (*fn)(void *, const char *,
-						 struct module *, unsigned long),
+						 int (*fn)(void *, const char *, unsigned long),
 						 void *data)
 {
 	return -EOPNOTSUPP;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 83f499182c9a..77747391f49b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -288,8 +288,7 @@ unsigned long kallsyms_lookup_name(const char *name)
  * Iterate over all symbols in vmlinux.  For symbols from modules use
  * module_kallsyms_on_each_symbol instead.
  */
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
-				      unsigned long),
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
 			    void *data)
 {
 	char namebuf[KSYM_NAME_LEN];
@@ -299,7 +298,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 
 	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
 		off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
-		ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
+		ret = fn(data, namebuf, kallsyms_sym_address(i));
 		if (ret != 0)
 			return ret;
 		cond_resched();
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index bfbc12da3326..a2e3745d15c4 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -95,7 +95,7 @@ static struct test_item test_items[] = {
 
 static char stub_name[KSYM_NAME_LEN];
 
-static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
 {
 	*(u32 *)data += strlen(name);
 
@@ -154,7 +154,7 @@ static void test_kallsyms_compression_ratio(void)
 	pr_info(" ---------------------------------------------------------\n");
 }
 
-static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
+static int lookup_name(void *data, const char *name, unsigned long addr)
 {
 	u64 t0, t1, t;
 	struct test_stat *stat = (struct test_stat *)data;
@@ -207,7 +207,7 @@ static bool match_cleanup_name(const char *s, const char *name)
 	return !strncmp(s, name, len);
 }
 
-static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr)
+static int find_symbol(void *data, const char *name, unsigned long addr)
 {
 	struct test_stat *stat = (struct test_stat *)data;
 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..1de2c40cc378 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -142,8 +142,7 @@ static int klp_match_callback(void *data, unsigned long addr)
 	return 0;
 }
 
-static int klp_find_callback(void *data, const char *name,
-			     struct module *mod, unsigned long addr)
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
 {
 	struct klp_find_arg *args = data;
 
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 8e89f459ffdd..3320586584f1 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -503,8 +503,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 }
 
 int module_kallsyms_on_each_symbol(const char *modname,
-				   int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
+				   int (*fn)(void *, const char *, unsigned long),
 				   void *data)
 {
 	struct module *mod;
@@ -533,7 +532,7 @@ int module_kallsyms_on_each_symbol(const char *modname,
 				continue;
 
 			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
-				 mod, kallsyms_symbol_value(sym));
+				 kallsyms_symbol_value(sym));
 			if (ret != 0)
 				goto out;
 		}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 29baa97d0d53..76caca8f496a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8391,8 +8391,7 @@ struct kallsyms_data {
  * and returns 1 in case we resolved all the requested symbols,
  * 0 otherwise.
  */
-static int kallsyms_callback(void *data, const char *name,
-			     struct module *mod, unsigned long addr)
+static int kallsyms_callback(void *data, const char *name, unsigned long addr)
 {
 	struct kallsyms_data *args = data;
 	const char **sym;
-- 
cgit v1.2.3


From a17b738cdefb77ffb9ce0cade430557a8bc1df10 Mon Sep 17 00:00:00 2001
From: Emanuel Strobel <emanuel.strobel@yahoo.com>
Date: Fri, 3 Feb 2023 10:34:05 +0100
Subject: media: rc: add common keymap for Dreambox RC10/RC0 and RC20/RC-BT
 remotes

Add a common keymap for the RC10/RC0 and RC20/RC-BT remotes used with
the Dreambox One and Dreambox Two DVB-S/T boxes. The maps are combined
since the IR codes do not conflict and both boxes have shipped with
both remote designs over time.

Both remote types can be programmed to control TVs, so include non-IR
keys that are used to switch-to or toggle the remote mode:

- DREAM in RC10/RC0 switches to (Dreambox) STB control mode
- TV in RC10/RC0 switches to TV control mode
- MODE in RC20/RC-BT toggles between STB/TV/BT control modes

In the RC20 keymap the Android MIC (voice search) key maps to KEY_HELP
and EXIT is mapped to KEY_ESC to replicate the go-backwards navigation
behaviour in the Android vendor OS that ships on Dreambox devices.

Signed-off-by: Emanuel Strobel <emanuel.strobel@yahoo.com>
Signed-off-by: Christian Hewitt <christianshewitt@gmail.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/rc/keymaps/Makefile      |   1 +
 drivers/media/rc/keymaps/rc-dreambox.c | 151 +++++++++++++++++++++++++++++++++
 include/media/rc-map.h                 |   1 +
 3 files changed, 153 insertions(+)
 create mode 100644 drivers/media/rc/keymaps/rc-dreambox.c

(limited to 'include')

diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile
index f513ff5caf4e..6931c89fca99 100644
--- a/drivers/media/rc/keymaps/Makefile
+++ b/drivers/media/rc/keymaps/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_RC_MAP) += \
 			rc-dm1105-nec.o \
 			rc-dntv-live-dvb-t.o \
 			rc-dntv-live-dvbt-pro.o \
+			rc-dreambox.o \
 			rc-dtt200u.o \
 			rc-dvbsky.o \
 			rc-dvico-mce.o \
diff --git a/drivers/media/rc/keymaps/rc-dreambox.c b/drivers/media/rc/keymaps/rc-dreambox.c
new file mode 100644
index 000000000000..dea024fa3a22
--- /dev/null
+++ b/drivers/media/rc/keymaps/rc-dreambox.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2021 Emanuel Strobel <emanuel.strobel@yahoo.com>
+ */
+
+#include <media/rc-map.h>
+#include <linux/module.h>
+
+/*
+ * Keytable for Dreambox RC10/RC0 and RC20/RC-BT remote controls
+ *
+ * Keys that are not IR addressable:
+ *
+ * // DREAM switches to STB control mode
+ * // TV switches to TV control mode
+ * // MODE toggles STB/TV/BT control modes
+ *
+ */
+
+static struct rc_map_table dreambox[] = {
+	/* Dreambox RC10/RC0/RCU-BT remote */
+	{ 0x3200, KEY_POWER },
+
+	// DREAM
+	{ 0x3290, KEY_HELP },
+	// TV
+
+	{ 0x3201, KEY_1 },
+	{ 0x3202, KEY_2 },
+	{ 0x3203, KEY_3 },
+	{ 0x3204, KEY_4 },
+	{ 0x3205, KEY_5 },
+	{ 0x3206, KEY_6 },
+	{ 0x3207, KEY_7 },
+	{ 0x3208, KEY_8 },
+	{ 0x3209, KEY_9 },
+	{ 0x320a, KEY_PREVIOUS },
+	{ 0x320b, KEY_0 },
+	{ 0x320c, KEY_NEXT },
+
+	{ 0x321f, KEY_RED },
+	{ 0x3220, KEY_GREEN },
+	{ 0x3221, KEY_YELLOW },
+	{ 0x3222, KEY_BLUE },
+
+	{ 0x3210, KEY_INFO },
+	{ 0x3212, KEY_MENU },
+	{ 0x320e, KEY_AUDIO },
+	{ 0x3218, KEY_PVR },
+
+	{ 0x3213, KEY_LEFT },
+	{ 0x3211, KEY_UP },
+	{ 0x3215, KEY_RIGHT },
+	{ 0x3217, KEY_DOWN },
+	{ 0x3214, KEY_OK },
+
+	{ 0x3219, KEY_VOLUMEUP },
+	{ 0x321c, KEY_VOLUMEDOWN },
+
+	{ 0x321d, KEY_ESC }, // EXIT
+	{ 0x321a, KEY_MUTE },
+
+	{ 0x321b, KEY_PAGEUP },
+	{ 0x321e, KEY_PAGEDOWN },
+
+	{ 0x3223, KEY_PREVIOUSSONG },
+	{ 0x3224, KEY_PLAYPAUSE },
+	{ 0x3225, KEY_STOP },
+	{ 0x3226, KEY_NEXTSONG },
+
+	{ 0x3227, KEY_TV },
+	{ 0x3228, KEY_RADIO },
+	{ 0x3229, KEY_TEXT },
+	{ 0x322a, KEY_RECORD },
+
+	/* Dreambox RC20/RC-BT */
+	{ 0x3407, KEY_MUTE },
+	// MODE
+	{ 0x3401, KEY_POWER },
+
+	{ 0x3432, KEY_PREVIOUSSONG },
+	{ 0x3433, KEY_PLAYPAUSE },
+	{ 0x3435, KEY_NEXTSONG },
+
+	{ 0x3436, KEY_RECORD },
+	{ 0x3434, KEY_STOP },
+	{ 0x3425, KEY_TEXT },
+
+	{ 0x341f, KEY_RED },
+	{ 0x3420, KEY_GREEN },
+	{ 0x3421, KEY_YELLOW },
+	{ 0x3422, KEY_BLUE },
+
+	{ 0x341b, KEY_INFO },
+	{ 0x341c, KEY_MENU },
+	{ 0x3430, KEY_AUDIO },
+	{ 0x3431, KEY_PVR },
+
+	{ 0x3414, KEY_LEFT },
+	{ 0x3411, KEY_UP },
+	{ 0x3416, KEY_RIGHT },
+	{ 0x3419, KEY_DOWN },
+	{ 0x3415, KEY_OK },
+
+	{ 0x3413, KEY_VOLUMEUP },
+	{ 0x3418, KEY_VOLUMEDOWN },
+
+	{ 0x3412, KEY_ESC }, // EXIT
+	{ 0x3426, KEY_HELP }, // MIC
+
+	{ 0x3417, KEY_PAGEUP },
+	{ 0x341a, KEY_PAGEDOWN },
+
+	{ 0x3404, KEY_1 },
+	{ 0x3405, KEY_2 },
+	{ 0x3406, KEY_3 },
+	{ 0x3408, KEY_4 },
+	{ 0x3409, KEY_5 },
+	{ 0x340a, KEY_6 },
+	{ 0x340c, KEY_7 },
+	{ 0x340d, KEY_8 },
+	{ 0x340e, KEY_9 },
+	{ 0x340b, KEY_PREVIOUS },
+	{ 0x3410, KEY_0 },
+	{ 0x340f, KEY_NEXT },
+};
+
+static struct rc_map_list dreambox_map = {
+	.map = {
+		.scan     = dreambox,
+		.size     = ARRAY_SIZE(dreambox),
+		.rc_proto = RC_PROTO_NEC,
+		.name     = RC_MAP_DREAMBOX,
+	}
+};
+
+static int __init init_rc_map_dreambox(void)
+{
+	return rc_map_register(&dreambox_map);
+}
+
+static void __exit exit_rc_map_dreambox(void)
+{
+	rc_map_unregister(&dreambox_map);
+}
+
+module_init(init_rc_map_dreambox)
+module_exit(exit_rc_map_dreambox)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Emanuel Strobel <emanuel.strobel@yahoo.com>");
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index 793b54342dff..94ee968d4722 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -241,6 +241,7 @@ struct rc_map *rc_map_get(const char *name);
 #define RC_MAP_DM1105_NEC                "rc-dm1105-nec"
 #define RC_MAP_DNTV_LIVE_DVB_T           "rc-dntv-live-dvb-t"
 #define RC_MAP_DNTV_LIVE_DVBT_PRO        "rc-dntv-live-dvbt-pro"
+#define RC_MAP_DREAMBOX                  "rc-dreambox"
 #define RC_MAP_DTT200U                   "rc-dtt200u"
 #define RC_MAP_DVBSKY                    "rc-dvbsky"
 #define RC_MAP_DVICO_MCE		 "rc-dvico-mce"
-- 
cgit v1.2.3


From 12f9225882563f67b7c32489cc1f8f5088f65b28 Mon Sep 17 00:00:00 2001
From: Christian Hewitt <christianshewitt@gmail.com>
Date: Sat, 4 Feb 2023 08:45:01 +0100
Subject: media: rc: add Beelink Mini MXIII keymap

Add a keymap for the simple IR (NEC) remote used with the Beelink
Mini MXIII Android STB device.

Signed-off-by: Christian Hewitt <christianshewitt@gmail.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/rc/keymaps/Makefile           |  1 +
 drivers/media/rc/keymaps/rc-beelink-mxiii.c | 57 +++++++++++++++++++++++++++++
 include/media/rc-map.h                      |  1 +
 3 files changed, 59 insertions(+)
 create mode 100644 drivers/media/rc/keymaps/rc-beelink-mxiii.c

(limited to 'include')

diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile
index 6931c89fca99..f19558fdab0c 100644
--- a/drivers/media/rc/keymaps/Makefile
+++ b/drivers/media/rc/keymaps/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_RC_MAP) += \
 			rc-avertv-303.o \
 			rc-azurewave-ad-tu700.o \
 			rc-beelink-gs1.o \
+			rc-beelink-mxiii.o \
 			rc-behold-columbus.o \
 			rc-behold.o \
 			rc-budget-ci-old.o \
diff --git a/drivers/media/rc/keymaps/rc-beelink-mxiii.c b/drivers/media/rc/keymaps/rc-beelink-mxiii.c
new file mode 100644
index 000000000000..01180cd92205
--- /dev/null
+++ b/drivers/media/rc/keymaps/rc-beelink-mxiii.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Christian Hewitt <christianshewitt@gmail.com>
+ *
+ */
+
+#include <media/rc-map.h>
+#include <linux/module.h>
+
+/*
+ * Keytable for the Beelink Mini MXIII remote control
+ *
+ */
+
+static struct rc_map_table beelink_mxiii[] = {
+	{ 0xb2dc, KEY_POWER },
+
+	{ 0xb288, KEY_MUTE },
+	{ 0xb282, KEY_HOME },
+
+	{ 0xb2ca, KEY_UP },
+	{ 0xb299, KEY_LEFT },
+	{ 0xb2ce, KEY_OK },
+	{ 0xb2c1, KEY_RIGHT },
+	{ 0xb2d2, KEY_DOWN },
+
+	{ 0xb2c5, KEY_MENU },
+	{ 0xb29a, KEY_BACK },
+
+	{ 0xb281, KEY_VOLUMEDOWN },
+	{ 0xb280, KEY_VOLUMEUP },
+};
+
+static struct rc_map_list beelink_mxiii_map = {
+	.map = {
+		.scan     = beelink_mxiii,
+		.size     = ARRAY_SIZE(beelink_mxiii),
+		.rc_proto = RC_PROTO_NEC,
+		.name     = RC_MAP_BEELINK_MXIII,
+	}
+};
+
+static int __init init_rc_map_beelink_mxiii(void)
+{
+	return rc_map_register(&beelink_mxiii_map);
+}
+
+static void __exit exit_rc_map_beelink_mxiii(void)
+{
+	rc_map_unregister(&beelink_mxiii_map);
+}
+
+module_init(init_rc_map_beelink_mxiii)
+module_exit(exit_rc_map_beelink_mxiii)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hewitt <christianshewitt@gmail.com");
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index 94ee968d4722..4676545ffd8f 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -225,6 +225,7 @@ struct rc_map *rc_map_get(const char *name);
 #define RC_MAP_AVERTV_303                "rc-avertv-303"
 #define RC_MAP_AZUREWAVE_AD_TU700        "rc-azurewave-ad-tu700"
 #define RC_MAP_BEELINK_GS1               "rc-beelink-gs1"
+#define RC_MAP_BEELINK_MXIII             "rc-beelink-mxiii"
 #define RC_MAP_BEHOLD                    "rc-behold"
 #define RC_MAP_BEHOLD_COLUMBUS           "rc-behold-columbus"
 #define RC_MAP_BUDGET_CI_OLD             "rc-budget-ci-old"
-- 
cgit v1.2.3


From de163422206076d764e800486f9c28a0ede7410a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 4 Feb 2023 00:31:29 +0100
Subject: media: drop unnecessary networking includes

dvb_net.h includes a bunch of core networking headers which increases
the number of objects rebuilt when we touch them. They are unnecessary
for the header itself and only one driver has an indirect dependency.

tveeprom.h includes if_packet to gain access to ETH_ALEN. This
is a bit of an overkill because if_packet.h pulls in skbuff.h.
The definition of ETH_ALEN is in the uAPI header, which is
very rarely touched, so switch to including that.

This results in roughly 250 fewer objects built when skbuff.h
is touched (6028 -> 5788).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/pci/ttpci/budget-av.c  | 1 +
 drivers/media/usb/dvb-usb/pctv452e.c | 2 ++
 include/media/dvb_net.h              | 6 ++----
 include/media/tveeprom.h             | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/media/pci/ttpci/budget-av.c b/drivers/media/pci/ttpci/budget-av.c
index 3cb83005cf09..84068f4d4e36 100644
--- a/drivers/media/pci/ttpci/budget-av.c
+++ b/drivers/media/pci/ttpci/budget-av.c
@@ -31,6 +31,7 @@
 #include "dvb-pll.h"
 #include <media/drv-intf/saa7146_vv.h>
 #include <linux/module.h>
+#include <linux/etherdevice.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
diff --git a/drivers/media/usb/dvb-usb/pctv452e.c b/drivers/media/usb/dvb-usb/pctv452e.c
index f0794c68c622..da42c989e071 100644
--- a/drivers/media/usb/dvb-usb/pctv452e.c
+++ b/drivers/media/usb/dvb-usb/pctv452e.c
@@ -26,6 +26,8 @@
 #include <media/dvb_ca_en50221.h>
 #include "ttpci-eeprom.h"
 
+#include <linux/etherdevice.h>
+
 static int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off).");
diff --git a/include/media/dvb_net.h b/include/media/dvb_net.h
index 5e31d37f25fa..9980b1dd750b 100644
--- a/include/media/dvb_net.h
+++ b/include/media/dvb_net.h
@@ -19,13 +19,11 @@
 #define _DVB_NET_H_
 
 #include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
 
 #include <media/dvbdev.h>
 
+struct net_device;
+
 #define DVB_NET_DEVICES_MAX 10
 
 #ifdef CONFIG_DVB_NET
diff --git a/include/media/tveeprom.h b/include/media/tveeprom.h
index b56eaee82aa5..f37c9b15ffdb 100644
--- a/include/media/tveeprom.h
+++ b/include/media/tveeprom.h
@@ -5,7 +5,7 @@
  *	      eeproms.
  */
 
-#include <linux/if_ether.h>
+#include <uapi/linux/if_ether.h>
 
 /**
  * enum tveeprom_audio_processor - Specifies the type of audio processor
-- 
cgit v1.2.3


From 7777694f80669cb2cdb467fc0fadf3376fbf836d Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 2 Mar 2023 13:57:23 +0100
Subject: media: saa7146: drop overlay support

Destructive overlay support (i.e. where the video frame is DMA-ed
straight into a framebuffer) is effectively dead. It was a
necessary evil in the early days when computers were not fast enough
to copy SDTV video frames around, but today that's no longer a problem.

It requires access to the framebuffer memory, which is a bad idea and
very hard to do safely. In addition, in drm it is today almost
impossible to get hold of the framebuffer address.

So drop support for this.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/saa7146/saa7146_fops.c  |  26 +--
 drivers/media/common/saa7146/saa7146_hlp.c   | 296 ------------------------
 drivers/media/common/saa7146/saa7146_video.c | 326 +--------------------------
 drivers/staging/media/av7110/av7110_v4l.c    |  17 +-
 include/media/drv-intf/saa7146_vv.h          |  21 --
 5 files changed, 7 insertions(+), 679 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index e9a15de6126e..08c8e73cef2c 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -516,28 +516,10 @@ int saa7146_vv_init(struct saa7146_dev* dev, struct saa7146_ext_vv *ext_vv)
 	   configuration data) */
 	dev->ext_vv_data = ext_vv;
 
-	vv->d_clipping.cpu_addr =
-		dma_alloc_coherent(&dev->pci->dev, SAA7146_CLIPPING_MEM,
-				   &vv->d_clipping.dma_handle, GFP_KERNEL);
-	if( NULL == vv->d_clipping.cpu_addr ) {
-		ERR("out of memory. aborting.\n");
-		kfree(vv);
-		v4l2_ctrl_handler_free(hdl);
-		v4l2_device_unregister(&dev->v4l2_dev);
-		return -ENOMEM;
-	}
-
 	saa7146_video_uops.init(dev,vv);
 	if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE)
 		saa7146_vbi_uops.init(dev,vv);
 
-	vv->ov_fb.fmt.width = vv->standard->h_max_out;
-	vv->ov_fb.fmt.height = vv->standard->v_max_out;
-	vv->ov_fb.fmt.pixelformat = V4L2_PIX_FMT_RGB565;
-	vv->ov_fb.fmt.bytesperline = 2 * vv->ov_fb.fmt.width;
-	vv->ov_fb.fmt.sizeimage = vv->ov_fb.fmt.bytesperline * vv->ov_fb.fmt.height;
-	vv->ov_fb.fmt.colorspace = V4L2_COLORSPACE_SRGB;
-
 	fmt = &vv->video_fmt;
 	fmt->width = 384;
 	fmt->height = 288;
@@ -561,8 +543,6 @@ int saa7146_vv_init(struct saa7146_dev* dev, struct saa7146_ext_vv *ext_vv)
 
 	timer_setup(&vv->vbi_read_timeout, NULL, 0);
 
-	vv->ov_fb.capability = V4L2_FBUF_CAP_LIST_CLIPPING;
-	vv->ov_fb.flags = V4L2_FBUF_FLAG_PRIMARY;
 	dev->vv_data = vv;
 	dev->vv_callback = &vv_callback;
 
@@ -577,8 +557,6 @@ int saa7146_vv_release(struct saa7146_dev* dev)
 	DEB_EE("dev:%p\n", dev);
 
 	v4l2_device_unregister(&dev->v4l2_dev);
-	dma_free_coherent(&dev->pci->dev, SAA7146_CLIPPING_MEM,
-			  vv->d_clipping.cpu_addr, vv->d_clipping.dma_handle);
 	v4l2_ctrl_handler_free(&dev->ctrl_handler);
 	kfree(vv);
 	dev->vv_data = NULL;
@@ -608,7 +586,7 @@ int saa7146_register_device(struct video_device *vfd, struct saa7146_dev *dev,
 	for (i = 0; i < dev->ext_vv_data->num_stds; i++)
 		vfd->tvnorms |= dev->ext_vv_data->stds[i].id;
 	strscpy(vfd->name, name, sizeof(vfd->name));
-	vfd->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OVERLAY |
+	vfd->device_caps = V4L2_CAP_VIDEO_CAPTURE |
 			   V4L2_CAP_READWRITE | V4L2_CAP_STREAMING;
 	vfd->device_caps |= dev->ext_vv_data->capabilities;
 	if (type == VFL_TYPE_VIDEO)
@@ -616,7 +594,7 @@ int saa7146_register_device(struct video_device *vfd, struct saa7146_dev *dev,
 			~(V4L2_CAP_VBI_CAPTURE | V4L2_CAP_SLICED_VBI_OUTPUT);
 	else
 		vfd->device_caps &=
-			~(V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OVERLAY | V4L2_CAP_AUDIO);
+			~(V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_AUDIO);
 	video_set_drvdata(vfd, dev);
 
 	err = video_register_device(vfd, type, -1);
diff --git a/drivers/media/common/saa7146/saa7146_hlp.c b/drivers/media/common/saa7146/saa7146_hlp.c
index 6c9946a402ee..6792a96d0ba3 100644
--- a/drivers/media/common/saa7146/saa7146_hlp.c
+++ b/drivers/media/common/saa7146/saa7146_hlp.c
@@ -306,221 +306,6 @@ static int calculate_v_scale_registers(struct saa7146_dev *dev, enum v4l2_field
 }
 
 /* simple bubble-sort algorithm with duplicate elimination */
-static int sort_and_eliminate(u32* values, int* count)
-{
-	int low = 0, high = 0, top = 0;
-	int cur = 0, next = 0;
-
-	/* sanity checks */
-	if( (0 > *count) || (NULL == values) ) {
-		return -EINVAL;
-	}
-
-	/* bubble sort the first @count items of the array @values */
-	for( top = *count; top > 0; top--) {
-		for( low = 0, high = 1; high < top; low++, high++) {
-			if( values[low] > values[high] )
-				swap(values[low], values[high]);
-		}
-	}
-
-	/* remove duplicate items */
-	for( cur = 0, next = 1; next < *count; next++) {
-		if( values[cur] != values[next])
-			values[++cur] = values[next];
-	}
-
-	*count = cur + 1;
-
-	return 0;
-}
-
-static void calculate_clipping_registers_rect(struct saa7146_dev *dev, struct saa7146_fh *fh,
-	struct saa7146_video_dma *vdma2, u32* clip_format, u32* arbtr_ctrl, enum v4l2_field field)
-{
-	struct saa7146_vv *vv = dev->vv_data;
-	__le32 *clipping = vv->d_clipping.cpu_addr;
-
-	int width = vv->ov.win.w.width;
-	int height =  vv->ov.win.w.height;
-	int clipcount = vv->ov.nclips;
-
-	u32 line_list[32];
-	u32 pixel_list[32];
-	int numdwords = 0;
-
-	int i = 0, j = 0;
-	int cnt_line = 0, cnt_pixel = 0;
-
-	int x[32], y[32], w[32], h[32];
-
-	/* clear out memory */
-	memset(&line_list[0],  0x00, sizeof(u32)*32);
-	memset(&pixel_list[0], 0x00, sizeof(u32)*32);
-	memset(clipping,  0x00, SAA7146_CLIPPING_MEM);
-
-	/* fill the line and pixel-lists */
-	for(i = 0; i < clipcount; i++) {
-		int l = 0, r = 0, t = 0, b = 0;
-
-		x[i] = vv->ov.clips[i].c.left;
-		y[i] = vv->ov.clips[i].c.top;
-		w[i] = vv->ov.clips[i].c.width;
-		h[i] = vv->ov.clips[i].c.height;
-
-		if( w[i] < 0) {
-			x[i] += w[i]; w[i] = -w[i];
-		}
-		if( h[i] < 0) {
-			y[i] += h[i]; h[i] = -h[i];
-		}
-		if( x[i] < 0) {
-			w[i] += x[i]; x[i] = 0;
-		}
-		if( y[i] < 0) {
-			h[i] += y[i]; y[i] = 0;
-		}
-		if( 0 != vv->vflip ) {
-			y[i] = height - y[i] - h[i];
-		}
-
-		l = x[i];
-		r = x[i]+w[i];
-		t = y[i];
-		b = y[i]+h[i];
-
-		/* insert left/right coordinates */
-		pixel_list[ 2*i   ] = min_t(int, l, width);
-		pixel_list[(2*i)+1] = min_t(int, r, width);
-		/* insert top/bottom coordinates */
-		line_list[ 2*i   ] = min_t(int, t, height);
-		line_list[(2*i)+1] = min_t(int, b, height);
-	}
-
-	/* sort and eliminate lists */
-	cnt_line = cnt_pixel = 2*clipcount;
-	sort_and_eliminate( &pixel_list[0], &cnt_pixel );
-	sort_and_eliminate( &line_list[0], &cnt_line );
-
-	/* calculate the number of used u32s */
-	numdwords = max_t(int, (cnt_line+1), (cnt_pixel+1))*2;
-	numdwords = max_t(int, 4, numdwords);
-	numdwords = min_t(int, 64, numdwords);
-
-	/* fill up cliptable */
-	for(i = 0; i < cnt_pixel; i++) {
-		clipping[2*i] |= cpu_to_le32(pixel_list[i] << 16);
-	}
-	for(i = 0; i < cnt_line; i++) {
-		clipping[(2*i)+1] |= cpu_to_le32(line_list[i] << 16);
-	}
-
-	/* fill up cliptable with the display infos */
-	for(j = 0; j < clipcount; j++) {
-
-		for(i = 0; i < cnt_pixel; i++) {
-
-			if( x[j] < 0)
-				x[j] = 0;
-
-			if( pixel_list[i] < (x[j] + w[j])) {
-
-				if ( pixel_list[i] >= x[j] ) {
-					clipping[2*i] |= cpu_to_le32(1 << j);
-				}
-			}
-		}
-		for(i = 0; i < cnt_line; i++) {
-
-			if( y[j] < 0)
-				y[j] = 0;
-
-			if( line_list[i] < (y[j] + h[j]) ) {
-
-				if( line_list[i] >= y[j] ) {
-					clipping[(2*i)+1] |= cpu_to_le32(1 << j);
-				}
-			}
-		}
-	}
-
-	/* adjust arbitration control register */
-	*arbtr_ctrl &= 0xffff00ff;
-	*arbtr_ctrl |= 0x00001c00;
-
-	vdma2->base_even	= vv->d_clipping.dma_handle;
-	vdma2->base_odd		= vv->d_clipping.dma_handle;
-	vdma2->prot_addr	= vv->d_clipping.dma_handle+((sizeof(u32))*(numdwords));
-	vdma2->base_page	= 0x04;
-	vdma2->pitch		= 0x00;
-	vdma2->num_line_byte	= (0 << 16 | (sizeof(u32))*(numdwords-1) );
-
-	/* set clipping-mode. this depends on the field(s) used */
-	*clip_format &= 0xfffffff7;
-	if (V4L2_FIELD_HAS_BOTH(field)) {
-		*clip_format |= 0x00000008;
-	} else {
-		*clip_format |= 0x00000000;
-	}
-}
-
-/* disable clipping */
-static void saa7146_disable_clipping(struct saa7146_dev *dev)
-{
-	u32 clip_format	= saa7146_read(dev, CLIP_FORMAT_CTRL);
-
-	/* mask out relevant bits (=lower word)*/
-	clip_format &= MASK_W1;
-
-	/* upload clipping-registers*/
-	saa7146_write(dev, CLIP_FORMAT_CTRL,clip_format);
-	saa7146_write(dev, MC2, (MASK_05 | MASK_21));
-
-	/* disable video dma2 */
-	saa7146_write(dev, MC1, MASK_21);
-}
-
-static void saa7146_set_clipping_rect(struct saa7146_fh *fh)
-{
-	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-	enum v4l2_field field = vv->ov.win.field;
-	struct	saa7146_video_dma vdma2;
-	u32 clip_format;
-	u32 arbtr_ctrl;
-
-	/* check clipcount, disable clipping if clipcount == 0*/
-	if (vv->ov.nclips == 0) {
-		saa7146_disable_clipping(dev);
-		return;
-	}
-
-	clip_format = saa7146_read(dev, CLIP_FORMAT_CTRL);
-	arbtr_ctrl = saa7146_read(dev, PCI_BT_V1);
-
-	calculate_clipping_registers_rect(dev, fh, &vdma2, &clip_format, &arbtr_ctrl, field);
-
-	/* set clipping format */
-	clip_format &= 0xffff0008;
-	clip_format |= (SAA7146_CLIPPING_RECT << 4);
-
-	/* prepare video dma2 */
-	saa7146_write(dev, BASE_EVEN2,		vdma2.base_even);
-	saa7146_write(dev, BASE_ODD2,		vdma2.base_odd);
-	saa7146_write(dev, PROT_ADDR2,		vdma2.prot_addr);
-	saa7146_write(dev, BASE_PAGE2,		vdma2.base_page);
-	saa7146_write(dev, PITCH2,		vdma2.pitch);
-	saa7146_write(dev, NUM_LINE_BYTE2,	vdma2.num_line_byte);
-
-	/* prepare the rest */
-	saa7146_write(dev, CLIP_FORMAT_CTRL,clip_format);
-	saa7146_write(dev, PCI_BT_V1, arbtr_ctrl);
-
-	/* upload clip_control-register, clipping-registers, enable video dma2 */
-	saa7146_write(dev, MC2, (MASK_05 | MASK_21 | MASK_03 | MASK_19));
-	saa7146_write(dev, MC1, (MASK_05 | MASK_21));
-}
-
 static void saa7146_set_window(struct saa7146_dev *dev, int width, int height, enum v4l2_field field)
 {
 	struct saa7146_vv *vv = dev->vv_data;
@@ -556,62 +341,6 @@ static void saa7146_set_window(struct saa7146_dev *dev, int width, int height, e
 	saa7146_write(dev, MC2, (MASK_05 | MASK_06 | MASK_21 | MASK_22) );
 }
 
-/* calculate the new memory offsets for a desired position */
-static void saa7146_set_position(struct saa7146_dev *dev, int w_x, int w_y, int w_height, enum v4l2_field field, u32 pixelformat)
-{
-	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pixelformat);
-
-	int b_depth = vv->ov_fmt->depth;
-	int b_bpl = vv->ov_fb.fmt.bytesperline;
-	/* The unsigned long cast is to remove a 64-bit compile warning since
-	   it looks like a 64-bit address is cast to a 32-bit value, even
-	   though the base pointer is really a 32-bit physical address that
-	   goes into a 32-bit DMA register.
-	   FIXME: might not work on some 64-bit platforms, but see the FIXME
-	   in struct v4l2_framebuffer (videodev2.h) for that.
-	 */
-	u32 base = (u32)(unsigned long)vv->ov_fb.base;
-
-	struct	saa7146_video_dma vdma1;
-
-	/* calculate memory offsets for picture, look if we shall top-down-flip */
-	vdma1.pitch	= 2*b_bpl;
-	if ( 0 == vv->vflip ) {
-		vdma1.base_even = base + (w_y * (vdma1.pitch/2)) + (w_x * (b_depth / 8));
-		vdma1.base_odd  = vdma1.base_even + (vdma1.pitch / 2);
-		vdma1.prot_addr = vdma1.base_even + (w_height * (vdma1.pitch / 2));
-	}
-	else {
-		vdma1.base_even = base + ((w_y+w_height) * (vdma1.pitch/2)) + (w_x * (b_depth / 8));
-		vdma1.base_odd  = vdma1.base_even - (vdma1.pitch / 2);
-		vdma1.prot_addr = vdma1.base_odd - (w_height * (vdma1.pitch / 2));
-	}
-
-	if (V4L2_FIELD_HAS_BOTH(field)) {
-	} else if (field == V4L2_FIELD_ALTERNATE) {
-		/* fixme */
-		vdma1.base_odd = vdma1.prot_addr;
-		vdma1.pitch /= 2;
-	} else if (field == V4L2_FIELD_TOP) {
-		vdma1.base_odd = vdma1.prot_addr;
-		vdma1.pitch /= 2;
-	} else if (field == V4L2_FIELD_BOTTOM) {
-		vdma1.base_odd = vdma1.base_even;
-		vdma1.base_even = vdma1.prot_addr;
-		vdma1.pitch /= 2;
-	}
-
-	if ( 0 != vv->vflip ) {
-		vdma1.pitch *= -1;
-	}
-
-	vdma1.base_page = sfmt->swap;
-	vdma1.num_line_byte = (vv->standard->v_field<<16)+vv->standard->h_pixels;
-
-	saa7146_write_out_dma(dev, 1, &vdma1);
-}
-
 static void saa7146_set_output_format(struct saa7146_dev *dev, unsigned long palette)
 {
 	u32 clip_format = saa7146_read(dev, CLIP_FORMAT_CTRL);
@@ -645,30 +374,6 @@ void saa7146_set_hps_source_and_sync(struct saa7146_dev *dev, int source, int sy
 }
 EXPORT_SYMBOL_GPL(saa7146_set_hps_source_and_sync);
 
-int saa7146_enable_overlay(struct saa7146_fh *fh)
-{
-	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-
-	saa7146_set_window(dev, vv->ov.win.w.width, vv->ov.win.w.height, vv->ov.win.field);
-	saa7146_set_position(dev, vv->ov.win.w.left, vv->ov.win.w.top, vv->ov.win.w.height, vv->ov.win.field, vv->ov_fmt->pixelformat);
-	saa7146_set_output_format(dev, vv->ov_fmt->trans);
-	saa7146_set_clipping_rect(fh);
-
-	/* enable video dma1 */
-	saa7146_write(dev, MC1, (MASK_06 | MASK_22));
-	return 0;
-}
-
-void saa7146_disable_overlay(struct saa7146_fh *fh)
-{
-	struct saa7146_dev *dev = fh->dev;
-
-	/* disable clipping + video dma1 */
-	saa7146_disable_clipping(dev);
-	saa7146_write(dev, MC1, MASK_22);
-}
-
 void saa7146_write_out_dma(struct saa7146_dev* dev, int which, struct saa7146_video_dma* vdma)
 {
 	int where = 0;
@@ -1011,7 +716,6 @@ void saa7146_set_capture(struct saa7146_dev *dev, struct saa7146_buf *buf, struc
 
 	saa7146_set_window(dev, buf->fmt->width, buf->fmt->height, buf->fmt->field);
 	saa7146_set_output_format(dev, sfmt->trans);
-	saa7146_disable_clipping(dev);
 
 	if ( vv->last_field == V4L2_FIELD_INTERLACED ) {
 	} else if ( vv->last_field == V4L2_FIELD_TOP ) {
diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c
index 2296765079a4..27c97218ee53 100644
--- a/drivers/media/common/saa7146/saa7146_video.c
+++ b/drivers/media/common/saa7146/saa7146_video.c
@@ -14,9 +14,6 @@ MODULE_PARM_DESC(max_memory, "maximum memory usage for capture buffers (default:
 #define IS_CAPTURE_ACTIVE(fh) \
 	(((vv->video_status & STATUS_CAPTURE) != 0) && (vv->video_fh == fh))
 
-#define IS_OVERLAY_ACTIVE(fh) \
-	(((vv->video_status & STATUS_OVERLAY) != 0) && (vv->video_fh == fh))
-
 /* format descriptions for capture and preview */
 static struct saa7146_format formats[] = {
 	{
@@ -91,105 +88,6 @@ struct saa7146_format* saa7146_format_by_fourcc(struct saa7146_dev *dev, int fou
 	return NULL;
 }
 
-static int vidioc_try_fmt_vid_overlay(struct file *file, void *fh, struct v4l2_format *f);
-
-int saa7146_start_preview(struct saa7146_fh *fh)
-{
-	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-	struct v4l2_format fmt;
-	int ret = 0, err = 0;
-
-	DEB_EE("dev:%p, fh:%p\n", dev, fh);
-
-	/* check if we have overlay information */
-	if (vv->ov.fh == NULL) {
-		DEB_D("no overlay data available. try S_FMT first.\n");
-		return -EAGAIN;
-	}
-
-	/* check if streaming capture is running */
-	if (IS_CAPTURE_ACTIVE(fh) != 0) {
-		DEB_D("streaming capture is active\n");
-		return -EBUSY;
-	}
-
-	/* check if overlay is running */
-	if (IS_OVERLAY_ACTIVE(fh) != 0) {
-		if (vv->video_fh == fh) {
-			DEB_D("overlay is already active\n");
-			return 0;
-		}
-		DEB_D("overlay is already active in another open\n");
-		return -EBUSY;
-	}
-
-	if (0 == saa7146_res_get(fh, RESOURCE_DMA1_HPS|RESOURCE_DMA2_CLP)) {
-		DEB_D("cannot get necessary overlay resources\n");
-		return -EBUSY;
-	}
-
-	fmt.fmt.win = vv->ov.win;
-	err = vidioc_try_fmt_vid_overlay(NULL, fh, &fmt);
-	if (0 != err) {
-		saa7146_res_free(vv->video_fh, RESOURCE_DMA1_HPS|RESOURCE_DMA2_CLP);
-		return -EBUSY;
-	}
-	vv->ov.win = fmt.fmt.win;
-
-	DEB_D("%dx%d+%d+%d 0x%08x field=%s\n",
-	      vv->ov.win.w.width, vv->ov.win.w.height,
-	      vv->ov.win.w.left, vv->ov.win.w.top,
-	      vv->ov_fmt->pixelformat, v4l2_field_names[vv->ov.win.field]);
-
-	if (0 != (ret = saa7146_enable_overlay(fh))) {
-		DEB_D("enabling overlay failed: %d\n", ret);
-		saa7146_res_free(vv->video_fh, RESOURCE_DMA1_HPS|RESOURCE_DMA2_CLP);
-		return ret;
-	}
-
-	vv->video_status = STATUS_OVERLAY;
-	vv->video_fh = fh;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(saa7146_start_preview);
-
-int saa7146_stop_preview(struct saa7146_fh *fh)
-{
-	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-
-	DEB_EE("dev:%p, fh:%p\n", dev, fh);
-
-	/* check if streaming capture is running */
-	if (IS_CAPTURE_ACTIVE(fh) != 0) {
-		DEB_D("streaming capture is active\n");
-		return -EBUSY;
-	}
-
-	/* check if overlay is running at all */
-	if ((vv->video_status & STATUS_OVERLAY) == 0) {
-		DEB_D("no active overlay\n");
-		return 0;
-	}
-
-	if (vv->video_fh != fh) {
-		DEB_D("overlay is active, but in another open\n");
-		return -EBUSY;
-	}
-
-	vv->video_status = 0;
-	vv->video_fh = NULL;
-
-	saa7146_disable_overlay(fh);
-
-	saa7146_res_free(fh, RESOURCE_DMA1_HPS|RESOURCE_DMA2_CLP);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(saa7146_stop_preview);
-
 /********************************************************************************/
 /* common pagetable functions */
 
@@ -319,7 +217,7 @@ static int video_begin(struct saa7146_fh *fh)
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_format *fmt = NULL;
 	unsigned int resource;
-	int ret = 0, err = 0;
+	int ret = 0;
 
 	DEB_EE("dev:%p, fh:%p\n", dev, fh);
 
@@ -332,16 +230,6 @@ static int video_begin(struct saa7146_fh *fh)
 		return -EBUSY;
 	}
 
-	if ((vv->video_status & STATUS_OVERLAY) != 0) {
-		DEB_S("warning: suspending overlay video for streaming capture\n");
-		vv->ov_suspend = vv->video_fh;
-		err = saa7146_stop_preview(vv->video_fh); /* side effect: video_status is now 0, video_fh is NULL */
-		if (0 != err) {
-			DEB_D("suspending video failed. aborting\n");
-			return err;
-		}
-	}
-
 	fmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
 	/* we need to have a valid format set here */
 	if (!fmt)
@@ -356,10 +244,6 @@ static int video_begin(struct saa7146_fh *fh)
 	ret = saa7146_res_get(fh, resource);
 	if (0 == ret) {
 		DEB_S("cannot get capture resource %d\n", resource);
-		if (vv->ov_suspend != NULL) {
-			saa7146_start_preview(vv->ov_suspend);
-			vv->ov_suspend = NULL;
-		}
 		return -EBUSY;
 	}
 
@@ -429,11 +313,6 @@ static int video_end(struct saa7146_fh *fh, struct file *file)
 
 	saa7146_res_free(fh, resource);
 
-	if (vv->ov_suspend != NULL) {
-		saa7146_start_preview(vv->ov_suspend);
-		vv->ov_suspend = NULL;
-	}
-
 	return 0;
 }
 
@@ -443,64 +322,13 @@ static int vidioc_querycap(struct file *file, void *fh, struct v4l2_capability *
 
 	strscpy((char *)cap->driver, "saa7146 v4l2", sizeof(cap->driver));
 	strscpy((char *)cap->card, dev->ext->name, sizeof(cap->card));
-	cap->capabilities = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OVERLAY |
+	cap->capabilities = V4L2_CAP_VIDEO_CAPTURE |
 			    V4L2_CAP_READWRITE | V4L2_CAP_STREAMING |
 			    V4L2_CAP_DEVICE_CAPS;
 	cap->capabilities |= dev->ext_vv_data->capabilities;
 	return 0;
 }
 
-static int vidioc_g_fbuf(struct file *file, void *fh, struct v4l2_framebuffer *fb)
-{
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-
-	*fb = vv->ov_fb;
-	fb->capability = V4L2_FBUF_CAP_LIST_CLIPPING;
-	fb->flags = V4L2_FBUF_FLAG_PRIMARY;
-	return 0;
-}
-
-static int vidioc_s_fbuf(struct file *file, void *fh, const struct v4l2_framebuffer *fb)
-{
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_format *fmt;
-
-	DEB_EE("VIDIOC_S_FBUF\n");
-
-	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	/* check args */
-	fmt = saa7146_format_by_fourcc(dev, fb->fmt.pixelformat);
-	if (NULL == fmt)
-		return -EINVAL;
-
-	/* planar formats are not allowed for overlay video, clipping and video dma would clash */
-	if (fmt->flags & FORMAT_IS_PLANAR)
-		DEB_S("planar pixelformat '%4.4s' not allowed for overlay\n",
-		      (char *)&fmt->pixelformat);
-
-	/* check if overlay is running */
-	if (IS_OVERLAY_ACTIVE(fh) != 0) {
-		if (vv->video_fh != fh) {
-			DEB_D("refusing to change framebuffer information while overlay is active in another open\n");
-			return -EBUSY;
-		}
-	}
-
-	/* ok, accept it */
-	vv->ov_fb = *fb;
-	vv->ov_fmt = fmt;
-
-	if (vv->ov_fb.fmt.bytesperline < vv->ov_fb.fmt.width) {
-		vv->ov_fb.fmt.bytesperline = vv->ov_fb.fmt.width * fmt->depth / 8;
-		DEB_D("setting bytesperline to %d\n", vv->ov_fb.fmt.bytesperline);
-	}
-	return 0;
-}
-
 static int vidioc_enum_fmt_vid_cap(struct file *file, void *fh, struct v4l2_fmtdesc *f)
 {
 	if (f->index >= ARRAY_SIZE(formats))
@@ -557,13 +385,6 @@ int saa7146_s_ctrl(struct v4l2_ctrl *ctrl)
 	default:
 		return -EINVAL;
 	}
-
-	if ((vv->video_status & STATUS_OVERLAY) != 0) { /* CHECK: && (vv->video_fh == fh)) */
-		struct saa7146_fh *fh = vv->video_fh;
-
-		saa7146_stop_preview(fh);
-		saa7146_start_preview(fh);
-	}
 	return 0;
 }
 
@@ -590,15 +411,6 @@ static int vidioc_g_fmt_vid_cap(struct file *file, void *fh, struct v4l2_format
 	return 0;
 }
 
-static int vidioc_g_fmt_vid_overlay(struct file *file, void *fh, struct v4l2_format *f)
-{
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-
-	f->fmt.win = vv->ov.win;
-	return 0;
-}
-
 static int vidioc_g_fmt_vbi_cap(struct file *file, void *fh, struct v4l2_format *f)
 {
 	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
@@ -673,66 +485,6 @@ static int vidioc_try_fmt_vid_cap(struct file *file, void *fh, struct v4l2_forma
 	return 0;
 }
 
-
-static int vidioc_try_fmt_vid_overlay(struct file *file, void *fh, struct v4l2_format *f)
-{
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-	struct v4l2_window *win = &f->fmt.win;
-	enum v4l2_field field;
-	int maxw, maxh;
-
-	DEB_EE("dev:%p\n", dev);
-
-	if (NULL == vv->ov_fb.base) {
-		DEB_D("no fb base set\n");
-		return -EINVAL;
-	}
-	if (NULL == vv->ov_fmt) {
-		DEB_D("no fb fmt set\n");
-		return -EINVAL;
-	}
-	if (win->w.width < 48 || win->w.height < 32) {
-		DEB_D("min width/height. (%d,%d)\n",
-		      win->w.width, win->w.height);
-		return -EINVAL;
-	}
-	if (win->clipcount > 16) {
-		DEB_D("clipcount too big\n");
-		return -EINVAL;
-	}
-
-	field = win->field;
-	maxw  = vv->standard->h_max_out;
-	maxh  = vv->standard->v_max_out;
-
-	if (V4L2_FIELD_ANY == field) {
-		field = (win->w.height > maxh / 2)
-			? V4L2_FIELD_INTERLACED
-			: V4L2_FIELD_TOP;
-		}
-	switch (field) {
-	case V4L2_FIELD_TOP:
-	case V4L2_FIELD_BOTTOM:
-	case V4L2_FIELD_ALTERNATE:
-		maxh = maxh / 2;
-		break;
-	case V4L2_FIELD_INTERLACED:
-		break;
-	default:
-		DEB_D("no known field mode '%d'\n", field);
-		return -EINVAL;
-	}
-
-	win->field = field;
-	if (win->w.width > maxw)
-		win->w.width = maxw;
-	if (win->w.height > maxh)
-		win->w.height = maxh;
-
-	return 0;
-}
-
 static int vidioc_s_fmt_vid_cap(struct file *file, void *__fh, struct v4l2_format *f)
 {
 	struct saa7146_fh *fh = __fh;
@@ -754,35 +506,6 @@ static int vidioc_s_fmt_vid_cap(struct file *file, void *__fh, struct v4l2_forma
 	return 0;
 }
 
-static int vidioc_s_fmt_vid_overlay(struct file *file, void *__fh, struct v4l2_format *f)
-{
-	struct saa7146_fh *fh = __fh;
-	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
-	int err;
-
-	DEB_EE("V4L2_BUF_TYPE_VIDEO_OVERLAY: dev:%p, fh:%p\n", dev, fh);
-	err = vidioc_try_fmt_vid_overlay(file, fh, f);
-	if (0 != err)
-		return err;
-	vv->ov.win    = f->fmt.win;
-	vv->ov.nclips = f->fmt.win.clipcount;
-	if (vv->ov.nclips > 16)
-		vv->ov.nclips = 16;
-	memcpy(vv->ov.clips, f->fmt.win.clips,
-	       sizeof(struct v4l2_clip) * vv->ov.nclips);
-
-	/* vv->ov.fh is used to indicate that we have valid overlay information, too */
-	vv->ov.fh = fh;
-
-	/* check if our current overlay is active */
-	if (IS_OVERLAY_ACTIVE(fh) != 0) {
-		saa7146_stop_preview(fh);
-		saa7146_start_preview(fh);
-	}
-	return 0;
-}
-
 static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
 {
 	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
@@ -815,7 +538,7 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	int found = 0;
-	int err, i;
+	int i;
 
 	DEB_EE("VIDIOC_S_STD\n");
 
@@ -824,15 +547,6 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 		return -EBUSY;
 	}
 
-	if ((vv->video_status & STATUS_OVERLAY) != 0) {
-		vv->ov_suspend = vv->video_fh;
-		err = saa7146_stop_preview(vv->video_fh); /* side effect: video_status is now 0, video_fh is NULL */
-		if (0 != err) {
-			DEB_D("suspending video failed. aborting\n");
-			return err;
-		}
-	}
-
 	for (i = 0; i < dev->ext_vv_data->num_stds; i++)
 		if (id & dev->ext_vv_data->stds[i].id)
 			break;
@@ -843,11 +557,6 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 		found = 1;
 	}
 
-	if (vv->ov_suspend != NULL) {
-		saa7146_start_preview(vv->ov_suspend);
-		vv->ov_suspend = NULL;
-	}
-
 	if (!found) {
 		DEB_EE("VIDIOC_S_STD: standard not found\n");
 		return -EINVAL;
@@ -857,18 +566,6 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 	return 0;
 }
 
-static int vidioc_overlay(struct file *file, void *fh, unsigned int on)
-{
-	int err;
-
-	DEB_D("VIDIOC_OVERLAY on:%d\n", on);
-	if (on)
-		err = saa7146_start_preview(fh);
-	else
-		err = saa7146_stop_preview(fh);
-	return err;
-}
-
 static int vidioc_reqbufs(struct file *file, void *__fh, struct v4l2_requestbuffers *b)
 {
 	struct saa7146_fh *fh = __fh;
@@ -969,17 +666,10 @@ static int vidioc_streamoff(struct file *file, void *__fh, enum v4l2_buf_type ty
 const struct v4l2_ioctl_ops saa7146_video_ioctl_ops = {
 	.vidioc_querycap             = vidioc_querycap,
 	.vidioc_enum_fmt_vid_cap     = vidioc_enum_fmt_vid_cap,
-	.vidioc_enum_fmt_vid_overlay = vidioc_enum_fmt_vid_cap,
 	.vidioc_g_fmt_vid_cap        = vidioc_g_fmt_vid_cap,
 	.vidioc_try_fmt_vid_cap      = vidioc_try_fmt_vid_cap,
 	.vidioc_s_fmt_vid_cap        = vidioc_s_fmt_vid_cap,
-	.vidioc_g_fmt_vid_overlay    = vidioc_g_fmt_vid_overlay,
-	.vidioc_try_fmt_vid_overlay  = vidioc_try_fmt_vid_overlay,
-	.vidioc_s_fmt_vid_overlay    = vidioc_s_fmt_vid_overlay,
 
-	.vidioc_overlay		     = vidioc_overlay,
-	.vidioc_g_fbuf		     = vidioc_g_fbuf,
-	.vidioc_s_fbuf		     = vidioc_s_fbuf,
 	.vidioc_reqbufs              = vidioc_reqbufs,
 	.vidioc_querybuf             = vidioc_querybuf,
 	.vidioc_qbuf                 = vidioc_qbuf,
@@ -1096,7 +786,7 @@ static int buffer_prepare(struct videobuf_queue *q,
 			saa7146_pgtable_alloc(dev->pci, &buf->pt[0]);
 		}
 
-		err = videobuf_iolock(q,&buf->vb, &vv->ov_fb);
+		err = videobuf_iolock(q, &buf->vb, NULL);
 		if (err)
 			goto oops;
 		err = saa7146_pgtable_build(dev,buf);
@@ -1211,8 +901,6 @@ static void video_close(struct saa7146_dev *dev, struct file *file)
 
 	if (IS_CAPTURE_ACTIVE(fh) != 0)
 		video_end(fh, file);
-	else if (IS_OVERLAY_ACTIVE(fh) != 0)
-		saa7146_stop_preview(fh);
 
 	videobuf_stop(q);
 	/* hmm, why is this function declared void? */
@@ -1268,12 +956,6 @@ static ssize_t video_read(struct file *file, char __user *data, size_t count, lo
 		ret = video_end(fh, file);
 	}
 out:
-	/* restart overlay if it was active before */
-	if (vv->ov_suspend != NULL) {
-		saa7146_start_preview(vv->ov_suspend);
-		vv->ov_suspend = NULL;
-	}
-
 	return ret;
 }
 
diff --git a/drivers/staging/media/av7110/av7110_v4l.c b/drivers/staging/media/av7110/av7110_v4l.c
index c89f536f699c..374f78b84c04 100644
--- a/drivers/staging/media/av7110/av7110_v4l.c
+++ b/drivers/staging/media/av7110/av7110_v4l.c
@@ -216,22 +216,12 @@ static const struct v4l2_audio msp3400_v4l2_audio = {
 static int av7110_dvb_c_switch(struct saa7146_fh *fh)
 {
 	struct saa7146_dev *dev = fh->dev;
-	struct saa7146_vv *vv = dev->vv_data;
 	struct av7110 *av7110 = (struct av7110*)dev->ext_priv;
 	u16 adswitch;
-	int source, sync, err;
+	int source, sync;
 
 	dprintk(4, "%p\n", av7110);
 
-	if ((vv->video_status & STATUS_OVERLAY) != 0) {
-		vv->ov_suspend = vv->video_fh;
-		err = saa7146_stop_preview(vv->video_fh); /* side effect: video_status is now 0, video_fh is NULL */
-		if (err != 0) {
-			dprintk(2, "suspending video failed\n");
-			vv->ov_suspend = NULL;
-		}
-	}
-
 	if (0 != av7110->current_input) {
 		dprintk(1, "switching to analog TV:\n");
 		adswitch = 1;
@@ -300,11 +290,6 @@ static int av7110_dvb_c_switch(struct saa7146_fh *fh)
 
 	saa7146_set_hps_source_and_sync(dev, source, sync);
 
-	if (vv->ov_suspend != NULL) {
-		saa7146_start_preview(vv->ov_suspend);
-		vv->ov_suspend = NULL;
-	}
-
 	return 0;
 }
 
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index 635805fb35e8..932961e8f5ab 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -76,13 +76,6 @@ struct saa7146_dmaqueue {
 	struct timer_list	timeout;
 };
 
-struct saa7146_overlay {
-	struct saa7146_fh	*fh;
-	struct v4l2_window	win;
-	struct v4l2_clip	clips[16];
-	int			nclips;
-};
-
 /* per open data */
 struct saa7146_fh {
 	/* Must be the first field! */
@@ -98,7 +91,6 @@ struct saa7146_fh {
 	unsigned int resources;	/* resource management for device open */
 };
 
-#define STATUS_OVERLAY	0x01
 #define STATUS_CAPTURE	0x02
 
 struct saa7146_vv
@@ -116,12 +108,6 @@ struct saa7146_vv
 	int				video_status;
 	struct saa7146_fh		*video_fh;
 
-	/* video overlay */
-	struct saa7146_overlay		ov;
-	struct v4l2_framebuffer		ov_fb;
-	struct saa7146_format		*ov_fmt;
-	struct saa7146_fh		*ov_suspend;
-
 	/* video capture */
 	struct saa7146_dmaqueue		video_dmaq;
 	struct v4l2_pix_format		video_fmt;
@@ -140,8 +126,6 @@ struct saa7146_vv
 	int	current_hps_source;
 	int	current_hps_sync;
 
-	struct saa7146_dma	d_clipping;	/* pointer to clipping memory */
-
 	unsigned int resources;	/* resource management for device */
 };
 
@@ -192,9 +176,6 @@ int saa7146_vv_init(struct saa7146_dev* dev, struct saa7146_ext_vv *ext_vv);
 int saa7146_vv_release(struct saa7146_dev* dev);
 
 /* from saa7146_hlp.c */
-int saa7146_enable_overlay(struct saa7146_fh *fh);
-void saa7146_disable_overlay(struct saa7146_fh *fh);
-
 void saa7146_set_capture(struct saa7146_dev *dev, struct saa7146_buf *buf, struct saa7146_buf *next);
 void saa7146_write_out_dma(struct saa7146_dev* dev, int which, struct saa7146_video_dma* vdma) ;
 void saa7146_set_hps_source_and_sync(struct saa7146_dev *saa, int source, int sync);
@@ -204,8 +185,6 @@ void saa7146_set_gpio(struct saa7146_dev *saa, u8 pin, u8 data);
 extern const struct v4l2_ioctl_ops saa7146_video_ioctl_ops;
 extern const struct v4l2_ioctl_ops saa7146_vbi_ioctl_ops;
 extern const struct saa7146_use_ops saa7146_video_uops;
-int saa7146_start_preview(struct saa7146_fh *fh);
-int saa7146_stop_preview(struct saa7146_fh *fh);
 long saa7146_video_do_ioctl(struct file *file, unsigned int cmd, void *arg);
 int saa7146_s_ctrl(struct v4l2_ctrl *ctrl);
 
-- 
cgit v1.2.3


From fc650d2eba1087589bbb6ede866756df13b9a9dc Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 2 Mar 2023 13:57:29 +0100
Subject: media: videodev.h: drop V4L2_FBUF_CAP_LIST/BITMAP_CLIPPING

These two capabilities are no longer supported, so no
longer define them when compiling the kernel.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 include/uapi/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 17a9b975177a..b5b3d1fddea2 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1224,8 +1224,10 @@ struct v4l2_framebuffer {
 /*  Flags for the 'capability' field. Read only */
 #define V4L2_FBUF_CAP_EXTERNOVERLAY	0x0001
 #define V4L2_FBUF_CAP_CHROMAKEY		0x0002
+#ifndef __KERNEL__
 #define V4L2_FBUF_CAP_LIST_CLIPPING     0x0004
 #define V4L2_FBUF_CAP_BITMAP_CLIPPING	0x0008
+#endif
 #define V4L2_FBUF_CAP_LOCAL_ALPHA	0x0010
 #define V4L2_FBUF_CAP_GLOBAL_ALPHA	0x0020
 #define V4L2_FBUF_CAP_LOCAL_INV_ALPHA	0x0040
-- 
cgit v1.2.3


From a50ee4afc77e4df14bc146e03b3fa28daeec14a9 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 2 Mar 2023 10:57:46 +0100
Subject: media: subdev: Use 'shall' instead of 'may' in route validation

Route validation docs use the word 'may'. Change that to 'shall' for
emphasis.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-subdev.c | 2 +-
 include/media/v4l2-subdev.h           | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index c7154a7f696a..4c309207c8bd 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -1738,7 +1738,7 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 
 		/*
 		 * V4L2_SUBDEV_ROUTING_NO_STREAM_MIX: Streams on the same pad
-		 * may not be routed to streams on different pads.
+		 * shall not be routed to streams on different pads.
 		 */
 		if (disallow & V4L2_SUBDEV_ROUTING_NO_STREAM_MIX) {
 			if (remote_pads[route->sink_pad] != U32_MAX &&
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 7245887ef002..f6acb055579a 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1642,13 +1642,13 @@ u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state,
  * enum v4l2_subdev_routing_restriction - Subdevice internal routing restrictions
  *
  * @V4L2_SUBDEV_ROUTING_NO_1_TO_N:
- *	an input stream may not be routed to multiple output streams (stream
+ *	an input stream shall not be routed to multiple output streams (stream
  *	duplication)
  * @V4L2_SUBDEV_ROUTING_NO_N_TO_1:
- *	multiple input streams may not be routed to the same output stream
+ *	multiple input streams shall not be routed to the same output stream
  *	(stream merging)
  * @V4L2_SUBDEV_ROUTING_NO_STREAM_MIX:
- *	streams on the same pad may not be routed to streams on different pads
+ *	streams on the same pad shall not be routed to streams on different pads
  * @V4L2_SUBDEV_ROUTING_ONLY_1_TO_1:
  *	only non-overlapping 1-to-1 stream routing is allowed (a combination of
  *	@V4L2_SUBDEV_ROUTING_NO_1_TO_N and @V4L2_SUBDEV_ROUTING_NO_N_TO_1)
-- 
cgit v1.2.3


From 698a619a04bee1359358d5cba552f008709b79bc Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 2 Mar 2023 10:57:47 +0100
Subject: media: subdev: Split V4L2_SUBDEV_ROUTING_NO_STREAM_MIX

V4L2_SUBDEV_ROUTING_NO_STREAM_MIX routing validation flag means that all
routes from a sink pad must go to the same source pad and all routes
going to the same source pad must originate from the same sink pad.

This does not cover all use cases. For example, if a device routes
all streams from a single sink pad to any of the source pads, but
streams from multiple sink pads can go to the same source pad, the
current flag is too restrictive.

Split the flag into two parts, V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX
and V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX, which add the restriction
only on one side of the device. Together they mean the same as
V4L2_SUBDEV_ROUTING_NO_STREAM_MIX.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-subdev.c | 15 +++++++++++----
 include/media/v4l2-subdev.h           | 17 ++++++++++++++---
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 4c309207c8bd..b376c47b083d 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -1737,10 +1737,10 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 		}
 
 		/*
-		 * V4L2_SUBDEV_ROUTING_NO_STREAM_MIX: Streams on the same pad
-		 * shall not be routed to streams on different pads.
+		 * V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX: all streams from a
+		 * sink pad must be routed to a single source pad.
 		 */
-		if (disallow & V4L2_SUBDEV_ROUTING_NO_STREAM_MIX) {
+		if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX) {
 			if (remote_pads[route->sink_pad] != U32_MAX &&
 			    remote_pads[route->sink_pad] != route->source_pad) {
 				dev_dbg(sd->dev,
@@ -1749,6 +1749,14 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 				goto out;
 			}
 
+			remote_pads[route->sink_pad] = route->source_pad;
+		}
+
+		/*
+		 * V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX: all streams on a
+		 * source pad must originate from a single sink pad.
+		 */
+		if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX) {
 			if (remote_pads[route->source_pad] != U32_MAX &&
 			    remote_pads[route->source_pad] != route->sink_pad) {
 				dev_dbg(sd->dev,
@@ -1757,7 +1765,6 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 				goto out;
 			}
 
-			remote_pads[route->sink_pad] = route->source_pad;
 			remote_pads[route->source_pad] = route->sink_pad;
 		}
 
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index f6acb055579a..f921d8a7898e 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1647,19 +1647,30 @@ u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state,
  * @V4L2_SUBDEV_ROUTING_NO_N_TO_1:
  *	multiple input streams shall not be routed to the same output stream
  *	(stream merging)
- * @V4L2_SUBDEV_ROUTING_NO_STREAM_MIX:
- *	streams on the same pad shall not be routed to streams on different pads
+ * @V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX:
+ *	all streams from a sink pad must be routed to a single source pad
+ * @V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX:
+ *	all streams on a source pad must originate from a single sink pad
  * @V4L2_SUBDEV_ROUTING_ONLY_1_TO_1:
  *	only non-overlapping 1-to-1 stream routing is allowed (a combination of
  *	@V4L2_SUBDEV_ROUTING_NO_1_TO_N and @V4L2_SUBDEV_ROUTING_NO_N_TO_1)
+ * @V4L2_SUBDEV_ROUTING_NO_STREAM_MIX:
+ *	all streams from a sink pad must be routed to a single source pad, and
+ *	that source pad shall not get routes from any other sink pad
+ *	(a combination of @V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX and
+ *	@V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX)
  */
 enum v4l2_subdev_routing_restriction {
 	V4L2_SUBDEV_ROUTING_NO_1_TO_N = BIT(0),
 	V4L2_SUBDEV_ROUTING_NO_N_TO_1 = BIT(1),
-	V4L2_SUBDEV_ROUTING_NO_STREAM_MIX = BIT(2),
+	V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX = BIT(2),
+	V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX = BIT(3),
 	V4L2_SUBDEV_ROUTING_ONLY_1_TO_1 =
 		V4L2_SUBDEV_ROUTING_NO_1_TO_N |
 		V4L2_SUBDEV_ROUTING_NO_N_TO_1,
+	V4L2_SUBDEV_ROUTING_NO_STREAM_MIX =
+		V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX |
+		V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX,
 };
 
 /**
-- 
cgit v1.2.3


From a1299df6718b718256fd9bf6d7f9bed44c826e61 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 2 Mar 2023 10:57:48 +0100
Subject: media: subdev: Add V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING

A common case with subdev routing is that on the subdevice just before
the DMA engines (video nodes), no multiplexing is allowed on the source
pads, as the DMA engine can only handle a single stream.

In some other situations one might also want to do the same check on the
sink side.

Add new routing validation flags to check these:
V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING and
V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-subdev.c | 36 ++++++++++++++++++++++++++++++++---
 include/media/v4l2-subdev.h           | 11 +++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index b376c47b083d..bfb102f2121a 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -1708,7 +1708,8 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 	unsigned int i, j;
 	int ret = -EINVAL;
 
-	if (disallow & V4L2_SUBDEV_ROUTING_NO_STREAM_MIX) {
+	if (disallow & (V4L2_SUBDEV_ROUTING_NO_STREAM_MIX |
+			V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING)) {
 		remote_pads = kcalloc(sd->entity.num_pads, sizeof(*remote_pads),
 				      GFP_KERNEL);
 		if (!remote_pads)
@@ -1748,8 +1749,6 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 					i, "sink");
 				goto out;
 			}
-
-			remote_pads[route->sink_pad] = route->source_pad;
 		}
 
 		/*
@@ -1764,7 +1763,38 @@ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd,
 					i, "source");
 				goto out;
 			}
+		}
+
+		/*
+		 * V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING: Pads on the sink
+		 * side can not do stream multiplexing, i.e. there can be only
+		 * a single stream in a sink pad.
+		 */
+		if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING) {
+			if (remote_pads[route->sink_pad] != U32_MAX) {
+				dev_dbg(sd->dev,
+					"route %u attempts to multiplex on %s pad %u\n",
+					i, "sink", route->sink_pad);
+				goto out;
+			}
+		}
 
+		/*
+		 * V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING: Pads on the
+		 * source side can not do stream multiplexing, i.e. there can
+		 * be only a single stream in a source pad.
+		 */
+		if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING) {
+			if (remote_pads[route->source_pad] != U32_MAX) {
+				dev_dbg(sd->dev,
+					"route %u attempts to multiplex on %s pad %u\n",
+					i, "source", route->source_pad);
+				goto out;
+			}
+		}
+
+		if (remote_pads) {
+			remote_pads[route->sink_pad] = route->source_pad;
 			remote_pads[route->source_pad] = route->sink_pad;
 		}
 
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index f921d8a7898e..1d2877700cdb 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1651,6 +1651,10 @@ u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state,
  *	all streams from a sink pad must be routed to a single source pad
  * @V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX:
  *	all streams on a source pad must originate from a single sink pad
+ * @V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING:
+ *	source pads shall not contain multiplexed streams
+ * @V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING:
+ *	sink pads shall not contain multiplexed streams
  * @V4L2_SUBDEV_ROUTING_ONLY_1_TO_1:
  *	only non-overlapping 1-to-1 stream routing is allowed (a combination of
  *	@V4L2_SUBDEV_ROUTING_NO_1_TO_N and @V4L2_SUBDEV_ROUTING_NO_N_TO_1)
@@ -1659,18 +1663,25 @@ u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state,
  *	that source pad shall not get routes from any other sink pad
  *	(a combination of @V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX and
  *	@V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX)
+ * @V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING:
+ *	no multiplexed streams allowed on either source or sink sides.
  */
 enum v4l2_subdev_routing_restriction {
 	V4L2_SUBDEV_ROUTING_NO_1_TO_N = BIT(0),
 	V4L2_SUBDEV_ROUTING_NO_N_TO_1 = BIT(1),
 	V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX = BIT(2),
 	V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX = BIT(3),
+	V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING = BIT(4),
+	V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING = BIT(5),
 	V4L2_SUBDEV_ROUTING_ONLY_1_TO_1 =
 		V4L2_SUBDEV_ROUTING_NO_1_TO_N |
 		V4L2_SUBDEV_ROUTING_NO_N_TO_1,
 	V4L2_SUBDEV_ROUTING_NO_STREAM_MIX =
 		V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX |
 		V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX,
+	V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING =
+		V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING |
+		V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING,
 };
 
 /**
-- 
cgit v1.2.3


From fec53f44945877c8627da4d3ad70e3ac7e204f38 Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:09 -0700
Subject: net: mscc: ocelot: expose ocelot_pll5_init routine

Ocelot chips have an internal PLL that must be used when communicating
through external phys. Expose the init routine, so it can be used by other
drivers.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c         | 31 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_vsc7514.c | 30 -----------------------------
 include/soc/mscc/ocelot.h                  |  2 ++
 3 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 08acb7b89086..9b8403e29445 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -7,6 +7,7 @@
 #include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
+#include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
 #include "ocelot_vcap.h"
@@ -211,6 +212,36 @@ static void ocelot_mact_init(struct ocelot *ocelot)
 	ocelot_write(ocelot, MACACCESS_CMD_INIT, ANA_TABLES_MACACCESS);
 }
 
+void ocelot_pll5_init(struct ocelot *ocelot)
+{
+	/* Configure PLL5. This will need a proper CCF driver
+	 * The values are coming from the VTSS API for Ocelot
+	 */
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
+		     HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
+		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
+		     HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
+		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
+		     HSIO_PLL5G_CFG0_ENA_BIAS |
+		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
+		     HSIO_PLL5G_CFG0_ENA_CP1 |
+		     HSIO_PLL5G_CFG0_SELCPI(2) |
+		     HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
+		     HSIO_PLL5G_CFG0_SELBGV820(4) |
+		     HSIO_PLL5G_CFG0_DIV4 |
+		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
+		     HSIO_PLL5G_CFG0_ENA_LANE);
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
+		     HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
+		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
+		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
+		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
+		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
+		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
+}
+EXPORT_SYMBOL(ocelot_pll5_init);
+
 static void ocelot_vcap_enable(struct ocelot *ocelot, int port)
 {
 	ocelot_write_gix(ocelot, ANA_PORT_VCAP_S2_CFG_S2_ENA |
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 7388c3b0535c..97e90e2869d4 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -18,7 +18,6 @@
 
 #include <soc/mscc/ocelot.h>
 #include <soc/mscc/ocelot_vcap.h>
-#include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/vsc7514_regs.h>
 #include "ocelot_fdma.h"
 #include "ocelot.h"
@@ -26,35 +25,6 @@
 #define VSC7514_VCAP_POLICER_BASE			128
 #define VSC7514_VCAP_POLICER_MAX			191
 
-static void ocelot_pll5_init(struct ocelot *ocelot)
-{
-	/* Configure PLL5. This will need a proper CCF driver
-	 * The values are coming from the VTSS API for Ocelot
-	 */
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
-		     HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
-		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
-		     HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
-		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
-		     HSIO_PLL5G_CFG0_ENA_BIAS |
-		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
-		     HSIO_PLL5G_CFG0_ENA_CP1 |
-		     HSIO_PLL5G_CFG0_SELCPI(2) |
-		     HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
-		     HSIO_PLL5G_CFG0_SELBGV820(4) |
-		     HSIO_PLL5G_CFG0_DIV4 |
-		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
-		     HSIO_PLL5G_CFG0_ENA_LANE);
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
-		     HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
-		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
-		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
-		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
-		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
-		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
-}
-
 static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops)
 {
 	int ret;
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2080879e4134..751d9b250615 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1183,4 +1183,6 @@ ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port,
 }
 #endif
 
+void ocelot_pll5_init(struct ocelot *ocelot);
+
 #endif
-- 
cgit v1.2.3


From 69f7f89c0db52c5a3fe1bc9ba69d8248b5ee0bca Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:10 -0700
Subject: net: mscc: ocelot: expose generic phylink_mac_config routine

The ocelot-switch driver can utilize the phylink_mac_config routine. Move
this to the ocelot library location and export the symbol to make this
possible.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c     | 26 ++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_net.c | 21 +++------------------
 include/soc/mscc/ocelot.h              |  3 +++
 3 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 9b8403e29445..8292e93a3782 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -809,6 +809,32 @@ static int ocelot_port_flush(struct ocelot *ocelot, int port)
 	return err;
 }
 
+void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
+			       unsigned int link_an_mode,
+			       const struct phylink_link_state *state)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+
+	/* Disable HDX fast control */
+	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
+			   DEV_PORT_MISC);
+
+	/* SGMII only for now */
+	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
+			   PCS1G_MODE_CFG);
+	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
+
+	/* Enable PCS */
+	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
+
+	/* No aneg on SGMII */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+
+	/* No loopback */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+}
+EXPORT_SYMBOL_GPL(ocelot_phylink_mac_config);
+
 void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
 				  unsigned int link_an_mode,
 				  phy_interface_t interface,
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index ca4bde861397..590a2b2816ad 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1675,25 +1675,10 @@ static void vsc7514_phylink_mac_config(struct phylink_config *config,
 {
 	struct net_device *ndev = to_net_dev(config->dev);
 	struct ocelot_port_private *priv = netdev_priv(ndev);
-	struct ocelot_port *ocelot_port = &priv->port;
-
-	/* Disable HDX fast control */
-	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
-			   DEV_PORT_MISC);
-
-	/* SGMII only for now */
-	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
-			   PCS1G_MODE_CFG);
-	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
-
-	/* Enable PCS */
-	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
-
-	/* No aneg on SGMII */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+	struct ocelot *ocelot = priv->port.ocelot;
+	int port = priv->port.index;
 
-	/* No loopback */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+	ocelot_phylink_mac_config(ocelot, port, link_an_mode, state);
 }
 
 static void vsc7514_phylink_mac_link_down(struct phylink_config *config,
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 751d9b250615..87ade87d3540 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1111,6 +1111,9 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
+			       unsigned int link_an_mode,
+			       const struct phylink_link_state *state);
 void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
 				  unsigned int link_an_mode,
 				  phy_interface_t interface,
-- 
cgit v1.2.3


From dfca93ed51a7cf8bfda876705816a5e55381ac4a Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:11 -0700
Subject: net: mscc: ocelot: expose serdes configuration function

During chip initialization, ports that use SGMII / QSGMII to interface to
external phys need to be configured on the VSC7513 and VSC7514. Expose this
configuration routine, so it can be used by DSA drivers.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c     | 40 ++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_net.c | 29 +++---------------------
 include/soc/mscc/ocelot.h              |  4 ++++
 3 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8292e93a3782..1502bb2c8ea7 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -7,6 +7,7 @@
 #include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
+#include <linux/phy/phy.h>
 #include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
@@ -809,6 +810,45 @@ static int ocelot_port_flush(struct ocelot *ocelot, int port)
 	return err;
 }
 
+int ocelot_port_configure_serdes(struct ocelot *ocelot, int port,
+				 struct device_node *portnp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct device *dev = ocelot->dev;
+	int err;
+
+	/* Ensure clock signals and speed are set on all QSGMII links */
+	if (ocelot_port->phy_mode == PHY_INTERFACE_MODE_QSGMII)
+		ocelot_port_rmwl(ocelot_port, 0,
+				 DEV_CLOCK_CFG_MAC_TX_RST |
+				 DEV_CLOCK_CFG_MAC_RX_RST,
+				 DEV_CLOCK_CFG);
+
+	if (ocelot_port->phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
+		struct phy *serdes = of_phy_get(portnp, NULL);
+
+		if (IS_ERR(serdes)) {
+			err = PTR_ERR(serdes);
+			dev_err_probe(dev, err,
+				      "missing SerDes phys for port %d\n",
+				      port);
+			return err;
+		}
+
+		err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET,
+				       ocelot_port->phy_mode);
+		of_phy_put(serdes);
+		if (err) {
+			dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
+				port, ERR_PTR(err));
+			return err;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ocelot_port_configure_serdes);
+
 void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
 			       unsigned int link_an_mode,
 			       const struct phylink_link_state *state)
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 590a2b2816ad..21a87a3fc556 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1742,34 +1742,11 @@ static int ocelot_port_phylink_create(struct ocelot *ocelot, int port,
 		return -EINVAL;
 	}
 
-	/* Ensure clock signals and speed are set on all QSGMII links */
-	if (phy_mode == PHY_INTERFACE_MODE_QSGMII)
-		ocelot_port_rmwl(ocelot_port, 0,
-				 DEV_CLOCK_CFG_MAC_TX_RST |
-				 DEV_CLOCK_CFG_MAC_RX_RST,
-				 DEV_CLOCK_CFG);
-
 	ocelot_port->phy_mode = phy_mode;
 
-	if (phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
-		struct phy *serdes = of_phy_get(portnp, NULL);
-
-		if (IS_ERR(serdes)) {
-			err = PTR_ERR(serdes);
-			dev_err_probe(dev, err,
-				      "missing SerDes phys for port %d\n",
-				      port);
-			return err;
-		}
-
-		err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET, phy_mode);
-		of_phy_put(serdes);
-		if (err) {
-			dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
-				port, ERR_PTR(err));
-			return err;
-		}
-	}
+	err = ocelot_port_configure_serdes(ocelot, port, portnp);
+	if (err)
+		return err;
 
 	priv = container_of(ocelot_port, struct ocelot_port_private, port);
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 87ade87d3540..d757b5e26d26 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -644,6 +644,7 @@ enum ocelot_tag_prefix {
 };
 
 struct ocelot;
+struct device_node;
 
 struct ocelot_ops {
 	struct net_device *(*port_to_netdev)(struct ocelot *ocelot, int port);
@@ -1111,6 +1112,9 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+int ocelot_port_configure_serdes(struct ocelot *ocelot, int port,
+				 struct device_node *portnp);
+
 void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
 			       unsigned int link_an_mode,
 			       const struct phylink_link_state *state);
-- 
cgit v1.2.3


From e0aeb9b90acf6ee7c2d11141522ffbb5481734d3 Mon Sep 17 00:00:00 2001
From: Raed Salem <raeds@nvidia.com>
Date: Tue, 14 Mar 2023 10:58:39 +0200
Subject: xfrm: add new device offload acquire flag

During XFRM acquire flow, a default SA is created to be updated later,
once acquire netlink message is handled in user space. When the relevant
policy is offloaded this default SA is also offloaded to IPsec offload
supporting driver, however this SA does not have context suitable for
offloading in HW, nor is interesting to offload to HW, consequently needs
a special driver handling apart from other offloaded SA(s).
Add a special flag that marks such SA so driver can handle it correctly.

Signed-off-by: Raed Salem <raeds@nvidia.com>
Link: https://lore.kernel.org/r/f5da0834d8c6b82ab9ba38bd4a0c55e71f0e3dab.1678714336.git.leon@kernel.org
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/net/xfrm.h    | 5 +++++
 net/xfrm/xfrm_state.c | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3e1f70e8e424..33ee3f5936e6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -138,6 +138,10 @@ enum {
 	XFRM_DEV_OFFLOAD_PACKET,
 };
 
+enum {
+	XFRM_DEV_OFFLOAD_FLAG_ACQ = 1,
+};
+
 struct xfrm_dev_offload {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
@@ -145,6 +149,7 @@ struct xfrm_dev_offload {
 	unsigned long		offload_handle;
 	u8			dir : 2;
 	u8			type : 2;
+	u8			flags : 2;
 };
 
 struct xfrm_mode {
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2ab3e09e2227..7cca0a1fa5ff 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1272,6 +1272,7 @@ found:
 			xso->dir = xdo->dir;
 			xso->dev = xdo->dev;
 			xso->real_dev = xdo->real_dev;
+			xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
 			netdev_tracker_alloc(xso->dev, &xso->dev_tracker,
 					     GFP_ATOMIC);
 			error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL);
-- 
cgit v1.2.3


From a69e332b4ef9f697a4558bab9c442a02b3659fcb Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 18 Mar 2023 21:32:41 +0100
Subject: net: phy: smsc: export functions for use by meson-gxl PHY driver

The Amlogic Meson internal PHY's have the same register layout as
certain SMSC PHY's (also for non-c22-standard registers). This seems
to be more than just coincidence. Apparently they also need the same
workaround for EDPD mode (energy detect power down). Therefore let's
export SMSC PHY driver functionality for use by the meson-gxl PHY
driver.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Chris Healy <healych@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/smsc.c  | 20 +++++++++++++-------
 include/linux/smscphy.h |  6 ++++++
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 721871184205..730964b856ab 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -54,7 +54,7 @@ static int smsc_phy_ack_interrupt(struct phy_device *phydev)
 	return rc < 0 ? rc : 0;
 }
 
-static int smsc_phy_config_intr(struct phy_device *phydev)
+int smsc_phy_config_intr(struct phy_device *phydev)
 {
 	int rc;
 
@@ -75,8 +75,9 @@ static int smsc_phy_config_intr(struct phy_device *phydev)
 
 	return rc < 0 ? rc : 0;
 }
+EXPORT_SYMBOL_GPL(smsc_phy_config_intr);
 
-static irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
+irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
 {
 	int irq_status;
 
@@ -95,18 +96,20 @@ static irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
 
 	return IRQ_HANDLED;
 }
+EXPORT_SYMBOL_GPL(smsc_phy_handle_interrupt);
 
-static int smsc_phy_config_init(struct phy_device *phydev)
+int smsc_phy_config_init(struct phy_device *phydev)
 {
 	struct smsc_phy_priv *priv = phydev->priv;
 
-	if (!priv->energy_enable || phydev->irq != PHY_POLL)
+	if (!priv || !priv->energy_enable || phydev->irq != PHY_POLL)
 		return 0;
 
 	/* Enable energy detect power down mode */
 	return phy_set_bits(phydev, MII_LAN83C185_CTRL_STATUS,
 			    MII_LAN83C185_EDPWRDOWN);
 }
+EXPORT_SYMBOL_GPL(smsc_phy_config_init);
 
 static int smsc_phy_reset(struct phy_device *phydev)
 {
@@ -186,7 +189,7 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev)
  * The workaround is only applicable to poll mode. Energy Detect Power-Down may
  * not be used in interrupt mode lest link change detection becomes unreliable.
  */
-static int lan87xx_read_status(struct phy_device *phydev)
+int lan87xx_read_status(struct phy_device *phydev)
 {
 	struct smsc_phy_priv *priv = phydev->priv;
 	int err;
@@ -195,7 +198,8 @@ static int lan87xx_read_status(struct phy_device *phydev)
 	if (err)
 		return err;
 
-	if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) {
+	if (!phydev->link && priv && priv->energy_enable &&
+	    phydev->irq == PHY_POLL) {
 		/* Disable EDPD to wake up PHY */
 		int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
 		if (rc < 0)
@@ -229,6 +233,7 @@ static int lan87xx_read_status(struct phy_device *phydev)
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(lan87xx_read_status);
 
 static int smsc_get_sset_count(struct phy_device *phydev)
 {
@@ -269,7 +274,7 @@ static void smsc_get_stats(struct phy_device *phydev,
 		data[i] = smsc_get_stat(phydev, i);
 }
 
-static int smsc_phy_probe(struct phy_device *phydev)
+int smsc_phy_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
 	struct smsc_phy_priv *priv;
@@ -294,6 +299,7 @@ static int smsc_phy_probe(struct phy_device *phydev)
 
 	return clk_set_rate(refclk, 50 * 1000 * 1000);
 }
+EXPORT_SYMBOL_GPL(smsc_phy_probe);
 
 static struct phy_driver smsc_phy_driver[] = {
 {
diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h
index 1a136271ba6a..80f37c1dba58 100644
--- a/include/linux/smscphy.h
+++ b/include/linux/smscphy.h
@@ -28,4 +28,10 @@
 #define MII_LAN83C185_MODE_POWERDOWN 0xC0 /* Power Down mode */
 #define MII_LAN83C185_MODE_ALL       0xE0 /* All capable mode */
 
+int smsc_phy_config_intr(struct phy_device *phydev);
+irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev);
+int smsc_phy_config_init(struct phy_device *phydev);
+int lan87xx_read_status(struct phy_device *phydev);
+int smsc_phy_probe(struct phy_device *phydev);
+
 #endif /* __LINUX_SMSCPHY_H__ */
-- 
cgit v1.2.3


From 419405c92299d793b95053aa54d95e2d3f45a1a4 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Mon, 6 Mar 2023 08:56:49 +0100
Subject: interconnect: drop racy registration API

Now that all interconnect drivers have been converted to the new
provider registration API, the old racy interface can be removed.

Reviewed-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20230306075651.2449-22-johan+linaro@kernel.org
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 drivers/interconnect/core.c           | 16 ----------------
 include/linux/interconnect-provider.h | 11 -----------
 2 files changed, 27 deletions(-)

(limited to 'include')

diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
index 4175a42bf748..da456329c524 100644
--- a/drivers/interconnect/core.c
+++ b/drivers/interconnect/core.c
@@ -1084,22 +1084,6 @@ void icc_provider_deregister(struct icc_provider *provider)
 }
 EXPORT_SYMBOL_GPL(icc_provider_deregister);
 
-int icc_provider_add(struct icc_provider *provider)
-{
-	icc_provider_init(provider);
-
-	return icc_provider_register(provider);
-}
-EXPORT_SYMBOL_GPL(icc_provider_add);
-
-void icc_provider_del(struct icc_provider *provider)
-{
-	WARN_ON(!list_empty(&provider->nodes));
-
-	icc_provider_deregister(provider);
-}
-EXPORT_SYMBOL_GPL(icc_provider_del);
-
 static const struct of_device_id __maybe_unused ignore_list[] = {
 	{ .compatible = "qcom,sc7180-ipa-virt" },
 	{ .compatible = "qcom,sc8180x-ipa-virt" },
diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h
index d12cd18aab3f..b9af9016a95e 100644
--- a/include/linux/interconnect-provider.h
+++ b/include/linux/interconnect-provider.h
@@ -125,8 +125,6 @@ int icc_nodes_remove(struct icc_provider *provider);
 void icc_provider_init(struct icc_provider *provider);
 int icc_provider_register(struct icc_provider *provider);
 void icc_provider_deregister(struct icc_provider *provider);
-int icc_provider_add(struct icc_provider *provider);
-void icc_provider_del(struct icc_provider *provider);
 struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec);
 void icc_sync_state(struct device *dev);
 
@@ -179,15 +177,6 @@ static inline int icc_provider_register(struct icc_provider *provider)
 
 static inline void icc_provider_deregister(struct icc_provider *provider) { }
 
-static inline int icc_provider_add(struct icc_provider *provider)
-{
-	return -ENOTSUPP;
-}
-
-static inline void icc_provider_del(struct icc_provider *provider)
-{
-}
-
 static inline struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec)
 {
 	return ERR_PTR(-ENOTSUPP);
-- 
cgit v1.2.3


From 7370f639bbc733864e152ead3f26257f55b14ef4 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Thu, 22 Dec 2022 12:56:37 +0100
Subject: media: v4l2-ctrls: Fix doc for v4l2_ctrl_request_hdl_find

We should call v4l2_ctrl_request_hdl_put() instead of
v4l2_ctrl_request_put_hdl(). Fix the typo.

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 include/media/v4l2-ctrls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index e59d9a234631..7788eeb3e2bb 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -1343,7 +1343,7 @@ void v4l2_ctrl_request_complete(struct media_request *req,
  * @parent: The parent control handler ('priv' in media_request_object_find())
  *
  * This function finds the control handler in the request. It may return
- * NULL if not found. When done, you must call v4l2_ctrl_request_put_hdl()
+ * NULL if not found. When done, you must call v4l2_ctrl_request_hdl_put()
  * with the returned handler pointer.
  *
  * If the request is not in state VALIDATING or QUEUED, then this function
-- 
cgit v1.2.3


From f22f9aaf6c3d92ebd5ad9e67acc03afebaaeb289 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 17 Mar 2023 12:43:07 -0400
Subject: selinux: remove the runtime disable functionality

After working with the larger SELinux-based distros for several
years, we're finally at a place where we can disable the SELinux
runtime disable functionality.  The existing kernel deprecation
notice explains the functionality and why we want to remove it:

  The selinuxfs "disable" node allows SELinux to be disabled at
  runtime prior to a policy being loaded into the kernel.  If
  disabled via this mechanism, SELinux will remain disabled until
  the system is rebooted.

  The preferred method of disabling SELinux is via the "selinux=0"
  boot parameter, but the selinuxfs "disable" node was created to
  make it easier for systems with primitive bootloaders that did not
  allow for easy modification of the kernel command line.
  Unfortunately, allowing for SELinux to be disabled at runtime makes
  it difficult to secure the kernel's LSM hooks using the
  "__ro_after_init" feature.

It is that last sentence, mentioning the '__ro_after_init' hardening,
which is the real motivation for this change, and if you look at the
diffstat you'll see that the impact of this patch reaches across all
the different LSMs, helping prevent tampering at the LSM hook level.

From a SELinux perspective, it is important to note that if you
continue to disable SELinux via "/etc/selinux/config" it may appear
that SELinux is disabled, but it is simply in an uninitialized state.
If you load a policy with `load_policy -i`, you will see SELinux
come alive just as if you had loaded the policy during early-boot.

It is also worth noting that the "/sys/fs/selinux/disable" file is
always writable now, regardless of the Kconfig settings, but writing
to the file has no effect on the system, other than to display an
error on the console if a non-zero/true value is written.

Finally, in the several years where we have been working on
deprecating this functionality, there has only been one instance of
someone mentioning any user visible breakage.  In this particular
case it was an individual's kernel test system, and the workaround
documented in the deprecation notice ("selinux=0" on the kernel
command line) resolved the issue without problem.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/ABI/obsolete/sysfs-selinux-disable | 26 -----------
 Documentation/ABI/removed/sysfs-selinux-disable  | 29 ++++++++++++
 include/linux/lsm_hooks.h                        |  7 ---
 security/Kconfig                                 |  5 ---
 security/apparmor/lsm.c                          |  6 +--
 security/bpf/hooks.c                             |  4 +-
 security/commoncap.c                             |  2 +-
 security/landlock/cred.c                         |  2 +-
 security/landlock/fs.c                           |  2 +-
 security/landlock/ptrace.c                       |  2 +-
 security/landlock/setup.c                        |  4 +-
 security/loadpin/loadpin.c                       |  2 +-
 security/lockdown/lockdown.c                     |  2 +-
 security/security.c                              |  4 +-
 security/selinux/Kconfig                         | 24 ----------
 security/selinux/hooks.c                         | 57 +-----------------------
 security/selinux/include/security.h              | 21 ---------
 security/selinux/selinuxfs.c                     | 43 +++---------------
 security/smack/smack_lsm.c                       |  4 +-
 security/tomoyo/tomoyo.c                         |  6 +--
 security/yama/yama_lsm.c                         |  2 +-
 21 files changed, 58 insertions(+), 196 deletions(-)
 delete mode 100644 Documentation/ABI/obsolete/sysfs-selinux-disable
 create mode 100644 Documentation/ABI/removed/sysfs-selinux-disable

(limited to 'include')

diff --git a/Documentation/ABI/obsolete/sysfs-selinux-disable b/Documentation/ABI/obsolete/sysfs-selinux-disable
deleted file mode 100644
index c340278e3cf8..000000000000
--- a/Documentation/ABI/obsolete/sysfs-selinux-disable
+++ /dev/null
@@ -1,26 +0,0 @@
-What:		/sys/fs/selinux/disable
-Date:		April 2005 (predates git)
-KernelVersion:	2.6.12-rc2 (predates git)
-Contact:	selinux@vger.kernel.org
-Description:
-
-	The selinuxfs "disable" node allows SELinux to be disabled at runtime
-	prior to a policy being loaded into the kernel.  If disabled via this
-	mechanism, SELinux will remain disabled until the system is rebooted.
-
-	The preferred method of disabling SELinux is via the "selinux=0" boot
-	parameter, but the selinuxfs "disable" node was created to make it
-	easier for systems with primitive bootloaders that did not allow for
-	easy modification of the kernel command line.  Unfortunately, allowing
-	for SELinux to be disabled at runtime makes it difficult to secure the
-	kernel's LSM hooks using the "__ro_after_init" feature.
-
-	Thankfully, the need for the SELinux runtime disable appears to be
-	gone, the default Kconfig configuration disables this selinuxfs node,
-	and only one of the major distributions, Fedora, supports disabling
-	SELinux at runtime.  Fedora is in the process of removing the
-	selinuxfs "disable" node and once that is complete we will start the
-	slow process of removing this code from the kernel.
-
-	More information on /sys/fs/selinux/disable can be found under the
-	CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.
diff --git a/Documentation/ABI/removed/sysfs-selinux-disable b/Documentation/ABI/removed/sysfs-selinux-disable
new file mode 100644
index 000000000000..cb783c64cab3
--- /dev/null
+++ b/Documentation/ABI/removed/sysfs-selinux-disable
@@ -0,0 +1,29 @@
+What:		/sys/fs/selinux/disable
+Date:		April 2005 (predates git)
+KernelVersion:	2.6.12-rc2 (predates git)
+Contact:	selinux@vger.kernel.org
+Description:
+
+	REMOVAL UPDATE: The SELinux runtime disable functionality was removed
+	in March 2023, the original deprecation notice is shown below.
+
+	The selinuxfs "disable" node allows SELinux to be disabled at runtime
+	prior to a policy being loaded into the kernel.  If disabled via this
+	mechanism, SELinux will remain disabled until the system is rebooted.
+
+	The preferred method of disabling SELinux is via the "selinux=0" boot
+	parameter, but the selinuxfs "disable" node was created to make it
+	easier for systems with primitive bootloaders that did not allow for
+	easy modification of the kernel command line.  Unfortunately, allowing
+	for SELinux to be disabled at runtime makes it difficult to secure the
+	kernel's LSM hooks using the "__ro_after_init" feature.
+
+	Thankfully, the need for the SELinux runtime disable appears to be
+	gone, the default Kconfig configuration disables this selinuxfs node,
+	and only one of the major distributions, Fedora, supports disabling
+	SELinux at runtime.  Fedora is in the process of removing the
+	selinuxfs "disable" node and once that is complete we will start the
+	slow process of removing this code from the kernel.
+
+	More information on /sys/fs/selinux/disable can be found under the
+	CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 6e156d2acffc..af87b962f5f7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1763,13 +1763,6 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
 }
 #endif /* CONFIG_SECURITY_SELINUX_DISABLE */
 
-/* Currently required to handle SELinux runtime hook disable. */
-#ifdef CONFIG_SECURITY_WRITABLE_HOOKS
-#define __lsm_ro_after_init
-#else
-#define __lsm_ro_after_init	__ro_after_init
-#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */
-
 extern int lsm_inode_alloc(struct inode *inode);
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/security/Kconfig b/security/Kconfig
index e6db09a779b7..9009893fb3f5 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -32,11 +32,6 @@ config SECURITY
 
 	  If you are unsure how to answer this question, answer N.
 
-config SECURITY_WRITABLE_HOOKS
-	depends on SECURITY
-	bool
-	default n
-
 config SECURITYFS
 	bool "Enable the securityfs filesystem"
 	help
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index d6cc4812ca53..cebba4824e60 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1209,13 +1209,13 @@ static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb
 /*
  * The cred blob is a pointer to, not an instance of, an aa_label.
  */
-struct lsm_blob_sizes apparmor_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = {
 	.lbs_cred = sizeof(struct aa_label *),
 	.lbs_file = sizeof(struct aa_file_ctx),
 	.lbs_task = sizeof(struct aa_task_ctx),
 };
 
-static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list apparmor_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
 	LSM_HOOK_INIT(capget, apparmor_capget),
@@ -1427,7 +1427,7 @@ static const struct kernel_param_ops param_ops_aaintbool = {
 	.get = param_get_aaintbool
 };
 /* Boot time disable flag */
-static int apparmor_enabled __lsm_ro_after_init = 1;
+static int apparmor_enabled __ro_after_init = 1;
 module_param_named(enabled, apparmor_enabled, aaintbool, 0444);
 
 static int __init apparmor_enabled_setup(char *str)
diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
index e5971fa74fd7..cfaf1d0e6a5f 100644
--- a/security/bpf/hooks.c
+++ b/security/bpf/hooks.c
@@ -6,7 +6,7 @@
 #include <linux/lsm_hooks.h>
 #include <linux/bpf_lsm.h>
 
-static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
 	LSM_HOOK_INIT(NAME, bpf_lsm_##NAME),
 	#include <linux/lsm_hook_defs.h>
@@ -22,7 +22,7 @@ static int __init bpf_lsm_init(void)
 	return 0;
 }
 
-struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes bpf_lsm_blob_sizes __ro_after_init = {
 	.lbs_inode = sizeof(struct bpf_storage_blob),
 	.lbs_task = sizeof(struct bpf_storage_blob),
 };
diff --git a/security/commoncap.c b/security/commoncap.c
index 5bb7d1e96277..0b3fc2f3afe7 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1440,7 +1440,7 @@ int cap_mmap_file(struct file *file, unsigned long reqprot,
 
 #ifdef CONFIG_SECURITY
 
-static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list capability_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(capable, cap_capable),
 	LSM_HOOK_INIT(settime, cap_settime),
 	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
diff --git a/security/landlock/cred.c b/security/landlock/cred.c
index ec6c37f04a19..13dff2a31545 100644
--- a/security/landlock/cred.c
+++ b/security/landlock/cred.c
@@ -34,7 +34,7 @@ static void hook_cred_free(struct cred *const cred)
 		landlock_put_ruleset_deferred(dom);
 }
 
-static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
 	LSM_HOOK_INIT(cred_free, hook_cred_free),
 };
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index adcea0fe7e68..1c0c198f6fdb 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1280,7 +1280,7 @@ static int hook_file_truncate(struct file *const file)
 	return -EACCES;
 }
 
-static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
 
 	LSM_HOOK_INIT(sb_delete, hook_sb_delete),
diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c
index 4c5b9cd71286..8a06d6c492bf 100644
--- a/security/landlock/ptrace.c
+++ b/security/landlock/ptrace.c
@@ -108,7 +108,7 @@ static int hook_ptrace_traceme(struct task_struct *const parent)
 	return task_ptrace(parent, current);
 }
 
-static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme),
 };
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index 3f196d2ce4f9..0f6113528fa4 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -15,9 +15,9 @@
 #include "ptrace.h"
 #include "setup.h"
 
-bool landlock_initialized __lsm_ro_after_init = false;
+bool landlock_initialized __ro_after_init = false;
 
-struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes landlock_blob_sizes __ro_after_init = {
 	.lbs_cred = sizeof(struct landlock_cred_security),
 	.lbs_file = sizeof(struct landlock_file_security),
 	.lbs_inode = sizeof(struct landlock_inode_security),
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index d73a281adf86..b9d773f11232 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -214,7 +214,7 @@ static int loadpin_load_data(enum kernel_load_data_id id, bool contents)
 	return loadpin_check(NULL, (enum kernel_read_file_id) id);
 }
 
-static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list loadpin_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security),
 	LSM_HOOK_INIT(kernel_read_file, loadpin_read_file),
 	LSM_HOOK_INIT(kernel_load_data, loadpin_load_data),
diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
index a79b985e917e..68d19632aeb7 100644
--- a/security/lockdown/lockdown.c
+++ b/security/lockdown/lockdown.c
@@ -71,7 +71,7 @@ static int lockdown_is_locked_down(enum lockdown_reason what)
 	return 0;
 }
 
-static struct security_hook_list lockdown_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list lockdown_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(locked_down, lockdown_is_locked_down),
 };
 
diff --git a/security/security.c b/security/security.c
index cf6cc576736f..f4e45992472e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -74,14 +74,14 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
 	[LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
 };
 
-struct security_hook_heads security_hook_heads __lsm_ro_after_init;
+struct security_hook_heads security_hook_heads __ro_after_init;
 static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);
 
 static struct kmem_cache *lsm_file_cache;
 static struct kmem_cache *lsm_inode_cache;
 
 char *lsm_names;
-static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;
+static struct lsm_blob_sizes blob_sizes __ro_after_init;
 
 /* Boot-time LSM user choice */
 static __initdata const char *chosen_lsm_order;
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index 4ea946123255..95a186ec0fcb 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -23,30 +23,6 @@ config SECURITY_SELINUX_BOOTPARAM
 
 	  If you are unsure how to answer this question, answer N.
 
-config SECURITY_SELINUX_DISABLE
-	bool "NSA SELinux runtime disable"
-	depends on SECURITY_SELINUX
-	select SECURITY_WRITABLE_HOOKS
-	default n
-	help
-	  This option enables writing to a selinuxfs node 'disable', which
-	  allows SELinux to be disabled at runtime prior to the policy load.
-	  SELinux will then remain disabled until the next boot.
-	  This option is similar to the selinux=0 boot parameter, but is to
-	  support runtime disabling of SELinux, e.g. from /sbin/init, for
-	  portability across platforms where boot parameters are difficult
-	  to employ.
-
-	  NOTE: selecting this option will disable the '__ro_after_init'
-	  kernel hardening feature for security hooks.   Please consider
-	  using the selinux=0 boot parameter instead of enabling this
-	  option.
-
-	  WARNING: this option is deprecated and will be removed in a future
-	  kernel release.
-
-	  If you are unsure how to answer this question, answer N.
-
 config SECURITY_SELINUX_DEVELOP
 	bool "NSA SELinux Development Support"
 	depends on SECURITY_SELINUX
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9a58971f9a16..79b4890e9936 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6769,7 +6769,7 @@ static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
 }
 #endif
 
-struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
 	.lbs_cred = sizeof(struct task_security_struct),
 	.lbs_file = sizeof(struct file_security_struct),
 	.lbs_inode = sizeof(struct inode_security_struct),
@@ -6905,7 +6905,7 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd)
  * safely. Breaking the ordering rules above might lead to NULL pointer derefs
  * when disabling SELinux at runtime.
  */
-static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
 	LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
 	LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
@@ -7253,7 +7253,6 @@ DEFINE_LSM(selinux) = {
 };
 
 #if defined(CONFIG_NETFILTER)
-
 static const struct nf_hook_ops selinux_nf_ops[] = {
 	{
 		.hook =		selinux_ip_postroute,
@@ -7328,56 +7327,4 @@ static int __init selinux_nf_ip_init(void)
 	return 0;
 }
 __initcall(selinux_nf_ip_init);
-
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-static void selinux_nf_ip_exit(void)
-{
-	pr_debug("SELinux:  Unregistering netfilter hooks\n");
-
-	unregister_pernet_subsys(&selinux_net_ops);
-}
-#endif
-
-#else /* CONFIG_NETFILTER */
-
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-#define selinux_nf_ip_exit()
-#endif
-
 #endif /* CONFIG_NETFILTER */
-
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-int selinux_disable(void)
-{
-	if (selinux_initialized()) {
-		/* Not permitted after initial policy load. */
-		return -EINVAL;
-	}
-
-	if (selinux_disabled()) {
-		/* Only do this once. */
-		return -EINVAL;
-	}
-
-	selinux_mark_disabled();
-
-	pr_info("SELinux:  Disabled at runtime.\n");
-
-	/*
-	 * Unregister netfilter hooks.
-	 * Must be done before security_delete_hooks() to avoid breaking
-	 * runtime disable.
-	 */
-	selinux_nf_ip_exit();
-
-	security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));
-
-	/* Try to destroy the avc node cache */
-	avc_disable();
-
-	/* Unregister selinuxfs. */
-	exit_sel_fs();
-
-	return 0;
-}
-#endif
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 312112d214bb..8746fafeb778 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -89,9 +89,6 @@ extern int selinux_enabled_boot;
 struct selinux_policy;
 
 struct selinux_state {
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-	bool disabled;
-#endif
 #ifdef CONFIG_SECURITY_SELINUX_DEVELOP
 	bool enforcing;
 #endif
@@ -148,23 +145,6 @@ static inline bool checkreqprot_get(void)
 	return 0;
 }
 
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-static inline bool selinux_disabled(void)
-{
-	return READ_ONCE(selinux_state.disabled);
-}
-
-static inline void selinux_mark_disabled(void)
-{
-	WRITE_ONCE(selinux_state.disabled, true);
-}
-#else
-static inline bool selinux_disabled(void)
-{
-	return false;
-}
-#endif
-
 static inline bool selinux_policycap_netpeer(void)
 {
 	struct selinux_state *state = &selinux_state;
@@ -404,7 +384,6 @@ struct selinux_kernel_status {
 extern void selinux_status_update_setenforce(int enforcing);
 extern void selinux_status_update_policyload(int seqno);
 extern void selinux_complete_init(void);
-extern int selinux_disable(void);
 extern void exit_sel_fs(void);
 extern struct path selinux_null;
 extern void selnl_notify_setenforce(int val);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 68688bc84912..69a583b91fc5 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -267,7 +267,6 @@ static const struct file_operations sel_handle_status_ops = {
 	.llseek		= generic_file_llseek,
 };
 
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
 static ssize_t sel_write_disable(struct file *file, const char __user *buf,
 				 size_t count, loff_t *ppos)
 
@@ -275,16 +274,6 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf,
 	char *page;
 	ssize_t length;
 	int new_value;
-	int enforcing;
-
-	/* NOTE: we are now officially considering runtime disable as
-	 *       deprecated, and using it will become increasingly painful
-	 *       (e.g. sleeping/blocking) as we progress through future
-	 *       kernel releases until eventually it is removed
-	 */
-	pr_err("SELinux:  Runtime disable is deprecated, use selinux=0 on the kernel cmdline.\n");
-	pr_err("SELinux:  https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n");
-	ssleep(15);
 
 	if (count >= PAGE_SIZE)
 		return -ENOMEM;
@@ -297,31 +286,21 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf,
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
-	length = -EINVAL;
-	if (sscanf(page, "%d", &new_value) != 1)
+	if (sscanf(page, "%d", &new_value) != 1) {
+		length = -EINVAL;
 		goto out;
+	}
+	length = count;
 
 	if (new_value) {
-		enforcing = enforcing_enabled();
-		length = selinux_disable();
-		if (length)
-			goto out;
-		audit_log(audit_context(), GFP_KERNEL, AUDIT_MAC_STATUS,
-			"enforcing=%d old_enforcing=%d auid=%u ses=%u"
-			" enabled=0 old-enabled=1 lsm=selinux res=1",
-			enforcing, enforcing,
-			from_kuid(&init_user_ns, audit_get_loginuid(current)),
-			audit_get_sessionid(current));
+		pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n");
+		pr_err("SELinux: Runtime disable is not supported, use selinux=0 on the kernel cmdline.\n");
 	}
 
-	length = count;
 out:
 	kfree(page);
 	return length;
 }
-#else
-#define sel_write_disable NULL
-#endif
 
 static const struct file_operations sel_disable_ops = {
 	.write		= sel_write_disable,
@@ -2194,13 +2173,3 @@ static int __init init_sel_fs(void)
 }
 
 __initcall(init_sel_fs);
-
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-void exit_sel_fs(void)
-{
-	sysfs_remove_mount_point(fs_kobj, "selinux");
-	dput(selinux_null.dentry);
-	kern_unmount(selinuxfs_mount);
-	unregister_filesystem(&sel_fs_type);
-}
-#endif
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index cfcbb748da25..bc3c3e553133 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4847,7 +4847,7 @@ static int smack_uring_cmd(struct io_uring_cmd *ioucmd)
 
 #endif /* CONFIG_IO_URING */
 
-struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes smack_blob_sizes __ro_after_init = {
 	.lbs_cred = sizeof(struct task_smack),
 	.lbs_file = sizeof(struct smack_known *),
 	.lbs_inode = sizeof(struct inode_smack),
@@ -4856,7 +4856,7 @@ struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
 	.lbs_superblock = sizeof(struct superblock_smack),
 };
 
-static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
 	LSM_HOOK_INIT(syslog, smack_syslog),
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index af04a7b7eb28..25006fddc964 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -499,7 +499,7 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
 	return tomoyo_socket_sendmsg_permission(sock, msg, size);
 }
 
-struct lsm_blob_sizes tomoyo_blob_sizes __lsm_ro_after_init = {
+struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = {
 	.lbs_task = sizeof(struct tomoyo_task),
 };
 
@@ -546,7 +546,7 @@ static void tomoyo_task_free(struct task_struct *task)
  * tomoyo_security_ops is a "struct security_operations" which is used for
  * registering TOMOYO.
  */
-static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list tomoyo_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
 	LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
 	LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
@@ -583,7 +583,7 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
 /* Lock for GC. */
 DEFINE_SRCU(tomoyo_ss);
 
-int tomoyo_enabled __lsm_ro_after_init = 1;
+int tomoyo_enabled __ro_after_init = 1;
 
 /**
  * tomoyo_init - Register TOMOYO Linux as a LSM module.
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 06e226166aab..478be269571a 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -421,7 +421,7 @@ static int yama_ptrace_traceme(struct task_struct *parent)
 	return rc;
 }
 
-static struct security_hook_list yama_hooks[] __lsm_ro_after_init = {
+static struct security_hook_list yama_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme),
 	LSM_HOOK_INIT(task_prctl, yama_task_prctl),
-- 
cgit v1.2.3


From f2de003e1426ccbefa281a066040da7699f6d461 Mon Sep 17 00:00:00 2001
From: Nishanth Menon <nm@ti.com>
Date: Wed, 15 Mar 2023 10:52:28 -0500
Subject: dt-bindings: pinctrl: k3: Deprecate header with register constants

For convenience (less code duplication), the pin controller pin
configuration register values were defined in the bindings header.
These are not some IDs or other abstraction layer but raw numbers used
in the registers.

These constants do not fit the purpose of bindings. They do not
provide any abstraction, any hardware and driver independent ID. In
fact, the Linux pinctrl-single driver actually do not use the bindings
header at all.

All of the constants were moved already to headers local to DTS
(residing in DTS directory), so remove any references to the bindings
header and add a warning that it is deprecated.

Suggested-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/linux-arm-kernel/71c7feff-4189-f12f-7353-bce41a61119d@linaro.org/
Link: https://lore.kernel.org/r/20230315155228.1566883-4-nm@ti.com
Signed-off-by: Nishanth Menon <nm@ti.com>
---
 include/dt-bindings/pinctrl/k3.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/pinctrl/k3.h b/include/dt-bindings/pinctrl/k3.h
index 6bb9df1a264d..b5aca149664e 100644
--- a/include/dt-bindings/pinctrl/k3.h
+++ b/include/dt-bindings/pinctrl/k3.h
@@ -8,6 +8,13 @@
 #ifndef _DT_BINDINGS_PINCTRL_TI_K3_H
 #define _DT_BINDINGS_PINCTRL_TI_K3_H
 
+/*
+ * These bindings are deprecated, because they do not match the actual
+ * concept of bindings but rather contain pure register values.
+ * Instead include the header in the DTS source directory.
+ */
+#warning "These bindings are deprecated. Instead, use the header in the DTS source directory."
+
 #define PULLUDEN_SHIFT		(16)
 #define PULLTYPESEL_SHIFT	(17)
 #define RXACTIVE_SHIFT		(18)
-- 
cgit v1.2.3


From 2c5a06e5505a716387a4d86f1f39de506836435a Mon Sep 17 00:00:00 2001
From: Kiran K <kiran.k@intel.com>
Date: Thu, 16 Mar 2023 12:47:15 +0530
Subject: ACPI: utils: Fix acpi_evaluate_dsm_typed() redefinition error

acpi_evaluate_dsm_typed() needs to be gaurded with CONFIG_ACPI to avoid
a redefintion error when the stub is also enabled.

In file included from ../drivers/bluetooth/btintel.c:13:
../include/acpi/acpi_bus.h:57:1: error: redefinition of 'acpi_evaluate_dsm_typed'
   57 | acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid,..
      | ^~~~~~~~~~~~~~~~~~~~~~~
In file included from ../drivers/bluetooth/btintel.c:12:
../include/linux/acpi.h:967:34: note: previous definition of
'acpi_evaluate_dsm_typed' with type 'union acpi_object *(void *,
const guid_t *, u64,  u64,  union acpi_object *, acpi_object_type)'
{aka 'union acpi_object *(void *, const guid_t *, long long unsigned int,
long long unsigned int,  union acpi_object *, unsigned int)'}
  967 | static inline union acpi_object
*acpi_evaluate_dsm_typed(acpi_handle handle,

Fixes: 1b94ad7ccc21 ("ACPI: utils: Add acpi_evaluate_dsm_typed() and acpi_check_dsm() stubs")
Signed-off-by: Kiran K <kiran.k@intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpi_bus.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 57acb895c038..a6affc0550b0 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -52,7 +52,7 @@ bool acpi_dock_match(acpi_handle handle);
 bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs);
 union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid,
 			u64 rev, u64 func, union acpi_object *argv4);
-
+#ifdef CONFIG_ACPI
 static inline union acpi_object *
 acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev,
 			u64 func, union acpi_object *argv4,
@@ -68,6 +68,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev,
 
 	return obj;
 }
+#endif
 
 #define	ACPI_INIT_DSM_ARGV4(cnt, eles)			\
 	{						\
-- 
cgit v1.2.3


From 54bdd67d0f88489ac88f7664b56cb7c93799d84d Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 20 Mar 2023 12:49:26 -0700
Subject: blk-mq: remove hybrid polling

io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.

Hybrid polling was effectively killed off with 9650b453a3d4b1, "block:
ignore RWF_HIPRI hint for sync dio", but still potentially reachable
through io_uring until d729cf9acb93119, "io_uring: don't sleep when
polling for I/O", but hybrid polling probably should not have been
reachable through that async interface from the beginning.

Fixes: 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio")
Fixes: d729cf9acb93 ("io_uring: don't sleep when polling for I/O")
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20230320194926.3353144-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/stable/sysfs-block |  15 +--
 block/blk-core.c                     |   6 -
 block/blk-mq-debugfs.c               |  26 -----
 block/blk-mq.c                       | 205 +----------------------------------
 block/blk-stat.c                     |  18 ---
 block/blk-sysfs.c                    |  25 +----
 include/linux/blk-mq.h               |   2 -
 include/linux/blkdev.h               |  12 --
 io_uring/rw.c                        |   2 +-
 9 files changed, 12 insertions(+), 299 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367..c57e5b7cb532 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -336,18 +336,11 @@ What:		/sys/block/<disk>/queue/io_poll_delay
 Date:		November 2016
 Contact:	linux-block@vger.kernel.org
 Description:
-		[RW] If polling is enabled, this controls what kind of polling
-		will be performed. It defaults to -1, which is classic polling.
+		[RW] This was used to control what kind of polling will be
+		performed.  It is now fixed to -1, which is classic polling.
 		In this mode, the CPU will repeatedly ask for completions
-		without giving up any time.  If set to 0, a hybrid polling mode
-		is used, where the kernel will attempt to make an educated guess
-		at when the IO will complete. Based on this guess, the kernel
-		will put the process issuing IO to sleep for an amount of time,
-		before entering a classic poll loop. This mode might be a little
-		slower than pure classic polling, but it will be more efficient.
-		If set to a value larger than 0, the kernel will put the process
-		issuing IO to sleep for this amount of microseconds before
-		entering classic polling.
+		without giving up any time.
+		<deprecated>
 
 
 What:		/sys/block/<disk>/queue/io_timeout
diff --git a/block/blk-core.c b/block/blk-core.c
index 9e5e0277a4d9..269765d16cfd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 
 static void blk_free_queue(struct request_queue *q)
 {
-	if (q->poll_stat)
-		blk_stat_remove_callback(q, q->poll_cb);
-	blk_stat_free_callback(q->poll_cb);
-
 	blk_free_queue_stats(q->stats);
-	kfree(q->poll_stat);
-
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b01818f8e216..212a7f301e73 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -15,33 +15,8 @@
 #include "blk-mq-tag.h"
 #include "blk-rq-qos.h"
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-	if (stat->nr_samples) {
-		seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
-			   stat->nr_samples, stat->mean, stat->min, stat->max);
-	} else {
-		seq_puts(m, "samples=0");
-	}
-}
-
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
-	struct request_queue *q = data;
-	int bucket;
-
-	if (!q->poll_stat)
-		return 0;
-
-	for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
-		seq_printf(m, "read  (%d Bytes): ", 1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket]);
-		seq_puts(m, "\n");
-
-		seq_printf(m, "write (%d Bytes): ",  1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket + 1]);
-		seq_puts(m, "\n");
-	}
 	return 0;
 }
 
@@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_POLL_SLEPT),
 	RQF_NAME(TIMED_OUT),
 	RQF_NAME(ELV),
 	RQF_NAME(RESV),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a875b1cdff9b..4e30459df815 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -46,51 +46,15 @@
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
-	int ddir, sectors, bucket;
-
-	ddir = rq_data_dir(rq);
-	sectors = blk_rq_stats_sectors(rq);
-
-	bucket = ddir + 2 * ilog2(sectors);
-
-	if (bucket < 0)
-		return -1;
-	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
-		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
-	return bucket;
-}
-
-#define BLK_QC_T_SHIFT		16
-#define BLK_QC_T_INTERNAL	(1U << 31)
-
 static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
 		blk_qc_t qc)
 {
-	return xa_load(&q->hctx_table,
-			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
-		blk_qc_t qc)
-{
-	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
-	if (qc & BLK_QC_T_INTERNAL)
-		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
-	return blk_mq_tag_to_rq(hctx->tags, tag);
+	return xa_load(&q->hctx_table, qc);
 }
 
 static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 {
-	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
-		(rq->tag != -1 ?
-		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+	return rq->mq_hctx->queue_num;
 }
 
 /*
@@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)
 
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
+	if (rq->rq_flags & RQF_STATS)
 		blk_stat_add(rq, now);
-	}
 
 	blk_mq_sched_completed_request(rq, now);
 	blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
-	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-					     blk_mq_poll_stats_bkt,
-					     BLK_MQ_POLL_STATS_BKTS, q);
-	if (!q->poll_cb)
-		goto err_exit;
-
 	if (blk_mq_alloc_ctxs(q))
-		goto err_poll;
+		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	q->nr_requests = set->queue_depth;
 
-	/*
-	 * Default to classic polling
-	 */
-	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
 	blk_mq_release(q);
-err_poll:
-	blk_stat_free_callback(q->poll_cb);
-	q->poll_cb = NULL;
 err_exit:
 	q->mq_ops = NULL;
 	return -ENOMEM;
@@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
-	if (q->poll_stat)
-		return true;
-
-	return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
-	/*
-	 * We don't arm the callback if polling stats are not enabled or the
-	 * callback is already active.
-	 */
-	if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
-		return;
-
-	blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
-	struct request_queue *q = cb->data;
-	int bucket;
-
-	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
-		if (cb->stat[bucket].nr_samples)
-			q->poll_stat[bucket] = cb->stat[bucket];
-	}
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
-				       struct request *rq)
-{
-	unsigned long ret = 0;
-	int bucket;
-
-	/*
-	 * If stats collection isn't on, don't sleep but turn it on for
-	 * future users
-	 */
-	if (!blk_poll_stats_enable(q))
-		return 0;
-
-	/*
-	 * As an optimistic guess, use half of the mean service time
-	 * for this type of request. We can (and should) make this smarter.
-	 * For instance, if the completion latencies are tight, we can
-	 * get closer than just half the mean. This is especially
-	 * important on devices where the completion latencies are longer
-	 * than ~10 usec. We do use the stats for the relevant IO size
-	 * if available which does lead to better estimates.
-	 */
-	bucket = blk_mq_poll_stats_bkt(rq);
-	if (bucket < 0)
-		return ret;
-
-	if (q->poll_stat[bucket].nr_samples)
-		ret = (q->poll_stat[bucket].mean + 1) / 2;
-
-	return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
-	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
-	struct request *rq = blk_qc_to_rq(hctx, qc);
-	struct hrtimer_sleeper hs;
-	enum hrtimer_mode mode;
-	unsigned int nsecs;
-	ktime_t kt;
-
-	/*
-	 * If a request has completed on queue that uses an I/O scheduler, we
-	 * won't get back a request from blk_qc_to_rq.
-	 */
-	if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
-		return false;
-
-	/*
-	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
-	 *
-	 *  0:	use half of prev avg
-	 * >0:	use this specific value
-	 */
-	if (q->poll_nsec > 0)
-		nsecs = q->poll_nsec;
-	else
-		nsecs = blk_mq_poll_nsecs(q, rq);
-
-	if (!nsecs)
-		return false;
-
-	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
-	/*
-	 * This will be replaced with the stats tracking code, using
-	 * 'avg_completion_time / 2' as the pre-sleep target.
-	 */
-	kt = nsecs;
-
-	mode = HRTIMER_MODE_REL;
-	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
-	hrtimer_set_expires(&hs.timer, kt);
-
-	do {
-		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
-			break;
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		hrtimer_sleeper_start_expires(&hs, mode);
-		if (hs.task)
-			io_schedule();
-		hrtimer_cancel(&hs.timer);
-		mode = HRTIMER_MODE_ABS;
-	} while (hs.task && !signal_pending(current));
-
-	__set_current_state(TASK_RUNNING);
-	destroy_hrtimer_on_stack(&hs.timer);
-
-	/*
-	 * If we sleep, have the caller restart the poll loop to reset the
-	 * state.  Like for the other success return cases, the caller is
-	 * responsible for checking if the IO completed.  If the IO isn't
-	 * complete, we'll get called again and will go straight to the busy
-	 * poll loop.
-	 */
-	return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-			       struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
 	long state = get_current_state();
@@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
-		unsigned int flags)
-{
-	if (!(flags & BLK_POLL_NOSLEEP) &&
-	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
-		if (blk_mq_poll_hybrid(q, cookie))
-			return 1;
-	}
-	return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
 	return rq->mq_ctx->cpu;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c6ca16abf911..74a1a8c32d86 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
 
 	kfree(stats);
 }
-
-bool blk_stats_alloc_enable(struct request_queue *q)
-{
-	struct blk_rq_stat *poll_stat;
-
-	poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
-				GFP_ATOMIC);
-	if (!poll_stat)
-		return false;
-
-	if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
-		kfree(poll_stat);
-		return true;
-	}
-
-	blk_stat_add_callback(q, q->poll_cb);
-	return false;
-}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44..1a743b4f2958 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-	int val;
-
-	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-		val = BLK_MQ_POLL_CLASSIC;
-	else
-		val = q->poll_nsec / 1000;
-
-	return sprintf(page, "%d\n", val);
+	return sprintf(page, "%d\n", -1);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 				size_t count)
 {
-	int err, val;
-
-	if (!q->mq_ops || !q->mq_ops->poll)
-		return -EINVAL;
-
-	err = kstrtoint(page, 10, &val);
-	if (err < 0)
-		return err;
-
-	if (val == BLK_MQ_POLL_CLASSIC)
-		q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-	else if (val >= 0)
-		q->poll_nsec = val * 1000;
-	else
-		return -EINVAL;
-
 	return count;
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index dd5ce1137f04..1dacb2c81fdd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1aee08f8c18..6ede578dfbc6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,12 +44,6 @@ extern const struct device_type disk_type;
 extern struct device_type part_type;
 extern struct class block_class;
 
-/* Must be consistent with blk_mq_poll_stats_bkt() */
-#define BLK_MQ_POLL_STATS_BKTS 16
-
-/* Doing classic polling */
-#define BLK_MQ_POLL_CLASSIC -1
-
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -468,10 +462,6 @@ struct request_queue {
 #endif
 
 	unsigned int		rq_timeout;
-	int			poll_nsec;
-
-	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	*poll_stat;
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
@@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno);
 
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT		(1 << 0)
-/* do not sleep to wait for the expected completion time */
-#define BLK_POLL_NOSLEEP		(1 << 1)
 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
 int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
 			unsigned int flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e200..a099dc0543d9 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1002,7 +1002,7 @@ void io_rw_fail(struct io_kiocb *req)
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
-	unsigned int poll_flags = BLK_POLL_NOSLEEP;
+	unsigned int poll_flags = 0;
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
-- 
cgit v1.2.3


From d54c1fd4a51e8fbc7f9da86b0cd338a4f7cd2bb2 Mon Sep 17 00:00:00 2001
From: Qin Jian <qinjian@cqplus1.com>
Date: Mon, 19 Dec 2022 09:51:30 +0800
Subject: clk: Add Sunplus SP7021 clock driver

Add clock driver for Sunplus SP7021 SoC.

Signed-off-by: Qin Jian <qinjian@cqplus1.com>
Link: https://lore.kernel.org/r/20221219015130.42621-1-qinjian@cqplus1.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 MAINTAINERS                  |   1 +
 drivers/clk/Kconfig          |  10 +
 drivers/clk/Makefile         |   1 +
 drivers/clk/clk-sp7021.c     | 713 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/clk-provider.h |  19 ++
 5 files changed, 744 insertions(+)
 create mode 100644 drivers/clk/clk-sp7021.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d5bc223f305..f72107a6e86f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2849,6 +2849,7 @@ F:	Documentation/devicetree/bindings/reset/sunplus,reset.yaml
 F:	arch/arm/boot/dts/sunplus-sp7021*.dts*
 F:	arch/arm/configs/sp7021_*defconfig
 F:	arch/arm/mach-sunplus/
+F:	drivers/clk/clk-sp7021.c
 F:	drivers/irqchip/irq-sp7021-intc.c
 F:	drivers/reset/reset-sunplus.c
 F:	include/dt-bindings/clock/sunplus,sp7021-clkc.h
diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index b6c5bf69a2b2..54e01bb2f568 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -436,6 +436,16 @@ config COMMON_CLK_K210
 	help
 	  Support for the Canaan Kendryte K210 RISC-V SoC clocks.
 
+config COMMON_CLK_SP7021
+	tristate "Clock driver for Sunplus SP7021 SoC"
+	depends on SOC_SP7021 || COMPILE_TEST
+	default SOC_SP7021
+	help
+	  This driver supports the Sunplus SP7021 SoC clocks.
+	  It implements SP7021 PLLs/gate.
+	  Not all features of the PLL are currently supported
+	  by the driver.
+
 source "drivers/clk/actions/Kconfig"
 source "drivers/clk/analogbits/Kconfig"
 source "drivers/clk/baikal-t1/Kconfig"
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index e3ca0d058a25..7bbf77acf181 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_COMMON_CLK_SI5351)		+= clk-si5351.o
 obj-$(CONFIG_COMMON_CLK_SI514)		+= clk-si514.o
 obj-$(CONFIG_COMMON_CLK_SI544)		+= clk-si544.o
 obj-$(CONFIG_COMMON_CLK_SI570)		+= clk-si570.o
+obj-$(CONFIG_COMMON_CLK_SP7021)		+= clk-sp7021.o
 obj-$(CONFIG_COMMON_CLK_STM32F)		+= clk-stm32f4.o
 obj-$(CONFIG_COMMON_CLK_STM32H7)	+= clk-stm32h7.o
 obj-$(CONFIG_COMMON_CLK_STM32MP157)	+= clk-stm32mp1.o
diff --git a/drivers/clk/clk-sp7021.c b/drivers/clk/clk-sp7021.c
new file mode 100644
index 000000000000..8fec14120105
--- /dev/null
+++ b/drivers/clk/clk-sp7021.c
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/*
+ * Copyright (C) Sunplus Technology Co., Ltd.
+ *       All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/clk-provider.h>
+#include <linux/of.h>
+#include <linux/bitfield.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+
+#include <dt-bindings/clock/sunplus,sp7021-clkc.h>
+
+/* speical div_width values for PLLTV/PLLA */
+#define DIV_TV		33
+#define DIV_A		34
+
+/* PLLTV parameters */
+enum {
+	SEL_FRA,
+	SDM_MOD,
+	PH_SEL,
+	NFRA,
+	DIVR,
+	DIVN,
+	DIVM,
+	P_MAX
+};
+
+#define MASK_SEL_FRA	GENMASK(1, 1)
+#define MASK_SDM_MOD	GENMASK(2, 2)
+#define MASK_PH_SEL	GENMASK(4, 4)
+#define MASK_NFRA	GENMASK(12, 6)
+#define MASK_DIVR	GENMASK(8, 7)
+#define MASK_DIVN	GENMASK(7, 0)
+#define MASK_DIVM	GENMASK(14, 8)
+
+/* HIWORD_MASK FIELD_PREP */
+#define HWM_FIELD_PREP(mask, value)		\
+({						\
+	u32 _m = mask;				\
+	(_m << 16) | FIELD_PREP(_m, value);	\
+})
+
+struct sp_pll {
+	struct clk_hw hw;
+	void __iomem *reg;
+	spinlock_t lock;	/* lock for reg */
+	int div_shift;
+	int div_width;
+	int pd_bit;		/* power down bit idx */
+	int bp_bit;		/* bypass bit idx */
+	unsigned long brate;	/* base rate, TODO: replace brate with muldiv */
+	u32 p[P_MAX];		/* for hold PLLTV/PLLA parameters */
+};
+
+#define to_sp_pll(_hw)	container_of(_hw, struct sp_pll, hw)
+
+struct sp_clk_gate_info {
+	u16	reg;		/* reg_index_shift */
+	u16	ext_parent;	/* parent is extclk */
+};
+
+static const struct sp_clk_gate_info sp_clk_gates[] = {
+	{ 0x02 },
+	{ 0x05 },
+	{ 0x06 },
+	{ 0x07 },
+	{ 0x09 },
+	{ 0x0b, 1 },
+	{ 0x0f, 1 },
+	{ 0x14 },
+	{ 0x15 },
+	{ 0x16 },
+	{ 0x17 },
+	{ 0x18, 1 },
+	{ 0x19, 1 },
+	{ 0x1a, 1 },
+	{ 0x1b, 1 },
+	{ 0x1c, 1 },
+	{ 0x1d, 1 },
+	{ 0x1e },
+	{ 0x1f, 1 },
+	{ 0x20 },
+	{ 0x21 },
+	{ 0x22 },
+	{ 0x23 },
+	{ 0x24 },
+	{ 0x25 },
+	{ 0x26 },
+	{ 0x2a },
+	{ 0x2b },
+	{ 0x2d },
+	{ 0x2e },
+	{ 0x30 },
+	{ 0x31 },
+	{ 0x32 },
+	{ 0x33 },
+	{ 0x3d },
+	{ 0x3e },
+	{ 0x3f },
+	{ 0x42 },
+	{ 0x44 },
+	{ 0x4b },
+	{ 0x4c },
+	{ 0x4d },
+	{ 0x4e },
+	{ 0x4f },
+	{ 0x50 },
+	{ 0x55 },
+	{ 0x60 },
+	{ 0x61 },
+	{ 0x6a },
+	{ 0x73 },
+	{ 0x86 },
+	{ 0x8a },
+	{ 0x8b },
+	{ 0x8d },
+	{ 0x8e },
+	{ 0x8f },
+	{ 0x90 },
+	{ 0x92 },
+	{ 0x93 },
+	{ 0x95 },
+	{ 0x96 },
+	{ 0x97 },
+	{ 0x98 },
+	{ 0x99 },
+};
+
+#define _M		1000000UL
+#define F_27M		(27 * _M)
+
+/*********************************** PLL_TV **********************************/
+
+/* TODO: set proper FVCO range */
+#define FVCO_MIN	(100 * _M)
+#define FVCO_MAX	(200 * _M)
+
+#define F_MIN		(FVCO_MIN / 8)
+#define F_MAX		(FVCO_MAX)
+
+static long plltv_integer_div(struct sp_pll *clk, unsigned long freq)
+{
+	/* valid m values: 27M must be divisible by m */
+	static const u32 m_table[] = {
+		1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32
+	};
+	u32 m, n, r;
+	unsigned long fvco, nf;
+	long ret;
+
+	freq = clamp(freq, F_MIN, F_MAX);
+
+	/* DIVR 0~3 */
+	for (r = 0; r <= 3; r++) {
+		fvco = freq << r;
+		if (fvco <= FVCO_MAX)
+			break;
+	}
+
+	/* DIVM */
+	for (m = 0; m < ARRAY_SIZE(m_table); m++) {
+		nf = fvco * m_table[m];
+		n = nf / F_27M;
+		if ((n * F_27M) == nf)
+			break;
+	}
+	if (m >= ARRAY_SIZE(m_table)) {
+		ret = -EINVAL;
+		goto err_not_found;
+	}
+
+	/* save parameters */
+	clk->p[SEL_FRA] = 0;
+	clk->p[DIVR]    = r;
+	clk->p[DIVN]    = n;
+	clk->p[DIVM]    = m_table[m];
+
+	return freq;
+
+err_not_found:
+	pr_err("%s: %s freq:%lu not found a valid setting\n",
+	       __func__, clk_hw_get_name(&clk->hw), freq);
+
+	return ret;
+}
+
+/* parameters for PLLTV fractional divider */
+static const u32 pt[][5] = {
+	/* conventional fractional */
+	{
+		1,			/* factor */
+		5,			/* 5 * p0 (nint) */
+		1,			/* 1 * p0 */
+		F_27M,			/* F_27M / p0 */
+		1,			/* p0 / p2 */
+	},
+	/* phase rotation */
+	{
+		10,			/* factor */
+		54,			/* 5.4 * p0 (nint) */
+		2,			/* 0.2 * p0 */
+		F_27M / 10,		/* F_27M / p0 */
+		5,			/* p0 / p2 */
+	},
+};
+
+static const u32 sdm_mod_vals[] = { 91, 55 };
+
+static long plltv_fractional_div(struct sp_pll *clk, unsigned long freq)
+{
+	u32 m, r;
+	u32 nint, nfra;
+	u32 df_quotient_min = 210000000;
+	u32 df_remainder_min = 0;
+	unsigned long fvco, nf, f, fout = 0;
+	int sdm, ph;
+
+	freq = clamp(freq, F_MIN, F_MAX);
+
+	/* DIVR 0~3 */
+	for (r = 0; r <= 3; r++) {
+		fvco = freq << r;
+		if (fvco <= FVCO_MAX)
+			break;
+	}
+	f = F_27M >> r;
+
+	/* PH_SEL */
+	for (ph = ARRAY_SIZE(pt) - 1; ph >= 0; ph--) {
+		const u32 *pp = pt[ph];
+
+		/* SDM_MOD */
+		for (sdm = 0; sdm < ARRAY_SIZE(sdm_mod_vals); sdm++) {
+			u32 mod = sdm_mod_vals[sdm];
+
+			/* DIVM 1~32 */
+			for (m = 1; m <= 32; m++) {
+				u32 df; /* diff freq */
+				u32 df_quotient, df_remainder;
+
+				nf = fvco * m;
+				nint = nf / pp[3];
+
+				if (nint < pp[1])
+					continue;
+				if (nint > pp[1])
+					break;
+
+				nfra = (((nf % pp[3]) * mod * pp[4]) + (F_27M / 2)) / F_27M;
+				if (nfra) {
+					u32 df0 = f * (nint + pp[2]) / pp[0];
+					u32 df1 = f * (mod - nfra) / mod / pp[4];
+
+					df = df0 - df1;
+				} else {
+					df = f * (nint) / pp[0];
+				}
+
+				df_quotient  = df / m;
+				df_remainder = ((df % m) * 1000) / m;
+
+				if (freq > df_quotient) {
+					df_quotient  = freq - df_quotient - 1;
+					df_remainder = 1000 - df_remainder;
+				} else {
+					df_quotient = df_quotient - freq;
+				}
+
+				if (df_quotient_min > df_quotient ||
+				    (df_quotient_min == df_quotient &&
+				    df_remainder_min > df_remainder)) {
+					/* found a closer freq, save parameters */
+					clk->p[SEL_FRA] = 1;
+					clk->p[SDM_MOD] = sdm;
+					clk->p[PH_SEL]  = ph;
+					clk->p[NFRA]    = nfra;
+					clk->p[DIVR]    = r;
+					clk->p[DIVM]    = m;
+
+					fout = df / m;
+					df_quotient_min = df_quotient;
+					df_remainder_min = df_remainder;
+				}
+			}
+		}
+	}
+
+	if (!fout) {
+		pr_err("%s: %s freq:%lu not found a valid setting\n",
+		       __func__, clk_hw_get_name(&clk->hw), freq);
+		return -EINVAL;
+	}
+
+	return fout;
+}
+
+static long plltv_div(struct sp_pll *clk, unsigned long freq)
+{
+	if (freq % 100)
+		return plltv_fractional_div(clk, freq);
+
+	return plltv_integer_div(clk, freq);
+}
+
+static int plltv_set_rate(struct sp_pll *clk)
+{
+	unsigned long flags;
+	u32 r0, r1, r2;
+
+	r0  = BIT(clk->bp_bit + 16);
+	r0 |= HWM_FIELD_PREP(MASK_SEL_FRA, clk->p[SEL_FRA]);
+	r0 |= HWM_FIELD_PREP(MASK_SDM_MOD, clk->p[SDM_MOD]);
+	r0 |= HWM_FIELD_PREP(MASK_PH_SEL, clk->p[PH_SEL]);
+	r0 |= HWM_FIELD_PREP(MASK_NFRA, clk->p[NFRA]);
+
+	r1  = HWM_FIELD_PREP(MASK_DIVR, clk->p[DIVR]);
+
+	r2  = HWM_FIELD_PREP(MASK_DIVN, clk->p[DIVN] - 1);
+	r2 |= HWM_FIELD_PREP(MASK_DIVM, clk->p[DIVM] - 1);
+
+	spin_lock_irqsave(&clk->lock, flags);
+	writel(r0, clk->reg);
+	writel(r1, clk->reg + 4);
+	writel(r2, clk->reg + 8);
+	spin_unlock_irqrestore(&clk->lock, flags);
+
+	return 0;
+}
+
+/*********************************** PLL_A ***********************************/
+
+/* from Q628_PLLs_REG_setting.xlsx */
+static const struct {
+	u32 rate;
+	u32 regs[5];
+} pa[] = {
+	{
+		.rate = 135475200,
+		.regs = {
+			0x4801,
+			0x02df,
+			0x248f,
+			0x0211,
+			0x33e9
+		}
+	},
+	{
+		.rate = 147456000,
+		.regs = {
+			0x4801,
+			0x1adf,
+			0x2490,
+			0x0349,
+			0x33e9
+		}
+	},
+	{
+		.rate = 196608000,
+		.regs = {
+			0x4801,
+			0x42ef,
+			0x2495,
+			0x01c6,
+			0x33e9
+		}
+	},
+};
+
+static int plla_set_rate(struct sp_pll *clk)
+{
+	const u32 *pp = pa[clk->p[0]].regs;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&clk->lock, flags);
+	for (i = 0; i < ARRAY_SIZE(pa->regs); i++)
+		writel(0xffff0000 | pp[i], clk->reg + (i * 4));
+	spin_unlock_irqrestore(&clk->lock, flags);
+
+	return 0;
+}
+
+static long plla_round_rate(struct sp_pll *clk, unsigned long rate)
+{
+	int i = ARRAY_SIZE(pa);
+
+	while (--i) {
+		if (rate >= pa[i].rate)
+			break;
+	}
+	clk->p[0] = i;
+
+	return pa[i].rate;
+}
+
+/********************************** SP_PLL ***********************************/
+
+static long sp_pll_calc_div(struct sp_pll *clk, unsigned long rate)
+{
+	u32 fbdiv;
+	u32 max = 1 << clk->div_width;
+
+	fbdiv = DIV_ROUND_CLOSEST(rate, clk->brate);
+	if (fbdiv > max)
+		fbdiv = max;
+
+	return fbdiv;
+}
+
+static long sp_pll_round_rate(struct clk_hw *hw, unsigned long rate,
+			      unsigned long *prate)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+	long ret;
+
+	if (rate == *prate) {
+		ret = *prate; /* bypass */
+	} else if (clk->div_width == DIV_A) {
+		ret = plla_round_rate(clk, rate);
+	} else if (clk->div_width == DIV_TV) {
+		ret = plltv_div(clk, rate);
+		if (ret < 0)
+			ret = *prate;
+	} else {
+		ret = sp_pll_calc_div(clk, rate) * clk->brate;
+	}
+
+	return ret;
+}
+
+static unsigned long sp_pll_recalc_rate(struct clk_hw *hw,
+					unsigned long prate)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+	u32 reg = readl(clk->reg);
+	unsigned long ret;
+
+	if (reg & BIT(clk->bp_bit)) {
+		ret = prate; /* bypass */
+	} else if (clk->div_width == DIV_A) {
+		ret = pa[clk->p[0]].rate;
+	} else if (clk->div_width == DIV_TV) {
+		u32 m, r, reg2;
+
+		r = FIELD_GET(MASK_DIVR, readl(clk->reg + 4));
+		reg2 = readl(clk->reg + 8);
+		m = FIELD_GET(MASK_DIVM, reg2) + 1;
+
+		if (reg & MASK_SEL_FRA) {
+			/* fractional divider */
+			u32 sdm  = FIELD_GET(MASK_SDM_MOD, reg);
+			u32 ph   = FIELD_GET(MASK_PH_SEL, reg);
+			u32 nfra = FIELD_GET(MASK_NFRA, reg);
+			const u32 *pp = pt[ph];
+			unsigned long r0, r1;
+
+			ret = prate >> r;
+			r0  = ret * (pp[1] + pp[2]) / pp[0];
+			r1  = ret * (sdm_mod_vals[sdm] - nfra) / sdm_mod_vals[sdm] / pp[4];
+			ret = (r0 - r1) / m;
+		} else {
+			/* integer divider */
+			u32 n = FIELD_GET(MASK_DIVN, reg2) + 1;
+
+			ret = (prate / m * n) >> r;
+		}
+	} else {
+		u32 fbdiv = ((reg >> clk->div_shift) & ((1 << clk->div_width) - 1)) + 1;
+
+		ret = clk->brate * fbdiv;
+	}
+
+	return ret;
+}
+
+static int sp_pll_set_rate(struct clk_hw *hw, unsigned long rate,
+			   unsigned long prate)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+	unsigned long flags;
+	u32 reg;
+
+	reg = BIT(clk->bp_bit + 16); /* HIWORD_MASK */
+
+	if (rate == prate) {
+		reg |= BIT(clk->bp_bit); /* bypass */
+	} else if (clk->div_width == DIV_A) {
+		return plla_set_rate(clk);
+	} else if (clk->div_width == DIV_TV) {
+		return plltv_set_rate(clk);
+	} else if (clk->div_width) {
+		u32 fbdiv = sp_pll_calc_div(clk, rate);
+		u32 mask = GENMASK(clk->div_shift + clk->div_width - 1, clk->div_shift);
+
+		reg |= mask << 16;
+		reg |= ((fbdiv - 1) << clk->div_shift) & mask;
+	}
+
+	spin_lock_irqsave(&clk->lock, flags);
+	writel(reg, clk->reg);
+	spin_unlock_irqrestore(&clk->lock, flags);
+
+	return 0;
+}
+
+static int sp_pll_enable(struct clk_hw *hw)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+
+	writel(BIT(clk->pd_bit + 16) | BIT(clk->pd_bit), clk->reg);
+
+	return 0;
+}
+
+static void sp_pll_disable(struct clk_hw *hw)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+
+	writel(BIT(clk->pd_bit + 16), clk->reg);
+}
+
+static int sp_pll_is_enabled(struct clk_hw *hw)
+{
+	struct sp_pll *clk = to_sp_pll(hw);
+
+	return readl(clk->reg) & BIT(clk->pd_bit);
+}
+
+static const struct clk_ops sp_pll_ops = {
+	.enable = sp_pll_enable,
+	.disable = sp_pll_disable,
+	.is_enabled = sp_pll_is_enabled,
+	.round_rate = sp_pll_round_rate,
+	.recalc_rate = sp_pll_recalc_rate,
+	.set_rate = sp_pll_set_rate
+};
+
+static const struct clk_ops sp_pll_sub_ops = {
+	.enable = sp_pll_enable,
+	.disable = sp_pll_disable,
+	.is_enabled = sp_pll_is_enabled,
+	.recalc_rate = sp_pll_recalc_rate,
+};
+
+static struct clk_hw *sp_pll_register(struct device *dev, const char *name,
+				      const struct clk_parent_data *parent_data,
+				      void __iomem *reg, int pd_bit, int bp_bit,
+				      unsigned long brate, int shift, int width,
+				      unsigned long flags)
+{
+	struct sp_pll *pll;
+	struct clk_hw *hw;
+	struct clk_init_data initd = {
+		.name = name,
+		.parent_data = parent_data,
+		.ops = (bp_bit >= 0) ? &sp_pll_ops : &sp_pll_sub_ops,
+		.num_parents = 1,
+		.flags = flags,
+	};
+	int ret;
+
+	pll = devm_kzalloc(dev, sizeof(*pll), GFP_KERNEL);
+	if (!pll)
+		return ERR_PTR(-ENOMEM);
+
+	pll->hw.init = &initd;
+	pll->reg = reg;
+	pll->pd_bit = pd_bit;
+	pll->bp_bit = bp_bit;
+	pll->brate = brate;
+	pll->div_shift = shift;
+	pll->div_width = width;
+	spin_lock_init(&pll->lock);
+
+	hw = &pll->hw;
+	ret = devm_clk_hw_register(dev, hw);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return hw;
+}
+
+#define PLLA_CTL	(pll_base + 0x1c)
+#define PLLE_CTL	(pll_base + 0x30)
+#define PLLF_CTL	(pll_base + 0x34)
+#define PLLTV_CTL	(pll_base + 0x38)
+
+static int sp7021_clk_probe(struct platform_device *pdev)
+{
+	static const u32 sp_clken[] = {
+		0x67ef, 0x03ff, 0xff03, 0xfff0, 0x0004, /* G0.1~5  */
+		0x0000, 0x8000, 0xffff, 0x0040, 0x0000, /* G0.6~10 */
+	};
+	static struct clk_parent_data pd_ext, pd_sys, pd_e;
+	struct device *dev = &pdev->dev;
+	void __iomem *clk_base, *pll_base, *sys_base;
+	struct clk_hw_onecell_data *clk_data;
+	struct clk_hw **hws;
+	int i;
+
+	clk_base = devm_platform_ioremap_resource(pdev, 0);
+	if (!clk_base)
+		return -ENXIO;
+	pll_base = devm_platform_ioremap_resource(pdev, 1);
+	if (!pll_base)
+		return -ENXIO;
+	sys_base = devm_platform_ioremap_resource(pdev, 2);
+	if (!sys_base)
+		return -ENXIO;
+
+	/* enable default clks */
+	for (i = 0; i < ARRAY_SIZE(sp_clken); i++)
+		writel((sp_clken[i] << 16) | sp_clken[i], clk_base + i * 4);
+
+	clk_data = devm_kzalloc(dev, struct_size(clk_data, hws, CLK_MAX),
+				GFP_KERNEL);
+	if (!clk_data)
+		return -ENOMEM;
+
+	hws = clk_data->hws;
+	pd_ext.index = 0;
+
+	/* PLLs */
+	hws[PLL_A] = sp_pll_register(dev, "plla", &pd_ext, PLLA_CTL,
+				     11, 12, 27000000, 0, DIV_A, 0);
+	if (IS_ERR(hws[PLL_A]))
+		return PTR_ERR(hws[PLL_A]);
+
+	hws[PLL_E] = sp_pll_register(dev, "plle", &pd_ext, PLLE_CTL,
+				     6, 2, 50000000, 0, 0, 0);
+	if (IS_ERR(hws[PLL_E]))
+		return PTR_ERR(hws[PLL_E]);
+	pd_e.hw = hws[PLL_E];
+	hws[PLL_E_2P5] = sp_pll_register(dev, "plle_2p5", &pd_e, PLLE_CTL,
+					 13, -1, 2500000, 0, 0, 0);
+	if (IS_ERR(hws[PLL_E_2P5]))
+		return PTR_ERR(hws[PLL_E_2P5]);
+	hws[PLL_E_25] = sp_pll_register(dev, "plle_25", &pd_e, PLLE_CTL,
+					12, -1, 25000000, 0, 0, 0);
+	if (IS_ERR(hws[PLL_E_25]))
+		return PTR_ERR(hws[PLL_E_25]);
+	hws[PLL_E_112P5] = sp_pll_register(dev, "plle_112p5", &pd_e, PLLE_CTL,
+					   11, -1, 112500000, 0, 0, 0);
+	if (IS_ERR(hws[PLL_E_112P5]))
+		return PTR_ERR(hws[PLL_E_112P5]);
+
+	hws[PLL_F] = sp_pll_register(dev, "pllf", &pd_ext, PLLF_CTL,
+				     0, 10, 13500000, 1, 4, 0);
+	if (IS_ERR(hws[PLL_F]))
+		return PTR_ERR(hws[PLL_F]);
+
+	hws[PLL_TV] = sp_pll_register(dev, "plltv", &pd_ext, PLLTV_CTL,
+				      0, 15, 27000000, 0, DIV_TV, 0);
+	if (IS_ERR(hws[PLL_TV]))
+		return PTR_ERR(hws[PLL_TV]);
+	hws[PLL_TV_A] = devm_clk_hw_register_divider(dev, "plltv_a", "plltv", 0,
+						     PLLTV_CTL + 4, 5, 1,
+						     CLK_DIVIDER_POWER_OF_TWO,
+						     &to_sp_pll(hws[PLL_TV])->lock);
+	if (IS_ERR(hws[PLL_TV_A]))
+		return PTR_ERR(hws[PLL_TV_A]);
+
+	/* system clock, should not be disabled */
+	hws[PLL_SYS] = sp_pll_register(dev, "pllsys", &pd_ext, sys_base,
+				       10, 9, 13500000, 0, 4, CLK_IS_CRITICAL);
+	if (IS_ERR(hws[PLL_SYS]))
+		return PTR_ERR(hws[PLL_SYS]);
+	pd_sys.hw = hws[PLL_SYS];
+
+	/* gates */
+	for (i = 0; i < ARRAY_SIZE(sp_clk_gates); i++) {
+		char name[10];
+		u32 j = sp_clk_gates[i].reg;
+		struct clk_parent_data *pd = sp_clk_gates[i].ext_parent ? &pd_ext : &pd_sys;
+
+		sprintf(name, "%02d_0x%02x", i, j);
+		hws[i] = devm_clk_hw_register_gate_parent_data(dev, name, pd, 0,
+							       clk_base + (j >> 4) * 4,
+							       j & 0x0f,
+							       CLK_GATE_HIWORD_MASK,
+							       NULL);
+		if (IS_ERR(hws[i]))
+			return PTR_ERR(hws[i]);
+	}
+
+	clk_data->num = CLK_MAX;
+
+	return devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, clk_data);
+}
+
+static const struct of_device_id sp7021_clk_dt_ids[] = {
+	{ .compatible = "sunplus,sp7021-clkc" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, sp7021_clk_dt_ids);
+
+static struct platform_driver sp7021_clk_driver = {
+	.probe  = sp7021_clk_probe,
+	.driver = {
+		.name = "sp7021-clk",
+		.of_match_table = sp7021_clk_dt_ids,
+	},
+};
+module_platform_driver(sp7021_clk_driver);
+
+MODULE_AUTHOR("Sunplus Technology");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clock driver for Sunplus SP7021 SoC");
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 842e72a5348f..0e3bc3eb9911 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -608,6 +608,25 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	__devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
 			       NULL, (flags), (reg), (bit_idx),		      \
 			       (clk_gate_flags), (lock))
+/**
+ * devm_clk_hw_register_gate_parent_data - register a gate clock with the
+ * clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_data: parent clk data
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate_parent_data(dev, name, parent_data, flags,  \
+					      reg, bit_idx, clk_gate_flags,   \
+					      lock)                           \
+	__devm_clk_hw_register_gate((dev), NULL, (name), NULL, NULL,	      \
+				    (parent_data), (flags), (reg), (bit_idx), \
+				    (clk_gate_flags), (lock))
+
 void clk_unregister_gate(struct clk *clk);
 void clk_hw_unregister_gate(struct clk_hw *hw);
 int clk_gate_is_enabled(struct clk_hw *hw);
-- 
cgit v1.2.3


From 4765a9722e09765866e131ec31f7b9cf4c1f4854 Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Sun, 19 Mar 2023 12:57:50 +0000
Subject: net: pcs: add driver for MediaTek SGMII PCS

The SGMII core found in several MediaTek SoCs is identical to what can
also be found in MediaTek's MT7531 Ethernet switch IC.
As this has not always been clear, both drivers developed different
implementations to deal with the PCS.
Recently Alexander Couzens pointed out this fact which lead to the
development of this shared driver.

Add a dedicated driver, mostly by copying the code now found in the
Ethernet driver. The now redundant code will be removed by a follow-up
commit.

Suggested-by: Alexander Couzens <lynxis@fe80.eu>
Suggested-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Frank Wunderlich <frank-w@public-files.de>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                       |   8 +
 drivers/net/pcs/Kconfig           |   7 +
 drivers/net/pcs/Makefile          |   1 +
 drivers/net/pcs/pcs-mtk-lynxi.c   | 305 ++++++++++++++++++++++++++++++++++++++
 include/linux/pcs/pcs-mtk-lynxi.h |  13 ++
 5 files changed, 334 insertions(+)
 create mode 100644 drivers/net/pcs/pcs-mtk-lynxi.c
 create mode 100644 include/linux/pcs/pcs-mtk-lynxi.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 30ca644d704f..54a2a8122a97 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13042,6 +13042,14 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/mediatek/
 
+MEDIATEK ETHERNET PCS DRIVER
+M:	Alexander Couzens <lynxis@fe80.eu>
+M:	Daniel Golle <daniel@makrotopia.org>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/pcs/pcs-mtk-lynxi.c
+F:	include/linux/pcs/pcs-mtk-lynxi.h
+
 MEDIATEK I2C CONTROLLER DRIVER
 M:	Qii Wang <qii.wang@mediatek.com>
 L:	linux-i2c@vger.kernel.org
diff --git a/drivers/net/pcs/Kconfig b/drivers/net/pcs/Kconfig
index 6e7e6c346a3e..7c34fb7cbf7b 100644
--- a/drivers/net/pcs/Kconfig
+++ b/drivers/net/pcs/Kconfig
@@ -18,6 +18,13 @@ config PCS_LYNX
 	  This module provides helpers to phylink for managing the Lynx PCS
 	  which is part of the Layerscape and QorIQ Ethernet SERDES.
 
+config PCS_MTK_LYNXI
+	tristate
+	select REGMAP
+	help
+	  This module provides helpers to phylink for managing the LynxI PCS
+	  which is part of MediaTek's SoC and Ethernet switch ICs.
+
 config PCS_RZN1_MIIC
 	tristate "Renesas RZ/N1 MII converter"
 	depends on OF && (ARCH_RZN1 || COMPILE_TEST)
diff --git a/drivers/net/pcs/Makefile b/drivers/net/pcs/Makefile
index 4c780d8f2e98..9b9afd6b1c22 100644
--- a/drivers/net/pcs/Makefile
+++ b/drivers/net/pcs/Makefile
@@ -5,5 +5,6 @@ pcs_xpcs-$(CONFIG_PCS_XPCS)	:= pcs-xpcs.o pcs-xpcs-nxp.o
 
 obj-$(CONFIG_PCS_XPCS)		+= pcs_xpcs.o
 obj-$(CONFIG_PCS_LYNX)		+= pcs-lynx.o
+obj-$(CONFIG_PCS_MTK_LYNXI)	+= pcs-mtk-lynxi.o
 obj-$(CONFIG_PCS_RZN1_MIIC)	+= pcs-rzn1-miic.o
 obj-$(CONFIG_PCS_ALTERA_TSE)	+= pcs-altera-tse.o
diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c
new file mode 100644
index 000000000000..888452325edc
--- /dev/null
+++ b/drivers/net/pcs/pcs-mtk-lynxi.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018-2019 MediaTek Inc.
+/* A library for MediaTek SGMII circuit
+ *
+ * Author: Sean Wang <sean.wang@mediatek.com>
+ * Author: Alexander Couzens <lynxis@fe80.eu>
+ * Author: Daniel Golle <daniel@makrotopia.org>
+ *
+ */
+
+#include <linux/mdio.h>
+#include <linux/of.h>
+#include <linux/pcs/pcs-mtk-lynxi.h>
+#include <linux/phylink.h>
+#include <linux/regmap.h>
+
+/* SGMII subsystem config registers */
+/* BMCR (low 16) BMSR (high 16) */
+#define SGMSYS_PCS_CONTROL_1		0x0
+#define SGMII_BMCR			GENMASK(15, 0)
+#define SGMII_BMSR			GENMASK(31, 16)
+
+#define SGMSYS_PCS_DEVICE_ID		0x4
+#define SGMII_LYNXI_DEV_ID		0x4d544950
+
+#define SGMSYS_PCS_ADVERTISE		0x8
+#define SGMII_ADVERTISE			GENMASK(15, 0)
+#define SGMII_LPA			GENMASK(31, 16)
+
+#define SGMSYS_PCS_SCRATCH		0x14
+#define SGMII_DEV_VERSION		GENMASK(31, 16)
+
+/* Register to programmable link timer, the unit in 2 * 8ns */
+#define SGMSYS_PCS_LINK_TIMER		0x18
+#define SGMII_LINK_TIMER_MASK		GENMASK(19, 0)
+#define SGMII_LINK_TIMER_VAL(ns)	FIELD_PREP(SGMII_LINK_TIMER_MASK, \
+						   ((ns) / 2 / 8))
+
+/* Register to control remote fault */
+#define SGMSYS_SGMII_MODE		0x20
+#define SGMII_IF_MODE_SGMII		BIT(0)
+#define SGMII_SPEED_DUPLEX_AN		BIT(1)
+#define SGMII_SPEED_MASK		GENMASK(3, 2)
+#define SGMII_SPEED_10			FIELD_PREP(SGMII_SPEED_MASK, 0)
+#define SGMII_SPEED_100			FIELD_PREP(SGMII_SPEED_MASK, 1)
+#define SGMII_SPEED_1000		FIELD_PREP(SGMII_SPEED_MASK, 2)
+#define SGMII_DUPLEX_HALF		BIT(4)
+#define SGMII_REMOTE_FAULT_DIS		BIT(8)
+
+/* Register to reset SGMII design */
+#define SGMSYS_RESERVED_0		0x34
+#define SGMII_SW_RESET			BIT(0)
+
+/* Register to set SGMII speed, ANA RG_ Control Signals III */
+#define SGMII_PHY_SPEED_MASK		GENMASK(3, 2)
+#define SGMII_PHY_SPEED_1_25G		FIELD_PREP(SGMII_PHY_SPEED_MASK, 0)
+#define SGMII_PHY_SPEED_3_125G		FIELD_PREP(SGMII_PHY_SPEED_MASK, 1)
+
+/* Register to power up QPHY */
+#define SGMSYS_QPHY_PWR_STATE_CTRL	0xe8
+#define	SGMII_PHYA_PWD			BIT(4)
+
+/* Register to QPHY wrapper control */
+#define SGMSYS_QPHY_WRAP_CTRL		0xec
+#define SGMII_PN_SWAP_MASK		GENMASK(1, 0)
+#define SGMII_PN_SWAP_TX_RX		(BIT(0) | BIT(1))
+
+/* struct mtk_pcs_lynxi -  This structure holds each sgmii regmap andassociated
+ *                         data
+ * @regmap:                The register map pointing at the range used to setup
+ *                         SGMII modes
+ * @dev:                   Pointer to device owning the PCS
+ * @ana_rgc3:              The offset of register ANA_RGC3 relative to regmap
+ * @interface:             Currently configured interface mode
+ * @pcs:                   Phylink PCS structure
+ * @flags:                 Flags indicating hardware properties
+ */
+struct mtk_pcs_lynxi {
+	struct regmap		*regmap;
+	u32			ana_rgc3;
+	phy_interface_t		interface;
+	struct			phylink_pcs pcs;
+	u32			flags;
+};
+
+static struct mtk_pcs_lynxi *pcs_to_mtk_pcs_lynxi(struct phylink_pcs *pcs)
+{
+	return container_of(pcs, struct mtk_pcs_lynxi, pcs);
+}
+
+static void mtk_pcs_lynxi_get_state(struct phylink_pcs *pcs,
+				    struct phylink_link_state *state)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	unsigned int bm, adv;
+
+	/* Read the BMSR and LPA */
+	regmap_read(mpcs->regmap, SGMSYS_PCS_CONTROL_1, &bm);
+	regmap_read(mpcs->regmap, SGMSYS_PCS_ADVERTISE, &adv);
+
+	phylink_mii_c22_pcs_decode_state(state, FIELD_GET(SGMII_BMSR, bm),
+					 FIELD_GET(SGMII_LPA, adv));
+}
+
+static int mtk_pcs_lynxi_config(struct phylink_pcs *pcs, unsigned int mode,
+				phy_interface_t interface,
+				const unsigned long *advertising,
+				bool permit_pause_to_mac)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	bool mode_changed = false, changed, use_an;
+	unsigned int rgc3, sgm_mode, bmcr;
+	int advertise, link_timer;
+
+	advertise = phylink_mii_c22_pcs_encode_advertisement(interface,
+							     advertising);
+	if (advertise < 0)
+		return advertise;
+
+	/* Clearing IF_MODE_BIT0 switches the PCS to BASE-X mode, and
+	 * we assume that fixes it's speed at bitrate = line rate (in
+	 * other words, 1000Mbps or 2500Mbps).
+	 */
+	if (interface == PHY_INTERFACE_MODE_SGMII) {
+		sgm_mode = SGMII_IF_MODE_SGMII;
+		if (phylink_autoneg_inband(mode)) {
+			sgm_mode |= SGMII_REMOTE_FAULT_DIS |
+				    SGMII_SPEED_DUPLEX_AN;
+			use_an = true;
+		} else {
+			use_an = false;
+		}
+	} else if (phylink_autoneg_inband(mode)) {
+		/* 1000base-X or 2500base-X autoneg */
+		sgm_mode = SGMII_REMOTE_FAULT_DIS;
+		use_an = linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					   advertising);
+	} else {
+		/* 1000base-X or 2500base-X without autoneg */
+		sgm_mode = 0;
+		use_an = false;
+	}
+
+	if (use_an)
+		bmcr = BMCR_ANENABLE;
+	else
+		bmcr = 0;
+
+	if (mpcs->interface != interface) {
+		link_timer = phylink_get_link_timer_ns(interface);
+		if (link_timer < 0)
+			return link_timer;
+
+		/* PHYA power down */
+		regmap_set_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL,
+				SGMII_PHYA_PWD);
+
+		/* Reset SGMII PCS state */
+		regmap_set_bits(mpcs->regmap, SGMSYS_RESERVED_0,
+				SGMII_SW_RESET);
+
+		if (mpcs->flags & MTK_SGMII_FLAG_PN_SWAP)
+			regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_WRAP_CTRL,
+					   SGMII_PN_SWAP_MASK,
+					   SGMII_PN_SWAP_TX_RX);
+
+		if (interface == PHY_INTERFACE_MODE_2500BASEX)
+			rgc3 = SGMII_PHY_SPEED_3_125G;
+		else
+			rgc3 = SGMII_PHY_SPEED_1_25G;
+
+		/* Configure the underlying interface speed */
+		regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3,
+				   SGMII_PHY_SPEED_MASK, rgc3);
+
+		/* Setup the link timer */
+		regmap_write(mpcs->regmap, SGMSYS_PCS_LINK_TIMER,
+			     SGMII_LINK_TIMER_VAL(link_timer));
+
+		mpcs->interface = interface;
+		mode_changed = true;
+	}
+
+	/* Update the advertisement, noting whether it has changed */
+	regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE,
+				 SGMII_ADVERTISE, advertise, &changed);
+
+	/* Update the sgmsys mode register */
+	regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE,
+			   SGMII_REMOTE_FAULT_DIS | SGMII_SPEED_DUPLEX_AN |
+			   SGMII_IF_MODE_SGMII, sgm_mode);
+
+	/* Update the BMCR */
+	regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1,
+			   BMCR_ANENABLE, bmcr);
+
+	/* Release PHYA power down state
+	 * Only removing bit SGMII_PHYA_PWD isn't enough.
+	 * There are cases when the SGMII_PHYA_PWD register contains 0x9 which
+	 * prevents SGMII from working. The SGMII still shows link but no traffic
+	 * can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was
+	 * taken from a good working state of the SGMII interface.
+	 * Unknown how much the QPHY needs but it is racy without a sleep.
+	 * Tested on mt7622 & mt7986.
+	 */
+	usleep_range(50, 100);
+	regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0);
+
+	return changed || mode_changed;
+}
+
+static void mtk_pcs_lynxi_restart_an(struct phylink_pcs *pcs)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+
+	regmap_set_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, BMCR_ANRESTART);
+}
+
+static void mtk_pcs_lynxi_link_up(struct phylink_pcs *pcs, unsigned int mode,
+				  phy_interface_t interface, int speed,
+				  int duplex)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	unsigned int sgm_mode;
+
+	if (!phylink_autoneg_inband(mode)) {
+		/* Force the speed and duplex setting */
+		if (speed == SPEED_10)
+			sgm_mode = SGMII_SPEED_10;
+		else if (speed == SPEED_100)
+			sgm_mode = SGMII_SPEED_100;
+		else
+			sgm_mode = SGMII_SPEED_1000;
+
+		if (duplex != DUPLEX_FULL)
+			sgm_mode |= SGMII_DUPLEX_HALF;
+
+		regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE,
+				   SGMII_DUPLEX_HALF | SGMII_SPEED_MASK,
+				   sgm_mode);
+	}
+}
+
+static const struct phylink_pcs_ops mtk_pcs_lynxi_ops = {
+	.pcs_get_state = mtk_pcs_lynxi_get_state,
+	.pcs_config = mtk_pcs_lynxi_config,
+	.pcs_an_restart = mtk_pcs_lynxi_restart_an,
+	.pcs_link_up = mtk_pcs_lynxi_link_up,
+};
+
+struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev,
+					 struct regmap *regmap, u32 ana_rgc3,
+					 u32 flags)
+{
+	struct mtk_pcs_lynxi *mpcs;
+	u32 id, ver;
+	int ret;
+
+	ret = regmap_read(regmap, SGMSYS_PCS_DEVICE_ID, &id);
+	if (ret < 0)
+		return NULL;
+
+	if (id != SGMII_LYNXI_DEV_ID) {
+		dev_err(dev, "unknown PCS device id %08x\n", id);
+		return NULL;
+	}
+
+	ret = regmap_read(regmap, SGMSYS_PCS_SCRATCH, &ver);
+	if (ret < 0)
+		return NULL;
+
+	ver = FIELD_GET(SGMII_DEV_VERSION, ver);
+	if (ver != 0x1) {
+		dev_err(dev, "unknown PCS device version %04x\n", ver);
+		return NULL;
+	}
+
+	dev_dbg(dev, "MediaTek LynxI SGMII PCS (id 0x%08x, ver 0x%04x)\n", id,
+		ver);
+
+	mpcs = kzalloc(sizeof(*mpcs), GFP_KERNEL);
+	if (!mpcs)
+		return NULL;
+
+	mpcs->ana_rgc3 = ana_rgc3;
+	mpcs->regmap = regmap;
+	mpcs->flags = flags;
+	mpcs->pcs.ops = &mtk_pcs_lynxi_ops;
+	mpcs->pcs.poll = true;
+	mpcs->interface = PHY_INTERFACE_MODE_NA;
+
+	return &mpcs->pcs;
+}
+EXPORT_SYMBOL(mtk_pcs_lynxi_create);
+
+void mtk_pcs_lynxi_destroy(struct phylink_pcs *pcs)
+{
+	if (!pcs)
+		return;
+
+	kfree(pcs_to_mtk_pcs_lynxi(pcs));
+}
+EXPORT_SYMBOL(mtk_pcs_lynxi_destroy);
+
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pcs/pcs-mtk-lynxi.h b/include/linux/pcs/pcs-mtk-lynxi.h
new file mode 100644
index 000000000000..be3b4ab32f4a
--- /dev/null
+++ b/include/linux/pcs/pcs-mtk-lynxi.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_PCS_MTK_LYNXI_H
+#define __LINUX_PCS_MTK_LYNXI_H
+
+#include <linux/phylink.h>
+#include <linux/regmap.h>
+
+#define MTK_SGMII_FLAG_PN_SWAP BIT(0)
+struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev,
+					 struct regmap *regmap,
+					 u32 ana_rgc3, u32 flags);
+void mtk_pcs_lynxi_destroy(struct phylink_pcs *pcs);
+#endif
-- 
cgit v1.2.3


From 04aae213e719ec2bb310158c4025316ace50589b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:13 -0700
Subject: net: skbuff: rename __pkt_vlan_present_offset to __mono_tc_offset

vlan_present is gone since
commit 354259fa73e2 ("net: remove skb->vlan_present")
rename the offset field to what BPF is currently looking
for in this byte - mono_delivery_time and tc_at_ingress.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-2-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h                               | 4 ++--
 net/core/filter.c                                    | 8 ++++----
 tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3f3a2a82a86b..5a63878a4550 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -955,7 +955,7 @@ struct sk_buff {
 	__u8			csum_valid:1;
 
 	/* private: */
-	__u8			__pkt_vlan_present_offset[0];
+	__u8			__mono_tc_offset[0];
 	/* public: */
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
@@ -1078,7 +1078,7 @@ struct sk_buff {
 #define TC_AT_INGRESS_MASK		(1 << 7)
 #define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
 #endif
-#define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
 
 #ifdef __KERNEL__
 /*
diff --git a/net/core/filter.c b/net/core/filter.c
index 50f649f1b4a9..3370efad1dda 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9185,7 +9185,7 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
 	__u8 tmp_reg = BPF_REG_AX;
 
 	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
-			      PKT_VLAN_PRESENT_OFFSET);
+			      SKB_BF_MONO_TC_OFFSET);
 	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
 				SKB_MONO_DELIVERY_TIME_MASK, 2);
 	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
@@ -9232,7 +9232,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
 		/* AX is needed because src_reg and dst_reg could be the same */
 		__u8 tmp_reg = BPF_REG_AX;
 
-		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
 		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
 					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
 		*insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
@@ -9267,14 +9267,14 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
 	if (!prog->tstamp_type_access) {
 		__u8 tmp_reg = BPF_REG_AX;
 
-		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
 		/* Writing __sk_buff->tstamp as ingress, goto <clear> */
 		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
 		/* goto <store> */
 		*insn++ = BPF_JMP_A(2);
 		/* <clear>: mono_delivery_time */
 		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
-		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
 	}
 #endif
 
diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
index d5fe3d4b936c..ae7b6e50e405 100644
--- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
@@ -68,17 +68,17 @@ static struct test_case test_cases[] = {
 #if defined(__x86_64__) || defined(__aarch64__)
 	{
 		N(SCHED_CLS, struct __sk_buff, tstamp),
-		.read  = "r11 = *(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset);"
+		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
 			 "w11 &= 160;"
 			 "if w11 != 0xa0 goto pc+2;"
 			 "$dst = 0;"
 			 "goto pc+1;"
 			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
-		.write = "r11 = *(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset);"
+		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
 			 "if w11 & 0x80 goto pc+1;"
 			 "goto pc+2;"
 			 "w11 &= -33;"
-			 "*(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset) = r11;"
+			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
 			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",
 	},
 #endif
-- 
cgit v1.2.3


From b94e032b7ad6318b36615f0e0cc3b0d61a5531e8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:14 -0700
Subject: net: skbuff: reorder bytes 2 and 3 of the bitfield

BPF needs to know the offsets of fields it tries to access.
Zero-length fields are added to make offsetof() work.
This unfortunately partitions the bitfield (fields across
the zero-length members can't be coalesced).

Reorder bytes 2 and 3, BPF needs to know the offset of fields
previously in byte 3 and some fields in byte 2 should really
be optional.

The two bytes are always in the same cacheline so it should
not matter.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-3-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5a63878a4550..36d31e74db37 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -944,16 +944,6 @@ struct sk_buff {
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;
 
-	__u8			l4_hash:1;
-	__u8			sw_hash:1;
-	__u8			wifi_acked_valid:1;
-	__u8			wifi_acked:1;
-	__u8			no_fcs:1;
-	/* Indicates the inner headers are valid in the skbuff. */
-	__u8			encapsulation:1;
-	__u8			encap_hdr_csum:1;
-	__u8			csum_valid:1;
-
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
@@ -966,6 +956,16 @@ struct sk_buff {
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
 #endif
+
+	__u8			l4_hash:1;
+	__u8			sw_hash:1;
+	__u8			wifi_acked_valid:1;
+	__u8			wifi_acked:1;
+	__u8			no_fcs:1;
+	/* Indicates the inner headers are valid in the skbuff. */
+	__u8			encapsulation:1;
+	__u8			encap_hdr_csum:1;
+	__u8			csum_valid:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
-- 
cgit v1.2.3


From c0ba861117c3e8deb03855d7dc5a7717958bbb18 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:15 -0700
Subject: net: skbuff: move the fields BPF cares about directly next to the
 offset marker

To avoid more possible BPF dependencies with moving bitfields
around keep the fields BPF cares about right next to the offset
marker.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-4-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h                               | 18 +++++++++---------
 tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 36d31e74db37..6aeb0e7b9511 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -947,15 +947,15 @@ struct sk_buff {
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
-	__u8			remcsum_offload:1;
-	__u8			csum_complete_sw:1;
-	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
 	__u8			mono_delivery_time:1;	/* See SKB_MONO_DELIVERY_TIME_MASK */
 #ifdef CONFIG_NET_CLS_ACT
-	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
+	__u8			tc_skip_classify:1;
 #endif
+	__u8			remcsum_offload:1;
+	__u8			csum_complete_sw:1;
+	__u8			csum_level:2;
+	__u8			dst_pending_confirm:1;
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -1072,11 +1072,11 @@ struct sk_buff {
  * around, you also must adapt these constants.
  */
 #ifdef __BIG_ENDIAN_BITFIELD
-#define TC_AT_INGRESS_MASK		(1 << 0)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 2)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
+#define TC_AT_INGRESS_MASK		(1 << 6)
 #else
-#define TC_AT_INGRESS_MASK		(1 << 7)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
+#define TC_AT_INGRESS_MASK		(1 << 1)
 #endif
 #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
 
diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
index ae7b6e50e405..4951aa978f33 100644
--- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
@@ -69,15 +69,15 @@ static struct test_case test_cases[] = {
 	{
 		N(SCHED_CLS, struct __sk_buff, tstamp),
 		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "w11 &= 160;"
-			 "if w11 != 0xa0 goto pc+2;"
+			 "w11 &= 3;"
+			 "if w11 != 0x3 goto pc+2;"
 			 "$dst = 0;"
 			 "goto pc+1;"
 			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
 		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "if w11 & 0x80 goto pc+1;"
+			 "if w11 & 0x2 goto pc+1;"
 			 "goto pc+2;"
-			 "w11 &= -33;"
+			 "w11 &= -2;"
 			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
 			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",
 	},
-- 
cgit v1.2.3


From 2d337b7158f8c38dacf8394028ab87b8a25ed707 Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Wed, 1 Mar 2023 10:06:27 +0000
Subject: userfaultfd: move unprivileged_userfaultfd sysctl to its own file

The sysctl_unprivileged_userfaultfd is part of userfaultfd, move it to
its own file.

Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/userfaultfd.c              | 20 +++++++++++++++++++-
 include/linux/userfaultfd_k.h |  2 --
 kernel/sysctl.c               | 11 -----------
 3 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 44d1ee429eb0..d01f803a6b11 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -32,7 +32,22 @@
 #include <linux/swapops.h>
 #include <linux/miscdevice.h>
 
-int sysctl_unprivileged_userfaultfd __read_mostly;
+static int sysctl_unprivileged_userfaultfd __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_userfaultfd_table[] = {
+	{
+		.procname	= "unprivileged_userfaultfd",
+		.data		= &sysctl_unprivileged_userfaultfd,
+		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+#endif
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -2178,6 +2193,9 @@ static int __init userfaultfd_init(void)
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						init_once_userfaultfd_ctx);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("vm", vm_userfaultfd_table);
+#endif
 	return 0;
 }
 __initcall(userfaultfd_init);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3767f18114ef..fff49fec0258 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,8 +36,6 @@
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
-extern int sysctl_unprivileged_userfaultfd;
-
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c240d2c99bc..c14552a662ae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2438,17 +2438,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= (void *)&mmap_rnd_compat_bits_min,
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
-#endif
-#ifdef CONFIG_USERFAULTFD
-	{
-		.procname	= "unprivileged_userfaultfd",
-		.data		= &sysctl_unprivileged_userfaultfd,
-		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
 #endif
 	{ }
 };
-- 
cgit v1.2.3


From 962de54828c5bb100eb8de881b79ed9c8b7468c0 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 9 Mar 2023 20:20:11 +0800
Subject: mm: hugetlb: move hugeltb sysctls to its own file

This moves all hugetlb sysctls to its own file, also kill an
useless hugetlb_treat_movable_handler() defination.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/hugetlb.h |  8 --------
 kernel/sysctl.c         | 32 -------------------------------
 mm/hugetlb.c            | 51 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 48 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7c977d234aba..4056b05d81ed 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -124,14 +124,6 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void hugetlb_dup_vma_private(struct vm_area_struct *vma);
 void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
-int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
-int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
-		loff_t *);
-int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
-		loff_t *);
-int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
-		loff_t *);
-
 int move_hugetlb_page_tables(struct vm_area_struct *vma,
 			     struct vm_area_struct *new_vma,
 			     unsigned long old_addr, unsigned long new_addr,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c14552a662ae..ce0297acf97c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,38 +2140,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
-#endif
-#ifdef CONFIG_HUGETLB_PAGE
-	{
-		.procname	= "nr_hugepages",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= hugetlb_sysctl_handler,
-	},
-#ifdef CONFIG_NUMA
-	{
-		.procname       = "nr_hugepages_mempolicy",
-		.data           = NULL,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
-	},
-#endif
-	 {
-		.procname	= "hugetlb_shm_group",
-		.data		= &sysctl_hugetlb_shm_group,
-		.maxlen		= sizeof(gid_t),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	 },
-	{
-		.procname	= "nr_overcommit_hugepages",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= hugetlb_overcommit_handler,
-	},
 #endif
 	{
 		.procname	= "lowmem_reserve_ratio",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..0ee77c55e44e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4202,6 +4202,12 @@ static void __init hugetlb_sysfs_init(void)
 	hugetlb_register_all_nodes();
 }
 
+#ifdef CONFIG_SYSCTL
+static void hugetlb_sysctl_init(void);
+#else
+static inline void hugetlb_sysctl_init(void) { }
+#endif
+
 static int __init hugetlb_init(void)
 {
 	int i;
@@ -4257,6 +4263,7 @@ static int __init hugetlb_init(void)
 
 	hugetlb_sysfs_init();
 	hugetlb_cgroup_file_init();
+	hugetlb_sysctl_init();
 
 #ifdef CONFIG_SMP
 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
@@ -4588,7 +4595,7 @@ out:
 	return ret;
 }
 
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			  void *buffer, size_t *length, loff_t *ppos)
 {
 
@@ -4597,7 +4604,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_NUMA
-int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
 			  void *buffer, size_t *length, loff_t *ppos)
 {
 	return hugetlb_sysctl_handler_common(true, table, write,
@@ -4605,7 +4612,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_NUMA */
 
-int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+static int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
@@ -4634,6 +4641,44 @@ out:
 	return ret;
 }
 
+static struct ctl_table hugetlb_table[] = {
+	{
+		.procname	= "nr_hugepages",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= hugetlb_sysctl_handler,
+	},
+#ifdef CONFIG_NUMA
+	{
+		.procname       = "nr_hugepages_mempolicy",
+		.data           = NULL,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
+	},
+#endif
+	{
+		.procname	= "hugetlb_shm_group",
+		.data		= &sysctl_hugetlb_shm_group,
+		.maxlen		= sizeof(gid_t),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "nr_overcommit_hugepages",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= hugetlb_overcommit_handler,
+	},
+	{ }
+};
+
+static void hugetlb_sysctl_init(void)
+{
+	register_sysctl_init("vm", hugetlb_table);
+}
 #endif /* CONFIG_SYSCTL */
 
 void hugetlb_report_meminfo(struct seq_file *m)
-- 
cgit v1.2.3


From efb339a83368ab25de1a18c0fdff85e01c13a1ea Mon Sep 17 00:00:00 2001
From: Peter Gonda <pgonda@google.com>
Date: Tue, 7 Mar 2023 20:24:39 +0100
Subject: crypto: ccp - Name -1 return value as SEV_RET_NO_FW_CALL

The PSP can return a "firmware error" code of -1 in circumstances where
the PSP has not actually been called. To make this protocol unambiguous,
name the value SEV_RET_NO_FW_CALL.

  [ bp: Massage a bit. ]

Signed-off-by: Peter Gonda <pgonda@google.com>
Signed-off-by: Dionna Glaze <dionnaglaze@google.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20221207010210.2563293-2-dionnaglaze@google.com
---
 Documentation/virt/coco/sev-guest.rst | 4 ++--
 drivers/crypto/ccp/sev-dev.c          | 8 +++++---
 include/uapi/linux/psp-sev.h          | 7 +++++++
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
index bf593e88cfd9..aa3e4c6a1f90 100644
--- a/Documentation/virt/coco/sev-guest.rst
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -40,8 +40,8 @@ along with a description:
 The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device.
 The ioctl accepts struct snp_user_guest_request. The input and output structure is
 specified through the req_data and resp_data field respectively. If the ioctl fails
-to execute due to a firmware error, then fw_err code will be set otherwise the
-fw_err will be set to 0x00000000000000ff.
+to execute due to a firmware error, then fw_err code will be set. Otherwise, fw_err
+will be set to 0x00000000ffffffff, i.e., the lower 32-bits are -1.
 
 The firmware checks that the message sequence counter is one greater than
 the guests message sequence counter. If guest driver fails to increment message
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e2f25926eb51..823c67a43c38 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -444,10 +444,10 @@ static int __sev_init_ex_locked(int *error)
 
 static int __sev_platform_init_locked(int *error)
 {
+	int rc = 0, psp_ret = SEV_RET_NO_FW_CALL;
 	struct psp_device *psp = psp_master;
-	struct sev_device *sev;
-	int rc = 0, psp_ret = -1;
 	int (*init_function)(int *error);
+	struct sev_device *sev;
 
 	if (!psp || !psp->sev_data)
 		return -ENODEV;
@@ -475,9 +475,11 @@ static int __sev_platform_init_locked(int *error)
 		 * initialization function should succeed by replacing the state
 		 * with a reset state.
 		 */
-		dev_err(sev->dev, "SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state.");
+		dev_err(sev->dev,
+"SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state.");
 		rc = init_function(&psp_ret);
 	}
+
 	if (error)
 		*error = psp_ret;
 
diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index 91b4c63d5cbf..1c9da485318f 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -36,6 +36,13 @@ enum {
  * SEV Firmware status code
  */
 typedef enum {
+	/*
+	 * This error code is not in the SEV spec. Its purpose is to convey that
+	 * there was an error that prevented the SEV firmware from being called.
+	 * The SEV API error codes are 16 bits, so the -1 value will not overlap
+	 * with possible values from the specification.
+	 */
+	SEV_RET_NO_FW_CALL = -1,
 	SEV_RET_SUCCESS = 0,
 	SEV_RET_INVALID_PLATFORM_STATE,
 	SEV_RET_INVALID_GUEST_STATE,
-- 
cgit v1.2.3


From 7062e1c727ec99a9c5b40586964304d60a43f240 Mon Sep 17 00:00:00 2001
From: Simon Trimmer <simont@opensource.cirrus.com>
Date: Mon, 20 Mar 2023 11:22:38 +0000
Subject: firmware: cs_dsp: Introduce no_core_startstop for self-booting DSPs

There are devices containing Halo Core DSPs that self-boot, cs_dsp is
used to manage the running firmware but the host does not have direct
control over starting and stopping the DSP and so cs_dsp should consider
the DSP to be always running.

Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230320112245.115720-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 14 +++++++++++++-
 include/linux/firmware/cirrus/cs_dsp.h |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index f558b390fbfe..97b73a254380 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -301,6 +301,7 @@ struct cs_dsp_ops {
 static const struct cs_dsp_ops cs_dsp_adsp1_ops;
 static const struct cs_dsp_ops cs_dsp_adsp2_ops[];
 static const struct cs_dsp_ops cs_dsp_halo_ops;
+static const struct cs_dsp_ops cs_dsp_halo_ao_ops;
 
 struct cs_dsp_buf {
 	struct list_head list;
@@ -2821,7 +2822,10 @@ EXPORT_SYMBOL_NS_GPL(cs_dsp_adsp2_init, FW_CS_DSP);
  */
 int cs_dsp_halo_init(struct cs_dsp *dsp)
 {
-	dsp->ops = &cs_dsp_halo_ops;
+	if (dsp->no_core_startstop)
+		dsp->ops = &cs_dsp_halo_ao_ops;
+	else
+		dsp->ops = &cs_dsp_halo_ops;
 
 	return cs_dsp_common_init(dsp);
 }
@@ -3187,6 +3191,14 @@ static const struct cs_dsp_ops cs_dsp_halo_ops = {
 	.stop_core = cs_dsp_halo_stop_core,
 };
 
+static const struct cs_dsp_ops cs_dsp_halo_ao_ops = {
+	.parse_sizes = cs_dsp_adsp2_parse_sizes,
+	.validate_version = cs_dsp_halo_validate_version,
+	.setup_algs = cs_dsp_halo_setup_algs,
+	.region_to_reg = cs_dsp_halo_region_to_reg,
+	.show_fw_status = cs_dsp_halo_show_fw_status,
+};
+
 /**
  * cs_dsp_chunk_write() - Format data to a DSP memory chunk
  * @ch: Pointer to the chunk structure
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index cad828e21c72..29cd11d5a3cf 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -156,6 +156,7 @@ struct cs_dsp {
 	unsigned int sysclk_reg;
 	unsigned int sysclk_mask;
 	unsigned int sysclk_shift;
+	bool no_core_startstop;
 
 	struct list_head alg_regions;
 
-- 
cgit v1.2.3


From e496112529006ce0c2cfe67d1136186e2786d2e8 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 20 Mar 2023 11:22:45 +0000
Subject: ASoC: cs35l56: Add driver for Cirrus Logic CS35L56

The CS35L56 combines a high-performance mono audio amplifier, Class-H
tracking inductive boost converter, Halo Core(TM) DSP and a DC-DC boost
converter supporting Class-H tracking.

Supported control interfaces are I2C, SPI or SoundWire.
Supported audio interfaces are I2S/TDM or SoundWire.

Most chip functionality is controlled by on-board ROM firmware that is
always running. The driver must apply patch/tune to the firmware
before using the CS35L56.

Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230320112245.115720-9-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 MAINTAINERS                       |    1 +
 include/sound/cs35l56.h           |  266 +++++++
 sound/soc/codecs/Kconfig          |   40 +
 sound/soc/codecs/Makefile         |   10 +
 sound/soc/codecs/cs35l56-i2c.c    |   83 +++
 sound/soc/codecs/cs35l56-sdw.c    |  528 ++++++++++++++
 sound/soc/codecs/cs35l56-shared.c |  390 ++++++++++
 sound/soc/codecs/cs35l56-spi.c    |   81 ++
 sound/soc/codecs/cs35l56.c        | 1461 +++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/cs35l56.h        |   77 ++
 10 files changed, 2937 insertions(+)
 create mode 100644 include/sound/cs35l56.h
 create mode 100644 sound/soc/codecs/cs35l56-i2c.c
 create mode 100644 sound/soc/codecs/cs35l56-sdw.c
 create mode 100644 sound/soc/codecs/cs35l56-shared.c
 create mode 100644 sound/soc/codecs/cs35l56-spi.c
 create mode 100644 sound/soc/codecs/cs35l56.c
 create mode 100644 sound/soc/codecs/cs35l56.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 02fe90adf0af..e52938f962b5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4905,6 +4905,7 @@ L:	patches@opensource.cirrus.com
 S:	Maintained
 F:	Documentation/devicetree/bindings/sound/cirrus,cs*
 F:	include/dt-bindings/sound/cs*
+F:	include/sound/cs*
 F:	sound/pci/hda/cs*
 F:	sound/pci/hda/hda_cs_dsp_ctl.*
 F:	sound/soc/codecs/cs*
diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
new file mode 100644
index 000000000000..5f8ea2dfaa21
--- /dev/null
+++ b/include/sound/cs35l56.h
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common definitions for Cirrus Logic CS35L56 smart amp
+ *
+ * Copyright (C) 2023 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef __CS35L56_H
+#define __CS35L56_H
+
+#include <linux/firmware/cirrus/cs_dsp.h>
+#include <linux/regulator/consumer.h>
+#include <linux/regmap.h>
+
+#define CS35L56_DEVID					0x0000000
+#define CS35L56_REVID					0x0000004
+#define CS35L56_RELID					0x000000C
+#define CS35L56_OTPID					0x0000010
+#define CS35L56_SFT_RESET				0x0000020
+#define CS35L56_GLOBAL_ENABLES				0x0002014
+#define CS35L56_BLOCK_ENABLES				0x0002018
+#define CS35L56_BLOCK_ENABLES2				0x000201C
+#define CS35L56_REFCLK_INPUT				0x0002C04
+#define CS35L56_GLOBAL_SAMPLE_RATE			0x0002C0C
+#define CS35L56_ASP1_ENABLES1				0x0004800
+#define CS35L56_ASP1_CONTROL1				0x0004804
+#define CS35L56_ASP1_CONTROL2				0x0004808
+#define CS35L56_ASP1_CONTROL3				0x000480C
+#define CS35L56_ASP1_FRAME_CONTROL1			0x0004810
+#define CS35L56_ASP1_FRAME_CONTROL5			0x0004820
+#define CS35L56_ASP1_DATA_CONTROL1			0x0004830
+#define CS35L56_ASP1_DATA_CONTROL5			0x0004840
+#define CS35L56_DACPCM1_INPUT				0x0004C00
+#define CS35L56_DACPCM2_INPUT				0x0004C08
+#define CS35L56_ASP1TX1_INPUT				0x0004C20
+#define CS35L56_ASP1TX2_INPUT				0x0004C24
+#define CS35L56_ASP1TX3_INPUT				0x0004C28
+#define CS35L56_ASP1TX4_INPUT				0x0004C2C
+#define CS35L56_DSP1RX1_INPUT				0x0004C40
+#define CS35L56_DSP1RX2_INPUT				0x0004C44
+#define CS35L56_SWIRE_DP3_CH1_INPUT			0x0004C70
+#define CS35L56_SWIRE_DP3_CH2_INPUT			0x0004C74
+#define CS35L56_SWIRE_DP3_CH3_INPUT			0x0004C78
+#define CS35L56_SWIRE_DP3_CH4_INPUT			0x0004C7C
+#define CS35L56_SWIRE_DP3_CH5_INPUT			0x0004C80
+#define CS35L56_SWIRE_DP3_CH6_INPUT			0x0004C84
+#define CS35L56_IRQ1_CFG				0x000E000
+#define CS35L56_IRQ1_STATUS				0x000E004
+#define CS35L56_IRQ1_EINT_1				0x000E010
+#define CS35L56_IRQ1_EINT_2				0x000E014
+#define CS35L56_IRQ1_EINT_4				0x000E01C
+#define CS35L56_IRQ1_EINT_8				0x000E02C
+#define CS35L56_IRQ1_EINT_18				0x000E054
+#define CS35L56_IRQ1_EINT_20				0x000E05C
+#define CS35L56_IRQ1_MASK_1				0x000E090
+#define CS35L56_IRQ1_MASK_2				0x000E094
+#define CS35L56_IRQ1_MASK_4				0x000E09C
+#define CS35L56_IRQ1_MASK_8				0x000E0AC
+#define CS35L56_IRQ1_MASK_18				0x000E0D4
+#define CS35L56_IRQ1_MASK_20				0x000E0DC
+#define CS35L56_DSP_VIRTUAL1_MBOX_1			0x0011020
+#define CS35L56_DSP_VIRTUAL1_MBOX_2			0x0011024
+#define CS35L56_DSP_VIRTUAL1_MBOX_3			0x0011028
+#define CS35L56_DSP_VIRTUAL1_MBOX_4			0x001102C
+#define CS35L56_DSP_VIRTUAL1_MBOX_5			0x0011030
+#define CS35L56_DSP_VIRTUAL1_MBOX_6			0x0011034
+#define CS35L56_DSP_VIRTUAL1_MBOX_7			0x0011038
+#define CS35L56_DSP_VIRTUAL1_MBOX_8			0x001103C
+#define CS35L56_DSP_RESTRICT_STS1			0x00190F0
+#define CS35L56_DSP1_XMEM_PACKED_0			0x2000000
+#define CS35L56_DSP1_XMEM_PACKED_6143			0x2005FFC
+#define CS35L56_DSP1_XMEM_UNPACKED32_0			0x2400000
+#define CS35L56_DSP1_XMEM_UNPACKED32_4095		0x2403FFC
+#define CS35L56_DSP1_SYS_INFO_ID			0x25E0000
+#define CS35L56_DSP1_SYS_INFO_END			0x25E004C
+#define CS35L56_DSP1_AHBM_WINDOW_DEBUG_0		0x25E2040
+#define CS35L56_DSP1_AHBM_WINDOW_DEBUG_1		0x25E2044
+#define CS35L56_DSP1_XMEM_UNPACKED24_0			0x2800000
+#define CS35L56_DSP1_HALO_STATE_A1			0x2801E58
+#define CS35L56_DSP1_HALO_STATE				0x28021E0
+#define CS35L56_DSP1_PM_CUR_STATE_A1			0x2804000
+#define CS35L56_DSP1_PM_CUR_STATE			0x2804308
+#define CS35L56_DSP1_XMEM_UNPACKED24_8191		0x2807FFC
+#define CS35L56_DSP1_CORE_BASE				0x2B80000
+#define CS35L56_DSP1_SCRATCH1				0x2B805C0
+#define CS35L56_DSP1_SCRATCH2				0x2B805C8
+#define CS35L56_DSP1_SCRATCH3				0x2B805D0
+#define CS35L56_DSP1_SCRATCH4				0x2B805D8
+#define CS35L56_DSP1_YMEM_PACKED_0			0x2C00000
+#define CS35L56_DSP1_YMEM_PACKED_4604			0x2C047F0
+#define CS35L56_DSP1_YMEM_UNPACKED32_0			0x3000000
+#define CS35L56_DSP1_YMEM_UNPACKED32_3070		0x3002FF8
+#define CS35L56_DSP1_YMEM_UNPACKED24_0			0x3400000
+#define CS35L56_MAIN_RENDER_USER_MUTE			0x3400024
+#define CS35L56_MAIN_RENDER_USER_VOLUME			0x340002C
+#define CS35L56_MAIN_POSTURE_NUMBER			0x3400094
+#define CS35L56_TRANSDUCER_ACTUAL_PS			0x3400150
+#define CS35L56_DSP1_YMEM_UNPACKED24_6141		0x3405FF4
+#define CS35L56_DSP1_PMEM_0				0x3800000
+#define CS35L56_DSP1_PMEM_5114				0x3804FE8
+
+/* DEVID */
+#define CS35L56_DEVID_MASK				0x00FFFFFF
+
+/* REVID */
+#define CS35L56_AREVID_MASK				0x000000F0
+#define CS35L56_MTLREVID_MASK				0x0000000F
+#define CS35L56_REVID_B0				0x000000B0
+
+/* ASP_ENABLES1 */
+#define CS35L56_ASP_RX2_EN_SHIFT			17
+#define CS35L56_ASP_RX1_EN_SHIFT			16
+#define CS35L56_ASP_TX4_EN_SHIFT			3
+#define CS35L56_ASP_TX3_EN_SHIFT			2
+#define CS35L56_ASP_TX2_EN_SHIFT			1
+#define CS35L56_ASP_TX1_EN_SHIFT			0
+
+/* ASP_CONTROL1 */
+#define CS35L56_ASP_BCLK_FREQ_MASK			0x0000003F
+#define CS35L56_ASP_BCLK_FREQ_SHIFT			0
+
+/* ASP_CONTROL2 */
+#define CS35L56_ASP_RX_WIDTH_MASK			0xFF000000
+#define CS35L56_ASP_RX_WIDTH_SHIFT			24
+#define CS35L56_ASP_TX_WIDTH_MASK			0x00FF0000
+#define CS35L56_ASP_TX_WIDTH_SHIFT			16
+#define CS35L56_ASP_FMT_MASK				0x00000700
+#define CS35L56_ASP_FMT_SHIFT				8
+#define CS35L56_ASP_BCLK_INV_MASK			0x00000040
+#define CS35L56_ASP_FSYNC_INV_MASK			0x00000004
+
+/* ASP_CONTROL3 */
+#define CS35L56_ASP1_DOUT_HIZ_CTRL_MASK			0x00000003
+
+/* ASP_DATA_CONTROL1 */
+#define CS35L56_ASP_TX_WL_MASK				0x0000003F
+
+/* ASP_DATA_CONTROL5 */
+#define CS35L56_ASP_RX_WL_MASK				0x0000003F
+
+/* ASPTXn_INPUT */
+#define CS35L56_ASP_TXn_SRC_MASK			0x0000007F
+
+/* SWIRETX[1..7]_SRC SDWTXn INPUT */
+#define CS35L56_SWIRETXn_SRC_MASK			0x0000007F
+
+/* IRQ1_STATUS */
+#define CS35L56_IRQ1_STS_MASK				0x00000001
+
+/* IRQ1_EINT_1 */
+#define CS35L56_AMP_SHORT_ERR_EINT1_MASK		0x80000000
+
+/* IRQ1_EINT_2 */
+#define CS35L56_DSP_VIRTUAL2_MBOX_WR_EINT1_MASK		0x00200000
+
+/* IRQ1_EINT_4 */
+#define CS35L56_OTP_BOOT_DONE_MASK			0x00000002
+
+/* IRQ1_EINT_8 */
+#define CS35L56_TEMP_ERR_EINT1_MASK			0x80000000
+
+/* Mixer input sources */
+#define CS35L56_INPUT_SRC_NONE				0x00
+#define CS35L56_INPUT_SRC_ASP1RX1			0x08
+#define CS35L56_INPUT_SRC_ASP1RX2			0x09
+#define CS35L56_INPUT_SRC_VMON				0x18
+#define CS35L56_INPUT_SRC_IMON				0x19
+#define CS35L56_INPUT_SRC_ERR_VOL			0x20
+#define CS35L56_INPUT_SRC_CLASSH			0x21
+#define CS35L56_INPUT_SRC_VDDBMON			0x28
+#define CS35L56_INPUT_SRC_VBSTMON			0x29
+#define CS35L56_INPUT_SRC_DSP1TX1			0x32
+#define CS35L56_INPUT_SRC_DSP1TX2			0x33
+#define CS35L56_INPUT_SRC_DSP1TX3			0x34
+#define CS35L56_INPUT_SRC_DSP1TX4			0x35
+#define CS35L56_INPUT_SRC_DSP1TX5			0x36
+#define CS35L56_INPUT_SRC_DSP1TX6			0x37
+#define CS35L56_INPUT_SRC_DSP1TX7			0x38
+#define CS35L56_INPUT_SRC_DSP1TX8			0x39
+#define CS35L56_INPUT_SRC_TEMPMON			0x3A
+#define CS35L56_INPUT_SRC_INTERPOLATOR			0x40
+#define CS35L56_INPUT_SRC_SWIRE_RX1			0x44
+#define CS35L56_INPUT_SRC_SWIRE_RX2			0x45
+#define CS35L56_INPUT_SRC_SWIRE_RX3			0x46
+#define CS35L56_INPUT_MASK				0x7F
+
+#define CS35L56_NUM_INPUT_SRC				22
+
+/* ASP formats */
+#define CS35L56_ASP_FMT_DSP_A				0
+#define CS35L56_ASP_FMT_I2S				2
+
+/* ASP HiZ modes */
+#define CS35L56_ASP_UNUSED_HIZ_OFF_HIZ			3
+
+/* MAIN_RENDER_ACTUAL_PS */
+#define CS35L56_PS0					0
+#define CS35L56_PS3					3
+
+/* CS35L56_DSP_RESTRICT_STS1 */
+#define CS35L56_RESTRICTED_MASK				0x7
+
+/* CS35L56_MAIN_RENDER_USER_MUTE */
+#define CS35L56_MAIN_RENDER_USER_MUTE_MASK		1
+
+/* CS35L56_MAIN_RENDER_USER_VOLUME */
+#define CS35L56_MAIN_RENDER_USER_VOLUME_MIN		-400
+#define CS35L56_MAIN_RENDER_USER_VOLUME_MAX		400
+#define CS35L56_MAIN_RENDER_USER_VOLUME_MASK		0x0000FFC0
+#define CS35L56_MAIN_RENDER_USER_VOLUME_SHIFT		6
+#define CS35L56_MAIN_RENDER_USER_VOLUME_SIGNBIT		9
+
+/* CS35L56_MAIN_POSTURE_NUMBER */
+#define CS35L56_MAIN_POSTURE_MIN			0
+#define CS35L56_MAIN_POSTURE_MAX			255
+#define CS35L56_MAIN_POSTURE_MASK			CS35L56_MAIN_POSTURE_MAX
+
+/* Software Values */
+#define CS35L56_HALO_STATE_SHUTDOWN			1
+#define CS35L56_HALO_STATE_BOOT_DONE			2
+
+#define CS35L56_MBOX_CMD_AUDIO_PLAY			0x0B000001
+#define CS35L56_MBOX_CMD_AUDIO_PAUSE			0x0B000002
+#define CS35L56_MBOX_CMD_HIBERNATE_NOW			0x02000001
+#define CS35L56_MBOX_CMD_WAKEUP				0x02000002
+#define CS35L56_MBOX_CMD_PREVENT_AUTO_HIBERNATE		0x02000003
+#define CS35L56_MBOX_CMD_ALLOW_AUTO_HIBERNATE		0x02000004
+#define CS35L56_MBOX_CMD_SHUTDOWN			0x02000005
+#define CS35L56_MBOX_CMD_SYSTEM_RESET			0x02000007
+
+#define CS35L56_MBOX_TIMEOUT_US				5000
+#define CS35L56_MBOX_POLL_US				250
+
+#define CS35L56_PS0_POLL_US				500
+#define CS35L56_PS0_TIMEOUT_US				50000
+#define CS35L56_PS3_POLL_US				500
+#define CS35L56_PS3_TIMEOUT_US				300000
+
+#define CS35L56_CONTROL_PORT_READY_US			2200
+#define CS35L56_HALO_STATE_POLL_US			1000
+#define CS35L56_HALO_STATE_TIMEOUT_US			50000
+#define CS35L56_HIBERNATE_WAKE_POLL_US			500
+#define CS35L56_HIBERNATE_WAKE_TIMEOUT_US		5000
+#define CS35L56_RESET_PULSE_MIN_US			1100
+
+#define CS35L56_SDW1_PLAYBACK_PORT			1
+#define CS35L56_SDW1_CAPTURE_PORT			3
+
+#define CS35L56_NUM_BULK_SUPPLIES			3
+#define CS35L56_NUM_DSP_REGIONS				5
+
+extern struct regmap_config cs35l56_regmap_i2c;
+extern struct regmap_config cs35l56_regmap_spi;
+extern struct regmap_config cs35l56_regmap_sdw;
+
+extern const struct cs_dsp_region cs35l56_dsp1_regions[CS35L56_NUM_DSP_REGIONS];
+extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC];
+extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC];
+
+void cs35l56_patch(struct device *dev, struct regmap *regmap, u8 revid);
+void cs35l56_reread_firmware_registers(struct device *dev, struct regmap *regmap);
+int cs35l56_get_bclk_freq_id(unsigned int freq);
+void cs35l56_fill_supply_names(struct regulator_bulk_data *data);
+
+#endif /* ifndef __CS35L56_H */
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 07747565c3b5..c9fb09cf8e6b 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -68,6 +68,9 @@ config SND_SOC_ALL_CODECS
 	imply SND_SOC_CS35L41_I2C
 	imply SND_SOC_CS35L45_I2C
 	imply SND_SOC_CS35L45_SPI
+	imply SND_SOC_CS35L56_I2C
+	imply SND_SOC_CS35L56_SPI
+	imply SND_SOC_CS35L56_SDW
 	imply SND_SOC_CS42L42
 	imply SND_SOC_CS42L42_SDW
 	imply SND_SOC_CS42L51_I2C
@@ -364,6 +367,7 @@ config SND_SOC_WM_ADSP
 	default y if SND_SOC_WM2200=y
 	default y if SND_SOC_CS35L41_SPI=y
 	default y if SND_SOC_CS35L41_I2C=y
+	default y if SND_SOC_CS35L56=y
 	default m if SND_SOC_MADERA=m
 	default m if SND_SOC_CS47L24=m
 	default m if SND_SOC_WM5102=m
@@ -371,6 +375,7 @@ config SND_SOC_WM_ADSP
 	default m if SND_SOC_WM2200=m
 	default m if SND_SOC_CS35L41_SPI=m
 	default m if SND_SOC_CS35L41_I2C=m
+	default m if SND_SOC_CS35L56=m
 
 config SND_SOC_AB8500_CODEC
 	tristate
@@ -711,6 +716,41 @@ config SND_SOC_CS35L45_I2C
 	  Enable support for Cirrus Logic CS35L45 smart speaker amplifier
 	  with I2C control.
 
+config SND_SOC_CS35L56
+	tristate
+
+config SND_SOC_CS35L56_SHARED
+	tristate
+
+config SND_SOC_CS35L56_I2C
+	tristate "Cirrus Logic CS35L56 CODEC (I2C)"
+	depends on I2C
+	depends on SOUNDWIRE || !SOUNDWIRE
+	select REGMAP_I2C
+	select SND_SOC_CS35L56
+	select SND_SOC_CS35L56_SHARED
+	help
+	  Enable support for Cirrus Logic CS35L56 boosted amplifier with I2C control
+
+config SND_SOC_CS35L56_SPI
+	tristate "Cirrus Logic CS35L56 CODEC (SPI)"
+	depends on SPI_MASTER
+	depends on SOUNDWIRE || !SOUNDWIRE
+	select REGMAP_SPI
+	select SND_SOC_CS35L56
+	select SND_SOC_CS35L56_SHARED
+	help
+	  Enable support for Cirrus Logic CS35L56 boosted amplifier with SPI control
+
+config SND_SOC_CS35L56_SDW
+	tristate "Cirrus Logic CS35L56 CODEC (SDW)"
+	depends on SOUNDWIRE
+	select REGMAP
+	select SND_SOC_CS35L56
+	select SND_SOC_CS35L56_SHARED
+	help
+	  Enable support for Cirrus Logic CS35L56 boosted amplifier with SoundWire control
+
 config SND_SOC_CS42L42_CORE
 	tristate
 
diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile
index f1ca18f7946c..25ebce58a0ba 100644
--- a/sound/soc/codecs/Makefile
+++ b/sound/soc/codecs/Makefile
@@ -66,6 +66,11 @@ snd-soc-cs35l41-i2c-objs := cs35l41-i2c.o
 snd-soc-cs35l45-objs := cs35l45.o cs35l45-tables.o
 snd-soc-cs35l45-spi-objs := cs35l45-spi.o
 snd-soc-cs35l45-i2c-objs := cs35l45-i2c.o
+snd-soc-cs35l56-objs := cs35l56.o
+snd-soc-cs35l56-shared-objs := cs35l56-shared.o
+snd-soc-cs35l56-i2c-objs := cs35l56-i2c.o
+snd-soc-cs35l56-spi-objs := cs35l56-spi.o
+snd-soc-cs35l56-sdw-objs := cs35l56-sdw.o
 snd-soc-cs42l42-objs := cs42l42.o
 snd-soc-cs42l42-i2c-objs := cs42l42-i2c.o
 snd-soc-cs42l42-sdw-objs := cs42l42-sdw.o
@@ -433,6 +438,11 @@ obj-$(CONFIG_SND_SOC_CS35L41_I2C)	+= snd-soc-cs35l41-i2c.o
 obj-$(CONFIG_SND_SOC_CS35L45)	+= snd-soc-cs35l45.o
 obj-$(CONFIG_SND_SOC_CS35L45_SPI)	+= snd-soc-cs35l45-spi.o
 obj-$(CONFIG_SND_SOC_CS35L45_I2C)	+= snd-soc-cs35l45-i2c.o
+obj-$(CONFIG_SND_SOC_CS35L56)	+= snd-soc-cs35l56.o
+obj-$(CONFIG_SND_SOC_CS35L56_SHARED)	+= snd-soc-cs35l56-shared.o
+obj-$(CONFIG_SND_SOC_CS35L56_I2C)	+= snd-soc-cs35l56-i2c.o
+obj-$(CONFIG_SND_SOC_CS35L56_SPI)	+= snd-soc-cs35l56-spi.o
+obj-$(CONFIG_SND_SOC_CS35L56_SDW)	+= snd-soc-cs35l56-sdw.o
 obj-$(CONFIG_SND_SOC_CS42L42_CORE)	+= snd-soc-cs42l42.o
 obj-$(CONFIG_SND_SOC_CS42L42)	+= snd-soc-cs42l42-i2c.o
 obj-$(CONFIG_SND_SOC_CS42L42_SDW)	+= snd-soc-cs42l42-sdw.o
diff --git a/sound/soc/codecs/cs35l56-i2c.c b/sound/soc/codecs/cs35l56-i2c.c
new file mode 100644
index 000000000000..4b7f034a7670
--- /dev/null
+++ b/sound/soc/codecs/cs35l56-i2c.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// CS35L56 ALSA SoC audio driver I2C binding
+//
+// Copyright (C) 2023 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+#include <linux/acpi.h>
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "cs35l56.h"
+
+static int cs35l56_i2c_probe(struct i2c_client *client)
+{
+	struct cs35l56_private *cs35l56;
+	struct device *dev = &client->dev;
+	const struct regmap_config *regmap_config = &cs35l56_regmap_i2c;
+	int ret;
+
+	cs35l56 = devm_kzalloc(dev, sizeof(struct cs35l56_private), GFP_KERNEL);
+	if (!cs35l56)
+		return -ENOMEM;
+
+	cs35l56->dev = dev;
+	cs35l56->irq = client->irq;
+	cs35l56->can_hibernate = true;
+
+	i2c_set_clientdata(client, cs35l56);
+	cs35l56->regmap = devm_regmap_init_i2c(client, regmap_config);
+	if (IS_ERR(cs35l56->regmap)) {
+		ret = PTR_ERR(cs35l56->regmap);
+		return dev_err_probe(cs35l56->dev, ret, "Failed to allocate register map\n");
+	}
+
+	ret = cs35l56_common_probe(cs35l56);
+	if (ret != 0)
+		return ret;
+
+	ret = cs35l56_init(cs35l56);
+	if (ret == 0)
+		ret = cs35l56_irq_request(cs35l56);
+	if (ret < 0)
+		cs35l56_remove(cs35l56);
+
+	return ret;
+}
+
+static void cs35l56_i2c_remove(struct i2c_client *client)
+{
+	struct cs35l56_private *cs35l56 = i2c_get_clientdata(client);
+
+	cs35l56_remove(cs35l56);
+}
+
+static const struct i2c_device_id cs35l56_id_i2c[] = {
+	{ "cs35l56", 0 },
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, cs35l56_id_i2c);
+
+static struct i2c_driver cs35l56_i2c_driver = {
+	.driver = {
+		.name		= "cs35l56",
+		.pm = &cs35l56_pm_ops_i2c_spi,
+	},
+	.id_table	= cs35l56_id_i2c,
+	.probe_new	= cs35l56_i2c_probe,
+	.remove		= cs35l56_i2c_remove,
+};
+
+module_i2c_driver(cs35l56_i2c_driver);
+
+MODULE_DESCRIPTION("ASoC CS35L56 I2C driver");
+MODULE_IMPORT_NS(SND_SOC_CS35L56_CORE);
+MODULE_IMPORT_NS(SND_SOC_CS35L56_SHARED);
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c
new file mode 100644
index 000000000000..448ef3609f4c
--- /dev/null
+++ b/sound/soc/codecs/cs35l56-sdw.c
@@ -0,0 +1,528 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// CS35L56 ALSA SoC audio driver SoundWire binding
+//
+// Copyright (C) 2023 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_registers.h>
+#include <linux/soundwire/sdw_type.h>
+#include <linux/swab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "cs35l56.h"
+
+/* Register addresses are offset when sent over SoundWire */
+#define CS35L56_SDW_ADDR_OFFSET		0x8000
+
+static int cs35l56_sdw_read_one(struct sdw_slave *peripheral, unsigned int reg, void *buf)
+{
+	int ret;
+
+	ret = sdw_nread_no_pm(peripheral, reg, 4, (u8 *)buf);
+	if (ret != 0) {
+		dev_err(&peripheral->dev, "Read failed @%#x:%d\n", reg, ret);
+		return ret;
+	}
+
+	swab32s((u32 *)buf);
+
+	return 0;
+}
+
+static int cs35l56_sdw_read(void *context, const void *reg_buf,
+			    const size_t reg_size, void *val_buf,
+			    size_t val_size)
+{
+	struct sdw_slave *peripheral = context;
+	u8 *buf8 = val_buf;
+	unsigned int reg, bytes;
+	int ret;
+
+	reg = le32_to_cpu(*(const __le32 *)reg_buf);
+	reg += CS35L56_SDW_ADDR_OFFSET;
+
+	if (val_size == 4)
+		return cs35l56_sdw_read_one(peripheral, reg, val_buf);
+
+	while (val_size) {
+		bytes = SDW_REG_NO_PAGE - (reg & SDW_REGADDR); /* to end of page */
+		if (bytes > val_size)
+			bytes = val_size;
+
+		ret = sdw_nread_no_pm(peripheral, reg, bytes, buf8);
+		if (ret != 0) {
+			dev_err(&peripheral->dev, "Read failed @%#x..%#x:%d\n",
+				reg, reg + bytes - 1, ret);
+			return ret;
+		}
+
+		swab32_array((u32 *)buf8, bytes / 4);
+		val_size -= bytes;
+		reg += bytes;
+		buf8 += bytes;
+	}
+
+	return 0;
+}
+
+static inline void cs35l56_swab_copy(void *dest, const void *src, size_t nbytes)
+{
+	u32 *dest32 = dest;
+	const u32 *src32 = src;
+
+	for (; nbytes > 0; nbytes -= 4)
+		*dest32++ = swab32(*src32++);
+}
+
+static int cs35l56_sdw_write_one(struct sdw_slave *peripheral, unsigned int reg, const void *buf)
+{
+	u32 val_le = swab32(*(u32 *)buf);
+	int ret;
+
+	ret = sdw_nwrite_no_pm(peripheral, reg, 4, (u8 *)&val_le);
+	if (ret != 0) {
+		dev_err(&peripheral->dev, "Write failed @%#x:%d\n", reg, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cs35l56_sdw_gather_write(void *context,
+				    const void *reg_buf, size_t reg_size,
+				    const void *val_buf, size_t val_size)
+{
+	struct sdw_slave *peripheral = context;
+	const u8 *src_be = val_buf;
+	u32 val_le_buf[64];	/* Define u32 so it is 32-bit aligned */
+	unsigned int reg, bytes;
+	int ret;
+
+	reg = le32_to_cpu(*(const __le32 *)reg_buf);
+	reg += CS35L56_SDW_ADDR_OFFSET;
+
+	if (val_size == 4)
+		return cs35l56_sdw_write_one(peripheral, reg, src_be);
+
+	while (val_size) {
+		bytes = SDW_REG_NO_PAGE - (reg & SDW_REGADDR); /* to end of page */
+		if (bytes > val_size)
+			bytes = val_size;
+		if (bytes > sizeof(val_le_buf))
+			bytes = sizeof(val_le_buf);
+
+		cs35l56_swab_copy(val_le_buf, src_be, bytes);
+
+		ret = sdw_nwrite_no_pm(peripheral, reg, bytes, (u8 *)val_le_buf);
+		if (ret != 0) {
+			dev_err(&peripheral->dev, "Write failed @%#x..%#x:%d\n",
+				reg, reg + bytes - 1, ret);
+			return ret;
+		}
+
+		val_size -= bytes;
+		reg += bytes;
+		src_be += bytes;
+	}
+
+	return 0;
+}
+
+static int cs35l56_sdw_write(void *context, const void *val_buf, size_t val_size)
+{
+	const u8 *src_buf = val_buf;
+
+	/* First word of val_buf contains the destination address */
+	return cs35l56_sdw_gather_write(context, &src_buf[0], 4, &src_buf[4], val_size - 4);
+}
+
+/*
+ * Registers are big-endian on I2C and SPI but little-endian on SoundWire.
+ * Exported firmware controls are big-endian on I2C/SPI but little-endian on
+ * SoundWire. Firmware files are always big-endian and are opaque blobs.
+ * Present a big-endian regmap and hide the endianness swap, so that the ALSA
+ * byte controls always have the same byte order, and firmware file blobs
+ * can be written verbatim.
+ */
+static const struct regmap_bus cs35l56_regmap_bus_sdw = {
+	.read = cs35l56_sdw_read,
+	.write = cs35l56_sdw_write,
+	.gather_write = cs35l56_sdw_gather_write,
+	.reg_format_endian_default = REGMAP_ENDIAN_LITTLE,
+	.val_format_endian_default = REGMAP_ENDIAN_BIG,
+};
+
+static void cs35l56_sdw_init(struct sdw_slave *peripheral)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+	int ret;
+
+	pm_runtime_get_noresume(cs35l56->dev);
+
+	regcache_cache_only(cs35l56->regmap, false);
+
+	ret = cs35l56_init(cs35l56);
+	if (ret < 0) {
+		regcache_cache_only(cs35l56->regmap, true);
+		goto out;
+	}
+
+	/*
+	 * cs35l56_init can return with !init_done if it triggered
+	 * a soft reset.
+	 */
+	if (cs35l56->init_done) {
+		/* Enable SoundWire interrupts */
+		sdw_write_no_pm(peripheral, CS35L56_SDW_GEN_INT_MASK_1,
+				CS35L56_SDW_INT_MASK_CODEC_IRQ);
+	}
+
+out:
+	pm_runtime_mark_last_busy(cs35l56->dev);
+	pm_runtime_put_autosuspend(cs35l56->dev);
+}
+
+static int cs35l56_sdw_interrupt(struct sdw_slave *peripheral,
+				 struct sdw_slave_intr_status *status)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+
+	/* SoundWire core holds our pm_runtime when calling this function. */
+
+	dev_dbg(cs35l56->dev, "int control_port=%#x\n", status->control_port);
+
+	if ((status->control_port & SDW_SCP_INT1_IMPL_DEF) == 0)
+		return 0;
+
+	/*
+	 * Prevent bus manager suspending and possibly issuing a
+	 * bus-reset before the queued work has run.
+	 */
+	pm_runtime_get_noresume(cs35l56->dev);
+
+	/*
+	 * Mask and clear until it has been handled. The read of GEN_INT_STAT_1
+	 * is required as per the SoundWire spec for interrupt status bits
+	 * to clear. GEN_INT_MASK_1 masks the _inputs_ to GEN_INT_STAT1.
+	 * None of the interrupts are time-critical so use the
+	 * power-efficient queue.
+	 */
+	sdw_write_no_pm(peripheral, CS35L56_SDW_GEN_INT_MASK_1, 0);
+	sdw_read_no_pm(peripheral, CS35L56_SDW_GEN_INT_STAT_1);
+	sdw_write_no_pm(peripheral, CS35L56_SDW_GEN_INT_STAT_1, 0xFF);
+	queue_work(system_power_efficient_wq, &cs35l56->sdw_irq_work);
+
+	return 0;
+}
+
+static void cs35l56_sdw_irq_work(struct work_struct *work)
+{
+	struct cs35l56_private *cs35l56 = container_of(work,
+						       struct cs35l56_private,
+						       sdw_irq_work);
+
+	cs35l56_irq(-1, cs35l56);
+
+	/* unmask interrupts */
+	if (!cs35l56->sdw_irq_no_unmask)
+		sdw_write_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_MASK_1,
+				CS35L56_SDW_INT_MASK_CODEC_IRQ);
+
+	pm_runtime_put_autosuspend(cs35l56->dev);
+}
+
+static int cs35l56_sdw_read_prop(struct sdw_slave *peripheral)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+	struct sdw_slave_prop *prop = &peripheral->prop;
+	struct sdw_dpn_prop *ports;
+
+	ports = devm_kcalloc(cs35l56->dev, 2, sizeof(*ports), GFP_KERNEL);
+	if (!ports)
+		return -ENOMEM;
+
+	prop->source_ports = BIT(CS35L56_SDW1_CAPTURE_PORT);
+	prop->sink_ports = BIT(CS35L56_SDW1_PLAYBACK_PORT);
+	prop->paging_support = true;
+	prop->clk_stop_mode1 = false;
+	prop->quirks = SDW_SLAVE_QUIRKS_INVALID_INITIAL_PARITY;
+	prop->scp_int1_mask = SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY | SDW_SCP_INT1_IMPL_DEF;
+
+	/* DP1 - playback */
+	ports[0].num = CS35L56_SDW1_PLAYBACK_PORT;
+	ports[0].type = SDW_DPN_FULL;
+	ports[0].ch_prep_timeout = 10;
+	prop->sink_dpn_prop = &ports[0];
+
+	/* DP3 - capture */
+	ports[1].num = CS35L56_SDW1_CAPTURE_PORT;
+	ports[1].type = SDW_DPN_FULL;
+	ports[1].ch_prep_timeout = 10;
+	prop->src_dpn_prop = &ports[1];
+
+	return 0;
+}
+
+static int cs35l56_sdw_update_status(struct sdw_slave *peripheral,
+				     enum sdw_slave_status status)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+
+	switch (status) {
+	case SDW_SLAVE_ATTACHED:
+		dev_dbg(cs35l56->dev, "%s: ATTACHED\n", __func__);
+		if (cs35l56->sdw_attached)
+			break;
+
+		if (!cs35l56->init_done || cs35l56->soft_resetting)
+			cs35l56_sdw_init(peripheral);
+
+		cs35l56->sdw_attached = true;
+		break;
+	case SDW_SLAVE_UNATTACHED:
+		dev_dbg(cs35l56->dev, "%s: UNATTACHED\n", __func__);
+		cs35l56->sdw_attached = false;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int cs35l56_a1_kick_divider(struct cs35l56_private *cs35l56,
+				   struct sdw_slave *peripheral)
+{
+	unsigned int curr_scale_reg, next_scale_reg;
+	int curr_scale, next_scale, ret;
+
+	if (!cs35l56->init_done)
+		return 0;
+
+	if (peripheral->bus->params.curr_bank) {
+		curr_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B1;
+		next_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B0;
+	} else {
+		curr_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B0;
+		next_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B1;
+	}
+
+	/*
+	 * Current clock scale value must be different to new value.
+	 * Modify current to guarantee this. If next still has the dummy
+	 * value we wrote when it was current, the core code has not set
+	 * a new scale so restore its original good value
+	 */
+	curr_scale = sdw_read_no_pm(peripheral, curr_scale_reg);
+	if (curr_scale < 0) {
+		dev_err(cs35l56->dev, "Failed to read current clock scale: %d\n", curr_scale);
+		return curr_scale;
+	}
+
+	next_scale = sdw_read_no_pm(peripheral, next_scale_reg);
+	if (next_scale < 0) {
+		dev_err(cs35l56->dev, "Failed to read next clock scale: %d\n", next_scale);
+		return next_scale;
+	}
+
+	if (next_scale == CS35L56_SDW_INVALID_BUS_SCALE) {
+		next_scale = cs35l56->old_sdw_clock_scale;
+		ret = sdw_write_no_pm(peripheral, next_scale_reg, next_scale);
+		if (ret < 0) {
+			dev_err(cs35l56->dev, "Failed to modify current clock scale: %d\n", ret);
+			return ret;
+		}
+	}
+
+	cs35l56->old_sdw_clock_scale = curr_scale;
+	ret = sdw_write_no_pm(peripheral, curr_scale_reg, CS35L56_SDW_INVALID_BUS_SCALE);
+	if (ret < 0) {
+		dev_err(cs35l56->dev, "Failed to modify current clock scale: %d\n", ret);
+		return ret;
+	}
+
+	dev_dbg(cs35l56->dev, "Next bus scale: %#x\n", next_scale);
+
+	return 0;
+}
+
+static int cs35l56_sdw_bus_config(struct sdw_slave *peripheral,
+				  struct sdw_bus_params *params)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+	int sclk;
+
+	sclk = params->curr_dr_freq / 2;
+	dev_dbg(cs35l56->dev, "%s: sclk=%u c=%u r=%u\n", __func__, sclk, params->col, params->row);
+
+	if (cs35l56->rev < 0xb0)
+		return cs35l56_a1_kick_divider(cs35l56, peripheral);
+
+	return 0;
+}
+
+static int __maybe_unused cs35l56_sdw_clk_stop(struct sdw_slave *peripheral,
+					       enum sdw_clk_stop_mode mode,
+					       enum sdw_clk_stop_type type)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+
+	dev_dbg(cs35l56->dev, "%s: mode:%d type:%d\n", __func__, mode, type);
+
+	return 0;
+}
+
+static const struct sdw_slave_ops cs35l56_sdw_ops = {
+	.read_prop = cs35l56_sdw_read_prop,
+	.interrupt_callback = cs35l56_sdw_interrupt,
+	.update_status = cs35l56_sdw_update_status,
+	.bus_config = cs35l56_sdw_bus_config,
+#ifdef DEBUG
+	.clk_stop = cs35l56_sdw_clk_stop,
+#endif
+};
+
+static int __maybe_unused cs35l56_sdw_handle_unattach(struct cs35l56_private *cs35l56)
+{
+	struct sdw_slave *peripheral = cs35l56->sdw_peripheral;
+
+	if (peripheral->unattach_request) {
+		/* Cannot access registers until bus is re-initialized. */
+		dev_dbg(cs35l56->dev, "Wait for initialization_complete\n");
+		if (!wait_for_completion_timeout(&peripheral->initialization_complete,
+						 msecs_to_jiffies(5000))) {
+			dev_err(cs35l56->dev, "initialization_complete timed out\n");
+			return -ETIMEDOUT;
+		}
+
+		peripheral->unattach_request = 0;
+
+		/*
+		 * Don't call regcache_mark_dirty(), we can't be sure that the
+		 * Manager really did issue a Bus Reset.
+		 */
+	}
+
+	return 0;
+}
+
+static int __maybe_unused cs35l56_sdw_runtime_suspend(struct device *dev)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
+
+	if (!cs35l56->init_done)
+		return 0;
+
+	return cs35l56_runtime_suspend(dev);
+}
+
+static int __maybe_unused cs35l56_sdw_runtime_resume(struct device *dev)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
+	int ret;
+
+	dev_dbg(dev, "Runtime resume\n");
+
+	if (!cs35l56->init_done)
+		return 0;
+
+	ret = cs35l56_sdw_handle_unattach(cs35l56);
+	if (ret < 0)
+		return ret;
+
+	ret = cs35l56_runtime_resume_common(cs35l56);
+	if (ret)
+		return ret;
+
+	/* Re-enable SoundWire interrupts */
+	sdw_write_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_MASK_1,
+			CS35L56_SDW_INT_MASK_CODEC_IRQ);
+
+	return 0;
+}
+
+static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_device_id *id)
+{
+	struct device *dev = &peripheral->dev;
+	struct cs35l56_private *cs35l56;
+	int ret;
+
+	cs35l56 = devm_kzalloc(dev, sizeof(*cs35l56), GFP_KERNEL);
+	if (!cs35l56)
+		return -ENOMEM;
+
+	cs35l56->dev = dev;
+	cs35l56->sdw_peripheral = peripheral;
+	INIT_WORK(&cs35l56->sdw_irq_work, cs35l56_sdw_irq_work);
+
+	dev_set_drvdata(dev, cs35l56);
+
+	cs35l56->regmap = devm_regmap_init(dev, &cs35l56_regmap_bus_sdw,
+					   peripheral, &cs35l56_regmap_sdw);
+	if (IS_ERR(cs35l56->regmap)) {
+		ret = PTR_ERR(cs35l56->regmap);
+		return dev_err_probe(dev, ret, "Failed to allocate register map\n");
+	}
+
+	/* Start in cache-only until device is enumerated */
+	regcache_cache_only(cs35l56->regmap, true);
+
+	ret = cs35l56_common_probe(cs35l56);
+	if (ret != 0)
+		return ret;
+
+	return 0;
+}
+
+static int cs35l56_sdw_remove(struct sdw_slave *peripheral)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
+
+	/* Disable SoundWire interrupts */
+	cs35l56->sdw_irq_no_unmask = true;
+	cancel_work_sync(&cs35l56->sdw_irq_work);
+	sdw_write_no_pm(peripheral, CS35L56_SDW_GEN_INT_MASK_1, 0);
+	sdw_read_no_pm(peripheral, CS35L56_SDW_GEN_INT_STAT_1);
+	sdw_write_no_pm(peripheral, CS35L56_SDW_GEN_INT_STAT_1, 0xFF);
+
+	return cs35l56_remove(cs35l56);
+}
+
+static const struct dev_pm_ops cs35l56_sdw_pm = {
+	SET_RUNTIME_PM_OPS(cs35l56_sdw_runtime_suspend, cs35l56_sdw_runtime_resume, NULL)
+};
+
+static const struct sdw_device_id cs35l56_sdw_id[] = {
+	SDW_SLAVE_ENTRY(0x01FA, 0x3556, 0),
+	{},
+};
+MODULE_DEVICE_TABLE(sdw, cs35l56_sdw_id);
+
+static struct sdw_driver cs35l56_sdw_driver = {
+	.driver = {
+		.name = "cs35l56",
+		.pm = &cs35l56_sdw_pm,
+	},
+	.probe = cs35l56_sdw_probe,
+	.remove = cs35l56_sdw_remove,
+	.ops = &cs35l56_sdw_ops,
+	.id_table = cs35l56_sdw_id,
+};
+
+module_sdw_driver(cs35l56_sdw_driver);
+
+MODULE_DESCRIPTION("ASoC CS35L56 SoundWire driver");
+MODULE_IMPORT_NS(SND_SOC_CS35L56_CORE);
+MODULE_IMPORT_NS(SND_SOC_CS35L56_SHARED);
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
new file mode 100644
index 000000000000..d8bc06ad4888
--- /dev/null
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Components shared between ASoC and HDA CS35L56 drivers
+//
+// Copyright (C) 2023 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+#include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
+#include <linux/types.h>
+
+#include "cs35l56.h"
+
+static const struct reg_default cs35l56_reg_defaults[] = {
+	{ CS35L56_ASP1_ENABLES1,		0x00000000 },
+	{ CS35L56_ASP1_CONTROL1,		0x00000028 },
+	{ CS35L56_ASP1_CONTROL2,		0x18180200 },
+	{ CS35L56_ASP1_CONTROL3,		0x00000002 },
+	{ CS35L56_ASP1_FRAME_CONTROL1,		0x03020100 },
+	{ CS35L56_ASP1_FRAME_CONTROL5,		0x00020100 },
+	{ CS35L56_ASP1_DATA_CONTROL1,		0x00000018 },
+	{ CS35L56_ASP1_DATA_CONTROL5,		0x00000018 },
+	{ CS35L56_ASP1TX1_INPUT,		0x00000018 },
+	{ CS35L56_ASP1TX2_INPUT,		0x00000019 },
+	{ CS35L56_ASP1TX3_INPUT,		0x00000020 },
+	{ CS35L56_ASP1TX4_INPUT,		0x00000028 },
+	{ CS35L56_SWIRE_DP3_CH1_INPUT,		0x00000018 },
+	{ CS35L56_SWIRE_DP3_CH2_INPUT,		0x00000019 },
+	{ CS35L56_SWIRE_DP3_CH3_INPUT,		0x00000029 },
+	{ CS35L56_SWIRE_DP3_CH4_INPUT,		0x00000028 },
+	{ CS35L56_SWIRE_DP3_CH5_INPUT,		0x00000018 },
+	{ CS35L56_SWIRE_DP3_CH6_INPUT,		0x00000018 },
+	{ CS35L56_IRQ1_CFG,			0x00000000 },
+	{ CS35L56_IRQ1_MASK_1,			0x83ffffff },
+	{ CS35L56_IRQ1_MASK_2,			0xffff7fff },
+	{ CS35L56_IRQ1_MASK_4,			0xe0ffffff },
+	{ CS35L56_IRQ1_MASK_8,			0xfc000fff },
+	{ CS35L56_IRQ1_MASK_18,			0x1f7df0ff },
+	{ CS35L56_IRQ1_MASK_20,			0x15c00000 },
+	/* CS35L56_MAIN_RENDER_USER_MUTE - soft register, no default	*/
+	/* CS35L56_MAIN_RENDER_USER_VOLUME - soft register, no default	*/
+	/* CS35L56_MAIN_POSTURE_NUMBER - soft register, no default	*/
+};
+
+/*
+ * The Ax devices have different default register values to that of B0,
+ * establish a common set of register defaults.
+ */
+static const struct reg_sequence cs35l56_reva_patch[] = {
+	{ CS35L56_SWIRE_DP3_CH5_INPUT,	0x00000018 },
+	{ CS35L56_SWIRE_DP3_CH6_INPUT,	0x00000018 },
+};
+
+void cs35l56_patch(struct device *dev, struct regmap *regmap, u8 revid)
+{
+	int ret;
+
+	if (revid >= CS35L56_REVID_B0)
+		return;
+
+	ret = regmap_register_patch(regmap, cs35l56_reva_patch,
+				    ARRAY_SIZE(cs35l56_reva_patch));
+	if (ret)
+		dev_err(dev, "Failed to apply patch: %d\n", ret);
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_patch, SND_SOC_CS35L56_SHARED);
+
+static bool cs35l56_is_dsp_memory(unsigned int reg)
+{
+	switch (reg) {
+	case CS35L56_DSP1_XMEM_PACKED_0 ... CS35L56_DSP1_XMEM_PACKED_6143:
+	case CS35L56_DSP1_XMEM_UNPACKED32_0 ... CS35L56_DSP1_XMEM_UNPACKED32_4095:
+	case CS35L56_DSP1_XMEM_UNPACKED24_0 ... CS35L56_DSP1_XMEM_UNPACKED24_8191:
+	case CS35L56_DSP1_YMEM_PACKED_0 ... CS35L56_DSP1_YMEM_PACKED_4604:
+	case CS35L56_DSP1_YMEM_UNPACKED32_0 ... CS35L56_DSP1_YMEM_UNPACKED32_3070:
+	case CS35L56_DSP1_YMEM_UNPACKED24_0 ... CS35L56_DSP1_YMEM_UNPACKED24_6141:
+	case CS35L56_DSP1_PMEM_0 ... CS35L56_DSP1_PMEM_5114:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs35l56_readable_reg(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case CS35L56_DEVID:
+	case CS35L56_REVID:
+	case CS35L56_RELID:
+	case CS35L56_OTPID:
+	case CS35L56_SFT_RESET:
+	case CS35L56_GLOBAL_ENABLES:
+	case CS35L56_BLOCK_ENABLES:
+	case CS35L56_BLOCK_ENABLES2:
+	case CS35L56_REFCLK_INPUT:
+	case CS35L56_GLOBAL_SAMPLE_RATE:
+	case CS35L56_ASP1_ENABLES1:
+	case CS35L56_ASP1_CONTROL1:
+	case CS35L56_ASP1_CONTROL2:
+	case CS35L56_ASP1_CONTROL3:
+	case CS35L56_ASP1_FRAME_CONTROL1:
+	case CS35L56_ASP1_FRAME_CONTROL5:
+	case CS35L56_ASP1_DATA_CONTROL1:
+	case CS35L56_ASP1_DATA_CONTROL5:
+	case CS35L56_DACPCM1_INPUT:
+	case CS35L56_DACPCM2_INPUT:
+	case CS35L56_ASP1TX1_INPUT:
+	case CS35L56_ASP1TX2_INPUT:
+	case CS35L56_ASP1TX3_INPUT:
+	case CS35L56_ASP1TX4_INPUT:
+	case CS35L56_DSP1RX1_INPUT:
+	case CS35L56_DSP1RX2_INPUT:
+	case CS35L56_SWIRE_DP3_CH1_INPUT:
+	case CS35L56_SWIRE_DP3_CH2_INPUT:
+	case CS35L56_SWIRE_DP3_CH3_INPUT:
+	case CS35L56_SWIRE_DP3_CH4_INPUT:
+	case CS35L56_SWIRE_DP3_CH5_INPUT:
+	case CS35L56_SWIRE_DP3_CH6_INPUT:
+	case CS35L56_IRQ1_CFG:
+	case CS35L56_IRQ1_STATUS:
+	case CS35L56_IRQ1_EINT_1 ... CS35L56_IRQ1_EINT_8:
+	case CS35L56_IRQ1_EINT_18:
+	case CS35L56_IRQ1_EINT_20:
+	case CS35L56_IRQ1_MASK_1:
+	case CS35L56_IRQ1_MASK_2:
+	case CS35L56_IRQ1_MASK_4:
+	case CS35L56_IRQ1_MASK_8:
+	case CS35L56_IRQ1_MASK_18:
+	case CS35L56_IRQ1_MASK_20:
+	case CS35L56_DSP_VIRTUAL1_MBOX_1:
+	case CS35L56_DSP_VIRTUAL1_MBOX_2:
+	case CS35L56_DSP_VIRTUAL1_MBOX_3:
+	case CS35L56_DSP_VIRTUAL1_MBOX_4:
+	case CS35L56_DSP_VIRTUAL1_MBOX_5:
+	case CS35L56_DSP_VIRTUAL1_MBOX_6:
+	case CS35L56_DSP_VIRTUAL1_MBOX_7:
+	case CS35L56_DSP_VIRTUAL1_MBOX_8:
+	case CS35L56_DSP_RESTRICT_STS1:
+	case CS35L56_DSP1_SYS_INFO_ID ... CS35L56_DSP1_SYS_INFO_END:
+	case CS35L56_DSP1_AHBM_WINDOW_DEBUG_0:
+	case CS35L56_DSP1_AHBM_WINDOW_DEBUG_1:
+	case CS35L56_DSP1_SCRATCH1:
+	case CS35L56_DSP1_SCRATCH2:
+	case CS35L56_DSP1_SCRATCH3:
+	case CS35L56_DSP1_SCRATCH4:
+		return true;
+	default:
+		return cs35l56_is_dsp_memory(reg);
+	}
+}
+
+static bool cs35l56_precious_reg(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case CS35L56_DSP1_XMEM_PACKED_0 ... CS35L56_DSP1_XMEM_PACKED_6143:
+	case CS35L56_DSP1_YMEM_PACKED_0 ... CS35L56_DSP1_YMEM_PACKED_4604:
+	case CS35L56_DSP1_PMEM_0 ... CS35L56_DSP1_PMEM_5114:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs35l56_volatile_reg(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case CS35L56_DEVID:
+	case CS35L56_REVID:
+	case CS35L56_RELID:
+	case CS35L56_OTPID:
+	case CS35L56_SFT_RESET:
+	case CS35L56_GLOBAL_ENABLES:		   /* owned by firmware */
+	case CS35L56_BLOCK_ENABLES:		   /* owned by firmware */
+	case CS35L56_BLOCK_ENABLES2:		   /* owned by firmware */
+	case CS35L56_REFCLK_INPUT:		   /* owned by firmware */
+	case CS35L56_GLOBAL_SAMPLE_RATE:	   /* owned by firmware */
+	case CS35L56_DACPCM1_INPUT:		   /* owned by firmware */
+	case CS35L56_DACPCM2_INPUT:		   /* owned by firmware */
+	case CS35L56_DSP1RX1_INPUT:		   /* owned by firmware */
+	case CS35L56_DSP1RX2_INPUT:		   /* owned by firmware */
+	case CS35L56_IRQ1_STATUS:
+	case CS35L56_IRQ1_EINT_1 ... CS35L56_IRQ1_EINT_8:
+	case CS35L56_IRQ1_EINT_18:
+	case CS35L56_IRQ1_EINT_20:
+	case CS35L56_DSP_VIRTUAL1_MBOX_1:
+	case CS35L56_DSP_VIRTUAL1_MBOX_2:
+	case CS35L56_DSP_VIRTUAL1_MBOX_3:
+	case CS35L56_DSP_VIRTUAL1_MBOX_4:
+	case CS35L56_DSP_VIRTUAL1_MBOX_5:
+	case CS35L56_DSP_VIRTUAL1_MBOX_6:
+	case CS35L56_DSP_VIRTUAL1_MBOX_7:
+	case CS35L56_DSP_VIRTUAL1_MBOX_8:
+	case CS35L56_DSP_RESTRICT_STS1:
+	case CS35L56_DSP1_SYS_INFO_ID ... CS35L56_DSP1_SYS_INFO_END:
+	case CS35L56_DSP1_AHBM_WINDOW_DEBUG_0:
+	case CS35L56_DSP1_AHBM_WINDOW_DEBUG_1:
+	case CS35L56_DSP1_SCRATCH1:
+	case CS35L56_DSP1_SCRATCH2:
+	case CS35L56_DSP1_SCRATCH3:
+	case CS35L56_DSP1_SCRATCH4:
+		return true;
+	case CS35L56_MAIN_RENDER_USER_MUTE:
+	case CS35L56_MAIN_RENDER_USER_VOLUME:
+	case CS35L56_MAIN_POSTURE_NUMBER:
+		return false;
+	default:
+		return cs35l56_is_dsp_memory(reg);
+	}
+}
+
+static const u32 cs35l56_firmware_registers[] = {
+	CS35L56_MAIN_RENDER_USER_MUTE,
+	CS35L56_MAIN_RENDER_USER_VOLUME,
+	CS35L56_MAIN_POSTURE_NUMBER,
+};
+
+void cs35l56_reread_firmware_registers(struct device *dev, struct regmap *regmap)
+{
+	int i;
+	unsigned int val;
+
+	for (i = 0; i < ARRAY_SIZE(cs35l56_firmware_registers); i++) {
+		regmap_read(regmap, cs35l56_firmware_registers[i], &val);
+		dev_dbg(dev, "%s: %d: %#x: %#x\n", __func__,
+			i, cs35l56_firmware_registers[i], val);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_reread_firmware_registers, SND_SOC_CS35L56_SHARED);
+
+const struct cs_dsp_region cs35l56_dsp1_regions[] = {
+	{ .type = WMFW_HALO_PM_PACKED,	.base = CS35L56_DSP1_PMEM_0 },
+	{ .type = WMFW_HALO_XM_PACKED,	.base = CS35L56_DSP1_XMEM_PACKED_0 },
+	{ .type = WMFW_HALO_YM_PACKED,	.base = CS35L56_DSP1_YMEM_PACKED_0 },
+	{ .type = WMFW_ADSP2_XM,	.base = CS35L56_DSP1_XMEM_UNPACKED24_0 },
+	{ .type = WMFW_ADSP2_YM,	.base = CS35L56_DSP1_YMEM_UNPACKED24_0 },
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_dsp1_regions, SND_SOC_CS35L56_SHARED);
+
+static const u32 cs35l56_bclk_valid_for_pll_freq_table[] = {
+	[0x0C] = 128000,
+	[0x0F] = 256000,
+	[0x11] = 384000,
+	[0x12] = 512000,
+	[0x15] = 768000,
+	[0x17] = 1024000,
+	[0x1A] = 1500000,
+	[0x1B] = 1536000,
+	[0x1C] = 2000000,
+	[0x1D] = 2048000,
+	[0x1E] = 2400000,
+	[0x20] = 3000000,
+	[0x21] = 3072000,
+	[0x23] = 4000000,
+	[0x24] = 4096000,
+	[0x25] = 4800000,
+	[0x27] = 6000000,
+	[0x28] = 6144000,
+	[0x29] = 6250000,
+	[0x2A] = 6400000,
+	[0x2E] = 8000000,
+	[0x2F] = 8192000,
+	[0x30] = 9600000,
+	[0x32] = 12000000,
+	[0x33] = 12288000,
+	[0x37] = 13500000,
+	[0x38] = 19200000,
+	[0x39] = 22579200,
+	[0x3B] = 24576000,
+};
+
+int cs35l56_get_bclk_freq_id(unsigned int freq)
+{
+	int i;
+
+	if (freq == 0)
+		return -EINVAL;
+
+	/* The BCLK frequency must be a valid PLL REFCLK */
+	for (i = 0; i < ARRAY_SIZE(cs35l56_bclk_valid_for_pll_freq_table); ++i) {
+		if (cs35l56_bclk_valid_for_pll_freq_table[i] == freq)
+			return i;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_get_bclk_freq_id, SND_SOC_CS35L56_SHARED);
+
+static const char * const cs35l56_supplies[/* auto-sized */] = {
+	"VDD_P",
+	"VDD_IO",
+	"VDD_A",
+};
+
+void cs35l56_fill_supply_names(struct regulator_bulk_data *data)
+{
+	int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(cs35l56_supplies) != CS35L56_NUM_BULK_SUPPLIES);
+	for (i = 0; i < ARRAY_SIZE(cs35l56_supplies); i++)
+		data[i].supply = cs35l56_supplies[i];
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_fill_supply_names, SND_SOC_CS35L56_SHARED);
+
+const char * const cs35l56_tx_input_texts[] = {
+	"None", "ASP1RX1", "ASP1RX2", "VMON", "IMON", "ERRVOL", "CLASSH",
+	"VDDBMON", "VBSTMON", "DSP1TX1", "DSP1TX2", "DSP1TX3", "DSP1TX4",
+	"DSP1TX5", "DSP1TX6", "DSP1TX7", "DSP1TX8", "TEMPMON",
+	"INTERPOLATOR", "SDW1RX1", "SDW1RX2", "SDW2RX1",
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_tx_input_texts, SND_SOC_CS35L56_SHARED);
+
+const unsigned int cs35l56_tx_input_values[] = {
+	CS35L56_INPUT_SRC_NONE,
+	CS35L56_INPUT_SRC_ASP1RX1,
+	CS35L56_INPUT_SRC_ASP1RX2,
+	CS35L56_INPUT_SRC_VMON,
+	CS35L56_INPUT_SRC_IMON,
+	CS35L56_INPUT_SRC_ERR_VOL,
+	CS35L56_INPUT_SRC_CLASSH,
+	CS35L56_INPUT_SRC_VDDBMON,
+	CS35L56_INPUT_SRC_VBSTMON,
+	CS35L56_INPUT_SRC_DSP1TX1,
+	CS35L56_INPUT_SRC_DSP1TX2,
+	CS35L56_INPUT_SRC_DSP1TX3,
+	CS35L56_INPUT_SRC_DSP1TX4,
+	CS35L56_INPUT_SRC_DSP1TX5,
+	CS35L56_INPUT_SRC_DSP1TX6,
+	CS35L56_INPUT_SRC_DSP1TX7,
+	CS35L56_INPUT_SRC_DSP1TX8,
+	CS35L56_INPUT_SRC_TEMPMON,
+	CS35L56_INPUT_SRC_INTERPOLATOR,
+	CS35L56_INPUT_SRC_SWIRE_RX1,
+	CS35L56_INPUT_SRC_SWIRE_RX2,
+	CS35L56_INPUT_SRC_SWIRE_RX3,
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_tx_input_values, SND_SOC_CS35L56_SHARED);
+
+struct regmap_config cs35l56_regmap_i2c = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 4,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+	.max_register = CS35L56_DSP1_PMEM_5114,
+	.reg_defaults = cs35l56_reg_defaults,
+	.num_reg_defaults = ARRAY_SIZE(cs35l56_reg_defaults),
+	.volatile_reg = cs35l56_volatile_reg,
+	.readable_reg = cs35l56_readable_reg,
+	.precious_reg = cs35l56_precious_reg,
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_regmap_i2c, SND_SOC_CS35L56_SHARED);
+
+struct regmap_config cs35l56_regmap_spi = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.pad_bits = 16,
+	.reg_stride = 4,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+	.max_register = CS35L56_DSP1_PMEM_5114,
+	.reg_defaults = cs35l56_reg_defaults,
+	.num_reg_defaults = ARRAY_SIZE(cs35l56_reg_defaults),
+	.volatile_reg = cs35l56_volatile_reg,
+	.readable_reg = cs35l56_readable_reg,
+	.precious_reg = cs35l56_precious_reg,
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_regmap_spi, SND_SOC_CS35L56_SHARED);
+
+struct regmap_config cs35l56_regmap_sdw = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 4,
+	.reg_format_endian = REGMAP_ENDIAN_LITTLE,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+	.max_register = CS35L56_DSP1_PMEM_5114,
+	.reg_defaults = cs35l56_reg_defaults,
+	.num_reg_defaults = ARRAY_SIZE(cs35l56_reg_defaults),
+	.volatile_reg = cs35l56_volatile_reg,
+	.readable_reg = cs35l56_readable_reg,
+	.precious_reg = cs35l56_precious_reg,
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_regmap_sdw, SND_SOC_CS35L56_SHARED);
+
+MODULE_DESCRIPTION("ASoC CS35L56 Shared");
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c
new file mode 100644
index 000000000000..80dcf37daae2
--- /dev/null
+++ b/sound/soc/codecs/cs35l56-spi.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// CS35L56 ALSA SoC audio driver SPI binding
+//
+// Copyright (C) 2023 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/regmap.h>
+#include <linux/spi/spi.h>
+#include <linux/types.h>
+
+#include "cs35l56.h"
+
+static int cs35l56_spi_probe(struct spi_device *spi)
+{
+	const struct regmap_config *regmap_config = &cs35l56_regmap_spi;
+	struct cs35l56_private *cs35l56;
+	int ret;
+
+	cs35l56 = devm_kzalloc(&spi->dev, sizeof(struct cs35l56_private), GFP_KERNEL);
+	if (!cs35l56)
+		return -ENOMEM;
+
+	spi_set_drvdata(spi, cs35l56);
+	cs35l56->regmap = devm_regmap_init_spi(spi, regmap_config);
+	if (IS_ERR(cs35l56->regmap)) {
+		ret = PTR_ERR(cs35l56->regmap);
+		return dev_err_probe(&spi->dev, ret, "Failed to allocate register map\n");
+		return ret;
+	}
+
+	cs35l56->dev = &spi->dev;
+	cs35l56->irq = spi->irq;
+
+	ret = cs35l56_common_probe(cs35l56);
+	if (ret != 0)
+		return ret;
+
+	ret = cs35l56_init(cs35l56);
+	if (ret == 0)
+		ret = cs35l56_irq_request(cs35l56);
+	if (ret < 0)
+		cs35l56_remove(cs35l56);
+
+	return ret;
+}
+
+static void cs35l56_spi_remove(struct spi_device *spi)
+{
+	struct cs35l56_private *cs35l56 = spi_get_drvdata(spi);
+
+	cs35l56_remove(cs35l56);
+}
+
+static const struct spi_device_id cs35l56_id_spi[] = {
+	{ "cs35l56", 0 },
+	{}
+};
+MODULE_DEVICE_TABLE(spi, cs35l56_id_spi);
+
+static struct spi_driver cs35l56_spi_driver = {
+	.driver = {
+		.name		= "cs35l56",
+		.pm = &cs35l56_pm_ops_i2c_spi,
+	},
+	.id_table	= cs35l56_id_spi,
+	.probe		= cs35l56_spi_probe,
+	.remove		= cs35l56_spi_remove,
+};
+
+module_spi_driver(cs35l56_spi_driver);
+
+MODULE_DESCRIPTION("ASoC CS35L56 SPI driver");
+MODULE_IMPORT_NS(SND_SOC_CS35L56_CORE);
+MODULE_IMPORT_NS(SND_SOC_CS35L56_SHARED);
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
new file mode 100644
index 000000000000..90fc79b5666d
--- /dev/null
+++ b/sound/soc/codecs/cs35l56.c
@@ -0,0 +1,1461 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Driver for Cirrus Logic CS35L56 smart amp
+//
+// Copyright (C) 2023 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+#include <linux/acpi.h>
+#include <linux/completion.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/math.h>
+#include <linux/module.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include <sound/soc-dapm.h>
+#include <sound/tlv.h>
+
+#include "wm_adsp.h"
+#include "cs35l56.h"
+
+static int cs35l56_dsp_event(struct snd_soc_dapm_widget *w,
+			     struct snd_kcontrol *kcontrol, int event);
+
+static int cs35l56_wait_dsp_ready(struct cs35l56_private *cs35l56)
+{
+	int ret;
+
+	if (!cs35l56->fw_patched) {
+		/* block until firmware download completes */
+		ret = wait_for_completion_timeout(&cs35l56->dsp_ready_completion,
+						  msecs_to_jiffies(25000));
+		if (!ret) {
+			dev_err(cs35l56->dev, "dsp_ready_completion timeout\n");
+			return -ETIMEDOUT;
+		}
+	}
+
+	return 0;
+}
+
+static int cs35l56_dspwait_get_volsw(struct snd_kcontrol *kcontrol,
+				     struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+	int ret = cs35l56_wait_dsp_ready(cs35l56);
+
+	if (ret)
+		return ret;
+
+	return snd_soc_get_volsw(kcontrol, ucontrol);
+}
+
+static int cs35l56_dspwait_put_volsw(struct snd_kcontrol *kcontrol,
+				     struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+	int ret = cs35l56_wait_dsp_ready(cs35l56);
+
+	if (ret)
+		return ret;
+
+	return snd_soc_put_volsw(kcontrol, ucontrol);
+}
+
+static DECLARE_TLV_DB_SCALE(vol_tlv, -10000, 25, 0);
+
+static const struct snd_kcontrol_new cs35l56_controls[] = {
+	SOC_SINGLE_EXT("Speaker Switch",
+		       CS35L56_MAIN_RENDER_USER_MUTE, 0, 1, 1,
+		       cs35l56_dspwait_get_volsw, cs35l56_dspwait_put_volsw),
+	SOC_SINGLE_S_EXT_TLV("Speaker Volume",
+			     CS35L56_MAIN_RENDER_USER_VOLUME,
+			     6, -400, 400, 9, 0,
+			     cs35l56_dspwait_get_volsw,
+			     cs35l56_dspwait_put_volsw,
+			     vol_tlv),
+	SOC_SINGLE_EXT("Posture Number", CS35L56_MAIN_POSTURE_NUMBER,
+		       0, 255, 0,
+		       cs35l56_dspwait_get_volsw, cs35l56_dspwait_put_volsw),
+};
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_asp1tx1_enum,
+				  CS35L56_ASP1TX1_INPUT,
+				  0, CS35L56_ASP_TXn_SRC_MASK,
+				  cs35l56_tx_input_texts,
+				  cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new asp1_tx1_mux =
+	SOC_DAPM_ENUM("ASP1TX1 SRC", cs35l56_asp1tx1_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_asp1tx2_enum,
+				  CS35L56_ASP1TX2_INPUT,
+				  0, CS35L56_ASP_TXn_SRC_MASK,
+				  cs35l56_tx_input_texts,
+				  cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new asp1_tx2_mux =
+	SOC_DAPM_ENUM("ASP1TX2 SRC", cs35l56_asp1tx2_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_asp1tx3_enum,
+				  CS35L56_ASP1TX3_INPUT,
+				  0, CS35L56_ASP_TXn_SRC_MASK,
+				  cs35l56_tx_input_texts,
+				  cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new asp1_tx3_mux =
+	SOC_DAPM_ENUM("ASP1TX3 SRC", cs35l56_asp1tx3_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_asp1tx4_enum,
+				  CS35L56_ASP1TX4_INPUT,
+				  0, CS35L56_ASP_TXn_SRC_MASK,
+				  cs35l56_tx_input_texts,
+				  cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new asp1_tx4_mux =
+	SOC_DAPM_ENUM("ASP1TX4 SRC", cs35l56_asp1tx4_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx1_enum,
+				CS35L56_SWIRE_DP3_CH1_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx1_mux =
+	SOC_DAPM_ENUM("SDW1TX1 SRC", cs35l56_sdw1tx1_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx2_enum,
+				CS35L56_SWIRE_DP3_CH2_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx2_mux =
+	SOC_DAPM_ENUM("SDW1TX2 SRC", cs35l56_sdw1tx2_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx3_enum,
+				CS35L56_SWIRE_DP3_CH3_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx3_mux =
+	SOC_DAPM_ENUM("SDW1TX3 SRC", cs35l56_sdw1tx3_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx4_enum,
+				CS35L56_SWIRE_DP3_CH4_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx4_mux =
+	SOC_DAPM_ENUM("SDW1TX4 SRC", cs35l56_sdw1tx4_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx5_enum,
+				CS35L56_SWIRE_DP3_CH5_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx5_mux =
+	SOC_DAPM_ENUM("SDW1TX5 SRC", cs35l56_sdw1tx5_enum);
+
+static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx6_enum,
+				CS35L56_SWIRE_DP3_CH6_INPUT,
+				0, CS35L56_SWIRETXn_SRC_MASK,
+				cs35l56_tx_input_texts,
+				cs35l56_tx_input_values);
+
+static const struct snd_kcontrol_new sdw1_tx6_mux =
+	SOC_DAPM_ENUM("SDW1TX6 SRC", cs35l56_sdw1tx6_enum);
+
+static const struct snd_soc_dapm_widget cs35l56_dapm_widgets[] = {
+	SND_SOC_DAPM_REGULATOR_SUPPLY("VDD_B", 0, 0),
+	SND_SOC_DAPM_REGULATOR_SUPPLY("VDD_AMP", 0, 0),
+
+	SND_SOC_DAPM_OUT_DRV("AMP", SND_SOC_NOPM, 0, 0, NULL, 0),
+	SND_SOC_DAPM_OUTPUT("SPK"),
+
+	SND_SOC_DAPM_PGA_E("DSP1", SND_SOC_NOPM, 0, 0, NULL, 0, cs35l56_dsp_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+
+	SND_SOC_DAPM_AIF_IN("ASP1RX1", NULL, 0, CS35L56_ASP1_ENABLES1,
+			    CS35L56_ASP_RX1_EN_SHIFT, 0),
+	SND_SOC_DAPM_AIF_IN("ASP1RX2", NULL, 1, CS35L56_ASP1_ENABLES1,
+			    CS35L56_ASP_RX2_EN_SHIFT, 0),
+	SND_SOC_DAPM_AIF_OUT("ASP1TX1", NULL, 0, CS35L56_ASP1_ENABLES1,
+			     CS35L56_ASP_TX1_EN_SHIFT, 0),
+	SND_SOC_DAPM_AIF_OUT("ASP1TX2", NULL, 1, CS35L56_ASP1_ENABLES1,
+			     CS35L56_ASP_TX2_EN_SHIFT, 0),
+	SND_SOC_DAPM_AIF_OUT("ASP1TX3", NULL, 2, CS35L56_ASP1_ENABLES1,
+			     CS35L56_ASP_TX3_EN_SHIFT, 0),
+	SND_SOC_DAPM_AIF_OUT("ASP1TX4", NULL, 3, CS35L56_ASP1_ENABLES1,
+			     CS35L56_ASP_TX4_EN_SHIFT, 0),
+
+	SND_SOC_DAPM_MUX("ASP1 TX1 Source", SND_SOC_NOPM, 0, 0, &asp1_tx1_mux),
+	SND_SOC_DAPM_MUX("ASP1 TX2 Source", SND_SOC_NOPM, 0, 0, &asp1_tx2_mux),
+	SND_SOC_DAPM_MUX("ASP1 TX3 Source", SND_SOC_NOPM, 0, 0, &asp1_tx3_mux),
+	SND_SOC_DAPM_MUX("ASP1 TX4 Source", SND_SOC_NOPM, 0, 0, &asp1_tx4_mux),
+
+	SND_SOC_DAPM_MUX("SDW1 TX1 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx1_mux),
+	SND_SOC_DAPM_MUX("SDW1 TX2 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx2_mux),
+	SND_SOC_DAPM_MUX("SDW1 TX3 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx3_mux),
+	SND_SOC_DAPM_MUX("SDW1 TX4 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx4_mux),
+	SND_SOC_DAPM_MUX("SDW1 TX5 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx5_mux),
+	SND_SOC_DAPM_MUX("SDW1 TX6 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx6_mux),
+
+	SND_SOC_DAPM_SIGGEN("VMON ADC"),
+	SND_SOC_DAPM_SIGGEN("IMON ADC"),
+	SND_SOC_DAPM_SIGGEN("ERRVOL ADC"),
+	SND_SOC_DAPM_SIGGEN("CLASSH ADC"),
+	SND_SOC_DAPM_SIGGEN("VDDBMON ADC"),
+	SND_SOC_DAPM_SIGGEN("VBSTMON ADC"),
+	SND_SOC_DAPM_SIGGEN("TEMPMON ADC"),
+};
+
+#define CS35L56_SRC_ROUTE(name) \
+	{ name" Source", "ASP1RX1", "ASP1RX1" }, \
+	{ name" Source", "ASP1RX2", "ASP1RX2" }, \
+	{ name" Source", "VMON", "VMON ADC" }, \
+	{ name" Source", "IMON", "IMON ADC" }, \
+	{ name" Source", "ERRVOL", "ERRVOL ADC" },   \
+	{ name" Source", "CLASSH", "CLASSH ADC" },   \
+	{ name" Source", "VDDBMON", "VDDBMON ADC" }, \
+	{ name" Source", "VBSTMON", "VBSTMON ADC" }, \
+	{ name" Source", "DSP1TX1", "DSP1" }, \
+	{ name" Source", "DSP1TX2", "DSP1" }, \
+	{ name" Source", "DSP1TX3", "DSP1" }, \
+	{ name" Source", "DSP1TX4", "DSP1" }, \
+	{ name" Source", "DSP1TX5", "DSP1" }, \
+	{ name" Source", "DSP1TX6", "DSP1" }, \
+	{ name" Source", "DSP1TX7", "DSP1" }, \
+	{ name" Source", "DSP1TX8", "DSP1" }, \
+	{ name" Source", "TEMPMON", "TEMPMON ADC" }, \
+	{ name" Source", "INTERPOLATOR", "AMP" }, \
+	{ name" Source", "SDW1RX1", "SDW1 Playback" }, \
+	{ name" Source", "SDW1RX2", "SDW1 Playback" },
+
+static const struct snd_soc_dapm_route cs35l56_audio_map[] = {
+	{ "AMP", NULL, "VDD_B" },
+	{ "AMP", NULL, "VDD_AMP" },
+
+	{ "ASP1RX1", NULL, "ASP1 Playback" },
+	{ "ASP1RX2", NULL, "ASP1 Playback" },
+	{ "DSP1", NULL, "ASP1RX1" },
+	{ "DSP1", NULL, "ASP1RX2" },
+	{ "DSP1", NULL, "SDW1 Playback" },
+	{ "AMP", NULL, "DSP1" },
+	{ "SPK", NULL, "AMP" },
+
+	CS35L56_SRC_ROUTE("ASP1 TX1")
+	CS35L56_SRC_ROUTE("ASP1 TX2")
+	CS35L56_SRC_ROUTE("ASP1 TX3")
+	CS35L56_SRC_ROUTE("ASP1 TX4")
+
+	{ "ASP1TX1", NULL, "ASP1 TX1 Source" },
+	{ "ASP1TX2", NULL, "ASP1 TX2 Source" },
+	{ "ASP1TX3", NULL, "ASP1 TX3 Source" },
+	{ "ASP1TX4", NULL, "ASP1 TX4 Source" },
+	{ "ASP1 Capture", NULL, "ASP1TX1" },
+	{ "ASP1 Capture", NULL, "ASP1TX2" },
+	{ "ASP1 Capture", NULL, "ASP1TX3" },
+	{ "ASP1 Capture", NULL, "ASP1TX4" },
+
+	CS35L56_SRC_ROUTE("SDW1 TX1")
+	CS35L56_SRC_ROUTE("SDW1 TX2")
+	CS35L56_SRC_ROUTE("SDW1 TX3")
+	CS35L56_SRC_ROUTE("SDW1 TX4")
+	CS35L56_SRC_ROUTE("SDW1 TX5")
+	CS35L56_SRC_ROUTE("SDW1 TX6")
+	{ "SDW1 Capture", NULL, "SDW1 TX1 Source" },
+	{ "SDW1 Capture", NULL, "SDW1 TX2 Source" },
+	{ "SDW1 Capture", NULL, "SDW1 TX3 Source" },
+	{ "SDW1 Capture", NULL, "SDW1 TX4 Source" },
+	{ "SDW1 Capture", NULL, "SDW1 TX5 Source" },
+	{ "SDW1 Capture", NULL, "SDW1 TX6 Source" },
+};
+
+static int cs35l56_mbox_send(struct cs35l56_private *cs35l56, unsigned int command)
+{
+	unsigned int val;
+	int ret;
+
+	regmap_write(cs35l56->regmap, CS35L56_DSP_VIRTUAL1_MBOX_1, command);
+	ret = regmap_read_poll_timeout(cs35l56->regmap, CS35L56_DSP_VIRTUAL1_MBOX_1,
+				       val, (val == 0),
+				       CS35L56_MBOX_POLL_US, CS35L56_MBOX_TIMEOUT_US);
+	if (ret) {
+		dev_warn(cs35l56->dev, "MBOX command %#x failed: %d\n", command, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cs35l56_dsp_event(struct snd_soc_dapm_widget *w,
+			     struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+
+	dev_dbg(cs35l56->dev, "%s: %d\n", __func__, event);
+
+	return wm_adsp_event(w, kcontrol, event);
+}
+
+irqreturn_t cs35l56_irq(int irq, void *data)
+{
+	struct cs35l56_private *cs35l56 = data;
+	unsigned int status1 = 0, status8 = 0, status20 = 0;
+	unsigned int mask1, mask8, mask20;
+	unsigned int rv, val;
+	irqreturn_t ret = IRQ_NONE;
+
+	if (!cs35l56->init_done)
+		return IRQ_NONE;
+
+	mutex_lock(&cs35l56->irq_lock);
+
+	rv = pm_runtime_resume_and_get(cs35l56->dev);
+	if (rv < 0) {
+		dev_err(cs35l56->dev, "irq: failed to get pm_runtime: %d\n", rv);
+		goto err_unlock;
+	}
+
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_STATUS, &val);
+	if ((val & CS35L56_IRQ1_STS_MASK) == 0) {
+		dev_dbg(cs35l56->dev, "Spurious IRQ: no pending interrupt\n");
+		goto err;
+	}
+
+	/* Ack interrupts */
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_EINT_1, &status1);
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_MASK_1, &mask1);
+	status1 &= ~mask1;
+	regmap_write(cs35l56->regmap, CS35L56_IRQ1_EINT_1, status1);
+
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_EINT_8, &status8);
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_MASK_8, &mask8);
+	status8 &= ~mask8;
+	regmap_write(cs35l56->regmap, CS35L56_IRQ1_EINT_8, status8);
+
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_EINT_20, &status20);
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_MASK_20, &mask20);
+	status20 &= ~mask20;
+	/* We don't want EINT20 but they default to unmasked: force mask */
+	regmap_write(cs35l56->regmap, CS35L56_IRQ1_MASK_20, 0xffffffff);
+
+	dev_dbg(cs35l56->dev, "%s: %#x %#x\n", __func__, status1, status8);
+
+	/* Check to see if unmasked bits are active */
+	if (!status1 && !status8 && !status20)
+		goto err;
+
+	if (status1 & CS35L56_AMP_SHORT_ERR_EINT1_MASK)
+		dev_crit(cs35l56->dev, "Amp short error\n");
+
+	if (status8 & CS35L56_TEMP_ERR_EINT1_MASK)
+		dev_crit(cs35l56->dev, "Overtemp error\n");
+
+	ret = IRQ_HANDLED;
+
+err:
+	pm_runtime_put(cs35l56->dev);
+err_unlock:
+	mutex_unlock(&cs35l56->irq_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_irq, SND_SOC_CS35L56_CORE);
+
+int cs35l56_irq_request(struct cs35l56_private *cs35l56)
+{
+	int ret;
+
+	if (!cs35l56->irq)
+		return 0;
+
+	ret = devm_request_threaded_irq(cs35l56->dev, cs35l56->irq, NULL,
+					cs35l56_irq,
+					IRQF_ONESHOT | IRQF_SHARED | IRQF_TRIGGER_LOW,
+					"cs35l56", cs35l56);
+	if (ret < 0)
+		dev_err(cs35l56->dev, "Failed to get IRQ: %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_irq_request, SND_SOC_CS35L56_CORE);
+
+static int cs35l56_asp_dai_set_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(codec_dai->component);
+	unsigned int val;
+
+	dev_dbg(cs35l56->dev, "%s: %#x\n", __func__, fmt);
+
+	switch (fmt & SND_SOC_DAIFMT_CLOCK_PROVIDER_MASK) {
+	case SND_SOC_DAIFMT_CBC_CFC:
+		break;
+	default:
+		dev_err(cs35l56->dev, "Unsupported clock source mode\n");
+		return -EINVAL;
+	}
+
+	switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+	case SND_SOC_DAIFMT_DSP_A:
+		val = CS35L56_ASP_FMT_DSP_A << CS35L56_ASP_FMT_SHIFT;
+		cs35l56->tdm_mode = true;
+		break;
+	case SND_SOC_DAIFMT_I2S:
+		val = CS35L56_ASP_FMT_I2S << CS35L56_ASP_FMT_SHIFT;
+		cs35l56->tdm_mode = false;
+		break;
+	default:
+		dev_err(cs35l56->dev, "Unsupported DAI format\n");
+		return -EINVAL;
+	}
+
+	switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
+	case SND_SOC_DAIFMT_NB_IF:
+		val |= CS35L56_ASP_FSYNC_INV_MASK;
+		break;
+	case SND_SOC_DAIFMT_IB_NF:
+		val |= CS35L56_ASP_BCLK_INV_MASK;
+		break;
+	case SND_SOC_DAIFMT_IB_IF:
+		val |= CS35L56_ASP_BCLK_INV_MASK | CS35L56_ASP_FSYNC_INV_MASK;
+		break;
+	case SND_SOC_DAIFMT_NB_NF:
+		break;
+	default:
+		dev_err(cs35l56->dev, "Invalid clock invert\n");
+		return -EINVAL;
+	}
+
+	regmap_update_bits(cs35l56->regmap,
+			   CS35L56_ASP1_CONTROL2,
+			   CS35L56_ASP_FMT_MASK |
+			   CS35L56_ASP_BCLK_INV_MASK | CS35L56_ASP_FSYNC_INV_MASK,
+			   val);
+
+	/* Hi-Z DOUT in unused slots and when all TX are disabled */
+	regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_CONTROL3,
+			   CS35L56_ASP1_DOUT_HIZ_CTRL_MASK,
+			   CS35L56_ASP_UNUSED_HIZ_OFF_HIZ);
+
+	return 0;
+}
+
+static void cs35l56_set_asp_slot_positions(struct cs35l56_private *cs35l56,
+					   unsigned int reg, unsigned long mask)
+{
+	unsigned int reg_val, channel_shift;
+	int bit_num;
+
+	/* Init all slots to 63 */
+	switch (reg) {
+	case CS35L56_ASP1_FRAME_CONTROL1:
+		reg_val = 0x3f3f3f3f;
+		break;
+	case CS35L56_ASP1_FRAME_CONTROL5:
+		reg_val = 0x3f3f3f;
+		break;
+	}
+
+	/* Enable consecutive TX1..TXn for each of the slots set in mask */
+	channel_shift = 0;
+	for_each_set_bit(bit_num, &mask, 32) {
+		reg_val &= ~(0x3f << channel_shift);
+		reg_val |= bit_num << channel_shift;
+		channel_shift += 8;
+	}
+
+	regmap_write(cs35l56->regmap, reg, reg_val);
+}
+
+static int cs35l56_asp_dai_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
+					unsigned int rx_mask, int slots, int slot_width)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+
+	if ((slots == 0) || (slot_width == 0)) {
+		dev_dbg(cs35l56->dev, "tdm config cleared\n");
+		cs35l56->asp_slot_width = 0;
+		cs35l56->asp_slot_count = 0;
+		return 0;
+	}
+
+	if (slot_width > (CS35L56_ASP_RX_WIDTH_MASK >> CS35L56_ASP_RX_WIDTH_SHIFT)) {
+		dev_err(cs35l56->dev, "tdm invalid slot width %d\n", slot_width);
+		return -EINVAL;
+	}
+
+	/* More than 32 slots would give an unsupportable BCLK frequency */
+	if (slots > 32) {
+		dev_err(cs35l56->dev, "tdm invalid slot count %d\n", slots);
+		return -EINVAL;
+	}
+
+	cs35l56->asp_slot_width = (u8)slot_width;
+	cs35l56->asp_slot_count = (u8)slots;
+
+	// Note: rx/tx is from point of view of the CPU end
+	if (tx_mask == 0)
+		tx_mask = 0x3;	// ASPRX1/RX2 in slots 0 and 1
+
+	if (rx_mask == 0)
+		rx_mask = 0xf;	// ASPTX1..TX4 in slots 0..3
+
+	cs35l56_set_asp_slot_positions(cs35l56, CS35L56_ASP1_FRAME_CONTROL1, rx_mask);
+	cs35l56_set_asp_slot_positions(cs35l56, CS35L56_ASP1_FRAME_CONTROL5, tx_mask);
+
+	dev_dbg(cs35l56->dev, "tdm slot width: %u count: %u tx_mask: %#x rx_mask: %#x\n",
+		cs35l56->asp_slot_width, cs35l56->asp_slot_count, tx_mask, rx_mask);
+
+	return 0;
+}
+
+static int cs35l56_asp_dai_hw_params(struct snd_pcm_substream *substream,
+				     struct snd_pcm_hw_params *params,
+				     struct snd_soc_dai *dai)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	unsigned int rate = params_rate(params);
+	u8 asp_width, asp_wl;
+
+	asp_wl = params_width(params);
+	if (cs35l56->asp_slot_width)
+		asp_width = cs35l56->asp_slot_width;
+	else
+		asp_width = asp_wl;
+
+	dev_dbg(cs35l56->dev, "%s: wl=%d, width=%d, rate=%d", __func__, asp_wl, asp_width, rate);
+
+	if (!cs35l56->sysclk_set) {
+		unsigned int slots = cs35l56->asp_slot_count;
+		unsigned int bclk_freq;
+		int freq_id;
+
+		if (slots == 0) {
+			slots = params_channels(params);
+
+			/* I2S always has an even number of slots */
+			if (!cs35l56->tdm_mode)
+				slots = round_up(slots, 2);
+		}
+
+		bclk_freq = asp_width * slots * rate;
+		freq_id = cs35l56_get_bclk_freq_id(bclk_freq);
+		if (freq_id < 0) {
+			dev_err(cs35l56->dev, "%s: Invalid BCLK %u\n", __func__, bclk_freq);
+			return -EINVAL;
+		}
+
+		regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_CONTROL1,
+				   CS35L56_ASP_BCLK_FREQ_MASK,
+				   freq_id << CS35L56_ASP_BCLK_FREQ_SHIFT);
+	}
+
+	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
+		regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_CONTROL2,
+				   CS35L56_ASP_RX_WIDTH_MASK, asp_width <<
+				   CS35L56_ASP_RX_WIDTH_SHIFT);
+		regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_DATA_CONTROL5,
+				   CS35L56_ASP_RX_WL_MASK, asp_wl);
+	} else {
+		regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_CONTROL2,
+				   CS35L56_ASP_TX_WIDTH_MASK, asp_width <<
+				   CS35L56_ASP_TX_WIDTH_SHIFT);
+		regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_DATA_CONTROL1,
+				   CS35L56_ASP_TX_WL_MASK, asp_wl);
+	}
+
+	return 0;
+}
+
+static int cs35l56_asp_dai_set_sysclk(struct snd_soc_dai *dai,
+				      int clk_id, unsigned int freq, int dir)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	int freq_id;
+
+	if (freq == 0) {
+		cs35l56->sysclk_set = false;
+		return 0;
+	}
+
+	freq_id = cs35l56_get_bclk_freq_id(freq);
+	if (freq_id < 0)
+		return freq_id;
+
+	regmap_update_bits(cs35l56->regmap, CS35L56_ASP1_CONTROL1,
+			   CS35L56_ASP_BCLK_FREQ_MASK,
+			   freq_id << CS35L56_ASP_BCLK_FREQ_SHIFT);
+	cs35l56->sysclk_set = true;
+
+	return 0;
+}
+
+static int cs35l56_mute_stream(struct snd_soc_dai *dai, int mute, int stream)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	unsigned int val;
+	int ret;
+
+	dev_dbg(cs35l56->dev, "%s: %d %s\n", __func__, stream, mute ? "mute" : "unmute");
+
+	if (stream != SNDRV_PCM_STREAM_PLAYBACK)
+		return 0;
+
+	if (mute) {
+		ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_AUDIO_PAUSE);
+	} else {
+		ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_AUDIO_PLAY);
+		if (ret == 0) {
+			/* Wait for firmware to enter PS0 power state */
+			ret = regmap_read_poll_timeout(cs35l56->regmap,
+						       CS35L56_TRANSDUCER_ACTUAL_PS,
+						       val, (val == CS35L56_PS0),
+						       CS35L56_PS0_POLL_US,
+						       CS35L56_PS0_TIMEOUT_US);
+			if (ret)
+				dev_err(cs35l56->dev, "PS0 wait failed: %d\n", ret);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+static const struct snd_soc_dai_ops cs35l56_ops = {
+	.set_fmt = cs35l56_asp_dai_set_fmt,
+	.set_tdm_slot = cs35l56_asp_dai_set_tdm_slot,
+	.hw_params = cs35l56_asp_dai_hw_params,
+	.set_sysclk = cs35l56_asp_dai_set_sysclk,
+	.mute_stream = cs35l56_mute_stream,
+};
+
+static void cs35l56_sdw_dai_shutdown(struct snd_pcm_substream *substream,
+				     struct snd_soc_dai *dai)
+{
+	snd_soc_dai_set_dma_data(dai, substream, NULL);
+}
+
+static int cs35l56_sdw_dai_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
+					unsigned int rx_mask, int slots, int slot_width)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+
+	/* rx/tx are from point of view of the CPU end so opposite to our rx/tx */
+	cs35l56->rx_mask = tx_mask;
+	cs35l56->tx_mask = rx_mask;
+
+	return 0;
+}
+
+static int cs35l56_sdw_dai_hw_params(struct snd_pcm_substream *substream,
+				     struct snd_pcm_hw_params *params,
+				     struct snd_soc_dai *dai)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	struct sdw_stream_runtime *sdw_stream = snd_soc_dai_get_dma_data(dai, substream);
+	struct sdw_stream_config sconfig;
+	struct sdw_port_config pconfig;
+	int ret;
+
+	dev_dbg(cs35l56->dev, "%s: rate %d\n", __func__, params_rate(params));
+
+	if (!cs35l56->init_done)
+		return -ENODEV;
+
+	if (!sdw_stream)
+		return -EINVAL;
+
+	memset(&sconfig, 0, sizeof(sconfig));
+	memset(&pconfig, 0, sizeof(pconfig));
+
+	sconfig.frame_rate = params_rate(params);
+	sconfig.bps = snd_pcm_format_width(params_format(params));
+
+	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
+		sconfig.direction = SDW_DATA_DIR_RX;
+		pconfig.num = CS35L56_SDW1_PLAYBACK_PORT;
+		pconfig.ch_mask = cs35l56->rx_mask;
+	} else {
+		sconfig.direction = SDW_DATA_DIR_TX;
+		pconfig.num = CS35L56_SDW1_CAPTURE_PORT;
+		pconfig.ch_mask = cs35l56->tx_mask;
+	}
+
+	if (pconfig.ch_mask == 0) {
+		sconfig.ch_count = params_channels(params);
+		pconfig.ch_mask = GENMASK(sconfig.ch_count - 1, 0);
+	} else {
+		sconfig.ch_count = hweight32(pconfig.ch_mask);
+	}
+
+	ret = sdw_stream_add_slave(cs35l56->sdw_peripheral, &sconfig, &pconfig,
+				   1, sdw_stream);
+	if (ret) {
+		dev_err(dai->dev, "Failed to add sdw stream: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cs35l56_sdw_dai_hw_free(struct snd_pcm_substream *substream,
+				   struct snd_soc_dai *dai)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	struct sdw_stream_runtime *sdw_stream = snd_soc_dai_get_dma_data(dai, substream);
+
+	if (!cs35l56->sdw_peripheral)
+		return -EINVAL;
+
+	sdw_stream_remove_slave(cs35l56->sdw_peripheral, sdw_stream);
+
+	return 0;
+}
+
+static int cs35l56_sdw_dai_set_stream(struct snd_soc_dai *dai,
+				      void *sdw_stream, int direction)
+{
+	if (!sdw_stream)
+		return 0;
+
+	snd_soc_dai_dma_data_set(dai, direction, sdw_stream);
+
+	return 0;
+}
+
+static const struct snd_soc_dai_ops cs35l56_sdw_dai_ops = {
+	.set_tdm_slot = cs35l56_sdw_dai_set_tdm_slot,
+	.shutdown = cs35l56_sdw_dai_shutdown,
+	.hw_params = cs35l56_sdw_dai_hw_params,
+	.hw_free = cs35l56_sdw_dai_hw_free,
+	.mute_stream = cs35l56_mute_stream,
+	.set_stream = cs35l56_sdw_dai_set_stream,
+};
+
+static struct snd_soc_dai_driver cs35l56_dai[] = {
+	{
+		.name = "cs35l56-asp1",
+		.id = 0,
+		.playback = {
+			.stream_name = "ASP1 Playback",
+			.channels_min = 1,
+			.channels_max = 2,
+			.rates = CS35L56_RATES,
+			.formats = CS35L56_RX_FORMATS,
+		},
+		.capture = {
+			.stream_name = "ASP1 Capture",
+			.channels_min = 1,
+			.channels_max = 4,
+			.rates = CS35L56_RATES,
+			.formats = CS35L56_TX_FORMATS,
+		},
+		.ops = &cs35l56_ops,
+		.symmetric_rate = 1,
+		.symmetric_sample_bits = 1,
+	},
+	{
+		.name = "cs35l56-sdw1",
+		.id = 1,
+		.playback = {
+			.stream_name = "SDW1 Playback",
+			.channels_min = 1,
+			.channels_max = 2,
+			.rates = CS35L56_RATES,
+			.formats = CS35L56_RX_FORMATS,
+		},
+		.capture = {
+			.stream_name = "SDW1 Capture",
+			.channels_min = 1,
+			.channels_max = 6,
+			.rates = CS35L56_RATES,
+			.formats = CS35L56_TX_FORMATS,
+		},
+		.symmetric_rate = 1,
+		.ops = &cs35l56_sdw_dai_ops,
+	}
+};
+
+static int cs35l56_wait_for_firmware_boot(struct cs35l56_private *cs35l56)
+{
+	unsigned int reg;
+	unsigned int val;
+	int ret;
+
+	if (cs35l56->rev < CS35L56_REVID_B0)
+		reg = CS35L56_DSP1_HALO_STATE_A1;
+	else
+		reg = CS35L56_DSP1_HALO_STATE;
+
+	ret = regmap_read_poll_timeout(cs35l56->regmap, reg,
+				       val,
+				       (val < 0xFFFF) && (val >= CS35L56_HALO_STATE_BOOT_DONE),
+				       CS35L56_HALO_STATE_POLL_US,
+				       CS35L56_HALO_STATE_TIMEOUT_US);
+
+	if ((ret < 0) && (ret != -ETIMEDOUT)) {
+		dev_err(cs35l56->dev, "Failed to read HALO_STATE: %d\n", ret);
+		return ret;
+	}
+
+	if ((ret == -ETIMEDOUT) || (val != CS35L56_HALO_STATE_BOOT_DONE)) {
+		dev_err(cs35l56->dev, "Firmware boot fail: HALO_STATE=%#x\n", val);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static const struct reg_sequence cs35l56_system_reset_seq[] = {
+	REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET),
+};
+
+static void cs35l56_system_reset(struct cs35l56_private *cs35l56)
+{
+	cs35l56->soft_resetting = true;
+
+	/*
+	 * Must enter cache-only first so there can't be any more register
+	 * accesses other than the controlled system reset sequence below.
+	 */
+	regcache_cache_only(cs35l56->regmap, true);
+	regmap_multi_reg_write_bypassed(cs35l56->regmap,
+					cs35l56_system_reset_seq,
+					ARRAY_SIZE(cs35l56_system_reset_seq));
+
+	/* On SoundWire the registers won't be accessible until it re-enumerates. */
+	if (cs35l56->sdw_peripheral)
+		return;
+
+	usleep_range(CS35L56_CONTROL_PORT_READY_US, CS35L56_CONTROL_PORT_READY_US + 400);
+	regcache_cache_only(cs35l56->regmap, false);
+}
+
+static void cs35l56_dsp_work(struct work_struct *work)
+{
+	struct cs35l56_private *cs35l56 = container_of(work,
+						       struct cs35l56_private,
+						       dsp_work);
+	unsigned int reg;
+	unsigned int val;
+	int ret = 0;
+
+	if (!wait_for_completion_timeout(&cs35l56->init_completion,
+					 msecs_to_jiffies(5000))) {
+		dev_err(cs35l56->dev, "%s: init_completion timed out\n", __func__);
+		goto complete;
+	}
+
+	if (!cs35l56->init_done || cs35l56->removing)
+		goto complete;
+
+	cs35l56->dsp.part = devm_kasprintf(cs35l56->dev, GFP_KERNEL, "cs35l56%s-%02x",
+					   cs35l56->secured ? "s" : "", cs35l56->rev);
+
+	if (!cs35l56->dsp.part)
+		goto complete;
+
+	pm_runtime_get_sync(cs35l56->dev);
+
+	/*
+	 * Disable SoundWire interrupts to prevent race with IRQ work.
+	 * Setting sdw_irq_no_unmask prevents the handler re-enabling
+	 * the SoundWire interrupt.
+	 */
+	if (cs35l56->sdw_peripheral) {
+		cs35l56->sdw_irq_no_unmask = true;
+		cancel_work_sync(&cs35l56->sdw_irq_work);
+		sdw_write_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_MASK_1, 0);
+		sdw_read_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_STAT_1);
+		sdw_write_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_STAT_1, 0xFF);
+	}
+
+	ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_SHUTDOWN);
+	if (ret) {
+		dev_dbg(cs35l56->dev, "%s: CS35L56_MBOX_CMD_SHUTDOWN ret %d\n", __func__, ret);
+		goto err;
+	}
+
+	if (cs35l56->rev < CS35L56_REVID_B0)
+		reg = CS35L56_DSP1_PM_CUR_STATE_A1;
+	else
+		reg = CS35L56_DSP1_PM_CUR_STATE;
+
+	ret = regmap_read_poll_timeout(cs35l56->regmap, reg,
+				       val, (val == CS35L56_HALO_STATE_SHUTDOWN),
+				       CS35L56_HALO_STATE_POLL_US,
+				       CS35L56_HALO_STATE_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(cs35l56->dev, "Failed to poll PM_CUR_STATE to 1 is %d (ret %d)\n",
+			val, ret);
+
+	/* Use wm_adsp to load and apply the firmware patch and coefficient files */
+	ret = wm_adsp_power_up(&cs35l56->dsp);
+	if (ret) {
+		dev_dbg(cs35l56->dev, "%s: wm_adsp_power_up ret %d\n", __func__, ret);
+		goto err;
+	}
+
+	if (cs35l56->removing)
+		goto err;
+
+	mutex_lock(&cs35l56->irq_lock);
+
+	init_completion(&cs35l56->init_completion);
+
+	cs35l56_system_reset(cs35l56);
+
+	if (cs35l56->sdw_peripheral) {
+		/*
+		 * The system-reset causes the CS35L56 to detach from the bus.
+		 * Wait for the manager to re-enumerate the CS35L56 and
+		 * cs35l56_init() to run again.
+		 */
+		if (!wait_for_completion_timeout(&cs35l56->init_completion,
+						 msecs_to_jiffies(5000))) {
+			dev_err(cs35l56->dev, "%s: init_completion timed out (SDW)\n", __func__);
+			goto err_unlock;
+		}
+	} else if (cs35l56_init(cs35l56)) {
+		goto err_unlock;
+	}
+
+	cs35l56->fw_patched = true;
+
+err_unlock:
+	mutex_unlock(&cs35l56->irq_lock);
+err:
+	pm_runtime_mark_last_busy(cs35l56->dev);
+	pm_runtime_put_autosuspend(cs35l56->dev);
+
+	/* Re-enable SoundWire interrupts */
+	if (cs35l56->sdw_peripheral) {
+		cs35l56->sdw_irq_no_unmask = false;
+		sdw_write_no_pm(cs35l56->sdw_peripheral, CS35L56_SDW_GEN_INT_MASK_1,
+				CS35L56_SDW_INT_MASK_CODEC_IRQ);
+	}
+
+complete:
+	complete_all(&cs35l56->dsp_ready_completion);
+}
+
+static int cs35l56_component_probe(struct snd_soc_component *component)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+	struct dentry *debugfs_root = component->debugfs_root;
+
+	BUILD_BUG_ON(ARRAY_SIZE(cs35l56_tx_input_texts) != ARRAY_SIZE(cs35l56_tx_input_values));
+
+	cs35l56->removing = false;
+	cs35l56->component = component;
+	wm_adsp2_component_probe(&cs35l56->dsp, component);
+
+	debugfs_create_bool("init_done", 0444, debugfs_root, &cs35l56->init_done);
+	debugfs_create_bool("can_hibernate", 0444, debugfs_root, &cs35l56->can_hibernate);
+	debugfs_create_bool("fw_patched", 0444, debugfs_root, &cs35l56->fw_patched);
+
+	queue_work(cs35l56->dsp_wq, &cs35l56->dsp_work);
+
+	return 0;
+}
+
+static void cs35l56_component_remove(struct snd_soc_component *component)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+
+	cs35l56->removing = true;
+	complete(&cs35l56->init_completion);
+	cancel_work_sync(&cs35l56->dsp_work);
+}
+
+static int cs35l56_set_bias_level(struct snd_soc_component *component,
+				  enum snd_soc_bias_level level)
+{
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+	int ret = 0;
+
+	switch (level) {
+	case SND_SOC_BIAS_STANDBY:
+		/*
+		 * Wait for patching to complete when transitioning from
+		 * BIAS_OFF to BIAS_STANDBY
+		 */
+		if (snd_soc_component_get_bias_level(component) == SND_SOC_BIAS_OFF)
+			ret = cs35l56_wait_dsp_ready(cs35l56);
+
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static const struct snd_soc_component_driver soc_component_dev_cs35l56 = {
+	.probe = cs35l56_component_probe,
+	.remove = cs35l56_component_remove,
+
+	.dapm_widgets = cs35l56_dapm_widgets,
+	.num_dapm_widgets = ARRAY_SIZE(cs35l56_dapm_widgets),
+	.dapm_routes = cs35l56_audio_map,
+	.num_dapm_routes = ARRAY_SIZE(cs35l56_audio_map),
+	.controls = cs35l56_controls,
+	.num_controls = ARRAY_SIZE(cs35l56_controls),
+
+	.set_bias_level = cs35l56_set_bias_level,
+};
+
+static const struct reg_sequence cs35l56_hibernate_seq[] = {
+	/* This must be the last register access */
+	REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_HIBERNATE_NOW),
+};
+
+static const struct reg_sequence cs35l56_hibernate_wake_seq[] = {
+	REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_WAKEUP),
+};
+
+int cs35l56_runtime_suspend(struct device *dev)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
+	unsigned int val;
+	int ret;
+
+	if (!cs35l56->init_done)
+		return 0;
+
+	/* Firmware must have entered a power-save state */
+	ret = regmap_read_poll_timeout(cs35l56->regmap,
+				       CS35L56_TRANSDUCER_ACTUAL_PS,
+				       val, (val >= CS35L56_PS3),
+				       CS35L56_PS3_POLL_US,
+				       CS35L56_PS3_TIMEOUT_US);
+	if (ret)
+		dev_warn(cs35l56->dev, "PS3 wait failed: %d\n", ret);
+
+	/* Clear BOOT_DONE so it can be used to detect a reboot */
+	regmap_write(cs35l56->regmap, CS35L56_IRQ1_EINT_4, CS35L56_OTP_BOOT_DONE_MASK);
+
+	if (!cs35l56->can_hibernate) {
+		regcache_cache_only(cs35l56->regmap, true);
+		dev_dbg(dev, "Suspended: no hibernate");
+
+		return 0;
+	}
+
+	/*
+	 * Enable auto-hibernate. If it is woken by some other wake source
+	 * it will automatically return to hibernate.
+	 */
+	ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_ALLOW_AUTO_HIBERNATE);
+	if (ret)
+		dev_warn(cs35l56->dev, "ALLOW_HIBERNATE failed: %d\n", ret);
+
+	/*
+	 * Must enter cache-only first so there can't be any more register
+	 * accesses other than the controlled hibernate sequence below.
+	 */
+	regcache_cache_only(cs35l56->regmap, true);
+
+	regmap_multi_reg_write_bypassed(cs35l56->regmap,
+					cs35l56_hibernate_seq,
+					ARRAY_SIZE(cs35l56_hibernate_seq));
+
+	dev_dbg(dev, "Suspended: hibernate");
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_runtime_suspend, SND_SOC_CS35L56_CORE);
+
+static int __maybe_unused cs35l56_runtime_resume_i2c_spi(struct device *dev)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
+
+	if (!cs35l56->init_done)
+		return 0;
+
+	return cs35l56_runtime_resume_common(cs35l56);
+}
+
+int cs35l56_runtime_resume_common(struct cs35l56_private *cs35l56)
+{
+	unsigned int val;
+	int ret;
+
+	if (!cs35l56->can_hibernate) {
+		regcache_cache_only(cs35l56->regmap, false);
+		goto out_sync;
+	}
+
+	if (!cs35l56->sdw_peripheral) {
+		/*
+		 * Dummy transaction to trigger I2C/SPI auto-wake. This will NAK on I2C.
+		 * Must be done before releasing cache-only.
+		 */
+		regmap_multi_reg_write_bypassed(cs35l56->regmap,
+						cs35l56_hibernate_wake_seq,
+						ARRAY_SIZE(cs35l56_hibernate_wake_seq));
+
+		usleep_range(CS35L56_CONTROL_PORT_READY_US,
+			     CS35L56_CONTROL_PORT_READY_US + 400);
+	}
+
+	regcache_cache_only(cs35l56->regmap, false);
+
+	ret = cs35l56_wait_for_firmware_boot(cs35l56);
+	if (ret) {
+		dev_err(cs35l56->dev, "Hibernate wake failed: %d\n", ret);
+		goto err;
+	}
+
+	ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_PREVENT_AUTO_HIBERNATE);
+	if (ret)
+		goto err;
+
+out_sync:
+	/* BOOT_DONE will be 1 if the amp reset */
+	regmap_read(cs35l56->regmap, CS35L56_IRQ1_EINT_4, &val);
+	if (val & CS35L56_OTP_BOOT_DONE_MASK) {
+		dev_dbg(cs35l56->dev, "Registers reset in suspend\n");
+		regcache_mark_dirty(cs35l56->regmap);
+	}
+
+	regcache_sync(cs35l56->regmap);
+
+	dev_dbg(cs35l56->dev, "Resumed");
+
+	return 0;
+
+err:
+	regmap_write(cs35l56->regmap, CS35L56_DSP_VIRTUAL1_MBOX_1,
+		     CS35L56_MBOX_CMD_HIBERNATE_NOW);
+
+	regcache_cache_only(cs35l56->regmap, true);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_runtime_resume_common, SND_SOC_CS35L56_CORE);
+
+static int cs35l56_dsp_init(struct cs35l56_private *cs35l56)
+{
+	struct wm_adsp *dsp;
+	int ret;
+
+	cs35l56->dsp_wq = create_singlethread_workqueue("cs35l56-dsp");
+	if (!cs35l56->dsp_wq)
+		return -ENOMEM;
+
+	INIT_WORK(&cs35l56->dsp_work, cs35l56_dsp_work);
+	init_completion(&cs35l56->dsp_ready_completion);
+
+	dsp = &cs35l56->dsp;
+	dsp->part = "cs35l56";
+	dsp->cs_dsp.num = 1;
+	dsp->cs_dsp.type = WMFW_HALO;
+	dsp->cs_dsp.rev = 0;
+	dsp->fw = 12;
+	dsp->cs_dsp.dev = cs35l56->dev;
+	dsp->cs_dsp.regmap = cs35l56->regmap;
+	dsp->cs_dsp.base = CS35L56_DSP1_CORE_BASE;
+	dsp->cs_dsp.base_sysinfo = CS35L56_DSP1_SYS_INFO_ID;
+	dsp->cs_dsp.mem = cs35l56_dsp1_regions;
+	dsp->cs_dsp.num_mems = ARRAY_SIZE(cs35l56_dsp1_regions);
+	dsp->cs_dsp.no_core_startstop = true;
+	dsp->wmfw_optional = true;
+
+	dev_dbg(cs35l56->dev, "DSP system name: '%s'\n", dsp->system_name);
+
+	ret = wm_halo_init(dsp);
+	if (ret != 0) {
+		dev_err(cs35l56->dev, "wm_halo_init failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cs35l56_acpi_get_name(struct cs35l56_private *cs35l56)
+{
+	acpi_handle handle = ACPI_HANDLE(cs35l56->dev);
+	const char *sub;
+
+	/* If there is no ACPI_HANDLE, there is no ACPI for this system, return 0 */
+	if (!handle)
+		return 0;
+
+	sub = acpi_get_subsystem_id(handle);
+	if (IS_ERR(sub)) {
+		/* If bad ACPI, return 0 and fallback to legacy firmware path, otherwise fail */
+		if (PTR_ERR(sub) == -ENODATA)
+			return 0;
+		else
+			return PTR_ERR(sub);
+	}
+
+	cs35l56->dsp.system_name = sub;
+	dev_dbg(cs35l56->dev, "Subsystem ID: %s\n", cs35l56->dsp.system_name);
+
+	return 0;
+}
+
+int cs35l56_common_probe(struct cs35l56_private *cs35l56)
+{
+	int ret;
+
+	init_completion(&cs35l56->init_completion);
+	mutex_init(&cs35l56->irq_lock);
+
+	dev_set_drvdata(cs35l56->dev, cs35l56);
+
+	cs35l56_fill_supply_names(cs35l56->supplies);
+	ret = devm_regulator_bulk_get(cs35l56->dev, ARRAY_SIZE(cs35l56->supplies),
+				      cs35l56->supplies);
+	if (ret != 0)
+		return dev_err_probe(cs35l56->dev, ret, "Failed to request supplies\n");
+
+	/* Reset could be controlled by the BIOS or shared by multiple amps */
+	cs35l56->reset_gpio = devm_gpiod_get_optional(cs35l56->dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(cs35l56->reset_gpio)) {
+		ret = PTR_ERR(cs35l56->reset_gpio);
+		/*
+		 * If RESET is shared the first amp to probe will grab the reset
+		 * line and reset all the amps
+		 */
+		if (ret != -EBUSY)
+			return dev_err_probe(cs35l56->dev, ret, "Failed to get reset GPIO\n");
+
+		dev_info(cs35l56->dev, "Reset GPIO busy, assume shared reset\n");
+		cs35l56->reset_gpio = NULL;
+	}
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(cs35l56->supplies), cs35l56->supplies);
+	if (ret != 0)
+		return dev_err_probe(cs35l56->dev, ret, "Failed to enable supplies\n");
+
+	if (cs35l56->reset_gpio) {
+		/* satisfy minimum reset pulse width spec */
+		usleep_range(CS35L56_RESET_PULSE_MIN_US,
+			     CS35L56_RESET_PULSE_MIN_US + 400);
+		gpiod_set_value_cansleep(cs35l56->reset_gpio, 1);
+	}
+
+	ret = cs35l56_acpi_get_name(cs35l56);
+	if (ret != 0)
+		goto err;
+
+	ret = cs35l56_dsp_init(cs35l56);
+	if (ret < 0) {
+		dev_err_probe(cs35l56->dev, ret, "DSP init failed\n");
+		goto err;
+	}
+
+	ret = devm_snd_soc_register_component(cs35l56->dev,
+					      &soc_component_dev_cs35l56,
+					      cs35l56_dai, ARRAY_SIZE(cs35l56_dai));
+	if (ret < 0) {
+		dev_err_probe(cs35l56->dev, ret, "Register codec failed\n");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	gpiod_set_value_cansleep(cs35l56->reset_gpio, 0);
+	regulator_bulk_disable(ARRAY_SIZE(cs35l56->supplies), cs35l56->supplies);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_common_probe, SND_SOC_CS35L56_CORE);
+
+int cs35l56_init(struct cs35l56_private *cs35l56)
+{
+	int ret;
+	unsigned int devid, revid, otpid, secured;
+
+	/*
+	 * Check whether the actions associated with soft reset or one time
+	 * init need to be performed.
+	 */
+	if (cs35l56->soft_resetting)
+		goto post_soft_reset;
+
+	if (cs35l56->init_done)
+		return 0;
+
+	pm_runtime_set_autosuspend_delay(cs35l56->dev, 100);
+	pm_runtime_use_autosuspend(cs35l56->dev);
+	pm_runtime_set_active(cs35l56->dev);
+	pm_runtime_enable(cs35l56->dev);
+
+	/*
+	 * If the system is not using a reset_gpio then issue a
+	 * dummy read to force a wakeup.
+	 */
+	if (!cs35l56->reset_gpio)
+		regmap_read(cs35l56->regmap, CS35L56_DSP_VIRTUAL1_MBOX_1, &devid);
+
+	/* Wait for control port to be ready (datasheet tIRS). */
+	usleep_range(CS35L56_CONTROL_PORT_READY_US,
+		     CS35L56_CONTROL_PORT_READY_US + 400);
+
+	/*
+	 * The HALO_STATE register is in different locations on Ax and B0
+	 * devices so the REVID needs to be determined before waiting for the
+	 * firmware to boot.
+	 */
+	ret = regmap_read(cs35l56->regmap, CS35L56_REVID, &revid);
+	if (ret < 0) {
+		dev_err(cs35l56->dev, "Get Revision ID failed\n");
+		return ret;
+	}
+	cs35l56->rev = revid & (CS35L56_AREVID_MASK | CS35L56_MTLREVID_MASK);
+
+	ret = cs35l56_wait_for_firmware_boot(cs35l56);
+	if (ret)
+		return ret;
+
+	ret = regmap_read(cs35l56->regmap, CS35L56_DEVID, &devid);
+	if (ret < 0) {
+		dev_err(cs35l56->dev, "Get Device ID failed\n");
+		return ret;
+	}
+	devid &= CS35L56_DEVID_MASK;
+
+	switch (devid) {
+	case 0x35A56:
+		break;
+	default:
+		dev_err(cs35l56->dev, "Unknown device %x\n", devid);
+		return ret;
+	}
+
+	ret = regmap_read(cs35l56->regmap, CS35L56_DSP_RESTRICT_STS1, &secured);
+	if (ret) {
+		dev_err(cs35l56->dev, "Get Secure status failed\n");
+		return ret;
+	}
+
+	/* When any bus is restricted treat the device as secured */
+	if (secured & CS35L56_RESTRICTED_MASK)
+		cs35l56->secured = true;
+
+	ret = regmap_read(cs35l56->regmap, CS35L56_OTPID, &otpid);
+	if (ret < 0) {
+		dev_err(cs35l56->dev, "Get OTP ID failed\n");
+		return ret;
+	}
+
+	dev_info(cs35l56->dev, "Cirrus Logic CS35L56%s Rev %02X OTP%d\n",
+		 cs35l56->secured ? "s" : "", cs35l56->rev, otpid);
+
+	cs35l56_patch(cs35l56->dev, cs35l56->regmap, cs35l56->rev);
+
+	/* Wake source interrupts default to unmasked, so mask them */
+	regmap_write(cs35l56->regmap, CS35L56_IRQ1_MASK_20, 0xffffffff);
+	regmap_update_bits(cs35l56->regmap, CS35L56_IRQ1_MASK_1,
+			   CS35L56_AMP_SHORT_ERR_EINT1_MASK,
+			   0);
+	regmap_update_bits(cs35l56->regmap, CS35L56_IRQ1_MASK_8,
+			   CS35L56_TEMP_ERR_EINT1_MASK,
+			   0);
+
+	if (!cs35l56->reset_gpio) {
+		dev_dbg(cs35l56->dev, "No reset gpio: using soft reset\n");
+		cs35l56_system_reset(cs35l56);
+		if (cs35l56->sdw_peripheral) {
+			/* Keep alive while we wait for re-enumeration */
+			pm_runtime_get_noresume(cs35l56->dev);
+			return 0;
+		}
+	}
+
+post_soft_reset:
+	if (cs35l56->soft_resetting) {
+		cs35l56->soft_resetting = false;
+
+		/* Done re-enumerating after one-time init so release the keep-alive */
+		if (cs35l56->sdw_peripheral && !cs35l56->init_done)
+			pm_runtime_put_noidle(cs35l56->dev);
+
+		regcache_mark_dirty(cs35l56->regmap);
+		ret = cs35l56_wait_for_firmware_boot(cs35l56);
+		if (ret)
+			return ret;
+
+		dev_dbg(cs35l56->dev, "Firmware rebooted after soft reset\n");
+	}
+
+	/* Disable auto-hibernate so that runtime_pm has control */
+	ret = cs35l56_mbox_send(cs35l56, CS35L56_MBOX_CMD_PREVENT_AUTO_HIBERNATE);
+	if (ret)
+		return ret;
+
+	/* Populate soft registers in the regmap cache */
+	cs35l56_reread_firmware_registers(cs35l56->dev, cs35l56->regmap);
+
+	/* Registers could be dirty after soft reset or SoundWire enumeration */
+	regcache_sync(cs35l56->regmap);
+
+	cs35l56->init_done = true;
+	complete(&cs35l56->init_completion);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_init, SND_SOC_CS35L56_CORE);
+
+int cs35l56_remove(struct cs35l56_private *cs35l56)
+{
+	cs35l56->init_done = false;
+
+	/*
+	 * WAKE IRQs unmask if CS35L56 hibernates so free the handler to
+	 * prevent it racing with remove().
+	 */
+	if (cs35l56->irq)
+		devm_free_irq(cs35l56->dev, cs35l56->irq, cs35l56);
+
+	flush_workqueue(cs35l56->dsp_wq);
+	destroy_workqueue(cs35l56->dsp_wq);
+
+	pm_runtime_suspend(cs35l56->dev);
+	pm_runtime_disable(cs35l56->dev);
+
+	regcache_cache_only(cs35l56->regmap, true);
+
+	kfree(cs35l56->dsp.system_name);
+
+	gpiod_set_value_cansleep(cs35l56->reset_gpio, 0);
+	regulator_bulk_disable(ARRAY_SIZE(cs35l56->supplies), cs35l56->supplies);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_remove, SND_SOC_CS35L56_CORE);
+
+const struct dev_pm_ops cs35l56_pm_ops_i2c_spi = {
+	SET_RUNTIME_PM_OPS(cs35l56_runtime_suspend, cs35l56_runtime_resume_i2c_spi, NULL)
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_pm_ops_i2c_spi, SND_SOC_CS35L56_CORE);
+
+MODULE_DESCRIPTION("ASoC CS35L56 driver");
+MODULE_IMPORT_NS(SND_SOC_CS35L56_SHARED);
+MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>");
+MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs35l56.h b/sound/soc/codecs/cs35l56.h
new file mode 100644
index 000000000000..efc4b99180f9
--- /dev/null
+++ b/sound/soc/codecs/cs35l56.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Driver for Cirrus Logic CS35L56 smart amp
+ *
+ * Copyright (C) 2023 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef CS35L56_H
+#define CS35L56_H
+
+#include <linux/completion.h>
+#include <linux/regulator/consumer.h>
+#include <linux/pm_runtime.h>
+#include <linux/workqueue.h>
+#include <sound/cs35l56.h>
+#include "wm_adsp.h"
+
+#define CS35L56_SDW_GEN_INT_STAT_1	0xc0
+#define CS35L56_SDW_GEN_INT_MASK_1	0xc1
+#define CS35L56_SDW_INT_MASK_CODEC_IRQ	BIT(0)
+
+#define CS35L56_SDW_INVALID_BUS_SCALE	0xf
+
+#define CS35L56_RX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE)
+#define CS35L56_TX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE \
+			    | SNDRV_PCM_FMTBIT_S32_LE)
+
+#define CS35L56_RATES (SNDRV_PCM_RATE_48000)
+
+struct sdw_slave;
+
+struct cs35l56_private {
+	struct wm_adsp dsp; /* must be first member */
+	struct work_struct dsp_work;
+	struct workqueue_struct *dsp_wq;
+	struct completion dsp_ready_completion;
+	struct mutex irq_lock;
+	struct snd_soc_component *component;
+	struct device *dev;
+	struct regmap *regmap;
+	struct regulator_bulk_data supplies[CS35L56_NUM_BULK_SUPPLIES];
+	int irq;
+	struct sdw_slave *sdw_peripheral;
+	u8 rev;
+	struct work_struct sdw_irq_work;
+	bool secured;
+	bool sdw_irq_no_unmask;
+	bool soft_resetting;
+	bool init_done;
+	bool sdw_attached;
+	bool removing;
+	bool fw_patched;
+	bool can_hibernate;
+	struct completion init_completion;
+	struct gpio_desc *reset_gpio;
+
+	u32 rx_mask;
+	u32 tx_mask;
+	u8 asp_slot_width;
+	u8 asp_slot_count;
+	bool tdm_mode;
+	bool sysclk_set;
+	u8 old_sdw_clock_scale;
+};
+
+extern const struct dev_pm_ops cs35l56_pm_ops_i2c_spi;
+
+int cs35l56_runtime_suspend(struct device *dev);
+int cs35l56_runtime_resume_common(struct cs35l56_private *cs35l56);
+irqreturn_t cs35l56_irq(int irq, void *data);
+int cs35l56_irq_request(struct cs35l56_private *cs35l56);
+int cs35l56_common_probe(struct cs35l56_private *cs35l56);
+int cs35l56_init(struct cs35l56_private *cs35l56);
+int cs35l56_remove(struct cs35l56_private *cs35l56);
+
+#endif /* ifndef CS35L56_H */
-- 
cgit v1.2.3


From cb3cdef33136baceada86ba2a21ba30cd53a9087 Mon Sep 17 00:00:00 2001
From: Jyri Sarha <jyri.sarha@intel.com>
Date: Tue, 21 Mar 2023 11:26:53 +0200
Subject: ASoC: SOF: ipc4: Add macros for chain-dma message bits

In the chained DMA mode, the firmware allocates buffers for the host
and link DMA, and takes care of copying data between host- and
link-DMA buffers in a low-latency thread. This is different to a
regular pipeline, no processing is allowed, and the connection between
host- and link DMA is handled with a dedicated IPC.

This patch exposes the macros needed to create the required IPC messages.

Signed-off-by: Jyri Sarha <jyri.sarha@intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230321092654.7292-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/ipc4/header.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h
index 49ff1558a171..78568abe2673 100644
--- a/include/sound/sof/ipc4/header.h
+++ b/include/sound/sof/ipc4/header.h
@@ -196,6 +196,35 @@ enum sof_ipc4_pipeline_state {
 #define SOF_IPC4_GLB_LOAD_LIBRARY_LIB_ID_SHIFT	16
 #define SOF_IPC4_GLB_LOAD_LIBRARY_LIB_ID(x)	((x) << SOF_IPC4_GLB_LOAD_LIBRARY_LIB_ID_SHIFT)
 
+/* chain dma ipc message */
+#define SOF_IPC4_GLB_CHAIN_DMA_HOST_ID_SHIFT	0
+#define SOF_IPC4_GLB_CHAIN_DMA_HOST_ID_MASK	GENMASK(4, 0)
+#define SOF_IPC4_GLB_CHAIN_DMA_HOST_ID(x)	(((x) << SOF_IPC4_GLB_CHAIN_DMA_HOST_ID_SHIFT) & \
+						 SOF_IPC4_GLB_CHAIN_DMA_HOST_ID_MASK)
+
+#define SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_SHIFT	8
+#define SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK	GENMASK(12, 8)
+#define SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(x)	(((x) << SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_SHIFT) & \
+						 SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK)
+
+#define SOF_IPC4_GLB_CHAIN_DMA_ALLOCATE_SHIFT	16
+#define SOF_IPC4_GLB_CHAIN_DMA_ALLOCATE_MASK	BIT(16)
+#define SOF_IPC4_GLB_CHAIN_DMA_ALLOCATE(x)	(((x) & 1) << SOF_IPC4_GLB_CHAIN_DMA_ALLOCATE_SHIFT)
+
+#define SOF_IPC4_GLB_CHAIN_DMA_ENABLE_SHIFT	17
+#define SOF_IPC4_GLB_CHAIN_DMA_ENABLE_MASK	BIT(17)
+#define SOF_IPC4_GLB_CHAIN_DMA_ENABLE(x)	(((x) & 1) << SOF_IPC4_GLB_CHAIN_DMA_ENABLE_SHIFT)
+
+#define SOF_IPC4_GLB_CHAIN_DMA_SCS_SHIFT	18
+#define SOF_IPC4_GLB_CHAIN_DMA_SCS_MASK		BIT(18)
+#define SOF_IPC4_GLB_CHAIN_DMA_SCS(x)		(((x) & 1) << SOF_IPC4_GLB_CHAIN_DMA_SCS_SHIFT)
+
+#define SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE_SHIFT 0
+#define SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE_MASK  GENMASK(24, 0)
+#define SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE(x)	   (((x) << \
+						     SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE_SHIFT) & \
+						    SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE_MASK)
+
 enum sof_ipc4_channel_config {
 	/* one channel only. */
 	SOF_IPC4_CHANNEL_CONFIG_MONO,
-- 
cgit v1.2.3


From ca5ce0caa67fa9eeecaa29d895c2e4c3151c159e Mon Sep 17 00:00:00 2001
From: Jyri Sarha <jyri.sarha@intel.com>
Date: Tue, 21 Mar 2023 11:26:54 +0200
Subject: ASoC: SOF: ipc4/intel: Add support for chained DMA

Add logic for setting up and tearing down chained DMA connections.

Since pipelines are not used, all the logic to set the pipeline states
can be bypassed, with only the DMA programming sequences remaining. In
addition the same format needs to be used for host- and link-DMA,
without the usual fixup to use the S32_LE format on the link.

Note however that for convenience and compatibility with existing
definitions, the topology relies on the concept of pipelines with a
'USE_CHAIN_DMA' token indicating that all the logic shall be bypassed.

Unlike 'normal' ALSA sequences, the chain DMA is not programmed in
hw_params/hw_free. The IPC message to set-up and tear-down chained DMA
are sent in sof_ipc4_trigger_pipelines(), but the contents prepared
earlier.

Chained DMA is only supported by the Intel HDA DAI for now, and only
S16_LE and S32_LE formats are supported for now.

Signed-off-by: Jyri Sarha <jyri.sarha@intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20230321092654.7292-4-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h   |   1 +
 sound/soc/sof/intel/hda-dai-ops.c |  18 +++++-
 sound/soc/sof/ipc4-pcm.c          | 122 ++++++++++++++++++++++++++++++++++++--
 sound/soc/sof/ipc4-topology.c     | 120 ++++++++++++++++++++++++++++++++++++-
 sound/soc/sof/ipc4-topology.h     |   2 +
 5 files changed, 255 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index bd02842124f9..bbc37877aaff 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -54,6 +54,7 @@
 #define SOF_TKN_SCHED_DYNAMIC_PIPELINE		206
 #define SOF_TKN_SCHED_LP_MODE			207
 #define SOF_TKN_SCHED_MEM_USAGE			208
+#define SOF_TKN_SCHED_USE_CHAIN_DMA		209
 
 /* volume */
 #define SOF_TKN_VOLUME_RAMP_STEP_TYPE		250
diff --git a/sound/soc/sof/intel/hda-dai-ops.c b/sound/soc/sof/intel/hda-dai-ops.c
index be109f33715f..de48f13259f1 100644
--- a/sound/soc/sof/intel/hda-dai-ops.c
+++ b/sound/soc/sof/intel/hda-dai-ops.c
@@ -277,6 +277,15 @@ static const struct hda_dai_widget_dma_ops hda_ipc4_dma_ops = {
 	.post_trigger = hda_ipc4_post_trigger
 };
 
+static const struct hda_dai_widget_dma_ops hda_ipc4_chain_dma_ops = {
+	.get_hext_stream = hda_get_hext_stream,
+	.assign_hext_stream = hda_assign_hext_stream,
+	.release_hext_stream = hda_release_hext_stream,
+	.setup_hext_stream = hda_setup_hext_stream,
+	.reset_hext_stream = hda_reset_hext_stream,
+	.trigger = hda_trigger,
+};
+
 static int hda_ipc3_post_trigger(struct snd_sof_dev *sdev, struct snd_soc_dai *cpu_dai,
 				 struct snd_pcm_substream *substream, int cmd)
 {
@@ -331,8 +340,15 @@ hda_select_dai_widget_ops(struct snd_sof_dev *sdev, struct snd_sof_widget *swidg
 	{
 		struct sof_ipc4_copier *ipc4_copier = sdai->private;
 
-		if (ipc4_copier->dai_type == SOF_DAI_INTEL_HDA)
+		if (ipc4_copier->dai_type == SOF_DAI_INTEL_HDA) {
+			struct snd_sof_widget *pipe_widget = swidget->spipe->pipe_widget;
+			struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+			if (pipeline->use_chain_dma)
+				return &hda_ipc4_chain_dma_ops;
+
 			return &hda_ipc4_dma_ops;
+		}
 		break;
 	}
 	default:
diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c
index 4598057b7f28..db64200ba1e5 100644
--- a/sound/soc/sof/ipc4-pcm.c
+++ b/sound/soc/sof/ipc4-pcm.c
@@ -193,6 +193,88 @@ sof_ipc4_update_pipeline_state(struct snd_sof_dev *sdev, int state, int cmd,
  * prepare ioctl before the START trigger.
  */
 
+/*
+ * Chained DMA is a special case where there is no processing on
+ * DSP. The samples are just moved over by host side DMA to a single
+ * buffer on DSP and directly from there to link DMA. However, the
+ * model on SOF driver has two notional pipelines, one at host DAI,
+ * and another at link DAI. They both shall have the use_chain_dma
+ * attribute.
+ */
+
+static int sof_ipc4_chain_dma_trigger(struct snd_sof_dev *sdev,
+				      struct snd_sof_pcm_stream_pipeline_list *pipeline_list,
+				      int state, int cmd)
+{
+	bool allocate, enable, set_fifo_size;
+	struct sof_ipc4_msg msg = {{ 0 }};
+	int i;
+
+	switch (state) {
+	case SOF_IPC4_PIPE_RUNNING: /* Allocate and start chained dma */
+		allocate = true;
+		enable = true;
+		/*
+		 * SOF assumes creation of a new stream from the presence of fifo_size
+		 * in the message, so we must leave it out in pause release case.
+		 */
+		if (cmd == SNDRV_PCM_TRIGGER_PAUSE_RELEASE)
+			set_fifo_size = false;
+		else
+			set_fifo_size = true;
+		break;
+	case SOF_IPC4_PIPE_PAUSED: /* Disable chained DMA. */
+		allocate = true;
+		enable = false;
+		set_fifo_size = false;
+		break;
+	case SOF_IPC4_PIPE_RESET: /* Disable and free chained DMA. */
+		allocate = false;
+		enable = false;
+		set_fifo_size = false;
+		break;
+	default:
+		dev_err(sdev->dev, "Unexpected state %d", state);
+		return -EINVAL;
+	}
+
+	msg.primary = SOF_IPC4_MSG_TYPE_SET(SOF_IPC4_GLB_CHAIN_DMA);
+	msg.primary |= SOF_IPC4_MSG_DIR(SOF_IPC4_MSG_REQUEST);
+	msg.primary |= SOF_IPC4_MSG_TARGET(SOF_IPC4_FW_GEN_MSG);
+
+	/*
+	 * To set-up the DMA chain, the host DMA ID and SCS setting
+	 * are retrieved from the host pipeline configuration. Likewise
+	 * the link DMA ID and fifo_size are retrieved from the link
+	 * pipeline configuration.
+	 */
+	for (i = 0; i < pipeline_list->count; i++) {
+		struct snd_sof_pipeline *spipe = pipeline_list->pipelines[i];
+		struct snd_sof_widget *pipe_widget = spipe->pipe_widget;
+		struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+		if (!pipeline->use_chain_dma) {
+			dev_err(sdev->dev,
+				"All pipelines in chained DMA stream should have use_chain_dma attribute set.");
+			return -EINVAL;
+		}
+
+		msg.primary |= pipeline->msg.primary;
+
+		/* Add fifo_size (actually DMA buffer size) field to the message */
+		if (set_fifo_size)
+			msg.extension |= pipeline->msg.extension;
+	}
+
+	if (allocate)
+		msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_ALLOCATE_MASK;
+
+	if (enable)
+		msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_ENABLE_MASK;
+
+	return sof_ipc_tx_message(sdev->ipc, &msg, 0, NULL, 0);
+}
+
 static int sof_ipc4_trigger_pipelines(struct snd_soc_component *component,
 				      struct snd_pcm_substream *substream, int state, int cmd)
 {
@@ -201,6 +283,8 @@ static int sof_ipc4_trigger_pipelines(struct snd_soc_component *component,
 	struct snd_sof_pcm_stream_pipeline_list *pipeline_list;
 	struct sof_ipc4_fw_data *ipc4_data = sdev->private;
 	struct ipc4_pipeline_set_state_data *trigger_list;
+	struct snd_sof_widget *pipe_widget;
+	struct sof_ipc4_pipeline *pipeline;
 	struct snd_sof_pipeline *spipe;
 	struct snd_sof_pcm *spcm;
 	int ret;
@@ -218,6 +302,17 @@ static int sof_ipc4_trigger_pipelines(struct snd_soc_component *component,
 	if (!pipeline_list->pipelines || !pipeline_list->count)
 		return 0;
 
+	spipe = pipeline_list->pipelines[0];
+	pipe_widget = spipe->pipe_widget;
+	pipeline = pipe_widget->private;
+
+	/*
+	 * If use_chain_dma attribute is set we proceed to chained DMA
+	 * trigger function that handles the rest for the substream.
+	 */
+	if (pipeline->use_chain_dma)
+		return sof_ipc4_chain_dma_trigger(sdev, pipeline_list, state, cmd);
+
 	/* allocate memory for the pipeline data */
 	trigger_list = kzalloc(struct_size(trigger_list, pipeline_ids, pipeline_list->count),
 			       GFP_KERNEL);
@@ -422,8 +517,10 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd,
 	struct snd_soc_component *component = snd_soc_rtdcom_lookup(rtd, SOF_AUDIO_PCM_DRV_NAME);
 	struct snd_sof_dai *dai = snd_sof_find_dai(component, rtd->dai_link->name);
 	struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(component);
+	struct snd_soc_dai *cpu_dai = asoc_rtd_to_cpu(rtd, 0);
 	struct sof_ipc4_copier *ipc4_copier;
-	int ret;
+	bool use_chain_dma = false;
+	int dir;
 
 	if (!dai) {
 		dev_err(component->dev, "%s: No DAI found with name %s\n", __func__,
@@ -438,9 +535,26 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd,
 		return -EINVAL;
 	}
 
-	ret = sof_ipc4_pcm_dai_link_fixup_rate(sdev, params, ipc4_copier);
-	if (ret)
-		return ret;
+	for_each_pcm_streams(dir) {
+		struct snd_soc_dapm_widget *w = snd_soc_dai_get_widget(cpu_dai, dir);
+
+		if (w) {
+			struct snd_sof_widget *swidget = w->dobj.private;
+			struct snd_sof_widget *pipe_widget = swidget->spipe->pipe_widget;
+			struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+			if (pipeline->use_chain_dma)
+				use_chain_dma = true;
+		}
+	}
+
+	/* Chain DMA does not use copiers, so no fixup needed */
+	if (!use_chain_dma) {
+		int ret = sof_ipc4_pcm_dai_link_fixup_rate(sdev, params, ipc4_copier);
+
+		if (ret)
+			return ret;
+	}
 
 	switch (ipc4_copier->dai_type) {
 	case SOF_DAI_INTEL_SSP:
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 3a4a3267017b..f1e1aed94da4 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -19,6 +19,7 @@
 
 #define SOF_IPC4_GAIN_PARAM_ID  0
 #define SOF_IPC4_TPLG_ABI_SIZE 6
+#define SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS 2
 
 static DEFINE_IDA(alh_group_ida);
 static DEFINE_IDA(pipeline_ida);
@@ -26,6 +27,8 @@ static DEFINE_IDA(pipeline_ida);
 static const struct sof_topology_token ipc4_sched_tokens[] = {
 	{SOF_TKN_SCHED_LP_MODE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
 		offsetof(struct sof_ipc4_pipeline, lp_mode)},
+	{SOF_TKN_SCHED_USE_CHAIN_DMA, SND_SOC_TPLG_TUPLE_TYPE_BOOL, get_token_u16,
+		offsetof(struct sof_ipc4_pipeline, use_chain_dma)},
 	{SOF_TKN_SCHED_CORE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
 		offsetof(struct sof_ipc4_pipeline, core_id)},
 };
@@ -475,6 +478,8 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 	struct snd_soc_component *scomp = swidget->scomp;
 	struct snd_sof_dai *dai = swidget->private;
 	struct sof_ipc4_copier *ipc4_copier;
+	struct snd_sof_widget *pipe_widget;
+	struct sof_ipc4_pipeline *pipeline;
 	int node_type = 0;
 	int ret;
 
@@ -512,6 +517,16 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget)
 
 	ipc4_copier->data.gtw_cfg.node_id = SOF_IPC4_NODE_TYPE(node_type);
 
+	pipe_widget = swidget->spipe->pipe_widget;
+	pipeline = pipe_widget->private;
+	if (pipeline->use_chain_dma && ipc4_copier->dai_type != SOF_DAI_INTEL_HDA) {
+		dev_err(scomp->dev,
+			"Bad DAI type '%d', Chained DMA is only supported by HDA DAIs (%d).\n",
+			ipc4_copier->dai_type, SOF_DAI_INTEL_HDA);
+		ret = -ENODEV;
+		goto free_available_fmt;
+	}
+
 	switch (ipc4_copier->dai_type) {
 	case SOF_DAI_INTEL_ALH:
 	{
@@ -643,6 +658,12 @@ static int sof_ipc4_widget_setup_comp_pipeline(struct snd_sof_widget *swidget)
 
 	swidget->core = pipeline->core_id;
 
+	if (pipeline->use_chain_dma) {
+		dev_dbg(scomp->dev, "Set up chain DMA for %s\n", swidget->widget->name);
+		swidget->private = pipeline;
+		return 0;
+	}
+
 	/* parse one set of pipeline tokens */
 	ret = sof_update_ipc_object(scomp, swidget, SOF_PIPELINE_TOKENS, swidget->tuples,
 				    swidget->num_tuples, sizeof(*swidget), 1);
@@ -1103,11 +1124,21 @@ static void sof_ipc4_unprepare_copier_module(struct snd_sof_widget *swidget)
 	pipeline->mem_usage = 0;
 
 	if (WIDGET_IS_AIF(swidget->id) || swidget->id == snd_soc_dapm_buffer) {
+		if (pipeline->use_chain_dma) {
+			pipeline->msg.primary = 0;
+			pipeline->msg.extension = 0;
+		}
 		ipc4_copier = swidget->private;
 	} else if (WIDGET_IS_DAI(swidget->id)) {
 		struct snd_sof_dai *dai = swidget->private;
 
 		ipc4_copier = dai->private;
+
+		if (pipeline->use_chain_dma) {
+			pipeline->msg.primary = 0;
+			pipeline->msg.extension = 0;
+		}
+
 		if (ipc4_copier->dai_type == SOF_DAI_INTEL_ALH) {
 			struct sof_ipc4_copier_data *copier_data = &ipc4_copier->data;
 			struct sof_ipc4_alh_configuration_blob *blob;
@@ -1344,13 +1375,44 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 			return ret;
 		}
 
-		pipe_widget = swidget->spipe->pipe_widget;
-		pipeline = pipe_widget->private;
 		ipc4_copier = (struct sof_ipc4_copier *)swidget->private;
 		gtw_attr = ipc4_copier->gtw_attr;
 		copier_data = &ipc4_copier->data;
 		available_fmt = &ipc4_copier->available_fmt;
 
+		pipe_widget = swidget->spipe->pipe_widget;
+		pipeline = pipe_widget->private;
+
+		if (pipeline->use_chain_dma) {
+			u32 host_dma_id;
+			u32 fifo_size;
+
+			host_dma_id = platform_params->stream_tag - 1;
+			pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_HOST_ID(host_dma_id);
+
+			/* Set SCS bit for S16_LE format only */
+			if (params_format(fe_params) == SNDRV_PCM_FORMAT_S16_LE)
+				pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_SCS_MASK;
+
+			/*
+			 * Despite its name the bitfield 'fifo_size' is used to define DMA buffer
+			 * size. The expression calculates 2ms buffer size.
+			 */
+			fifo_size = DIV_ROUND_UP((SOF_IPC4_CHAIN_DMA_BUF_SIZE_MS *
+						  params_rate(fe_params) *
+						  params_channels(fe_params) *
+						  params_physical_width(fe_params)), 8000);
+			pipeline->msg.extension |= SOF_IPC4_GLB_EXT_CHAIN_DMA_FIFO_SIZE(fifo_size);
+
+			/*
+			 * Chain DMA does not support stream timestamping, set node_id to invalid
+			 * to skip the code in sof_ipc4_get_stream_start_offset().
+			 */
+			copier_data->gtw_cfg.node_id = SOF_IPC4_INVALID_NODE_ID;
+
+			return 0;
+		}
+
 		/*
 		 * Use the input_pin_fmts to match pcm params for playback and the output_pin_fmts
 		 * for capture.
@@ -1375,6 +1437,12 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget,
 	case snd_soc_dapm_dai_in:
 	case snd_soc_dapm_dai_out:
 	{
+		struct snd_sof_widget *pipe_widget = swidget->spipe->pipe_widget;
+		struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+		if (pipeline->use_chain_dma)
+			return 0;
+
 		dai = swidget->private;
 
 		ipc4_copier = (struct sof_ipc4_copier *)dai->private;
@@ -1921,6 +1989,9 @@ static int sof_ipc4_widget_setup(struct snd_sof_dev *sdev, struct snd_sof_widget
 	case snd_soc_dapm_scheduler:
 		pipeline = swidget->private;
 
+		if (pipeline->use_chain_dma)
+			return 0;
+
 		dev_dbg(sdev->dev, "pipeline: %d memory pages: %d\n", swidget->pipeline_id,
 			pipeline->mem_usage);
 
@@ -1943,6 +2014,10 @@ static int sof_ipc4_widget_setup(struct snd_sof_dev *sdev, struct snd_sof_widget
 	{
 		struct sof_ipc4_copier *ipc4_copier = swidget->private;
 
+		pipeline = pipe_widget->private;
+		if (pipeline->use_chain_dma)
+			return 0;
+
 		ipc_size = ipc4_copier->ipc_config_size;
 		ipc_data = ipc4_copier->ipc_config_data;
 
@@ -1955,6 +2030,10 @@ static int sof_ipc4_widget_setup(struct snd_sof_dev *sdev, struct snd_sof_widget
 		struct snd_sof_dai *dai = swidget->private;
 		struct sof_ipc4_copier *ipc4_copier = dai->private;
 
+		pipeline = pipe_widget->private;
+		if (pipeline->use_chain_dma)
+			return 0;
+
 		ipc_size = ipc4_copier->ipc_config_size;
 		ipc_data = ipc4_copier->ipc_config_data;
 
@@ -2066,6 +2145,9 @@ static int sof_ipc4_widget_free(struct snd_sof_dev *sdev, struct snd_sof_widget
 		struct sof_ipc4_msg msg = {{ 0 }};
 		u32 header;
 
+		if (pipeline->use_chain_dma)
+			return 0;
+
 		header = SOF_IPC4_GLB_PIPE_INSTANCE_ID(swidget->instance_id);
 		header |= SOF_IPC4_MSG_TYPE_SET(SOF_IPC4_GLB_DELETE_PIPELINE);
 		header |= SOF_IPC4_MSG_DIR(SOF_IPC4_MSG_REQUEST);
@@ -2082,7 +2164,11 @@ static int sof_ipc4_widget_free(struct snd_sof_dev *sdev, struct snd_sof_widget
 		pipeline->state = SOF_IPC4_PIPE_UNINITIALIZED;
 		ida_free(&pipeline_ida, swidget->instance_id);
 	} else {
-		ida_free(&fw_module->m_ida, swidget->instance_id);
+		struct snd_sof_widget *pipe_widget = swidget->spipe->pipe_widget;
+		struct sof_ipc4_pipeline *pipeline = pipe_widget->private;
+
+		if (!pipeline->use_chain_dma)
+			ida_free(&fw_module->m_ida, swidget->instance_id);
 	}
 
 	mutex_unlock(&ipc4_data->pipeline_state_mutex);
@@ -2234,12 +2320,27 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
 {
 	struct snd_sof_widget *src_widget = sroute->src_widget;
 	struct snd_sof_widget *sink_widget = sroute->sink_widget;
+	struct snd_sof_widget *src_pipe_widget = src_widget->spipe->pipe_widget;
+	struct snd_sof_widget *sink_pipe_widget = sink_widget->spipe->pipe_widget;
 	struct sof_ipc4_fw_module *src_fw_module = src_widget->module_info;
 	struct sof_ipc4_fw_module *sink_fw_module = sink_widget->module_info;
+	struct sof_ipc4_pipeline *src_pipeline = src_pipe_widget->private;
+	struct sof_ipc4_pipeline *sink_pipeline = sink_pipe_widget->private;
 	struct sof_ipc4_msg msg = {{ 0 }};
 	u32 header, extension;
 	int ret;
 
+	/* no route set up if chain DMA is used */
+	if (src_pipeline->use_chain_dma || sink_pipeline->use_chain_dma) {
+		if (!src_pipeline->use_chain_dma || !sink_pipeline->use_chain_dma) {
+			dev_err(sdev->dev,
+				"use_chain_dma must be set for both src %s and sink %s pipelines\n",
+				src_widget->widget->name, sink_widget->widget->name);
+			return -EINVAL;
+		}
+		return 0;
+	}
+
 	sroute->src_queue_id = sof_ipc4_get_queue_id(src_widget, sink_widget,
 						     SOF_PIN_TYPE_OUTPUT);
 	if (sroute->src_queue_id < 0) {
@@ -2310,9 +2411,17 @@ static int sof_ipc4_route_free(struct snd_sof_dev *sdev, struct snd_sof_route *s
 	struct sof_ipc4_fw_module *src_fw_module = src_widget->module_info;
 	struct sof_ipc4_fw_module *sink_fw_module = sink_widget->module_info;
 	struct sof_ipc4_msg msg = {{ 0 }};
+	struct snd_sof_widget *src_pipe_widget = src_widget->spipe->pipe_widget;
+	struct snd_sof_widget *sink_pipe_widget = sink_widget->spipe->pipe_widget;
+	struct sof_ipc4_pipeline *src_pipeline = src_pipe_widget->private;
+	struct sof_ipc4_pipeline *sink_pipeline = sink_pipe_widget->private;
 	u32 header, extension;
 	int ret = 0;
 
+	/* no route is set up if chain DMA is used */
+	if (src_pipeline->use_chain_dma || sink_pipeline->use_chain_dma)
+		return 0;
+
 	dev_dbg(sdev->dev, "unbind modules %s:%d -> %s:%d\n",
 		src_widget->widget->name, sroute->src_queue_id,
 		sink_widget->widget->name, sroute->dst_queue_id);
@@ -2374,6 +2483,11 @@ static int sof_ipc4_dai_config(struct snd_sof_dev *sdev, struct snd_sof_widget *
 
 	switch (ipc4_copier->dai_type) {
 	case SOF_DAI_INTEL_HDA:
+		if (pipeline->use_chain_dma) {
+			pipeline->msg.primary &= ~SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK;
+			pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(data->dai_data);
+			break;
+		}
 		gtw_attr = ipc4_copier->gtw_attr;
 		gtw_attr->lp_buffer_alloc = pipeline->lp_mode;
 		pipeline->skip_during_fe_trigger = true;
diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h
index 015027b23588..cf007282867b 100644
--- a/sound/soc/sof/ipc4-topology.h
+++ b/sound/soc/sof/ipc4-topology.h
@@ -126,6 +126,7 @@ struct sof_ipc4_copier_config_set_sink_format {
  * @mem_usage: Memory usage
  * @core_id: Target core for the pipeline
  * @state: Pipeline state
+ * @use_chain_dma: flag to indicate if the firmware shall use chained DMA
  * @msg: message structure for pipeline
  * @skip_during_fe_trigger: skip triggering this pipeline during the FE DAI trigger
  */
@@ -135,6 +136,7 @@ struct sof_ipc4_pipeline {
 	uint32_t mem_usage;
 	uint32_t core_id;
 	int state;
+	bool use_chain_dma;
 	struct sof_ipc4_msg msg;
 	bool skip_during_fe_trigger;
 };
-- 
cgit v1.2.3


From fa8c052b4c614aa1d2d60e5c9f40e9d885bf9511 Mon Sep 17 00:00:00 2001
From: "Vlad.Karpovich" <vkarpovi@opensource.cirrus.com>
Date: Wed, 15 Mar 2023 10:47:18 -0500
Subject: ASoC: cs35l45: Support for GPIO pins configuration.

Adds device tree configuration for cs35l45 GPIOs

Signed-off-by: Vlad Karpovich <vkarpovi@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230315154722.3911463-1-vkarpovi@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/dt-bindings/sound/cs35l45.h | 57 +++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/cs35l45-tables.c   | 14 +++++++++
 sound/soc/codecs/cs35l45.c          | 56 ++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/cs35l45.h          | 27 ++++++++++++++++--
 4 files changed, 152 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/sound/cs35l45.h b/include/dt-bindings/sound/cs35l45.h
index 076da4b2c28d..25386af18445 100644
--- a/include/dt-bindings/sound/cs35l45.h
+++ b/include/dt-bindings/sound/cs35l45.h
@@ -17,4 +17,61 @@
 #define CS35L45_ASP_TX_HIZ_UNUSED	0x1
 #define CS35L45_ASP_TX_HIZ_DISABLED	0x2
 
+/*
+ * Optional GPIOX Sub-nodes:
+ *  The cs35l45 node can have up to three "cirrus,gpio-ctrlX" ('X' = [1,2,3])
+ *  sub-nodes for configuring the GPIO pins.
+ *
+ * - gpio-dir : GPIO pin direction. Valid only when 'gpio-ctrl'
+ *   is 1.
+ *    0 = Output
+ *    1 = Input (Default)
+ *
+ * - gpio-lvl : GPIO level. Valid only when 'gpio-ctrl' is 1 and 'gpio-dir' is 0.
+ *
+ *    0 = Low (Default)
+ *    1 = High
+ *
+ * - gpio-op-cfg : GPIO output configuration. Valid only when 'gpio-ctrl' is 1
+ *   and 'gpio-dir' is 0.
+ *
+ *    0 = CMOS (Default)
+ *    1 = Open Drain
+ *
+ * - gpio-pol : GPIO output polarity select. Valid only when 'gpio-ctrl' is 1
+ *   and 'gpio-dir' is 0.
+ *
+ *    0 = Non-inverted, Active High (Default)
+ *    1 = Inverted, Active Low
+ *
+ * - gpio-invert : Defines the polarity of the GPIO pin if configured
+ *   as input.
+ *
+ *    0 = Not inverted (Default)
+ *    1 = Inverted
+ *
+ * - gpio-ctrl : Defines the function of the GPIO pin.
+ *
+ * GPIO1:
+ *   0 = High impedance input (Default)
+ *   1 = Pin acts as a GPIO, direction controlled by 'gpio-dir'
+ *   2 = Pin acts as MDSYNC, direction controlled by MDSYNC
+ *   3-7 = Reserved
+ *
+ * GPIO2:
+ *   0 = High impedance input (Default)
+ *   1 = Pin acts as a GPIO, direction controlled by 'gpio-dir'
+ *   2 = Pin acts as open drain INT
+ *   3 = Reserved
+ *   4 = Pin acts as push-pull output INT. Active low.
+ *   5 = Pin acts as push-pull output INT. Active high.
+ *   6,7 = Reserved
+ *
+ * GPIO3:
+ *   0 = High impedance input (Default)
+ *   1 = Pin acts as a GPIO, direction controlled by 'gpio-dir'
+ *   2-7 = Reserved
+ */
+#define CS35L45_NUM_GPIOS	0x3
+
 #endif /* DT_CS35L45_H */
diff --git a/sound/soc/codecs/cs35l45-tables.c b/sound/soc/codecs/cs35l45-tables.c
index 4b1320a2e6e9..997ea418a6dc 100644
--- a/sound/soc/codecs/cs35l45-tables.c
+++ b/sound/soc/codecs/cs35l45-tables.c
@@ -43,6 +43,9 @@ EXPORT_SYMBOL_NS_GPL(cs35l45_apply_patch, SND_SOC_CS35L45);
 static const struct reg_default cs35l45_defaults[] = {
 	{ CS35L45_BLOCK_ENABLES,		0x00003323 },
 	{ CS35L45_BLOCK_ENABLES2,		0x00000010 },
+	{ CS35L45_SYNC_GPIO1,			0x00000007 },
+	{ CS35L45_INTB_GPIO2_MCLK_REF,		0x00000005 },
+	{ CS35L45_GPIO3,			0x00000005 },
 	{ CS35L45_REFCLK_INPUT,			0x00000510 },
 	{ CS35L45_GLOBAL_SAMPLE_RATE,		0x00000003 },
 	{ CS35L45_ASP_ENABLES1,			0x00000000 },
@@ -61,6 +64,9 @@ static const struct reg_default cs35l45_defaults[] = {
 	{ CS35L45_ASPTX4_INPUT,			0x00000028 },
 	{ CS35L45_ASPTX5_INPUT,			0x00000048 },
 	{ CS35L45_AMP_PCM_CONTROL,		0x00100000 },
+	{ CS35L45_GPIO1_CTRL1,			0x81000001 },
+	{ CS35L45_GPIO2_CTRL1,			0x81000001 },
+	{ CS35L45_GPIO3_CTRL1,			0x81000001 },
 };
 
 static bool cs35l45_readable_reg(struct device *dev, unsigned int reg)
@@ -72,6 +78,9 @@ static bool cs35l45_readable_reg(struct device *dev, unsigned int reg)
 	case CS35L45_BLOCK_ENABLES:
 	case CS35L45_BLOCK_ENABLES2:
 	case CS35L45_ERROR_RELEASE:
+	case CS35L45_SYNC_GPIO1:
+	case CS35L45_INTB_GPIO2_MCLK_REF:
+	case CS35L45_GPIO3:
 	case CS35L45_REFCLK_INPUT:
 	case CS35L45_GLOBAL_SAMPLE_RATE:
 	case CS35L45_ASP_ENABLES1:
@@ -92,6 +101,10 @@ static bool cs35l45_readable_reg(struct device *dev, unsigned int reg)
 	case CS35L45_AMP_PCM_CONTROL:
 	case CS35L45_AMP_PCM_HPF_TST:
 	case CS35L45_IRQ1_EINT_4:
+	case CS35L45_GPIO_STATUS1:
+	case CS35L45_GPIO1_CTRL1:
+	case CS35L45_GPIO2_CTRL1:
+	case CS35L45_GPIO3_CTRL1:
 		return true;
 	default:
 		return false;
@@ -107,6 +120,7 @@ static bool cs35l45_volatile_reg(struct device *dev, unsigned int reg)
 	case CS35L45_ERROR_RELEASE:
 	case CS35L45_AMP_PCM_HPF_TST:	/* not cachable */
 	case CS35L45_IRQ1_EINT_4:
+	case CS35L45_GPIO_STATUS1:
 		return true;
 	default:
 		return false;
diff --git a/sound/soc/codecs/cs35l45.c b/sound/soc/codecs/cs35l45.c
index 855d9f13e6ff..c87dccb3382e 100644
--- a/sound/soc/codecs/cs35l45.c
+++ b/sound/soc/codecs/cs35l45.c
@@ -536,7 +536,63 @@ static int __maybe_unused cs35l45_runtime_resume(struct device *dev)
 
 static int cs35l45_apply_property_config(struct cs35l45_private *cs35l45)
 {
+	struct device_node *node = cs35l45->dev->of_node;
+	unsigned int gpio_regs[] = {CS35L45_GPIO1_CTRL1, CS35L45_GPIO2_CTRL1,
+				    CS35L45_GPIO3_CTRL1};
+	unsigned int pad_regs[] = {CS35L45_SYNC_GPIO1,
+				   CS35L45_INTB_GPIO2_MCLK_REF, CS35L45_GPIO3};
+	struct device_node *child;
 	unsigned int val;
+	char of_name[32];
+	int ret, i;
+
+	if (!node)
+		return 0;
+
+	for (i = 0; i < CS35L45_NUM_GPIOS; i++) {
+		sprintf(of_name, "cirrus,gpio-ctrl%d", i + 1);
+		child = of_get_child_by_name(node, of_name);
+		if (!child)
+			continue;
+
+		ret = of_property_read_u32(child, "gpio-dir", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, gpio_regs[i],
+					   CS35L45_GPIO_DIR_MASK,
+					   val << CS35L45_GPIO_DIR_SHIFT);
+
+		ret = of_property_read_u32(child, "gpio-lvl", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, gpio_regs[i],
+					   CS35L45_GPIO_LVL_MASK,
+					   val << CS35L45_GPIO_LVL_SHIFT);
+
+		ret = of_property_read_u32(child, "gpio-op-cfg", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, gpio_regs[i],
+					   CS35L45_GPIO_OP_CFG_MASK,
+					   val << CS35L45_GPIO_OP_CFG_SHIFT);
+
+		ret = of_property_read_u32(child, "gpio-pol", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, gpio_regs[i],
+					   CS35L45_GPIO_POL_MASK,
+					   val << CS35L45_GPIO_POL_SHIFT);
+
+		ret = of_property_read_u32(child, "gpio-ctrl", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, pad_regs[i],
+					   CS35L45_GPIO_CTRL_MASK,
+					   val << CS35L45_GPIO_CTRL_SHIFT);
+
+		ret = of_property_read_u32(child, "gpio-invert", &val);
+		if (!ret)
+			regmap_update_bits(cs35l45->regmap, pad_regs[i],
+					   CS35L45_GPIO_INVERT_MASK,
+					   val << CS35L45_GPIO_INVERT_SHIFT);
+
+		of_node_put(child);
+	}
 
 	if (device_property_read_u32(cs35l45->dev,
 				     "cirrus,asp-sdout-hiz-ctrl", &val) == 0) {
diff --git a/sound/soc/codecs/cs35l45.h b/sound/soc/codecs/cs35l45.h
index 53fe9d2b7b15..f3a54fc57d53 100644
--- a/sound/soc/codecs/cs35l45.h
+++ b/sound/soc/codecs/cs35l45.h
@@ -14,6 +14,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
+#include <dt-bindings/sound/cs35l45.h>
 
 #define CS35L45_DEVID				0x00000000
 #define CS35L45_REVID				0x00000004
@@ -24,6 +25,9 @@
 #define CS35L45_BLOCK_ENABLES			0x00002018
 #define CS35L45_BLOCK_ENABLES2			0x0000201C
 #define CS35L45_ERROR_RELEASE			0x00002034
+#define CS35L45_SYNC_GPIO1			0x00002430
+#define CS35L45_INTB_GPIO2_MCLK_REF		0x00002434
+#define CS35L45_GPIO3				0x00002438
 #define CS35L45_REFCLK_INPUT			0x00002C04
 #define CS35L45_GLOBAL_SAMPLE_RATE		0x00002C0C
 #define CS35L45_BOOST_CCM_CFG			0x00003808
@@ -48,8 +52,11 @@
 #define CS35L45_AMP_PCM_CONTROL			0x00007000
 #define CS35L45_AMP_PCM_HPF_TST			0x00007004
 #define CS35L45_IRQ1_EINT_4			0x0000E01C
-#define CS35L45_LASTREG				0x0000E01C
-
+#define CS35L45_GPIO_STATUS1			0x0000F000
+#define CS35L45_GPIO1_CTRL1			0x0000F008
+#define CS35L45_GPIO2_CTRL1			0x0000F00C
+#define CS35L45_GPIO3_CTRL1			0x0000F010
+#define CS35L45_LASTREG			0x0000F010
 /* SFT_RESET */
 #define CS35L45_SOFT_RESET_TRIGGER		0x5A000000
 
@@ -165,6 +172,22 @@
 #define CS35L45_OTP_BOOT_DONE_STS_MASK		BIT(1)
 #define CS35L45_OTP_BUSY_MASK			BIT(0)
 
+/* GPIOX_CTRL1 */
+#define CS35L45_GPIO_DIR_SHIFT			31
+#define CS35L45_GPIO_DIR_MASK			BIT(31)
+#define CS35L45_GPIO_LVL_SHIFT			15
+#define CS35L45_GPIO_LVL_MASK			BIT(15)
+#define CS35L45_GPIO_OP_CFG_SHIFT		14
+#define CS35L45_GPIO_OP_CFG_MASK		BIT(14)
+#define CS35L45_GPIO_POL_SHIFT			12
+#define CS35L45_GPIO_POL_MASK			BIT(12)
+
+/* SYNC_GPIO1, INTB_GPIO2_MCLK_REF, GPIO3 */
+#define CS35L45_GPIO_CTRL_SHIFT		20
+#define CS35L45_GPIO_CTRL_MASK			GENMASK(22, 20)
+#define CS35L45_GPIO_INVERT_SHIFT		19
+#define CS35L45_GPIO_INVERT_MASK		BIT(19)
+
 /* Mixer sources */
 #define CS35L45_PCM_SRC_MASK			0x7F
 #define CS35L45_PCM_SRC_ZERO			0x00
-- 
cgit v1.2.3


From 0144e3b85d7b42e8a4cda991c0e81f131897457a Mon Sep 17 00:00:00 2001
From: Dionna Glaze <dionnaglaze@google.com>
Date: Tue, 7 Mar 2023 20:24:49 +0100
Subject: x86/sev: Change snp_guest_issue_request()'s fw_err argument

The GHCB specification declares that the firmware error value for
a guest request will be stored in the lower 32 bits of EXIT_INFO_2.  The
upper 32 bits are for the VMM's own error code. The fw_err argument to
snp_guest_issue_request() is thus a misnomer, and callers will need
access to all 64 bits.

The type of unsigned long also causes problems, since sw_exit_info2 is
u64 (unsigned long long) vs the argument's unsigned long*. Change this
type for issuing the guest request. Pass the ioctl command struct's error
field directly instead of in a local variable, since an incomplete guest
request may not set the error code, and uninitialized stack memory would
be written back to user space.

The firmware might not even be called, so bookend the call with the no
firmware call error and clear the error.

Since the "fw_err" field is really exitinfo2 split into the upper bits'
vmm error code and lower bits' firmware error code, convert the 64 bit
value to a union.

  [ bp:
   - Massage commit message
   - adjust code
   - Fix a build issue as
   Reported-by: kernel test robot <lkp@intel.com>
   Link: https://lore.kernel.org/oe-kbuild-all/202303070609.vX6wp2Af-lkp@intel.com
   - print exitinfo2 in hex
   Tom:
    - Correct -EIO exit case. ]

Signed-off-by: Dionna Glaze <dionnaglaze@google.com>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230214164638.1189804-5-dionnaglaze@google.com
Link: https://lore.kernel.org/r/20230307192449.24732-12-bp@alien8.de
---
 Documentation/virt/coco/sev-guest.rst   | 20 +++++----
 arch/x86/include/asm/sev-common.h       |  4 --
 arch/x86/include/asm/sev.h              | 10 +++--
 arch/x86/kernel/sev.c                   | 15 +++----
 drivers/virt/coco/sev-guest/sev-guest.c | 72 ++++++++++++++++++---------------
 include/uapi/linux/sev-guest.h          | 18 ++++++++-
 6 files changed, 83 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
index aa3e4c6a1f90..68b0d2363af8 100644
--- a/Documentation/virt/coco/sev-guest.rst
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -37,11 +37,11 @@ along with a description:
       the return value.  General error numbers (-ENOMEM, -EINVAL)
       are not detailed, but errors with specific meanings are.
 
-The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device.
-The ioctl accepts struct snp_user_guest_request. The input and output structure is
-specified through the req_data and resp_data field respectively. If the ioctl fails
-to execute due to a firmware error, then fw_err code will be set. Otherwise, fw_err
-will be set to 0x00000000ffffffff, i.e., the lower 32-bits are -1.
+The guest ioctl should be issued on a file descriptor of the /dev/sev-guest
+device.  The ioctl accepts struct snp_user_guest_request. The input and
+output structure is specified through the req_data and resp_data field
+respectively. If the ioctl fails to execute due to a firmware error, then
+the fw_error code will be set, otherwise fw_error will be set to -1.
 
 The firmware checks that the message sequence counter is one greater than
 the guests message sequence counter. If guest driver fails to increment message
@@ -57,8 +57,14 @@ counter (e.g. counter overflow), then -EIO will be returned.
                 __u64 req_data;
                 __u64 resp_data;
 
-                /* firmware error code on failure (see psp-sev.h) */
-                __u64 fw_err;
+                /* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */
+                union {
+                        __u64 exitinfo2;
+                        struct {
+                                __u32 fw_error;
+                                __u32 vmm_error;
+                        };
+                };
         };
 
 2.1 SNP_GET_REPORT
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b63be696b776..0759af9b1acf 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -128,10 +128,6 @@ struct snp_psc_desc {
 	struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
 } __packed;
 
-/* Guest message request error codes */
-#define SNP_GUEST_REQ_INVALID_LEN	BIT_ULL(32)
-#define SNP_GUEST_REQ_ERR_BUSY		BIT_ULL(33)
-
 #define GHCB_MSR_TERM_REQ		0x100
 #define GHCB_MSR_TERM_REASON_SET_POS	12
 #define GHCB_MSR_TERM_REASON_SET_MASK	0xf
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ebc271bb6d8e..13dc2a9d23c1 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -9,6 +9,8 @@
 #define __ASM_ENCRYPTED_STATE_H
 
 #include <linux/types.h>
+#include <linux/sev-guest.h>
+
 #include <asm/insn.h>
 #include <asm/sev-common.h>
 #include <asm/bootparam.h>
@@ -185,6 +187,9 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
 
 	return rc;
 }
+
+struct snp_guest_request_ioctl;
+
 void setup_ghcb(void);
 void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
 					 unsigned int npages);
@@ -196,7 +201,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
 void snp_set_wakeup_secondary_cpu(void);
 bool snp_init(struct boot_params *bp);
 void __init __noreturn snp_abort(void);
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -216,8 +221,7 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag
 static inline void snp_set_wakeup_secondary_cpu(void) { }
 static inline bool snp_init(struct boot_params *bp) { return false; }
 static inline void snp_abort(void) { }
-static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input,
-					  unsigned long *fw_err)
+static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
 {
 	return -ENOTTY;
 }
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 3f664ab277c4..b031244d6d2d 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -22,6 +22,8 @@
 #include <linux/efi.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
 
 #include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
@@ -2175,7 +2177,7 @@ static int __init init_sev_config(char *str)
 }
 __setup("sev=", init_sev_config);
 
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
 {
 	struct ghcb_state state;
 	struct es_em_ctxt ctxt;
@@ -2183,8 +2185,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
 	struct ghcb *ghcb;
 	int ret;
 
-	if (!fw_err)
-		return -EINVAL;
+	rio->exitinfo2 = SEV_RET_NO_FW_CALL;
 
 	/*
 	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -2209,16 +2210,16 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
 	if (ret)
 		goto e_put;
 
-	*fw_err = ghcb->save.sw_exit_info_2;
-	switch (*fw_err) {
+	rio->exitinfo2 = ghcb->save.sw_exit_info_2;
+	switch (rio->exitinfo2) {
 	case 0:
 		break;
 
-	case SNP_GUEST_REQ_ERR_BUSY:
+	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
 		ret = -EAGAIN;
 		break;
 
-	case SNP_GUEST_REQ_INVALID_LEN:
+	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
 		/* Number of expected pages are returned in RBX */
 		if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
 			input->data_npages = ghcb_get_rbx(ghcb);
diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c
index 0c7b47acba2a..97dbe715e96a 100644
--- a/drivers/virt/coco/sev-guest/sev-guest.c
+++ b/drivers/virt/coco/sev-guest/sev-guest.c
@@ -332,11 +332,12 @@ static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8
 	return __enc_payload(snp_dev, req, payload, sz);
 }
 
-static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, __u64 *fw_err)
+static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code,
+				  struct snp_guest_request_ioctl *rio)
 {
-	unsigned long err = 0xff, override_err = 0;
 	unsigned long req_start = jiffies;
 	unsigned int override_npages = 0;
+	u64 override_err = 0;
 	int rc;
 
 retry_request:
@@ -346,7 +347,7 @@ retry_request:
 	 * sequence number must be incremented or the VMPCK must be deleted to
 	 * prevent reuse of the IV.
 	 */
-	rc = snp_issue_guest_request(exit_code, &snp_dev->input, &err);
+	rc = snp_issue_guest_request(exit_code, &snp_dev->input, rio);
 	switch (rc) {
 	case -ENOSPC:
 		/*
@@ -364,7 +365,7 @@ retry_request:
 		 * request buffer size was too small and give the caller the
 		 * required buffer size.
 		 */
-		override_err	= SNP_GUEST_REQ_INVALID_LEN;
+		override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
 
 		/*
 		 * If this call to the firmware succeeds, the sequence number can
@@ -377,7 +378,7 @@ retry_request:
 		goto retry_request;
 
 	/*
-	 * The host may return SNP_GUEST_REQ_ERR_EBUSY if the request has been
+	 * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
 	 * throttled. Retry in the driver to avoid returning and reusing the
 	 * message sequence number on a different message.
 	 */
@@ -398,27 +399,29 @@ retry_request:
 	 */
 	snp_inc_msg_seqno(snp_dev);
 
-	if (fw_err)
-		*fw_err = override_err ?: err;
+	if (override_err) {
+		rio->exitinfo2 = override_err;
+
+		/*
+		 * If an extended guest request was issued and the supplied certificate
+		 * buffer was not large enough, a standard guest request was issued to
+		 * prevent IV reuse. If the standard request was successful, return -EIO
+		 * back to the caller as would have originally been returned.
+		 */
+		if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+			rc = -EIO;
+	}
 
 	if (override_npages)
 		snp_dev->input.data_npages = override_npages;
 
-	/*
-	 * If an extended guest request was issued and the supplied certificate
-	 * buffer was not large enough, a standard guest request was issued to
-	 * prevent IV reuse. If the standard request was successful, return -EIO
-	 * back to the caller as would have originally been returned.
-	 */
-	if (!rc && override_err == SNP_GUEST_REQ_INVALID_LEN)
-		return -EIO;
-
 	return rc;
 }
 
-static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, int msg_ver,
-				u8 type, void *req_buf, size_t req_sz, void *resp_buf,
-				u32 resp_sz, __u64 *fw_err)
+static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code,
+				struct snp_guest_request_ioctl *rio, u8 type,
+				void *req_buf, size_t req_sz, void *resp_buf,
+				u32 resp_sz)
 {
 	u64 seqno;
 	int rc;
@@ -432,7 +435,7 @@ static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, in
 	memset(snp_dev->response, 0, sizeof(struct snp_guest_msg));
 
 	/* Encrypt the userspace provided payload in snp_dev->secret_request. */
-	rc = enc_payload(snp_dev, seqno, msg_ver, type, req_buf, req_sz);
+	rc = enc_payload(snp_dev, seqno, rio->msg_version, type, req_buf, req_sz);
 	if (rc)
 		return rc;
 
@@ -443,12 +446,16 @@ static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, in
 	memcpy(snp_dev->request, &snp_dev->secret_request,
 	       sizeof(snp_dev->secret_request));
 
-	rc = __handle_guest_request(snp_dev, exit_code, fw_err);
+	rc = __handle_guest_request(snp_dev, exit_code, rio);
 	if (rc) {
-		if (rc == -EIO && *fw_err == SNP_GUEST_REQ_INVALID_LEN)
+		if (rc == -EIO &&
+		    rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
 			return rc;
 
-		dev_alert(snp_dev->dev, "Detected error from ASP request. rc: %d, fw_err: %llu\n", rc, *fw_err);
+		dev_alert(snp_dev->dev,
+			  "Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
+			  rc, rio->exitinfo2);
+
 		snp_disable_vmpck(snp_dev);
 		return rc;
 	}
@@ -488,9 +495,9 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io
 	if (!resp)
 		return -ENOMEM;
 
-	rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version,
+	rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg,
 				  SNP_MSG_REPORT_REQ, &req, sizeof(req), resp->data,
-				  resp_len, &arg->fw_err);
+				  resp_len);
 	if (rc)
 		goto e_free;
 
@@ -528,9 +535,8 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque
 	if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req)))
 		return -EFAULT;
 
-	rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version,
-				  SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len,
-				  &arg->fw_err);
+	rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg,
+				  SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len);
 	if (rc)
 		return rc;
 
@@ -590,12 +596,12 @@ cmd:
 		return -ENOMEM;
 
 	snp_dev->input.data_npages = npages;
-	ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg->msg_version,
+	ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg,
 				   SNP_MSG_REPORT_REQ, &req.data,
-				   sizeof(req.data), resp->data, resp_len, &arg->fw_err);
+				   sizeof(req.data), resp->data, resp_len);
 
 	/* If certs length is invalid then copy the returned length */
-	if (arg->fw_err == SNP_GUEST_REQ_INVALID_LEN) {
+	if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) {
 		req.certs_len = snp_dev->input.data_npages << PAGE_SHIFT;
 
 		if (copy_to_user((void __user *)arg->req_data, &req, sizeof(req)))
@@ -630,7 +636,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long
 	if (copy_from_user(&input, argp, sizeof(input)))
 		return -EFAULT;
 
-	input.fw_err = 0xff;
+	input.exitinfo2 = 0xff;
 
 	/* Message version must be non-zero */
 	if (!input.msg_version)
@@ -661,7 +667,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long
 
 	mutex_unlock(&snp_cmd_mutex);
 
-	if (input.fw_err && copy_to_user(argp, &input, sizeof(input)))
+	if (input.exitinfo2 && copy_to_user(argp, &input, sizeof(input)))
 		return -EFAULT;
 
 	return ret;
diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
index 256aaeff7e65..2aa39112cf8d 100644
--- a/include/uapi/linux/sev-guest.h
+++ b/include/uapi/linux/sev-guest.h
@@ -52,8 +52,14 @@ struct snp_guest_request_ioctl {
 	__u64 req_data;
 	__u64 resp_data;
 
-	/* firmware error code on failure (see psp-sev.h) */
-	__u64 fw_err;
+	/* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */
+	union {
+		__u64 exitinfo2;
+		struct {
+			__u32 fw_error;
+			__u32 vmm_error;
+		};
+	};
 };
 
 struct snp_ext_report_req {
@@ -77,4 +83,12 @@ struct snp_ext_report_req {
 /* Get SNP extended report as defined in the GHCB specification version 2. */
 #define SNP_GET_EXT_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x2, struct snp_guest_request_ioctl)
 
+/* Guest message request EXIT_INFO_2 constants */
+#define SNP_GUEST_FW_ERR_MASK		GENMASK_ULL(31, 0)
+#define SNP_GUEST_VMM_ERR_SHIFT		32
+#define SNP_GUEST_VMM_ERR(x)		(((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
+
+#define SNP_GUEST_VMM_ERR_INVALID_LEN	1
+#define SNP_GUEST_VMM_ERR_BUSY		2
+
 #endif /* __UAPI_LINUX_SEV_GUEST_H_ */
-- 
cgit v1.2.3


From 59495740f79524bebe4ecb8097cf206f86587b25 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 21 Mar 2023 15:04:18 +0100
Subject: ftrace: Let unregister_ftrace_direct_multi() call
 ftrace_free_filter()

A common pattern when using the ftrace_direct_multi API is to unregister
the ops and also immediately free its filter. We've noticed it's very
easy for users to miss calling ftrace_free_filter().

This adds a "free_filters" argument to unregister_ftrace_direct_multi()
to both remind the user they should free filters and also to make their
life easier.

Link: https://lkml.kernel.org/r/20230321140424.345218-2-revest@chromium.org

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Florent Revest <revest@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h                      | 6 ++++--
 kernel/bpf/trampoline.c                     | 2 +-
 kernel/trace/ftrace.c                       | 6 +++++-
 samples/ftrace/ftrace-direct-multi-modify.c | 3 +--
 samples/ftrace/ftrace-direct-multi.c        | 3 +--
 5 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 366c730beaa3..5b68ee874bc1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -407,7 +407,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 				unsigned long new_addr);
 unsigned long ftrace_find_rec_direct(unsigned long ip);
 int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
+int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
+				   bool free_filters);
 int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr);
 
@@ -446,7 +447,8 @@ static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned
 {
 	return -ENODEV;
 }
-static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
+						 bool free_filters)
 {
 	return -ENODEV;
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..88bc23f1e10a 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -198,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	int ret;
 
 	if (tr->func.ftrace_managed)
-		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
+		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr, false);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0feea145bb29..98434196c6a1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5805,7 +5805,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
  *  0 on success
  *  -EINVAL - The @ops object was not properly registered.
  */
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
+				   bool free_filters)
 {
 	struct ftrace_hash *hash = ops->func_hash->filter_hash;
 	int err;
@@ -5823,6 +5824,9 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	/* cleanup for possible another register call */
 	ops->func = NULL;
 	ops->trampoline = 0;
+
+	if (free_filters)
+		ftrace_free_filter(ops);
 	return err;
 }
 EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c
index b58c594efb51..196b43971cb5 100644
--- a/samples/ftrace/ftrace-direct-multi-modify.c
+++ b/samples/ftrace/ftrace-direct-multi-modify.c
@@ -151,8 +151,7 @@ static int __init ftrace_direct_multi_init(void)
 static void __exit ftrace_direct_multi_exit(void)
 {
 	kthread_stop(simple_tsk);
-	unregister_ftrace_direct_multi(&direct, my_tramp);
-	ftrace_free_filter(&direct);
+	unregister_ftrace_direct_multi(&direct, my_tramp, true);
 }
 
 module_init(ftrace_direct_multi_init);
diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c
index c27cf130c319..ea0e88ee5e43 100644
--- a/samples/ftrace/ftrace-direct-multi.c
+++ b/samples/ftrace/ftrace-direct-multi.c
@@ -78,8 +78,7 @@ static int __init ftrace_direct_multi_init(void)
 
 static void __exit ftrace_direct_multi_exit(void)
 {
-	unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
-	ftrace_free_filter(&direct);
+	unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp, true);
 }
 
 module_init(ftrace_direct_multi_init);
-- 
cgit v1.2.3


From 8788ca164eb4bad9a2a82c7778223c15ef800b8b Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 21 Mar 2023 15:04:20 +0100
Subject: ftrace: Remove the legacy _ftrace_direct API

This API relies on a single global ops, used for all direct calls
registered with it. However, to implement arm64 direct calls, we need
each ops to point to a single direct call trampoline.

Link: https://lkml.kernel.org/r/20230321140424.345218-4-revest@chromium.org

Signed-off-by: Florent Revest <revest@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  32 ----
 kernel/trace/ftrace.c  | 393 -------------------------------------------------
 2 files changed, 425 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5b68ee874bc1..2f400c9f0787 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -397,14 +397,6 @@ struct ftrace_func_entry {
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 extern int ftrace_direct_func_count;
-int register_ftrace_direct(unsigned long ip, unsigned long addr);
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr);
-int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr);
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr);
-int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
-				struct dyn_ftrace *rec,
-				unsigned long old_addr,
-				unsigned long new_addr);
 unsigned long ftrace_find_rec_direct(unsigned long ip);
 int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
@@ -415,30 +407,6 @@ int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr
 #else
 struct ftrace_ops;
 # define ftrace_direct_func_count 0
-static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
-{
-	return -ENOTSUPP;
-}
-static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
-{
-	return -ENOTSUPP;
-}
-static inline int modify_ftrace_direct(unsigned long ip,
-				       unsigned long old_addr, unsigned long new_addr)
-{
-	return -ENOTSUPP;
-}
-static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
-{
-	return NULL;
-}
-static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
-					      struct dyn_ftrace *rec,
-					      unsigned long old_addr,
-					      unsigned long new_addr)
-{
-	return -ENODEV;
-}
 static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
 {
 	return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 98434196c6a1..fbc9abfa1a3e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2591,20 +2591,6 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
 
 	arch_ftrace_set_direct_caller(fregs, addr);
 }
-
-static struct ftrace_ops direct_ops = {
-	.func		= call_direct_funcs,
-	.flags		= FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
-			  | FTRACE_OPS_FL_PERMANENT,
-	/*
-	 * By declaring the main trampoline as this trampoline
-	 * it will never have one allocated for it. Allocated
-	 * trampolines should not call direct functions.
-	 * The direct_ops should only be called by the builtin
-	 * ftrace_regs_caller trampoline.
-	 */
-	.trampoline	= FTRACE_REGS_ADDR,
-};
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 /**
@@ -5301,387 +5287,8 @@ struct ftrace_direct_func {
 
 static LIST_HEAD(ftrace_direct_funcs);
 
-/**
- * ftrace_find_direct_func - test an address if it is a registered direct caller
- * @addr: The address of a registered direct caller
- *
- * This searches to see if a ftrace direct caller has been registered
- * at a specific address, and if so, it returns a descriptor for it.
- *
- * This can be used by architecture code to see if an address is
- * a direct caller (trampoline) attached to a fentry/mcount location.
- * This is useful for the function_graph tracer, as it may need to
- * do adjustments if it traced a location that also has a direct
- * trampoline attached to it.
- */
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
-{
-	struct ftrace_direct_func *entry;
-	bool found = false;
-
-	/* May be called by fgraph trampoline (protected by rcu tasks) */
-	list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
-		if (entry->addr == addr) {
-			found = true;
-			break;
-		}
-	}
-	if (found)
-		return entry;
-
-	return NULL;
-}
-
-static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
-{
-	struct ftrace_direct_func *direct;
-
-	direct = kmalloc(sizeof(*direct), GFP_KERNEL);
-	if (!direct)
-		return NULL;
-	direct->addr = addr;
-	direct->count = 0;
-	list_add_rcu(&direct->next, &ftrace_direct_funcs);
-	ftrace_direct_func_count++;
-	return direct;
-}
-
 static int register_ftrace_function_nolock(struct ftrace_ops *ops);
 
-/**
- * register_ftrace_direct - Call a custom trampoline directly
- * @ip: The address of the nop at the beginning of a function
- * @addr: The address of the trampoline to call at @ip
- *
- * This is used to connect a direct call from the nop location (@ip)
- * at the start of ftrace traced functions. The location that it calls
- * (@addr) must be able to handle a direct call, and save the parameters
- * of the function being traced, and restore them (or inject new ones
- * if needed), before returning.
- *
- * Returns:
- *  0 on success
- *  -EBUSY - Another direct function is already attached (there can be only one)
- *  -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- *  -ENOMEM - There was an allocation failure.
- */
-int register_ftrace_direct(unsigned long ip, unsigned long addr)
-{
-	struct ftrace_direct_func *direct;
-	struct ftrace_func_entry *entry;
-	struct ftrace_hash *free_hash = NULL;
-	struct dyn_ftrace *rec;
-	int ret = -ENODEV;
-
-	mutex_lock(&direct_mutex);
-
-	ip = ftrace_location(ip);
-	if (!ip)
-		goto out_unlock;
-
-	/* See if there's a direct function at @ip already */
-	ret = -EBUSY;
-	if (ftrace_find_rec_direct(ip))
-		goto out_unlock;
-
-	ret = -ENODEV;
-	rec = lookup_rec(ip, ip);
-	if (!rec)
-		goto out_unlock;
-
-	/*
-	 * Check if the rec says it has a direct call but we didn't
-	 * find one earlier?
-	 */
-	if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
-		goto out_unlock;
-
-	/* Make sure the ip points to the exact record */
-	if (ip != rec->ip) {
-		ip = rec->ip;
-		/* Need to check this ip for a direct. */
-		if (ftrace_find_rec_direct(ip))
-			goto out_unlock;
-	}
-
-	ret = -ENOMEM;
-	direct = ftrace_find_direct_func(addr);
-	if (!direct) {
-		direct = ftrace_alloc_direct_func(addr);
-		if (!direct)
-			goto out_unlock;
-	}
-
-	entry = ftrace_add_rec_direct(ip, addr, &free_hash);
-	if (!entry)
-		goto out_unlock;
-
-	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-
-	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
-		ret = register_ftrace_function_nolock(&direct_ops);
-		if (ret)
-			ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
-	}
-
-	if (ret) {
-		remove_hash_entry(direct_functions, entry);
-		kfree(entry);
-		if (!direct->count) {
-			list_del_rcu(&direct->next);
-			synchronize_rcu_tasks();
-			kfree(direct);
-			if (free_hash)
-				free_ftrace_hash(free_hash);
-			free_hash = NULL;
-			ftrace_direct_func_count--;
-		}
-	} else {
-		direct->count++;
-	}
- out_unlock:
-	mutex_unlock(&direct_mutex);
-
-	if (free_hash) {
-		synchronize_rcu_tasks();
-		free_ftrace_hash(free_hash);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_direct);
-
-static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
-						   struct dyn_ftrace **recp)
-{
-	struct ftrace_func_entry *entry;
-	struct dyn_ftrace *rec;
-
-	rec = lookup_rec(*ip, *ip);
-	if (!rec)
-		return NULL;
-
-	entry = __ftrace_lookup_ip(direct_functions, rec->ip);
-	if (!entry) {
-		WARN_ON(rec->flags & FTRACE_FL_DIRECT);
-		return NULL;
-	}
-
-	WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
-
-	/* Passed in ip just needs to be on the call site */
-	*ip = rec->ip;
-
-	if (recp)
-		*recp = rec;
-
-	return entry;
-}
-
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
-{
-	struct ftrace_direct_func *direct;
-	struct ftrace_func_entry *entry;
-	struct ftrace_hash *hash;
-	int ret = -ENODEV;
-
-	mutex_lock(&direct_mutex);
-
-	ip = ftrace_location(ip);
-	if (!ip)
-		goto out_unlock;
-
-	entry = find_direct_entry(&ip, NULL);
-	if (!entry)
-		goto out_unlock;
-
-	hash = direct_ops.func_hash->filter_hash;
-	if (hash->count == 1)
-		unregister_ftrace_function(&direct_ops);
-
-	ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
-
-	WARN_ON(ret);
-
-	remove_hash_entry(direct_functions, entry);
-
-	direct = ftrace_find_direct_func(addr);
-	if (!WARN_ON(!direct)) {
-		/* This is the good path (see the ! before WARN) */
-		direct->count--;
-		WARN_ON(direct->count < 0);
-		if (!direct->count) {
-			list_del_rcu(&direct->next);
-			synchronize_rcu_tasks();
-			kfree(direct);
-			kfree(entry);
-			ftrace_direct_func_count--;
-		}
-	}
- out_unlock:
-	mutex_unlock(&direct_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-
-static struct ftrace_ops stub_ops = {
-	.func		= ftrace_stub,
-};
-
-/**
- * ftrace_modify_direct_caller - modify ftrace nop directly
- * @entry: The ftrace hash entry of the direct helper for @rec
- * @rec: The record representing the function site to patch
- * @old_addr: The location that the site at @rec->ip currently calls
- * @new_addr: The location that the site at @rec->ip should call
- *
- * An architecture may overwrite this function to optimize the
- * changing of the direct callback on an ftrace nop location.
- * This is called with the ftrace_lock mutex held, and no other
- * ftrace callbacks are on the associated record (@rec). Thus,
- * it is safe to modify the ftrace record, where it should be
- * currently calling @old_addr directly, to call @new_addr.
- *
- * This is called with direct_mutex locked.
- *
- * Safety checks should be made to make sure that the code at
- * @rec->ip is currently calling @old_addr. And this must
- * also update entry->direct to @new_addr.
- */
-int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
-				       struct dyn_ftrace *rec,
-				       unsigned long old_addr,
-				       unsigned long new_addr)
-{
-	unsigned long ip = rec->ip;
-	int ret;
-
-	lockdep_assert_held(&direct_mutex);
-
-	/*
-	 * The ftrace_lock was used to determine if the record
-	 * had more than one registered user to it. If it did,
-	 * we needed to prevent that from changing to do the quick
-	 * switch. But if it did not (only a direct caller was attached)
-	 * then this function is called. But this function can deal
-	 * with attached callers to the rec that we care about, and
-	 * since this function uses standard ftrace calls that take
-	 * the ftrace_lock mutex, we need to release it.
-	 */
-	mutex_unlock(&ftrace_lock);
-
-	/*
-	 * By setting a stub function at the same address, we force
-	 * the code to call the iterator and the direct_ops helper.
-	 * This means that @ip does not call the direct call, and
-	 * we can simply modify it.
-	 */
-	ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
-	if (ret)
-		goto out_lock;
-
-	ret = register_ftrace_function_nolock(&stub_ops);
-	if (ret) {
-		ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-		goto out_lock;
-	}
-
-	entry->direct = new_addr;
-
-	/*
-	 * By removing the stub, we put back the direct call, calling
-	 * the @new_addr.
-	 */
-	unregister_ftrace_function(&stub_ops);
-	ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-
- out_lock:
-	mutex_lock(&ftrace_lock);
-
-	return ret;
-}
-
-/**
- * modify_ftrace_direct - Modify an existing direct call to call something else
- * @ip: The instruction pointer to modify
- * @old_addr: The address that the current @ip calls directly
- * @new_addr: The address that the @ip should call
- *
- * This modifies a ftrace direct caller at an instruction pointer without
- * having to disable it first. The direct call will switch over to the
- * @new_addr without missing anything.
- *
- * Returns: zero on success. Non zero on error, which includes:
- *  -ENODEV : the @ip given has no direct caller attached
- *  -EINVAL : the @old_addr does not match the current direct caller
- */
-int modify_ftrace_direct(unsigned long ip,
-			 unsigned long old_addr, unsigned long new_addr)
-{
-	struct ftrace_direct_func *direct, *new_direct = NULL;
-	struct ftrace_func_entry *entry;
-	struct dyn_ftrace *rec;
-	int ret = -ENODEV;
-
-	mutex_lock(&direct_mutex);
-
-	mutex_lock(&ftrace_lock);
-
-	ip = ftrace_location(ip);
-	if (!ip)
-		goto out_unlock;
-
-	entry = find_direct_entry(&ip, &rec);
-	if (!entry)
-		goto out_unlock;
-
-	ret = -EINVAL;
-	if (entry->direct != old_addr)
-		goto out_unlock;
-
-	direct = ftrace_find_direct_func(old_addr);
-	if (WARN_ON(!direct))
-		goto out_unlock;
-	if (direct->count > 1) {
-		ret = -ENOMEM;
-		new_direct = ftrace_alloc_direct_func(new_addr);
-		if (!new_direct)
-			goto out_unlock;
-		direct->count--;
-		new_direct->count++;
-	} else {
-		direct->addr = new_addr;
-	}
-
-	/*
-	 * If there's no other ftrace callback on the rec->ip location,
-	 * then it can be changed directly by the architecture.
-	 * If there is another caller, then we just need to change the
-	 * direct caller helper to point to @new_addr.
-	 */
-	if (ftrace_rec_count(rec) == 1) {
-		ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
-	} else {
-		entry->direct = new_addr;
-		ret = 0;
-	}
-
-	if (unlikely(ret && new_direct)) {
-		direct->count++;
-		list_del_rcu(&new_direct->next);
-		synchronize_rcu_tasks();
-		kfree(new_direct);
-		ftrace_direct_func_count--;
-	}
-
- out_unlock:
-	mutex_unlock(&ftrace_lock);
-	mutex_unlock(&direct_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-
 #define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
 
 static int check_direct_multi(struct ftrace_ops *ops)
-- 
cgit v1.2.3


From da8bdfbd422333fbb7c85ac1d7f18592d17d6665 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 21 Mar 2023 15:04:21 +0100
Subject: ftrace: Rename _ftrace_direct_multi APIs to _ftrace_direct APIs

Now that the original _ftrace_direct APIs are gone, the "_multi"
suffixes only add confusion.

Link: https://lkml.kernel.org/r/20230321140424.345218-5-revest@chromium.org

Signed-off-by: Florent Revest <revest@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h                      | 20 ++++++++---------
 kernel/bpf/trampoline.c                     | 12 +++++-----
 kernel/trace/ftrace.c                       | 34 ++++++++++++++---------------
 kernel/trace/trace_selftest.c               |  9 ++++----
 samples/Kconfig                             |  2 +-
 samples/ftrace/ftrace-direct-modify.c       |  8 +++----
 samples/ftrace/ftrace-direct-multi-modify.c |  8 +++----
 samples/ftrace/ftrace-direct-multi.c        |  4 ++--
 samples/ftrace/ftrace-direct-too.c          |  6 ++---
 samples/ftrace/ftrace-direct.c              |  6 ++---
 10 files changed, 55 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2f400c9f0787..abee60865fc7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -398,11 +398,11 @@ struct ftrace_func_entry {
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 extern int ftrace_direct_func_count;
 unsigned long ftrace_find_rec_direct(unsigned long ip);
-int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
-				   bool free_filters);
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
-int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr);
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+			     bool free_filters);
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
 
 #else
 struct ftrace_ops;
@@ -411,20 +411,20 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
 {
 	return 0;
 }
-static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static inline int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
 	return -ENODEV;
 }
-static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
-						 bool free_filters)
+static inline int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+					   bool free_filters)
 {
 	return -ENODEV;
 }
-static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static inline int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
 	return -ENODEV;
 }
-static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
 {
 	return -ENODEV;
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 88bc23f1e10a..a14d0af534b3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -45,8 +45,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd
 		lockdep_assert_held_once(&tr->mutex);
 
 		/* Instead of updating the trampoline here, we propagate
-		 * -EAGAIN to register_ftrace_direct_multi(). Then we can
-		 * retry register_ftrace_direct_multi() after updating the
+		 * -EAGAIN to register_ftrace_direct(). Then we can
+		 * retry register_ftrace_direct() after updating the
 		 * trampoline.
 		 */
 		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
@@ -198,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	int ret;
 
 	if (tr->func.ftrace_managed)
-		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr, false);
+		ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
@@ -215,9 +215,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
 
 	if (tr->func.ftrace_managed) {
 		if (lock_direct_mutex)
-			ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+			ret = modify_ftrace_direct(tr->fops, (long)new_addr);
 		else
-			ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+			ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
 	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
 	}
@@ -243,7 +243,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 
 	if (tr->func.ftrace_managed) {
 		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
-		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+		ret = register_ftrace_direct(tr->fops, (long)new_addr);
 	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 	}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fbc9abfa1a3e..845c4012630f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5318,7 +5318,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
 }
 
 /**
- * register_ftrace_direct_multi - Call a custom trampoline directly
+ * register_ftrace_direct - Call a custom trampoline directly
  * for multiple functions registered in @ops
  * @ops: The address of the struct ftrace_ops object
  * @addr: The address of the trampoline to call at @ops functions
@@ -5339,7 +5339,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
  *  -ENODEV  - @ip does not point to a ftrace nop location (or not supported)
  *  -ENOMEM  - There was an allocation failure.
  */
-int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
 	struct ftrace_hash *hash, *free_hash = NULL;
 	struct ftrace_func_entry *entry, *new;
@@ -5397,11 +5397,11 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	}
 	return err;
 }
-EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
 
 /**
- * unregister_ftrace_direct_multi - Remove calls to custom trampoline
- * previously registered by register_ftrace_direct_multi for @ops object.
+ * unregister_ftrace_direct - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct for @ops object.
  * @ops: The address of the struct ftrace_ops object
  *
  * This is used to remove a direct calls to @addr from the nop locations
@@ -5412,8 +5412,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
  *  0 on success
  *  -EINVAL - The @ops object was not properly registered.
  */
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
-				   bool free_filters)
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+			     bool free_filters)
 {
 	struct ftrace_hash *hash = ops->func_hash->filter_hash;
 	int err;
@@ -5436,10 +5436,10 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr,
 		ftrace_free_filter(ops);
 	return err;
 }
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
 
 static int
-__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
 	struct ftrace_hash *hash;
 	struct ftrace_func_entry *entry, *iter;
@@ -5486,7 +5486,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 }
 
 /**
- * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
  * to call something else
  * @ops: The address of the struct ftrace_ops object
  * @addr: The address of the new trampoline to call at @ops functions
@@ -5503,19 +5503,19 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
  * Returns: zero on success. Non zero on error, which includes:
  *  -EINVAL - The @ops object was not properly registered.
  */
-int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
 {
 	if (check_direct_multi(ops))
 		return -EINVAL;
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
 		return -EINVAL;
 
-	return __modify_ftrace_direct_multi(ops, addr);
+	return __modify_ftrace_direct(ops, addr);
 }
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
 
 /**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * modify_ftrace_direct - Modify an existing direct 'multi' call
  * to call something else
  * @ops: The address of the struct ftrace_ops object
  * @addr: The address of the new trampoline to call at @ops functions
@@ -5529,7 +5529,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
  * Returns: zero on success. Non zero on error, which includes:
  *  -EINVAL - The @ops object was not properly registered.
  */
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
 	int err;
 
@@ -5539,11 +5539,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 		return -EINVAL;
 
 	mutex_lock(&direct_mutex);
-	err = __modify_ftrace_direct_multi(ops, addr);
+	err = __modify_ftrace_direct(ops, addr);
 	mutex_unlock(&direct_mutex);
 	return err;
 }
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 /**
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 9ce80b3ad06d..84cd7ba31d27 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -872,7 +872,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	 * and make sure we get graph trace.
 	 */
 	ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
-	ret = register_ftrace_direct_multi(&direct, (unsigned long)trace_direct_tramp);
+	ret = register_ftrace_direct(&direct,
+				     (unsigned long)trace_direct_tramp);
 	if (ret)
 		goto out;
 
@@ -892,9 +893,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 
 	unregister_ftrace_graph(&fgraph_ops);
 
-	ret = unregister_ftrace_direct_multi(&direct,
-					     (unsigned long) trace_direct_tramp,
-					     true);
+	ret = unregister_ftrace_direct(&direct,
+				       (unsigned long) trace_direct_tramp,
+				       true);
 	if (ret)
 		goto out;
 
diff --git a/samples/Kconfig b/samples/Kconfig
index 30ef8bd48ba3..fd24daa99f34 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -38,7 +38,7 @@ config SAMPLE_FTRACE_DIRECT
 	  that hooks to wake_up_process and prints the parameters.
 
 config SAMPLE_FTRACE_DIRECT_MULTI
-	tristate "Build register_ftrace_direct_multi() example"
+	tristate "Build register_ftrace_direct() on multiple ips example"
 	depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
 	depends on HAVE_SAMPLE_FTRACE_DIRECT_MULTI
 	help
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c
index f01ac74bac10..25fba66f61c0 100644
--- a/samples/ftrace/ftrace-direct-modify.c
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -116,7 +116,7 @@ static int simple_thread(void *arg)
 		if (ret)
 			continue;
 		t ^= 1;
-		ret = modify_ftrace_direct_multi(&direct, tramps[t]);
+		ret = modify_ftrace_direct(&direct, tramps[t]);
 		if (!ret)
 			my_tramp = tramps[t];
 		WARN_ON_ONCE(ret);
@@ -132,7 +132,7 @@ static int __init ftrace_direct_init(void)
 	int ret;
 
 	ftrace_set_filter_ip(&direct, (unsigned long) my_ip, 0, 0);
-	ret = register_ftrace_direct_multi(&direct, my_tramp);
+	ret = register_ftrace_direct(&direct, my_tramp);
 
 	if (!ret)
 		simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn");
@@ -142,12 +142,12 @@ static int __init ftrace_direct_init(void)
 static void __exit ftrace_direct_exit(void)
 {
 	kthread_stop(simple_tsk);
-	unregister_ftrace_direct_multi(&direct, my_tramp, true);
+	unregister_ftrace_direct(&direct, my_tramp, true);
 }
 
 module_init(ftrace_direct_init);
 module_exit(ftrace_direct_exit);
 
 MODULE_AUTHOR("Steven Rostedt");
-MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct_multi()");
+MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()");
 MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c
index 196b43971cb5..f72623899602 100644
--- a/samples/ftrace/ftrace-direct-multi-modify.c
+++ b/samples/ftrace/ftrace-direct-multi-modify.c
@@ -123,7 +123,7 @@ static int simple_thread(void *arg)
 		if (ret)
 			continue;
 		t ^= 1;
-		ret = modify_ftrace_direct_multi(&direct, tramps[t]);
+		ret = modify_ftrace_direct(&direct, tramps[t]);
 		if (!ret)
 			my_tramp = tramps[t];
 		WARN_ON_ONCE(ret);
@@ -141,7 +141,7 @@ static int __init ftrace_direct_multi_init(void)
 	ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
 	ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0);
 
-	ret = register_ftrace_direct_multi(&direct, my_tramp);
+	ret = register_ftrace_direct(&direct, my_tramp);
 
 	if (!ret)
 		simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn");
@@ -151,12 +151,12 @@ static int __init ftrace_direct_multi_init(void)
 static void __exit ftrace_direct_multi_exit(void)
 {
 	kthread_stop(simple_tsk);
-	unregister_ftrace_direct_multi(&direct, my_tramp, true);
+	unregister_ftrace_direct(&direct, my_tramp, true);
 }
 
 module_init(ftrace_direct_multi_init);
 module_exit(ftrace_direct_multi_exit);
 
 MODULE_AUTHOR("Jiri Olsa");
-MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct_multi()");
+MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()");
 MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c
index ea0e88ee5e43..1547c2c6be02 100644
--- a/samples/ftrace/ftrace-direct-multi.c
+++ b/samples/ftrace/ftrace-direct-multi.c
@@ -73,12 +73,12 @@ static int __init ftrace_direct_multi_init(void)
 	ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
 	ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0);
 
-	return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
+	return register_ftrace_direct(&direct, (unsigned long) my_tramp);
 }
 
 static void __exit ftrace_direct_multi_exit(void)
 {
-	unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp, true);
+	unregister_ftrace_direct(&direct, (unsigned long) my_tramp, true);
 }
 
 module_init(ftrace_direct_multi_init);
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
index 05c3585ac15e..f28e7b99840f 100644
--- a/samples/ftrace/ftrace-direct-too.c
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -76,17 +76,17 @@ static int __init ftrace_direct_init(void)
 {
 	ftrace_set_filter_ip(&direct, (unsigned long) handle_mm_fault, 0, 0);
 
-	return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
+	return register_ftrace_direct(&direct, (unsigned long) my_tramp);
 }
 
 static void __exit ftrace_direct_exit(void)
 {
-	unregister_ftrace_direct_multi(&direct, (unsigned long)my_tramp, true);
+	unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true);
 }
 
 module_init(ftrace_direct_init);
 module_exit(ftrace_direct_exit);
 
 MODULE_AUTHOR("Steven Rostedt");
-MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct_multi()");
+MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()");
 MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c
index 42ec9e39453b..d81a9473b585 100644
--- a/samples/ftrace/ftrace-direct.c
+++ b/samples/ftrace/ftrace-direct.c
@@ -69,17 +69,17 @@ static int __init ftrace_direct_init(void)
 {
 	ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
 
-	return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
+	return register_ftrace_direct(&direct, (unsigned long) my_tramp);
 }
 
 static void __exit ftrace_direct_exit(void)
 {
-	unregister_ftrace_direct_multi(&direct, (unsigned long)my_tramp, true);
+	unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true);
 }
 
 module_init(ftrace_direct_init);
 module_exit(ftrace_direct_exit);
 
 MODULE_AUTHOR("Steven Rostedt");
-MODULE_DESCRIPTION("Example use case of using register_ftrace_direct_multi()");
+MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From dbaccb618fabde8b8596e341f8d76da63a9b0c2f Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 21 Mar 2023 15:04:22 +0100
Subject: ftrace: Store direct called addresses in their ops

All direct calls are now registered using the register_ftrace_direct API
so each ops can jump to only one direct-called trampoline.

By storing the direct called trampoline address directly in the ops we
can save one hashmap lookup in the direct call ops and implement arm64
direct calls on top of call ops.

Link: https://lkml.kernel.org/r/20230321140424.345218-6-revest@chromium.org

Signed-off-by: Florent Revest <revest@chromium.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 3 +++
 kernel/trace/ftrace.c  | 7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index abee60865fc7..6a532dd6789e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -321,6 +321,9 @@ struct ftrace_ops {
 	unsigned long			trampoline_size;
 	struct list_head		list;
 	ftrace_ops_func_t		ops_func;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	unsigned long			direct_call;
+#endif
 #endif
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 845c4012630f..3bef2abc037a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2583,9 +2583,8 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
 static void call_direct_funcs(unsigned long ip, unsigned long pip,
 			      struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
-	unsigned long addr;
+	unsigned long addr = READ_ONCE(ops->direct_call);
 
-	addr = ftrace_find_rec_direct(ip);
 	if (!addr)
 		return;
 
@@ -5381,6 +5380,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 	ops->func = call_direct_funcs;
 	ops->flags = MULTI_FLAGS;
 	ops->trampoline = FTRACE_REGS_ADDR;
+	ops->direct_call = addr;
 
 	err = register_ftrace_function_nolock(ops);
 
@@ -5455,6 +5455,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 	/* Enable the tmp_ops to have the same functions as the direct ops */
 	ftrace_ops_init(&tmp_ops);
 	tmp_ops.func_hash = ops->func_hash;
+	tmp_ops.direct_call = addr;
 
 	err = register_ftrace_function_nolock(&tmp_ops);
 	if (err)
@@ -5476,6 +5477,8 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 			entry->direct = addr;
 		}
 	}
+	/* Prevent store tearing if a trampoline concurrently accesses the value */
+	WRITE_ONCE(ops->direct_call, addr);
 
 	mutex_unlock(&ftrace_lock);
 
-- 
cgit v1.2.3


From 60c8971899f3b34ad24857913c0784dab08962f0 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 21 Mar 2023 15:04:23 +0100
Subject: ftrace: Make DIRECT_CALLS work WITH_ARGS and !WITH_REGS

Direct called trampolines can be called in two ways:
- either from the ftrace callsite. In this case, they do not access any
  struct ftrace_regs nor pt_regs
- Or, if a ftrace ops is also attached, from the end of a ftrace
  trampoline. In this case, the call_direct_funcs ops is in charge of
  setting the direct call trampoline's address in a struct ftrace_regs

Since:

commit 9705bc709604 ("ftrace: pass fregs to arch_ftrace_set_direct_caller()")

The later case no longer requires a full pt_regs. It only needs a struct
ftrace_regs so DIRECT_CALLS can work with both WITH_ARGS or WITH_REGS.
With architectures like arm64 already abandoning WITH_REGS in favor of
WITH_ARGS, it's important to have DIRECT_CALLS work WITH_ARGS only.

Link: https://lkml.kernel.org/r/20230321140424.345218-7-revest@chromium.org

Signed-off-by: Florent Revest <revest@chromium.org>
Co-developed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 6 ++++++
 kernel/trace/Kconfig   | 2 +-
 kernel/trace/ftrace.c  | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6a532dd6789e..31f1e1df2af3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -241,6 +241,12 @@ enum {
 	FTRACE_OPS_FL_DIRECT			= BIT(17),
 };
 
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
+#define FTRACE_OPS_FL_SAVE_ARGS                        FTRACE_OPS_FL_SAVE_REGS
+#else
+#define FTRACE_OPS_FL_SAVE_ARGS                        0
+#endif
+
 /*
  * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
  * to a ftrace_ops. Note, the requests may fail.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a856d4a34c67..5b1e7fa41ca8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -257,7 +257,7 @@ config DYNAMIC_FTRACE_WITH_REGS
 
 config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	def_bool y
-	depends on DYNAMIC_FTRACE_WITH_REGS
+	depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
 	depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 
 config DYNAMIC_FTRACE_WITH_CALL_OPS
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3bef2abc037a..3b46dba3f69b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5288,7 +5288,7 @@ static LIST_HEAD(ftrace_direct_funcs);
 
 static int register_ftrace_function_nolock(struct ftrace_ops *ops);
 
-#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
 
 static int check_direct_multi(struct ftrace_ops *ops)
 {
-- 
cgit v1.2.3


From fee86a4ed536f4e521f3a4530242e152dd2a466b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 21 Mar 2023 15:04:24 +0100
Subject: ftrace: selftest: remove broken trace_direct_tramp

The ftrace selftest code has a trace_direct_tramp() function which it
uses as a direct call trampoline. This happens to work on x86, since the
direct call's return address is in the usual place, and can be returned
to via a RET, but in general the calling convention for direct calls is
different from regular function calls, and requires a trampoline written
in assembly.

On s390, regular function calls place the return address in %r14, and an
ftrace patch-site in an instrumented function places the trampoline's
return address (which is within the instrumented function) in %r0,
preserving the original %r14 value in-place. As a regular C function
will return to the address in %r14, using a C function as the trampoline
results in the trampoline returning to the caller of the instrumented
function, skipping the body of the instrumented function.

Note that the s390 issue is not detcted by the ftrace selftest code, as
the instrumented function is trivial, and returning back into the caller
happens to be equivalent.

On arm64, regular function calls place the return address in x30, and
an ftrace patch-site in an instrumented function saves this into r9
and places the trampoline's return address (within the instrumented
function) in x30. A regular C function will return to the address in
x30, but will not restore x9 into x30. Consequently, using a C function
as the trampoline results in returning to the trampoline's return
address having corrupted x30, such that when the instrumented function
returns, it will return back into itself.

To avoid future issues in this area, remove the trace_direct_tramp()
function, and require that each architecture with direct calls provides
a stub trampoline, named ftrace_stub_direct_tramp. This can be written
to handle the architecture's trampoline calling convention, and in
future could be used elsewhere (e.g. in the ftrace ops sample, to
measure the overhead of direct calls), so we may as well always build it
in.

Link: https://lkml.kernel.org/r/20230321140424.345218-8-revest@chromium.org

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Li Huafei <lihuafei1@huawei.com>
Cc: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Florent Revest <revest@chromium.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/s390/kernel/mcount.S     |  5 +++++
 arch/x86/kernel/ftrace_32.S   |  5 +++++
 arch/x86/kernel/ftrace_64.S   |  4 ++++
 include/linux/ftrace.h        |  2 ++
 kernel/trace/trace_selftest.c | 12 ++----------
 5 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index 43ff91073d2a..6c10da43b538 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -32,6 +32,11 @@ ENTRY(ftrace_stub)
 	BR_EX	%r14
 ENDPROC(ftrace_stub)
 
+SYM_CODE_START(ftrace_stub_direct_tramp)
+	lgr	%r1, %r0
+	BR_EX	%r1
+SYM_CODE_END(ftrace_stub_direct_tramp)
+
 	.macro	ftrace_regs_entry, allregs=0
 	stg	%r14,(__SF_GPRS+8*8)(%r15)	# save traced function caller
 
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index a0ed0e4a2c0c..0d9a14528176 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -163,6 +163,11 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
 	jmp	.Lftrace_ret
 SYM_CODE_END(ftrace_regs_caller)
 
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+	CALL_DEPTH_ACCOUNT
+	RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 SYM_CODE_START(ftrace_graph_caller)
 	pushl	%eax
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index fb4f1e01b64a..970d8445fdc4 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -309,6 +309,10 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 SYM_FUNC_END(ftrace_regs_caller)
 STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
 
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+	CALL_DEPTH_ACCOUNT
+	RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 31f1e1df2af3..931f3d904529 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -413,6 +413,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
 int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
 int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
 
+void ftrace_stub_direct_tramp(void);
+
 #else
 struct ftrace_ops;
 # define ftrace_direct_func_count 0
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 84cd7ba31d27..a931d9aaea26 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -786,14 +786,6 @@ static struct fgraph_ops fgraph_ops __initdata  = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 static struct ftrace_ops direct;
-#ifndef CALL_DEPTH_ACCOUNT
-#define CALL_DEPTH_ACCOUNT ""
-#endif
-
-noinline __noclone static void trace_direct_tramp(void)
-{
-	asm(CALL_DEPTH_ACCOUNT);
-}
 #endif
 
 /*
@@ -873,7 +865,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	 */
 	ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
 	ret = register_ftrace_direct(&direct,
-				     (unsigned long)trace_direct_tramp);
+				     (unsigned long)ftrace_stub_direct_tramp);
 	if (ret)
 		goto out;
 
@@ -894,7 +886,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	unregister_ftrace_graph(&fgraph_ops);
 
 	ret = unregister_ftrace_direct(&direct,
-				       (unsigned long) trace_direct_tramp,
+				       (unsigned long)ftrace_stub_direct_tramp,
 				       true);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From e11b521a7b69c2621bb2e5920bb96f6d2facdc7e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 24 Jan 2023 09:56:53 -0500
Subject: ftrace: Show a list of all functions that have ever been enabled

When debugging a crash that appears to be related to ftrace, but not for
sure, it is useful to know if a function was ever enabled by ftrace or
not. It could be that a BPF program was attached to it, or possibly a live
patch.

We are having crashes in the field where this information is not always
known. But having ftrace set a flag if a function has ever been attached
since boot up helps tremendously in trying to know if a crash had to do
with something using ftrace.

For analyzing crashes, the use of a kdump image can have access to the
flags. When looking at issues where the kernel did not panic, the
touched_functions file can simply be used.

Link: https://lore.kernel.org/linux-trace-kernel/20230124095653.6fd1640e@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Chris Li <chriscli@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  5 ++++-
 kernel/trace/ftrace.c  | 51 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 931f3d904529..327046f1278d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -548,6 +548,7 @@ bool is_ftrace_trampoline(unsigned long addr);
  *  DIRECT   - there is a direct function to call
  *  CALL_OPS - the record can use callsite-specific ops
  *  CALL_OPS_EN - the function is set up to use callsite-specific ops
+ *  TOUCHED  - A callback was added since boot up
  *
  * When a new ftrace_ops is registered and wants a function to save
  * pt_regs, the rec->flags REGS is set. When the function has been
@@ -567,9 +568,10 @@ enum {
 	FTRACE_FL_DIRECT_EN	= (1UL << 23),
 	FTRACE_FL_CALL_OPS	= (1UL << 22),
 	FTRACE_FL_CALL_OPS_EN	= (1UL << 21),
+	FTRACE_FL_TOUCHED	= (1UL << 20),
 };
 
-#define FTRACE_REF_MAX_SHIFT	21
+#define FTRACE_REF_MAX_SHIFT	20
 #define FTRACE_REF_MAX		((1UL << FTRACE_REF_MAX_SHIFT) - 1)
 
 #define ftrace_rec_count(rec)	((rec)->flags & FTRACE_REF_MAX)
@@ -628,6 +630,7 @@ enum {
 	FTRACE_ITER_PROBE	= (1 << 4),
 	FTRACE_ITER_MOD		= (1 << 5),
 	FTRACE_ITER_ENABLED	= (1 << 6),
+	FTRACE_ITER_TOUCHED	= (1 << 7),
 };
 
 void arch_ftrace_update_code(int command);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3b46dba3f69b..db8532a4d5c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,9 @@
 #include "trace_output.h"
 #include "trace_stat.h"
 
+/* Flags that do not get reset */
+#define FTRACE_NOCLEAR_FLAGS	(FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+
 #define FTRACE_INVALID_FUNCTION		"__ftrace_invalid_address__"
 
 #define FTRACE_WARN_ON(cond)			\
@@ -2256,7 +2259,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 		flag ^= rec->flags & FTRACE_FL_ENABLED;
 
 		if (update) {
-			rec->flags |= FTRACE_FL_ENABLED;
+			rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
 			if (flag & FTRACE_FL_REGS) {
 				if (rec->flags & FTRACE_FL_REGS)
 					rec->flags |= FTRACE_FL_REGS_EN;
@@ -2326,7 +2329,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 	if (update) {
 		/* If there's no more users, clear all flags */
 		if (!ftrace_rec_count(rec))
-			rec->flags &= FTRACE_FL_DISABLED;
+			rec->flags &= FTRACE_NOCLEAR_FLAGS;
 		else
 			/*
 			 * Just disable the record, but keep the ops TRAMP
@@ -3147,7 +3150,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		struct dyn_ftrace *rec;
 
 		do_for_each_ftrace_rec(pg, rec) {
-			if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+			if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
 				pr_warn("  %pS flags:%lx\n",
 					(void *)rec->ip, rec->flags);
 		} while_for_each_ftrace_rec();
@@ -3598,7 +3601,10 @@ t_func_next(struct seq_file *m, loff_t *pos)
 		     !ftrace_lookup_ip(iter->hash, rec->ip)) ||
 
 		    ((iter->flags & FTRACE_ITER_ENABLED) &&
-		     !(rec->flags & FTRACE_FL_ENABLED))) {
+		     !(rec->flags & FTRACE_FL_ENABLED)) ||
+
+		    ((iter->flags & FTRACE_ITER_TOUCHED) &&
+		     !(rec->flags & FTRACE_FL_TOUCHED))) {
 
 			rec = NULL;
 			goto retry;
@@ -3857,7 +3863,7 @@ static int t_show(struct seq_file *m, void *v)
 		return 0;
 	}
 
-	if (iter->flags & FTRACE_ITER_ENABLED) {
+	if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
 		struct ftrace_ops *ops;
 
 		seq_printf(m, " (%ld)%s%s%s%s",
@@ -3959,6 +3965,31 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int
+ftrace_touched_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+
+	/*
+	 * This shows us what functions have ever been enabled
+	 * (traced, direct, patched, etc). Not sure if we want lockdown
+	 * to hide such critical information for an admin.
+	 * Although, perhaps it can show information we don't
+	 * want people to see, but if something had traced
+	 * something, we probably want to know about it.
+	 */
+
+	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->flags = FTRACE_ITER_TOUCHED;
+	iter->ops = &global_ops;
+
+	return 0;
+}
+
 /**
  * ftrace_regex_open - initialize function tracer filter files
  * @ops: The ftrace_ops that hold the hash filters
@@ -5872,6 +5903,13 @@ static const struct file_operations ftrace_enabled_fops = {
 	.release = seq_release_private,
 };
 
+static const struct file_operations ftrace_touched_fops = {
+	.open = ftrace_touched_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private,
+};
+
 static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
@@ -6336,6 +6374,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
 	trace_create_file("enabled_functions", TRACE_MODE_READ,
 			d_tracer, NULL, &ftrace_enabled_fops);
 
+	trace_create_file("touched_functions", TRACE_MODE_READ,
+			d_tracer, NULL, &ftrace_touched_fops);
+
 	ftrace_create_filter_files(&global_ops, d_tracer);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-- 
cgit v1.2.3


From 0f74f8b6675cc36d689abb4d9b3d75ab4049b7d7 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 19 Mar 2023 00:33:07 +0100
Subject: i3c: Make i3c_master_unregister() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function returned zero unconditionally. Switch the return type to void
and simplify the callers accordingly.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20230318233311.265186-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c                   | 6 +-----
 drivers/i3c/master/dw-i3c-master.c     | 5 +----
 drivers/i3c/master/i3c-master-cdns.c   | 5 +----
 drivers/i3c/master/mipi-i3c-hci/core.c | 4 +++-
 drivers/i3c/master/svc-i3c-master.c    | 5 +----
 include/linux/i3c/master.h             | 2 +-
 6 files changed, 8 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 54e4c34b4a22..04d6d54d2ab8 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -2695,17 +2695,13 @@ EXPORT_SYMBOL_GPL(i3c_master_register);
  * @master: master used to send frames on the bus
  *
  * Basically undo everything done in i3c_master_register().
- *
- * Return: 0 in case of success, a negative error code otherwise.
  */
-int i3c_master_unregister(struct i3c_master_controller *master)
+void i3c_master_unregister(struct i3c_master_controller *master)
 {
 	i3c_master_i2c_adapter_cleanup(master);
 	i3c_master_unregister_i3c_devs(master);
 	i3c_master_bus_cleanup(master);
 	device_unregister(&master->dev);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(i3c_master_unregister);
 
diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 4859dd75388d..8d04b3c72b0e 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -1185,11 +1185,8 @@ err_disable_core_clk:
 static int dw_i3c_remove(struct platform_device *pdev)
 {
 	struct dw_i3c_master *master = platform_get_drvdata(pdev);
-	int ret;
 
-	ret = i3c_master_unregister(&master->base);
-	if (ret)
-		return ret;
+	i3c_master_unregister(&master->base);
 
 	reset_control_assert(master->core_rst);
 
diff --git a/drivers/i3c/master/i3c-master-cdns.c b/drivers/i3c/master/i3c-master-cdns.c
index 5b37ffe5ad5b..454925c5e097 100644
--- a/drivers/i3c/master/i3c-master-cdns.c
+++ b/drivers/i3c/master/i3c-master-cdns.c
@@ -1665,11 +1665,8 @@ err_disable_pclk:
 static int cdns_i3c_master_remove(struct platform_device *pdev)
 {
 	struct cdns_i3c_master *master = platform_get_drvdata(pdev);
-	int ret;
 
-	ret = i3c_master_unregister(&master->base);
-	if (ret)
-		return ret;
+	i3c_master_unregister(&master->base);
 
 	clk_disable_unprepare(master->sysclk);
 	clk_disable_unprepare(master->pclk);
diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c
index 6aef5ce43cc1..f9bc58366a72 100644
--- a/drivers/i3c/master/mipi-i3c-hci/core.c
+++ b/drivers/i3c/master/mipi-i3c-hci/core.c
@@ -769,7 +769,9 @@ static int i3c_hci_remove(struct platform_device *pdev)
 {
 	struct i3c_hci *hci = platform_get_drvdata(pdev);
 
-	return i3c_master_unregister(&hci->master);
+	i3c_master_unregister(&hci->master);
+
+	return 0;
 }
 
 static const __maybe_unused struct of_device_id i3c_hci_of_match[] = {
diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index d6e9ed74cdcf..e5476d04b403 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -1572,11 +1572,8 @@ err_disable_clks:
 static int svc_i3c_master_remove(struct platform_device *pdev)
 {
 	struct svc_i3c_master *master = platform_get_drvdata(pdev);
-	int ret;
 
-	ret = i3c_master_unregister(&master->base);
-	if (ret)
-		return ret;
+	i3c_master_unregister(&master->base);
 
 	pm_runtime_dont_use_autosuspend(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index a12cda9dc715..0b52da4f2346 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -542,7 +542,7 @@ int i3c_master_register(struct i3c_master_controller *master,
 			struct device *parent,
 			const struct i3c_master_controller_ops *ops,
 			bool secondary);
-int i3c_master_unregister(struct i3c_master_controller *master);
+void i3c_master_unregister(struct i3c_master_controller *master);
 
 /**
  * i3c_dev_get_master_data() - get master private data attached to an I3C
-- 
cgit v1.2.3


From fbaa38214cd9e150764ccaa82e04ecf42cc1140c Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 11 Mar 2023 15:40:01 +0100
Subject: cxl/pci: Fix CDAT retrieval on big endian

The CDAT exposed in sysfs differs between little endian and big endian
arches:  On big endian, every 4 bytes are byte-swapped.

PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
such as pci_read_config_dword() implicitly swap bytes on big endian.
That way, the macros in include/uapi/linux/pci_regs.h work regardless of
the arch's endianness.  For an example of implicit byte-swapping, see
ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
(Load Word Byte-Reverse Indexed).

DOE Read/Write Data Mailbox Registers are unlike other registers in
Configuration Space in that they contain or receive a 4 byte portion of
an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
They need to be copied to or from the request/response buffer verbatim.
So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
byte-swapping.

The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
implicit byte-swapping.  Byte-swap requests after constructing them with
those macros and byte-swap responses before parsing them.

Change the request and response type to __le32 to avoid sparse warnings.
Per a request from Jonathan, replace sizeof(u32) with sizeof(__le32) for
consistency.

Fixes: c97006046c79 ("cxl/port: Read CDAT table")
Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: stable@vger.kernel.org # v6.0+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/3051114102f41d19df3debbee123129118fc5e6d.1678543498.git.lukas@wunner.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/pci.c  | 26 +++++++++++++-------------
 drivers/pci/doe.c       | 25 ++++++++++++++-----------
 include/linux/pci-doe.h |  8 ++++++--
 3 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 7328a2552411..49a99a84b6aa 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -462,7 +462,7 @@ static struct pci_doe_mb *find_cdat_doe(struct device *uport)
 	return NULL;
 }
 
-#define CDAT_DOE_REQ(entry_handle)					\
+#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
 	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
 		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
 	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
@@ -475,8 +475,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
 }
 
 struct cdat_doe_task {
-	u32 request_pl;
-	u32 response_pl[32];
+	__le32 request_pl;
+	__le32 response_pl[32];
 	struct completion c;
 	struct pci_doe_task task;
 };
@@ -510,10 +510,10 @@ static int cxl_cdat_get_length(struct device *dev,
 		return rc;
 	}
 	wait_for_completion(&t.c);
-	if (t.task.rv < sizeof(u32))
+	if (t.task.rv < sizeof(__le32))
 		return -EIO;
 
-	*length = t.response_pl[1];
+	*length = le32_to_cpu(t.response_pl[1]);
 	dev_dbg(dev, "CDAT length %zu\n", *length);
 
 	return 0;
@@ -524,13 +524,13 @@ static int cxl_cdat_read_table(struct device *dev,
 			       struct cxl_cdat *cdat)
 {
 	size_t length = cdat->length;
-	u32 *data = cdat->table;
+	__le32 *data = cdat->table;
 	int entry_handle = 0;
 
 	do {
 		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
 		size_t entry_dw;
-		u32 *entry;
+		__le32 *entry;
 		int rc;
 
 		rc = pci_doe_submit_task(cdat_doe, &t.task);
@@ -540,21 +540,21 @@ static int cxl_cdat_read_table(struct device *dev,
 		}
 		wait_for_completion(&t.c);
 		/* 1 DW header + 1 DW data min */
-		if (t.task.rv < (2 * sizeof(u32)))
+		if (t.task.rv < (2 * sizeof(__le32)))
 			return -EIO;
 
 		/* Get the CXL table access header entry handle */
 		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
-					 t.response_pl[0]);
+					 le32_to_cpu(t.response_pl[0]));
 		entry = t.response_pl + 1;
-		entry_dw = t.task.rv / sizeof(u32);
+		entry_dw = t.task.rv / sizeof(__le32);
 		/* Skip Header */
 		entry_dw -= 1;
-		entry_dw = min(length / sizeof(u32), entry_dw);
+		entry_dw = min(length / sizeof(__le32), entry_dw);
 		/* Prevent length < 1 DW from causing a buffer overflow */
 		if (entry_dw) {
-			memcpy(data, entry, entry_dw * sizeof(u32));
-			length -= entry_dw * sizeof(u32);
+			memcpy(data, entry, entry_dw * sizeof(__le32));
+			length -= entry_dw * sizeof(__le32);
 			data += entry_dw;
 		}
 	} while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY);
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index 66d9ab288646..6f097932ccbf 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -128,7 +128,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
 		return -EIO;
 
 	/* Length is 2 DW of header + length of payload in DW */
-	length = 2 + task->request_pl_sz / sizeof(u32);
+	length = 2 + task->request_pl_sz / sizeof(__le32);
 	if (length > PCI_DOE_MAX_LENGTH)
 		return -EIO;
 	if (length == PCI_DOE_MAX_LENGTH)
@@ -141,9 +141,9 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
 	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
 			       FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH,
 					  length));
-	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
+	for (i = 0; i < task->request_pl_sz / sizeof(__le32); i++)
 		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
-				       task->request_pl[i]);
+				       le32_to_cpu(task->request_pl[i]));
 
 	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
 
@@ -195,11 +195,11 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
 
 	/* First 2 dwords have already been read */
 	length -= 2;
-	payload_length = min(length, task->response_pl_sz / sizeof(u32));
+	payload_length = min(length, task->response_pl_sz / sizeof(__le32));
 	/* Read the rest of the response payload */
 	for (i = 0; i < payload_length; i++) {
-		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
-				      &task->response_pl[i]);
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+		task->response_pl[i] = cpu_to_le32(val);
 		/* Prior to the last ack, ensure Data Object Ready */
 		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
 			return -EIO;
@@ -217,7 +217,7 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
 	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
 		return -EIO;
 
-	return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32);
+	return min(length, task->response_pl_sz / sizeof(__le32)) * sizeof(__le32);
 }
 
 static void signal_task_complete(struct pci_doe_task *task, int rv)
@@ -317,14 +317,16 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
 {
 	u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX,
 				    *index);
+	__le32 request_pl_le = cpu_to_le32(request_pl);
+	__le32 response_pl_le;
 	u32 response_pl;
 	DECLARE_COMPLETION_ONSTACK(c);
 	struct pci_doe_task task = {
 		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
 		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
-		.request_pl = &request_pl,
+		.request_pl = &request_pl_le,
 		.request_pl_sz = sizeof(request_pl),
-		.response_pl = &response_pl,
+		.response_pl = &response_pl_le,
 		.response_pl_sz = sizeof(response_pl),
 		.complete = pci_doe_task_complete,
 		.private = &c,
@@ -340,6 +342,7 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
 	if (task.rv != sizeof(response_pl))
 		return -EIO;
 
+	response_pl = le32_to_cpu(response_pl_le);
 	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
 	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
 			      response_pl);
@@ -533,8 +536,8 @@ int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
 	 * DOE requests must be a whole number of DW and the response needs to
 	 * be big enough for at least 1 DW
 	 */
-	if (task->request_pl_sz % sizeof(u32) ||
-	    task->response_pl_sz < sizeof(u32))
+	if (task->request_pl_sz % sizeof(__le32) ||
+	    task->response_pl_sz < sizeof(__le32))
 		return -EINVAL;
 
 	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index ed9b4df792b8..43765eaf2342 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -34,6 +34,10 @@ struct pci_doe_mb;
  * @work: Used internally by the mailbox
  * @doe_mb: Used internally by the mailbox
  *
+ * Payloads are treated as opaque byte streams which are transmitted verbatim,
+ * without byte-swapping.  If payloads contain little-endian register values,
+ * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
+ *
  * The payload sizes and rv are specified in bytes with the following
  * restrictions concerning the protocol.
  *
@@ -45,9 +49,9 @@ struct pci_doe_mb;
  */
 struct pci_doe_task {
 	struct pci_doe_protocol prot;
-	u32 *request_pl;
+	__le32 *request_pl;
 	size_t request_pl_sz;
-	u32 *response_pl;
+	__le32 *response_pl;
 	size_t response_pl_sz;
 	int rv;
 	void (*complete)(struct pci_doe_task *task);
-- 
cgit v1.2.3


From 12de2f50244efdbc8e98f89a340255c3c847e1dc Mon Sep 17 00:00:00 2001
From: Keguang Zhang <keguang.zhang@gmail.com>
Date: Tue, 21 Mar 2023 19:18:14 +0800
Subject: dt-bindings: clock: Add Loongson-1 clock

Add devicetree binding document and related header file
for the Loongson-1 clock.

Signed-off-by: Keguang Zhang <keguang.zhang@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230321111817.71756-2-keguang.zhang@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/clock/loongson,ls1x-clk.yaml          | 45 ++++++++++++++++++++++
 include/dt-bindings/clock/loongson,ls1x-clk.h      | 19 +++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml
 create mode 100644 include/dt-bindings/clock/loongson,ls1x-clk.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml b/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml
new file mode 100644
index 000000000000..01561a0f35d5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/loongson,ls1x-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson-1 Clock Controller
+
+maintainers:
+  - Keguang Zhang <keguang.zhang@gmail.com>
+
+properties:
+  compatible:
+    enum:
+      - loongson,ls1b-clk
+      - loongson,ls1c-clk
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    clkc: clock-controller@1fe78030 {
+        compatible = "loongson,ls1b-clk";
+        reg = <0x1fe78030 0x8>;
+
+        clocks = <&xtal>;
+        #clock-cells = <1>;
+    };
+
+...
diff --git a/include/dt-bindings/clock/loongson,ls1x-clk.h b/include/dt-bindings/clock/loongson,ls1x-clk.h
new file mode 100644
index 000000000000..d400e9ac6002
--- /dev/null
+++ b/include/dt-bindings/clock/loongson,ls1x-clk.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Loongson-1 clock tree IDs
+ *
+ * Copyright (C) 2023 Keguang Zhang <keguang.zhang@gmail.com>
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_LS1X_CLK_H__
+#define __DT_BINDINGS_CLOCK_LS1X_CLK_H__
+
+#define LS1X_CLKID_PLL	0
+#define LS1X_CLKID_CPU	1
+#define LS1X_CLKID_DC	2
+#define LS1X_CLKID_AHB	3
+#define LS1X_CLKID_APB	4
+
+#define CLK_NR_CLKS	(LS1X_CLKID_APB + 1)
+
+#endif /* __DT_BINDINGS_CLOCK_LS1X_CLK_H__ */
-- 
cgit v1.2.3


From 09eed1192cec1755967f2af8394207acdde579a1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Mar 2023 04:01:14 +0000
Subject: neighbour: switch to standard rcu, instead of rcu_bh

rcu_bh is no longer a win, especially for objects freed
with standard call_rcu().

Switch neighbour code to no longer disable BH when not necessary.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/arp.h        |  8 +++---
 include/net/ndisc.h      | 12 ++++-----
 include/net/neighbour.h  |  6 ++---
 include/net/nexthop.h    |  6 ++---
 net/core/filter.c        | 16 +++++++-----
 net/core/neighbour.c     | 64 ++++++++++++++++++++++++------------------------
 net/ipv4/fib_semantics.c |  4 +--
 net/ipv4/ip_output.c     |  6 ++---
 net/ipv4/nexthop.c       |  8 +++---
 net/ipv4/route.c         |  4 +--
 net/ipv6/addrconf.c      | 14 +++++------
 net/ipv6/ip6_output.c    | 10 ++++----
 net/ipv6/route.c         | 12 ++++-----
 13 files changed, 87 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/include/net/arp.h b/include/net/arp.h
index d7ef4ec71dfe..e8747e0713c7 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -38,11 +38,11 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
 	if (n && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
@@ -51,10 +51,10 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 void arp_init(void);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 07e5168cdaf9..52eae0943433 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -395,11 +395,11 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(dev, pkey);
 	if (n && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
@@ -409,10 +409,10 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(dev, pkey);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
@@ -420,10 +420,10 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref_stub(dev, pkey);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 /* uses ipv6_stub and is meant for use outside of IPv6 core */
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index c8d39bba2a0d..3fa5774bddac 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -299,14 +299,14 @@ static inline struct neighbour *___neigh_lookup_noref(
 	const void *pkey,
 	struct net_device *dev)
 {
-	struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
+	struct neigh_hash_table *nht = rcu_dereference(tbl->nht);
 	struct neighbour *n;
 	u32 hash_val;
 
 	hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
-	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	for (n = rcu_dereference(nht->hash_buckets[hash_val]);
 	     n != NULL;
-	     n = rcu_dereference_bh(n->next)) {
+	     n = rcu_dereference(n->next)) {
 		if (n->dev == dev && key_eq(n, pkey))
 			return n;
 	}
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 28085b995ddc..9fa291a04621 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -498,7 +498,7 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 }
 
 /* Variant of nexthop_fib6_nh().
- * Caller should either hold rcu_read_lock_bh(), or RTNL.
+ * Caller should either hold rcu_read_lock(), or RTNL.
  */
 static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
 {
@@ -507,13 +507,13 @@ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
 	if (nh->is_group) {
 		struct nh_group *nh_grp;
 
-		nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+		nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 		nh = nexthop_mpath_select(nh_grp, 0);
 		if (!nh)
 			return NULL;
 	}
 
-	nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+	nhi = rcu_dereference_rtnl(nh->nh_info);
 	if (nhi->family == AF_INET6)
 		return &nhi->fib6_nh;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index d052fac28d02..a8c8fd96c822 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2204,7 +2204,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 			return -ENOMEM;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	if (!nh) {
 		dst = skb_dst(skb);
 		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
@@ -2217,10 +2217,12 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 		int ret;
 
 		sock_confirm_neigh(skb, neigh);
+		local_bh_disable();
 		dev_xmit_recursion_inc();
 		ret = neigh_output(neigh, skb, false);
 		dev_xmit_recursion_dec();
-		rcu_read_unlock_bh();
+		local_bh_enable();
+		rcu_read_unlock();
 		return ret;
 	}
 	rcu_read_unlock_bh();
@@ -2302,7 +2304,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 			return -ENOMEM;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	if (!nh) {
 		struct dst_entry *dst = skb_dst(skb);
 		struct rtable *rt = container_of(dst, struct rtable, dst);
@@ -2314,7 +2316,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 	} else if (nh->nh_family == AF_INET) {
 		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
 	} else {
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		goto out_drop;
 	}
 
@@ -2322,13 +2324,15 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 		int ret;
 
 		sock_confirm_neigh(skb, neigh);
+		local_bh_disable();
 		dev_xmit_recursion_inc();
 		ret = neigh_output(neigh, skb, is_v6gw);
 		dev_xmit_recursion_dec();
-		rcu_read_unlock_bh();
+		local_bh_enable();
+		rcu_read_unlock();
 		return ret;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 out_drop:
 	kfree_skb(skb);
 	return -ENETDOWN;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 90d399b3f980..ddd0f32de20e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -614,7 +614,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __neigh_lookup_noref(tbl, pkey, dev);
 	if (n) {
 		if (!refcount_inc_not_zero(&n->refcnt))
@@ -622,7 +622,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 		NEIGH_CACHE_STAT_INC(tbl, hits);
 	}
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup);
@@ -2184,11 +2184,11 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
 
-		rcu_read_lock_bh();
-		nht = rcu_dereference_bh(tbl->nht);
+		rcu_read_lock();
+		nht = rcu_dereference(tbl->nht);
 		ndc.ndtc_hash_rnd = nht->hash_rnd[0];
 		ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 
 		if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
 			goto nla_put_failure;
@@ -2703,15 +2703,15 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 	if (filter->dev_idx || filter->master_idx)
 		flags |= NLM_F_DUMP_FILTERED;
 
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
+	rcu_read_lock();
+	nht = rcu_dereference(tbl->nht);
 
 	for (h = s_h; h < (1 << nht->hash_shift); h++) {
 		if (h > s_h)
 			s_idx = 0;
-		for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+		for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
 		     n != NULL;
-		     n = rcu_dereference_bh(n->next)) {
+		     n = rcu_dereference(n->next)) {
 			if (idx < s_idx || !net_eq(dev_net(n->dev), net))
 				goto next;
 			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
@@ -2730,7 +2730,7 @@ next:
 	}
 	rc = skb->len;
 out:
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	cb->args[1] = h;
 	cb->args[2] = idx;
 	return rc;
@@ -3075,20 +3075,20 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
 	int chain;
 	struct neigh_hash_table *nht;
 
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
+	rcu_read_lock();
+	nht = rcu_dereference(tbl->nht);
 
-	read_lock(&tbl->lock); /* avoid resizes */
+	read_lock_bh(&tbl->lock); /* avoid resizes */
 	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
 		struct neighbour *n;
 
-		for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+		for (n = rcu_dereference(nht->hash_buckets[chain]);
 		     n != NULL;
-		     n = rcu_dereference_bh(n->next))
+		     n = rcu_dereference(n->next))
 			cb(n, cookie);
 	}
-	read_unlock(&tbl->lock);
-	rcu_read_unlock_bh();
+	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -3138,7 +3138,7 @@ int neigh_xmit(int index, struct net_device *dev,
 		tbl = neigh_tables[index];
 		if (!tbl)
 			goto out;
-		rcu_read_lock_bh();
+		rcu_read_lock();
 		if (index == NEIGH_ARP_TABLE) {
 			u32 key = *((u32 *)addr);
 
@@ -3150,11 +3150,11 @@ int neigh_xmit(int index, struct net_device *dev,
 			neigh = __neigh_create(tbl, addr, dev, false);
 		err = PTR_ERR(neigh);
 		if (IS_ERR(neigh)) {
-			rcu_read_unlock_bh();
+			rcu_read_unlock();
 			goto out_kfree_skb;
 		}
 		err = neigh->output(neigh, skb);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 	}
 	else if (index == NEIGH_LINK_TABLE) {
 		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
@@ -3183,7 +3183,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
-		n = rcu_dereference_bh(nht->hash_buckets[bucket]);
+		n = rcu_dereference(nht->hash_buckets[bucket]);
 
 		while (n) {
 			if (!net_eq(dev_net(n->dev), net))
@@ -3201,7 +3201,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
-			n = rcu_dereference_bh(n->next);
+			n = rcu_dereference(n->next);
 		}
 
 		if (n)
@@ -3225,7 +3225,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 		if (v)
 			return n;
 	}
-	n = rcu_dereference_bh(n->next);
+	n = rcu_dereference(n->next);
 
 	while (1) {
 		while (n) {
@@ -3243,7 +3243,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
-			n = rcu_dereference_bh(n->next);
+			n = rcu_dereference(n->next);
 		}
 
 		if (n)
@@ -3252,7 +3252,7 @@ next:
 		if (++state->bucket >= (1 << nht->hash_shift))
 			break;
 
-		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
+		n = rcu_dereference(nht->hash_buckets[state->bucket]);
 	}
 
 	if (n && pos)
@@ -3354,7 +3354,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
 
 void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
 	__acquires(tbl->lock)
-	__acquires(rcu_bh)
+	__acquires(rcu)
 {
 	struct neigh_seq_state *state = seq->private;
 
@@ -3362,9 +3362,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	rcu_read_lock_bh();
-	state->nht = rcu_dereference_bh(tbl->nht);
-	read_lock(&tbl->lock);
+	rcu_read_lock();
+	state->nht = rcu_dereference(tbl->nht);
+	read_lock_bh(&tbl->lock);
 
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
@@ -3399,13 +3399,13 @@ EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
 	__releases(tbl->lock)
-	__releases(rcu_bh)
+	__releases(rcu)
 {
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
 
-	read_unlock(&tbl->lock);
-	rcu_read_unlock_bh();
+	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 574ff450c4d2..65ba18a91865 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2191,7 +2191,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
 		struct neighbour *n;
 
-		rcu_read_lock_bh();
+		rcu_read_lock();
 
 		if (likely(nh->fib_nh_gw_family == AF_INET))
 			n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
@@ -2204,7 +2204,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 		if (n)
 			state = READ_ONCE(n->nud_state);
 
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 	}
 
 	return !!(state & NUD_VALID);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cb04dbad9ea4..22a90a9392eb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -219,7 +219,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 			return res;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 	if (!IS_ERR(neigh)) {
 		int res;
@@ -227,10 +227,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 		sock_confirm_neigh(skb, neigh);
 		/* if crossing protocols, can not use the cached header */
 		res = neigh_output(neigh, skb, is_v6gw);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		return res;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
 			    __func__);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e28a99f1996b..f95142e56da0 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1124,13 +1124,13 @@ static bool ipv6_good_nh(const struct fib6_nh *nh)
 	int state = NUD_REACHABLE;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 	if (n)
 		state = READ_ONCE(n->nud_state);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return !!(state & NUD_VALID);
 }
@@ -1140,14 +1140,14 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
 	int state = NUD_REACHABLE;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 				      (__force u32)nh->fib_nh_gw4);
 	if (n)
 		state = READ_ONCE(n->nud_state);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return !!(state & NUD_VALID);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 232009d216c4..6a0a0bb452e9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -408,7 +408,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	struct net_device *dev = dst->dev;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	if (likely(rt->rt_gw_family == AF_INET)) {
 		n = ip_neigh_gw4(dev, rt->rt_gw4);
@@ -424,7 +424,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index faa47f9ea73a..31e0097878c5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1034,7 +1034,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 	unsigned int hash = inet6_addr_hash(net, &ifa->addr);
 	int err = 0;
 
-	spin_lock(&net->ipv6.addrconf_hash_lock);
+	spin_lock_bh(&net->ipv6.addrconf_hash_lock);
 
 	/* Ignore adding duplicate addresses on an interface */
 	if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
@@ -1044,7 +1044,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 		hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
 	}
 
-	spin_unlock(&net->ipv6.addrconf_hash_lock);
+	spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
 
 	return err;
 }
@@ -1139,15 +1139,15 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	/* For caller */
 	refcount_set(&ifa->refcnt, 1);
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	err = ipv6_add_addr_hash(idev->dev, ifa);
 	if (err < 0) {
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		goto out;
 	}
 
-	write_lock(&idev->lock);
+	write_lock_bh(&idev->lock);
 
 	/* Add to inet6_dev unicast addr list. */
 	ipv6_link_dev_addr(idev, ifa);
@@ -1158,9 +1158,9 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	}
 
 	in6_ifa_hold(ifa);
-	write_unlock(&idev->lock);
+	write_unlock_bh(&idev->lock);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	inet6addr_notifier_call_chain(NETDEV_UP, ifa);
 out:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e5ed39a3c65f..0b6140f0179d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -116,7 +116,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 			return res;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 
@@ -124,7 +124,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 		if (unlikely(!neigh))
 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 		if (IS_ERR(neigh)) {
-			rcu_read_unlock_bh();
+			rcu_read_unlock();
 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 			return -EINVAL;
@@ -132,7 +132,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	}
 	sock_confirm_neigh(skb, neigh);
 	ret = neigh_output(neigh, skb, false);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1150,11 +1150,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	 * dst entry of the nexthop router
 	 */
 	rt = (struct rt6_info *) *dst;
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 				      rt6_nexthop(rt, &fl6->daddr));
 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	if (err) {
 		struct inet6_ifaddr *ifp;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e829bd880384..244df77fac87 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -633,7 +633,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 
 	nh_gw = &fib6_nh->fib_nh_gw6;
 	dev = fib6_nh->fib_nh_dev;
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	last_probe = READ_ONCE(fib6_nh->last_probe);
 	idev = __in6_dev_get(dev);
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
@@ -641,7 +641,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
 			goto out;
 
-		write_lock(&neigh->lock);
+		write_lock_bh(&neigh->lock);
 		if (!(neigh->nud_state & NUD_VALID) &&
 		    time_after(jiffies,
 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
@@ -649,7 +649,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 			if (work)
 				__neigh_set_probe_once(neigh);
 		}
-		write_unlock(&neigh->lock);
+		write_unlock_bh(&neigh->lock);
 	} else if (time_after(jiffies, last_probe +
 				       idev->cnf.rtr_probe_interval)) {
 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -667,7 +667,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 	}
 
 out:
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 #else
 static inline void rt6_probe(struct fib6_nh *fib6_nh)
@@ -683,7 +683,7 @@ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 	struct neighbour *neigh;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
 					  &fib6_nh->fib_nh_gw6);
 	if (neigh) {
@@ -701,7 +701,7 @@ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
cgit v1.2.3


From fe602c87df1b6927562f4ee61edd851bb9578a49 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Mar 2023 04:01:15 +0000
Subject: net: remove rcu_dereference_bh_rtnl()

This helper is no longer used in the tree.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 92ad75549e9c..f0c87baaf6c0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -61,16 +61,6 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rcu_dereference_rtnl(p)					\
 	rcu_dereference_check(p, lockdep_rtnl_is_held())
 
-/**
- * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
- * @p: The pointer to read, prior to dereference
- *
- * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
- * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
- */
-#define rcu_dereference_bh_rtnl(p)				\
-	rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
-
 /**
  * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
  * @p: The pointer to read, prior to dereferencing
-- 
cgit v1.2.3


From 43b450632676fb60e9faeddff285d9fac94a4f58 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Mar 2023 09:18:07 +0100
Subject: open: return EINVAL for O_DIRECTORY | O_CREAT

After a couple of years and multiple LTS releases we received a report
that the behavior of O_DIRECTORY | O_CREAT changed starting with v5.7.

On kernels prior to v5.7 combinations of O_DIRECTORY, O_CREAT, O_EXCL
had the following semantics:

(1) open("/tmp/d", O_DIRECTORY | O_CREAT)
    * d doesn't exist:                create regular file
    * d exists and is a regular file: ENOTDIR
    * d exists and is a directory:    EISDIR

(2) open("/tmp/d", O_DIRECTORY | O_CREAT | O_EXCL)
    * d doesn't exist:                create regular file
    * d exists and is a regular file: EEXIST
    * d exists and is a directory:    EEXIST

(3) open("/tmp/d", O_DIRECTORY | O_EXCL)
    * d doesn't exist:                ENOENT
    * d exists and is a regular file: ENOTDIR
    * d exists and is a directory:    open directory

On kernels since to v5.7 combinations of O_DIRECTORY, O_CREAT, O_EXCL
have the following semantics:

(1) open("/tmp/d", O_DIRECTORY | O_CREAT)
    * d doesn't exist:                ENOTDIR (create regular file)
    * d exists and is a regular file: ENOTDIR
    * d exists and is a directory:    EISDIR

(2) open("/tmp/d", O_DIRECTORY | O_CREAT | O_EXCL)
    * d doesn't exist:                ENOTDIR (create regular file)
    * d exists and is a regular file: EEXIST
    * d exists and is a directory:    EEXIST

(3) open("/tmp/d", O_DIRECTORY | O_EXCL)
    * d doesn't exist:                ENOENT
    * d exists and is a regular file: ENOTDIR
    * d exists and is a directory:    open directory

This is a fairly substantial semantic change that userspace didn't
notice until Pedro took the time to deliberately figure out corner
cases. Since no one noticed this breakage we can somewhat safely assume
that O_DIRECTORY | O_CREAT combinations are likely unused.

The v5.7 breakage is especially weird because while ENOTDIR is returned
indicating failure a regular file is actually created. This doesn't make
a lot of sense.

Time was spent finding potential users of this combination. Searching on
codesearch.debian.net showed that codebases often express semantical
expectations about O_DIRECTORY | O_CREAT which are completely contrary
to what our code has done and currently does.

The expectation often is that this particular combination would create
and open a directory. This suggests users who tried to use that
combination would stumble upon the counterintuitive behavior no matter
if pre-v5.7 or post v5.7 and quickly realize neither semantics give them
what they want. For some examples see the code examples in [1] to [3]
and the discussion in [4].

There are various ways to address this issue. The lazy/simple option
would be to restore the pre-v5.7 behavior and to just live with that bug
forever. But since there's a real chance that the O_DIRECTORY | O_CREAT
quirk isn't relied upon we should try to get away with murder(ing bad
semantics) first. If we need to Frankenstein pre-v5.7 behavior later so
be it.

So let's simply return EINVAL categorically for O_DIRECTORY | O_CREAT
combinations. In addition to cleaning up the old bug this also opens up
the possiblity to make that flag combination do something more intuitive
in the future.

Starting with this commit the following semantics apply:

(1) open("/tmp/d", O_DIRECTORY | O_CREAT)
    * d doesn't exist:                EINVAL
    * d exists and is a regular file: EINVAL
    * d exists and is a directory:    EINVAL

(2) open("/tmp/d", O_DIRECTORY | O_CREAT | O_EXCL)
    * d doesn't exist:                EINVAL
    * d exists and is a regular file: EINVAL
    * d exists and is a directory:    EINVAL

(3) open("/tmp/d", O_DIRECTORY | O_EXCL)
    * d doesn't exist:                ENOENT
    * d exists and is a regular file: ENOTDIR
    * d exists and is a directory:    open directory

One additional note, O_TMPFILE is implemented as:

    #define __O_TMPFILE    020000000
    #define O_TMPFILE      (__O_TMPFILE | O_DIRECTORY)
    #define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)

For older kernels it was important to return an explicit error when
O_TMPFILE wasn't supported. So O_TMPFILE requires that O_DIRECTORY is
raised alongside __O_TMPFILE. It also enforced that O_CREAT wasn't
specified. Since O_DIRECTORY | O_CREAT could be used to create a regular
allowing that combination together with __O_TMPFILE would've meant that
false positives were possible, i.e., that a regular file was created
instead of a O_TMPFILE. This could've been used to trick userspace into
thinking it operated on a O_TMPFILE when it wasn't.

Now that we block O_DIRECTORY | O_CREAT completely the check for O_CREAT
in the __O_TMPFILE branch via if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
can be dropped. Instead we can simply check verify that O_DIRECTORY is
raised via if (!(flags & O_DIRECTORY)) and explain this in two comments.

As Aleksa pointed out O_PATH is unaffected by this change since it
always returned EINVAL if O_CREAT was specified - with or without
O_DIRECTORY.

Link: https://lore.kernel.org/lkml/20230320071442.172228-1-pedro.falcato@gmail.com
Link: https://sources.debian.org/src/flatpak/1.14.4-1/subprojects/libglnx/glnx-dirfd.c/?hl=324#L324 [1]
Link: https://sources.debian.org/src/flatpak-builder/1.2.3-1/subprojects/libglnx/glnx-shutil.c/?hl=251#L251 [2]
Link: https://sources.debian.org/src/ostree/2022.7-2/libglnx/glnx-dirfd.c/?hl=324#L324 [3]
Link: https://www.openwall.com/lists/oss-security/2014/11/26/14 [4]
Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/open.c                              | 18 +++++++++++++-----
 include/uapi/asm-generic/fcntl.h       |  1 -
 tools/include/uapi/asm-generic/fcntl.h |  1 -
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/open.c b/fs/open.c
index 4401a73d4032..4478adcc4f3a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1196,13 +1196,21 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	}
 
 	/*
-	 * In order to ensure programs get explicit errors when trying to use
-	 * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
-	 * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
-	 * have to require userspace to explicitly set it.
+	 * Block bugs where O_DIRECTORY | O_CREAT created regular files.
+	 * Note, that blocking O_DIRECTORY | O_CREAT here also protects
+	 * O_TMPFILE below which requires O_DIRECTORY being raised.
 	 */
+	if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
+		return -EINVAL;
+
+	/* Now handle the creative implementation of O_TMPFILE. */
 	if (flags & __O_TMPFILE) {
-		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+		/*
+		 * In order to ensure programs get explicit errors when trying
+		 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
+		 * is raised alongside __O_TMPFILE.
+		 */
+		if (!(flags & O_DIRECTORY))
 			return -EINVAL;
 		if (!(acc_mode & MAY_WRITE))
 			return -EINVAL;
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 1ecdb911add8..80f37a0d40d7 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -91,7 +91,6 @@
 
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
 
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
index b02c8e0f4057..1c7a0f6632c0 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -91,7 +91,6 @@
 
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
 
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
-- 
cgit v1.2.3


From 61587f1556fec39e8bafc40c8715f560639a4cf2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 21 Mar 2023 10:12:48 +0100
Subject: wifi: mac80211: add support for letting drivers register tc offload
 support

On newer MediaTek SoCs (e.g. MT7986), WLAN->WLAN or WLAN->Ethernet flows can
be offloaded by the SoC. In order to support that, the .ndo_setup_tc op is
needed.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20230321091248.30947-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  9 +++++++++
 net/mac80211/driver-ops.h  | 17 +++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 ++-
 net/mac80211/iface.c       | 11 +++++++++++
 net/mac80211/trace.h       | 25 +++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f12edca660ba..fcfe3e9aff3d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4227,6 +4227,10 @@ struct ieee80211_prep_tx_info {
  * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is
  *	not restored at HW reset by mac80211 so drivers need to take care of
  *	that.
+ * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware
+ *	flow offloading for flows originating from the vif.
+ *	Note that the driver must not assume that the vif driver_data is valid
+ *	at this point, since the callback can be called during netdev teardown.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4593,6 +4597,11 @@ struct ieee80211_ops {
 	int (*set_hw_timestamp)(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct cfg80211_set_hw_timestamp *hwts);
+	int (*net_setup_tc)(struct ieee80211_hw *hw,
+			    struct ieee80211_vif *vif,
+			    struct net_device *dev,
+			    enum tc_setup_type type,
+			    void *type_data);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index a68d606e6987..0bf208f5bbc5 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1502,6 +1502,23 @@ static inline int drv_net_fill_forward_path(struct ieee80211_local *local,
 	return ret;
 }
 
+static inline int drv_net_setup_tc(struct ieee80211_local *local,
+				   struct ieee80211_sub_if_data *sdata,
+				   struct net_device *dev,
+				   enum tc_setup_type type, void *type_data)
+{
+	int ret = -EOPNOTSUPP;
+
+	sdata = get_bss_sdata(sdata);
+	trace_drv_net_setup_tc(local, sdata, type);
+	if (local->ops->net_setup_tc)
+		ret = local->ops->net_setup_tc(&local->hw, &sdata->vif, dev,
+					       type, type_data);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
 int drv_change_vif_links(struct ieee80211_local *local,
 			 struct ieee80211_sub_if_data *sdata,
 			 u16 old_links, u16 new_links,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3d4edc25a69e..b2535614483e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1939,7 +1939,8 @@ void ieee80211_color_collision_detection_work(struct work_struct *work);
 /* interface handling */
 #define MAC80211_SUPPORTED_FEATURES_TX	(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
 					 NETIF_F_HW_CSUM | NETIF_F_SG | \
-					 NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE)
+					 NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \
+					 NETIF_F_HW_TC)
 #define MAC80211_SUPPORTED_FEATURES_RX	(NETIF_F_RXCSUM)
 #define MAC80211_SUPPORTED_FEATURES	(MAC80211_SUPPORTED_FEATURES_TX | \
 					 MAC80211_SUPPORTED_FEATURES_RX)
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 23ed13f15067..bd2c48870add 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -813,6 +813,15 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	dev_fetch_sw_netstats(stats, dev->tstats);
 }
 
+static int ieee80211_netdev_setup_tc(struct net_device *dev,
+				     enum tc_setup_type type, void *type_data)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+
+	return drv_net_setup_tc(local, sdata, dev, type, type_data);
+}
+
 static const struct net_device_ops ieee80211_dataif_ops = {
 	.ndo_open		= ieee80211_open,
 	.ndo_stop		= ieee80211_stop,
@@ -821,6 +830,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 	.ndo_set_rx_mode	= ieee80211_set_multicast_list,
 	.ndo_set_mac_address 	= ieee80211_change_mac,
 	.ndo_get_stats64	= ieee80211_get_stats64,
+	.ndo_setup_tc		= ieee80211_netdev_setup_tc,
 };
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
@@ -929,6 +939,7 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = {
 	.ndo_set_mac_address	= ieee80211_change_mac,
 	.ndo_get_stats64	= ieee80211_get_stats64,
 	.ndo_fill_forward_path	= ieee80211_netdev_fill_forward_path,
+	.ndo_setup_tc		= ieee80211_netdev_setup_tc,
 };
 
 static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 9f4377566c42..e0ccf5fe708a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2478,6 +2478,31 @@ DEFINE_EVENT(sta_event, drv_net_fill_forward_path,
 	TP_ARGS(local, sdata, sta)
 );
 
+TRACE_EVENT(drv_net_setup_tc,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 u8 type),
+
+	TP_ARGS(local, sdata, type),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u8, type)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->type = type;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT " type:%d\n",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
+	)
+);
+
 TRACE_EVENT(drv_change_vif_links,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From fe4a6d2db3bad41e9f22c860596f355af8493ebb Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 14 Mar 2023 10:59:56 +0100
Subject: wifi: mac80211: implement support for yet another mesh A-MSDU format

MT7996 hardware supports mesh A-MSDU subframes in hardware, but uses a
big-endian length field

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/20230314095956.62085-7-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  | 11 +++++++----
 net/mac80211/rx.c       | 22 ++++++++++++++++------
 net/mac80211/sta_info.h |  5 ++++-
 net/wireless/util.c     | 36 +++++++++++++++++++++++-------------
 4 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7cebba1c4135..86cb048dc924 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6274,10 +6274,13 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
  * mesh control field.
  *
  * @skb: The input A-MSDU frame without any headers.
- * @mesh_hdr: use standard compliant mesh A-MSDU subframe header
+ * @mesh_hdr: the type of mesh header to test
+ *	0: non-mesh A-MSDU length field
+ *	1: big-endian mesh A-MSDU length field
+ *	2: little-endian mesh A-MSDU length field
  * Returns: true if subframe header lengths are valid for the @mesh_hdr mode
  */
-bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr);
+bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr);
 
 /**
  * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame
@@ -6294,13 +6297,13 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr);
  * @extra_headroom: The hardware extra headroom for SKBs in the @list.
  * @check_da: DA to check in the inner ethernet header, or NULL
  * @check_sa: SA to check in the inner ethernet header, or NULL
- * @mesh_control: A-MSDU subframe header includes the mesh control field
+ * @mesh_control: see mesh_hdr in ieee80211_is_valid_amsdu
  */
 void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 			      const u8 *addr, enum nl80211_iftype iftype,
 			      const unsigned int extra_headroom,
 			      const u8 *check_da, const u8 *check_sa,
-			      bool mesh_control);
+			      u8 mesh_control);
 
 /**
  * ieee80211_get_8023_tunnel_proto - get RFC1042 or bridge tunnel encap protocol
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 85fb1d3eeb2f..1c957194554b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2983,13 +2983,23 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset)
 		return RX_DROP_UNUSABLE;
 
 	if (rx->sta && rx->sta->amsdu_mesh_control < 0) {
-		bool valid_std = ieee80211_is_valid_amsdu(skb, true);
-		bool valid_nonstd = ieee80211_is_valid_amsdu(skb, false);
+		s8 valid = -1;
+		int i;
+
+		for (i = 0; i <= 2; i++) {
+			if (!ieee80211_is_valid_amsdu(skb, i))
+				continue;
+
+			if (valid >= 0) {
+				/* ambiguous */
+				valid = -1;
+				break;
+			}
+
+			valid = i;
+		}
 
-		if (valid_std && !valid_nonstd)
-			rx->sta->amsdu_mesh_control = 1;
-		else if (valid_nonstd && !valid_std)
-			rx->sta->amsdu_mesh_control = 0;
+		rx->sta->amsdu_mesh_control = valid;
 	}
 
 	ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index e8e482a82d77..f354d470e174 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -623,7 +623,10 @@ struct link_sta_info {
  * @cparams: CoDel parameters for this station.
  * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
  * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer
- *	(-1: not yet known, 0: non-standard [without mesh header], 1: standard)
+ *	(-1: not yet known,
+ *	  0: non-mesh A-MSDU length field
+ *	  1: big-endian mesh A-MSDU length field
+ *	  2: little-endian mesh A-MSDU length field)
  * @fast_tx: TX fastpath information
  * @fast_rx: RX fastpath information
  * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d1a89e82ead0..3bc0c3072e78 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -776,7 +776,24 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
 	return frame;
 }
 
-bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr)
+static u16
+ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type)
+{
+	__le16 *field_le = field;
+	__be16 *field_be = field;
+	u16 len;
+
+	if (hdr_type >= 2)
+		len = le16_to_cpu(*field_le);
+	else
+		len = be16_to_cpu(*field_be);
+	if (hdr_type)
+		len += __ieee80211_get_mesh_hdrlen(mesh_flags);
+
+	return len;
+}
+
+bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr)
 {
 	int offset = 0, remaining, subframe_len, padding;
 
@@ -790,12 +807,8 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr)
 		if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0)
 			return false;
 
-		if (mesh_hdr)
-			len = le16_to_cpu(*(__le16 *)&hdr.len) +
-			      __ieee80211_get_mesh_hdrlen(hdr.mesh_flags);
-		else
-			len = ntohs(hdr.len);
-
+		len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags,
+						      mesh_hdr);
 		subframe_len = sizeof(struct ethhdr) + len;
 		padding = (4 - subframe_len) & 0x3;
 		remaining = skb->len - offset;
@@ -812,7 +825,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 			      const u8 *addr, enum nl80211_iftype iftype,
 			      const unsigned int extra_headroom,
 			      const u8 *check_da, const u8 *check_sa,
-			      bool mesh_control)
+			      u8 mesh_control)
 {
 	unsigned int hlen = ALIGN(extra_headroom, 4);
 	struct sk_buff *frame = NULL;
@@ -837,11 +850,8 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 		skb_copy_bits(skb, offset, &hdr, copy_len);
 		if (iftype == NL80211_IFTYPE_MESH_POINT)
 			mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags);
-		if (mesh_control)
-			len = le16_to_cpu(*(__le16 *)&hdr.eth.h_proto) + mesh_len;
-		else
-			len = ntohs(hdr.eth.h_proto);
-
+		len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags,
+						      mesh_control);
 		subframe_len = sizeof(struct ethhdr) + len;
 		padding = (4 - subframe_len) & 0x3;
 
-- 
cgit v1.2.3


From 8e40c3b6e1538401d2cf8d43087ebe1db8026af9 Mon Sep 17 00:00:00 2001
From: Manikanta Pubbisetty <quic_mpubbise@quicinc.com>
Date: Wed, 8 Mar 2023 16:15:56 +0530
Subject: wifi: nl80211: Update the documentation of
 NL80211_SCAN_FLAG_COLOCATED_6GHZ

Currently when NL80211_SCAN_FLAG_COLOCATED_6GHZ is set in the scan flags,
in addition to the co-located APs, PSC channels in the 6 GHz band would
also be scanned if the user space has asked for it. In other words, the
scan would happen on PSC channels & co-located 6 GHz channels that were
reported in the RNR IE.

Update the documentation of NL80211_SCAN_FLAG_COLOCATED_6GHZ flag to
reflect the above said behavior.

Signed-off-by: Manikanta Pubbisetty <quic_mpubbise@quicinc.com>
Link: https://lore.kernel.org/r/20230308104556.9399-1-quic_mpubbise@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 9a0ac0363f1f..14e958a32b84 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6544,7 +6544,9 @@ enum nl80211_timeout_reason {
  *	channels on which APs are expected to be found. Note that when not set,
  *	the scan logic would scan all 6GHz channels, but since transmission of
  *	probe requests on non PSC channels is limited, it is highly likely that
- *	these channels would passively be scanned.
+ *	these channels would passively be scanned. Also note that when the flag
+ *	is set, in addition to the colocated APs, PSC channels would also be
+ *	scanned if the user space has asked for it.
  */
 enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
-- 
cgit v1.2.3


From bd54f3c29077f23dad92ef82a78061b40be30c65 Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Mon, 5 Dec 2022 16:50:37 -0800
Subject: wifi: mac80211: generate EMA beacons in AP mode

Add APIs to generate an array of beacons for an EMA AP (enhanced
multiple BSSID advertisements), each including a single MBSSID element.
EMA profile periodicity equals the count of elements.

- ieee80211_beacon_get_template_ema_list() - Generate and return all
EMA beacon templates. Drivers must call ieee80211_beacon_free_ema_list()
to free the memory. No change in the prototype for the existing API,
ieee80211_beacon_get_template(), which should be used for non-EMA AP.

- ieee80211_beacon_get_template_ema_index() - Generate a beacon which
includes the multiple BSSID element at the given index. Drivers can use
this function in a loop until NULL is returned which indicates end of
available MBSSID elements.

- ieee80211_beacon_free_ema_list() - free the memory allocated for the
list of EMA beacon templates.

Modify existing functions ieee80211_beacon_get_ap(),
ieee80211_get_mbssid_beacon_len() and ieee80211_beacon_add_mbssid()
to accept a new parameter for EMA index.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Co-developed-by: John Crispin <john@phrozen.org>
Signed-off-by: John Crispin <john@phrozen.org>
Link: https://lore.kernel.org/r/20221206005040.3177-2-quic_alokad@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  68 +++++++++++++++++++++++
 net/mac80211/cfg.c         |  11 ++--
 net/mac80211/ieee80211_i.h |  10 +++-
 net/mac80211/tx.c          | 134 +++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 205 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fcfe3e9aff3d..679421d37a42 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5285,6 +5285,74 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_mutable_offsets *offs,
 			      unsigned int link_id);
 
+/**
+ * ieee80211_beacon_get_template_ema_index - EMA beacon template generation
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @offs: &struct ieee80211_mutable_offsets pointer to struct that will
+ *	receive the offsets that may be updated by the driver.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP).
+ * @ema_index: index of the beacon in the EMA set.
+ *
+ * This function follows the same rules as ieee80211_beacon_get_template()
+ * but returns a beacon template which includes multiple BSSID element at the
+ * requested index.
+ *
+ * Return: The beacon template. %NULL indicates the end of EMA templates.
+ */
+struct sk_buff *
+ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					struct ieee80211_mutable_offsets *offs,
+					unsigned int link_id, u8 ema_index);
+
+/**
+ * struct ieee80211_ema_beacons - List of EMA beacons
+ * @cnt: count of EMA beacons.
+ *
+ * @bcn: array of EMA beacons.
+ * @bcn.skb: the skb containing this specific beacon
+ * @bcn.offs: &struct ieee80211_mutable_offsets pointer to struct that will
+ *	receive the offsets that may be updated by the driver.
+ */
+struct ieee80211_ema_beacons {
+	u8 cnt;
+	struct {
+		struct sk_buff *skb;
+		struct ieee80211_mutable_offsets offs;
+	} bcn[];
+};
+
+/**
+ * ieee80211_beacon_get_template_ema_list - EMA beacon template generation
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP)
+ *
+ * This function follows the same rules as ieee80211_beacon_get_template()
+ * but allocates and returns a pointer to list of all beacon templates required
+ * to cover all profiles in the multiple BSSID set. Each template includes only
+ * one multiple BSSID element.
+ *
+ * Driver must call ieee80211_beacon_free_ema_list() to free the memory.
+ *
+ * Return: EMA beacon templates of type struct ieee80211_ema_beacons *.
+ *	%NULL on error.
+ */
+struct ieee80211_ema_beacons *
+ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       unsigned int link_id);
+
+/**
+ * ieee80211_beacon_free_ema_list - free an EMA beacon template list
+ * @ema_beacons: list of EMA beacons of type &struct ieee80211_ema_beacons pointers.
+ *
+ * This function will free a list previously acquired by calling
+ * ieee80211_beacon_get_template_ema_list()
+ */
+void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons);
+
 /**
  * ieee80211_beacon_get_tim - beacon generation function
  * @hw: pointer obtained from ieee80211_alloc_hw().
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 760ad934f9e1..db5fa334b801 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1122,11 +1122,11 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 	if (params->mbssid_ies) {
 		mbssid = params->mbssid_ies;
 		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
-		size += ieee80211_get_mbssid_beacon_len(mbssid);
+		size += ieee80211_get_mbssid_beacon_len(mbssid, mbssid->cnt);
 	} else if (old && old->mbssid_ies) {
 		mbssid = old->mbssid_ies;
 		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
-		size += ieee80211_get_mbssid_beacon_len(mbssid);
+		size += ieee80211_get_mbssid_beacon_len(mbssid, mbssid->cnt);
 	}
 
 	new = kzalloc(size, GFP_KERNEL);
@@ -3406,8 +3406,11 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
 
 	len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
 	      beacon->proberesp_ies_len + beacon->assocresp_ies_len +
-	      beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len +
-	      ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies);
+	      beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len;
+
+	if (beacon->mbssid_ies)
+		len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
+						       beacon->mbssid_ies->cnt);
 
 	new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
 	if (!new_beacon)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c8c037a9e4e2..84d10e993eca 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1186,13 +1186,17 @@ ieee80211_vif_get_shift(struct ieee80211_vif *vif)
 }
 
 static inline int
-ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems)
+ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, u8 i)
 {
-	int i, len = 0;
+	int len = 0;
 
-	if (!elems)
+	if (!elems || !elems->cnt || i > elems->cnt)
 		return 0;
 
+	if (i < elems->cnt)
+		return elems->elem[i].len;
+
+	/* i == elems->cnt, calculate total length of all MBSSID elements */
 	for (i = 0; i < elems->cnt; i++)
 		len += elems->elem[i].len;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index de17926484bd..139eec6c64da 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5212,13 +5212,20 @@ ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 }
 
 static void
-ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon)
+ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon,
+			    u8 i)
 {
-	int i;
+	if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt ||
+	    i > beacon->mbssid_ies->cnt)
+		return;
 
-	if (!beacon->mbssid_ies)
+	if (i < beacon->mbssid_ies->cnt) {
+		skb_put_data(skb, beacon->mbssid_ies->elem[i].data,
+			     beacon->mbssid_ies->elem[i].len);
 		return;
+	}
 
+	/* i == beacon->mbssid_ies->cnt, include all MBSSID elements */
 	for (i = 0; i < beacon->mbssid_ies->cnt; i++)
 		skb_put_data(skb, beacon->mbssid_ies->elem[i].data,
 			     beacon->mbssid_ies->elem[i].len);
@@ -5231,7 +5238,8 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 			struct ieee80211_mutable_offsets *offs,
 			bool is_template,
 			struct beacon_data *beacon,
-			struct ieee80211_chanctx_conf *chanctx_conf)
+			struct ieee80211_chanctx_conf *chanctx_conf,
+			u8 ema_index)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -5250,7 +5258,9 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	/* headroom, head length,
 	 * tail length, maximum TIM length and multiple BSSID length
 	 */
-	mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies);
+	mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
+						     ema_index);
+
 	skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
 			    beacon->tail_len + 256 +
 			    local->hw.extra_beacon_tailroom + mbssid_len);
@@ -5268,7 +5278,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 		offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0];
 
 		if (mbssid_len) {
-			ieee80211_beacon_add_mbssid(skb, beacon);
+			ieee80211_beacon_add_mbssid(skb, beacon, ema_index);
 			offs->mbssid_off = skb->len - mbssid_len;
 		}
 
@@ -5287,12 +5297,51 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	return skb;
 }
 
+static struct ieee80211_ema_beacons *
+ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw,
+				 struct ieee80211_vif *vif,
+				 struct ieee80211_link_data *link,
+				 struct ieee80211_mutable_offsets *offs,
+				 bool is_template, struct beacon_data *beacon,
+				 struct ieee80211_chanctx_conf *chanctx_conf)
+{
+	struct ieee80211_ema_beacons *ema = NULL;
+
+	if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt)
+		return NULL;
+
+	ema = kzalloc(struct_size(ema, bcn, beacon->mbssid_ies->cnt),
+		      GFP_ATOMIC);
+	if (!ema)
+		return NULL;
+
+	for (ema->cnt = 0; ema->cnt < beacon->mbssid_ies->cnt; ema->cnt++) {
+		ema->bcn[ema->cnt].skb =
+			ieee80211_beacon_get_ap(hw, vif, link,
+						&ema->bcn[ema->cnt].offs,
+						is_template, beacon,
+						chanctx_conf, ema->cnt);
+		if (!ema->bcn[ema->cnt].skb)
+			break;
+	}
+
+	if (ema->cnt == beacon->mbssid_ies->cnt)
+		return ema;
+
+	ieee80211_beacon_free_ema_list(ema);
+	return NULL;
+}
+
+#define IEEE80211_INCLUDE_ALL_MBSSID_ELEMS -1
+
 static struct sk_buff *
 __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		       struct ieee80211_vif *vif,
 		       struct ieee80211_mutable_offsets *offs,
 		       bool is_template,
-		       unsigned int link_id)
+		       unsigned int link_id,
+		       int ema_index,
+		       struct ieee80211_ema_beacons **ema_beacons)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct beacon_data *beacon = NULL;
@@ -5321,8 +5370,29 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		if (!beacon)
 			goto out;
 
-		skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template,
-					      beacon, chanctx_conf);
+		if (ema_beacons) {
+			*ema_beacons =
+				ieee80211_beacon_get_ap_ema_list(hw, vif, link,
+								 offs,
+								 is_template,
+								 beacon,
+								 chanctx_conf);
+		} else {
+			if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
+				if (ema_index >= beacon->mbssid_ies->cnt)
+					goto out; /* End of MBSSID elements */
+
+				if (ema_index <= IEEE80211_INCLUDE_ALL_MBSSID_ELEMS)
+					ema_index = beacon->mbssid_ies->cnt;
+			} else {
+				ema_index = 0;
+			}
+
+			skb = ieee80211_beacon_get_ap(hw, vif, link, offs,
+						      is_template, beacon,
+						      chanctx_conf,
+						      ema_index);
+		}
 	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 		struct ieee80211_hdr *hdr;
@@ -5410,10 +5480,50 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_mutable_offsets *offs,
 			      unsigned int link_id)
 {
-	return __ieee80211_beacon_get(hw, vif, offs, true, link_id);
+	return __ieee80211_beacon_get(hw, vif, offs, true, link_id,
+				      IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL);
 }
 EXPORT_SYMBOL(ieee80211_beacon_get_template);
 
+struct sk_buff *
+ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					struct ieee80211_mutable_offsets *offs,
+					unsigned int link_id, u8 ema_index)
+{
+	return __ieee80211_beacon_get(hw, vif, offs, true, link_id, ema_index,
+				      NULL);
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_index);
+
+void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons)
+{
+	u8 i;
+
+	if (!ema_beacons)
+		return;
+
+	for (i = 0; i < ema_beacons->cnt; i++)
+		kfree_skb(ema_beacons->bcn[i].skb);
+
+	kfree(ema_beacons);
+}
+EXPORT_SYMBOL(ieee80211_beacon_free_ema_list);
+
+struct ieee80211_ema_beacons *
+ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       unsigned int link_id)
+{
+	struct ieee80211_ema_beacons *ema_beacons = NULL;
+
+	WARN_ON(__ieee80211_beacon_get(hw, vif, NULL, false, link_id, 0,
+				       &ema_beacons));
+
+	return ema_beacons;
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_list);
+
 struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
 					 u16 *tim_offset, u16 *tim_length,
@@ -5421,7 +5531,9 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 {
 	struct ieee80211_mutable_offsets offs = {};
 	struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false,
-						     link_id);
+						     link_id,
+						     IEEE80211_INCLUDE_ALL_MBSSID_ELEMS,
+						     NULL);
 	struct sk_buff *copy;
 	int shift;
 
-- 
cgit v1.2.3


From e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Fri, 24 Feb 2023 08:50:00 -0800
Subject: livepatch,sched: Add livepatch task switching to cond_resched()

There have been reports [1][2] of live patches failing to complete
within a reasonable amount of time due to CPU-bound kthreads.

Fix it by patching tasks in cond_resched().

There are four different flavors of cond_resched(), depending on the
kernel configuration.  Hook into all of them.

A more elegant solution might be to use a preempt notifier.  However,
non-ORC unwinders can't unwind a preempted task reliably.

[1] https://lore.kernel.org/lkml/20220507174628.2086373-1-song@kernel.org/
[2] https://lkml.kernel.org/lkml/20230120-vhost-klp-switching-v1-0-7c2b65519c43@kernel.org

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Link: https://lore.kernel.org/r/4ae981466b7814ec221014fc2554b2f86f3fb70b.1677257135.git.jpoimboe@kernel.org
---
 include/linux/livepatch.h       |   1 +
 include/linux/livepatch_sched.h |  29 +++++++++++
 include/linux/sched.h           |  20 ++++++--
 kernel/livepatch/core.c         |   1 +
 kernel/livepatch/transition.c   | 107 ++++++++++++++++++++++++++++++++++------
 kernel/sched/core.c             |  64 +++++++++++++++++++++---
 6 files changed, 194 insertions(+), 28 deletions(-)
 create mode 100644 include/linux/livepatch_sched.h

(limited to 'include')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 293e29960c6e..9b9b38e89563 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
 #include <linux/ftrace.h>
 #include <linux/completion.h>
 #include <linux/list.h>
+#include <linux/livepatch_sched.h>
 
 #if IS_ENABLED(CONFIG_LIVEPATCH)
 
diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h
new file mode 100644
index 000000000000..013794fb5da0
--- /dev/null
+++ b/include/linux/livepatch_sched.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_LIVEPATCH_SCHED_H_
+#define _LINUX_LIVEPATCH_SCHED_H_
+
+#include <linux/jump_label.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_LIVEPATCH
+
+void __klp_sched_try_switch(void);
+
+#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+static __always_inline void klp_sched_try_switch(void)
+{
+	if (static_branch_unlikely(&klp_sched_try_switch_key))
+		__klp_sched_try_switch();
+}
+
+#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+#else /* !CONFIG_LIVEPATCH */
+static inline void klp_sched_try_switch(void) {}
+static inline void __klp_sched_try_switch(void) {}
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..6d654eb4cabd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
+#include <linux/livepatch_sched.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -2070,6 +2071,9 @@ extern int __cond_resched(void);
 
 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
 DECLARE_STATIC_CALL(cond_resched, __cond_resched);
 
 static __always_inline int _cond_resched(void)
@@ -2078,6 +2082,7 @@ static __always_inline int _cond_resched(void)
 }
 
 #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
 extern int dynamic_cond_resched(void);
 
 static __always_inline int _cond_resched(void)
@@ -2085,20 +2090,25 @@ static __always_inline int _cond_resched(void)
 	return dynamic_cond_resched();
 }
 
-#else
+#else /* !CONFIG_PREEMPTION */
 
 static inline int _cond_resched(void)
 {
+	klp_sched_try_switch();
 	return __cond_resched();
 }
 
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
 
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
 
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+	klp_sched_try_switch();
+	return 0;
+}
 
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
 
 #define cond_resched() ({			\
 	__might_resched(__FILE__, __LINE__, 0);	\
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..eea7c8ec6e05 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
  *
  * - klp_ftrace_handler()
  * - klp_update_patch_state()
+ * - __klp_sched_try_switch()
  */
 DEFINE_MUTEX(klp_mutex);
 
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 824e2e3f07dd..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,6 +9,7 @@
 
 #include <linux/cpu.h>
 #include <linux/stacktrace.h>
+#include <linux/static_call.h>
 #include "core.h"
 #include "patch.h"
 #include "transition.h"
@@ -26,6 +27,25 @@ static int klp_target_state = KLP_UNDEFINED;
 
 static unsigned int klp_signals_cnt;
 
+/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched().  This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
 /*
  * This work can be performed periodically to finish patching or unpatching any
  * "straggler" tasks which failed to transition in the first attempt.
@@ -174,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
 	 * barrier (smp_rmb) for two cases:
 	 *
 	 * 1) Enforce the order of the TIF_PATCH_PENDING read and the
-	 *    klp_target_state read.  The corresponding write barrier is in
-	 *    klp_init_transition().
+	 *    klp_target_state read.  The corresponding write barriers are in
+	 *    klp_init_transition() and klp_reverse_transition().
 	 *
 	 * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
 	 *    of func->transition, if klp_ftrace_handler() is called later on
@@ -343,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
 	return !ret;
 }
 
+void __klp_sched_try_switch(void)
+{
+	if (likely(!klp_patch_pending(current)))
+		return;
+
+	/*
+	 * This function is called from cond_resched() which is called in many
+	 * places throughout the kernel.  Using the klp_mutex here might
+	 * deadlock.
+	 *
+	 * Instead, disable preemption to prevent racing with other callers of
+	 * klp_try_switch_task().  Thanks to task_call_func() they won't be
+	 * able to switch this task while it's running.
+	 */
+	preempt_disable();
+
+	/*
+	 * Make sure current didn't get patched between the above check and
+	 * preempt_disable().
+	 */
+	if (unlikely(!klp_patch_pending(current)))
+		goto out;
+
+	/*
+	 * Enforce the order of the TIF_PATCH_PENDING read above and the
+	 * klp_target_state read in klp_try_switch_task().  The corresponding
+	 * write barriers are in klp_init_transition() and
+	 * klp_reverse_transition().
+	 */
+	smp_rmb();
+
+	klp_try_switch_task(current);
+
+out:
+	preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
 /*
  * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
  * Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -449,7 +507,8 @@ void klp_try_complete_transition(void)
 		return;
 	}
 
-	/* we're done, now cleanup the data structures */
+	/* Done!  Now cleanup the data structures. */
+	klp_cond_resched_disable();
 	patch = klp_transition_patch;
 	klp_complete_transition();
 
@@ -501,6 +560,8 @@ void klp_start_transition(void)
 			set_tsk_thread_flag(task, TIF_PATCH_PENDING);
 	}
 
+	klp_cond_resched_enable();
+
 	klp_signals_cnt = 0;
 }
 
@@ -556,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 * see a func in transition with a task->patch_state of KLP_UNDEFINED.
 	 *
 	 * Also enforce the order of the klp_target_state write and future
-	 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
-	 * set a task->patch_state to KLP_UNDEFINED.
+	 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+	 * __klp_sched_try_switch() don't set a task->patch_state to
+	 * KLP_UNDEFINED.
 	 */
 	smp_wmb();
 
@@ -593,14 +655,10 @@ void klp_reverse_transition(void)
 		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
 						   "unpatching to patching");
 
-	klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
-	klp_target_state = !klp_target_state;
-
 	/*
 	 * Clear all TIF_PATCH_PENDING flags to prevent races caused by
-	 * klp_update_patch_state() running in parallel with
-	 * klp_start_transition().
+	 * klp_update_patch_state() or __klp_sched_try_switch() running in
+	 * parallel with the reverse transition.
 	 */
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, task)
@@ -610,9 +668,28 @@ void klp_reverse_transition(void)
 	for_each_possible_cpu(cpu)
 		clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
 
-	/* Let any remaining calls to klp_update_patch_state() complete */
+	/*
+	 * Make sure all existing invocations of klp_update_patch_state() and
+	 * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+	 * starting the reverse transition.
+	 */
 	klp_synchronize_transition();
 
+	/*
+	 * All patching has stopped, now re-initialize the global variables to
+	 * prepare for the reverse transition.
+	 */
+	klp_transition_patch->enabled = !klp_transition_patch->enabled;
+	klp_target_state = !klp_target_state;
+
+	/*
+	 * Enforce the order of the klp_target_state write and the
+	 * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+	 * klp_update_patch_state() and __klp_sched_try_switch() don't set
+	 * task->patch_state to the wrong value.
+	 */
+	smp_wmb();
+
 	klp_start_transition();
 }
 
@@ -626,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
 	 * the task flag up to date with the parent here.
 	 *
 	 * The operation is serialized against all klp_*_transition()
-	 * operations by the tasklist_lock. The only exception is
-	 * klp_update_patch_state(current), but we cannot race with
-	 * that because we are current.
+	 * operations by the tasklist_lock. The only exceptions are
+	 * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+	 * cannot race with them because we are current.
 	 */
 	if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
 		set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ddd9610be56..b9616f153946 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8525,6 +8525,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
 int __sched dynamic_cond_resched(void)
 {
+	klp_sched_try_switch();
 	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
 		return 0;
 	return __cond_resched();
@@ -8673,13 +8674,17 @@ int sched_dynamic_mode(const char *str)
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
 
-void sched_dynamic_update(int mode)
+DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
 {
 	/*
 	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
 	 * the ZERO state, which is invalid.
 	 */
-	preempt_dynamic_enable(cond_resched);
+	if (!klp_override)
+		preempt_dynamic_enable(cond_resched);
 	preempt_dynamic_enable(might_resched);
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8687,36 +8692,79 @@ void sched_dynamic_update(int mode)
 
 	switch (mode) {
 	case preempt_dynamic_none:
-		preempt_dynamic_enable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_enable(cond_resched);
 		preempt_dynamic_disable(might_resched);
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: none\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: none\n");
 		break;
 
 	case preempt_dynamic_voluntary:
-		preempt_dynamic_enable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_enable(cond_resched);
 		preempt_dynamic_enable(might_resched);
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: voluntary\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: voluntary\n");
 		break;
 
 	case preempt_dynamic_full:
-		preempt_dynamic_disable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
 		preempt_dynamic_disable(might_resched);
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: full\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: full\n");
 		break;
 	}
 
 	preempt_dynamic_mode = mode;
 }
 
+void sched_dynamic_update(int mode)
+{
+	mutex_lock(&sched_dynamic_mutex);
+	__sched_dynamic_update(mode);
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+	__klp_sched_try_switch();
+	return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+	mutex_lock(&sched_dynamic_mutex);
+
+	klp_override = true;
+	static_call_update(cond_resched, klp_cond_resched);
+
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+	mutex_lock(&sched_dynamic_mutex);
+
+	klp_override = false;
+	__sched_dynamic_update(preempt_dynamic_mode);
+
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
 static int __init setup_preempt_mode(char *str)
 {
 	int mode = sched_dynamic_mode(str);
-- 
cgit v1.2.3


From 6dddd93938b3651cfeba7158ac179b4e6d3c1553 Mon Sep 17 00:00:00 2001
From: Yonatan Nachum <ynachum@amazon.com>
Date: Sun, 19 Feb 2023 08:13:28 +0000
Subject: RDMA/efa: Add data polling capability feature bit

Add feature bit to existing device caps field. EFA supports data polling
of 128 bytes blocks.

The flag indicates that the NIC guarentees that a 128 byte aligned block
is written in order, ie that observing the last 8 bits of the block mean
the prior 127 bytes are also written.

It is useful for "last data polling" acceleration techniques.

Link: https://lore.kernel.org/r/20230219081328.10419-1-mrgolin@amazon.com
Reviewed-by: Yehuda Yitschak <yehuday@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Yonatan Nachum <ynachum@amazon.com>
Signed-off-by: Michael Margolin <mrgolin@amazon.com>
Acked-by: Gal Pressman <gal.pressman@linux.dev>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 7 +++++--
 drivers/infiniband/hw/efa/efa_verbs.c           | 5 ++++-
 include/uapi/rdma/efa-abi.h                     | 3 ++-
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index d4b9226088bd..3db791e6c030 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_ADMIN_CMDS_H_
@@ -618,7 +618,9 @@ struct efa_admin_feature_device_attr_desc {
 	 *    TX queues
 	 * 1 : rnr_retry - If set, RNR retry is supported on
 	 *    modify QP command
-	 * 31:2 : reserved - MBZ
+	 * 2 : data_polling_128 - If set, 128 bytes data
+	 *    polling is supported
+	 * 31:3 : reserved - MBZ
 	 */
 	u32 device_caps;
 
@@ -991,6 +993,7 @@ struct efa_admin_host_info {
 /* feature_device_attr_desc */
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK   BIT(0)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK   BIT(1)
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2)
 
 /* create_eq_cmd */
 #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK       GENMASK(4, 0)
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 31454643f8c5..c20136e9d3d1 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include <linux/dma-buf.h>
@@ -250,6 +250,9 @@ int efa_query_device(struct ib_device *ibdev,
 		if (EFA_DEV_CAP(dev, RNR_RETRY))
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY;
 
+		if (EFA_DEV_CAP(dev, DATA_POLLING_128))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128;
+
 		if (dev->neqs)
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
 
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 163ac79556d6..74406b4817ce 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
 /*
- * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef EFA_ABI_USER_H
@@ -120,6 +120,7 @@ enum {
 	EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1,
 	EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
 	EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
+	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
 };
 
 struct efa_ibv_ex_query_device_resp {
-- 
cgit v1.2.3


From 073828e954459b883f23e53999d31e4c55ab9654 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Wed, 22 Mar 2023 12:13:29 +0100
Subject: ACPI: processor: Fix evaluating _PDC method when running as Xen dom0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In ACPI systems, the OS can direct power management, as opposed to the
firmware.  This OS-directed Power Management is called OSPM.  Part of
telling the firmware that the OS going to direct power management is
making ACPI "_PDC" (Processor Driver Capabilities) calls.  These _PDC
methods must be evaluated for every processor object.  If these _PDC
calls are not completed for every processor it can lead to
inconsistency and later failures in things like the CPU frequency
driver.

In a Xen system, the dom0 kernel is responsible for system-wide power
management.  The dom0 kernel is in charge of OSPM.  However, the
number of CPUs available to dom0 can be different than the number of
CPUs physically present on the system.

This leads to a problem: the dom0 kernel needs to evaluate _PDC for
all the processors, but it can't always see them.

In dom0 kernels, ignore the existing ACPI method for determining if a
processor is physically present because it might not be accurate.
Instead, ask the hypervisor for this information.

Fix this by introducing a custom function to use when running as Xen
dom0 in order to check whether a processor object matches a CPU that's
online.  Such checking is done using the existing information fetched
by the Xen pCPU subsystem, extending it to also store the ACPI ID.

This ensures that _PDC method gets evaluated for all physically online
CPUs, regardless of the number of CPUs made available to dom0.

Fixes: 5d554a7bb064 ("ACPI: processor: add internal processor_physically_present()")
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_pdc.c | 11 +++++++++++
 drivers/xen/pcpu.c           | 20 ++++++++++++++++++++
 include/xen/xen.h            | 11 +++++++++++
 3 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/processor_pdc.c b/drivers/acpi/processor_pdc.c
index 8c3f82c9fff3..18fb04523f93 100644
--- a/drivers/acpi/processor_pdc.c
+++ b/drivers/acpi/processor_pdc.c
@@ -14,6 +14,8 @@
 #include <linux/acpi.h>
 #include <acpi/processor.h>
 
+#include <xen/xen.h>
+
 #include "internal.h"
 
 static bool __init processor_physically_present(acpi_handle handle)
@@ -47,6 +49,15 @@ static bool __init processor_physically_present(acpi_handle handle)
 		return false;
 	}
 
+	if (xen_initial_domain())
+		/*
+		 * When running as a Xen dom0 the number of processors Linux
+		 * sees can be different from the real number of processors on
+		 * the system, and we still need to execute _PDC for all of
+		 * them.
+		 */
+		return xen_processor_present(acpi_id);
+
 	type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
 	cpuid = acpi_get_cpuid(handle, type, acpi_id);
 
diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
index fd3a644b0855..b3e3d1bb37f3 100644
--- a/drivers/xen/pcpu.c
+++ b/drivers/xen/pcpu.c
@@ -58,6 +58,7 @@ struct pcpu {
 	struct list_head list;
 	struct device dev;
 	uint32_t cpu_id;
+	uint32_t acpi_id;
 	uint32_t flags;
 };
 
@@ -249,6 +250,7 @@ static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
 
 	INIT_LIST_HEAD(&pcpu->list);
 	pcpu->cpu_id = info->xen_cpuid;
+	pcpu->acpi_id = info->acpi_id;
 	pcpu->flags = info->flags;
 
 	/* Need hold on xen_pcpu_lock before pcpu list manipulations */
@@ -381,3 +383,21 @@ err1:
 	return ret;
 }
 arch_initcall(xen_pcpu_init);
+
+#ifdef CONFIG_ACPI
+bool __init xen_processor_present(uint32_t acpi_id)
+{
+	const struct pcpu *pcpu;
+	bool online = false;
+
+	mutex_lock(&xen_pcpu_lock);
+	list_for_each_entry(pcpu, &xen_pcpus, list)
+		if (pcpu->acpi_id == acpi_id) {
+			online = pcpu->flags & XEN_PCPU_FLAGS_ONLINE;
+			break;
+		}
+	mutex_unlock(&xen_pcpu_lock);
+
+	return online;
+}
+#endif
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 7adf59837c25..0efeb652f9b8 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -71,4 +71,15 @@ static inline void xen_free_unpopulated_pages(unsigned int nr_pages,
 }
 #endif
 
+#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) && defined(CONFIG_X86)
+bool __init xen_processor_present(uint32_t acpi_id);
+#else
+#include <linux/bug.h>
+static inline bool xen_processor_present(uint32_t acpi_id)
+{
+	BUG();
+	return false;
+}
+#endif
+
 #endif	/* _XEN_XEN_H */
-- 
cgit v1.2.3


From 6f56ad1b92328997e1b1792047099df6f8d7acb5 Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Wed, 15 Mar 2023 21:48:01 +0000
Subject: netfilter: nft_redir: use `struct nf_nat_range2` throughout and
 deduplicate eval call-backs

`nf_nat_redirect_ipv4` takes a `struct nf_nat_ipv4_multi_range_compat`,
but converts it internally to a `struct nf_nat_range2`.  Change the
function to take the latter, factor out the code now shared with
`nf_nat_redirect_ipv6`, move the conversion to the xt_REDIRECT module,
and update the ipv4 range initialization in the nft_redir module.

Replace a bare hex constant for 127.0.0.1 with a macro.

Remove `WARN_ON`.  `nf_nat_setup_info` calls `nf_ct_is_confirmed`:

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

This means that `ct` cannot be null or the kernel will crash, and
implies that `ctinfo` is `IP_CT_NEW` or `IP_CT_RELATED`.

nft_redir has separate ipv4 and ipv6 call-backs which share much of
their code, and an inet one switch containing a switch that calls one of
the others based on the family of the packet.  Merge the ipv4 and ipv6
ones into the inet one in order to get rid of the duplicate code.

Const-qualify the `priv` pointer since we don't need to write through
it.

Assign `priv->flags` to the range instead of OR-ing it in.

Set the `NF_NAT_RANGE_PROTO_SPECIFIED` flag once during init, rather
than on every eval.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_nat_redirect.h |  3 +-
 net/netfilter/nf_nat_redirect.c         | 71 +++++++++++++---------------
 net/netfilter/nft_redir.c               | 84 ++++++++++++---------------------
 net/netfilter/xt_REDIRECT.c             | 10 +++-
 4 files changed, 72 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h
index 2418653a66db..279380de904c 100644
--- a/include/net/netfilter/nf_nat_redirect.h
+++ b/include/net/netfilter/nf_nat_redirect.h
@@ -6,8 +6,7 @@
 #include <uapi/linux/netfilter/nf_nat.h>
 
 unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
-		     const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum);
 unsigned int
 nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index f91579c821e9..6616ba5d0b04 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -10,6 +10,7 @@
 
 #include <linux/if.h>
 #include <linux/inetdevice.h>
+#include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
@@ -24,54 +25,56 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_redirect.h>
 
+static unsigned int
+nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
+		const union nf_inet_addr *newdst)
+{
+	struct nf_nat_range2 newrange;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+	memset(&newrange, 0, sizeof(newrange));
+
+	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS;
+	newrange.min_addr	= *newdst;
+	newrange.max_addr	= *newdst;
+	newrange.min_proto	= range->min_proto;
+	newrange.max_proto	= range->max_proto;
+
+	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
 unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
-		     const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	__be32 newdst;
-	struct nf_nat_range2 newrange;
+	union nf_inet_addr newdst = {};
 
 	WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
 		hooknum != NF_INET_LOCAL_OUT);
 
-	ct = nf_ct_get(skb, &ctinfo);
-	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
-
 	/* Local packets: make them go to loopback */
 	if (hooknum == NF_INET_LOCAL_OUT) {
-		newdst = htonl(0x7F000001);
+		newdst.ip = htonl(INADDR_LOOPBACK);
 	} else {
 		const struct in_device *indev;
 
-		newdst = 0;
-
 		indev = __in_dev_get_rcu(skb->dev);
 		if (indev) {
 			const struct in_ifaddr *ifa;
 
 			ifa = rcu_dereference(indev->ifa_list);
 			if (ifa)
-				newdst = ifa->ifa_local;
+				newdst.ip = ifa->ifa_local;
 		}
 
-		if (!newdst)
+		if (!newdst.ip)
 			return NF_DROP;
 	}
 
-	/* Transfer from original range. */
-	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
-	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
-	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.ip = newdst;
-	newrange.max_addr.ip = newdst;
-	newrange.min_proto   = mr->range[0].min;
-	newrange.max_proto   = mr->range[0].max;
-
-	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+	return nf_nat_redirect(skb, range, &newdst);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
 
@@ -81,14 +84,10 @@ unsigned int
 nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_nat_range2 newrange;
-	struct in6_addr newdst;
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
+	union nf_inet_addr newdst = {};
 
-	ct = nf_ct_get(skb, &ctinfo);
 	if (hooknum == NF_INET_LOCAL_OUT) {
-		newdst = loopback_addr;
+		newdst.in6 = loopback_addr;
 	} else {
 		struct inet6_dev *idev;
 		struct inet6_ifaddr *ifa;
@@ -98,7 +97,7 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		if (idev != NULL) {
 			read_lock_bh(&idev->lock);
 			list_for_each_entry(ifa, &idev->addr_list, if_list) {
-				newdst = ifa->addr;
+				newdst.in6 = ifa->addr;
 				addr = true;
 				break;
 			}
@@ -109,12 +108,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 			return NF_DROP;
 	}
 
-	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.in6	= newdst;
-	newrange.max_addr.in6	= newdst;
-	newrange.min_proto	= range->min_proto;
-	newrange.max_proto	= range->max_proto;
-
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+	return nf_nat_redirect(skb, range, &newdst);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 67cec56bc84a..a70196ffcb1e 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -64,6 +64,8 @@ static int nft_redir_init(const struct nft_ctx *ctx,
 		} else {
 			priv->sreg_proto_max = priv->sreg_proto_min;
 		}
+
+		priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
 	}
 
 	if (tb[NFTA_REDIR_FLAGS]) {
@@ -99,25 +101,37 @@ nla_put_failure:
 	return -1;
 }
 
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
+static void nft_redir_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
 {
-	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_ipv4_multi_range_compat mr;
+	const struct nft_redir *priv = nft_expr_priv(expr);
+	struct nf_nat_range2 range;
 
-	memset(&mr, 0, sizeof(mr));
+	memset(&range, 0, sizeof(range));
+	range.flags = priv->flags;
 	if (priv->sreg_proto_min) {
-		mr.range[0].min.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_min]);
-		mr.range[0].max.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_max]);
-		mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto.all = (__force __be16)
+			nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+		range.max_proto.all = (__force __be16)
+			nft_reg_load16(&regs->data[priv->sreg_proto_max]);
 	}
 
-	mr.range[0].flags |= priv->flags;
-
-	regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
+							  nft_hook(pkt));
+		break;
+#ifdef CONFIG_NF_TABLES_IPV6
+	case NFPROTO_IPV6:
+		regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+							  nft_hook(pkt));
+		break;
+#endif
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
 }
 
 static void
@@ -130,7 +144,7 @@ static struct nft_expr_type nft_redir_ipv4_type;
 static const struct nft_expr_ops nft_redir_ipv4_ops = {
 	.type		= &nft_redir_ipv4_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_ipv4_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_ipv4_destroy,
 	.dump		= nft_redir_dump,
@@ -148,28 +162,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
 };
 
 #ifdef CONFIG_NF_TABLES_IPV6
-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
-{
-	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_range2 range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_min]);
-		range.max_proto.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_max]);
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	range.flags |= priv->flags;
-
-	regs->verdict.code =
-		nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
-}
-
 static void
 nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@@ -180,7 +172,7 @@ static struct nft_expr_type nft_redir_ipv6_type;
 static const struct nft_expr_ops nft_redir_ipv6_ops = {
 	.type		= &nft_redir_ipv6_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_ipv6_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_ipv6_destroy,
 	.dump		= nft_redir_dump,
@@ -199,20 +191,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
 #endif
 
 #ifdef CONFIG_NF_TABLES_INET
-static void nft_redir_inet_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
-{
-	switch (nft_pf(pkt)) {
-	case NFPROTO_IPV4:
-		return nft_redir_ipv4_eval(expr, regs, pkt);
-	case NFPROTO_IPV6:
-		return nft_redir_ipv6_eval(expr, regs, pkt);
-	}
-
-	WARN_ON_ONCE(1);
-}
-
 static void
 nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@@ -223,7 +201,7 @@ static struct nft_expr_type nft_redir_inet_type;
 static const struct nft_expr_ops nft_redir_inet_ops = {
 	.type		= &nft_redir_inet_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_inet_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_inet_destroy,
 	.dump		= nft_redir_dump,
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 353ca7801251..ff66b56a3f97 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-/* FIXME: Take multiple ranges --RR */
 static int redirect_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
 static unsigned int
 redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
+	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+	struct nf_nat_range2 range = {
+		.flags       = mr->range[0].flags,
+		.min_proto   = mr->range[0].min,
+		.max_proto   = mr->range[0].max,
+	};
+
+	return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
 }
 
 static struct xt_target redirect_tg_reg[] __read_mostly = {
-- 
cgit v1.2.3


From d18a04157fc171fd48075e3dc96471bd3b87f0dd Mon Sep 17 00:00:00 2001
From: Douglas Raillard <douglas.raillard@arm.com>
Date: Mon, 6 Mar 2023 12:27:43 +0000
Subject: rcu: Fix rcu_torture_read ftrace event

Fix the rcutorturename field so that its size is correctly reported in
the text format embedded in trace.dat files. As it stands, it is
reported as being of size 1:

    field:char rcutorturename[8];   offset:8;       size:1; signed:0;

Signed-off-by: Douglas Raillard <douglas.raillard@arm.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Cc: stable@vger.kernel.org
Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()")
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
[ boqun: Add "Cc" and "Fixes" tags per Steven ]
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/trace/events/rcu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 90b2fb0292cb..012fa0d171b2 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -768,7 +768,7 @@ TRACE_EVENT_RCU(rcu_torture_read,
 	TP_ARGS(rcutorturename, rhp, secs, c_old, c),
 
 	TP_STRUCT__entry(
-		__field(char, rcutorturename[RCUTORTURENAME_LEN])
+		__array(char, rcutorturename, RCUTORTURENAME_LEN)
 		__field(struct rcu_head *, rhp)
 		__field(unsigned long, secs)
 		__field(unsigned long, c_old)
-- 
cgit v1.2.3


From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Wed, 22 Mar 2023 12:47:54 -0700
Subject: bpf: return long from bpf_map_ops funcs

This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.

The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value a 64-bit
negative value.

For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:

; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
  ...
  46:	call   0xffffffffe103291c	; htab_map_update_elem
; if (err && err != -EEXIST) {
  4b:	cmp    $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax

kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:

movl $0xffffffef, %r9d
...
movl %r9d, %eax

...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef

Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 14 +++++++-------
 include/linux/filter.h         |  6 +++---
 kernel/bpf/arraymap.c          | 12 ++++++------
 kernel/bpf/bloom_filter.c      | 12 ++++++------
 kernel/bpf/bpf_cgrp_storage.c  |  6 +++---
 kernel/bpf/bpf_inode_storage.c |  6 +++---
 kernel/bpf/bpf_struct_ops.c    |  6 +++---
 kernel/bpf/bpf_task_storage.c  |  6 +++---
 kernel/bpf/cpumap.c            |  8 ++++----
 kernel/bpf/devmap.c            | 24 ++++++++++++------------
 kernel/bpf/hashtab.c           | 36 ++++++++++++++++++------------------
 kernel/bpf/local_storage.c     |  6 +++---
 kernel/bpf/lpm_trie.c          |  6 +++---
 kernel/bpf/queue_stack_maps.c  | 22 +++++++++++-----------
 kernel/bpf/reuseport_array.c   |  2 +-
 kernel/bpf/ringbuf.c           |  6 +++---
 kernel/bpf/stackmap.c          |  6 +++---
 kernel/bpf/verifier.c          | 14 +++++++-------
 net/core/bpf_sk_storage.c      |  6 +++---
 net/core/sock_map.c            |  8 ++++----
 net/xdp/xskmap.c               |  8 ++++----
 21 files changed, 110 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ef98fb92987..ec0df059f562 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,11 +96,11 @@ struct bpf_map_ops {
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
-	int (*map_delete_elem)(struct bpf_map *map, void *key);
-	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
-	int (*map_pop_elem)(struct bpf_map *map, void *value);
-	int (*map_peek_elem)(struct bpf_map *map, void *value);
+	long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	long (*map_delete_elem)(struct bpf_map *map, void *key);
+	long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	long (*map_pop_elem)(struct bpf_map *map, void *value);
+	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
 
 	/* funcs called by prog_array and perf_event_array map */
@@ -139,7 +139,7 @@ struct bpf_map_ops {
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 
 	/* Misc helpers.*/
-	int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
+	long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
 
 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
@@ -157,7 +157,7 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map,
+	long (*map_for_each_callback)(struct bpf_map *map,
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index efa5d4a1677e..23c08c31bea9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
-						  u64 flags, const u64 flag_mask,
-						  void *lookup_elem(struct bpf_map *map, u32 key))
+static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
+						   u64 flags, const u64 flag_mask,
+						   void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1588c793a715..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),
 };
 
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				   void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				    void *callback_ctx, u64 flags)
 {
 	u32 i, key, num_elems = 0;
 	struct bpf_array *array;
@@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	return 0;
 }
 
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *old_ptr;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 6350c5d35a9b..db19784601a7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -41,7 +41,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
 	return h & bloom->bitset_mask;
 }
 
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +56,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
 	return 0;
 }
 
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +73,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 	return 0;
 }
 
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
 
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
@@ -177,8 +177,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-EINVAL);
 }
 
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
-				 void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+				  void *value, u64 flags)
 {
 	/* The eBPF program should use map_push_elem instead */
 	return -EINVAL;
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c975cacdd16b..f5b016a5484d 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -93,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct cgroup *cgroup;
@@ -125,7 +125,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct cgroup *cgroup;
 	int err, fd;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index ad2ab0187e45..9a5f05151898 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -91,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct file *f;
@@ -127,7 +127,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct file *f;
 	int fd, err;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 38903fb52f98..ba7a94276e3b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -349,8 +349,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 					   model, flags, tlinks, NULL);
 }
 
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+					   void *value, u64 flags)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -524,7 +524,7 @@ unlock:
 	return err;
 }
 
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 {
 	enum bpf_struct_ops_state prev_state;
 	struct bpf_struct_ops_map *st_map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index c88cc04c17c1..ab5bd1ef58c4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -120,8 +120,8 @@ out:
 	return ERR_PTR(err);
 }
 
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
-					    void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct task_struct *task;
@@ -173,7 +173,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	return 0;
 }
 
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct task_struct *task;
 	unsigned int f_flags;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 871809e71b4e..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpumap_val cpumap_value = {};
@@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __cpu_map_lookup_elem);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 19b036a228f7..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 	kfree(dev);
 }
 
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -897,8 +897,8 @@ err_out:
 	return ERR_PTR(-EINVAL);
 }
 
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
-				 void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				  void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	return __dev_map_update_elem(current->nsproxy->net_ns,
 				     map, key, value, map_flags);
 }
 
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
-				     void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				       void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -999,21 +999,21 @@ out_err:
 	return err;
 }
 
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
 					 map, key, value, map_flags);
 }
 
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
 				      __dev_map_lookup_elem);
 }
 
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0df4b0c10f59..96b645bba3a4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1073,8 +1073,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1175,8 +1175,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
-				    u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
@@ -1242,9 +1242,9 @@ err:
 	return ret;
 }
 
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags,
-					 bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags,
+					  bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1297,9 +1297,9 @@ err:
 	return ret;
 }
 
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					     void *value, u64 map_flags,
-					     bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					      void *value, u64 map_flags,
+					      bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1364,21 +1364,21 @@ err:
 	return ret;
 }
 
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 map_flags)
 {
 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
 }
 
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					   void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					    void *value, u64 map_flags)
 {
 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
 						 false);
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -1414,7 +1414,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -2134,8 +2134,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
 };
 
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				  void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				   void *callback_ctx, u64 flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a993560f200a..4c7bbec4a9e4 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
 	return &READ_ONCE(storage->buf)->data[0];
 }
 
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
 	bpf_map_area_free(map);
 }
 
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index dc23f2ac9cde..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
-			    void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 63ecbbcb349d..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
 	bpf_map_area_free(qs);
 }
 
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -124,7 +124,7 @@ out:
 }
 
 
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -156,32 +156,32 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
-				     u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				      u64 flags)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 71cb72f5b733..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
 {
 	struct reuseport_array *array = reuseport_array(map);
 	u32 index = *(u32 *)key;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0d2a45ff83f1..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 flags)
 {
 	return -ENOTSUPP;
 }
 
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -ENOTSUPP;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 0f1d8dced933..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
 	return 0;
 }
 
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *old_bucket;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5693e4a92752..50c995697f0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17692,21 +17692,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
 				     (void *(*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
-				     (int (*)(struct bpf_map *map, void *key))NULL));
+				     (long (*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
-				     (int (*)(struct bpf_map *map, void *key, void *value,
+				     (long (*)(struct bpf_map *map, void *key, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_push_elem,
-				     (int (*)(struct bpf_map *map, void *value,
+				     (long (*)(struct bpf_map *map, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_redirect,
-				     (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+				     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
-				     (int (*)(struct bpf_map *map,
+				     (long (*)(struct bpf_map *map,
 					      bpf_callback_t callback_fn,
 					      void *callback_ctx,
 					      u64 flags))NULL));
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 24c3dc0d62e5..cb0f5a105b89 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -94,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(err);
 }
 
-static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct socket *sock;
@@ -114,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	return err;
 }
 
-static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct socket *sock;
 	int fd, err;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9b854e236d23..7c189c2e2fbf 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
 	__sock_map_delete(stab, sk, link_raw);
 }
 
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	u32 i = *(u32 *)key;
@@ -587,8 +587,8 @@ out:
 	return ret;
 }
 
-static int sock_map_update_elem(struct bpf_map *map, void *key,
-				void *value, u64 flags)
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+				 void *value, u64 flags)
 {
 	struct sock *sk = (struct sock *)value;
 	int ret;
@@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
 	raw_spin_unlock_bh(&bucket->lock);
 }
 
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
 	u32 hash, key_size = map->key_size;
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 0c38d7175922..2c1427074a3b 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -223,7 +223,7 @@ out:
 	return err;
 }
 
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __xsk_map_lookup_elem);
-- 
cgit v1.2.3


From e02c625de580a1bc8166c5f95f23a50fa59ee1bf Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:23 +0100
Subject: ata: pata_parport: Remove pi_swab16 and pi_swab32

Convert comm and kbic drivers to use standard swab16.
Remove pi_swab16 and pi_swab32.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/comm.c |  7 +++++--
 drivers/ata/pata_parport/kbic.c |  7 +++++--
 include/linux/pata_parport.h    | 17 -----------------
 3 files changed, 10 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 1775e7ed9336..11ed9fb57744 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -165,11 +165,14 @@ static void comm_write_block( PIA *pi, char * buf, int count )
                 break;
 
         case 3: w3(0x48); (void)r1();
-                for (k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+		for (k = 0; k < count / 2; k++)
+			w4w(swab16(((u16 *)buf)[k]));
                 break;
 
         case 4: w3(0x48); (void)r1();
-                for (k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+		for (k = 0; k < count / 4; k++)
+			w4l(swab16(((u16 *)buf)[2 * k]) |
+			    swab16(((u16 *)buf)[2 * k + 1]) << 16);
                 break;
 
 
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index f0960eb68635..93430ca32a52 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -213,12 +213,15 @@ static void kbic_write_block( PIA *pi, char * buf, int count )
 		break;
 
 	case 4: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
-                for(k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+		for (k = 0; k < count / 2; k++)
+			w4w(swab16(((u16 *)buf)[k]));
                 w2(4); w2(0); w2(4);
                 break;
 
         case 5: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
-                for(k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+		for (k = 0; k < count / 4; k++)
+			w4l(swab16(((u16 *)buf)[2 * k]) |
+			    swab16(((u16 *)buf)[2 * k + 1]) << 16);
                 w2(4); w2(0); w2(4);
                 break;
 
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 58781846f282..458544fe5e6c 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -54,23 +54,6 @@ typedef struct pi_adapter PIA;	/* for paride protocol modules */
 #define r4w()			(delay_p, inw(pi->port + 4))
 #define r4l()			(delay_p, inl(pi->port + 4))
 
-static inline u16 pi_swab16(char *b, int k)
-{
-	union { u16 u; char t[2]; } r;
-
-	r.t[0] = b[2 * k + 1]; r.t[1] = b[2 * k];
-	return r.u;
-}
-
-static inline u32 pi_swab32(char *b, int k)
-{
-	union { u32 u; char f[4]; } r;
-
-	r.f[0] = b[4 * k + 1]; r.f[1] = b[4 * k];
-	r.f[2] = b[4 * k + 3]; r.f[3] = b[4 * k + 2];
-	return r.u;
-}
-
 struct pi_protocol {
 	char name[8];
 
-- 
cgit v1.2.3


From 2c08ec0f06a6ac71b78e26f82a88093094d70dc4 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:25 +0100
Subject: ata: pata_parport: Introduce module_pata_parport_driver macro

Introduce module_pata_parport_driver macro and use it in protocol
drivers to reduce boilerplate code. Remove paride_(un)register
compatibility defines.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c  | 13 +------------
 drivers/ata/pata_parport/bpck.c  | 13 +------------
 drivers/ata/pata_parport/bpck6.c | 13 +------------
 drivers/ata/pata_parport/comm.c  | 13 +------------
 drivers/ata/pata_parport/dstr.c  | 13 +------------
 drivers/ata/pata_parport/epat.c  |  4 ++--
 drivers/ata/pata_parport/epia.c  | 13 +------------
 drivers/ata/pata_parport/fit2.c  | 13 +------------
 drivers/ata/pata_parport/fit3.c  | 13 +------------
 drivers/ata/pata_parport/friq.c  | 13 +------------
 drivers/ata/pata_parport/frpw.c  | 13 +------------
 drivers/ata/pata_parport/kbic.c  | 10 +++++-----
 drivers/ata/pata_parport/ktti.c  | 13 +------------
 drivers/ata/pata_parport/on20.c  | 13 +------------
 drivers/ata/pata_parport/on26.c  | 13 +------------
 include/linux/pata_parport.h     | 14 +++++++++++---
 16 files changed, 31 insertions(+), 166 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index b66508bedbd0..9e6098f90162 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -147,16 +147,5 @@ static struct pi_protocol aten = {
 	.log_adapter	= aten_log_adapter,
 };
 
-static int __init aten_init(void)
-{
-	return paride_register(&aten);
-}
-
-static void __exit aten_exit(void)
-{
-	paride_unregister( &aten );
-}
-
 MODULE_LICENSE("GPL");
-module_init(aten_init)
-module_exit(aten_exit)
+module_pata_parport_driver(aten);
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 5fb3cf9ba11d..b9174cf8863c 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -462,16 +462,5 @@ static struct pi_protocol bpck = {
 	.log_adapter	= bpck_log_adapter,
 };
 
-static int __init bpck_init(void)
-{
-	return paride_register(&bpck);
-}
-
-static void __exit bpck_exit(void)
-{
-	paride_unregister(&bpck);
-}
-
 MODULE_LICENSE("GPL");
-module_init(bpck_init)
-module_exit(bpck_exit)
+module_pata_parport_driver(bpck);
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 3b3a40e48b21..3c358e66db25 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -245,19 +245,8 @@ static struct pi_protocol bpck6 = {
 	.release_proto	= bpck6_release_proto,
 };
 
-static int __init bpck6_init(void)
-{
-	return paride_register(&bpck6);
-}
-
-static void __exit bpck6_exit(void)
-{
-	paride_unregister(&bpck6);
-}
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Micro Solutions Inc.");
 MODULE_DESCRIPTION("BACKPACK Protocol module, compatible with PARIDE");
 module_param(verbose, bool, 0644);
-module_init(bpck6_init)
-module_exit(bpck6_exit)
+module_pata_parport_driver(bpck6);
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 11ed9fb57744..47f0fbccc3aa 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -206,16 +206,5 @@ static struct pi_protocol comm = {
 	.log_adapter	= comm_log_adapter,
 };
 
-static int __init comm_init(void)
-{
-	return paride_register(&comm);
-}
-
-static void __exit comm_exit(void)
-{
-	paride_unregister(&comm);
-}
-
 MODULE_LICENSE("GPL");
-module_init(comm_init)
-module_exit(comm_exit)
+module_pata_parport_driver(comm);
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index edf414d186a6..e733a2512e17 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -218,16 +218,5 @@ static struct pi_protocol dstr = {
 	.log_adapter	= dstr_log_adapter,
 };
 
-static int __init dstr_init(void)
-{
-	return paride_register(&dstr);
-}
-
-static void __exit dstr_exit(void)
-{
-	paride_unregister(&dstr);
-}
-
 MODULE_LICENSE("GPL");
-module_init(dstr_init)
-module_exit(dstr_exit)
+module_pata_parport_driver(dstr);
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 93ee91d9338b..7583d07083a8 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -327,12 +327,12 @@ static int __init epat_init(void)
 #ifdef CONFIG_PATA_PARPORT_EPATC8
 	epatc8 = 1;
 #endif
-	return paride_register(&epat);
+	return pata_parport_register_driver(&epat);
 }
 
 static void __exit epat_exit(void)
 {
-	paride_unregister(&epat);
+	pata_parport_unregister_driver(&epat);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 417d5a3c7f72..2bcb18a6845a 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -301,16 +301,5 @@ static struct pi_protocol epia = {
 	.log_adapter	= epia_log_adapter,
 };
 
-static int __init epia_init(void)
-{
-	return paride_register(&epia);
-}
-
-static void __exit epia_exit(void)
-{
-	paride_unregister(&epia);
-}
-
 MODULE_LICENSE("GPL");
-module_init(epia_init)
-module_exit(epia_exit)
+module_pata_parport_driver(epia);
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index 3c7a1069b026..c63f0cd2ea52 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -136,16 +136,5 @@ static struct pi_protocol fit2 = {
 	.log_adapter	= fit2_log_adapter,
 };
 
-static int __init fit2_init(void)
-{
-	return paride_register(&fit2);
-}
-
-static void __exit fit2_exit(void)
-{
-	paride_unregister(&fit2);
-}
-
 MODULE_LICENSE("GPL");
-module_init(fit2_init)
-module_exit(fit2_exit)
+module_pata_parport_driver(fit2);
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index cd95f4f0edc2..adbef142c88f 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -196,16 +196,5 @@ static struct pi_protocol fit3 = {
 	.log_adapter	= fit3_log_adapter,
 };
 
-static int __init fit3_init(void)
-{
-	return paride_register(&fit3);
-}
-
-static void __exit fit3_exit(void)
-{
-	paride_unregister(&fit3);
-}
-
 MODULE_LICENSE("GPL");
-module_init(fit3_init)
-module_exit(fit3_exit)
+module_pata_parport_driver(fit3);
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index da1d0cb016d6..e740fe933e20 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -261,16 +261,5 @@ static struct pi_protocol friq = {
 	.release_proto	= friq_release_proto,
 };
 
-static int __init friq_init(void)
-{
-	return paride_register(&friq);
-}
-
-static void __exit friq_exit(void)
-{
-	paride_unregister(&friq);
-}
-
 MODULE_LICENSE("GPL");
-module_init(friq_init)
-module_exit(friq_exit)
+module_pata_parport_driver(friq);
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index 7bc8fa16d5d8..8c8681812bed 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -298,16 +298,5 @@ static struct pi_protocol frpw = {
 	.log_adapter	= frpw_log_adapter,
 };
 
-static int __init frpw_init(void)
-{
-	return paride_register(&frpw);
-}
-
-static void __exit frpw_exit(void)
-{
-	paride_unregister(&frpw);
-}
-
 MODULE_LICENSE("GPL");
-module_init(frpw_init)
-module_exit(frpw_exit)
+module_pata_parport_driver(frpw);
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index 93430ca32a52..b120597043cc 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -288,19 +288,19 @@ static int __init kbic_init(void)
 {
 	int rv;
 
-	rv = paride_register(&k951);
+	rv = pata_parport_register_driver(&k951);
 	if (rv < 0)
 		return rv;
-	rv = paride_register(&k971);
+	rv = pata_parport_register_driver(&k971);
 	if (rv < 0)
-		paride_unregister(&k951);
+		pata_parport_unregister_driver(&k951);
 	return rv;
 }
 
 static void __exit kbic_exit(void)
 {
-	paride_unregister(&k951);
-	paride_unregister(&k971);
+	pata_parport_unregister_driver(&k951);
+	pata_parport_unregister_driver(&k971);
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index fc4f707fed1f..15463cd18968 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -113,16 +113,5 @@ static struct pi_protocol ktti = {
 	.log_adapter	= ktti_log_adapter,
 };
 
-static int __init ktti_init(void)
-{
-	return paride_register(&ktti);
-}
-
-static void __exit ktti_exit(void)
-{
-	paride_unregister(&ktti);
-}
-
 MODULE_LICENSE("GPL");
-module_init(ktti_init)
-module_exit(ktti_exit)
+module_pata_parport_driver(ktti);
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index 995fc41e3122..f2a601e77842 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -138,16 +138,5 @@ static struct pi_protocol on20 = {
 	.log_adapter	= on20_log_adapter,
 };
 
-static int __init on20_init(void)
-{
-	return paride_register(&on20);
-}
-
-static void __exit on20_exit(void)
-{
-	paride_unregister(&on20);
-}
-
 MODULE_LICENSE("GPL");
-module_init(on20_init)
-module_exit(on20_exit)
+module_pata_parport_driver(on20);
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index 35f1c481a782..66f04015f19a 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -304,16 +304,5 @@ static struct pi_protocol on26 = {
 	.log_adapter	= on26_log_adapter,
 };
 
-static int __init on26_init(void)
-{
-	return paride_register(&on26);
-}
-
-static void __exit on26_exit(void)
-{
-	paride_unregister(&on26);
-}
-
 MODULE_LICENSE("GPL");
-module_init(on26_init)
-module_exit(on26_exit)
+module_pata_parport_driver(on26);
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 458544fe5e6c..9614ce53470a 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -87,8 +87,16 @@ struct pi_protocol {
 
 int pata_parport_register_driver(struct pi_protocol *pr);
 void pata_parport_unregister_driver(struct pi_protocol *pr);
-/* defines for old paride protocol modules */
-#define paride_register pata_parport_register_driver
-#define paride_unregister pata_parport_unregister_driver
+
+/**
+ * module_pata_parport_driver() - Helper macro for registering a pata_parport driver
+ * @__pi_protocol: pi_protocol struct
+ *
+ * Helper macro for pata_parport drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_pata_parport_driver(__pi_protocol) \
+	module_driver(__pi_protocol, pata_parport_register_driver, pata_parport_unregister_driver)
 
 #endif /* LINUX_PATA_PARPORT_H */
-- 
cgit v1.2.3


From ec6e7a51d9eb1cd27840f7d8d5d05328631c9a15 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:26 +0100
Subject: ata: pata_parport: remove devtype from struct pi_adapter

Only bpck driver uses devtype but it never gets set in pata_parport.
Remove it.
As most bpck devices are CD-ROMs, always run the code that depends
on devtype == PI_PCD.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/bpck.c | 4 ++--
 include/linux/pata_parport.h    | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index b9174cf8863c..96386a10c22f 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -241,14 +241,14 @@ static void bpck_connect ( PIA *pi  )
 
 	WR(5,8);
 
-	if (pi->devtype == PI_PCD) {
+/*	if (pi->devtype == PI_PCD) {	possibly wrong, purpose unknown */
 		WR(0x46,0x10);		/* fiddle with ESS logic ??? */
 		WR(0x4c,0x38);
 		WR(0x4d,0x88);
 		WR(0x46,0xa0);
 		WR(0x41,0);
 		WR(0x4e,8);
-		}
+/*	}*/
 }
 
 static void bpck_disconnect ( PIA *pi )
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 9614ce53470a..3fc6b002c7c8 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -11,15 +11,12 @@
 
 #include <linux/libata.h>
 
-#define PI_PCD	1	/* dummy for paride protocol modules */
-
 struct pi_adapter {
 	struct device dev;
 	struct pi_protocol *proto;	/* adapter protocol */
 	int port;			/* base address of parallel port */
 	int mode;			/* transfer mode in use */
 	int delay;			/* adapter delay setting */
-	int devtype;			/* dummy for paride protocol modules */
 	char *device;			/* dummy for paride protocol modules */
 	int unit;			/* unit number for chained adapters */
 	int saved_r0;			/* saved port state */
-- 
cgit v1.2.3


From a4f2ff92ed4a0305920a46aba0ae8d50139548c0 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:27 +0100
Subject: ata: pata_parport: remove device from struct pi_adapter

device is never set in pata_parport, remove it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c  |  3 +--
 drivers/ata/pata_parport/bpck.c  | 12 ++++++------
 drivers/ata/pata_parport/bpck6.c | 10 ++++------
 drivers/ata/pata_parport/comm.c  |  3 +--
 drivers/ata/pata_parport/dstr.c  |  3 +--
 drivers/ata/pata_parport/epat.c  |  8 ++++----
 drivers/ata/pata_parport/epia.c  |  7 +++----
 drivers/ata/pata_parport/fit2.c  |  5 +++--
 drivers/ata/pata_parport/fit3.c  |  4 ++--
 drivers/ata/pata_parport/friq.c  |  6 +++---
 drivers/ata/pata_parport/frpw.c  | 12 +++++-------
 drivers/ata/pata_parport/kbic.c  |  3 +--
 drivers/ata/pata_parport/ktti.c  |  5 +++--
 drivers/ata/pata_parport/on20.c  |  3 +--
 drivers/ata/pata_parport/on26.c  |  3 +--
 include/linux/pata_parport.h     |  1 -
 16 files changed, 39 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index 9e6098f90162..bf83e4188c23 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -124,8 +124,7 @@ static void aten_log_adapter( PIA *pi, char * scratch, int verbose )
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
-        printk("%s: aten %s, ATEN EH-100 at 0x%x, ",
-                pi->device,ATEN_VERSION,pi->port);
+	printk("aten %s, ATEN EH-100 at 0x%x, ", ATEN_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 96386a10c22f..c33ca1cb72ac 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -335,8 +335,8 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
 	}
 
 	if (verbose) {
-	    printk("%s: bpck: 0x%x unit %d mode %d: ",
-		   pi->device,pi->port,pi->unit,pi->mode);
+		printk("bpck: 0x%x unit %d mode %d: ",
+		       pi->port, pi->unit, pi->mode);
 	    for (i=0;i<TEST_LEN;i++) printk("%3d",buf[i]);
 	    printk("\n");
 	}
@@ -432,13 +432,13 @@ static void bpck_log_adapter( PIA *pi, char * scratch, int verbose )
 	   for(i=0;i<128;i++)
 		if ((scratch[i] < ' ') || (scratch[i] > '~'))
 		    scratch[i] = '.';
-	   printk("%s: bpck EEPROM: %64.64s\n",pi->device,scratch);
-	   printk("%s:              %64.64s\n",pi->device,&scratch[64]);
+	   printk("bpck EEPROM: %64.64s\n", scratch);
+	   printk("             %64.64s\n", &scratch[64]);
 	}
 #endif
 
-	printk("%s: bpck %s, backpack %8.8s unit %d",
-		pi->device,BPCK_VERSION,&scratch[110],pi->unit);
+	printk("bpck %s, backpack %8.8s unit %d",
+		BPCK_VERSION, &scratch[110], pi->unit);
 	printk(" at 0x%x, mode %d (%s), delay %d\n",pi->port,
 		pi->mode,mode_string[pi->mode],pi->delay);
 }
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 3c358e66db25..13bfd60655dd 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -200,11 +200,9 @@ static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose )
 	char *mode_string[5]=
 		{"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
 
-	printk("%s: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n",pi->device);
-	printk("%s: Copyright 2001 by Micro Solutions, Inc., DeKalb IL.\n",pi->device);
-	printk("%s: BACKPACK %s, Micro Solutions BACKPACK Drive at 0x%x\n",
-		pi->device,BACKPACK_VERSION,pi->port);
-	printk("%s: Unit: %d Mode:%d (%s) Delay %d\n",pi->device,
+	printk("BACKPACK %s, Micro Solutions BACKPACK Drive at 0x%x\n",
+		BACKPACK_VERSION, pi->port);
+	printk("Unit: %d Mode:%d (%s) Delay %d\n",
 		pi->unit,pi->mode,mode_string[pi->mode],pi->delay);
 }
 
@@ -217,7 +215,7 @@ static int bpck6_init_proto(PIA *pi)
 		return 0;
 	}
 
-	printk(KERN_ERR "%s: ERROR COULDN'T ALLOCATE MEMORY\n", pi->device); 
+	printk(KERN_ERR "ERROR COULDN'T ALLOCATE MEMORY\n");
 	return -1;
 }
 
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 47f0fbccc3aa..908f6c44548b 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -183,8 +183,7 @@ static void comm_log_adapter( PIA *pi, char * scratch, int verbose )
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
 
-        printk("%s: comm %s, DataStor Commuter at 0x%x, ",
-                pi->device,COMM_VERSION,pi->port);
+	printk("comm %s, DataStor Commuter at 0x%x, ", COMM_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index e733a2512e17..3732e9b79429 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -195,8 +195,7 @@ static void dstr_log_adapter( PIA *pi, char * scratch, int verbose )
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
 
-        printk("%s: dstr %s, DataStor EP2000 at 0x%x, ",
-                pi->device,DSTR_VERSION,pi->port);
+	printk("dstr %s, DataStor EP2000 at 0x%x, ", DSTR_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 7583d07083a8..59720daa9c19 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -280,8 +280,8 @@ static int epat_test_proto( PIA *pi, char * scratch, int verbose )
         epat_disconnect(pi);
 
         if (verbose)  {
-            printk("%s: epat: port 0x%x, mode %d, ccr %x, test=(%d,%d,%d)\n",
-		   pi->device,pi->port,pi->mode,cc,e[0],e[1],f);
+		printk("epat: port 0x%x, mode %d, ccr %x, test=(%d,%d,%d)\n",
+		       pi->port, pi->mode, cc, e[0], e[1], f);
 	}
 	
         return (e[0] && e[1]) || f;
@@ -298,8 +298,8 @@ static void epat_log_adapter( PIA *pi, char * scratch, int verbose )
         ver = RR(0xb);
         epat_disconnect(pi);
 
-	printk("%s: epat %s, Shuttle EPAT chip %x at 0x%x, ",
-		pi->device,EPAT_VERSION,ver,pi->port);
+	printk("epat %s, Shuttle EPAT chip %x at 0x%x, ",
+		EPAT_VERSION, ver, pi->port);
 	printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 2bcb18a6845a..610269083441 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -263,8 +263,8 @@ static int epia_test_proto( PIA *pi, char * scratch, int verbose )
         epia_disconnect(pi);
 
         if (verbose)  {
-            printk("%s: epia: port 0x%x, mode %d, test=(%d,%d,%d)\n",
-                   pi->device,pi->port,pi->mode,e[0],e[1],f);
+		printk("epia: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+		       pi->port, pi->mode, e[0], e[1], f);
         }
         
         return (e[0] && e[1]) || f;
@@ -277,8 +277,7 @@ static void epia_log_adapter( PIA *pi, char * scratch, int verbose )
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
 
-        printk("%s: epia %s, Shuttle EPIA at 0x%x, ",
-                pi->device,EPIA_VERSION,pi->port);
+	printk("epia %s, Shuttle EPIA at 0x%x, ", EPIA_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index c63f0cd2ea52..28de8e4e41c3 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -115,8 +115,9 @@ static void fit2_disconnect ( PIA *pi )
 
 static void fit2_log_adapter( PIA *pi, char * scratch, int verbose )
 
-{       printk("%s: fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
-                pi->device,FIT2_VERSION,pi->port,pi->delay);
+{
+	printk("fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
+		FIT2_VERSION, pi->port, pi->delay);
 
 }
 
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index adbef142c88f..0366f3123508 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -173,9 +173,9 @@ static void fit3_log_adapter( PIA *pi, char * scratch, int verbose )
 
 {       char    *mode_string[3] = {"4-bit","8-bit","EPP"};
 
-	printk("%s: fit3 %s, FIT 3000 adapter at 0x%x, "
+	printk("fit3 %s, FIT 3000 adapter at 0x%x, "
 	       "mode %d (%s), delay %d\n",
-                pi->device,FIT3_VERSION,pi->port,
+		FIT3_VERSION, pi->port,
 		pi->mode,mode_string[pi->mode],pi->delay);
 
 }
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index e740fe933e20..9306a2c78d90 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -208,8 +208,8 @@ static int friq_test_proto( PIA *pi, char * scratch, int verbose )
 	friq_disconnect(pi);
 
         if (verbose)  {
-            printk("%s: friq: port 0x%x, mode %d, test=(%d,%d,%d)\n",
-                   pi->device,pi->port,pi->mode,e[0],e[1],r);
+		printk("friq: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+		       pi->port, pi->mode, e[0], e[1], r);
         }
 
         return (r || (e[0] && e[1]));
@@ -221,7 +221,7 @@ static void friq_log_adapter( PIA *pi, char * scratch, int verbose )
 {       char    *mode_string[6] = {"4-bit","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
 
-        printk("%s: friq %s, Freecom IQ ASIC-2 adapter at 0x%x, ", pi->device,
+	printk("friq %s, Freecom IQ ASIC-2 adapter at 0x%x, ",
 		FRIQ_VERSION,pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index 8c8681812bed..63f2165fec63 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -231,15 +231,13 @@ static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
 
 	if (((pi->private%2) == 0) && (pi->mode > 2)) {
 	   if (verbose) 
-		printk("%s: frpw: Xilinx does not support mode %d\n",
-			pi->device, pi->mode);
+		printk("frpw: Xilinx does not support mode %d\n", pi->mode);
 	   return 1;
 	}
 
 	if (((pi->private%2) == 1) && (pi->mode == 2)) {
 	   if (verbose)
-		printk("%s: frpw: ASIC does not support mode 2\n",
-			pi->device);
+		printk("frpw: ASIC does not support mode 2\n");
 	   return 1;
 	}
 
@@ -261,8 +259,8 @@ static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
 	frpw_disconnect(pi);
 
         if (verbose)  {
-            printk("%s: frpw: port 0x%x, chip %ld, mode %d, test=(%d,%d,%d)\n",
-                   pi->device,pi->port,(pi->private%2),pi->mode,e[0],e[1],r);
+		printk("frpw: port 0x%x, chip %ld, mode %d, test=(%d,%d,%d)\n",
+		       pi->port, (pi->private%2), pi->mode, e[0], e[1], r);
         }
 
         return (r || (e[0] && e[1]));
@@ -274,7 +272,7 @@ static void frpw_log_adapter( PIA *pi, char * scratch, int verbose )
 {       char    *mode_string[6] = {"4-bit","8-bit","EPP",
 				   "EPP-8","EPP-16","EPP-32"};
 
-        printk("%s: frpw %s, Freecom (%s) adapter at 0x%x, ", pi->device,
+	printk("frpw %s, Freecom (%s) adapter at 0x%x, ",
 		FRPW_VERSION,((pi->private%2) == 0)?"Xilinx":"ASIC",pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index b120597043cc..9a99b9e35d41 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -235,8 +235,7 @@ static void kbic_log_adapter( PIA *pi, char * scratch,
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP_16","EPP-32"};
 
-        printk("%s: kbic %s, KingByte %s at 0x%x, ",
-                pi->device,KBIC_VERSION,chip,pi->port);
+	printk("kbic %s, KingByte %s at 0x%x, ", KBIC_VERSION, chip, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index 15463cd18968..d87eb3c139bc 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -92,8 +92,9 @@ static void ktti_disconnect ( PIA *pi )
 
 static void ktti_log_adapter( PIA *pi, char * scratch, int verbose )
 
-{       printk("%s: ktti %s, KT adapter at 0x%x, delay %d\n",
-                pi->device,KTTI_VERSION,pi->port,pi->delay);
+{
+	printk("ktti %s, KT adapter at 0x%x, delay %d\n",
+		KTTI_VERSION, pi->port, pi->delay);
 
 }
 
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index f2a601e77842..0a1e60e20656 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -115,8 +115,7 @@ static void on20_log_adapter( PIA *pi, char * scratch, int verbose )
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
-        printk("%s: on20 %s, OnSpec 90c20 at 0x%x, ",
-                pi->device,ON20_VERSION,pi->port);
+	printk("on20 %s, OnSpec 90c20 at 0x%x, ", ON20_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index 66f04015f19a..ceb5018caeec 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -280,8 +280,7 @@ static void on26_log_adapter( PIA *pi, char * scratch, int verbose )
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
 
-        printk("%s: on26 %s, OnSpec 90c26 at 0x%x, ",
-                pi->device,ON26_VERSION,pi->port);
+	printk("on26 %s, OnSpec 90c26 at 0x%x, ", ON26_VERSION, pi->port);
         printk("mode %d (%s), delay %d\n",pi->mode,
 		mode_string[pi->mode],pi->delay);
 
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 3fc6b002c7c8..dcab769aa639 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -17,7 +17,6 @@ struct pi_adapter {
 	int port;			/* base address of parallel port */
 	int mode;			/* transfer mode in use */
 	int delay;			/* adapter delay setting */
-	char *device;			/* dummy for paride protocol modules */
 	int unit;			/* unit number for chained adapters */
 	int saved_r0;			/* saved port state */
 	int saved_r2;			/* saved port state */
-- 
cgit v1.2.3


From 882ff0ca354ada5f52e26d74039b2210b3070dcc Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:28 +0100
Subject: ata: pata_parport: remove typedef struct PIA

Remove typedef struct PIA and use struct pi_adapter directly.
Fix formatting (excessive spaces) while at it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c  | 14 +++++++-------
 drivers/ata/pata_parport/bpck.c  | 24 ++++++++++++------------
 drivers/ata/pata_parport/bpck6.c | 22 +++++++++++-----------
 drivers/ata/pata_parport/comm.c  | 14 +++++++-------
 drivers/ata/pata_parport/dstr.c  | 14 +++++++-------
 drivers/ata/pata_parport/epat.c  | 16 ++++++++--------
 drivers/ata/pata_parport/epia.c  | 16 ++++++++--------
 drivers/ata/pata_parport/fit2.c  | 14 +++++++-------
 drivers/ata/pata_parport/fit3.c  | 14 +++++++-------
 drivers/ata/pata_parport/friq.c  | 20 ++++++++++----------
 drivers/ata/pata_parport/frpw.c  | 20 ++++++++++----------
 drivers/ata/pata_parport/kbic.c  | 24 ++++++++++++------------
 drivers/ata/pata_parport/ktti.c  | 14 +++++++-------
 drivers/ata/pata_parport/on20.c  | 14 +++++++-------
 drivers/ata/pata_parport/on26.c  | 16 ++++++++--------
 include/linux/pata_parport.h     |  2 --
 16 files changed, 128 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index bf83e4188c23..4579e554fbad 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -35,7 +35,7 @@
 
 static int  cont_map[2] = { 0x08, 0x20 };
 
-static void  aten_write_regr( PIA *pi, int cont, int regr, int val)
+static void aten_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -44,7 +44,7 @@ static void  aten_write_regr( PIA *pi, int cont, int regr, int val)
 	w0(r); w2(0xe); w2(6); w0(val); w2(7); w2(6); w2(0xc);
 }
 
-static int aten_read_regr( PIA *pi, int cont, int regr )
+static int aten_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int  a, b, r;
 
@@ -67,7 +67,7 @@ static int aten_read_regr( PIA *pi, int cont, int regr )
 	return -1;
 }
 
-static void aten_read_block( PIA *pi, char * buf, int count )
+static void aten_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int  k, a, b, c, d;
 
@@ -95,7 +95,7 @@ static void aten_read_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void aten_write_block( PIA *pi, char * buf, int count )
+static void aten_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int k;
 
@@ -107,20 +107,20 @@ static void aten_write_block( PIA *pi, char * buf, int count )
 	w2(0xc);
 }
 
-static void aten_connect ( PIA *pi  )
+static void aten_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
 	w2(0xc);	
 }
 
-static void aten_disconnect ( PIA *pi )
+static void aten_disconnect(struct pi_adapter *pi)
 
 {       w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void aten_log_adapter( PIA *pi, char * scratch, int verbose )
+static void aten_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index c33ca1cb72ac..46805c584730 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -46,7 +46,7 @@
 
 static int  cont_map[3] = { 0x40, 0x48, 0 };
 
-static int bpck_read_regr( PIA *pi, int cont, int regr )
+static int bpck_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int r, l, h;
 
@@ -77,7 +77,7 @@ static int bpck_read_regr( PIA *pi, int cont, int regr )
 	return -1;
 }	
 
-static void bpck_write_regr( PIA *pi, int cont, int regr, int val )
+static void bpck_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int	r;
 
@@ -106,7 +106,7 @@ static void bpck_write_regr( PIA *pi, int cont, int regr, int val )
 #define WR(r,v)		bpck_write_regr(pi,2,r,v)
 #define RR(r)		(bpck_read_regr(pi,2,r))
 
-static void bpck_write_block( PIA *pi, char * buf, int count )
+static void bpck_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int i;
 
@@ -147,7 +147,7 @@ static void bpck_write_block( PIA *pi, char * buf, int count )
  	}
 }
 
-static void bpck_read_block( PIA *pi, char * buf, int count )
+static void bpck_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int i, l, h;
 
@@ -194,7 +194,7 @@ static void bpck_read_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static int bpck_probe_unit ( PIA *pi )
+static int bpck_probe_unit(struct pi_adapter *pi)
 
 {	int o1, o0, f7, id;
 	int t, s;
@@ -217,7 +217,7 @@ static int bpck_probe_unit ( PIA *pi )
 	return 1;
 }
 	
-static void bpck_connect ( PIA *pi  )
+static void bpck_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
 	w0(0xff-pi->unit); w2(4); w0(pi->unit);
@@ -251,14 +251,14 @@ static void bpck_connect ( PIA *pi  )
 /*	}*/
 }
 
-static void bpck_disconnect ( PIA *pi )
+static void bpck_disconnect(struct pi_adapter *pi)
 
 {	w0(0); 
 	if (pi->mode >= 2) { w2(9); w2(0); } else t2(2);
 	w2(0x4c); w0(pi->saved_r0);
 } 
 
-static void bpck_force_spp ( PIA *pi )
+static void bpck_force_spp(struct pi_adapter *pi)
 
 /* This fakes the EPP protocol to turn off EPP ... */
 
@@ -276,7 +276,7 @@ static void bpck_force_spp ( PIA *pi )
 
 #define TEST_LEN  16
 
-static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
+static int bpck_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 
 {	int i, e, l, h, om;
 	char buf[TEST_LEN];
@@ -346,7 +346,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
 	return e;
 }
 
-static void bpck_read_eeprom ( PIA *pi, char * buf )
+static void bpck_read_eeprom(struct pi_adapter *pi, char *buf)
 
 {       int i, j, k, p, v, f, om, od;
 
@@ -397,7 +397,7 @@ static void bpck_read_eeprom ( PIA *pi, char * buf )
 	pi->mode = om; pi->delay = od;
 }
 
-static int bpck_test_port ( PIA *pi ) 	/* check for 8-bit port */
+static int bpck_test_port(struct pi_adapter *pi)	/* check for 8-bit port */
 
 {	int	i, r, m;
 
@@ -416,7 +416,7 @@ static int bpck_test_port ( PIA *pi ) 	/* check for 8-bit port */
 	return 5;
 }
 
-static void bpck_log_adapter( PIA *pi, char * scratch, int verbose )
+static void bpck_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {	char	*mode_string[5] = { "4-bit","8-bit","EPP-8",
 				    "EPP-16","EPP-32" };
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 13bfd60655dd..8ed3cf3b627e 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -56,7 +56,7 @@ static bool verbose; /* set this to 1 to see debugging messages and whatnot */
 #define ATAPI_DEVICE_CONTROL 0x0e /* device control (write)   */
 /****************************************************************/
 
-static int bpck6_read_regr(PIA *pi, int cont, int reg)
+static int bpck6_read_regr(struct pi_adapter *pi, int cont, int reg)
 {
 	unsigned int out;
 
@@ -69,7 +69,7 @@ static int bpck6_read_regr(PIA *pi, int cont, int reg)
 	return(out);
 }
 
-static void bpck6_write_regr(PIA *pi, int cont, int reg, int val)
+static void bpck6_write_regr(struct pi_adapter *pi, int cont, int reg, int val)
 {
 	/* check for bad settings */
 	if (reg>=0 && reg<=7 && cont>=0 && cont<=1)
@@ -78,17 +78,17 @@ static void bpck6_write_regr(PIA *pi, int cont, int reg, int val)
 	}
 }
 
-static void bpck6_write_block( PIA *pi, char * buf, int len )
+static void bpck6_write_block(struct pi_adapter *pi, char *buf, int len)
 {
 	ppc6_wr_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1); 
 }
 
-static void bpck6_read_block( PIA *pi, char * buf, int len )
+static void bpck6_read_block(struct pi_adapter *pi, char *buf, int len)
 {
 	ppc6_rd_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1);
 }
 
-static void bpck6_connect ( PIA *pi  )
+static void bpck6_connect(struct pi_adapter *pi)
 {
 	if(verbose)
 	{
@@ -112,7 +112,7 @@ static void bpck6_connect ( PIA *pi  )
 	ppc6_wr_extout(PPCSTRUCT(pi),0x3);
 }
 
-static void bpck6_disconnect ( PIA *pi )
+static void bpck6_disconnect(struct pi_adapter *pi)
 {
 	if(verbose)
 	{
@@ -122,7 +122,7 @@ static void bpck6_disconnect ( PIA *pi )
 	ppc6_close(PPCSTRUCT(pi));
 }
 
-static int bpck6_test_port ( PIA *pi )   /* check for 8-bit port */
+static int bpck6_test_port(struct pi_adapter *pi)   /* check for 8-bit port */
 {
 	if(verbose)
 	{
@@ -154,7 +154,7 @@ static int bpck6_test_port ( PIA *pi )   /* check for 8-bit port */
 	}
 }
 
-static int bpck6_probe_unit ( PIA *pi )
+static int bpck6_probe_unit(struct pi_adapter *pi)
 {
 	int out;
 
@@ -195,7 +195,7 @@ static int bpck6_probe_unit ( PIA *pi )
   	}
 }
 
-static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose )
+static void bpck6_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 {
 	char *mode_string[5]=
 		{"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
@@ -206,7 +206,7 @@ static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose )
 		pi->unit,pi->mode,mode_string[pi->mode],pi->delay);
 }
 
-static int bpck6_init_proto(PIA *pi)
+static int bpck6_init_proto(struct pi_adapter *pi)
 {
 	Interface *p = kzalloc(sizeof(Interface), GFP_KERNEL);
 
@@ -219,7 +219,7 @@ static int bpck6_init_proto(PIA *pi)
 	return -1;
 }
 
-static void bpck6_release_proto(PIA *pi)
+static void bpck6_release_proto(struct pi_adapter *pi)
 {
 	kfree((void *)(pi->private)); 
 }
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 908f6c44548b..06c6fa29295b 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -42,7 +42,7 @@
 
 static int  cont_map[2] = { 0x08, 0x10 };
 
-static int comm_read_regr( PIA *pi, int cont, int regr )
+static int comm_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int     l, h, r;
 
@@ -68,7 +68,7 @@ static int comm_read_regr( PIA *pi, int cont, int regr )
         return -1;
 }       
 
-static void comm_write_regr( PIA *pi, int cont, int regr, int val )
+static void comm_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {       int  r;
 
@@ -87,7 +87,7 @@ static void comm_write_regr( PIA *pi, int cont, int regr, int val )
         }
 }
 
-static void comm_connect ( PIA *pi  )
+static void comm_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -98,14 +98,14 @@ static void comm_connect ( PIA *pi  )
         w2(4); w0(0xe0); w2(0xc); w2(0xc); w2(4);
 }
 
-static void comm_disconnect ( PIA *pi )
+static void comm_disconnect(struct pi_adapter *pi)
 
 {       w2(0); w2(0); w2(0); w2(4); 
 	w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void comm_read_block( PIA *pi, char * buf, int count )
+static void comm_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     i, l, h;
 
@@ -146,7 +146,7 @@ static void comm_read_block( PIA *pi, char * buf, int count )
 
 /* NB: Watch out for the byte swapped writes ! */
 
-static void comm_write_block( PIA *pi, char * buf, int count )
+static void comm_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int	k;
 
@@ -179,7 +179,7 @@ static void comm_write_block( PIA *pi, char * buf, int count )
         }
 }
 
-static void comm_log_adapter( PIA *pi, char * scratch, int verbose )
+static void comm_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
 
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index 3732e9b79429..8cac71eb9c03 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -44,7 +44,7 @@
 
 static int  cont_map[2] = { 0x20, 0x40 };
 
-static int dstr_read_regr( PIA *pi, int cont, int regr )
+static int dstr_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int     a, b, r;
 
@@ -71,7 +71,7 @@ static int dstr_read_regr( PIA *pi, int cont, int regr )
         return -1;
 }       
 
-static void dstr_write_regr(  PIA *pi, int cont, int regr, int val )
+static void dstr_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {       int  r;
 
@@ -98,21 +98,21 @@ static void dstr_write_regr(  PIA *pi, int cont, int regr, int val )
 		 w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);w0(0x78);\
 		 w0(x);w2(5);w2(4);
 
-static void dstr_connect ( PIA *pi  )
+static void dstr_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
         w2(4); CCP(0xe0); w0(0xff);
 }
 
-static void dstr_disconnect ( PIA *pi )
+static void dstr_disconnect(struct pi_adapter *pi)
 
 {       CCP(0x30);
         w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void dstr_read_block( PIA *pi, char * buf, int count )
+static void dstr_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     k, a, b;
 
@@ -154,7 +154,7 @@ static void dstr_read_block( PIA *pi, char * buf, int count )
         }
 }
 
-static void dstr_write_block( PIA *pi, char * buf, int count )
+static void dstr_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int	k;
 
@@ -190,7 +190,7 @@ static void dstr_write_block( PIA *pi, char * buf, int count )
 }
 
 
-static void dstr_log_adapter( PIA *pi, char * scratch, int verbose )
+static void dstr_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 59720daa9c19..9276dcb261d8 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -44,7 +44,7 @@ MODULE_PARM_DESC(epatc8, "support for the Shuttle EP1284 chip, "
 
 static int cont_map[3] = { 0x18, 0x10, 0 };
 
-static void epat_write_regr( PIA *pi, int cont, int regr, int val)
+static void epat_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -65,7 +65,7 @@ static void epat_write_regr( PIA *pi, int cont, int regr, int val)
 	}
 }
 
-static int epat_read_regr( PIA *pi, int cont, int regr )
+static int epat_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int  a, b, r;
 
@@ -94,7 +94,7 @@ static int epat_read_regr( PIA *pi, int cont, int regr )
 	return -1;	/* never gets here */
 }
 
-static void epat_read_block( PIA *pi, char * buf, int count )
+static void epat_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int  k, ph, a, b;
 
@@ -159,7 +159,7 @@ static void epat_read_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void epat_write_block( PIA *pi, char * buf, int count )   
+static void epat_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int ph, k;
 
@@ -210,7 +210,7 @@ static void epat_write_block( PIA *pi, char * buf, int count )
 #define CPP(x) 	w2(4);w0(0x22);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
                 w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
 
-static void epat_connect ( PIA *pi )
+static void epat_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -242,13 +242,13 @@ static void epat_connect ( PIA *pi )
 	}
 }
 
-static void epat_disconnect (PIA *pi)
+static void epat_disconnect(struct pi_adapter *pi)
 {	CPP(0x30);
 	w0(pi->saved_r0);
 	w2(pi->saved_r2);
 }
 
-static int epat_test_proto( PIA *pi, char * scratch, int verbose )
+static int epat_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       int     k, j, f, cc;
 	int	e[2] = {0,0};
@@ -287,7 +287,7 @@ static int epat_test_proto( PIA *pi, char * scratch, int verbose )
         return (e[0] && e[1]) || f;
 }
 
-static void epat_log_adapter( PIA *pi, char * scratch, int verbose )
+static void epat_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {	int	ver;
         char    *mode_string[6] = 
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 610269083441..85b1aba995e1 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -46,7 +46,7 @@
 
 static int cont_map[2] = { 0, 0x80 };
 
-static int epia_read_regr( PIA *pi, int cont, int regr )
+static int epia_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int     a, b, r;
 
@@ -79,7 +79,7 @@ static int epia_read_regr( PIA *pi, int cont, int regr )
         return -1;
 }       
 
-static void epia_write_regr( PIA *pi, int cont, int regr, int val)
+static void epia_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {       int  r;
 
@@ -110,7 +110,7 @@ static void epia_write_regr( PIA *pi, int cont, int regr, int val)
    2048 byte reads (the last two being used in the CDrom drivers.
 */
 
-static void epia_connect ( PIA *pi  )
+static void epia_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -124,7 +124,7 @@ static void epia_connect ( PIA *pi  )
         WR(0x86,8);  
 }
 
-static void epia_disconnect ( PIA *pi )
+static void epia_disconnect(struct pi_adapter *pi)
 
 {       /* WR(0x84,0x10); */
         w0(pi->saved_r0);
@@ -133,7 +133,7 @@ static void epia_disconnect ( PIA *pi )
         w2(pi->saved_r2);
 } 
 
-static void epia_read_block( PIA *pi, char * buf, int count )
+static void epia_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     k, ph, a, b;
 
@@ -193,7 +193,7 @@ static void epia_read_block( PIA *pi, char * buf, int count )
         }
 }
 
-static void epia_write_block( PIA *pi, char * buf, int count )
+static void epia_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     ph, k, last, d;
 
@@ -234,7 +234,7 @@ static void epia_write_block( PIA *pi, char * buf, int count )
 
 }
 
-static int epia_test_proto( PIA *pi, char * scratch, int verbose )
+static int epia_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       int     j, k, f;
 	int	e[2] = {0,0};
@@ -272,7 +272,7 @@ static int epia_test_proto( PIA *pi, char * scratch, int verbose )
 }
 
 
-static void epia_log_adapter( PIA *pi, char * scratch, int verbose )
+static void epia_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index 28de8e4e41c3..67e095d6a718 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -37,13 +37,13 @@ devices.
 
 */
 
-static void  fit2_write_regr( PIA *pi, int cont, int regr, int val)
+static void fit2_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	if (cont == 1) return;
 	w2(0xc); w0(regr); w2(4); w0(val); w2(5); w0(0); w2(4);
 }
 
-static int fit2_read_regr( PIA *pi, int cont, int regr )
+static int fit2_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int  a, b, r;
 
@@ -61,7 +61,7 @@ static int fit2_read_regr( PIA *pi, int cont, int regr )
 
 }
 
-static void fit2_read_block( PIA *pi, char * buf, int count )
+static void fit2_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int  k, a, b, c, d;
 
@@ -87,7 +87,7 @@ static void fit2_read_block( PIA *pi, char * buf, int count )
 
 }
 
-static void fit2_write_block( PIA *pi, char * buf, int count )
+static void fit2_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int k;
 
@@ -100,20 +100,20 @@ static void fit2_write_block( PIA *pi, char * buf, int count )
 	w2(4);
 }
 
-static void fit2_connect ( PIA *pi  )
+static void fit2_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
 	w2(0xcc); 
 }
 
-static void fit2_disconnect ( PIA *pi )
+static void fit2_disconnect(struct pi_adapter *pi)
 
 {       w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void fit2_log_adapter( PIA *pi, char * scratch, int verbose )
+static void fit2_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {
 	printk("fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index 0366f3123508..01e862a94b96 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -39,7 +39,7 @@
 
 */
 
-static void  fit3_write_regr( PIA *pi, int cont, int regr, int val)
+static void fit3_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	if (cont == 1) return;
 
@@ -59,7 +59,7 @@ static void  fit3_write_regr( PIA *pi, int cont, int regr, int val)
 	}
 }
 
-static int fit3_read_regr( PIA *pi, int cont, int regr )
+static int fit3_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int  a, b;
 
@@ -92,7 +92,7 @@ static int fit3_read_regr( PIA *pi, int cont, int regr )
 
 }
 
-static void fit3_read_block( PIA *pi, char * buf, int count )
+static void fit3_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int  k, a, b, c, d;
 
@@ -131,7 +131,7 @@ static void fit3_read_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void fit3_write_block( PIA *pi, char * buf, int count )
+static void fit3_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int k;
 
@@ -152,7 +152,7 @@ static void fit3_write_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void fit3_connect ( PIA *pi  )
+static void fit3_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -162,14 +162,14 @@ static void fit3_connect ( PIA *pi  )
 		}
 }
 
-static void fit3_disconnect ( PIA *pi )
+static void fit3_disconnect(struct pi_adapter *pi)
 
 {       w2(0xc); w0(0xa); w2(0x8); w2(0xc);
 	w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void fit3_log_adapter( PIA *pi, char * scratch, int verbose )
+static void fit3_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[3] = {"4-bit","8-bit","EPP"};
 
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index 9306a2c78d90..9f8f4e6b1a7c 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -48,7 +48,7 @@
 
 static int  cont_map[2] = { 0x08, 0x10 };
 
-static int friq_read_regr( PIA *pi, int cont, int regr )
+static int friq_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int	h,l,r;
 
@@ -63,7 +63,7 @@ static int friq_read_regr( PIA *pi, int cont, int regr )
 
 }
 
-static void friq_write_regr( PIA *pi, int cont, int regr, int val)
+static void friq_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -74,7 +74,7 @@ static void friq_write_regr( PIA *pi, int cont, int regr, int val)
 	w2(5);w2(7);w2(5);w2(4);
 }
 
-static void friq_read_block_int( PIA *pi, char * buf, int count, int regr )
+static void friq_read_block_int(struct pi_adapter *pi, char *buf, int count, int regr)
 
 {       int     h, l, k, ph;
 
@@ -129,12 +129,12 @@ static void friq_read_block_int( PIA *pi, char * buf, int count, int regr )
         }
 }
 
-static void friq_read_block( PIA *pi, char * buf, int count)
+static void friq_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	friq_read_block_int(pi,buf,count,0x08);
 }
 
-static void friq_write_block( PIA *pi, char * buf, int count )
+static void friq_write_block(struct pi_adapter *pi, char *buf, int count)
  
 {	int	k;
 
@@ -166,21 +166,21 @@ static void friq_write_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void friq_connect ( PIA *pi  )
+static void friq_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
 	w2(4);
 }
 
-static void friq_disconnect ( PIA *pi )
+static void friq_disconnect(struct pi_adapter *pi)
 
 {       CMD(0x20);
 	w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static int friq_test_proto( PIA *pi, char * scratch, int verbose )
+static int friq_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
@@ -216,7 +216,7 @@ static int friq_test_proto( PIA *pi, char * scratch, int verbose )
 }
 
 
-static void friq_log_adapter( PIA *pi, char * scratch, int verbose )
+static void friq_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[6] = {"4-bit","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
@@ -233,7 +233,7 @@ static void friq_log_adapter( PIA *pi, char * scratch, int verbose )
 
 }
 
-static void friq_release_proto( PIA *pi)
+static void friq_release_proto(struct pi_adapter *pi)
 {
 	if (pi->private) {		/* turn off the power */
 		friq_connect(pi);
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index 63f2165fec63..e93570190dab 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -44,7 +44,7 @@
 
 static int  cont_map[2] = { 0x08, 0x10 };
 
-static int frpw_read_regr( PIA *pi, int cont, int regr )
+static int frpw_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int	h,l,r;
 
@@ -60,7 +60,7 @@ static int frpw_read_regr( PIA *pi, int cont, int regr )
 
 }
 
-static void frpw_write_regr( PIA *pi, int cont, int regr, int val)
+static void frpw_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -71,7 +71,7 @@ static void frpw_write_regr( PIA *pi, int cont, int regr, int val)
 	w2(5);w2(7);w2(5);w2(4);
 }
 
-static void frpw_read_block_int( PIA *pi, char * buf, int count, int regr )
+static void frpw_read_block_int(struct pi_adapter *pi, char *buf, int count, int regr)
 
 {       int     h, l, k, ph;
 
@@ -132,12 +132,12 @@ static void frpw_read_block_int( PIA *pi, char * buf, int count, int regr )
         }
 }
 
-static void frpw_read_block( PIA *pi, char * buf, int count)
+static void frpw_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	frpw_read_block_int(pi,buf,count,0x08);
 }
 
-static void frpw_write_block( PIA *pi, char * buf, int count )
+static void frpw_write_block(struct pi_adapter *pi, char *buf, int count)
  
 {	int	k;
 
@@ -170,14 +170,14 @@ static void frpw_write_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void frpw_connect ( PIA *pi  )
+static void frpw_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
 	w2(4);
 }
 
-static void frpw_disconnect ( PIA *pi )
+static void frpw_disconnect(struct pi_adapter *pi)
 
 {       w2(4); w0(0x20); cec4;
 	w0(pi->saved_r0);
@@ -188,7 +188,7 @@ static void frpw_disconnect ( PIA *pi )
    between the Xilinx and ASIC implementations of the Freecom adapter.
 */
 
-static int frpw_test_pnp ( PIA *pi )
+static int frpw_test_pnp(struct pi_adapter *pi)
 
 /*  returns chip_type:   0 = Xilinx, 1 = ASIC   */
 
@@ -221,7 +221,7 @@ static int frpw_test_pnp ( PIA *pi )
    a hack :-(
 */
 
-static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
+static int frpw_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
@@ -267,7 +267,7 @@ static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
 }
 
 
-static void frpw_log_adapter( PIA *pi, char * scratch, int verbose )
+static void frpw_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[6] = {"4-bit","8-bit","EPP",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index 9a99b9e35d41..fffb79054144 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -42,7 +42,7 @@
 
 static int  cont_map[2] = { 0x80, 0x40 };
 
-static int kbic_read_regr( PIA *pi, int cont, int regr )
+static int kbic_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int     a, b, s;
 
@@ -72,7 +72,7 @@ static int kbic_read_regr( PIA *pi, int cont, int regr )
 	return -1;
 }       
 
-static void  kbic_write_regr( PIA *pi, int cont, int regr, int val)
+static void kbic_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {       int  s;
 
@@ -96,14 +96,14 @@ static void  kbic_write_regr( PIA *pi, int cont, int regr, int val)
 	}
 }
 
-static void k951_connect ( PIA *pi  )
+static void k951_connect(struct pi_adapter *pi)
 
 { 	pi->saved_r0 = r0();
         pi->saved_r2 = r2();
         w2(4); 
 }
 
-static void k951_disconnect ( PIA *pi )
+static void k951_disconnect(struct pi_adapter *pi)
 
 {      	w0(pi->saved_r0);
         w2(pi->saved_r2);
@@ -112,7 +112,7 @@ static void k951_disconnect ( PIA *pi )
 #define	CCP(x)	w2(0xc4);w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);\
 		w0(0x78);w0(x);w2(0xc5);w2(0xc4);w0(0xff);
 
-static void k971_connect ( PIA *pi  )
+static void k971_connect(struct pi_adapter *pi)
 
 { 	pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -120,7 +120,7 @@ static void k971_connect ( PIA *pi  )
         w2(4); 
 }
 
-static void k971_disconnect ( PIA *pi )
+static void k971_disconnect(struct pi_adapter *pi)
 
 {       CCP(0x30);
 	w0(pi->saved_r0);
@@ -131,7 +131,7 @@ static void k971_disconnect ( PIA *pi )
    have this property.
 */
 
-static void kbic_read_block( PIA *pi, char * buf, int count )
+static void kbic_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     k, a, b;
 
@@ -189,7 +189,7 @@ static void kbic_read_block( PIA *pi, char * buf, int count )
         }
 }
 
-static void kbic_write_block( PIA *pi, char * buf, int count )
+static void kbic_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     k;
 
@@ -229,8 +229,8 @@ static void kbic_write_block( PIA *pi, char * buf, int count )
 
 }
 
-static void kbic_log_adapter( PIA *pi, char * scratch, 
-			      int verbose, char * chip )
+static void kbic_log_adapter(struct pi_adapter *pi, char *scratch,
+			      int verbose, char *chip)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP_16","EPP-32"};
@@ -241,12 +241,12 @@ static void kbic_log_adapter( PIA *pi, char * scratch,
 
 }
 
-static void k951_log_adapter( PIA *pi, char * scratch, int verbose )
+static void k951_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {	kbic_log_adapter(pi,scratch,verbose,"KBIC-951A");
 }
 
-static void k971_log_adapter( PIA *pi, char * scratch, int verbose )
+static void k971_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       kbic_log_adapter(pi,scratch,verbose,"KBIC-971A");
 }
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index d87eb3c139bc..ffb2af0ce045 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -29,7 +29,7 @@
 
 static int  cont_map[2] = { 0x10, 0x08 };
 
-static void  ktti_write_regr( PIA *pi, int cont, int regr, int val)
+static void ktti_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -39,7 +39,7 @@ static void  ktti_write_regr( PIA *pi, int cont, int regr, int val)
 	w0(val); w2(3); w0(0); w2(6); w2(0xb);
 }
 
-static int ktti_read_regr( PIA *pi, int cont, int regr )
+static int ktti_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int  a, b, r;
 
@@ -51,7 +51,7 @@ static int ktti_read_regr( PIA *pi, int cont, int regr )
 
 }
 
-static void ktti_read_block( PIA *pi, char * buf, int count )
+static void ktti_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int  k, a, b;
 
@@ -64,7 +64,7 @@ static void ktti_read_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void ktti_write_block( PIA *pi, char * buf, int count )
+static void ktti_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int k;
 
@@ -76,21 +76,21 @@ static void ktti_write_block( PIA *pi, char * buf, int count )
 	}
 }
 
-static void ktti_connect ( PIA *pi  )
+static void ktti_connect(struct pi_adapter *pi)
 
 {       pi->saved_r0 = r0();
         pi->saved_r2 = r2();
 	w2(0xb); w2(0xa); w0(0); w2(3); w2(6);	
 }
 
-static void ktti_disconnect ( PIA *pi )
+static void ktti_disconnect(struct pi_adapter *pi)
 
 {       w2(0xb); w2(0xa); w0(0xa0); w2(3); w2(4);
 	w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void ktti_log_adapter( PIA *pi, char * scratch, int verbose )
+static void ktti_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {
 	printk("ktti %s, KT adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index 0a1e60e20656..fb3399329fb6 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -33,7 +33,7 @@
    cont = 1 - access the IDE command set 
 */
 
-static int on20_read_regr( PIA *pi, int cont, int regr )
+static int on20_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {	int h,l, r ;
 
@@ -56,7 +56,7 @@ static int on20_read_regr( PIA *pi, int cont, int regr )
 	return -1;
 }	
 
-static void on20_write_regr( PIA *pi, int cont, int regr, int val )
+static void on20_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {	int r;
 
@@ -67,7 +67,7 @@ static void on20_write_regr( PIA *pi, int cont, int regr, int val )
 	op(0); vl(val);
 }
 
-static void on20_connect ( PIA *pi)
+static void on20_connect(struct pi_adapter *pi)
 
 {	pi->saved_r0 = r0();
         pi->saved_r2 = r2();
@@ -77,14 +77,14 @@ static void on20_connect ( PIA *pi)
 	       else   { op(2); vl(0); op(2); vl(8); }
 }
 
-static void on20_disconnect ( PIA *pi )
+static void on20_disconnect(struct pi_adapter *pi)
 
 {	w2(4);w0(7);w2(4);w2(0xc);w2(4);
         w0(pi->saved_r0);
         w2(pi->saved_r2);
 } 
 
-static void on20_read_block( PIA *pi, char * buf, int count )
+static void on20_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int     k, l, h; 
 
@@ -101,7 +101,7 @@ static void on20_read_block( PIA *pi, char * buf, int count )
 	w2(4);
 }
 
-static void on20_write_block(  PIA *pi, char * buf, int count )
+static void on20_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {	int	k;
 
@@ -111,7 +111,7 @@ static void on20_write_block(  PIA *pi, char * buf, int count )
 	w2(4);
 }
 
-static void on20_log_adapter( PIA *pi, char * scratch, int verbose )
+static void on20_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index ceb5018caeec..7e5fc499fcc5 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -44,7 +44,7 @@
    cont = 1 - access the IDE command set 
 */
 
-static int on26_read_regr( PIA *pi, int cont, int regr )
+static int on26_read_regr(struct pi_adapter *pi, int cont, int regr)
 
 {       int     a, b, r;
 
@@ -73,7 +73,7 @@ static int on26_read_regr( PIA *pi, int cont, int regr )
         return -1;
 }       
 
-static void on26_write_regr( PIA *pi, int cont, int regr, int val )
+static void on26_write_regr(struct pi_adapter *pi, int cont, int regr, int val)
 
 {       int  r;
 
@@ -99,7 +99,7 @@ static void on26_write_regr( PIA *pi, int cont, int regr, int val )
 #define  CCP(x)  w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
 		 w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
 
-static void on26_connect ( PIA *pi )
+static void on26_connect(struct pi_adapter *pi)
 
 {       int	x;
 
@@ -113,7 +113,7 @@ static void on26_connect ( PIA *pi )
 	w0(2); P1; w0(x); P2;
 }
 
-static void on26_disconnect ( PIA *pi )
+static void on26_disconnect(struct pi_adapter *pi)
 
 {       if (pi->mode >= 2) { w3(4); w3(4); w3(4); w3(4); }
 	              else { w0(4); P1; w0(4); P1; }
@@ -124,7 +124,7 @@ static void on26_disconnect ( PIA *pi )
 
 #define	RESET_WAIT  200
 
-static int on26_test_port( PIA *pi)  /* hard reset */
+static int on26_test_port(struct pi_adapter *pi)  /* hard reset */
 
 {       int     i, m, d, x=0, y=0;
 
@@ -183,7 +183,7 @@ static int on26_test_port( PIA *pi)  /* hard reset */
 }
 
 
-static void on26_read_block( PIA *pi, char * buf, int count )
+static void on26_read_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int     k, a, b;
 
@@ -232,7 +232,7 @@ static void on26_read_block( PIA *pi, char * buf, int count )
         }
 }
 
-static void on26_write_block( PIA *pi, char * buf, int count )
+static void on26_write_block(struct pi_adapter *pi, char *buf, int count)
 
 {       int	k;
 
@@ -275,7 +275,7 @@ static void on26_write_block( PIA *pi, char * buf, int count )
 
 }
 
-static void on26_log_adapter( PIA *pi, char * scratch, int verbose )
+static void on26_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index dcab769aa639..381b4d0e3574 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -24,8 +24,6 @@ struct pi_adapter {
 	struct pardevice *pardev;	/* pointer to pardevice */
 };
 
-typedef struct pi_adapter PIA;	/* for paride protocol modules */
-
 /* registers are addressed as (cont,regr)
  *	cont: 0 for command register file, 1 for control register(s)
  *	regr: 0-7 for register number.
-- 
cgit v1.2.3


From 3a7474ba54ef29233af3de8b1f97ec006c2f902a Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:29 +0100
Subject: ata: pata_parport: remove verbose parameter from log_adapter()

verbose parameter of log_adapter() is unused, remove it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c         |  2 +-
 drivers/ata/pata_parport/bpck.c         |  4 +---
 drivers/ata/pata_parport/bpck6.c        |  2 +-
 drivers/ata/pata_parport/comm.c         |  2 +-
 drivers/ata/pata_parport/dstr.c         |  2 +-
 drivers/ata/pata_parport/epat.c         |  2 +-
 drivers/ata/pata_parport/epia.c         |  2 +-
 drivers/ata/pata_parport/fit2.c         |  2 +-
 drivers/ata/pata_parport/fit3.c         |  2 +-
 drivers/ata/pata_parport/friq.c         |  2 +-
 drivers/ata/pata_parport/frpw.c         |  2 +-
 drivers/ata/pata_parport/kbic.c         | 15 +++++++--------
 drivers/ata/pata_parport/ktti.c         |  2 +-
 drivers/ata/pata_parport/on20.c         |  2 +-
 drivers/ata/pata_parport/on26.c         |  2 +-
 drivers/ata/pata_parport/pata_parport.c |  2 +-
 include/linux/pata_parport.h            |  2 +-
 17 files changed, 23 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index 4579e554fbad..0a98954f380f 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -120,7 +120,7 @@ static void aten_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void aten_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void aten_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 46805c584730..1a3e3d5b1b25 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -416,7 +416,7 @@ static int bpck_test_port(struct pi_adapter *pi)	/* check for 8-bit port */
 	return 5;
 }
 
-static void bpck_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void bpck_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {	char	*mode_string[5] = { "4-bit","8-bit","EPP-8",
 				    "EPP-16","EPP-32" };
@@ -428,13 +428,11 @@ static void bpck_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
 	bpck_read_eeprom(pi,scratch);
 
 #ifdef DUMP_EEPROM
-	if (verbose) {
 	   for(i=0;i<128;i++)
 		if ((scratch[i] < ' ') || (scratch[i] > '~'))
 		    scratch[i] = '.';
 	   printk("bpck EEPROM: %64.64s\n", scratch);
 	   printk("             %64.64s\n", &scratch[64]);
-	}
 #endif
 
 	printk("bpck %s, backpack %8.8s unit %d",
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 8ed3cf3b627e..68f7fdcab9be 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -195,7 +195,7 @@ static int bpck6_probe_unit(struct pi_adapter *pi)
   	}
 }
 
-static void bpck6_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void bpck6_log_adapter(struct pi_adapter *pi, char *scratch)
 {
 	char *mode_string[5]=
 		{"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 06c6fa29295b..69a66658aa29 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -179,7 +179,7 @@ static void comm_write_block(struct pi_adapter *pi, char *buf, int count)
         }
 }
 
-static void comm_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void comm_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
 
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index 8cac71eb9c03..17b1a7cb0a15 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -190,7 +190,7 @@ static void dstr_write_block(struct pi_adapter *pi, char *buf, int count)
 }
 
 
-static void dstr_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void dstr_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 9276dcb261d8..b125df16f160 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -287,7 +287,7 @@ static int epat_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
         return (e[0] && e[1]) || f;
 }
 
-static void epat_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void epat_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {	int	ver;
         char    *mode_string[6] = 
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 85b1aba995e1..452d3a8e17af 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -272,7 +272,7 @@ static int epia_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void epia_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void epia_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index 67e095d6a718..632c51af84d7 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -113,7 +113,7 @@ static void fit2_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void fit2_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void fit2_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {
 	printk("fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index 01e862a94b96..bdf90cb536c2 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -169,7 +169,7 @@ static void fit3_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void fit3_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void fit3_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[3] = {"4-bit","8-bit","EPP"};
 
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index 9f8f4e6b1a7c..ee922b40bc95 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -216,7 +216,7 @@ static int friq_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void friq_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void friq_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[6] = {"4-bit","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index e93570190dab..f17e0a4f66c2 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -267,7 +267,7 @@ static int frpw_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void frpw_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void frpw_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[6] = {"4-bit","8-bit","EPP",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index fffb79054144..3718441f60aa 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -229,8 +229,7 @@ static void kbic_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static void kbic_log_adapter(struct pi_adapter *pi, char *scratch,
-			      int verbose, char *chip)
+static void kbic_log_adapter(struct pi_adapter *pi, char *scratch, char *chip)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP_16","EPP-32"};
@@ -241,14 +240,14 @@ static void kbic_log_adapter(struct pi_adapter *pi, char *scratch,
 
 }
 
-static void k951_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
-
-{	kbic_log_adapter(pi,scratch,verbose,"KBIC-951A");
+static void k951_log_adapter(struct pi_adapter *pi, char *scratch)
+{
+	kbic_log_adapter(pi, scratch, "KBIC-951A");
 }
 
-static void k971_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
-
-{       kbic_log_adapter(pi,scratch,verbose,"KBIC-971A");
+static void k971_log_adapter(struct pi_adapter *pi, char *scratch)
+{
+	kbic_log_adapter(pi, scratch, "KBIC-971A");
 }
 
 static struct pi_protocol k951 = {
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index ffb2af0ce045..ad7f0314f962 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -90,7 +90,7 @@ static void ktti_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void ktti_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void ktti_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {
 	printk("ktti %s, KT adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index fb3399329fb6..12a423f61996 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -111,7 +111,7 @@ static void on20_write_block(struct pi_adapter *pi, char *buf, int count)
 	w2(4);
 }
 
-static void on20_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void on20_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index 7e5fc499fcc5..ee5a0cc74900 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -275,7 +275,7 @@ static void on26_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static void on26_log_adapter(struct pi_adapter *pi, char *scratch, int verbose)
+static void on26_log_adapter(struct pi_adapter *pi, char *scratch)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 58bbed107403..41180039f17d 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -478,7 +478,7 @@ static struct pi_adapter *pi_init_one(struct parport *parport,
 		goto out_unreg_parport;
 	}
 
-	pi->proto->log_adapter(pi, scratch, 1);
+	pi->proto->log_adapter(pi, scratch);
 
 	host = ata_host_alloc_pinfo(&pi->pardev->dev, ppi, 1);
 	if (!host)
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 381b4d0e3574..033cabede51c 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -68,7 +68,7 @@ struct pi_protocol {
 	int (*test_port)(struct pi_adapter *pi);
 	int (*probe_unit)(struct pi_adapter *pi);
 	int (*test_proto)(struct pi_adapter *pi, char *scratch, int verbose);
-	void (*log_adapter)(struct pi_adapter *pi, char *scratch, int verbose);
+	void (*log_adapter)(struct pi_adapter *pi, char *scratch);
 
 	int (*init_proto)(struct pi_adapter *pi);
 	void (*release_proto)(struct pi_adapter *pi);
-- 
cgit v1.2.3


From 5b77db9ccff444ddd3755e43b7bfa8782a5ac125 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:30 +0100
Subject: ata: pata_parport: remove scratch parameter from log_adapter()

scratch parameter of log_adapter() is only used by bpck driver.
Remove it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c         |  2 +-
 drivers/ata/pata_parport/bpck.c         |  4 ++--
 drivers/ata/pata_parport/bpck6.c        |  2 +-
 drivers/ata/pata_parport/comm.c         |  2 +-
 drivers/ata/pata_parport/dstr.c         |  2 +-
 drivers/ata/pata_parport/epat.c         |  2 +-
 drivers/ata/pata_parport/epia.c         |  2 +-
 drivers/ata/pata_parport/fit2.c         |  2 +-
 drivers/ata/pata_parport/fit3.c         |  2 +-
 drivers/ata/pata_parport/friq.c         |  2 +-
 drivers/ata/pata_parport/frpw.c         |  2 +-
 drivers/ata/pata_parport/kbic.c         | 10 +++++-----
 drivers/ata/pata_parport/ktti.c         |  2 +-
 drivers/ata/pata_parport/on20.c         |  2 +-
 drivers/ata/pata_parport/on26.c         |  2 +-
 drivers/ata/pata_parport/pata_parport.c |  2 +-
 include/linux/pata_parport.h            |  2 +-
 17 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index 0a98954f380f..3f5c50c2c43a 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -120,7 +120,7 @@ static void aten_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void aten_log_adapter(struct pi_adapter *pi, char *scratch)
+static void aten_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 1a3e3d5b1b25..f475a25769f0 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -416,11 +416,11 @@ static int bpck_test_port(struct pi_adapter *pi)	/* check for 8-bit port */
 	return 5;
 }
 
-static void bpck_log_adapter(struct pi_adapter *pi, char *scratch)
+static void bpck_log_adapter(struct pi_adapter *pi)
 
 {	char	*mode_string[5] = { "4-bit","8-bit","EPP-8",
 				    "EPP-16","EPP-32" };
-
+	char scratch[128];
 #ifdef DUMP_EEPROM
 	int i;
 #endif
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 68f7fdcab9be..41395a97d77c 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -195,7 +195,7 @@ static int bpck6_probe_unit(struct pi_adapter *pi)
   	}
 }
 
-static void bpck6_log_adapter(struct pi_adapter *pi, char *scratch)
+static void bpck6_log_adapter(struct pi_adapter *pi)
 {
 	char *mode_string[5]=
 		{"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 69a66658aa29..88476072b708 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -179,7 +179,7 @@ static void comm_write_block(struct pi_adapter *pi, char *buf, int count)
         }
 }
 
-static void comm_log_adapter(struct pi_adapter *pi, char *scratch)
+static void comm_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
 
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index 17b1a7cb0a15..a8b238828061 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -190,7 +190,7 @@ static void dstr_write_block(struct pi_adapter *pi, char *buf, int count)
 }
 
 
-static void dstr_log_adapter(struct pi_adapter *pi, char *scratch)
+static void dstr_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index b125df16f160..a30313a2c3d7 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -287,7 +287,7 @@ static int epat_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
         return (e[0] && e[1]) || f;
 }
 
-static void epat_log_adapter(struct pi_adapter *pi, char *scratch)
+static void epat_log_adapter(struct pi_adapter *pi)
 
 {	int	ver;
         char    *mode_string[6] = 
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 452d3a8e17af..ece7862dc058 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -272,7 +272,7 @@ static int epia_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void epia_log_adapter(struct pi_adapter *pi, char *scratch)
+static void epia_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index 632c51af84d7..056e92d8e015 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -113,7 +113,7 @@ static void fit2_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void fit2_log_adapter(struct pi_adapter *pi, char *scratch)
+static void fit2_log_adapter(struct pi_adapter *pi)
 
 {
 	printk("fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index bdf90cb536c2..fa37f7f17fb2 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -169,7 +169,7 @@ static void fit3_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void fit3_log_adapter(struct pi_adapter *pi, char *scratch)
+static void fit3_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[3] = {"4-bit","8-bit","EPP"};
 
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index ee922b40bc95..bf597ee520b7 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -216,7 +216,7 @@ static int friq_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void friq_log_adapter(struct pi_adapter *pi, char *scratch)
+static void friq_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[6] = {"4-bit","8-bit",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index f17e0a4f66c2..9b8db1122154 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -267,7 +267,7 @@ static int frpw_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
 }
 
 
-static void frpw_log_adapter(struct pi_adapter *pi, char *scratch)
+static void frpw_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[6] = {"4-bit","8-bit","EPP",
 				   "EPP-8","EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index 3718441f60aa..b23632c1da39 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -229,7 +229,7 @@ static void kbic_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static void kbic_log_adapter(struct pi_adapter *pi, char *scratch, char *chip)
+static void kbic_log_adapter(struct pi_adapter *pi, char *chip)
 
 {       char    *mode_string[6] = {"4-bit","5/3","8-bit",
 				   "EPP-8","EPP_16","EPP-32"};
@@ -240,14 +240,14 @@ static void kbic_log_adapter(struct pi_adapter *pi, char *scratch, char *chip)
 
 }
 
-static void k951_log_adapter(struct pi_adapter *pi, char *scratch)
+static void k951_log_adapter(struct pi_adapter *pi)
 {
-	kbic_log_adapter(pi, scratch, "KBIC-951A");
+	kbic_log_adapter(pi, "KBIC-951A");
 }
 
-static void k971_log_adapter(struct pi_adapter *pi, char *scratch)
+static void k971_log_adapter(struct pi_adapter *pi)
 {
-	kbic_log_adapter(pi, scratch, "KBIC-971A");
+	kbic_log_adapter(pi, "KBIC-971A");
 }
 
 static struct pi_protocol k951 = {
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index ad7f0314f962..62b0692d1b94 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -90,7 +90,7 @@ static void ktti_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static void ktti_log_adapter(struct pi_adapter *pi, char *scratch)
+static void ktti_log_adapter(struct pi_adapter *pi)
 
 {
 	printk("ktti %s, KT adapter at 0x%x, delay %d\n",
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index 12a423f61996..a3b0708aed71 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -111,7 +111,7 @@ static void on20_write_block(struct pi_adapter *pi, char *buf, int count)
 	w2(4);
 }
 
-static void on20_log_adapter(struct pi_adapter *pi, char *scratch)
+static void on20_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[2] = {"4-bit","8-bit"};
 
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index ee5a0cc74900..8dc8296d50e6 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -275,7 +275,7 @@ static void on26_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static void on26_log_adapter(struct pi_adapter *pi, char *scratch)
+static void on26_log_adapter(struct pi_adapter *pi)
 
 {       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
 				   "EPP-16","EPP-32"};
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 41180039f17d..95a99051470c 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -478,7 +478,7 @@ static struct pi_adapter *pi_init_one(struct parport *parport,
 		goto out_unreg_parport;
 	}
 
-	pi->proto->log_adapter(pi, scratch);
+	pi->proto->log_adapter(pi);
 
 	host = ata_host_alloc_pinfo(&pi->pardev->dev, ppi, 1);
 	if (!host)
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index 033cabede51c..c44d30b3e886 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -68,7 +68,7 @@ struct pi_protocol {
 	int (*test_port)(struct pi_adapter *pi);
 	int (*probe_unit)(struct pi_adapter *pi);
 	int (*test_proto)(struct pi_adapter *pi, char *scratch, int verbose);
-	void (*log_adapter)(struct pi_adapter *pi, char *scratch);
+	void (*log_adapter)(struct pi_adapter *pi);
 
 	int (*init_proto)(struct pi_adapter *pi);
 	void (*release_proto)(struct pi_adapter *pi);
-- 
cgit v1.2.3


From 8d7494a06a14ce8e2d792b95f28c839367a8ebdc Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:35 +0100
Subject: ata: pata_parport: remove verbose parameter from test_proto()

verbose parameter of test_proto() is now unused, remove it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/bpck.c         | 2 +-
 drivers/ata/pata_parport/epat.c         | 2 +-
 drivers/ata/pata_parport/epia.c         | 2 +-
 drivers/ata/pata_parport/friq.c         | 2 +-
 drivers/ata/pata_parport/frpw.c         | 2 +-
 drivers/ata/pata_parport/pata_parport.c | 2 +-
 include/linux/pata_parport.h            | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 793ef2c7849e..2072e291fd16 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -274,7 +274,7 @@ static void bpck_force_spp(struct pi_adapter *pi)
 
 #define TEST_LEN  16
 
-static int bpck_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
+static int bpck_test_proto(struct pi_adapter *pi, char *scratch)
 
 {	int i, e, l, h, om;
 	char buf[TEST_LEN];
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 12dd53a06dd0..4877b39ed04c 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -246,7 +246,7 @@ static void epat_disconnect(struct pi_adapter *pi)
 	w2(pi->saved_r2);
 }
 
-static int epat_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
+static int epat_test_proto(struct pi_adapter *pi, char *scratch)
 
 {       int     k, j, f, cc;
 	int	e[2] = {0,0};
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 1216763d0c6d..e7401351463a 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -232,7 +232,7 @@ static void epia_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static int epia_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
+static int epia_test_proto(struct pi_adapter *pi, char *scratch)
 
 {       int     j, k, f;
 	int	e[2] = {0,0};
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index ed05e6503147..df15b210f355 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -178,7 +178,7 @@ static void friq_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static int friq_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
+static int friq_test_proto(struct pi_adapter *pi, char *scratch)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index c0d50e2ed57d..0d4e84806350 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -219,7 +219,7 @@ static int frpw_test_pnp(struct pi_adapter *pi)
    a hack :-(
 */
 
-static int frpw_test_proto(struct pi_adapter *pi, char *scratch, int verbose)
+static int frpw_test_proto(struct pi_adapter *pi, char *scratch)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 95a99051470c..c85593788951 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -306,7 +306,7 @@ static int pi_test_proto(struct pi_adapter *pi, char *scratch)
 
 	parport_claim_or_block(pi->pardev);
 	if (pi->proto->test_proto)
-		res = pi->proto->test_proto(pi, scratch, 1);
+		res = pi->proto->test_proto(pi, scratch);
 	else
 		res = default_test_proto(pi, scratch);
 	parport_release(pi->pardev);
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index c44d30b3e886..e45bb1896003 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -67,7 +67,7 @@ struct pi_protocol {
 
 	int (*test_port)(struct pi_adapter *pi);
 	int (*probe_unit)(struct pi_adapter *pi);
-	int (*test_proto)(struct pi_adapter *pi, char *scratch, int verbose);
+	int (*test_proto)(struct pi_adapter *pi, char *scratch);
 	void (*log_adapter)(struct pi_adapter *pi);
 
 	int (*init_proto)(struct pi_adapter *pi);
-- 
cgit v1.2.3


From b42251a867a9859a57228c0eadd6f342a339b83d Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:36 +0100
Subject: ata: pata_parport: remove scratch parameter from test_proto()

Don't pass around a pointer to scratch buffer. Use local buffers in
protocols that need it.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/bpck.c         |  2 +-
 drivers/ata/pata_parport/epat.c         |  3 ++-
 drivers/ata/pata_parport/epia.c         |  3 ++-
 drivers/ata/pata_parport/friq.c         |  3 ++-
 drivers/ata/pata_parport/frpw.c         |  3 ++-
 drivers/ata/pata_parport/pata_parport.c | 23 +++++++++++------------
 include/linux/pata_parport.h            |  2 +-
 7 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 2072e291fd16..ecb98bf0e6de 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -274,7 +274,7 @@ static void bpck_force_spp(struct pi_adapter *pi)
 
 #define TEST_LEN  16
 
-static int bpck_test_proto(struct pi_adapter *pi, char *scratch)
+static int bpck_test_proto(struct pi_adapter *pi)
 
 {	int i, e, l, h, om;
 	char buf[TEST_LEN];
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 4877b39ed04c..609614b2c69e 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -246,10 +246,11 @@ static void epat_disconnect(struct pi_adapter *pi)
 	w2(pi->saved_r2);
 }
 
-static int epat_test_proto(struct pi_adapter *pi, char *scratch)
+static int epat_test_proto(struct pi_adapter *pi)
 
 {       int     k, j, f, cc;
 	int	e[2] = {0,0};
+	char scratch[512];
 
         epat_connect(pi);
 	cc = RR(0xd);
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index e7401351463a..970532619aeb 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -232,10 +232,11 @@ static void epia_write_block(struct pi_adapter *pi, char *buf, int count)
 
 }
 
-static int epia_test_proto(struct pi_adapter *pi, char *scratch)
+static int epia_test_proto(struct pi_adapter *pi)
 
 {       int     j, k, f;
 	int	e[2] = {0,0};
+	char scratch[512];
 
         epia_connect(pi);
         for (j=0;j<2;j++) {
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index df15b210f355..1888e8bcb884 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -178,10 +178,11 @@ static void friq_disconnect(struct pi_adapter *pi)
         w2(pi->saved_r2);
 } 
 
-static int friq_test_proto(struct pi_adapter *pi, char *scratch)
+static int friq_test_proto(struct pi_adapter *pi)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
+	char scratch[512];
 
 	pi->saved_r0 = r0();	
 	w0(0xff); udelay(20); CMD(0x3d); /* turn the power on */
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index 0d4e84806350..484b5b9d31eb 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -219,10 +219,11 @@ static int frpw_test_pnp(struct pi_adapter *pi)
    a hack :-(
 */
 
-static int frpw_test_proto(struct pi_adapter *pi, char *scratch)
+static int frpw_test_proto(struct pi_adapter *pi)
 
 {       int     j, k, r;
 	int	e[2] = {0,0};
+	char scratch[512];
 
 	if ((pi->private>>1) != pi->port)
 	   pi->private = frpw_test_pnp(pi) + 2*pi->port;
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index c85593788951..b6499f2160da 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -276,7 +276,7 @@ static void pi_release(struct pi_adapter *pi)
 	module_put(pi->proto->owner);
 }
 
-static int default_test_proto(struct pi_adapter *pi, char *scratch)
+static int default_test_proto(struct pi_adapter *pi)
 {
 	int j, k;
 	int e[2] = { 0, 0 };
@@ -300,21 +300,21 @@ static int default_test_proto(struct pi_adapter *pi, char *scratch)
 	return e[0] && e[1];	/* not here if both > 0 */
 }
 
-static int pi_test_proto(struct pi_adapter *pi, char *scratch)
+static int pi_test_proto(struct pi_adapter *pi)
 {
 	int res;
 
 	parport_claim_or_block(pi->pardev);
 	if (pi->proto->test_proto)
-		res = pi->proto->test_proto(pi, scratch);
+		res = pi->proto->test_proto(pi);
 	else
-		res = default_test_proto(pi, scratch);
+		res = default_test_proto(pi);
 	parport_release(pi->pardev);
 
 	return res;
 }
 
-static bool pi_probe_mode(struct pi_adapter *pi, int max, char *scratch)
+static bool pi_probe_mode(struct pi_adapter *pi, int max)
 {
 	int best, range;
 
@@ -326,7 +326,7 @@ static bool pi_probe_mode(struct pi_adapter *pi, int max, char *scratch)
 			range = 8;
 		if (range == 8 && pi->port % 8)
 			return false;
-		return !pi_test_proto(pi, scratch);
+		return !pi_test_proto(pi);
 	}
 	best = -1;
 	for (pi->mode = 0; pi->mode < max; pi->mode++) {
@@ -335,14 +335,14 @@ static bool pi_probe_mode(struct pi_adapter *pi, int max, char *scratch)
 			range = 8;
 		if (range == 8 && pi->port % 8)
 			break;
-		if (!pi_test_proto(pi, scratch))
+		if (!pi_test_proto(pi))
 			best = pi->mode;
 	}
 	pi->mode = best;
 	return best > -1;
 }
 
-static bool pi_probe_unit(struct pi_adapter *pi, int unit, char *scratch)
+static bool pi_probe_unit(struct pi_adapter *pi, int unit)
 {
 	int max, s, e;
 
@@ -367,14 +367,14 @@ static bool pi_probe_unit(struct pi_adapter *pi, int unit, char *scratch)
 		for (pi->unit = s; pi->unit < e; pi->unit++) {
 			if (pi->proto->probe_unit(pi)) {
 				parport_release(pi->pardev);
-				return pi_probe_mode(pi, max, scratch);
+				return pi_probe_mode(pi, max);
 			}
 		}
 		parport_release(pi->pardev);
 		return false;
 	}
 
-	return pi_probe_mode(pi, max, scratch);
+	return pi_probe_mode(pi, max);
 }
 
 static void pata_parport_dev_release(struct device *dev)
@@ -420,7 +420,6 @@ static struct pi_adapter *pi_init_one(struct parport *parport,
 			struct pi_protocol *pr, int mode, int unit, int delay)
 {
 	struct pardev_cb par_cb = { };
-	char scratch[512];
 	const struct ata_port_info *ppi[] = { &pata_parport_port_info };
 	struct ata_host *host;
 	struct pi_adapter *pi;
@@ -473,7 +472,7 @@ static struct pi_adapter *pi_init_one(struct parport *parport,
 	if (!pi->pardev)
 		goto out_module_put;
 
-	if (!pi_probe_unit(pi, unit, scratch)) {
+	if (!pi_probe_unit(pi, unit)) {
 		dev_info(&pi->dev, "Adapter not found\n");
 		goto out_unreg_parport;
 	}
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
index e45bb1896003..bbfa4e63ee85 100644
--- a/include/linux/pata_parport.h
+++ b/include/linux/pata_parport.h
@@ -67,7 +67,7 @@ struct pi_protocol {
 
 	int (*test_port)(struct pi_adapter *pi);
 	int (*probe_unit)(struct pi_adapter *pi);
-	int (*test_proto)(struct pi_adapter *pi, char *scratch);
+	int (*test_proto)(struct pi_adapter *pi);
 	void (*log_adapter)(struct pi_adapter *pi);
 
 	int (*init_proto)(struct pi_adapter *pi);
-- 
cgit v1.2.3


From fe027ff984c61f4ac5e79823fef30ced4f46d23d Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Sat, 18 Feb 2023 23:01:38 +0100
Subject: ata: pata_parport: move pata_parport.h to drivers/ata/pata_parport

Now that paride is gone, pata_parport.h does not need to be in
include/linux. Move it to drivers/ata/pata_parport.

Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/pata_parport/aten.c         |  3 +-
 drivers/ata/pata_parport/bpck.c         |  3 +-
 drivers/ata/pata_parport/bpck6.c        |  3 +-
 drivers/ata/pata_parport/comm.c         |  3 +-
 drivers/ata/pata_parport/dstr.c         |  3 +-
 drivers/ata/pata_parport/epat.c         |  3 +-
 drivers/ata/pata_parport/epia.c         |  3 +-
 drivers/ata/pata_parport/fit2.c         |  3 +-
 drivers/ata/pata_parport/fit3.c         |  3 +-
 drivers/ata/pata_parport/friq.c         |  3 +-
 drivers/ata/pata_parport/frpw.c         |  3 +-
 drivers/ata/pata_parport/kbic.c         |  3 +-
 drivers/ata/pata_parport/ktti.c         |  3 +-
 drivers/ata/pata_parport/on20.c         |  3 +-
 drivers/ata/pata_parport/on26.c         |  3 +-
 drivers/ata/pata_parport/pata_parport.c |  2 +-
 drivers/ata/pata_parport/pata_parport.h | 96 +++++++++++++++++++++++++++++++++
 include/linux/pata_parport.h            | 96 ---------------------------------
 18 files changed, 112 insertions(+), 127 deletions(-)
 create mode 100644 drivers/ata/pata_parport/pata_parport.h
 delete mode 100644 include/linux/pata_parport.h

(limited to 'include')

diff --git a/drivers/ata/pata_parport/aten.c b/drivers/ata/pata_parport/aten.c
index f0d63b8513e0..1bd248c42f8b 100644
--- a/drivers/ata/pata_parport/aten.c
+++ b/drivers/ata/pata_parport/aten.c
@@ -16,8 +16,7 @@
 #include <linux/wait.h>
 #include <linux/types.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define j44(a,b)                ((((a>>4)&0x0f)|(b&0xf0))^0x88)
 
diff --git a/drivers/ata/pata_parport/bpck.c b/drivers/ata/pata_parport/bpck.c
index 472029a21d59..1c5035a09554 100644
--- a/drivers/ata/pata_parport/bpck.c
+++ b/drivers/ata/pata_parport/bpck.c
@@ -14,8 +14,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #undef r2
 #undef w2
diff --git a/drivers/ata/pata_parport/bpck6.c b/drivers/ata/pata_parport/bpck6.c
index 683a11131acd..964bc688e280 100644
--- a/drivers/ata/pata_parport/bpck6.c
+++ b/drivers/ata/pata_parport/bpck6.c
@@ -18,9 +18,8 @@
 #include <linux/types.h>
 #include <asm/io.h>
 #include <linux/parport.h>
-
 #include "ppc6lnx.c"
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define PPCSTRUCT(pi) ((Interface *)(pi->private))
 
diff --git a/drivers/ata/pata_parport/comm.c b/drivers/ata/pata_parport/comm.c
index 0483caa80544..4c2f9ad60ad8 100644
--- a/drivers/ata/pata_parport/comm.c
+++ b/drivers/ata/pata_parport/comm.c
@@ -15,8 +15,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 /* mode codes:  0  nybble reads, 8-bit writes
                 1  8-bit reads and writes
diff --git a/drivers/ata/pata_parport/dstr.c b/drivers/ata/pata_parport/dstr.c
index c5af7a5fa636..2524684be206 100644
--- a/drivers/ata/pata_parport/dstr.c
+++ b/drivers/ata/pata_parport/dstr.c
@@ -14,8 +14,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 /* mode codes:  0  nybble reads, 8-bit writes
                 1  8-bit reads and writes
diff --git a/drivers/ata/pata_parport/epat.c b/drivers/ata/pata_parport/epat.c
index 0315f98326f8..b146999368ae 100644
--- a/drivers/ata/pata_parport/epat.c
+++ b/drivers/ata/pata_parport/epat.c
@@ -16,8 +16,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define j44(a,b)		(((a>>4)&0x0f)+(b&0xf0))
 #define j53(a,b)		(((a>>3)&0x1f)+((b<<4)&0xe0))
diff --git a/drivers/ata/pata_parport/epia.c b/drivers/ata/pata_parport/epia.c
index 77869639773d..f6db2f79fe99 100644
--- a/drivers/ata/pata_parport/epia.c
+++ b/drivers/ata/pata_parport/epia.c
@@ -17,8 +17,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 /* mode codes:  0  nybble reads on port 1, 8-bit writes
                 1  5/3 reads on ports 1 & 2, 8-bit writes
diff --git a/drivers/ata/pata_parport/fit2.c b/drivers/ata/pata_parport/fit2.c
index 3536d8c07955..fd3b2ce426a5 100644
--- a/drivers/ata/pata_parport/fit2.c
+++ b/drivers/ata/pata_parport/fit2.c
@@ -20,8 +20,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define j44(a,b)                (((a>>4)&0x0f)|(b&0xf0))
 
diff --git a/drivers/ata/pata_parport/fit3.c b/drivers/ata/pata_parport/fit3.c
index 9f5320c750e2..75df656ac472 100644
--- a/drivers/ata/pata_parport/fit3.c
+++ b/drivers/ata/pata_parport/fit3.c
@@ -24,8 +24,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define j44(a,b)                (((a>>3)&0x0f)|((b<<1)&0xf0))
 
diff --git a/drivers/ata/pata_parport/friq.c b/drivers/ata/pata_parport/friq.c
index 8883b3c509cc..1647264cd9a8 100644
--- a/drivers/ata/pata_parport/friq.c
+++ b/drivers/ata/pata_parport/friq.c
@@ -27,8 +27,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define CMD(x)		w2(4);w0(0xff);w0(0xff);w0(0x73);w0(0x73);\
 			w0(0xc9);w0(0xc9);w0(0x26);w0(0x26);w0(x);w0(x);
diff --git a/drivers/ata/pata_parport/frpw.c b/drivers/ata/pata_parport/frpw.c
index 1ef8be79d793..3ec0abf16fa6 100644
--- a/drivers/ata/pata_parport/frpw.c
+++ b/drivers/ata/pata_parport/frpw.c
@@ -20,8 +20,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define cec4		w2(0xc);w2(0xe);w2(0xe);w2(0xc);w2(4);w2(4);w2(4);
 #define j44(l,h)	(((l>>4)&0x0f)|(h&0xf0))
diff --git a/drivers/ata/pata_parport/kbic.c b/drivers/ata/pata_parport/kbic.c
index 29f4f1e14d21..8213e62f8f00 100644
--- a/drivers/ata/pata_parport/kbic.c
+++ b/drivers/ata/pata_parport/kbic.c
@@ -19,8 +19,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define r12w()			(delay_p,inw(pi->port+1)&0xffff) 
 
diff --git a/drivers/ata/pata_parport/ktti.c b/drivers/ata/pata_parport/ktti.c
index 742051f6ea10..4890b1f12348 100644
--- a/drivers/ata/pata_parport/ktti.c
+++ b/drivers/ata/pata_parport/ktti.c
@@ -16,8 +16,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define j44(a,b)                (((a>>4)&0x0f)|(b&0xf0))
 
diff --git a/drivers/ata/pata_parport/on20.c b/drivers/ata/pata_parport/on20.c
index 6956b91efb47..276ace12d490 100644
--- a/drivers/ata/pata_parport/on20.c
+++ b/drivers/ata/pata_parport/on20.c
@@ -13,8 +13,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define op(f)	w2(4);w0(f);w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4);
 #define vl(v)	w2(4);w0(v);w2(5);w2(7);w2(5);w2(4);
diff --git a/drivers/ata/pata_parport/on26.c b/drivers/ata/pata_parport/on26.c
index 1d90eb9b541e..dc47a54b121f 100644
--- a/drivers/ata/pata_parport/on26.c
+++ b/drivers/ata/pata_parport/on26.c
@@ -14,8 +14,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <asm/io.h>
-
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 /* mode codes:  0  nybble reads, 8-bit writes
                 1  8-bit reads and writes
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index b6499f2160da..b02a7d16551c 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -6,7 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/parport.h>
-#include <linux/pata_parport.h>
+#include "pata_parport.h"
 
 #define DRV_NAME "pata_parport"
 
diff --git a/drivers/ata/pata_parport/pata_parport.h b/drivers/ata/pata_parport/pata_parport.h
new file mode 100644
index 000000000000..bbfa4e63ee85
--- /dev/null
+++ b/drivers/ata/pata_parport/pata_parport.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *	pata_parport.h	(c) 1997-8  Grant R. Guenther <grant@torque.net>
+ *				    Under the terms of the GPL.
+ *
+ * This file defines the interface for parallel port IDE adapter chip drivers.
+ */
+
+#ifndef LINUX_PATA_PARPORT_H
+#define LINUX_PATA_PARPORT_H
+
+#include <linux/libata.h>
+
+struct pi_adapter {
+	struct device dev;
+	struct pi_protocol *proto;	/* adapter protocol */
+	int port;			/* base address of parallel port */
+	int mode;			/* transfer mode in use */
+	int delay;			/* adapter delay setting */
+	int unit;			/* unit number for chained adapters */
+	int saved_r0;			/* saved port state */
+	int saved_r2;			/* saved port state */
+	unsigned long private;		/* for protocol module */
+	struct pardevice *pardev;	/* pointer to pardevice */
+};
+
+/* registers are addressed as (cont,regr)
+ *	cont: 0 for command register file, 1 for control register(s)
+ *	regr: 0-7 for register number.
+ */
+
+/* macros and functions exported to the protocol modules */
+#define delay_p			(pi->delay ? udelay(pi->delay) : (void)0)
+#define out_p(offs, byte)	do { outb(byte, pi->port + offs); delay_p; } while (0)
+#define in_p(offs)		(delay_p, inb(pi->port + offs))
+
+#define w0(byte)		out_p(0, byte)
+#define r0()			in_p(0)
+#define w1(byte)		out_p(1, byte)
+#define r1()			in_p(1)
+#define w2(byte)		out_p(2, byte)
+#define r2()			in_p(2)
+#define w3(byte)		out_p(3, byte)
+#define w4(byte)		out_p(4, byte)
+#define r4()			in_p(4)
+#define w4w(data)		do { outw(data, pi->port + 4); delay_p; } while (0)
+#define w4l(data)		do { outl(data, pi->port + 4); delay_p; } while (0)
+#define r4w()			(delay_p, inw(pi->port + 4))
+#define r4l()			(delay_p, inl(pi->port + 4))
+
+struct pi_protocol {
+	char name[8];
+
+	int max_mode;
+	int epp_first;		/* modes >= this use 8 ports */
+
+	int default_delay;
+	int max_units;		/* max chained units probed for */
+
+	void (*write_regr)(struct pi_adapter *pi, int cont, int regr, int val);
+	int (*read_regr)(struct pi_adapter *pi, int cont, int regr);
+	void (*write_block)(struct pi_adapter *pi, char *buf, int count);
+	void (*read_block)(struct pi_adapter *pi, char *buf, int count);
+
+	void (*connect)(struct pi_adapter *pi);
+	void (*disconnect)(struct pi_adapter *pi);
+
+	int (*test_port)(struct pi_adapter *pi);
+	int (*probe_unit)(struct pi_adapter *pi);
+	int (*test_proto)(struct pi_adapter *pi);
+	void (*log_adapter)(struct pi_adapter *pi);
+
+	int (*init_proto)(struct pi_adapter *pi);
+	void (*release_proto)(struct pi_adapter *pi);
+	struct module *owner;
+	struct device_driver driver;
+	struct scsi_host_template sht;
+};
+
+#define PATA_PARPORT_SHT ATA_PIO_SHT
+
+int pata_parport_register_driver(struct pi_protocol *pr);
+void pata_parport_unregister_driver(struct pi_protocol *pr);
+
+/**
+ * module_pata_parport_driver() - Helper macro for registering a pata_parport driver
+ * @__pi_protocol: pi_protocol struct
+ *
+ * Helper macro for pata_parport drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_pata_parport_driver(__pi_protocol) \
+	module_driver(__pi_protocol, pata_parport_register_driver, pata_parport_unregister_driver)
+
+#endif /* LINUX_PATA_PARPORT_H */
diff --git a/include/linux/pata_parport.h b/include/linux/pata_parport.h
deleted file mode 100644
index bbfa4e63ee85..000000000000
--- a/include/linux/pata_parport.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *	pata_parport.h	(c) 1997-8  Grant R. Guenther <grant@torque.net>
- *				    Under the terms of the GPL.
- *
- * This file defines the interface for parallel port IDE adapter chip drivers.
- */
-
-#ifndef LINUX_PATA_PARPORT_H
-#define LINUX_PATA_PARPORT_H
-
-#include <linux/libata.h>
-
-struct pi_adapter {
-	struct device dev;
-	struct pi_protocol *proto;	/* adapter protocol */
-	int port;			/* base address of parallel port */
-	int mode;			/* transfer mode in use */
-	int delay;			/* adapter delay setting */
-	int unit;			/* unit number for chained adapters */
-	int saved_r0;			/* saved port state */
-	int saved_r2;			/* saved port state */
-	unsigned long private;		/* for protocol module */
-	struct pardevice *pardev;	/* pointer to pardevice */
-};
-
-/* registers are addressed as (cont,regr)
- *	cont: 0 for command register file, 1 for control register(s)
- *	regr: 0-7 for register number.
- */
-
-/* macros and functions exported to the protocol modules */
-#define delay_p			(pi->delay ? udelay(pi->delay) : (void)0)
-#define out_p(offs, byte)	do { outb(byte, pi->port + offs); delay_p; } while (0)
-#define in_p(offs)		(delay_p, inb(pi->port + offs))
-
-#define w0(byte)		out_p(0, byte)
-#define r0()			in_p(0)
-#define w1(byte)		out_p(1, byte)
-#define r1()			in_p(1)
-#define w2(byte)		out_p(2, byte)
-#define r2()			in_p(2)
-#define w3(byte)		out_p(3, byte)
-#define w4(byte)		out_p(4, byte)
-#define r4()			in_p(4)
-#define w4w(data)		do { outw(data, pi->port + 4); delay_p; } while (0)
-#define w4l(data)		do { outl(data, pi->port + 4); delay_p; } while (0)
-#define r4w()			(delay_p, inw(pi->port + 4))
-#define r4l()			(delay_p, inl(pi->port + 4))
-
-struct pi_protocol {
-	char name[8];
-
-	int max_mode;
-	int epp_first;		/* modes >= this use 8 ports */
-
-	int default_delay;
-	int max_units;		/* max chained units probed for */
-
-	void (*write_regr)(struct pi_adapter *pi, int cont, int regr, int val);
-	int (*read_regr)(struct pi_adapter *pi, int cont, int regr);
-	void (*write_block)(struct pi_adapter *pi, char *buf, int count);
-	void (*read_block)(struct pi_adapter *pi, char *buf, int count);
-
-	void (*connect)(struct pi_adapter *pi);
-	void (*disconnect)(struct pi_adapter *pi);
-
-	int (*test_port)(struct pi_adapter *pi);
-	int (*probe_unit)(struct pi_adapter *pi);
-	int (*test_proto)(struct pi_adapter *pi);
-	void (*log_adapter)(struct pi_adapter *pi);
-
-	int (*init_proto)(struct pi_adapter *pi);
-	void (*release_proto)(struct pi_adapter *pi);
-	struct module *owner;
-	struct device_driver driver;
-	struct scsi_host_template sht;
-};
-
-#define PATA_PARPORT_SHT ATA_PIO_SHT
-
-int pata_parport_register_driver(struct pi_protocol *pr);
-void pata_parport_unregister_driver(struct pi_protocol *pr);
-
-/**
- * module_pata_parport_driver() - Helper macro for registering a pata_parport driver
- * @__pi_protocol: pi_protocol struct
- *
- * Helper macro for pata_parport drivers which do not do anything special in module
- * init/exit. This eliminates a lot of boilerplate. Each module may only
- * use this macro once, and calling it replaces module_init() and module_exit()
- */
-#define module_pata_parport_driver(__pi_protocol) \
-	module_driver(__pi_protocol, pata_parport_register_driver, pata_parport_unregister_driver)
-
-#endif /* LINUX_PATA_PARPORT_H */
-- 
cgit v1.2.3


From 8d857540f517501d34e5810a369707a8181d1bbf Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Fri, 24 Feb 2023 23:40:23 +0300
Subject: ata: drop unused ata_id_to_hd_driveid()

This function was renamed from ide_id_to_hd_driveid() and moved from
drivers/ide/ to <linux/ata.h> but it never got used by libata, thus it
became useless after drivers/ide/ removal...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/ata.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 0c18499f60b6..3240116647d5 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -16,7 +16,6 @@
 #include <linux/bits.h>
 #include <linux/string.h>
 #include <linux/types.h>
-#include <asm/byteorder.h>
 
 /* defines only for the constants which don't work well as enums */
 #define ATA_DMA_BOUNDARY	0xffffUL
@@ -1067,26 +1066,6 @@ static inline bool ata_id_is_lba_capacity_ok(u16 *id)
 	return false;	/* LBA capacity value may be bad */
 }
 
-static inline void ata_id_to_hd_driveid(u16 *id)
-{
-#ifdef __BIG_ENDIAN
-	/* accessed in struct hd_driveid as 8-bit values */
-	id[ATA_ID_MAX_MULTSECT]	 = __cpu_to_le16(id[ATA_ID_MAX_MULTSECT]);
-	id[ATA_ID_CAPABILITY]	 = __cpu_to_le16(id[ATA_ID_CAPABILITY]);
-	id[ATA_ID_OLD_PIO_MODES] = __cpu_to_le16(id[ATA_ID_OLD_PIO_MODES]);
-	id[ATA_ID_OLD_DMA_MODES] = __cpu_to_le16(id[ATA_ID_OLD_DMA_MODES]);
-	id[ATA_ID_MULTSECT]	 = __cpu_to_le16(id[ATA_ID_MULTSECT]);
-
-	/* as 32-bit values */
-	*(u32 *)&id[ATA_ID_LBA_CAPACITY] = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
-	*(u32 *)&id[ATA_ID_SPG]		 = ata_id_u32(id, ATA_ID_SPG);
-
-	/* as 64-bit value */
-	*(u64 *)&id[ATA_ID_LBA_CAPACITY_2] =
-		ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
-#endif
-}
-
 static inline bool ata_ok(u8 status)
 {
 	return ((status & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | ATA_ERR))
-- 
cgit v1.2.3


From dc2e107e2d48ebe629985a080d457896ccf18f84 Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Fri, 24 Feb 2023 23:40:24 +0300
Subject: ata: drop unused ata_id_is_lba_capacity_ok()

This function was renamed from lba_capacity_is_ok()() and moved from
drivers/ide/ to <linux/ata.h> but it never got used by libata, thus it
became useless after drivers/ide/ removal...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/ata.h | 50 --------------------------------------------------
 1 file changed, 50 deletions(-)

(limited to 'include')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 3240116647d5..c224dbddb9b2 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -1016,56 +1016,6 @@ static inline bool atapi_id_dmadir(const u16 *dev_id)
 	return ata_id_major_version(dev_id) >= 7 && (dev_id[62] & 0x8000);
 }
 
-/*
- * ata_id_is_lba_capacity_ok() performs a sanity check on
- * the claimed LBA capacity value for the device.
- *
- * Returns 1 if LBA capacity looks sensible, 0 otherwise.
- *
- * It is called only once for each device.
- */
-static inline bool ata_id_is_lba_capacity_ok(u16 *id)
-{
-	unsigned long lba_sects, chs_sects, head, tail;
-
-	/* No non-LBA info .. so valid! */
-	if (id[ATA_ID_CYLS] == 0)
-		return true;
-
-	lba_sects = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
-
-	/*
-	 * The ATA spec tells large drives to return
-	 * C/H/S = 16383/16/63 independent of their size.
-	 * Some drives can be jumpered to use 15 heads instead of 16.
-	 * Some drives can be jumpered to use 4092 cyls instead of 16383.
-	 */
-	if ((id[ATA_ID_CYLS] == 16383 ||
-	     (id[ATA_ID_CYLS] == 4092 && id[ATA_ID_CUR_CYLS] == 16383)) &&
-	    id[ATA_ID_SECTORS] == 63 &&
-	    (id[ATA_ID_HEADS] == 15 || id[ATA_ID_HEADS] == 16) &&
-	    (lba_sects >= 16383 * 63 * id[ATA_ID_HEADS]))
-		return true;
-
-	chs_sects = id[ATA_ID_CYLS] * id[ATA_ID_HEADS] * id[ATA_ID_SECTORS];
-
-	/* perform a rough sanity check on lba_sects: within 10% is OK */
-	if (lba_sects - chs_sects < chs_sects/10)
-		return true;
-
-	/* some drives have the word order reversed */
-	head = (lba_sects >> 16) & 0xffff;
-	tail = lba_sects & 0xffff;
-	lba_sects = head | (tail << 16);
-
-	if (lba_sects - chs_sects < chs_sects/10) {
-		*(__le32 *)&id[ATA_ID_LBA_CAPACITY] = __cpu_to_le32(lba_sects);
-		return true;	/* LBA capacity is (now) good */
-	}
-
-	return false;	/* LBA capacity value may be bad */
-}
-
 static inline bool ata_ok(u8 status)
 {
 	return ((status & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | ATA_ERR))
-- 
cgit v1.2.3


From 05f0adefd48a2ccbd66b3793d220beb9dcaf6988 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@zary.sk>
Date: Tue, 7 Mar 2023 23:46:06 +0100
Subject: ata: parport_pc: add 16-bit and 8-bit fast EPP transfer flags

PARPORT_EPP_FAST flag currently uses 32-bit I/O port access for data
read/write (insl/outsl).
Add PARPORT_EPP_FAST_16 and PARPORT_EPP_FAST_8 that use insw/outsw
and insb/outsb (and PARPORT_EPP_FAST_32 as alias for PARPORT_EPP_FAST).

Signed-off-by: Ondrej Zary <linux@zary.sk>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/parport/parport_pc.c | 20 ++++++++++++++++----
 include/uapi/linux/parport.h |  3 +++
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 88e125e36230..3bacbaf16f42 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -305,9 +305,15 @@ static size_t parport_pc_epp_read_data(struct parport *port, void *buf,
 		}
 		return got;
 	}
-	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03))
+	if ((length > 1) && ((flags & PARPORT_EPP_FAST_32)
+			   || flags & PARPORT_EPP_FAST_16
+			   || flags & PARPORT_EPP_FAST_8)) {
+		if ((flags & PARPORT_EPP_FAST_32)
+		    && !(((long)buf | length) & 0x03))
 			insl(EPPDATA(port), buf, (length >> 2));
+		else if ((flags & PARPORT_EPP_FAST_16)
+			 && !(((long)buf | length) & 0x01))
+			insw(EPPDATA(port), buf, length >> 1);
 		else
 			insb(EPPDATA(port), buf, length);
 		if (inb(STATUS(port)) & 0x01) {
@@ -334,9 +340,15 @@ static size_t parport_pc_epp_write_data(struct parport *port, const void *buf,
 {
 	size_t written = 0;
 
-	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03))
+	if ((length > 1) && ((flags & PARPORT_EPP_FAST_32)
+			   || flags & PARPORT_EPP_FAST_16
+			   || flags & PARPORT_EPP_FAST_8)) {
+		if ((flags & PARPORT_EPP_FAST_32)
+		    && !(((long)buf | length) & 0x03))
 			outsl(EPPDATA(port), buf, (length >> 2));
+		else if ((flags & PARPORT_EPP_FAST_16)
+			 && !(((long)buf | length) & 0x01))
+			outsw(EPPDATA(port), buf, length >> 1);
 		else
 			outsb(EPPDATA(port), buf, length);
 		if (inb(STATUS(port)) & 0x01) {
diff --git a/include/uapi/linux/parport.h b/include/uapi/linux/parport.h
index f41388f88dc3..fe93e41fc205 100644
--- a/include/uapi/linux/parport.h
+++ b/include/uapi/linux/parport.h
@@ -90,6 +90,9 @@ typedef enum {
 /* Flags for block transfer operations. */
 #define PARPORT_EPP_FAST		(1<<0) /* Unreliable counts. */
 #define PARPORT_W91284PIC		(1<<1) /* have a Warp9 w91284pic in the device */
+#define PARPORT_EPP_FAST_32		PARPORT_EPP_FAST /* 32-bit EPP transfers */
+#define PARPORT_EPP_FAST_16		(1<<2) /* 16-bit EPP transfers */
+#define PARPORT_EPP_FAST_8		(1<<3) /* 8-bit EPP transfers */
 
 /* The rest is for the kernel only */
 #endif /* _UAPI_PARPORT_H_ */
-- 
cgit v1.2.3


From 1cc6571f562774f1d928dc8b3cff50829b86e970 Mon Sep 17 00:00:00 2001
From: Nick Child <nnac123@linux.ibm.com>
Date: Tue, 21 Mar 2023 10:07:25 -0500
Subject: netdev: Enforce index cap in netdev_get_tx_queue

When requesting a TX queue at a given index, warn on out-of-bounds
referencing if the index is greater than the allocated number of
queues.

Specifically, since this function is used heavily in the networking
stack use DEBUG_NET_WARN_ON_ONCE to avoid executing a new branch on
every packet.

Signed-off-by: Nick Child <nnac123@linux.ibm.com>
Link: https://lore.kernel.org/r/20230321150725.127229-2-nnac123@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7621c512765f..674ee5daa7b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2484,6 +2484,7 @@ static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
+	DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues);
 	return &dev->_tx[index];
 }
 
-- 
cgit v1.2.3


From 4ee9b0dcf09f426fbad7ed132d73ea2ba379d8ee Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 21 Mar 2023 15:58:54 +0000
Subject: net: phylink: remove an_enabled

The Autoneg bit in the advertising bitmap and state->an_enabled are
always identical. state->an_enabled is now no longer used by any
drivers, so lets kill this duplication.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 37 +++++++++++++++++--------------------
 include/linux/phylink.h   |  2 --
 2 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1a2f074685fa..f7da96f0c75b 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -843,7 +843,6 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode)
 		phylink_set(pl->supported, Autoneg);
 		phylink_set(pl->supported, Asym_Pause);
 		phylink_set(pl->supported, Pause);
-		pl->link_config.an_enabled = true;
 		pl->cfg_link_an_mode = MLO_AN_INBAND;
 
 		switch (pl->link_config.interface) {
@@ -945,9 +944,6 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode)
 				    "failed to validate link configuration for in-band status\n");
 			return -EINVAL;
 		}
-
-		/* Check if MAC/PCS also supports Autoneg. */
-		pl->link_config.an_enabled = phylink_test(pl->supported, Autoneg);
 	}
 
 	return 0;
@@ -957,7 +953,8 @@ static void phylink_apply_manual_flow(struct phylink *pl,
 				      struct phylink_link_state *state)
 {
 	/* If autoneg is disabled, pause AN is also disabled */
-	if (!state->an_enabled)
+	if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			       state->advertising))
 		state->pause &= ~MLO_PAUSE_AN;
 
 	/* Manual configuration of pause modes */
@@ -997,21 +994,22 @@ static void phylink_mac_config(struct phylink *pl,
 			       const struct phylink_link_state *state)
 {
 	phylink_dbg(pl,
-		    "%s: mode=%s/%s/%s/%s/%s adv=%*pb pause=%02x link=%u an=%u\n",
+		    "%s: mode=%s/%s/%s/%s/%s adv=%*pb pause=%02x link=%u\n",
 		    __func__, phylink_an_mode_str(pl->cur_link_an_mode),
 		    phy_modes(state->interface),
 		    phy_speed_to_str(state->speed),
 		    phy_duplex_to_str(state->duplex),
 		    phy_rate_matching_to_str(state->rate_matching),
 		    __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
-		    state->pause, state->link, state->an_enabled);
+		    state->pause, state->link);
 
 	pl->mac_ops->mac_config(pl->config, pl->cur_link_an_mode, state);
 }
 
 static void phylink_mac_pcs_an_restart(struct phylink *pl)
 {
-	if (pl->link_config.an_enabled &&
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      pl->link_config.advertising) &&
 	    phy_interface_mode_is_8023z(pl->link_config.interface) &&
 	    phylink_autoneg_inband(pl->cur_link_an_mode)) {
 		if (pl->pcs)
@@ -1138,9 +1136,9 @@ static void phylink_mac_pcs_get_state(struct phylink *pl,
 	linkmode_copy(state->advertising, pl->link_config.advertising);
 	linkmode_zero(state->lp_advertising);
 	state->interface = pl->link_config.interface;
-	state->an_enabled = pl->link_config.an_enabled;
 	state->rate_matching = pl->link_config.rate_matching;
-	if (state->an_enabled) {
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      state->advertising)) {
 		state->speed = SPEED_UNKNOWN;
 		state->duplex = DUPLEX_UNKNOWN;
 		state->pause = MLO_PAUSE_NONE;
@@ -1531,7 +1529,6 @@ struct phylink *phylink_create(struct phylink_config *config,
 	pl->link_config.pause = MLO_PAUSE_AN;
 	pl->link_config.speed = SPEED_UNKNOWN;
 	pl->link_config.duplex = DUPLEX_UNKNOWN;
-	pl->link_config.an_enabled = true;
 	pl->mac_ops = mac_ops;
 	__set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
 	timer_setup(&pl->link_poll, phylink_fixed_poll, 0);
@@ -2136,8 +2133,9 @@ static void phylink_get_ksettings(const struct phylink_link_state *state,
 		kset->base.speed = state->speed;
 		kset->base.duplex = state->duplex;
 	}
-	kset->base.autoneg = state->an_enabled ? AUTONEG_ENABLE :
-				AUTONEG_DISABLE;
+	kset->base.autoneg = linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					       state->advertising) ?
+				AUTONEG_ENABLE : AUTONEG_DISABLE;
 }
 
 /**
@@ -2284,9 +2282,8 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
 	/* We have ruled out the case with a PHY attached, and the
 	 * fixed-link cases.  All that is left are in-band links.
 	 */
-	config.an_enabled = kset->base.autoneg == AUTONEG_ENABLE;
 	linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, config.advertising,
-			 config.an_enabled);
+			 kset->base.autoneg == AUTONEG_ENABLE);
 
 	/* If this link is with an SFP, ensure that changes to advertised modes
 	 * also cause the associated interface to be selected such that the
@@ -2320,13 +2317,14 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
 	}
 
 	/* If autonegotiation is enabled, we must have an advertisement */
-	if (config.an_enabled && phylink_is_empty_linkmode(config.advertising))
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      config.advertising) &&
+	    phylink_is_empty_linkmode(config.advertising))
 		return -EINVAL;
 
 	mutex_lock(&pl->state_mutex);
 	pl->link_config.speed = config.speed;
 	pl->link_config.duplex = config.duplex;
-	pl->link_config.an_enabled = config.an_enabled;
 
 	if (pl->link_config.interface != config.interface) {
 		/* The interface changed, e.g. 1000base-X <-> 2500base-X */
@@ -2932,7 +2930,6 @@ static int phylink_sfp_config_phy(struct phylink *pl, u8 mode,
 	config.speed = SPEED_UNKNOWN;
 	config.duplex = DUPLEX_UNKNOWN;
 	config.pause = MLO_PAUSE_AN;
-	config.an_enabled = pl->link_config.an_enabled;
 
 	/* Ignore errors if we're expecting a PHY to attach later */
 	ret = phylink_validate(pl, support, &config);
@@ -3001,7 +2998,6 @@ static int phylink_sfp_config_optical(struct phylink *pl)
 	config.speed = SPEED_UNKNOWN;
 	config.duplex = DUPLEX_UNKNOWN;
 	config.pause = MLO_PAUSE_AN;
-	config.an_enabled = true;
 
 	/* For all the interfaces that are supported, reduce the sfp_support
 	 * mask to only those link modes that can be supported.
@@ -3300,7 +3296,8 @@ void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state,
 	/* If there is no link or autonegotiation is disabled, the LP advertisement
 	 * data is not meaningful, so don't go any further.
 	 */
-	if (!state->link || !state->an_enabled)
+	if (!state->link || !linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					       state->advertising))
 		return;
 
 	switch (state->interface) {
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index c492c26202b5..9ff56b050584 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -93,7 +93,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode)
  *   the medium link mode (@speed and @duplex) and the speed/duplex of the phy
  *   interface mode (@interface) are different.
  * @link: true if the link is up.
- * @an_enabled: true if autonegotiation is enabled/desired.
  * @an_complete: true if autonegotiation has completed.
  */
 struct phylink_link_state {
@@ -105,7 +104,6 @@ struct phylink_link_state {
 	int pause;
 	int rate_matching;
 	unsigned int link:1;
-	unsigned int an_enabled:1;
 	unsigned int an_complete:1;
 };
 
-- 
cgit v1.2.3


From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:23:58 -0700
Subject: bpf: Retire the struct_ops map kvalue->refcnt.

We have replaced kvalue-refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: one for the
reference count of bpf_map. When the kvalue->refcnt reaches zero, we
also have to reduce the reference count on bpf_map - yet these steps
are not performed in an atomic manner and require us to be vigilant
when managing them. By eliminating kvalue->refcnt, we can make our
maintenance more straightforward as the refcount of bpf_map is now
solely managed!

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. It is fine. However, this function is
exposed through bpf_setsockopt(), it may be accessed by BPF programs
as well. To ensure that the trampoline image belonging to struct_op
can be safely called while its method is in use, the trampoline
safeguarde the BPF program with rcu_read_lock(). Doing so prevents any
destruction of the associated images before returning from a
trampoline and requires us to wait for an RCU grace period.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         |  1 +
 kernel/bpf/bpf_struct_ops.c | 77 ++++++++++++++++++++++++++-------------------
 kernel/bpf/syscall.c        |  6 ++--
 3 files changed, 49 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f562..f04098468d7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ba7a94276e3b..2f3c4a0e03ee 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,6 +11,7 @@
 #include <linux/refcount.h>
 #include <linux/mutex.h>
 #include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
 
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
@@ -249,6 +250,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	enum bpf_struct_ops_state state;
+	s64 refcnt;
 
 	if (unlikely(*(u32 *)key != 0))
 		return -ENOENT;
@@ -267,7 +269,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
 	uvalue->state = state;
-	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+	/* This value offers the user space a general estimate of how
+	 * many sockets are still utilizing this struct_ops for TCP
+	 * congestion control. The number might not be exact, but it
+	 * should sufficiently meet our present goals.
+	 */
+	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
 
 	return 0;
 }
@@ -491,7 +500,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	refcount_set(&kvalue->refcnt, 1);
 	bpf_map_inc(map);
 
 	set_memory_rox((long)st_map->image, 1);
@@ -536,8 +544,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
-		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
-			bpf_map_put(map);
+		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
 		return -EINPROGRESS;
@@ -570,7 +577,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
 	kfree(value);
 }
 
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
@@ -582,6 +589,28 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	bpf_map_area_free(st_map);
 }
 
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+	 * to finish. bpf-tcp-cc prog is non sleepable.
+	 * A rcu_tasks gp is to wait for the last few insn
+	 * in the tramopline image to finish before releasing
+	 * the trampoline image.
+	 */
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+	__bpf_struct_ops_map_free(map);
+}
+
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
@@ -630,7 +659,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 				   NUMA_NO_NODE);
 	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
 	if (!st_map->uvalue || !st_map->links || !st_map->image) {
-		bpf_struct_ops_map_free(map);
+		__bpf_struct_ops_map_free(map);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -676,41 +705,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 bool bpf_struct_ops_get(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
 
-	return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
-	struct bpf_struct_ops_map *st_map;
-
-	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
-	bpf_map_put(&st_map->map);
+	map = __bpf_map_inc_not_zero(&st_map->map, false);
+	return !IS_ERR(map);
 }
 
 void bpf_struct_ops_put(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
-	if (refcount_dec_and_test(&kvalue->refcnt)) {
-		struct bpf_struct_ops_map *st_map;
-
-		st_map = container_of(kvalue, struct bpf_struct_ops_map,
-				      kvalue);
-		/* The struct_ops's function may switch to another struct_ops.
-		 *
-		 * For example, bpf_tcp_cc_x->init() may switch to
-		 * another tcp_cc_y by calling
-		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-		 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-		 * and its map->refcnt may reach 0 which then free its
-		 * trampoline image while tcp_cc_x is still running.
-		 *
-		 * Thus, a rcu grace period is needed here.
-		 */
-		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
-	}
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	bpf_map_put(&st_map->map);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 099e9068bcdd..cff0348a2871 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	return map;
 }
 
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
 {
 	int refold;
 
-- 
cgit v1.2.3


From 8fb1a76a0f35c45a424c9eb84b0f97ffd51e6052 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:23:59 -0700
Subject: net: Update an existing TCP congestion control algorithm.

This feature lets you immediately transition to another congestion
control algorithm or implementation with the same name.  Once a name
is updated, new connections will apply this new algorithm.

The purpose is to update a customized algorithm implemented in BPF
struct_ops with a new version on the flight.  The following is an
example of using the userspace API implemented in later BPF patches.

   link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
   .......
   err = bpf_link__update_map(link, skel->maps.ca_update_2);

We first load and register an algorithm implemented in BPF struct_ops,
then swap it out with a new one using the same name. After that, newly
created connections will apply the updated algorithm, while older ones
retain the previous version already applied.

This patch also takes this chance to refactor the ca validation into
the new tcp_validate_congestion_control() function.

Cc: netdev@vger.kernel.org, Eric Dumazet <edumazet@google.com>
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-3-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcp.h   |  3 +++
 net/ipv4/tcp_cong.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..2abb755e6a3a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
 
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+				  struct tcp_congestion_ops *old_type);
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
 
 void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..1b34050a7538 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
 	return NULL;
 }
 
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
 {
-	int ret = 0;
-
 	/* all algorithms must implement these */
 	if (!ca->ssthresh || !ca->undo_cwnd ||
 	    !(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
 	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
 
 	spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+	struct tcp_congestion_ops *existing;
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+	spin_lock(&tcp_cong_list_lock);
+	existing = tcp_ca_find_key(old_ca->key);
+	if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+		pr_notice("%s not registered or non-unique key\n",
+			  ca->name);
+		ret = -EINVAL;
+	} else if (existing != old_ca) {
+		pr_notice("invalid old congestion control algorithm to replace\n");
+		ret = -EINVAL;
+	} else {
+		/* Add the new one before removing the old one to keep
+		 * one implementation available all the time.
+		 */
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
+		list_del_rcu(&existing->list);
+		pr_debug("%s updated\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module or struct_ops gets removed entirely.
+	 */
+	if (!ret)
+		synchronize_rcu();
+
+	return ret;
+}
+
 u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
 {
 	const struct tcp_congestion_ops *ca;
-- 
cgit v1.2.3


From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:00 -0700
Subject: bpf: Create links for BPF struct_ops maps.

Make bpf_link support struct_ops.  Previously, struct_ops were always
used alone without any associated links. Upon updating its value, a
struct_ops would be activated automatically. Yet other BPF program
types required to make a bpf_link with their instances before they
could become active. Now, however, you can create an inactive
struct_ops, and create a link to activate it later.

With bpf_links, struct_ops has a behavior similar to other BPF program
types. You can pin/unpin them from their links and the struct_ops will
be deactivated when its link is removed while previously need someone
to delete the value for it to be deactivated.

bpf_links are responsible for registering their associated
struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
set to create a bpf_link, while a structs without this flag behaves in
the same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops them-self.  Since the links of BPF
struct_ops programs are only used to create trampolines internally,
they are never seen in other contexts. Thus, they can be reused for
struct_ops themself.

To maintain a reference to the map supporting this link, we add
bpf_struct_ops_link as an additional type. The pointer of the map is
RCU and won't be necessary until later in the patchset.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |   7 ++
 include/uapi/linux/bpf.h       |  12 +++-
 kernel/bpf/bpf_struct_ops.c    | 143 ++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |  23 ++++---
 net/ipv4/bpf_tcp_ca.c          |   8 ++-
 tools/include/uapi/linux/bpf.h |  12 +++-
 6 files changed, 190 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f04098468d7a..8552279efe46 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1518,6 +1518,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
 	const char *name;
@@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 	else
 		module_put(owner);
 }
+int bpf_struct_ops_link_create(union bpf_attr *attr);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 {
 	return -EINVAL;
 }
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 13129df937cd..42f40ee083bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 2f3c4a0e03ee..3d6b5240c25a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -17,6 +17,7 @@ enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
 	BPF_STRUCT_OPS_STATE_INUSE,
 	BPF_STRUCT_OPS_STATE_TOBEFREE,
+	BPF_STRUCT_OPS_STATE_READY,
 };
 
 #define BPF_STRUCT_OPS_COMMON_VALUE			\
@@ -59,6 +60,11 @@ struct bpf_struct_ops_map {
 	struct bpf_struct_ops_value kvalue;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+};
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -500,11 +506,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	bpf_map_inc(map);
+	if (st_map->map.map_flags & BPF_F_LINK) {
+		err = st_ops->validate(kdata);
+		if (err)
+			goto reset_unlock;
+		set_memory_rox((long)st_map->image, 1);
+		/* Let bpf_link handle registration & unregistration.
+		 *
+		 * Pair with smp_load_acquire() during lookup_elem().
+		 */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+		goto unlock;
+	}
 
 	set_memory_rox((long)st_map->image, 1);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
+		/* This refcnt increment on the map here after
+		 * 'st_ops->reg()' is secure since the state of the
+		 * map must be set to INIT at this moment, and thus
+		 * bpf_struct_ops_map_delete_elem() can't unregister
+		 * or transition it to TOBEFREE concurrently.
+		 */
+		bpf_map_inc(map);
 		/* Pair with smp_load_acquire() during lookup_elem().
 		 * It ensures the above udata updates (e.g. prog->aux->id)
 		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -520,7 +544,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 */
 	set_memory_nx((long)st_map->image, 1);
 	set_memory_rw((long)st_map->image, 1);
-	bpf_map_put(map);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -538,6 +561,9 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	struct bpf_struct_ops_map *st_map;
 
 	st_map = (struct bpf_struct_ops_map *)map;
+	if (st_map->map.map_flags & BPF_F_LINK)
+		return -EOPNOTSUPP;
+
 	prev_state = cmpxchg(&st_map->kvalue.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
@@ -614,7 +640,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -638,6 +664,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	t = st_ops->type;
 
 	st_map_size = sizeof(*st_map) +
@@ -725,3 +754,111 @@ void bpf_struct_ops_put(const void *kdata)
 
 	bpf_map_put(&st_map->map);
 }
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+		map->map_flags & BPF_F_LINK &&
+		/* Pair with smp_store_release() during map_update */
+		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_struct_ops_map *st_map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = (struct bpf_struct_ops_map *)
+		rcu_dereference_protected(st_link->map, true);
+	if (st_map) {
+		/* st_link->map can be NULL if
+		 * bpf_struct_ops_link_create() fails to register.
+		 */
+		st_map->st_ops->unreg(&st_map->kvalue.data);
+		bpf_map_put(&st_map->map);
+	}
+	kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+					    struct seq_file *seq)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	seq_printf(seq, "map_id:\t%d\n", map->id);
+	rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+					       struct bpf_link_info *info)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	info->struct_ops.map_id = map->id;
+	rcu_read_unlock();
+	return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+	.dealloc = bpf_struct_ops_map_link_dealloc,
+	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	struct bpf_struct_ops_link *link = NULL;
+	struct bpf_link_primer link_primer;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
+	int err;
+
+	map = bpf_map_get(attr->link_create.map_fd);
+	if (!map)
+		return -EINVAL;
+
+	st_map = (struct bpf_struct_ops_map *)map;
+
+	if (!bpf_struct_ops_valid_to_reg(map)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto err_out;
+
+	err = st_map->st_ops->reg(st_map->kvalue.data);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		link = NULL;
+		goto err_out;
+	}
+	RCU_INIT_POINTER(link->map, map);
+
+	return bpf_link_settle(&link_primer);
+
+err_out:
+	bpf_map_put(map);
+	kfree(link);
+	return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cff0348a2871..21f76698875c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2825,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_prog *prog = link->prog;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
 
-	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "link_type:\t%s\n"
-		   "link_id:\t%u\n"
-		   "prog_tag:\t%s\n"
-		   "prog_id:\t%u\n",
+		   "link_id:\t%u\n",
 		   bpf_link_type_strs[link->type],
-		   link->id,
-		   prog_tag,
-		   prog->aux->id);
+		   link->id);
+	if (prog) {
+		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+		seq_printf(m,
+			   "prog_tag:\t%s\n"
+			   "prog_id:\t%u\n",
+			   prog_tag,
+			   prog->aux->id);
+	}
 	if (link->ops->show_fdinfo)
 		link->ops->show_fdinfo(link, m);
 }
@@ -4314,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
 
 	info.type = link->type;
 	info.id = link->id;
-	info.prog_id = link->prog->aux->id;
+	if (link->prog)
+		info.prog_id = link->prog->aux->id;
 
 	if (link->ops->fill_link_info) {
 		err = link->ops->fill_link_info(link, &info);
@@ -4577,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	if (CHECK_ATTR(BPF_LINK_CREATE))
 		return -EINVAL;
 
+	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+		return bpf_struct_ops_link_create(attr);
+
 	prog = bpf_prog_get(attr->link_create.prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..bbbd5eb94db2 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
 		if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
 				     sizeof(tcp_ca->name)) <= 0)
 			return -EINVAL;
-		if (tcp_ca_find(utcp_ca->name))
-			return -EEXIST;
 		return 1;
 	}
 
@@ -266,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_validate(void *kdata)
+{
+	return tcp_validate_congestion_control(kdata);
+}
+
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
@@ -273,6 +276,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
+	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 13129df937cd..9cf1deaf21f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* eBPF struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:02 -0700
Subject: bpf: Update the struct_ops of a bpf_link.

By improving the BPF_LINK_UPDATE command of bpf(), it should allow you
to conveniently switch between different struct_ops on a single
bpf_link. This would enable smoother transitions from one struct_ops
to another.

The struct_ops maps passing along with BPF_LINK_UPDATE should have the
BPF_F_LINK flag.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       | 21 +++++++++++++-----
 kernel/bpf/bpf_struct_ops.c    | 48 +++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           | 34 ++++++++++++++++++++++++++++++
 net/ipv4/bpf_tcp_ca.c          |  6 ++++++
 tools/include/uapi/linux/bpf.h | 21 +++++++++++++-----
 6 files changed, 122 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8552279efe46..2d8f3f639e68 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1476,6 +1476,8 @@ struct bpf_link_ops {
 	void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
 	int (*fill_link_info)(const struct bpf_link *link,
 			      struct bpf_link_info *info);
+	int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
+			  struct bpf_map *old_map);
 };
 
 struct bpf_tramp_link {
@@ -1518,6 +1520,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42f40ee083bf..e3d3b5160d26 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3d6b5240c25a..6401deca3b56 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -65,6 +65,8 @@ struct bpf_struct_ops_link {
 	struct bpf_map __rcu *map;
 };
 
+static DEFINE_MUTEX(update_mutex);
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -664,7 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
-	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+	if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	t = st_ops->type;
@@ -810,10 +812,54 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
 	return 0;
 }
 
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+					  struct bpf_map *expected_old_map)
+{
+	struct bpf_struct_ops_map *st_map, *old_st_map;
+	struct bpf_map *old_map;
+	struct bpf_struct_ops_link *st_link;
+	int err = 0;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+	if (!bpf_struct_ops_valid_to_reg(new_map))
+		return -EINVAL;
+
+	mutex_lock(&update_mutex);
+
+	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+	if (expected_old_map && old_map != expected_old_map) {
+		err = -EPERM;
+		goto err_out;
+	}
+
+	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+	/* The new and old struct_ops must be the same type. */
+	if (st_map->st_ops != old_st_map->st_ops) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+	if (err)
+		goto err_out;
+
+	bpf_map_inc(new_map);
+	rcu_assign_pointer(st_link->map, new_map);
+	bpf_map_put(old_map);
+
+err_out:
+	mutex_unlock(&update_mutex);
+
+	return err;
+}
+
 static const struct bpf_link_ops bpf_struct_ops_map_lops = {
 	.dealloc = bpf_struct_ops_map_link_dealloc,
 	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
 	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+	.update_map = bpf_struct_ops_map_link_update,
 };
 
 int bpf_struct_ops_link_create(union bpf_attr *attr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 21f76698875c..b4d758fa5981 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4682,6 +4682,35 @@ out:
 	return ret;
 }
 
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+	struct bpf_map *new_map, *old_map = NULL;
+	int ret;
+
+	new_map = bpf_map_get(attr->link_update.new_map_fd);
+	if (IS_ERR(new_map))
+		return -EINVAL;
+
+	if (attr->link_update.flags & BPF_F_REPLACE) {
+		old_map = bpf_map_get(attr->link_update.old_map_fd);
+		if (IS_ERR(old_map)) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+	} else if (attr->link_update.old_map_fd) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = link->ops->update_map(link, new_map, old_map);
+
+	if (old_map)
+		bpf_map_put(old_map);
+out_put:
+	bpf_map_put(new_map);
+	return ret;
+}
+
 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
 
 static int link_update(union bpf_attr *attr)
@@ -4702,6 +4731,11 @@ static int link_update(union bpf_attr *attr)
 	if (IS_ERR(link))
 		return PTR_ERR(link);
 
+	if (link->ops->update_map) {
+		ret = link_update_map(link, attr);
+		goto out_put_link;
+	}
+
 	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
 	if (IS_ERR(new_prog)) {
 		ret = PTR_ERR(new_prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index bbbd5eb94db2..e8b27826283e 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -264,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+	return tcp_update_congestion_control(kdata, old_kdata);
+}
+
 static int bpf_tcp_ca_validate(void *kdata)
 {
 	return tcp_validate_congestion_control(kdata);
@@ -273,6 +278,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
 	.unreg = bpf_tcp_ca_unreg,
+	.update = bpf_tcp_ca_update,
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9cf1deaf21f2..d6c5a022ae28 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
-- 
cgit v1.2.3


From 6e2a3a324aab9d76e6b864eb6ebc60b197b5141a Mon Sep 17 00:00:00 2001
From: Or Har-Toov <ohartoov@nvidia.com>
Date: Sun, 19 Mar 2023 14:59:30 +0200
Subject: net/mlx5: Expose bits for enabling out-of-order by default

Add needed HW bits for enabling out-of-order by default and
use go_back_n when out-of-order is not needed.

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/75d6dfe263989a05c08c43406132b336ea12d00a.1679230449.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 66d76e97a087..62039b147432 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1077,7 +1077,9 @@ struct mlx5_ifc_roce_cap_bits {
 	u8         sw_r_roce_src_udp_port[0x1];
 	u8         fl_rc_qp_when_roce_disabled[0x1];
 	u8         fl_rc_qp_when_roce_enabled[0x1];
-	u8         reserved_at_7[0x17];
+	u8         reserved_at_7[0x1];
+	u8	   qp_ooo_transmit_default[0x1];
+	u8         reserved_at_9[0x15];
 	u8	   qp_ts_format[0x2];
 
 	u8         reserved_at_20[0x60];
@@ -1493,7 +1495,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_b0[0x1];
 	u8         uplink_follow[0x1];
 	u8         ts_cqe_to_dest_cqn[0x1];
-	u8         reserved_at_b3[0x7];
+	u8         reserved_at_b3[0x6];
+	u8         go_back_n[0x1];
 	u8         shampo[0x1];
 	u8         reserved_at_bb[0x5];
 
@@ -3261,7 +3264,8 @@ struct mlx5_ifc_qpc_bits {
 	u8         log_rq_stride[0x3];
 	u8         no_sq[0x1];
 	u8         log_sq_size[0x4];
-	u8         reserved_at_55[0x3];
+	u8         reserved_at_55[0x1];
+	u8	   retry_mode[0x2];
 	u8	   ts_format[0x2];
 	u8         reserved_at_5a[0x1];
 	u8         rlky[0x1];
-- 
cgit v1.2.3


From 8d91f3f8ae57e6292142ca89f322e90fa0d6ac02 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 11 Mar 2023 23:39:55 +0100
Subject: mmc: core: add helpers mmc_regulator_enable/disable_vqmmc

There's a number of drivers (e.g. dw_mmc, meson-gx, mmci, sunxi) using
the same mechanism and a private flag vqmmc_enabled to deal with
enabling/disabling the vqmmc regulator.

Move this to the core and create new helpers mmc_regulator_enable_vqmmc
and mmc_regulator_disable_vqmmc.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Link: https://lore.kernel.org/r/71586432-360f-9b92-17f6-b05a8a971bc2@gmail.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/regulator.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/host.h     |  3 +++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c
index 4fad81cd5252..005247a49e51 100644
--- a/drivers/mmc/core/regulator.c
+++ b/drivers/mmc/core/regulator.c
@@ -274,3 +274,44 @@ int mmc_regulator_get_supply(struct mmc_host *mmc)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mmc_regulator_get_supply);
+
+/**
+ * mmc_regulator_enable_vqmmc - enable VQMMC regulator for a host
+ * @mmc: the host to regulate
+ *
+ * Returns 0 or errno. Enables the regulator for vqmmc.
+ * Keeps track of the enable status for ensuring that calls to
+ * regulator_enable/disable are balanced.
+ */
+int mmc_regulator_enable_vqmmc(struct mmc_host *mmc)
+{
+	int ret = 0;
+
+	if (!IS_ERR(mmc->supply.vqmmc) && !mmc->vqmmc_enabled) {
+		ret = regulator_enable(mmc->supply.vqmmc);
+		if (ret < 0)
+			dev_err(mmc_dev(mmc), "enabling vqmmc regulator failed\n");
+		else
+			mmc->vqmmc_enabled = true;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mmc_regulator_enable_vqmmc);
+
+/**
+ * mmc_regulator_disable_vqmmc - disable VQMMC regulator for a host
+ * @mmc: the host to regulate
+ *
+ * Returns 0 or errno. Disables the regulator for vqmmc.
+ * Keeps track of the enable status for ensuring that calls to
+ * regulator_enable/disable are balanced.
+ */
+void mmc_regulator_disable_vqmmc(struct mmc_host *mmc)
+{
+	if (!IS_ERR(mmc->supply.vqmmc) && mmc->vqmmc_enabled) {
+		regulator_disable(mmc->supply.vqmmc);
+		mmc->vqmmc_enabled = false;
+	}
+}
+EXPORT_SYMBOL_GPL(mmc_regulator_disable_vqmmc);
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 812e6b583b25..461d1543893b 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -450,6 +450,7 @@ struct mmc_host {
 	unsigned int		retune_paused:1; /* re-tuning is temporarily disabled */
 	unsigned int		retune_crc_disable:1; /* don't trigger retune upon crc */
 	unsigned int		can_dma_map_merge:1; /* merging can be used */
+	unsigned int		vqmmc_enabled:1; /* vqmmc regulator is enabled */
 
 	int			rescan_disable;	/* disable card detection */
 	int			rescan_entered;	/* used with nonremovable devices */
@@ -598,6 +599,8 @@ static inline int mmc_regulator_set_vqmmc(struct mmc_host *mmc,
 #endif
 
 int mmc_regulator_get_supply(struct mmc_host *mmc);
+int mmc_regulator_enable_vqmmc(struct mmc_host *mmc);
+void mmc_regulator_disable_vqmmc(struct mmc_host *mmc);
 
 static inline int mmc_card_is_removable(struct mmc_host *host)
 {
-- 
cgit v1.2.3


From e297cd54b3f81d652456ae6cb93941fc6b5c6683 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:30 -0600
Subject: vhost_task: Allow vhost layer to use copy_process

Qemu will create vhost devices in the kernel which perform network, SCSI,
etc IO and management operations from worker threads created by the
kthread API. Because the kthread API does a copy_process on the kthreadd
thread, the vhost layer has to use kthread_use_mm to access the Qemu
thread's memory and cgroup_attach_task_all to add itself to the Qemu
thread's cgroups, and it bypasses the RLIMIT_NPROC limit which can result
in VMs creating more threads than the admin expected.

This patch adds a new struct vhost_task which can be used instead of
kthreads. They allow the vhost layer to use copy_process and inherit
the userspace process's mm and cgroups, the task is accounted for
under the userspace's nproc count and can be seen in its process tree,
and other features like namespaces work and are inherited by default.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 MAINTAINERS                      |   2 +
 drivers/vhost/Kconfig            |   5 ++
 include/linux/sched/vhost_task.h |  23 ++++++++
 kernel/Makefile                  |   1 +
 kernel/vhost_task.c              | 117 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 148 insertions(+)
 create mode 100644 include/linux/sched/vhost_task.h
 create mode 100644 kernel/vhost_task.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d5bc223f305..186d37c2b4e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -22176,7 +22176,9 @@ L:	virtualization@lists.linux-foundation.org
 L:	netdev@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
+F:	kernel/vhost_task.c
 F:	drivers/vhost/
+F:	include/linux/sched/vhost_task.h
 F:	include/linux/vhost_iotlb.h
 F:	include/uapi/linux/vhost.h
 
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 587fbae06182..b455d9ab6f3d 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -13,9 +13,14 @@ config VHOST_RING
 	  This option is selected by any driver which needs to access
 	  the host side of a virtio ring.
 
+config VHOST_TASK
+	bool
+	default n
+
 config VHOST
 	tristate
 	select VHOST_IOTLB
+	select VHOST_TASK
 	help
 	  This option is selected by any driver which needs to access
 	  the core of vhost.
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
new file mode 100644
index 000000000000..6123c10b99cf
--- /dev/null
+++ b/include/linux/sched/vhost_task.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VHOST_TASK_H
+#define _LINUX_VHOST_TASK_H
+
+#include <linux/completion.h>
+
+struct task_struct;
+
+struct vhost_task {
+	int (*fn)(void *data);
+	void *data;
+	struct completion exited;
+	unsigned long flags;
+	struct task_struct *task;
+};
+
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+				     const char *name);
+void vhost_task_start(struct vhost_task *vtsk);
+void vhost_task_stop(struct vhost_task *vtsk);
+bool vhost_task_should_stop(struct vhost_task *vtsk);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 10ef068f598d..6fc72b3afbde 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -15,6 +15,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace internal ftrace files
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..b7cbd66f889e
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+	VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+	struct vhost_task *vtsk = data;
+	int ret;
+
+	ret = vtsk->fn(vtsk->data);
+	complete(&vtsk->exited);
+	do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop and return from their worker
+ * function when it returns true;
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+	pid_t pid = vtsk->task->pid;
+
+	set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+	wake_up_process(vtsk->task);
+	/*
+	 * Make sure vhost_task_fn is no longer accessing the vhost_task before
+	 * freeing it below. If userspace crashed or exited without closing,
+	 * then the vhost_task->task could already be marked dead so
+	 * kernel_wait will return early.
+	 */
+	wait_for_completion(&vtsk->exited);
+	/*
+	 * If we are just closing/removing a device and the parent process is
+	 * not exiting then reap the task.
+	 */
+	kernel_wait4(pid, NULL, __WCLONE, NULL);
+	kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to stop
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+	return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: thread stack
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+				     const char *name)
+{
+	struct kernel_clone_args args = {
+		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+		.exit_signal	= 0,
+		.fn		= vhost_task_fn,
+		.name		= name,
+		.user_worker	= 1,
+		.no_files	= 1,
+		.ignore_signals	= 1,
+	};
+	struct vhost_task *vtsk;
+	struct task_struct *tsk;
+
+	vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+	if (!vtsk)
+		return NULL;
+	init_completion(&vtsk->exited);
+	vtsk->data = arg;
+	vtsk->fn = fn;
+
+	args.fn_arg = vtsk;
+
+	tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+	if (IS_ERR(tsk)) {
+		kfree(vtsk);
+		return NULL;
+	}
+
+	vtsk->task = tsk;
+	return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+	wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);
-- 
cgit v1.2.3


From 9cc61e5fbd619eea0401f519e7bac72fe3d4d1e8 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:04 +0100
Subject: driver core: bus: move dev_root out of struct bus_type

Now that all accesses of dev_root is through the bus_get_dev_root()
call, move the pointer out of struct bus_type and into the private
dynamic structure, subsys_private.

With this change, there is no modifiable portions of struct bus_type so
it can be marked as a constant structure and moved to read-only memory.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-22-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h        |  2 ++
 drivers/base/bus.c         | 28 ++++++++++++++++++++++------
 include/linux/device/bus.h |  2 --
 3 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index b055eba1ec30..f1034e27e651 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -27,6 +27,7 @@
  *                 on this bus.
  * @bus - pointer back to the struct bus_type that this structure is associated
  *        with.
+ * @dev_root: Default device to use as the parent.
  *
  * @glue_dirs - "glue" directory to put in-between the parent device to
  *              avoid namespace conflicts
@@ -49,6 +50,7 @@ struct subsys_private {
 	struct blocking_notifier_head bus_notifier;
 	unsigned int drivers_autoprobe:1;
 	struct bus_type *bus;
+	struct device *dev_root;
 
 	struct kset glue_dirs;
 	struct class *class;
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index dd4b82d7510f..91a6b6b1fc49 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -935,8 +935,8 @@ void bus_unregister(const struct bus_type *bus)
 		return;
 
 	pr_debug("bus: '%s': unregistering\n", bus->name);
-	if (bus->dev_root)
-		device_unregister(bus->dev_root);
+	if (sp->dev_root)
+		device_unregister(sp->dev_root);
 
 	bus_kobj = &sp->subsys.kobj;
 	sysfs_remove_groups(bus_kobj, bus->bus_groups);
@@ -1198,6 +1198,7 @@ static int subsys_register(struct bus_type *subsys,
 			   const struct attribute_group **groups,
 			   struct kobject *parent_of_root)
 {
+	struct subsys_private *sp;
 	struct device *dev;
 	int err;
 
@@ -1205,6 +1206,12 @@ static int subsys_register(struct bus_type *subsys,
 	if (err < 0)
 		return err;
 
+	sp = bus_to_subsys(subsys);
+	if (!sp) {
+		err = -EINVAL;
+		goto err_sp;
+	}
+
 	dev = kzalloc(sizeof(struct device), GFP_KERNEL);
 	if (!dev) {
 		err = -ENOMEM;
@@ -1223,7 +1230,8 @@ static int subsys_register(struct bus_type *subsys,
 	if (err < 0)
 		goto err_dev_reg;
 
-	subsys->dev_root = dev;
+	sp->dev_root = dev;
+	subsys_put(sp);
 	return 0;
 
 err_dev_reg:
@@ -1232,6 +1240,8 @@ err_dev_reg:
 err_name:
 	kfree(dev);
 err_dev:
+	subsys_put(sp);
+err_sp:
 	bus_unregister(subsys);
 	return err;
 }
@@ -1349,9 +1359,15 @@ bool bus_is_registered(const struct bus_type *bus)
  */
 struct device *bus_get_dev_root(const struct bus_type *bus)
 {
-	if (bus)
-		return get_device(bus->dev_root);
-	return NULL;
+	struct subsys_private *sp = bus_to_subsys(bus);
+	struct device *dev_root;
+
+	if (!sp)
+		return NULL;
+
+	dev_root = get_device(sp->dev_root);
+	subsys_put(sp);
+	return dev_root;
 }
 EXPORT_SYMBOL_GPL(bus_get_dev_root);
 
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 6ce32ef4b8fd..c258e8770285 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -26,7 +26,6 @@ struct fwnode_handle;
  *
  * @name:	The name of the bus.
  * @dev_name:	Used for subsystems to enumerate devices like ("foo%u", dev->id).
- * @dev_root:	Default device to use as the parent.
  * @bus_groups:	Default attributes of the bus.
  * @dev_groups:	Default attributes of the devices on the bus.
  * @drv_groups: Default attributes of the device drivers on the bus.
@@ -82,7 +81,6 @@ struct fwnode_handle;
 struct bus_type {
 	const char		*name;
 	const char		*dev_name;
-	struct device		*dev_root;
 	const struct attribute_group **bus_groups;
 	const struct attribute_group **dev_groups;
 	const struct attribute_group **drv_groups;
-- 
cgit v1.2.3


From 75cff725d9566699a670a02b3cfd1c6e9e9ed53e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:05 +0100
Subject: driver core: bus: mark the struct bus_type for sysfs callbacks as
 constant

struct bus_type should never be modified in a sysfs callback as there is
nothing in the structure to modify, and frankly, the structure is almost
never used in a sysfs callback, so mark it as constant to allow struct
bus_type to be moved to read-only memory.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ben Widawsky <bwidawsk@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hu Haowen <src.res@email.cn>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Stuart Yoder <stuyoder@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Yanteng Si <siyanteng@loongson.cn>
Acked-by: Ilya Dryomov <idryomov@gmail.com> # rbd
Acked-by: Ira Weiny <ira.weiny@intel.com> # cxl
Reviewed-by: Alex Shi <alexs@kernel.org>
Acked-by: Iwona Winiarska <iwona.winiarska@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# pci
Acked-by: Wei Liu <wei.liu@kernel.org>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com> # scsi
Link: https://lore.kernel.org/r/20230313182918.1312597-23-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/driver-model/bus.rst      |  4 +--
 Documentation/filesystems/sysfs.rst                |  4 +--
 .../translations/zh_CN/filesystems/sysfs.txt       |  4 +--
 .../translations/zh_TW/filesystems/sysfs.txt       |  4 +--
 arch/powerpc/platforms/pseries/ibmebus.c           |  4 +--
 arch/powerpc/platforms/pseries/vio.c               |  8 ++---
 drivers/ata/pata_parport/pata_parport.c            |  6 ++--
 drivers/base/bus.c                                 |  8 ++---
 drivers/block/rbd.c                                | 34 ++++++++----------
 drivers/bus/fsl-mc/fsl-mc-bus.c                    |  6 ++--
 drivers/cxl/core/port.c                            |  2 +-
 drivers/hv/vmbus_drv.c                             |  2 +-
 drivers/net/netdevsim/bus.c                        |  4 +--
 drivers/pci/pci-sysfs.c                            |  2 +-
 drivers/pci/pci.c                                  |  4 +--
 drivers/peci/sysfs.c                               |  2 +-
 drivers/rapidio/rio-sysfs.c                        |  2 +-
 drivers/s390/crypto/ap_bus.c                       | 42 +++++++++++-----------
 drivers/scsi/fcoe/fcoe_sysfs.c                     |  8 ++---
 drivers/scsi/fcoe/fcoe_transport.c                 |  6 ++--
 include/linux/device/bus.h                         |  4 +--
 include/scsi/libfcoe.h                             |  6 ++--
 22 files changed, 78 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst
index 016b15a6e8ea..9709ab62a468 100644
--- a/Documentation/driver-api/driver-model/bus.rst
+++ b/Documentation/driver-api/driver-model/bus.rst
@@ -125,8 +125,8 @@ Exporting Attributes
 
   struct bus_attribute {
 	struct attribute	attr;
-	ssize_t (*show)(struct bus_type *, char * buf);
-	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+	ssize_t (*show)(const struct bus_type *, char * buf);
+	ssize_t (*store)(const struct bus_type *, const char * buf, size_t count);
   };
 
 Bus drivers can export attributes using the BUS_ATTR_RW macro that works
diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst
index f8187d466b97..c32993bc83c7 100644
--- a/Documentation/filesystems/sysfs.rst
+++ b/Documentation/filesystems/sysfs.rst
@@ -373,8 +373,8 @@ Structure::
 
     struct bus_attribute {
 	    struct attribute        attr;
-	    ssize_t (*show)(struct bus_type *, char * buf);
-	    ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+	    ssize_t (*show)(const struct bus_type *, char * buf);
+	    ssize_t (*store)(const struct bus_type *, const char * buf, size_t count);
     };
 
 Declaring::
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index 046cc1d52058..547062759e60 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt
@@ -329,8 +329,8 @@ void device_remove_file(struct device *dev, const struct device_attribute * attr
 
 struct bus_attribute {
         struct attribute        attr;
-        ssize_t (*show)(struct bus_type *, char * buf);
-        ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+        ssize_t (*show)(const struct bus_type *, char * buf);
+        ssize_t (*store)(const struct bus_type *, const char * buf, size_t count);
 };
 
 声明:
diff --git a/Documentation/translations/zh_TW/filesystems/sysfs.txt b/Documentation/translations/zh_TW/filesystems/sysfs.txt
index acd677f19d4f..280824cc7e5d 100644
--- a/Documentation/translations/zh_TW/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_TW/filesystems/sysfs.txt
@@ -332,8 +332,8 @@ void device_remove_file(struct device *dev, const struct device_attribute * attr
 
 struct bus_attribute {
         struct attribute        attr;
-        ssize_t (*show)(struct bus_type *, char * buf);
-        ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+        ssize_t (*show)(const struct bus_type *, char * buf);
+        ssize_t (*store)(const struct bus_type *, const char * buf, size_t count);
 };
 
 聲明:
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index bb9c18682783..44703f13985b 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -267,7 +267,7 @@ static char *ibmebus_chomp(const char *in, size_t count)
 	return out;
 }
 
-static ssize_t probe_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t probe_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	struct device_node *dn = NULL;
 	struct device *dev;
@@ -305,7 +305,7 @@ out:
 }
 static BUS_ATTR_WO(probe);
 
-static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	struct device *dev;
 	char *path;
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 770df9351aaa..bf7aff6390be 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1006,7 +1006,7 @@ ATTRIBUTE_GROUPS(vio_cmo_dev);
 /* sysfs bus functions and data structures for CMO */
 
 #define viobus_cmo_rd_attr(name)                                        \
-static ssize_t cmo_bus_##name##_show(struct bus_type *bt, char *buf)    \
+static ssize_t cmo_bus_##name##_show(const struct bus_type *bt, char *buf)    \
 {                                                                       \
 	return sprintf(buf, "%lu\n", vio_cmo.name);                     \
 }                                                                       \
@@ -1015,7 +1015,7 @@ static struct bus_attribute bus_attr_cmo_bus_##name =			\
 
 #define viobus_cmo_pool_rd_attr(name, var)                              \
 static ssize_t                                                          \
-cmo_##name##_##var##_show(struct bus_type *bt, char *buf)               \
+cmo_##name##_##var##_show(const struct bus_type *bt, char *buf)         \
 {                                                                       \
 	return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
 }                                                                       \
@@ -1030,12 +1030,12 @@ viobus_cmo_pool_rd_attr(reserve, size);
 viobus_cmo_pool_rd_attr(excess, size);
 viobus_cmo_pool_rd_attr(excess, free);
 
-static ssize_t cmo_high_show(struct bus_type *bt, char *buf)
+static ssize_t cmo_high_show(const struct bus_type *bt, char *buf)
 {
 	return sprintf(buf, "%lu\n", vio_cmo.high);
 }
 
-static ssize_t cmo_high_store(struct bus_type *bt, const char *buf,
+static ssize_t cmo_high_store(const struct bus_type *bt, const char *buf,
 			      size_t count)
 {
 	unsigned long flags;
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 294a266a0dda..64d1bde26940 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -554,8 +554,7 @@ void pata_parport_unregister_driver(struct pi_protocol *pr)
 }
 EXPORT_SYMBOL_GPL(pata_parport_unregister_driver);
 
-static ssize_t new_device_store(struct bus_type *bus, const char *buf,
-				size_t count)
+static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	char port[12] = "auto";
 	char protocol[8] = "auto";
@@ -630,8 +629,7 @@ static void pi_remove_one(struct device *dev)
 	/* pata_parport_dev_release will do kfree(pi) */
 }
 
-static ssize_t delete_device_store(struct bus_type *bus, const char *buf,
-				   size_t count)
+static ssize_t delete_device_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	struct device *dev;
 
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 91a6b6b1fc49..819ab745fa9f 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -274,7 +274,7 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf,
 }
 static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store);
 
-static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf)
+static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf)
 {
 	struct subsys_private *sp = bus_to_subsys(bus);
 	int ret;
@@ -287,7 +287,7 @@ static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf)
 	return ret;
 }
 
-static ssize_t drivers_autoprobe_store(struct bus_type *bus,
+static ssize_t drivers_autoprobe_store(const struct bus_type *bus,
 				       const char *buf, size_t count)
 {
 	struct subsys_private *sp = bus_to_subsys(bus);
@@ -304,7 +304,7 @@ static ssize_t drivers_autoprobe_store(struct bus_type *bus,
 	return count;
 }
 
-static ssize_t drivers_probe_store(struct bus_type *bus,
+static ssize_t drivers_probe_store(const struct bus_type *bus,
 				   const char *buf, size_t count)
 {
 	struct device *dev;
@@ -808,7 +808,7 @@ static void klist_devices_put(struct klist_node *n)
 	put_device(dev);
 }
 
-static ssize_t bus_uevent_store(struct bus_type *bus,
+static ssize_t bus_uevent_store(const struct bus_type *bus,
 				const char *buf, size_t count)
 {
 	struct subsys_private *sp = bus_to_subsys(bus);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5cb008b9700a..84ad3b17956f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -491,12 +491,12 @@ static bool single_major = true;
 module_param(single_major, bool, 0444);
 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
 
-static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
-static ssize_t remove_store(struct bus_type *bus, const char *buf,
+static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count);
+static ssize_t remove_store(const struct bus_type *bus, const char *buf,
 			    size_t count);
-static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
 				      size_t count);
-static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
 					 size_t count);
 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
 
@@ -538,7 +538,7 @@ static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
 	return is_lock_owner;
 }
 
-static ssize_t supported_features_show(struct bus_type *bus, char *buf)
+static ssize_t supported_features_show(const struct bus_type *bus, char *buf)
 {
 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
 }
@@ -6967,9 +6967,7 @@ err_out_format:
 	return ret;
 }
 
-static ssize_t do_rbd_add(struct bus_type *bus,
-			  const char *buf,
-			  size_t count)
+static ssize_t do_rbd_add(const char *buf, size_t count)
 {
 	struct rbd_device *rbd_dev = NULL;
 	struct ceph_options *ceph_opts = NULL;
@@ -7081,18 +7079,18 @@ err_out_args:
 	goto out;
 }
 
-static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	if (single_major)
 		return -EINVAL;
 
-	return do_rbd_add(bus, buf, count);
+	return do_rbd_add(buf, count);
 }
 
-static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
 				      size_t count)
 {
-	return do_rbd_add(bus, buf, count);
+	return do_rbd_add(buf, count);
 }
 
 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
@@ -7122,9 +7120,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
 	}
 }
 
-static ssize_t do_rbd_remove(struct bus_type *bus,
-			     const char *buf,
-			     size_t count)
+static ssize_t do_rbd_remove(const char *buf, size_t count)
 {
 	struct rbd_device *rbd_dev = NULL;
 	struct list_head *tmp;
@@ -7196,18 +7192,18 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
 	return count;
 }
 
-static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	if (single_major)
 		return -EINVAL;
 
-	return do_rbd_remove(bus, buf, count);
+	return do_rbd_remove(buf, count);
 }
 
-static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
 					 size_t count)
 {
-	return do_rbd_remove(bus, buf, count);
+	return do_rbd_remove(buf, count);
 }
 
 /*
diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 36cb091a33b4..653e2d4c116f 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -231,7 +231,7 @@ exit:
 	return 0;
 }
 
-static ssize_t rescan_store(struct bus_type *bus,
+static ssize_t rescan_store(const struct bus_type *bus,
 			    const char *buf, size_t count)
 {
 	unsigned long val;
@@ -284,7 +284,7 @@ exit:
 	return 0;
 }
 
-static ssize_t autorescan_store(struct bus_type *bus,
+static ssize_t autorescan_store(const struct bus_type *bus,
 				const char *buf, size_t count)
 {
 	bus_for_each_dev(bus, NULL, (void *)buf, fsl_mc_bus_set_autorescan);
@@ -292,7 +292,7 @@ static ssize_t autorescan_store(struct bus_type *bus,
 	return count;
 }
 
-static ssize_t autorescan_show(struct bus_type *bus, char *buf)
+static ssize_t autorescan_show(const struct bus_type *bus, char *buf)
 {
 	bus_for_each_dev(bus, NULL, (void *)buf, fsl_mc_bus_get_autorescan);
 	return strlen(buf);
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 8ee6b6e2e2a4..66333cd6248e 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1927,7 +1927,7 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd)
 EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL);
 
 /* for user tooling to ensure port disable work has completed */
-static ssize_t flush_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	if (sysfs_streq(buf, "1")) {
 		flush_workqueue(cxl_bus_wq);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index d24dd65b33d4..513adba09f56 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -684,7 +684,7 @@ static const struct attribute_group vmbus_dev_group = {
 __ATTRIBUTE_GROUPS(vmbus_dev);
 
 /* Set up the attribute for /sys/bus/vmbus/hibernation */
-static ssize_t hibernation_show(struct bus_type *bus, char *buf)
+static ssize_t hibernation_show(const struct bus_type *bus, char *buf)
 {
 	return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
 }
diff --git a/drivers/net/netdevsim/bus.c b/drivers/net/netdevsim/bus.c
index 0052968e881e..0787ad252dd9 100644
--- a/drivers/net/netdevsim/bus.c
+++ b/drivers/net/netdevsim/bus.c
@@ -132,7 +132,7 @@ static struct nsim_bus_dev *
 nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues);
 
 static ssize_t
-new_device_store(struct bus_type *bus, const char *buf, size_t count)
+new_device_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	unsigned int id, port_count, num_queues;
 	struct nsim_bus_dev *nsim_bus_dev;
@@ -186,7 +186,7 @@ static BUS_ATTR_WO(new_device);
 static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev);
 
 static ssize_t
-del_device_store(struct bus_type *bus, const char *buf, size_t count)
+del_device_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	struct nsim_bus_dev *nsim_bus_dev, *tmp;
 	unsigned int id;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index dd0d9d9bc509..ab32a91f287b 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -428,7 +428,7 @@ static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(msi_bus);
 
-static ssize_t rescan_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t rescan_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	unsigned long val;
 	struct pci_bus *b = NULL;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7a67611dc5f4..45c3bb039f21 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6679,7 +6679,7 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev)
 	}
 }
 
-static ssize_t resource_alignment_show(struct bus_type *bus, char *buf)
+static ssize_t resource_alignment_show(const struct bus_type *bus, char *buf)
 {
 	size_t count = 0;
 
@@ -6691,7 +6691,7 @@ static ssize_t resource_alignment_show(struct bus_type *bus, char *buf)
 	return count;
 }
 
-static ssize_t resource_alignment_store(struct bus_type *bus,
+static ssize_t resource_alignment_store(const struct bus_type *bus,
 					const char *buf, size_t count)
 {
 	char *param, *old, *end;
diff --git a/drivers/peci/sysfs.c b/drivers/peci/sysfs.c
index db9ef05776e3..c04244075794 100644
--- a/drivers/peci/sysfs.c
+++ b/drivers/peci/sysfs.c
@@ -15,7 +15,7 @@ static int rescan_controller(struct device *dev, void *data)
 	return peci_controller_scan_devices(to_peci_controller(dev));
 }
 
-static ssize_t rescan_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t rescan_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	bool res;
 	int ret;
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index f7679602498e..90d391210533 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -286,7 +286,7 @@ const struct attribute_group *rio_dev_groups[] = {
 	NULL,
 };
 
-static ssize_t scan_store(struct bus_type *bus, const char *buf, size_t count)
+static ssize_t scan_store(const struct bus_type *bus, const char *buf, size_t count)
 {
 	long val;
 	int rc;
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index f4cc1720156f..5a99e0b18289 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1166,12 +1166,12 @@ EXPORT_SYMBOL(ap_parse_mask_str);
  * AP bus attributes.
  */
 
-static ssize_t ap_domain_show(struct bus_type *bus, char *buf)
+static ssize_t ap_domain_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_domain_index);
 }
 
-static ssize_t ap_domain_store(struct bus_type *bus,
+static ssize_t ap_domain_store(const struct bus_type *bus,
 			       const char *buf, size_t count)
 {
 	int domain;
@@ -1193,7 +1193,7 @@ static ssize_t ap_domain_store(struct bus_type *bus,
 
 static BUS_ATTR_RW(ap_domain);
 
-static ssize_t ap_control_domain_mask_show(struct bus_type *bus, char *buf)
+static ssize_t ap_control_domain_mask_show(const struct bus_type *bus, char *buf)
 {
 	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
@@ -1208,7 +1208,7 @@ static ssize_t ap_control_domain_mask_show(struct bus_type *bus, char *buf)
 
 static BUS_ATTR_RO(ap_control_domain_mask);
 
-static ssize_t ap_usage_domain_mask_show(struct bus_type *bus, char *buf)
+static ssize_t ap_usage_domain_mask_show(const struct bus_type *bus, char *buf)
 {
 	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
@@ -1223,7 +1223,7 @@ static ssize_t ap_usage_domain_mask_show(struct bus_type *bus, char *buf)
 
 static BUS_ATTR_RO(ap_usage_domain_mask);
 
-static ssize_t ap_adapter_mask_show(struct bus_type *bus, char *buf)
+static ssize_t ap_adapter_mask_show(const struct bus_type *bus, char *buf)
 {
 	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
@@ -1238,7 +1238,7 @@ static ssize_t ap_adapter_mask_show(struct bus_type *bus, char *buf)
 
 static BUS_ATTR_RO(ap_adapter_mask);
 
-static ssize_t ap_interrupts_show(struct bus_type *bus, char *buf)
+static ssize_t ap_interrupts_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n",
 			 ap_irq_flag ? 1 : 0);
@@ -1246,12 +1246,12 @@ static ssize_t ap_interrupts_show(struct bus_type *bus, char *buf)
 
 static BUS_ATTR_RO(ap_interrupts);
 
-static ssize_t config_time_show(struct bus_type *bus, char *buf)
+static ssize_t config_time_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_config_time);
 }
 
-static ssize_t config_time_store(struct bus_type *bus,
+static ssize_t config_time_store(const struct bus_type *bus,
 				 const char *buf, size_t count)
 {
 	int time;
@@ -1265,12 +1265,12 @@ static ssize_t config_time_store(struct bus_type *bus,
 
 static BUS_ATTR_RW(config_time);
 
-static ssize_t poll_thread_show(struct bus_type *bus, char *buf)
+static ssize_t poll_thread_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_poll_kthread ? 1 : 0);
 }
 
-static ssize_t poll_thread_store(struct bus_type *bus,
+static ssize_t poll_thread_store(const struct bus_type *bus,
 				 const char *buf, size_t count)
 {
 	int flag, rc;
@@ -1289,12 +1289,12 @@ static ssize_t poll_thread_store(struct bus_type *bus,
 
 static BUS_ATTR_RW(poll_thread);
 
-static ssize_t poll_timeout_show(struct bus_type *bus, char *buf)
+static ssize_t poll_timeout_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%llu\n", poll_timeout);
 }
 
-static ssize_t poll_timeout_store(struct bus_type *bus, const char *buf,
+static ssize_t poll_timeout_store(const struct bus_type *bus, const char *buf,
 				  size_t count)
 {
 	unsigned long long time;
@@ -1318,21 +1318,21 @@ static ssize_t poll_timeout_store(struct bus_type *bus, const char *buf,
 
 static BUS_ATTR_RW(poll_timeout);
 
-static ssize_t ap_max_domain_id_show(struct bus_type *bus, char *buf)
+static ssize_t ap_max_domain_id_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_domain_id);
 }
 
 static BUS_ATTR_RO(ap_max_domain_id);
 
-static ssize_t ap_max_adapter_id_show(struct bus_type *bus, char *buf)
+static ssize_t ap_max_adapter_id_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_adapter_id);
 }
 
 static BUS_ATTR_RO(ap_max_adapter_id);
 
-static ssize_t apmask_show(struct bus_type *bus, char *buf)
+static ssize_t apmask_show(const struct bus_type *bus, char *buf)
 {
 	int rc;
 
@@ -1393,7 +1393,7 @@ static int apmask_commit(unsigned long *newapm)
 	return 0;
 }
 
-static ssize_t apmask_store(struct bus_type *bus, const char *buf,
+static ssize_t apmask_store(const struct bus_type *bus, const char *buf,
 			    size_t count)
 {
 	int rc, changes = 0;
@@ -1425,7 +1425,7 @@ done:
 
 static BUS_ATTR_RW(apmask);
 
-static ssize_t aqmask_show(struct bus_type *bus, char *buf)
+static ssize_t aqmask_show(const struct bus_type *bus, char *buf)
 {
 	int rc;
 
@@ -1486,7 +1486,7 @@ static int aqmask_commit(unsigned long *newaqm)
 	return 0;
 }
 
-static ssize_t aqmask_store(struct bus_type *bus, const char *buf,
+static ssize_t aqmask_store(const struct bus_type *bus, const char *buf,
 			    size_t count)
 {
 	int rc, changes = 0;
@@ -1518,13 +1518,13 @@ done:
 
 static BUS_ATTR_RW(aqmask);
 
-static ssize_t scans_show(struct bus_type *bus, char *buf)
+static ssize_t scans_show(const struct bus_type *bus, char *buf)
 {
 	return scnprintf(buf, PAGE_SIZE, "%llu\n",
 			 atomic64_read(&ap_scan_bus_count));
 }
 
-static ssize_t scans_store(struct bus_type *bus, const char *buf,
+static ssize_t scans_store(const struct bus_type *bus, const char *buf,
 			   size_t count)
 {
 	AP_DBF_INFO("%s force AP bus rescan\n", __func__);
@@ -1536,7 +1536,7 @@ static ssize_t scans_store(struct bus_type *bus, const char *buf,
 
 static BUS_ATTR_RW(scans);
 
-static ssize_t bindings_show(struct bus_type *bus, char *buf)
+static ssize_t bindings_show(const struct bus_type *bus, char *buf)
 {
 	int rc;
 	unsigned int apqns, n;
diff --git a/drivers/scsi/fcoe/fcoe_sysfs.c b/drivers/scsi/fcoe/fcoe_sysfs.c
index 6260aa5ea6af..e17957f8085c 100644
--- a/drivers/scsi/fcoe/fcoe_sysfs.c
+++ b/drivers/scsi/fcoe/fcoe_sysfs.c
@@ -659,17 +659,17 @@ static const struct device_type fcoe_fcf_device_type = {
 	.release = fcoe_fcf_device_release,
 };
 
-static ssize_t ctlr_create_store(struct bus_type *bus, const char *buf,
+static ssize_t ctlr_create_store(const struct bus_type *bus, const char *buf,
 				 size_t count)
 {
-	return fcoe_ctlr_create_store(bus, buf, count);
+	return fcoe_ctlr_create_store(buf, count);
 }
 static BUS_ATTR_WO(ctlr_create);
 
-static ssize_t ctlr_destroy_store(struct bus_type *bus, const char *buf,
+static ssize_t ctlr_destroy_store(const struct bus_type *bus, const char *buf,
 				  size_t count)
 {
-	return fcoe_ctlr_destroy_store(bus, buf, count);
+	return fcoe_ctlr_destroy_store(buf, count);
 }
 static BUS_ATTR_WO(ctlr_destroy);
 
diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c
index 62341c6353a7..46b0bf237be1 100644
--- a/drivers/scsi/fcoe/fcoe_transport.c
+++ b/drivers/scsi/fcoe/fcoe_transport.c
@@ -745,8 +745,7 @@ static int libfcoe_device_notification(struct notifier_block *notifier,
 	return NOTIFY_OK;
 }
 
-ssize_t fcoe_ctlr_create_store(struct bus_type *bus,
-			       const char *buf, size_t count)
+ssize_t fcoe_ctlr_create_store(const char *buf, size_t count)
 {
 	struct net_device *netdev = NULL;
 	struct fcoe_transport *ft = NULL;
@@ -808,8 +807,7 @@ out_nodev:
 	return count;
 }
 
-ssize_t fcoe_ctlr_destroy_store(struct bus_type *bus,
-				const char *buf, size_t count)
+ssize_t fcoe_ctlr_destroy_store(const char *buf, size_t count)
 {
 	int rc = -ENODEV;
 	struct net_device *netdev = NULL;
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index c258e8770285..78c875386c06 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -118,8 +118,8 @@ extern int __must_check bus_rescan_devices(struct bus_type *bus);
 
 struct bus_attribute {
 	struct attribute	attr;
-	ssize_t (*show)(struct bus_type *bus, char *buf);
-	ssize_t (*store)(struct bus_type *bus, const char *buf, size_t count);
+	ssize_t (*show)(const struct bus_type *bus, char *buf);
+	ssize_t (*store)(const struct bus_type *bus, const char *buf, size_t count);
 };
 
 #define BUS_ATTR_RW(_name) \
diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h
index 279782156373..8300ef1a982e 100644
--- a/include/scsi/libfcoe.h
+++ b/include/scsi/libfcoe.h
@@ -397,10 +397,8 @@ int fcoe_transport_attach(struct fcoe_transport *ft);
 int fcoe_transport_detach(struct fcoe_transport *ft);
 
 /* sysfs store handler for ctrl_control interface */
-ssize_t fcoe_ctlr_create_store(struct bus_type *bus,
-			       const char *buf, size_t count);
-ssize_t fcoe_ctlr_destroy_store(struct bus_type *bus,
-				const char *buf, size_t count);
+ssize_t fcoe_ctlr_create_store(const char *buf, size_t count);
+ssize_t fcoe_ctlr_destroy_store(const char *buf, size_t count);
 
 #endif /* _LIBFCOE_H */
 
-- 
cgit v1.2.3


From 00c4a3c47da761b4ba5d09c46b6fc77af5759081 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:06 +0100
Subject: driver core: bus: constantify bus_register()

bus_register() is now safe to take a constant * to bus_type, so make
that change and mark the subsys_private bus_type * constant as well.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-24-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h        | 2 +-
 drivers/base/bus.c         | 2 +-
 include/linux/device/bus.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index f1034e27e651..09c6682d16b5 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -49,7 +49,7 @@ struct subsys_private {
 	struct klist klist_drivers;
 	struct blocking_notifier_head bus_notifier;
 	unsigned int drivers_autoprobe:1;
-	struct bus_type *bus;
+	const struct bus_type *bus;
 	struct device *dev_root;
 
 	struct kset glue_dirs;
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 819ab745fa9f..f739a2a79e59 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -841,7 +841,7 @@ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL,
  * infrastructure, then register the children subsystems it has:
  * the devices and drivers that belong to the subsystem.
  */
-int bus_register(struct bus_type *bus)
+int bus_register(const struct bus_type *bus)
 {
 	int retval;
 	struct subsys_private *priv;
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 78c875386c06..c30c4748c170 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -110,7 +110,7 @@ struct bus_type {
 	bool need_parent_lock;
 };
 
-extern int __must_check bus_register(struct bus_type *bus);
+extern int __must_check bus_register(const struct bus_type *bus);
 
 extern void bus_unregister(const struct bus_type *bus);
 
-- 
cgit v1.2.3


From 9622b9f282e0f0d37683b21575b683039169e397 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:07 +0100
Subject: driver core: bus: constify bus_rescan_devices()

The bus_rescan_devices() function was missed in the previous change of
the bus_for_each* constant pointer changes, so fix it up now to take a
const * to struct bus_type.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-25-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/bus.c         | 2 +-
 include/linux/device/bus.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f739a2a79e59..ced61fad390e 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -769,7 +769,7 @@ static int __must_check bus_rescan_devices_helper(struct device *dev,
  * attached and rescan it against existing drivers to see if it matches
  * any by calling device_attach() for the unbound devices.
  */
-int bus_rescan_devices(struct bus_type *bus)
+int bus_rescan_devices(const struct bus_type *bus)
 {
 	return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper);
 }
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index c30c4748c170..b517b82d4723 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -114,7 +114,7 @@ extern int __must_check bus_register(const struct bus_type *bus);
 
 extern void bus_unregister(const struct bus_type *bus);
 
-extern int __must_check bus_rescan_devices(struct bus_type *bus);
+extern int __must_check bus_rescan_devices(const struct bus_type *bus);
 
 struct bus_attribute {
 	struct attribute	attr;
-- 
cgit v1.2.3


From 7c06be04251a0b3349868622a2111a54cbfad8d4 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:08 +0100
Subject: driver core: bus: constify driver_find()

The driver_find() function can now take a const * to bus_type, not just
a * so fix that up.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-26-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/bus.c            | 2 +-
 include/linux/device/driver.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index ced61fad390e..8fea26c22521 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -1307,7 +1307,7 @@ EXPORT_SYMBOL_GPL(subsys_virtual_register);
  * from being unregistered or unloaded while the caller is using it.
  * The caller is responsible for preventing this.
  */
-struct device_driver *driver_find(const char *name, struct bus_type *bus)
+struct device_driver *driver_find(const char *name, const struct bus_type *bus)
 {
 	struct subsys_private *sp = bus_to_subsys(bus);
 	struct kobject *k;
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 50d0a416a5e7..ceb0e477c2c8 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -126,7 +126,7 @@ extern int __must_check driver_register(struct device_driver *drv);
 extern void driver_unregister(struct device_driver *drv);
 
 extern struct device_driver *driver_find(const char *name,
-					 struct bus_type *bus);
+					 const struct bus_type *bus);
 extern int driver_probe_done(void);
 extern void wait_for_device_probe(void);
 void __init wait_for_init_devices_probe(void);
-- 
cgit v1.2.3


From c28dd08ef713d2127c5bad3f3e0e93d6ec0309a2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:10 +0100
Subject: driver core: make the bus_type in struct device_driver constant

The pointer to struct bus_type in struct device_driver should only be
pointing to something that can never change now that the driver core has
fixed up the previously writable fields.  So mark it as a constant
pointer to enforce this and move forward with the goal of moving
bus_type into read-only memory.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-28-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index ceb0e477c2c8..0f22a6f46f8c 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -95,7 +95,7 @@ enum probe_type {
  */
 struct device_driver {
 	const char		*name;
-	struct bus_type		*bus;
+	const struct bus_type	*bus;
 
 	struct module		*owner;
 	const char		*mod_name;	/* used for built-in modules */
-- 
cgit v1.2.3


From b18d0a0f92a8fe9e56d812808184c8c4b9f18f92 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:16 +0100
Subject: iommu: make the pointer to struct bus_type constant

A number of iommu functions take a struct bus_type * and never modify
the data passed in, so make them all const * as that is what the driver
core is expecting to have passed into as well.

This is a step toward making all struct bus_type pointers constant in
the kernel.

Cc: Will Deacon <will@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: iommu@lists.linux.dev
Acked-by: Joerg Roedel <jroedel@suse.de>
Link: https://lore.kernel.org/r/20230313182918.1312597-34-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/iommu/iommu.c | 14 +++++++-------
 include/linux/iommu.h | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 10db680acaed..0b5e181998c9 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -89,7 +89,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data);
 static int iommu_alloc_default_domain(struct iommu_group *group,
 				      struct device *dev);
-static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus,
 						 unsigned type);
 static int __iommu_attach_device(struct iommu_domain *domain,
 				 struct device *dev);
@@ -1631,7 +1631,7 @@ static int iommu_get_def_domain_type(struct device *dev)
 	return 0;
 }
 
-static int iommu_group_alloc_default_domain(struct bus_type *bus,
+static int iommu_group_alloc_default_domain(const struct bus_type *bus,
 					    struct iommu_group *group,
 					    unsigned int type)
 {
@@ -1777,7 +1777,7 @@ static int probe_get_default_domain_type(struct device *dev, void *data)
 	return 0;
 }
 
-static void probe_alloc_default_domain(struct bus_type *bus,
+static void probe_alloc_default_domain(const struct bus_type *bus,
 				       struct iommu_group *group)
 {
 	struct __group_domain_type gtype;
@@ -1832,7 +1832,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group)
 					  iommu_do_create_direct_mappings);
 }
 
-int bus_iommu_probe(struct bus_type *bus)
+int bus_iommu_probe(const struct bus_type *bus)
 {
 	struct iommu_group *group, *next;
 	LIST_HEAD(group_list);
@@ -1876,7 +1876,7 @@ int bus_iommu_probe(struct bus_type *bus)
 	return ret;
 }
 
-bool iommu_present(struct bus_type *bus)
+bool iommu_present(const struct bus_type *bus)
 {
 	return bus->iommu_ops != NULL;
 }
@@ -1951,7 +1951,7 @@ void iommu_set_fault_handler(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
 
-static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus,
 						 unsigned type)
 {
 	struct iommu_domain *domain;
@@ -1976,7 +1976,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
 	return domain;
 }
 
-struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
 {
 	return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED);
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6595454d4f48..0fd4e6734d5b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -455,11 +455,11 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 	return dev->iommu->iommu_dev->ops;
 }
 
-extern int bus_iommu_probe(struct bus_type *bus);
-extern bool iommu_present(struct bus_type *bus);
+extern int bus_iommu_probe(const struct bus_type *bus);
+extern bool iommu_present(const struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
-extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
+extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus);
 extern struct iommu_group *iommu_group_get_by_id(int id);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
@@ -732,7 +732,7 @@ struct iommu_device {};
 struct iommu_fault_param {};
 struct iommu_iotlb_gather {};
 
-static inline bool iommu_present(struct bus_type *bus)
+static inline bool iommu_present(const struct bus_type *bus)
 {
 	return false;
 }
@@ -742,7 +742,7 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
-static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From d492cc2573a08352c48a66d3e3f312a15fb3f363 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:17 +0100
Subject: driver core: device.h: make struct bus_type a const *

Now that all users who accessed the bus_type structure in struct device
are properly using it as a const *, mark it as such so that no one can
modify it going forward anymore.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230313182918.1312597-35-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 862397ae520d..3b23772d3bbb 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -566,7 +566,7 @@ struct device {
 	const char		*init_name; /* initial name of the device */
 	const struct device_type *type;
 
-	struct bus_type	*bus;		/* type of bus device is on */
+	const struct bus_type	*bus;	/* type of bus device is on */
 	struct device_driver *driver;	/* which driver has allocated this
 					   device */
 	void		*platform_data;	/* Platform specific data, device
-- 
cgit v1.2.3


From 9d11b13402d1b80f7f3ca5061d75f15cf8002555 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 13 Mar 2023 19:29:18 +0100
Subject: USB: mark all struct bus_type as const

Now that the driver core can properly handle constant struct bus_type,
move all of the USB subsystem struct bus_type structures as const,
placing them into read-only memory which can not be modified at runtime.

Cc: Johan Hovold <johan@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: linux-usb@vger.kernel.org
Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20230313182918.1312597-36-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/ulpi.c     | 2 +-
 drivers/usb/core/driver.c     | 2 +-
 drivers/usb/core/usb.h        | 2 +-
 drivers/usb/gadget/udc/core.c | 4 ++--
 drivers/usb/serial/bus.c      | 2 +-
 drivers/usb/typec/bus.c       | 2 +-
 drivers/usb/typec/bus.h       | 2 +-
 include/linux/usb/serial.h    | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c
index a98b2108376a..8305a5dfb910 100644
--- a/drivers/usb/common/ulpi.c
+++ b/drivers/usb/common/ulpi.c
@@ -90,7 +90,7 @@ static void ulpi_remove(struct device *dev)
 		drv->remove(to_ulpi_dev(dev));
 }
 
-static struct bus_type ulpi_bus = {
+static const struct bus_type ulpi_bus = {
 	.name = "ulpi",
 	.match = ulpi_match,
 	.uevent = ulpi_uevent,
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index a0e076c6f3a4..f58a0299fb3b 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -2025,7 +2025,7 @@ int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
 
 #endif /* CONFIG_PM */
 
-struct bus_type usb_bus_type = {
+const struct bus_type usb_bus_type = {
 	.name =		"usb",
 	.match =	usb_device_match,
 	.uevent =	usb_uevent,
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 0eac7d4285d1..cd434af259c3 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -140,7 +140,7 @@ static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
 
 #endif
 
-extern struct bus_type usb_bus_type;
+extern const struct bus_type usb_bus_type;
 extern struct mutex usb_port_peer_mutex;
 extern struct device_type usb_device_type;
 extern struct device_type usb_if_device_type;
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index df930b041c33..d919f0743a0a 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -26,7 +26,7 @@
 
 static DEFINE_IDA(gadget_id_numbers);
 
-static struct bus_type gadget_bus_type;
+static const struct bus_type gadget_bus_type;
 
 /**
  * struct usb_udc - describes one usb device controller
@@ -1747,7 +1747,7 @@ static int usb_udc_uevent(const struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-static struct bus_type gadget_bus_type = {
+static const struct bus_type gadget_bus_type = {
 	.name = "gadget",
 	.probe = gadget_bind_driver,
 	.remove = gadget_unbind_driver,
diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index 9e38142acd38..3eb8dc3a1a8f 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c
@@ -144,7 +144,7 @@ static void free_dynids(struct usb_serial_driver *drv)
 	spin_unlock(&drv->dynids.lock);
 }
 
-struct bus_type usb_serial_bus_type = {
+const struct bus_type usb_serial_bus_type = {
 	.name =		"usb-serial",
 	.match =	usb_serial_device_match,
 	.probe =	usb_serial_device_probe,
diff --git a/drivers/usb/typec/bus.c b/drivers/usb/typec/bus.c
index 098f0efaa58d..fe5b9a2e61f5 100644
--- a/drivers/usb/typec/bus.c
+++ b/drivers/usb/typec/bus.c
@@ -431,7 +431,7 @@ static void typec_remove(struct device *dev)
 	adev->ops = NULL;
 }
 
-struct bus_type typec_bus = {
+const struct bus_type typec_bus = {
 	.name = "typec",
 	.dev_groups = typec_groups,
 	.match = typec_match,
diff --git a/drivers/usb/typec/bus.h b/drivers/usb/typec/bus.h
index c89168857417..643b8c81786d 100644
--- a/drivers/usb/typec/bus.h
+++ b/drivers/usb/typec/bus.h
@@ -28,7 +28,7 @@ struct altmode {
 
 #define to_altmode(d) container_of(d, struct altmode, adev)
 
-extern struct bus_type typec_bus;
+extern const struct bus_type typec_bus;
 extern const struct device_type typec_altmode_dev_type;
 
 #define is_typec_altmode(_dev_) (_dev_->type == &typec_altmode_dev_type)
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index f7bfedb740f5..7eeb5f9c4f0d 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -378,7 +378,7 @@ void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port,
 int usb_serial_bus_register(struct usb_serial_driver *device);
 void usb_serial_bus_deregister(struct usb_serial_driver *device);
 
-extern struct bus_type usb_serial_bus_type;
+extern const struct bus_type usb_serial_bus_type;
 extern struct tty_driver *usb_serial_tty_driver;
 
 static inline void usb_serial_debug_data(struct device *dev,
-- 
cgit v1.2.3


From 3f079114bf522f27f3680238b6429f3dd45535b6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 28 Feb 2023 00:13:19 -0500
Subject: ext4: Convert data=journal writeback to use ext4_writepages()

Add support for writeback of journalled data directly into
ext4_writepages() instead of offloading it to write_cache_pages(). This
actually significantly simplifies the code and reduces code duplication.
For checkpointing of committed data we can use ext4_writepages()
rightaway the same way as writeback of ordered data uses it on
transaction commit. For journalling of dirty mapped pages, we need to
add a special case to mpage_prepare_extent_to_map() to add all page
buffers to the journal.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20230228051319.4085470-8-tytso@mit.edu
---
 fs/ext4/inode.c             | 341 ++++++++++++--------------------------------
 include/trace/events/ext4.h |   7 -
 2 files changed, 91 insertions(+), 257 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 472f6b914f48..652efb2221bf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
 				  int pextents);
 
@@ -1632,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode,
-				      struct buffer_head *bh)
-{
-	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-}
-
 /*
  * ext4_insert_delayed_block - adds a delayed block to the extents status
  *                             tree, incrementing the reserved cluster/block
@@ -1870,219 +1863,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-static int __ext4_journalled_writepage(struct page *page,
-				       unsigned int len)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	handle_t *handle = NULL;
-	int ret = 0, err = 0;
-	int inline_data = ext4_has_inline_data(inode);
-	struct buffer_head *inode_bh = NULL;
-	loff_t size;
-
-	ClearPageChecked(page);
-
-	if (inline_data) {
-		BUG_ON(page->index != 0);
-		BUG_ON(len > ext4_get_max_inline_size(inode));
-		inode_bh = ext4_journalled_write_inline_data(inode, len, page);
-		if (inode_bh == NULL)
-			goto out;
-	}
-	/*
-	 * We need to release the page lock before we start the
-	 * journal, so grab a reference so the page won't disappear
-	 * out from under us.
-	 */
-	get_page(page);
-	unlock_page(page);
-
-	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
-				    ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		put_page(page);
-		goto out_no_pagelock;
-	}
-	BUG_ON(!ext4_handle_valid(handle));
-
-	lock_page(page);
-	put_page(page);
-	size = i_size_read(inode);
-	if (page->mapping != mapping || page_offset(page) > size) {
-		/* The page got truncated from under us */
-		ext4_journal_stop(handle);
-		ret = 0;
-		goto out;
-	}
-
-	if (inline_data) {
-		ret = ext4_mark_inode_dirty(handle, inode);
-	} else {
-		struct buffer_head *page_bufs = page_buffers(page);
-
-		if (page->index == size >> PAGE_SHIFT)
-			len = size & ~PAGE_MASK;
-		else
-			len = PAGE_SIZE;
-
-		ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
-					     NULL, do_journal_get_write_access);
-
-		err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
-					     NULL, write_end_fn);
-	}
-	if (ret == 0)
-		ret = err;
-	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
-	if (ret == 0)
-		ret = err;
-	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-
-	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
-out:
-	unlock_page(page);
-out_no_pagelock:
-	brelse(inode_bh);
-	return ret;
-}
-
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), no one guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * This function can get called via...
- *   - ext4_writepages after taking page lock (have journal handle)
- *   - journal_submit_inode_data_buffers (no journal handle)
- *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
- *   - grab_page_cache when doing write_begin (have journal handle)
- *
- * We don't do any block allocation in this function. If we have page with
- * multiple blocks we need to write those buffer_heads that are mapped. This
- * is important for mmaped based write. So if we do with blocksize 1K
- * truncate(f, 1024);
- * a = mmap(f, 0, 4096);
- * a[0] = 'a';
- * truncate(f, 4096);
- * we have in the page first buffer_head mapped via page_mkwrite call back
- * but other buffer_heads would be unmapped but dirty (dirty done via the
- * do_wp_page). So writepage should write the first block. If we modify
- * the mmap area beyond 1024 we will again get a page_fault and the
- * page_mkwrite callback will do the block allocation and mark the
- * buffer_heads mapped.
- *
- * We redirty the page if we have any buffer_heads that is either delay or
- * unwritten in the page.
- *
- * We can get recursively called as show below.
- *
- *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *		ext4_writepage()
- *
- * But since we don't do any block allocation we should not deadlock.
- * Page also have the dirty flag cleared so we don't get recurive page_lock.
- */
-static int ext4_writepage(struct page *page,
-			  struct writeback_control *wbc)
-{
-	struct folio *folio = page_folio(page);
-	int ret = 0;
-	loff_t size;
-	unsigned int len;
-	struct buffer_head *page_bufs = NULL;
-	struct inode *inode = page->mapping->host;
-	struct ext4_io_submit io_submit;
-
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
-		folio_invalidate(folio, 0, folio_size(folio));
-		folio_unlock(folio);
-		return -EIO;
-	}
-
-	trace_ext4_writepage(page);
-	size = i_size_read(inode);
-	if (page->index == size >> PAGE_SHIFT &&
-	    !ext4_verity_in_progress(inode))
-		len = size & ~PAGE_MASK;
-	else
-		len = PAGE_SIZE;
-
-	/* Should never happen but for bugs in other kernel subsystems */
-	if (!page_has_buffers(page)) {
-		ext4_warning_inode(inode,
-		   "page %lu does not have buffers attached", page->index);
-		ClearPageDirty(page);
-		unlock_page(page);
-		return 0;
-	}
-
-	page_bufs = page_buffers(page);
-	/*
-	 * We cannot do block allocation or other extent handling in this
-	 * function. If there are buffers needing that, we have to redirty
-	 * the page. But we may reach here when we do a journal commit via
-	 * journal_submit_inode_data_buffers() and in that case we must write
-	 * allocated buffers to achieve data=ordered mode guarantees.
-	 *
-	 * Also, if there is only one buffer per page (the fs block
-	 * size == the page size), if one buffer needs block
-	 * allocation or needs to modify the extent tree to clear the
-	 * unwritten flag, we know that the page can't be written at
-	 * all, so we might as well refuse the write immediately.
-	 * Unfortunately if the block size != page size, we can't as
-	 * easily detect this case using ext4_walk_page_buffers(), but
-	 * for the extremely common case, this is an optimization that
-	 * skips a useless round trip through ext4_bio_write_page().
-	 */
-	if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL,
-				   ext4_bh_delay_or_unwritten)) {
-		redirty_page_for_writepage(wbc, page);
-		if ((current->flags & PF_MEMALLOC) ||
-		    (inode->i_sb->s_blocksize == PAGE_SIZE)) {
-			/*
-			 * For memory cleaning there's no point in writing only
-			 * some buffers. So just bail out. Warn if we came here
-			 * from direct reclaim.
-			 */
-			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
-							== PF_MEMALLOC);
-			unlock_page(page);
-			return 0;
-		}
-	}
-
-	if (PageChecked(page) && ext4_should_journal_data(inode))
-		/*
-		 * It's mmapped pagecache.  Add buffers and journal it.  There
-		 * doesn't seem much point in redirtying the page here.
-		 */
-		return __ext4_journalled_writepage(page, len);
-
-	ext4_io_submit_init(&io_submit, wbc);
-	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_submit.io_end) {
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return -ENOMEM;
-	}
-	ret = ext4_bio_write_page(&io_submit, page, len);
-	unlock_page(page);
-	ext4_io_submit(&io_submit);
-	/* Drop io_end reference we got from init */
-	ext4_put_io_end_defer(io_submit.io_end);
-	return ret;
-}
-
 static void mpage_page_done(struct mpage_da_data *mpd, struct page *page)
 {
 	mpd->first_page++;
@@ -2563,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page)
 	return false;
 }
 
+static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
+				     int len)
+{
+	struct buffer_head *page_bufs = page_buffers(page);
+	struct inode *inode = page->mapping->host;
+	int ret, err;
+
+	ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+				     NULL, do_journal_get_write_access);
+	err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+				     NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
+	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+
+	return ret;
+}
+
+static int mpage_journal_page_buffers(handle_t *handle,
+				      struct mpage_da_data *mpd,
+				      struct page *page)
+{
+	struct inode *inode = mpd->inode;
+	loff_t size = i_size_read(inode);
+	int len;
+
+	ClearPageChecked(page);
+	clear_page_dirty_for_io(page);
+	mpd->wbc->nr_to_write--;
+
+	if (page->index == size >> PAGE_SHIFT &&
+	    !ext4_verity_in_progress(inode))
+		len = size & ~PAGE_MASK;
+	else
+		len = PAGE_SIZE;
+
+	return ext4_journal_page_buffers(handle, page, len);
+}
+
 /*
  * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
  * 				 needing mapping, submit mapped pages
@@ -2595,11 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	int blkbits = mpd->inode->i_blkbits;
 	ext4_lblk_t lblk;
 	struct buffer_head *head;
+	handle_t *handle = NULL;
+	int bpp = ext4_journal_blocks_per_page(mpd->inode);
 
 	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
+
+	if (ext4_should_journal_data(mpd->inode)) {
+		handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
+					    bpp);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+	}
 	folio_batch_init(&fbatch);
 	mpd->map.m_len = 0;
 	mpd->next_page = index;
@@ -2629,6 +2462,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
 				goto out;
 
+			if (handle) {
+				err = ext4_journal_ensure_credits(handle, bpp,
+								  0);
+				if (err < 0)
+					goto out;
+			}
+
 			folio_lock(folio);
 			/*
 			 * If the page is no longer dirty, or its mapping no
@@ -2668,8 +2508,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 				mpd->first_page = folio->index;
 			mpd->next_page = folio->index + folio_nr_pages(folio);
 			/*
-			 * Writeout for transaction commit where we cannot
-			 * modify metadata is simple. Just submit the page.
+			 * Writeout when we cannot modify metadata is simple.
+			 * Just submit the page. For data=journal mode we
+			 * first handle writeout of the page for checkpoint and
+			 * only after that handle delayed page dirtying. This
+			 * is crutial so that forcing a transaction commit and
+			 * then calling filemap_write_and_wait() guarantees
+			 * current state of data is in its final location. Such
+			 * sequence is used for example by insert/collapse
+			 * range operations before discarding the page cache.
 			 */
 			if (!mpd->can_map) {
 				if (ext4_page_nomap_can_writeout(&folio->page)) {
@@ -2677,6 +2524,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 					if (err < 0)
 						goto out;
 				}
+				/* Pending dirtying of journalled data? */
+				if (PageChecked(&folio->page)) {
+					err = mpage_journal_page_buffers(handle,
+						mpd, &folio->page);
+					if (err < 0)
+						goto out;
+				}
 				mpage_page_done(mpd, &folio->page);
 			} else {
 				/* Add all dirty buffers to mpd */
@@ -2694,18 +2548,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 		cond_resched();
 	}
 	mpd->scanned_until_end = 1;
+	if (handle)
+		ext4_journal_stop(handle);
 	return 0;
 out:
 	folio_batch_release(&fbatch);
+	if (handle)
+		ext4_journal_stop(handle);
 	return err;
 }
 
-static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc,
-			     void *data)
-{
-	return ext4_writepage(&folio->page, wbc);
-}
-
 static int ext4_do_writepages(struct mpage_da_data *mpd)
 {
 	struct writeback_control *wbc = mpd->wbc;
@@ -2731,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		goto out_writepages;
 
-	if (ext4_should_journal_data(inode)) {
-		blk_start_plug(&plug);
-		ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL);
-		blk_finish_plug(&plug);
-		goto out_writepages;
-	}
-
 	/*
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
@@ -2772,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
 		ext4_journal_stop(handle);
 	}
 
+	/*
+	 * data=journal mode does not do delalloc so we just need to writeout /
+	 * journal already mapped buffers
+	 */
+	if (ext4_should_journal_data(inode))
+		mpd->can_map = 0;
+
 	if (ext4_should_dioread_nolock(inode)) {
 		/*
 		 * We may need to convert up to one extent per block in
@@ -3148,9 +3000,8 @@ static int ext4_da_write_end(struct file *file,
 	 * i_disksize since writeback will push i_disksize upto i_size
 	 * eventually. If the end of the current write is > i_size and
 	 * inside an allocated block (ext4_da_should_update_i_disksize()
-	 * check), we need to update i_disksize here as neither
-	 * ext4_writepage() nor certain ext4_writepages() paths not
-	 * allocating blocks update i_disksize.
+	 * check), we need to update i_disksize here as certain
+	 * ext4_writepages() paths not allocating blocks update i_disksize.
 	 *
 	 * Note that we defer inode dirtying to generic_write_end() /
 	 * ext4_da_write_inline_data_end().
@@ -5376,7 +5227,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 	 * If the folio is fully truncated, we don't need to wait for any commit
 	 * (and we even should not as __ext4_journalled_invalidate_folio() may
 	 * strip all buffers from the folio but keep the folio dirty which can then
-	 * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
+	 * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
 	 * buffers). Also we don't need to wait for any commit if all buffers in
 	 * the folio remain valid. This is most beneficial for the common case of
 	 * blocksize == PAGESIZE.
@@ -6314,18 +6165,8 @@ retry_alloc:
 		err = __block_write_begin(page, 0, len, ext4_get_block);
 		if (!err) {
 			ret = VM_FAULT_SIGBUS;
-			if (ext4_walk_page_buffers(handle, inode,
-					page_buffers(page), 0, len, NULL,
-					do_journal_get_write_access))
-				goto out_error;
-			if (ext4_walk_page_buffers(handle, inode,
-					page_buffers(page), 0, len, NULL,
-					write_end_fn))
-				goto out_error;
-			if (ext4_jbd2_inode_add_write(handle, inode,
-						      page_offset(page), len))
+			if (ext4_journal_page_buffers(handle, page, len))
 				goto out_error;
-			ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 		} else {
 			unlock_page(page);
 		}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 77b426ae0064..ebccf6a6aa1b 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op,
 		  (unsigned long) __entry->index)
 );
 
-DEFINE_EVENT(ext4__page_op, ext4_writepage,
-
-	TP_PROTO(struct page *page),
-
-	TP_ARGS(page)
-);
-
 DEFINE_EVENT(ext4__page_op, ext4_readpage,
 
 	TP_PROTO(struct page *page),
-- 
cgit v1.2.3


From 1fb1ea0d9cb8359ae9d6f67f22666c74d5b9f47d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 10 Mar 2023 19:07:47 +0200
Subject: mei: Move uuid.h to the MEI namespace

There is only a single user of the UUID uAPI, let's make it
part of that user.

The way it's done is to prevent compilation time breakage for
the user space that does

	#include <linux/uuid.h>

In the future MEI user space tools can switch over to use mei_uuid.h.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230310170747.22782-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                      |  1 +
 drivers/misc/mei/bus-fixup.c     |  2 +-
 drivers/misc/mei/hdcp/mei_hdcp.c |  2 +-
 drivers/misc/mei/hw.h            |  2 +-
 drivers/misc/mei/main.c          |  1 -
 drivers/misc/mei/pxp/mei_pxp.c   |  2 +-
 include/linux/mod_devicetable.h  |  1 +
 include/linux/uuid.h             |  3 ---
 include/uapi/linux/mei.h         |  2 +-
 include/uapi/linux/mei_uuid.h    | 29 +++++++++++++++++++++++++++++
 include/uapi/linux/uuid.h        | 31 +------------------------------
 11 files changed, 37 insertions(+), 39 deletions(-)
 create mode 100644 include/uapi/linux/mei_uuid.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ee30a356e89..619c30ec36dc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10459,6 +10459,7 @@ F:	drivers/watchdog/mei_wdt.c
 F:	include/linux/mei_aux.h
 F:	include/linux/mei_cl_bus.h
 F:	include/uapi/linux/mei.h
+F:	include/uapi/linux/mei_uuid.h
 F:	include/uapi/linux/uuid.h
 F:	samples/mei/*
 
diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c
index 211536109308..31e3c74ca1f1 100644
--- a/drivers/misc/mei/bus-fixup.c
+++ b/drivers/misc/mei/bus-fixup.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/slab.h>
-#include <linux/uuid.h>
 
+#include <linux/mei.h>
 #include <linux/mei_cl_bus.h>
 
 #include "mei_dev.h"
diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c
index e0dcd5c114db..45e3d4d27797 100644
--- a/drivers/misc/mei/hdcp/mei_hdcp.c
+++ b/drivers/misc/mei/hdcp/mei_hdcp.c
@@ -18,7 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/uuid.h>
+#include <linux/mei.h>
 #include <linux/mei_cl_bus.h>
 #include <linux/component.h>
 #include <drm/drm_connector.h>
diff --git a/drivers/misc/mei/hw.h b/drivers/misc/mei/hw.h
index 319418ddf4fb..e910302fcd1f 100644
--- a/drivers/misc/mei/hw.h
+++ b/drivers/misc/mei/hw.h
@@ -7,7 +7,7 @@
 #ifndef _MEI_HW_TYPES_H_
 #define _MEI_HW_TYPES_H_
 
-#include <linux/uuid.h>
+#include <linux/mei.h>
 
 /*
  * Timeouts in Seconds
diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c
index 632d4ae21e46..c64291741d73 100644
--- a/drivers/misc/mei/main.c
+++ b/drivers/misc/mei/main.c
@@ -18,7 +18,6 @@
 #include <linux/ioctl.h>
 #include <linux/cdev.h>
 #include <linux/sched/signal.h>
-#include <linux/uuid.h>
 #include <linux/compat.h>
 #include <linux/jiffies.h>
 #include <linux/interrupt.h>
diff --git a/drivers/misc/mei/pxp/mei_pxp.c b/drivers/misc/mei/pxp/mei_pxp.c
index 7ee1fa7b1cb3..3bf560bbdee0 100644
--- a/drivers/misc/mei/pxp/mei_pxp.c
+++ b/drivers/misc/mei/pxp/mei_pxp.c
@@ -13,7 +13,7 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/uuid.h>
+#include <linux/mei.h>
 #include <linux/mei_cl_bus.h>
 #include <linux/component.h>
 #include <drm/drm_connector.h>
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 549590e9c644..35203ca3fc37 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -9,6 +9,7 @@
 #define LINUX_MOD_DEVICETABLE_H
 
 #ifdef __KERNEL__
+#include <linux/mei.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
 typedef unsigned long kernel_ulong_t;
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 6b1a3efa1e0b..43d4a79b273d 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -107,7 +107,4 @@ extern const u8 uuid_index[16];
 int guid_parse(const char *uuid, guid_t *u);
 int uuid_parse(const char *uuid, uuid_t *u);
 
-/* MEI UUID type, don't use anywhere else */
-#include <uapi/linux/uuid.h>
-
 #endif
diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h
index 4f3638489d01..6e57743628c0 100644
--- a/include/uapi/linux/mei.h
+++ b/include/uapi/linux/mei.h
@@ -7,7 +7,7 @@
 #ifndef _LINUX_MEI_H
 #define _LINUX_MEI_H
 
-#include <linux/uuid.h>
+#include <linux/mei_uuid.h>
 
 /*
  * This IOCTL is used to associate the current file descriptor with a
diff --git a/include/uapi/linux/mei_uuid.h b/include/uapi/linux/mei_uuid.h
new file mode 100644
index 000000000000..676ebe12d623
--- /dev/null
+++ b/include/uapi/linux/mei_uuid.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * MEI UUID definition
+ *
+ * Copyright (C) 2010, Intel Corp.
+ *	Huang Ying <ying.huang@intel.com>
+ */
+
+#ifndef _UAPI_LINUX_MEI_UUID_H_
+#define _UAPI_LINUX_MEI_UUID_H_
+
+#include <linux/types.h>
+
+typedef struct {
+	__u8 b[16];
+} uuid_le;
+
+#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)		\
+((uuid_le)								\
+{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+   (b) & 0xff, ((b) >> 8) & 0xff,					\
+   (c) & 0xff, ((c) >> 8) & 0xff,					\
+   (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+#define NULL_UUID_LE							\
+	UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00,	\
+	     0x00, 0x00, 0x00, 0x00)
+
+#endif /* _UAPI_LINUX_MEI_UUID_H_ */
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
index 96ac684a4b2f..8443738f4bb2 100644
--- a/include/uapi/linux/uuid.h
+++ b/include/uapi/linux/uuid.h
@@ -1,30 +1 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* DO NOT USE in new code! This is solely for MEI due to legacy reasons */
-/*
- * MEI UUID definition
- *
- * Copyright (C) 2010, Intel Corp.
- *	Huang Ying <ying.huang@intel.com>
- */
-
-#ifndef _UAPI_LINUX_UUID_H_
-#define _UAPI_LINUX_UUID_H_
-
-#include <linux/types.h>
-
-typedef struct {
-	__u8 b[16];
-} uuid_le;
-
-#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)		\
-((uuid_le)								\
-{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
-   (b) & 0xff, ((b) >> 8) & 0xff,					\
-   (c) & 0xff, ((c) >> 8) & 0xff,					\
-   (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
-
-#define NULL_UUID_LE							\
-	UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00,	\
-	     0x00, 0x00, 0x00, 0x00)
-
-#endif /* _UAPI_LINUX_UUID_H_ */
+#include <linux/mei_uuid.h>
-- 
cgit v1.2.3


From f7515d9fe8fc4b80754cd4d98a5fcaee84adeebb Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:07 -0800
Subject: objtool: Add objtool_types.h

Reduce the amount of header sync churn by splitting the shared objtool.h
types into a new file.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/dec622720851210ceafa12d4f4c5f9e73c832152.1677683419.git.jpoimboe@kernel.org
---
 MAINTAINERS                         |   2 +-
 include/linux/objtool.h             |  42 +-------
 include/linux/objtool_types.h       |  48 +++++++++
 tools/include/linux/objtool.h       | 200 ------------------------------------
 tools/include/linux/objtool_types.h |  48 +++++++++
 tools/objtool/check.c               |   2 +-
 tools/objtool/orc_dump.c            |   2 +-
 tools/objtool/orc_gen.c             |   2 +-
 tools/objtool/sync-check.sh         |   2 +-
 9 files changed, 102 insertions(+), 246 deletions(-)
 create mode 100644 include/linux/objtool_types.h
 delete mode 100644 tools/include/linux/objtool.h
 create mode 100644 tools/include/linux/objtool_types.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d5bc223f305..36e9e642c05e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15090,8 +15090,8 @@ OBJTOOL
 M:	Josh Poimboeuf <jpoimboe@kernel.org>
 M:	Peter Zijlstra <peterz@infradead.org>
 S:	Supported
+F:	include/linux/objtool*.h
 F:	tools/objtool/
-F:	include/linux/objtool.h
 
 OCELOT ETHERNET SWITCH DRIVER
 M:	Vladimir Oltean <vladimir.oltean@nxp.com>
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 9ac3df3fccf0..8375792acfc0 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -2,47 +2,7 @@
 #ifndef _LINUX_OBJTOOL_H
 #define _LINUX_OBJTOOL_H
 
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-/*
- * This struct is used by asm and inline asm code to manually annotate the
- * location of registers on the stack.
- */
-struct unwind_hint {
-	u32		ip;
-	s16		sp_offset;
-	u8		sp_reg;
-	u8		type;
-	u8		signal;
-	u8		end;
-};
-#endif
-
-/*
- * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
- * (the caller's SP right before it made the call).  Used for all callable
- * functions, i.e. all C code and all callable asm functions.
- *
- * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
- * points to a fully populated pt_regs from a syscall, interrupt, or exception.
- *
- * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
- * sp_reg+sp_offset points to the iret return frame.
- *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
- * Useful for code which doesn't have an ELF function annotation.
- *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
- */
-#define UNWIND_HINT_TYPE_CALL		0
-#define UNWIND_HINT_TYPE_REGS		1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_ENTRY		4
-#define UNWIND_HINT_TYPE_SAVE		5
-#define UNWIND_HINT_TYPE_RESTORE	6
+#include <linux/objtool_types.h>
 
 #ifdef CONFIG_OBJTOOL
 
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
new file mode 100644
index 000000000000..8513537a30ed
--- /dev/null
+++ b/include/linux/objtool_types.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_OBJTOOL_TYPES_H
+#define _LINUX_OBJTOOL_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack.
+ */
+struct unwind_hint {
+	u32		ip;
+	s16		sp_offset;
+	u8		sp_reg;
+	u8		type;
+	u8		signal;
+	u8		end;
+};
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
+ * (the caller's SP right before it made the call).  Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
+ * points to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
+ * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
+ */
+#define UNWIND_HINT_TYPE_CALL		0
+#define UNWIND_HINT_TYPE_REGS		1
+#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+#define UNWIND_HINT_TYPE_FUNC		3
+#define UNWIND_HINT_TYPE_ENTRY		4
+#define UNWIND_HINT_TYPE_SAVE		5
+#define UNWIND_HINT_TYPE_RESTORE	6
+
+#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
deleted file mode 100644
index 9ac3df3fccf0..000000000000
--- a/tools/include/linux/objtool.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_OBJTOOL_H
-#define _LINUX_OBJTOOL_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-/*
- * This struct is used by asm and inline asm code to manually annotate the
- * location of registers on the stack.
- */
-struct unwind_hint {
-	u32		ip;
-	s16		sp_offset;
-	u8		sp_reg;
-	u8		type;
-	u8		signal;
-	u8		end;
-};
-#endif
-
-/*
- * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
- * (the caller's SP right before it made the call).  Used for all callable
- * functions, i.e. all C code and all callable asm functions.
- *
- * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
- * points to a fully populated pt_regs from a syscall, interrupt, or exception.
- *
- * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
- * sp_reg+sp_offset points to the iret return frame.
- *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
- * Useful for code which doesn't have an ELF function annotation.
- *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
- */
-#define UNWIND_HINT_TYPE_CALL		0
-#define UNWIND_HINT_TYPE_REGS		1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_ENTRY		4
-#define UNWIND_HINT_TYPE_SAVE		5
-#define UNWIND_HINT_TYPE_RESTORE	6
-
-#ifdef CONFIG_OBJTOOL
-
-#include <asm/asm.h>
-
-#ifndef __ASSEMBLY__
-
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
-	"987: \n\t"						\
-	".pushsection .discard.unwind_hints\n\t"		\
-	/* struct unwind_hint */				\
-	".long 987b - .\n\t"					\
-	".short " __stringify(sp_offset) "\n\t"			\
-	".byte " __stringify(sp_reg) "\n\t"			\
-	".byte " __stringify(type) "\n\t"			\
-	".byte " __stringify(signal) "\n\t"			\
-	".byte " __stringify(end) "\n\t"			\
-	".balign 4 \n\t"					\
-	".popsection\n\t"
-
-/*
- * This macro marks the given function's stack frame as "non-standard", which
- * tells objtool to ignore the function when doing stack metadata validation.
- * It should only be used in special cases where you're 100% sure it won't
- * affect the reliability of frame pointers and kernel stack traces.
- *
- * For more information, see tools/objtool/Documentation/objtool.txt.
- */
-#define STACK_FRAME_NON_STANDARD(func) \
-	static void __used __section(".discard.func_stack_frame_non_standard") \
-		*__func_stack_frame_non_standard_##func = func
-
-/*
- * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore
- * for the case where a function is intentionally missing frame pointer setup,
- * but otherwise needs objtool/ORC coverage when frame pointers are disabled.
- */
-#ifdef CONFIG_FRAME_POINTER
-#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func)
-#else
-#define STACK_FRAME_NON_STANDARD_FP(func)
-#endif
-
-#define ANNOTATE_NOENDBR					\
-	"986: \n\t"						\
-	".pushsection .discard.noendbr\n\t"			\
-	_ASM_PTR " 986b\n\t"					\
-	".popsection\n\t"
-
-#define ASM_REACHABLE							\
-	"998:\n\t"							\
-	".pushsection .discard.reachable\n\t"				\
-	".long 998b - .\n\t"						\
-	".popsection\n\t"
-
-#else /* __ASSEMBLY__ */
-
-/*
- * This macro indicates that the following intra-function call is valid.
- * Any non-annotated intra-function call will cause objtool to issue a warning.
- */
-#define ANNOTATE_INTRA_FUNCTION_CALL				\
-	999:							\
-	.pushsection .discard.intra_function_calls;		\
-	.long 999b;						\
-	.popsection;
-
-/*
- * In asm, there are two kinds of code: normal C-type callable functions and
- * the rest.  The normal callable functions can be called by other code, and
- * don't do anything unusual with the stack.  Such normal callable functions
- * are annotated with the ENTRY/ENDPROC macros.  Most asm code falls in this
- * category.  In this case, no special debugging annotations are needed because
- * objtool can automatically generate the ORC data for the ORC unwinder to read
- * at runtime.
- *
- * Anything which doesn't fall into the above category, such as syscall and
- * interrupt handlers, tends to not be called directly by other functions, and
- * often does unusual non-C-function-type things with the stack pointer.  Such
- * code needs to be annotated such that objtool can understand it.  The
- * following CFI hint macros are for this type of code.
- *
- * These macros provide hints to objtool about the state of the stack at each
- * instruction.  Objtool starts from the hints and follows the code flow,
- * making automatic CFI adjustments when it sees pushes and pops, filling out
- * the debuginfo as necessary.  It will also warn if it sees any
- * inconsistencies.
- */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.Lunwind_hint_ip_\@:
-	.pushsection .discard.unwind_hints
-		/* struct unwind_hint */
-		.long .Lunwind_hint_ip_\@ - .
-		.short \sp_offset
-		.byte \sp_reg
-		.byte \type
-		.byte \signal
-		.byte \end
-		.balign 4
-	.popsection
-.endm
-
-.macro STACK_FRAME_NON_STANDARD func:req
-	.pushsection .discard.func_stack_frame_non_standard, "aw"
-	_ASM_PTR \func
-	.popsection
-.endm
-
-.macro STACK_FRAME_NON_STANDARD_FP func:req
-#ifdef CONFIG_FRAME_POINTER
-	STACK_FRAME_NON_STANDARD \func
-#endif
-.endm
-
-.macro ANNOTATE_NOENDBR
-.Lhere_\@:
-	.pushsection .discard.noendbr
-	.quad	.Lhere_\@
-	.popsection
-.endm
-
-.macro REACHABLE
-.Lhere_\@:
-	.pushsection .discard.reachable
-	.long	.Lhere_\@ - .
-	.popsection
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#else /* !CONFIG_OBJTOOL */
-
-#ifndef __ASSEMBLY__
-
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
-	"\n\t"
-#define STACK_FRAME_NON_STANDARD(func)
-#define STACK_FRAME_NON_STANDARD_FP(func)
-#define ANNOTATE_NOENDBR
-#define ASM_REACHABLE
-#else
-#define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.endm
-.macro STACK_FRAME_NON_STANDARD func:req
-.endm
-.macro ANNOTATE_NOENDBR
-.endm
-.macro REACHABLE
-.endm
-#endif
-
-#endif /* CONFIG_OBJTOOL */
-
-#endif /* _LINUX_OBJTOOL_H */
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
new file mode 100644
index 000000000000..8513537a30ed
--- /dev/null
+++ b/tools/include/linux/objtool_types.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_OBJTOOL_TYPES_H
+#define _LINUX_OBJTOOL_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack.
+ */
+struct unwind_hint {
+	u32		ip;
+	s16		sp_offset;
+	u8		sp_reg;
+	u8		type;
+	u8		signal;
+	u8		end;
+};
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
+ * (the caller's SP right before it made the call).  Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
+ * points to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
+ * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
+ */
+#define UNWIND_HINT_TYPE_CALL		0
+#define UNWIND_HINT_TYPE_REGS		1
+#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+#define UNWIND_HINT_TYPE_FUNC		3
+#define UNWIND_HINT_TYPE_ENTRY		4
+#define UNWIND_HINT_TYPE_SAVE		5
+#define UNWIND_HINT_TYPE_RESTORE	6
+
+#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 37c36dc1d868..efc2baa02883 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -17,7 +17,7 @@
 #include <objtool/warn.h>
 #include <objtool/endianness.h>
 
-#include <linux/objtool.h>
+#include <linux/objtool_types.h>
 #include <linux/hashtable.h>
 #include <linux/kernel.h>
 #include <linux/static_call_types.h>
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 2d8ebdcd1db3..9f6c528c26f2 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -4,7 +4,7 @@
  */
 
 #include <unistd.h>
-#include <linux/objtool.h>
+#include <linux/objtool_types.h>
 #include <asm/orc_types.h>
 #include <objtool/objtool.h>
 #include <objtool/warn.h>
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 57a4527d5988..f49630a21e0f 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -6,7 +6,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include <linux/objtool.h>
+#include <linux/objtool_types.h>
 #include <asm/orc_types.h>
 
 #include <objtool/check.h>
diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh
index 105a291ff8e7..81d120d05442 100755
--- a/tools/objtool/sync-check.sh
+++ b/tools/objtool/sync-check.sh
@@ -6,7 +6,7 @@ if [ -z "$SRCARCH" ]; then
 	exit 1
 fi
 
-FILES="include/linux/objtool.h"
+FILES="include/linux/objtool_types.h"
 
 if [ "$SRCARCH" = "x86" ]; then
 FILES="$FILES
-- 
cgit v1.2.3


From 1c0c1faf5692c18c127d044ecc0cc92c7bab3477 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:08 -0800
Subject: objtool: Use relative pointers for annotations

They produce the needed relocations while using half the space.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/bed05c64e28200220c9b1754a2f3ce71f73076ea.1677683419.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/nospec-branch.h |  6 +++---
 include/linux/objtool.h              | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 3ef70e54a858..78ed1546b775 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -194,9 +194,9 @@
  * builds.
  */
 .macro ANNOTATE_RETPOLINE_SAFE
-	.Lannotate_\@:
+.Lhere_\@:
 	.pushsection .discard.retpoline_safe
-	_ASM_PTR .Lannotate_\@
+	.long .Lhere_\@ - .
 	.popsection
 .endm
 
@@ -318,7 +318,7 @@
 #define ANNOTATE_RETPOLINE_SAFE					\
 	"999:\n\t"						\
 	".pushsection .discard.retpoline_safe\n\t"		\
-	_ASM_PTR " 999b\n\t"					\
+	".long 999b - .\n\t"					\
 	".popsection\n\t"
 
 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 8375792acfc0..2b0258d273fc 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -49,7 +49,7 @@
 #define ANNOTATE_NOENDBR					\
 	"986: \n\t"						\
 	".pushsection .discard.noendbr\n\t"			\
-	_ASM_PTR " 986b\n\t"					\
+	".long 986b - .\n\t"					\
 	".popsection\n\t"
 
 #define ASM_REACHABLE							\
@@ -67,7 +67,7 @@
 #define ANNOTATE_INTRA_FUNCTION_CALL				\
 	999:							\
 	.pushsection .discard.intra_function_calls;		\
-	.long 999b;						\
+	.long 999b - .;						\
 	.popsection;
 
 /*
@@ -92,10 +92,10 @@
  * inconsistencies.
  */
 .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.Lunwind_hint_ip_\@:
+.Lhere_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
-		.long .Lunwind_hint_ip_\@ - .
+		.long .Lhere_\@ - .
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
@@ -107,7 +107,7 @@
 
 .macro STACK_FRAME_NON_STANDARD func:req
 	.pushsection .discard.func_stack_frame_non_standard, "aw"
-	_ASM_PTR \func
+	.long \func - .
 	.popsection
 .endm
 
@@ -120,7 +120,7 @@
 .macro ANNOTATE_NOENDBR
 .Lhere_\@:
 	.pushsection .discard.noendbr
-	.quad	.Lhere_\@
+	.long	.Lhere_\@ - .
 	.popsection
 .endm
 
-- 
cgit v1.2.3


From d88ebba45dfe67114a6ac8c6514f2c65b6ed64c7 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:09 -0800
Subject: objtool: Change UNWIND_HINT() argument order

The most important argument is 'type', make that one first.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/d994f8c29376c5618c75698df28fc03b52d3a868.1677683419.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/unwind_hints.h | 2 +-
 include/linux/objtool.h             | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index e7c71750b309..97b392217c0f 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -67,7 +67,7 @@
 #else
 
 #define UNWIND_HINT_FUNC \
-	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0)
+	UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0, 0)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 2b0258d273fc..725d7f0b6748 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -10,7 +10,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -137,8 +137,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
-	"\n\t"
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal, end) "\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
 #define ANNOTATE_NOENDBR
-- 
cgit v1.2.3


From f902cfdd46aedd2afb3e8033223312dbf5fbb675 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:10 -0800
Subject: x86,objtool: Introduce ORC_TYPE_*

Unwind hints and ORC entry types are two distinct things.  Separate them
out more explicitly.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/cc879d38fff8a43f8f7beb2fd56e35a5a384d7cd.1677683419.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/orc_types.h       |  4 ++++
 arch/x86/kernel/unwind_orc.c           | 12 ++++++------
 include/linux/objtool_types.h          |  1 +
 tools/arch/x86/include/asm/orc_types.h |  4 ++++
 tools/include/linux/objtool_types.h    |  1 +
 tools/objtool/orc_dump.c               |  7 +++----
 tools/objtool/orc_gen.c                | 19 +++++++++++++++++--
 7 files changed, 36 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 1343a62106de..b4d4ec78589e 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -39,6 +39,10 @@
 #define ORC_REG_SP_INDIRECT		9
 #define ORC_REG_MAX			15
 
+#define ORC_TYPE_CALL			0
+#define ORC_TYPE_REGS			1
+#define ORC_TYPE_REGS_PARTIAL		2
+
 #ifndef __ASSEMBLY__
 #include <asm/byteorder.h>
 
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 37307b40f8da..8348ac581de3 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -133,7 +133,7 @@ static struct orc_entry null_orc_entry = {
 	.sp_offset = sizeof(long),
 	.sp_reg = ORC_REG_SP,
 	.bp_reg = ORC_REG_UNDEFINED,
-	.type = UNWIND_HINT_TYPE_CALL
+	.type = ORC_TYPE_CALL
 };
 
 #ifdef CONFIG_CALL_THUNKS
@@ -153,7 +153,7 @@ static struct orc_entry *orc_callthunk_find(unsigned long ip)
 
 /* Fake frame pointer entry -- used as a fallback for generated code */
 static struct orc_entry orc_fp_entry = {
-	.type		= UNWIND_HINT_TYPE_CALL,
+	.type		= ORC_TYPE_CALL,
 	.sp_reg		= ORC_REG_BP,
 	.sp_offset	= 16,
 	.bp_reg		= ORC_REG_PREV_SP,
@@ -554,7 +554,7 @@ bool unwind_next_frame(struct unwind_state *state)
 
 	/* Find IP, SP and possibly regs: */
 	switch (orc->type) {
-	case UNWIND_HINT_TYPE_CALL:
+	case ORC_TYPE_CALL:
 		ip_p = sp - sizeof(long);
 
 		if (!deref_stack_reg(state, ip_p, &state->ip))
@@ -567,7 +567,7 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->prev_regs = NULL;
 		break;
 
-	case UNWIND_HINT_TYPE_REGS:
+	case ORC_TYPE_REGS:
 		if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
 			orc_warn_current("can't access registers at %pB\n",
 					 (void *)orig_ip);
@@ -590,13 +590,13 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->full_regs = true;
 		break;
 
-	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+	case ORC_TYPE_REGS_PARTIAL:
 		if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
 			orc_warn_current("can't access iret registers at %pB\n",
 					 (void *)orig_ip);
 			goto err;
 		}
-		/* See UNWIND_HINT_TYPE_REGS case comment. */
+		/* See ORC_TYPE_REGS case comment. */
 		state->ip = unwind_recover_rethook(state, state->ip,
 				(unsigned long *)(state->sp - sizeof(long)));
 
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index 8513537a30ed..9a83468c0039 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -40,6 +40,7 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+/* The below hint types don't have corresponding ORC types */
 #define UNWIND_HINT_TYPE_FUNC		3
 #define UNWIND_HINT_TYPE_ENTRY		4
 #define UNWIND_HINT_TYPE_SAVE		5
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index 1343a62106de..b4d4ec78589e 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -39,6 +39,10 @@
 #define ORC_REG_SP_INDIRECT		9
 #define ORC_REG_MAX			15
 
+#define ORC_TYPE_CALL			0
+#define ORC_TYPE_REGS			1
+#define ORC_TYPE_REGS_PARTIAL		2
+
 #ifndef __ASSEMBLY__
 #include <asm/byteorder.h>
 
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
index 8513537a30ed..9a83468c0039 100644
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -40,6 +40,7 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+/* The below hint types don't have corresponding ORC types */
 #define UNWIND_HINT_TYPE_FUNC		3
 #define UNWIND_HINT_TYPE_ENTRY		4
 #define UNWIND_HINT_TYPE_SAVE		5
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 9f6c528c26f2..97ecbb8b9034 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -4,7 +4,6 @@
  */
 
 #include <unistd.h>
-#include <linux/objtool_types.h>
 #include <asm/orc_types.h>
 #include <objtool/objtool.h>
 #include <objtool/warn.h>
@@ -39,11 +38,11 @@ static const char *reg_name(unsigned int reg)
 static const char *orc_type_name(unsigned int type)
 {
 	switch (type) {
-	case UNWIND_HINT_TYPE_CALL:
+	case ORC_TYPE_CALL:
 		return "call";
-	case UNWIND_HINT_TYPE_REGS:
+	case ORC_TYPE_REGS:
 		return "regs";
-	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+	case ORC_TYPE_REGS_PARTIAL:
 		return "regs (partial)";
 	default:
 		return "?";
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index f49630a21e0f..e85bbb996f6c 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -26,6 +26,22 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
 		return 0;
 	}
 
+	switch (cfi->type) {
+	case UNWIND_HINT_TYPE_CALL:
+		orc->type = ORC_TYPE_CALL;
+		break;
+	case UNWIND_HINT_TYPE_REGS:
+		orc->type = ORC_TYPE_REGS;
+		break;
+	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+		orc->type = ORC_TYPE_REGS_PARTIAL;
+		break;
+	default:
+		WARN_FUNC("unknown unwind hint type %d",
+			  insn->sec, insn->offset, cfi->type);
+		return -1;
+	}
+
 	orc->end = cfi->end;
 	orc->signal = cfi->signal;
 
@@ -83,7 +99,6 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
 
 	orc->sp_offset = cfi->cfa.offset;
 	orc->bp_offset = bp->offset;
-	orc->type = cfi->type;
 
 	return 0;
 }
@@ -151,7 +166,7 @@ int orc_create(struct objtool_file *file)
 	struct orc_entry null = {
 		.sp_reg  = ORC_REG_UNDEFINED,
 		.bp_reg  = ORC_REG_UNDEFINED,
-		.type    = UNWIND_HINT_TYPE_CALL,
+		.type    = ORC_TYPE_CALL,
 	};
 
 	/* Build a deduplicated list of ORC entries: */
-- 
cgit v1.2.3


From 4708ea14bef314fc901857eefd65678236a9f2d9 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:11 -0800
Subject: x86,objtool: Separate unret validation from unwind hints

The ENTRY unwind hint type is serving double duty as both an empty
unwind hint and an unret validation annotation.

Unret validation is unrelated to unwinding. Separate it out into its own
annotation.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/ff7448d492ea21b86d8a90264b105fbd0d751077.1677683419.git.jpoimboe@kernel.org
---
 arch/x86/entry/entry_64.S             | 14 ++++----
 arch/x86/include/asm/nospec-branch.h  |  8 ++---
 arch/x86/include/asm/unwind_hints.h   |  8 ++++-
 arch/x86/kernel/head_64.S             |  5 ---
 include/linux/objtool.h               | 16 +++++++++
 include/linux/objtool_types.h         |  5 ++-
 tools/include/linux/objtool_types.h   |  5 ++-
 tools/objtool/check.c                 | 65 ++++++++++++++++++++++++-----------
 tools/objtool/include/objtool/check.h |  4 +--
 9 files changed, 85 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index eccc3431e515..5b93eb7db0ab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -388,9 +388,9 @@ SYM_CODE_START(\asmsym)
 
 	.if \vector == X86_TRAP_BP
 		/* #BP advances %rip to the next instruction */
-		UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
+		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
 	.else
-		UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
 	.endif
 
 	ENDBR
@@ -461,7 +461,7 @@ SYM_CODE_END(\asmsym)
  */
 .macro idtentry_mce_db vector asmsym cfunc
 SYM_CODE_START(\asmsym)
-	UNWIND_HINT_IRET_REGS
+	UNWIND_HINT_IRET_ENTRY
 	ENDBR
 	ASM_CLAC
 	cld
@@ -518,7 +518,7 @@ SYM_CODE_END(\asmsym)
  */
 .macro idtentry_vc vector asmsym cfunc
 SYM_CODE_START(\asmsym)
-	UNWIND_HINT_IRET_REGS
+	UNWIND_HINT_IRET_ENTRY
 	ENDBR
 	ASM_CLAC
 	cld
@@ -582,7 +582,7 @@ SYM_CODE_END(\asmsym)
  */
 .macro idtentry_df vector asmsym cfunc
 SYM_CODE_START(\asmsym)
-	UNWIND_HINT_IRET_REGS offset=8
+	UNWIND_HINT_IRET_ENTRY offset=8
 	ENDBR
 	ASM_CLAC
 	cld
@@ -1107,7 +1107,7 @@ SYM_CODE_START(error_entry)
 	FENCE_SWAPGS_KERNEL_ENTRY
 	CALL_DEPTH_ACCOUNT
 	leaq	8(%rsp), %rax			/* return pt_regs pointer */
-	ANNOTATE_UNRET_END
+	VALIDATE_UNRET_END
 	RET
 
 .Lbstep_iret:
@@ -1153,7 +1153,7 @@ SYM_CODE_END(error_return)
  *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
 SYM_CODE_START(asm_exc_nmi)
-	UNWIND_HINT_IRET_REGS
+	UNWIND_HINT_IRET_ENTRY
 	ENDBR
 
 	/*
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 78ed1546b775..edb2b0cb8efe 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -210,8 +210,8 @@
  * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
  * eventually turn into it's own annotation.
  */
-.macro ANNOTATE_UNRET_END
-#ifdef CONFIG_DEBUG_ENTRY
+.macro VALIDATE_UNRET_END
+#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
 	ANNOTATE_RETPOLINE_SAFE
 	nop
 #endif
@@ -286,7 +286,7 @@
 .macro UNTRAIN_RET
 #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
 	defined(CONFIG_CALL_DEPTH_TRACKING)
-	ANNOTATE_UNRET_END
+	VALIDATE_UNRET_END
 	ALTERNATIVE_3 "",						\
 		      CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,		\
 		      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,	\
@@ -297,7 +297,7 @@
 .macro UNTRAIN_RET_FROM_CALL
 #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
 	defined(CONFIG_CALL_DEPTH_TRACKING)
-	ANNOTATE_UNRET_END
+	VALIDATE_UNRET_END
 	ALTERNATIVE_3 "",						\
 		      CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,		\
 		      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,	\
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 97b392217c0f..4c0f28d665eb 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -12,7 +12,8 @@
 .endm
 
 .macro UNWIND_HINT_ENTRY
-	UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
+	VALIDATE_UNRET_BEGIN
+	UNWIND_HINT_EMPTY
 .endm
 
 .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
@@ -52,6 +53,11 @@
 	UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
 .endm
 
+.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1
+	VALIDATE_UNRET_BEGIN
+	UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal
+.endm
+
 .macro UNWIND_HINT_FUNC
 	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
 .endm
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..ee3ed15ee1f8 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -390,8 +390,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
 
-	ANNOTATE_UNRET_END
-
 	/* Build pt_regs */
 	PUSH_AND_CLEAR_REGS
 
@@ -451,7 +449,6 @@ SYM_CODE_END(early_idt_handler_array)
 
 SYM_CODE_START_LOCAL(early_idt_handler_common)
 	UNWIND_HINT_IRET_REGS offset=16
-	ANNOTATE_UNRET_END
 	/*
 	 * The stack is the hardware frame, an error code or zero, and the
 	 * vector number.
@@ -501,8 +498,6 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
 
-	ANNOTATE_UNRET_END
-
 	/* Build pt_regs */
 	PUSH_AND_CLEAR_REGS
 
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 725d7f0b6748..5aa475118820 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -124,6 +124,22 @@
 	.popsection
 .endm
 
+/*
+ * Use objtool to validate the entry requirement that all code paths do
+ * VALIDATE_UNRET_END before RET.
+ *
+ * NOTE: The macro must be used at the beginning of a global symbol, otherwise
+ * it will be ignored.
+ */
+.macro VALIDATE_UNRET_BEGIN
+#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+.Lhere_\@:
+	.pushsection .discard.validate_unret
+	.long	.Lhere_\@ - .
+	.popsection
+#endif
+.endm
+
 .macro REACHABLE
 .Lhere_\@:
 	.pushsection .discard.reachable
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index 9a83468c0039..9787ad0f2ef4 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -42,8 +42,7 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 /* The below hint types don't have corresponding ORC types */
 #define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_ENTRY		4
-#define UNWIND_HINT_TYPE_SAVE		5
-#define UNWIND_HINT_TYPE_RESTORE	6
+#define UNWIND_HINT_TYPE_SAVE		4
+#define UNWIND_HINT_TYPE_RESTORE	5
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
index 9a83468c0039..9787ad0f2ef4 100644
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -42,8 +42,7 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 /* The below hint types don't have corresponding ORC types */
 #define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_ENTRY		4
-#define UNWIND_HINT_TYPE_SAVE		5
-#define UNWIND_HINT_TYPE_RESTORE	6
+#define UNWIND_HINT_TYPE_SAVE		4
+#define UNWIND_HINT_TYPE_RESTORE	5
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index efc2baa02883..10be80b3fe67 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2307,16 +2307,9 @@ static int read_unwind_hints(struct objtool_file *file)
 					WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR",
 						  insn->sec, insn->offset);
 				}
-
-				insn->entry = 1;
 			}
 		}
 
-		if (hint->type == UNWIND_HINT_TYPE_ENTRY) {
-			hint->type = UNWIND_HINT_TYPE_CALL;
-			insn->entry = 1;
-		}
-
 		if (hint->type == UNWIND_HINT_TYPE_FUNC) {
 			insn->cfi = &func_cfi;
 			continue;
@@ -2449,6 +2442,34 @@ static int read_instr_hints(struct objtool_file *file)
 	return 0;
 }
 
+static int read_validate_unret_hints(struct objtool_file *file)
+{
+	struct section *sec;
+	struct instruction *insn;
+	struct reloc *reloc;
+
+	sec = find_section_by_name(file->elf, ".rela.discard.validate_unret");
+	if (!sec)
+		return 0;
+
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
+			WARN("unexpected relocation symbol type in %s", sec->name);
+			return -1;
+		}
+
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
+		if (!insn) {
+			WARN("bad .discard.instr_end entry");
+			return -1;
+		}
+		insn->unret = 1;
+	}
+
+	return 0;
+}
+
+
 static int read_intra_function_calls(struct objtool_file *file)
 {
 	struct instruction *insn;
@@ -2667,6 +2688,10 @@ static int decode_sections(struct objtool_file *file)
 	if (ret)
 		return ret;
 
+	ret = read_validate_unret_hints(file);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -3863,10 +3888,10 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
 /*
  * Validate rethunk entry constraint: must untrain RET before the first RET.
  *
- * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes
+ * Follow every branch (intra-function) and ensure VALIDATE_UNRET_END comes
  * before an actual RET instruction.
  */
-static int validate_entry(struct objtool_file *file, struct instruction *insn)
+static int validate_unret(struct objtool_file *file, struct instruction *insn)
 {
 	struct instruction *next, *dest;
 	int ret, warnings = 0;
@@ -3874,10 +3899,10 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 	for (;;) {
 		next = next_insn_to_validate(file, insn);
 
-		if (insn->visited & VISITED_ENTRY)
+		if (insn->visited & VISITED_UNRET)
 			return 0;
 
-		insn->visited |= VISITED_ENTRY;
+		insn->visited |= VISITED_UNRET;
 
 		if (!insn->ignore_alts && insn->alts) {
 			struct alternative *alt;
@@ -3887,7 +3912,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 				if (alt->skip_orig)
 					skip_orig = true;
 
-				ret = validate_entry(file, alt->insn);
+				ret = validate_unret(file, alt->insn);
 				if (ret) {
 				        if (opts.backtrace)
 						BT_FUNC("(alt)", insn);
@@ -3915,7 +3940,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 						  insn->sec, insn->offset);
 					return -1;
 				}
-				ret = validate_entry(file, insn->jump_dest);
+				ret = validate_unret(file, insn->jump_dest);
 				if (ret) {
 					if (opts.backtrace) {
 						BT_FUNC("(branch%s)", insn,
@@ -3940,7 +3965,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 				return -1;
 			}
 
-			ret = validate_entry(file, dest);
+			ret = validate_unret(file, dest);
 			if (ret) {
 				if (opts.backtrace)
 					BT_FUNC("(call)", insn);
@@ -3976,19 +4001,19 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 }
 
 /*
- * Validate that all branches starting at 'insn->entry' encounter UNRET_END
- * before RET.
+ * Validate that all branches starting at VALIDATE_UNRET_BEGIN encounter
+ * VALIDATE_UNRET_END before RET.
  */
-static int validate_unret(struct objtool_file *file)
+static int validate_unrets(struct objtool_file *file)
 {
 	struct instruction *insn;
 	int ret, warnings = 0;
 
 	for_each_insn(file, insn) {
-		if (!insn->entry)
+		if (!insn->unret)
 			continue;
 
-		ret = validate_entry(file, insn);
+		ret = validate_unret(file, insn);
 		if (ret < 0) {
 			WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset);
 			return ret;
@@ -4607,7 +4632,7 @@ int check(struct objtool_file *file)
 		 * Must be after validate_branch() and friends, it plays
 		 * further games with insn->visited.
 		 */
-		ret = validate_unret(file);
+		ret = validate_unrets(file);
 		if (ret < 0)
 			return ret;
 		warnings += ret;
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 3e7c7004f7df..daa46f1f0965 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -61,7 +61,7 @@ struct instruction {
 	    restore		: 1,
 	    retpoline_safe	: 1,
 	    noendbr		: 1,
-	    entry		: 1,
+	    unret		: 1,
 	    visited		: 4,
 	    no_reloc		: 1;
 		/* 10 bit hole */
@@ -92,7 +92,7 @@ static inline struct symbol *insn_func(struct instruction *insn)
 #define VISITED_BRANCH		0x01
 #define VISITED_BRANCH_UACCESS	0x02
 #define VISITED_BRANCH_MASK	0x03
-#define VISITED_ENTRY		0x04
+#define VISITED_UNRET		0x04
 
 static inline bool is_static_jump(struct instruction *insn)
 {
-- 
cgit v1.2.3


From fb799447ae2974a07907906dff5bd4b9e47b7123 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 1 Mar 2023 07:13:12 -0800
Subject: x86,objtool: Split UNWIND_HINT_EMPTY in two

Mark reported that the ORC unwinder incorrectly marks an unwind as
reliable when the unwind terminates prematurely in the dark corners of
return_to_handler() due to lack of information about the next frame.

The problem is UNWIND_HINT_EMPTY is used in two different situations:

  1) The end of the kernel stack unwind before hitting user entry, boot
     code, or fork entry

  2) A blind spot in ORC coverage where the unwinder has to bail due to
     lack of information about the next frame

The ORC unwinder has no way to tell the difference between the two.
When it encounters an undefined stack state with 'end=1', it blindly
marks the stack reliable, which can break the livepatch consistency
model.

Fix it by splitting UNWIND_HINT_EMPTY into UNWIND_HINT_UNDEFINED and
UNWIND_HINT_END_OF_STACK.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/fd6212c8b450d3564b855e1cb48404d6277b4d9f.1677683419.git.jpoimboe@kernel.org
---
 Documentation/livepatch/reliable-stacktrace.rst |  2 +-
 arch/x86/entry/entry_64.S                       | 12 +++++------
 arch/x86/include/asm/orc_types.h                | 14 ++++++-------
 arch/x86/include/asm/unwind_hints.h             | 12 +++++++----
 arch/x86/kernel/ftrace_64.S                     |  2 +-
 arch/x86/kernel/head_64.S                       | 12 +++++------
 arch/x86/kernel/relocate_kernel_64.S            | 10 ++++-----
 arch/x86/kernel/unwind_orc.c                    | 15 ++++++--------
 arch/x86/lib/retpoline.S                        |  6 +++---
 arch/x86/platform/pvh/head.S                    |  2 +-
 arch/x86/xen/xen-asm.S                          |  4 ++--
 arch/x86/xen/xen-head.S                         |  4 ++--
 include/linux/objtool.h                         | 10 ++++-----
 include/linux/objtool_types.h                   | 27 ++++++++++++++++---------
 scripts/sorttable.h                             |  2 +-
 tools/arch/x86/include/asm/orc_types.h          | 14 ++++++-------
 tools/include/linux/objtool_types.h             | 27 ++++++++++++++++---------
 tools/objtool/check.c                           |  2 +-
 tools/objtool/orc_dump.c                        |  8 ++++++--
 tools/objtool/orc_gen.c                         | 26 ++++++++++++------------
 20 files changed, 116 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/Documentation/livepatch/reliable-stacktrace.rst b/Documentation/livepatch/reliable-stacktrace.rst
index 67459d2ca2af..d56bb706172f 100644
--- a/Documentation/livepatch/reliable-stacktrace.rst
+++ b/Documentation/livepatch/reliable-stacktrace.rst
@@ -183,7 +183,7 @@ trampoline or return trampoline. For example, considering the x86_64
 .. code-block:: none
 
    SYM_CODE_START(return_to_handler)
-           UNWIND_HINT_EMPTY
+           UNWIND_HINT_UNDEFINED
            subq  $24, %rsp
 
            /* Save the return values */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5b93eb7db0ab..45e135be2a9f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -205,7 +205,7 @@ syscall_return_via_sysret:
 	 */
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 
 	pushq	RSP-RDI(%rdi)	/* RSP */
 	pushq	(%rdi)		/* RDI */
@@ -286,7 +286,7 @@ SYM_FUNC_END(__switch_to_asm)
 .pushsection .text, "ax"
 	__FUNC_ALIGN
 SYM_CODE_START_NOALIGN(ret_from_fork)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // copy_thread
 	CALL_DEPTH_ACCOUNT
 	movq	%rax, %rdi
@@ -303,7 +303,7 @@ SYM_CODE_START_NOALIGN(ret_from_fork)
 
 1:
 	/* kernel thread */
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	movq	%r12, %rdi
 	CALL_NOSPEC rbx
 	/*
@@ -643,7 +643,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 	 */
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 
 	/* Copy the IRET frame to the trampoline stack. */
 	pushq	6*8(%rdi)	/* SS */
@@ -869,7 +869,7 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
  */
 	__FUNC_ALIGN
 SYM_CODE_START_NOALIGN(xen_failsafe_callback)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ENDBR
 	movl	%ds, %ecx
 	cmpw	%cx, 0x10(%rsp)
@@ -1520,7 +1520,7 @@ SYM_CODE_END(asm_exc_nmi)
  * MSRs to fully disable 32-bit SYSCALL.
  */
 SYM_CODE_START(ignore_sysret)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ENDBR
 	mov	$-ENOSYS, %eax
 	sysretl
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index b4d4ec78589e..46d7e06763c9 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -39,9 +39,11 @@
 #define ORC_REG_SP_INDIRECT		9
 #define ORC_REG_MAX			15
 
-#define ORC_TYPE_CALL			0
-#define ORC_TYPE_REGS			1
-#define ORC_TYPE_REGS_PARTIAL		2
+#define ORC_TYPE_UNDEFINED		0
+#define ORC_TYPE_END_OF_STACK		1
+#define ORC_TYPE_CALL			2
+#define ORC_TYPE_REGS			3
+#define ORC_TYPE_REGS_PARTIAL		4
 
 #ifndef __ASSEMBLY__
 #include <asm/byteorder.h>
@@ -60,16 +62,14 @@ struct orc_entry {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
-	unsigned	type:2;
+	unsigned	type:3;
 	unsigned	signal:1;
-	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
 	unsigned	unused:4;
-	unsigned	end:1;
 	unsigned	signal:1;
-	unsigned	type:2;
+	unsigned	type:3;
 #endif
 } __packed;
 
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 4c0f28d665eb..01cb9692b160 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -7,13 +7,17 @@
 
 #ifdef __ASSEMBLY__
 
-.macro UNWIND_HINT_EMPTY
-	UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1
+.macro UNWIND_HINT_END_OF_STACK
+	UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK
+.endm
+
+.macro UNWIND_HINT_UNDEFINED
+	UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED
 .endm
 
 .macro UNWIND_HINT_ENTRY
 	VALIDATE_UNRET_BEGIN
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 .endm
 
 .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
@@ -73,7 +77,7 @@
 #else
 
 #define UNWIND_HINT_FUNC \
-	UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0, 0)
+	UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 1265ad519249..0387732e9c3f 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -340,7 +340,7 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 SYM_CODE_START(return_to_handler)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 	subq  $16, %rsp
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ee3ed15ee1f8..8d8d25c64827 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -42,7 +42,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	__HEAD
 	.code64
 SYM_CODE_START_NOALIGN(startup_64)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded an identity mapped page table
@@ -105,7 +105,7 @@ SYM_CODE_START_NOALIGN(startup_64)
 	lretq
 
 .Lon_kernel_cs:
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 
 	/* Sanitize CPU configuration */
 	call verify_cpu
@@ -127,7 +127,7 @@ SYM_CODE_START_NOALIGN(startup_64)
 SYM_CODE_END(startup_64)
 
 SYM_CODE_START(secondary_startup_64)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -156,7 +156,7 @@ SYM_CODE_START(secondary_startup_64)
 	 * verify_cpu() above to make sure NX is enabled.
 	 */
 SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR
 
 	/*
@@ -238,7 +238,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	ANNOTATE_RETPOLINE_SAFE
 	jmp	*%rax
 1:
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // above
 
 	/*
@@ -371,7 +371,7 @@ SYM_CODE_END(secondary_startup_64)
  */
 SYM_CODE_START(start_cpu0)
 	ANNOTATE_NOENDBR
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	movq	initial_stack(%rip), %rsp
 	jmp	.Ljump_to_C_code
 SYM_CODE_END(start_cpu0)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4a73351f87f8..56cab1bb25f5 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -43,7 +43,7 @@
 	.code64
 SYM_CODE_START_NOALIGN(relocate_range)
 SYM_CODE_START_NOALIGN(relocate_kernel)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR
 	/*
 	 * %rdi indirection_page
@@ -113,7 +113,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 SYM_CODE_END(relocate_kernel)
 
 SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	/* set return address to 0 if not preserving context */
 	pushq	$0
 	/* store the start address on the stack */
@@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 SYM_CODE_END(identity_mapped)
 
 SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // RET target, above
 	movq	RSP(%r8), %rsp
 	movq	CR4(%r8), %rax
@@ -256,8 +256,8 @@ SYM_CODE_END(virtual_mapped)
 
 	/* Do the copies */
 SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
-	UNWIND_HINT_EMPTY
-	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+	UNWIND_HINT_END_OF_STACK
+	movq	%rdi, %rcx	/* Put the page_list in %rcx */
 	xorl	%edi, %edi
 	xorl	%esi, %esi
 	jmp	1f
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 8348ac581de3..3ac50b7298d1 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -158,7 +158,6 @@ static struct orc_entry orc_fp_entry = {
 	.sp_offset	= 16,
 	.bp_reg		= ORC_REG_PREV_SP,
 	.bp_offset	= -16,
-	.end		= 0,
 };
 
 static struct orc_entry *orc_find(unsigned long ip)
@@ -250,13 +249,13 @@ static int orc_sort_cmp(const void *_a, const void *_b)
 		return -1;
 
 	/*
-	 * The "weak" section terminator entries need to always be on the left
+	 * The "weak" section terminator entries need to always be first
 	 * to ensure the lookup code skips them in favor of real entries.
 	 * These terminator entries exist to handle any gaps created by
 	 * whitelisted .o files which didn't get objtool generation.
 	 */
 	orc_a = cur_orc_table + (a - cur_orc_ip_table);
-	return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+	return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
 }
 
 void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
@@ -474,14 +473,12 @@ bool unwind_next_frame(struct unwind_state *state)
 		 */
 		orc = &orc_fp_entry;
 		state->error = true;
-	}
-
-	/* End-of-stack check for kernel threads: */
-	if (orc->sp_reg == ORC_REG_UNDEFINED) {
-		if (!orc->end)
+	} else {
+		if (orc->type == ORC_TYPE_UNDEFINED)
 			goto err;
 
-		goto the_end;
+		if (orc->type == ORC_TYPE_END_OF_STACK)
+			goto the_end;
 	}
 
 	state->signal = orc->signal;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 5f61c65322be..27ef53fab6bd 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -33,7 +33,7 @@
 
 	.align RETPOLINE_THUNK_SIZE
 SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 
 	ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
@@ -75,7 +75,7 @@ SYM_CODE_END(__x86_indirect_thunk_array)
 	.align RETPOLINE_THUNK_SIZE
 
 SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 
 	CALL_DEPTH_ACCOUNT
@@ -103,7 +103,7 @@ SYM_CODE_END(__x86_indirect_call_thunk_array)
 	.align RETPOLINE_THUNK_SIZE
 
 SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 	POLINE \reg
 	ANNOTATE_UNRET_SAFE
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 7fe564eaf228..c4365a05ab83 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,7 +50,7 @@
 #define PVH_DS_SEL		(PVH_GDT_ENTRY_DS * 8)
 
 SYM_CODE_START_LOCAL(pvh_start_xen)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	cld
 
 	lgdt (_pa(gdt))
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 4a184f6e4e4d..08f1ceb9eb81 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
 SYM_CODE_START(xen_early_idt_handler_array)
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ENDBR
 	pop %rcx
 	pop %r11
@@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
  * rsp->rax		}
  */
 SYM_CODE_START(xen_iret)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 	pushq $0
 	jmp hypercall_iret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index e36ea4268bd2..6eafdd17c242 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,7 +45,7 @@ SYM_CODE_END(hypercall_page)
 #ifdef CONFIG_XEN_PV
 	__INIT
 SYM_CODE_START(startup_xen)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR
 	cld
 
@@ -71,7 +71,7 @@ SYM_CODE_END(startup_xen)
 #ifdef CONFIG_XEN_PV_SMP
 .pushsection .text
 SYM_CODE_START(asm_cpu_bringup_and_idle)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_END_OF_STACK
 	ENDBR
 
 	call cpu_bringup_and_idle
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 5aa475118820..03f82c2c2ebf 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -10,7 +10,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(type, sp_reg, sp_offset, signal, end)	\
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -19,7 +19,6 @@
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
 	".byte " __stringify(signal) "\n\t"			\
-	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
 
@@ -91,7 +90,7 @@
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
 .Lhere_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -100,7 +99,6 @@
 		.byte \sp_reg
 		.byte \type
 		.byte \signal
-		.byte \end
 		.balign 4
 	.popsection
 .endm
@@ -153,14 +151,14 @@
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(type, sp_reg, sp_offset, signal, end) "\n\t"
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
 #define ANNOTATE_NOENDBR
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index 9787ad0f2ef4..453a4f4ef39d 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -16,12 +16,18 @@ struct unwind_hint {
 	u8		sp_reg;
 	u8		type;
 	u8		signal;
-	u8		end;
 };
 
 #endif /* __ASSEMBLY__ */
 
 /*
+ * UNWIND_HINT_TYPE_UNDEFINED: A blind spot in ORC coverage which can result in
+ * a truncated and unreliable stack unwind.
+ *
+ * UNWIND_HINT_TYPE_END_OF_STACK: The end of the kernel stack unwind before
+ * hitting user entry, boot code, or fork entry (when there are no pt_regs
+ * available).
+ *
  * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
  * (the caller's SP right before it made the call).  Used for all callable
  * functions, i.e. all C code and all callable asm functions.
@@ -32,17 +38,20 @@ struct unwind_hint {
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
  *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
  * Useful for code which doesn't have an ELF function annotation.
  *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
+ * UNWIND_HINT_TYPE_{SAVE,RESTORE}: Save the unwind metadata at a certain
+ * location so that it can be restored later.
  */
-#define UNWIND_HINT_TYPE_CALL		0
-#define UNWIND_HINT_TYPE_REGS		1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+#define UNWIND_HINT_TYPE_UNDEFINED	0
+#define UNWIND_HINT_TYPE_END_OF_STACK	1
+#define UNWIND_HINT_TYPE_CALL		2
+#define UNWIND_HINT_TYPE_REGS		3
+#define UNWIND_HINT_TYPE_REGS_PARTIAL	4
 /* The below hint types don't have corresponding ORC types */
-#define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_SAVE		4
-#define UNWIND_HINT_TYPE_RESTORE	5
+#define UNWIND_HINT_TYPE_FUNC		5
+#define UNWIND_HINT_TYPE_SAVE		6
+#define UNWIND_HINT_TYPE_RESTORE	7
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index deb7c1d3e979..7bd0184380d3 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -128,7 +128,7 @@ static int orc_sort_cmp(const void *_a, const void *_b)
 	 * whitelisted .o files which didn't get objtool generation.
 	 */
 	orc_a = g_orc_table + (a - g_orc_ip_table);
-	return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+	return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
 }
 
 static void *sort_orctable(void *arg)
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index b4d4ec78589e..46d7e06763c9 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -39,9 +39,11 @@
 #define ORC_REG_SP_INDIRECT		9
 #define ORC_REG_MAX			15
 
-#define ORC_TYPE_CALL			0
-#define ORC_TYPE_REGS			1
-#define ORC_TYPE_REGS_PARTIAL		2
+#define ORC_TYPE_UNDEFINED		0
+#define ORC_TYPE_END_OF_STACK		1
+#define ORC_TYPE_CALL			2
+#define ORC_TYPE_REGS			3
+#define ORC_TYPE_REGS_PARTIAL		4
 
 #ifndef __ASSEMBLY__
 #include <asm/byteorder.h>
@@ -60,16 +62,14 @@ struct orc_entry {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
-	unsigned	type:2;
+	unsigned	type:3;
 	unsigned	signal:1;
-	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
 	unsigned	unused:4;
-	unsigned	end:1;
 	unsigned	signal:1;
-	unsigned	type:2;
+	unsigned	type:3;
 #endif
 } __packed;
 
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
index 9787ad0f2ef4..453a4f4ef39d 100644
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -16,12 +16,18 @@ struct unwind_hint {
 	u8		sp_reg;
 	u8		type;
 	u8		signal;
-	u8		end;
 };
 
 #endif /* __ASSEMBLY__ */
 
 /*
+ * UNWIND_HINT_TYPE_UNDEFINED: A blind spot in ORC coverage which can result in
+ * a truncated and unreliable stack unwind.
+ *
+ * UNWIND_HINT_TYPE_END_OF_STACK: The end of the kernel stack unwind before
+ * hitting user entry, boot code, or fork entry (when there are no pt_regs
+ * available).
+ *
  * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
  * (the caller's SP right before it made the call).  Used for all callable
  * functions, i.e. all C code and all callable asm functions.
@@ -32,17 +38,20 @@ struct unwind_hint {
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
  *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
  * Useful for code which doesn't have an ELF function annotation.
  *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
+ * UNWIND_HINT_TYPE_{SAVE,RESTORE}: Save the unwind metadata at a certain
+ * location so that it can be restored later.
  */
-#define UNWIND_HINT_TYPE_CALL		0
-#define UNWIND_HINT_TYPE_REGS		1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL	2
+#define UNWIND_HINT_TYPE_UNDEFINED	0
+#define UNWIND_HINT_TYPE_END_OF_STACK	1
+#define UNWIND_HINT_TYPE_CALL		2
+#define UNWIND_HINT_TYPE_REGS		3
+#define UNWIND_HINT_TYPE_REGS_PARTIAL	4
 /* The below hint types don't have corresponding ORC types */
-#define UNWIND_HINT_TYPE_FUNC		3
-#define UNWIND_HINT_TYPE_SAVE		4
-#define UNWIND_HINT_TYPE_RESTORE	5
+#define UNWIND_HINT_TYPE_FUNC		5
+#define UNWIND_HINT_TYPE_SAVE		6
+#define UNWIND_HINT_TYPE_RESTORE	7
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 10be80b3fe67..cae6ac6ff246 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2243,6 +2243,7 @@ static void set_func_state(struct cfi_state *state)
 	memcpy(&state->regs, &initial_func_cfi.regs,
 	       CFI_NUM_REGS * sizeof(struct cfi_reg));
 	state->stack_size = initial_func_cfi.cfa.offset;
+	state->type = UNWIND_HINT_TYPE_CALL;
 }
 
 static int read_unwind_hints(struct objtool_file *file)
@@ -2327,7 +2328,6 @@ static int read_unwind_hints(struct objtool_file *file)
 		cfi.cfa.offset = bswap_if_needed(file->elf, hint->sp_offset);
 		cfi.type = hint->type;
 		cfi.signal = hint->signal;
-		cfi.end = hint->end;
 
 		insn->cfi = cfi_hash_find_or_add(&cfi);
 	}
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 97ecbb8b9034..0e183bb1c720 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -38,6 +38,10 @@ static const char *reg_name(unsigned int reg)
 static const char *orc_type_name(unsigned int type)
 {
 	switch (type) {
+	case ORC_TYPE_UNDEFINED:
+		return "(und)";
+	case ORC_TYPE_END_OF_STACK:
+		return "end";
 	case ORC_TYPE_CALL:
 		return "call";
 	case ORC_TYPE_REGS:
@@ -201,6 +205,7 @@ int orc_dump(const char *_objname)
 			printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i]));
 		}
 
+		printf("type:%s", orc_type_name(orc[i].type));
 
 		printf(" sp:");
 
@@ -210,8 +215,7 @@ int orc_dump(const char *_objname)
 
 		print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
 
-		printf(" type:%s signal:%d end:%d\n",
-		       orc_type_name(orc[i].type), orc[i].signal, orc[i].end);
+		printf(" signal:%d\n", orc[i].signal);
 	}
 
 	elf_end(elf);
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index e85bbb996f6c..b327f9ccfe73 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -21,12 +21,22 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
 	memset(orc, 0, sizeof(*orc));
 
 	if (!cfi) {
-		orc->end = 0;
-		orc->sp_reg = ORC_REG_UNDEFINED;
+		/*
+		 * This is usually either unreachable nops/traps (which don't
+		 * trigger unreachable instruction warnings), or
+		 * STACK_FRAME_NON_STANDARD functions.
+		 */
+		orc->type = ORC_TYPE_UNDEFINED;
 		return 0;
 	}
 
 	switch (cfi->type) {
+	case UNWIND_HINT_TYPE_UNDEFINED:
+		orc->type = ORC_TYPE_UNDEFINED;
+		return 0;
+	case UNWIND_HINT_TYPE_END_OF_STACK:
+		orc->type = ORC_TYPE_END_OF_STACK;
+		return 0;
 	case UNWIND_HINT_TYPE_CALL:
 		orc->type = ORC_TYPE_CALL;
 		break;
@@ -42,14 +52,8 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
 		return -1;
 	}
 
-	orc->end = cfi->end;
 	orc->signal = cfi->signal;
 
-	if (cfi->cfa.base == CFI_UNDEFINED) {
-		orc->sp_reg = ORC_REG_UNDEFINED;
-		return 0;
-	}
-
 	switch (cfi->cfa.base) {
 	case CFI_SP:
 		orc->sp_reg = ORC_REG_SP;
@@ -163,11 +167,7 @@ int orc_create(struct objtool_file *file)
 	struct orc_list_entry *entry;
 	struct list_head orc_list;
 
-	struct orc_entry null = {
-		.sp_reg  = ORC_REG_UNDEFINED,
-		.bp_reg  = ORC_REG_UNDEFINED,
-		.type    = ORC_TYPE_CALL,
-	};
+	struct orc_entry null = { .type = ORC_TYPE_UNDEFINED };
 
 	/* Build a deduplicated list of ORC entries: */
 	INIT_LIST_HEAD(&orc_list);
-- 
cgit v1.2.3


From f530b531fb9e05eb134cfc334242799ea974d111 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@grsecurity.net>
Date: Fri, 17 Feb 2023 20:33:36 +0100
Subject: KVM: Shrink struct kvm_mmu_memory_cache

Move the 'capacity' member around to make use of the padding hole on 64
bit systems instead of introducing yet another one.

This allows us to save 8 bytes per instance for 64 bit builds of which,
e.g., x86's struct kvm_vcpu_arch has a few.

Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Link: https://lore.kernel.org/r/20230217193336.15278-3-minipli@grsecurity.net
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2728d49bbdf6..6f4737d5046a 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -91,11 +91,11 @@ struct gfn_to_pfn_cache {
  * is topped up (__kvm_mmu_topup_memory_cache()).
  */
 struct kvm_mmu_memory_cache {
-	int nobjs;
 	gfp_t gfp_zero;
 	gfp_t gfp_custom;
 	struct kmem_cache *kmem_cache;
 	int capacity;
+	int nobjs;
 	void **objects;
 };
 #endif
-- 
cgit v1.2.3


From 0d57b970df352517a75f4533820c49de360c4123 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 14 Mar 2023 09:17:17 +0100
Subject: security: Remove security_old_inode_init_security()

As the remaining two users reiserfs and ocfs2 switched to
security_inode_init_security(), security_old_inode_init_security() can be
now removed.

Out-of-tree kernel modules should switch to security_inode_init_security()
too.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 12 ------------
 security/security.c      | 11 -----------
 2 files changed, 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 5984d0d550b4..cd23221ce9e6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -336,9 +336,6 @@ int security_inode_init_security(struct inode *inode, struct inode *dir,
 int security_inode_init_security_anon(struct inode *inode,
 				      const struct qstr *name,
 				      const struct inode *context_inode);
-int security_old_inode_init_security(struct inode *inode, struct inode *dir,
-				     const struct qstr *qstr, const char **name,
-				     void **value, size_t *len);
 int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode);
 int security_inode_link(struct dentry *old_dentry, struct inode *dir,
 			 struct dentry *new_dentry);
@@ -778,15 +775,6 @@ static inline int security_inode_init_security_anon(struct inode *inode,
 	return 0;
 }
 
-static inline int security_old_inode_init_security(struct inode *inode,
-						   struct inode *dir,
-						   const struct qstr *qstr,
-						   const char **name,
-						   void **value, size_t *len)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int security_inode_create(struct inode *dir,
 					 struct dentry *dentry,
 					 umode_t mode)
diff --git a/security/security.c b/security/security.c
index b808e1b86551..f4170efcddda 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1655,17 +1655,6 @@ int security_inode_init_security_anon(struct inode *inode,
 			     context_inode);
 }
 
-int security_old_inode_init_security(struct inode *inode, struct inode *dir,
-				     const struct qstr *qstr, const char **name,
-				     void **value, size_t *len)
-{
-	if (unlikely(IS_PRIVATE(inode)))
-		return -EOPNOTSUPP;
-	return call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir,
-			     qstr, name, value, len);
-}
-EXPORT_SYMBOL(security_old_inode_init_security);
-
 #ifdef CONFIG_SECURITY_PATH
 /**
  * security_path_mknod() - Check if creating a special file is allowed
-- 
cgit v1.2.3


From 704bc669e1dda3eb8f6d5cb462b21e85558a3912 Mon Sep 17 00:00:00 2001
From: Jungseung Lee <js07.lee@samsung.com>
Date: Mon, 20 Mar 2023 12:29:05 +0900
Subject: workqueue: Introduce show_freezable_workqueues

Currently show_all_workqueue is called if freeze fails at the time of
freeze the workqueues, which shows the status of all workqueues and of
all worker pools. In this cases we may only need to dump state of only
workqueues that are freezable and busy.

This patch defines show_freezable_workqueues, which uses
show_one_workqueue, a granular function that shows the state of individual
workqueues, so that dump only the state of freezable workqueues
at that time.

tj: Minor message adjustment.

Signed-off-by: Jungseung Lee <js07.lee@samsung.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  1 +
 kernel/power/process.c    |  2 +-
 kernel/workqueue.c        | 26 ++++++++++++++++++++++++--
 3 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ac551b8ee7d9..3992c994787f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -472,6 +472,7 @@ extern unsigned int work_busy(struct work_struct *work);
 extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
 extern void show_all_workqueues(void);
+extern void show_freezable_workqueues(void);
 extern void show_one_workqueue(struct workqueue_struct *wq);
 extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6c1c7e566d35..cae81a87cc91 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -93,7 +93,7 @@ static int try_to_freeze_tasks(bool user_only)
 		       todo - wq_busy, wq_busy);
 
 		if (wq_busy)
-			show_all_workqueues();
+			show_freezable_workqueues();
 
 		if (!wakeup || pm_debug_messages_on) {
 			read_lock(&tasklist_lock);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 044b4eee760b..d78935e4fb5d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5065,8 +5065,7 @@ next_pool:
 /**
  * show_all_workqueues - dump workqueue state
  *
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * Called from a sysrq handler and prints out all busy workqueues and pools.
  */
 void show_all_workqueues(void)
 {
@@ -5087,6 +5086,29 @@ void show_all_workqueues(void)
 	rcu_read_unlock();
 }
 
+/**
+ * show_freezable_workqueues - dump freezable workqueue state
+ *
+ * Called from try_to_freeze_tasks() and prints out all freezable workqueues
+ * still busy.
+ */
+void show_freezable_workqueues(void)
+{
+	struct workqueue_struct *wq;
+
+	rcu_read_lock();
+
+	pr_info("Showing freezable workqueues that are still busy:\n");
+
+	list_for_each_entry_rcu(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_FREEZABLE))
+			continue;
+		show_one_workqueue(wq);
+	}
+
+	rcu_read_unlock();
+}
+
 /* used to show worker information through /proc/PID/{comm,stat,status} */
 void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
 {
-- 
cgit v1.2.3


From 8e4645226b4931e96d55546a1fb3863aa50b5e62 Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Tue, 28 Feb 2023 08:35:37 +0000
Subject: cpuset: Clean up cpuset_node_allowed

Commit 002f290627c2 ("cpuset: use static key better and convert to new API")
has used __cpuset_node_allowed() instead of cpuset_node_allowed() to check
whether we can allocate on a memory node. Now this function isn't used by
anyone, so we can do the follow things to clean up it.

1. remove unused codes
2. rename __cpuset_node_allowed() to cpuset_node_allowed()
3. update comments in mm/page_alloc.c

Suggested-by: Waiman Long <longman@redhat.com>
Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h | 16 ++--------------
 kernel/cgroup/cpuset.c |  4 ++--
 mm/page_alloc.c        |  4 ++--
 3 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d58e0476ee8e..980b76a1237e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -80,18 +80,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
-
-static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
-{
-	if (cpusets_enabled())
-		return __cpuset_node_allowed(node, gfp_mask);
-	return true;
-}
+extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);
 
 static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
 }
 
 static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -223,11 +216,6 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
-{
-	return true;
-}
-
 static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	return true;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 636f1c682ac0..0241b07d6f21 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3831,7 +3831,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /*
- * __cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -3870,7 +3870,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	bool allowed;			/* is allocation in zone z allowed? */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac1fc986af44..ea4e39b06475 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4217,7 +4217,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
-	 * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+	 * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
 	 */
 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
 	z = ac->preferred_zoneref;
@@ -4891,7 +4891,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 		/*
 		 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
 		 * GFP_ATOMIC) rather than fail, see the comment for
-		 * __cpuset_node_allowed().
+		 * cpuset_node_allowed().
 		 */
 		if (alloc_flags & ALLOC_MIN_RESERVE)
 			alloc_flags &= ~ALLOC_CPUSET;
-- 
cgit v1.2.3


From 3eb8eea2a453463f5606ce3e46cf225f88671440 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 21 Mar 2023 22:38:48 -0700
Subject: docs: networking: document NAPI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add basic documentation about NAPI. We can stop linking to the ancient
doc on the LF wiki.

Link: https://lore.kernel.org/all/20230315223044.471002-1-kuba@kernel.org/
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Pavel Pisa <pisa@cmp.felk.cvut.cz> # for ctucanfd-driver.rst
Reviewed-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20230322053848.198452-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../device_drivers/can/ctu/ctucanfd-driver.rst     |   3 +-
 .../device_drivers/ethernet/intel/e100.rst         |   3 +-
 .../device_drivers/ethernet/intel/i40e.rst         |   4 +-
 .../device_drivers/ethernet/intel/ice.rst          |   4 +-
 Documentation/networking/index.rst                 |   1 +
 Documentation/networking/napi.rst                  | 254 +++++++++++++++++++++
 include/linux/netdevice.h                          |  13 +-
 7 files changed, 269 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/networking/napi.rst

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst b/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
index 1a4fc6607582..1661d13174d5 100644
--- a/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
+++ b/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
@@ -229,8 +229,7 @@ frames for a while. This has a potential to avoid the costly round of
 enabling interrupts, handling an incoming IRQ in ISR, re-enabling the
 softirq and switching context back to softirq.
 
-More detailed documentation of NAPI may be found on the pages of Linux
-Foundation `<https://wiki.linuxfoundation.org/networking/napi>`_.
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Integrating the core to Xilinx Zynq
 -----------------------------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/e100.rst b/Documentation/networking/device_drivers/ethernet/intel/e100.rst
index 3d4a9ba21946..371b7e5c3293 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/e100.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/e100.rst
@@ -151,8 +151,7 @@ NAPI
 
 NAPI (Rx polling mode) is supported in the e100 driver.
 
-See https://wiki.linuxfoundation.org/networking/napi for more
-information on NAPI.
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Multiple Interfaces on Same Ethernet Broadcast Network
 ------------------------------------------------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
index ac35bd472bdc..c495c4e16b3b 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
@@ -399,8 +399,8 @@ operate only in full duplex and only at their native speed.
 NAPI
 ----
 NAPI (Rx polling mode) is supported in the i40e driver.
-For more information on NAPI, see
-https://wiki.linuxfoundation.org/networking/napi
+
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Flow Control
 ------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index 5efea4dd1251..2b6dc7880d7b 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -817,10 +817,10 @@ NOTE:
 
 NAPI
 ----
+
 This driver supports NAPI (Rx polling mode).
-For more information on NAPI, see
-https://wiki.linuxfoundation.org/networking/napi
 
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 MACVLAN
 -------
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 4ddcae33c336..24bb256d6d53 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -73,6 +73,7 @@ Contents:
    mpls-sysctl
    mptcp-sysctl
    multiqueue
+   napi
    netconsole
    netdev-features
    netdevices
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
new file mode 100644
index 000000000000..a7a047742e93
--- /dev/null
+++ b/Documentation/networking/napi.rst
@@ -0,0 +1,254 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+.. _napi:
+
+====
+NAPI
+====
+
+NAPI is the event handling mechanism used by the Linux networking stack.
+The name NAPI no longer stands for anything in particular [#]_.
+
+In basic operation the device notifies the host about new events
+via an interrupt.
+The host then schedules a NAPI instance to process the events.
+The device may also be polled for events via NAPI without receiving
+interrupts first (:ref:`busy polling<poll>`).
+
+NAPI processing usually happens in the software interrupt context,
+but there is an option to use :ref:`separate kernel threads<threaded>`
+for NAPI processing.
+
+All in all NAPI abstracts away from the drivers the context and configuration
+of event (packet Rx and Tx) processing.
+
+Driver API
+==========
+
+The two most important elements of NAPI are the struct napi_struct
+and the associated poll method. struct napi_struct holds the state
+of the NAPI instance while the method is the driver-specific event
+handler. The method will typically free Tx packets that have been
+transmitted and process newly received packets.
+
+.. _drv_ctrl:
+
+Control API
+-----------
+
+netif_napi_add() and netif_napi_del() add/remove a NAPI instance
+from the system. The instances are attached to the netdevice passed
+as argument (and will be deleted automatically when netdevice is
+unregistered). Instances are added in a disabled state.
+
+napi_enable() and napi_disable() manage the disabled state.
+A disabled NAPI can't be scheduled and its poll method is guaranteed
+to not be invoked. napi_disable() waits for ownership of the NAPI
+instance to be released.
+
+The control APIs are not idempotent. Control API calls are safe against
+concurrent use of datapath APIs but an incorrect sequence of control API
+calls may result in crashes, deadlocks, or race conditions. For example,
+calling napi_disable() multiple times in a row will deadlock.
+
+Datapath API
+------------
+
+napi_schedule() is the basic method of scheduling a NAPI poll.
+Drivers should call this function in their interrupt handler
+(see :ref:`drv_sched` for more info). A successful call to napi_schedule()
+will take ownership of the NAPI instance.
+
+Later, after NAPI is scheduled, the driver's poll method will be
+called to process the events/packets. The method takes a ``budget``
+argument - drivers can process completions for any number of Tx
+packets but should only process up to ``budget`` number of
+Rx packets. Rx processing is usually much more expensive.
+
+In other words, it is recommended to ignore the budget argument when
+performing TX buffer reclamation to ensure that the reclamation is not
+arbitrarily bounded; however, it is required to honor the budget argument
+for RX processing.
+
+.. warning::
+
+   The ``budget`` argument may be 0 if core tries to only process Tx completions
+   and no Rx packets.
+
+The poll method returns the amount of work done. If the driver still
+has outstanding work to do (e.g. ``budget`` was exhausted)
+the poll method should return exactly ``budget``. In that case,
+the NAPI instance will be serviced/polled again (without the
+need to be scheduled).
+
+If event processing has been completed (all outstanding packets
+processed) the poll method should call napi_complete_done()
+before returning. napi_complete_done() releases the ownership
+of the instance.
+
+.. warning::
+
+   The case of finishing all events and using exactly ``budget``
+   must be handled carefully. There is no way to report this
+   (rare) condition to the stack, so the driver must either
+   not call napi_complete_done() and wait to be called again,
+   or return ``budget - 1``.
+
+   If the ``budget`` is 0 napi_complete_done() should never be called.
+
+Call sequence
+-------------
+
+Drivers should not make assumptions about the exact sequencing
+of calls. The poll method may be called without the driver scheduling
+the instance (unless the instance is disabled). Similarly,
+it's not guaranteed that the poll method will be called, even
+if napi_schedule() succeeded (e.g. if the instance gets disabled).
+
+As mentioned in the :ref:`drv_ctrl` section - napi_disable() and subsequent
+calls to the poll method only wait for the ownership of the instance
+to be released, not for the poll method to exit. This means that
+drivers should avoid accessing any data structures after calling
+napi_complete_done().
+
+.. _drv_sched:
+
+Scheduling and IRQ masking
+--------------------------
+
+Drivers should keep the interrupts masked after scheduling
+the NAPI instance - until NAPI polling finishes any further
+interrupts are unnecessary.
+
+Drivers which have to mask the interrupts explicitly (as opposed
+to IRQ being auto-masked by the device) should use the napi_schedule_prep()
+and __napi_schedule() calls:
+
+.. code-block:: c
+
+  if (napi_schedule_prep(&v->napi)) {
+      mydrv_mask_rxtx_irq(v->idx);
+      /* schedule after masking to avoid races */
+      __napi_schedule(&v->napi);
+  }
+
+IRQ should only be unmasked after a successful call to napi_complete_done():
+
+.. code-block:: c
+
+  if (budget && napi_complete_done(&v->napi, work_done)) {
+    mydrv_unmask_rxtx_irq(v->idx);
+    return min(work_done, budget - 1);
+  }
+
+napi_schedule_irqoff() is a variant of napi_schedule() which takes advantage
+of guarantees given by being invoked in IRQ context (no need to
+mask interrupts). Note that PREEMPT_RT forces all interrupts
+to be threaded so the interrupt may need to be marked ``IRQF_NO_THREAD``
+to avoid issues on real-time kernel configurations.
+
+Instance to queue mapping
+-------------------------
+
+Modern devices have multiple NAPI instances (struct napi_struct) per
+interface. There is no strong requirement on how the instances are
+mapped to queues and interrupts. NAPI is primarily a polling/processing
+abstraction without specific user-facing semantics. That said, most networking
+devices end up using NAPI in fairly similar ways.
+
+NAPI instances most often correspond 1:1:1 to interrupts and queue pairs
+(queue pair is a set of a single Rx and single Tx queue).
+
+In less common cases a NAPI instance may be used for multiple queues
+or Rx and Tx queues can be serviced by separate NAPI instances on a single
+core. Regardless of the queue assignment, however, there is usually still
+a 1:1 mapping between NAPI instances and interrupts.
+
+It's worth noting that the ethtool API uses a "channel" terminology where
+each channel can be either ``rx``, ``tx`` or ``combined``. It's not clear
+what constitutes a channel; the recommended interpretation is to understand
+a channel as an IRQ/NAPI which services queues of a given type. For example,
+a configuration of 1 ``rx``, 1 ``tx`` and 1 ``combined`` channel is expected
+to utilize 3 interrupts, 2 Rx and 2 Tx queues.
+
+User API
+========
+
+User interactions with NAPI depend on NAPI instance ID. The instance IDs
+are only visible to the user thru the ``SO_INCOMING_NAPI_ID`` socket option.
+It's not currently possible to query IDs used by a given device.
+
+Software IRQ coalescing
+-----------------------
+
+NAPI does not perform any explicit event coalescing by default.
+In most scenarios batching happens due to IRQ coalescing which is done
+by the device. There are cases where software coalescing is helpful.
+
+NAPI can be configured to arm a repoll timer instead of unmasking
+the hardware interrupts as soon as all packets are processed.
+The ``gro_flush_timeout`` sysfs configuration of the netdevice
+is reused to control the delay of the timer, while
+``napi_defer_hard_irqs`` controls the number of consecutive empty polls
+before NAPI gives up and goes back to using hardware IRQs.
+
+.. _poll:
+
+Busy polling
+------------
+
+Busy polling allows a user process to check for incoming packets before
+the device interrupt fires. As is the case with any busy polling it trades
+off CPU cycles for lower latency (production uses of NAPI busy polling
+are not well known).
+
+Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
+selected sockets or using the global ``net.core.busy_poll`` and
+``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
+also exists.
+
+IRQ mitigation
+---------------
+
+While busy polling is supposed to be used by low latency applications,
+a similar mechanism can be used for IRQ mitigation.
+
+Very high request-per-second applications (especially routing/forwarding
+applications and especially applications using AF_XDP sockets) may not
+want to be interrupted until they finish processing a request or a batch
+of packets.
+
+Such applications can pledge to the kernel that they will perform a busy
+polling operation periodically, and the driver should keep the device IRQs
+permanently masked. This mode is enabled by using the ``SO_PREFER_BUSY_POLL``
+socket option. To avoid system misbehavior the pledge is revoked
+if ``gro_flush_timeout`` passes without any busy poll call.
+
+The NAPI budget for busy polling is lower than the default (which makes
+sense given the low latency intention of normal busy polling). This is
+not the case with IRQ mitigation, however, so the budget can be adjusted
+with the ``SO_BUSY_POLL_BUDGET`` socket option.
+
+.. _threaded:
+
+Threaded NAPI
+-------------
+
+Threaded NAPI is an operating mode that uses dedicated kernel
+threads rather than software IRQ context for NAPI processing.
+The configuration is per netdevice and will affect all
+NAPI instances of that device. Each NAPI instance will spawn a separate
+thread (called ``napi/${ifc-name}-${napi-id}``).
+
+It is recommended to pin each kernel thread to a single CPU, the same
+CPU as the CPU which services the interrupt. Note that the mapping
+between IRQs and NAPI instances may not be trivial (and is driver
+dependent). The NAPI instance IDs will be assigned in the opposite
+order than the process IDs of the kernel threads.
+
+Threaded NAPI is controlled by writing 0/1 to the ``threaded`` file in
+netdev's sysfs directory.
+
+.. rubric:: Footnotes
+
+.. [#] NAPI was originally referred to as New API in 2.4 Linux.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 674ee5daa7b1..18a5be6ddd0f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -509,15 +509,18 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
- *	napi_complete - NAPI processing complete
- *	@n: NAPI context
+ * napi_complete_done - NAPI processing complete
+ * @n: NAPI context
+ * @work_done: number of packets processed
  *
- * Mark NAPI processing as complete.
- * Consider using napi_complete_done() instead.
+ * Mark NAPI processing as complete. Should only be called if poll budget
+ * has not been completely consumed.
+ * Prefer over napi_complete().
  * Return false if device should avoid rearming interrupts.
  */
+bool napi_complete_done(struct napi_struct *n, int work_done);
+
 static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
-- 
cgit v1.2.3


From 56eb0598c7a30c76009a082d3213486d6a013df0 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 7 Mar 2023 14:35:52 +0000
Subject: trace: Add trace_ipi_send_cpumask()

trace_ipi_raise() is unsuitable for generically tracing IPI sources due to
its "reason" argument being an uninformative string (on arm64 all you get
is "Function call interrupts" for SMP calls).

Add a variant of it that exports a target cpumask, a callsite and a callback.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230307143558.294354-2-vschneid@redhat.com
---
 include/trace/events/ipi.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/ipi.h b/include/trace/events/ipi.h
index 0be71dad6ec0..b1125dc27682 100644
--- a/include/trace/events/ipi.h
+++ b/include/trace/events/ipi.h
@@ -35,6 +35,28 @@ TRACE_EVENT(ipi_raise,
 	TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
 );
 
+TRACE_EVENT(ipi_send_cpumask,
+
+	TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),
+
+	TP_ARGS(cpumask, callsite, callback),
+
+	TP_STRUCT__entry(
+		__cpumask(cpumask)
+		__field(void *, callsite)
+		__field(void *, callback)
+	),
+
+	TP_fast_assign(
+		__assign_cpumask(cpumask, cpumask_bits(cpumask));
+		__entry->callsite = (void *)callsite;
+		__entry->callback = callback;
+	),
+
+	TP_printk("cpumask=%s callsite=%pS callback=%pS",
+		  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
+);
+
 DECLARE_EVENT_CLASS(ipi_handler,
 
 	TP_PROTO(const char *reason),
-- 
cgit v1.2.3


From 4c8c3c7f70a6779d30f5492acbc9978f4636fe7a Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 7 Mar 2023 14:35:56 +0000
Subject: treewide: Trace IPIs sent via smp_send_reschedule()

To be able to trace invocations of smp_send_reschedule(), rename the
arch-specific definitions of it to arch_smp_send_reschedule() and wrap it
into an smp_send_reschedule() that contains a tracepoint.

Changes to include the declaration of the tracepoint were driven by the
following coccinelle script:

  @func_use@
  @@
  smp_send_reschedule(...);

  @include@
  @@
  #include <trace/events/ipi.h>

  @no_include depends on func_use && !include@
  @@
    #include <...>
  +
  + #include <trace/events/ipi.h>

[csky bits]
[riscv bits]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Guo Ren <guoren@kernel.org>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230307143558.294354-6-vschneid@redhat.com
---
 arch/alpha/kernel/smp.c                  |  2 +-
 arch/arc/kernel/smp.c                    |  2 +-
 arch/arm/kernel/smp.c                    |  2 +-
 arch/arm/mach-actions/platsmp.c          |  2 ++
 arch/arm64/kernel/smp.c                  |  2 +-
 arch/csky/kernel/smp.c                   |  2 +-
 arch/hexagon/kernel/smp.c                |  2 +-
 arch/ia64/kernel/smp.c                   |  4 ++--
 arch/loongarch/kernel/smp.c              |  4 ++--
 arch/mips/include/asm/smp.h              |  2 +-
 arch/mips/kernel/rtlx-cmp.c              |  2 ++
 arch/openrisc/kernel/smp.c               |  2 +-
 arch/parisc/kernel/smp.c                 |  4 ++--
 arch/powerpc/kernel/smp.c                |  6 ++++--
 arch/powerpc/kvm/book3s_hv.c             |  3 +++
 arch/powerpc/platforms/powernv/subcore.c |  2 ++
 arch/riscv/kernel/smp.c                  |  4 ++--
 arch/s390/kernel/smp.c                   |  2 +-
 arch/sh/kernel/smp.c                     |  2 +-
 arch/sparc/kernel/smp_32.c               |  2 +-
 arch/sparc/kernel/smp_64.c               |  2 +-
 arch/x86/include/asm/smp.h               |  2 +-
 arch/x86/kvm/svm/svm.c                   |  4 ++++
 arch/x86/kvm/x86.c                       |  2 ++
 arch/xtensa/kernel/smp.c                 |  2 +-
 include/linux/smp.h                      | 11 +++++++++--
 virt/kvm/kvm_main.c                      |  3 +++
 27 files changed, 53 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 0ede4b044e86..7439b2377df5 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -562,7 +562,7 @@ handle_ipi(struct pt_regs *regs)
 }
 
 void
-smp_send_reschedule(int cpu)
+arch_smp_send_reschedule(int cpu)
 {
 #ifdef DEBUG_IPI_MSG
 	if (cpu == hard_smp_processor_id())
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index ad93fe6e4b77..409cfa4675b4 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -292,7 +292,7 @@ static void ipi_send_msg(const struct cpumask *callmap, enum ipi_msg_type msg)
 		ipi_send_msg_one(cpu, msg);
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	ipi_send_msg_one(cpu, IPI_RESCHEDULE);
 }
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 5edf09237b2f..b350bfc9d1f8 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -746,7 +746,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
 	ipi_setup(smp_processor_id());
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/arch/arm/mach-actions/platsmp.c b/arch/arm/mach-actions/platsmp.c
index f26618b43514..7b208e96fbb6 100644
--- a/arch/arm/mach-actions/platsmp.c
+++ b/arch/arm/mach-actions/platsmp.c
@@ -20,6 +20,8 @@
 #include <asm/smp_plat.h>
 #include <asm/smp_scu.h>
 
+#include <trace/events/ipi.h>
+
 #define OWL_CPU1_ADDR	0x50
 #define OWL_CPU1_FLAG	0x5c
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 438c16fc4463..66f2745062dd 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -976,7 +976,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
 	ipi_setup(smp_processor_id());
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index b45d1073307f..be77383acb5f 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -140,7 +140,7 @@ void smp_send_stop(void)
 	on_each_cpu(ipi_stop, NULL, 1);
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 4ba93e59370c..4e8bee25b8c6 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -217,7 +217,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	}
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	send_ipi(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index e2cc59db86bc..ea4f009a232b 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -220,11 +220,11 @@ kdump_smp_send_init(void)
  * Called with preemption disabled.
  */
 void
-smp_send_reschedule (int cpu)
+arch_smp_send_reschedule (int cpu)
 {
 	ia64_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
 }
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
 
 /*
  * Called with preemption disabled.
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 8c6e227cb29d..83225610a148 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -155,11 +155,11 @@ void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action)
  * it goes straight through and wastes no time serializing
  * anything. Worst case is that we lose a reschedule ...
  */
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	loongson_send_ipi_single(cpu, SMP_RESCHEDULE);
 }
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
 
 irqreturn_t loongson_ipi_interrupt(int irq, void *dev)
 {
diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
index 5d9ff61004ca..9806e79895d9 100644
--- a/arch/mips/include/asm/smp.h
+++ b/arch/mips/include/asm/smp.h
@@ -66,7 +66,7 @@ extern void calculate_cpu_foreign_map(void);
  * it goes straight through and wastes no time serializing
  * anything. Worst case is that we lose a reschedule ...
  */
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
 {
 	extern const struct plat_smp_ops *mp_ops;	/* private */
 
diff --git a/arch/mips/kernel/rtlx-cmp.c b/arch/mips/kernel/rtlx-cmp.c
index d26dcc4b46e7..e991cc936c1c 100644
--- a/arch/mips/kernel/rtlx-cmp.c
+++ b/arch/mips/kernel/rtlx-cmp.c
@@ -17,6 +17,8 @@
 #include <asm/vpe.h>
 #include <asm/rtlx.h>
 
+#include <trace/events/ipi.h>
+
 static int major;
 
 static void rtlx_interrupt(void)
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index e1419095a6f0..0a7a059e2dff 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -173,7 +173,7 @@ void handle_IPI(unsigned int ipi_msg)
 	}
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 7dbd92cafae3..b7fc859fa87d 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -246,8 +246,8 @@ void kgdb_roundup_cpus(void)
 inline void 
 smp_send_stop(void)	{ send_IPI_allbutself(IPI_CPU_STOP); }
 
-void 
-smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); }
+void
+arch_smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); }
 
 void
 smp_send_all_nop(void)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6b90f10a6c81..35f101ccb540 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -61,6 +61,8 @@
 #include <asm/kup.h>
 #include <asm/fadump.h>
 
+#include <trace/events/ipi.h>
+
 #ifdef DEBUG
 #include <asm/udbg.h>
 #define DBG(fmt...) udbg_printf(fmt)
@@ -364,12 +366,12 @@ static inline void do_message_pass(int cpu, int msg)
 #endif
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	if (likely(smp_ops))
 		do_message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6ba68dd6190b..3b70b5f80bd5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -43,6 +43,7 @@
 #include <linux/compiler.h>
 #include <linux/of.h>
 #include <linux/irqdomain.h>
+#include <linux/smp.h>
 
 #include <asm/ftrace.h>
 #include <asm/reg.h>
@@ -80,6 +81,8 @@
 #include <asm/dtl.h>
 #include <asm/plpar_wrappers.h>
 
+#include <trace/events/ipi.h>
+
 #include "book3s.h"
 #include "book3s_hv.h"
 
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 7e98b00ea2e8..c53c4c797768 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -20,6 +20,8 @@
 #include <asm/opal.h>
 #include <asm/smp.h>
 
+#include <trace/events/ipi.h>
+
 #include "subcore.h"
 #include "powernv.h"
 
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 8c3b59f1f9b8..42e9656a1db2 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -328,8 +328,8 @@ bool smp_crash_stop_failed(void)
 }
 #endif
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	send_ipi_single(cpu, IPI_RESCHEDULE);
 }
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index d4888453bbf8..a710319f97e9 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -553,7 +553,7 @@ void arch_send_call_function_single_ipi(int cpu)
  * it goes straight through and wastes no time serializing
  * anything. Worst case is that we lose a reschedule ...
  */
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
 }
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 65924d9ec245..5cf35a774dc7 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -256,7 +256,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	       (bogosum / (5000/HZ)) % 100);
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	mp_ops->send_ipi(cpu, SMP_MSG_RESCHEDULE);
 }
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index ad8094d955eb..87eaa7719fa2 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -120,7 +120,7 @@ void cpu_panic(void)
 
 struct linux_prom_registers smp_penguin_ctable = { 0 };
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	/*
 	 * CPU model dependent way of implementing IPI generation targeting
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a55295d1b924..e5964d1d8b37 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1430,7 +1430,7 @@ static unsigned long send_cpu_poke(int cpu)
 	return hv_err;
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	if (cpu == smp_processor_id()) {
 		WARN_ON_ONCE(preemptible());
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..f9757123d8fa 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -98,7 +98,7 @@ static inline void play_dead(void)
 	smp_ops.play_dead();
 }
 
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
 {
 	smp_ops.smp_send_reschedule(cpu);
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 252e7f37e4e2..424fcdba4c78 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/rwsem.h>
 #include <linux/cc_platform.h>
+#include <linux/smp.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -41,6 +42,9 @@
 #include <asm/fpu/api.h>
 
 #include <asm/virtext.h>
+
+#include <trace/events/ipi.h>
+
 #include "trace.h"
 
 #include "svm.h"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7713420abab0..07ba937bdb6f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,7 +60,9 @@
 #include <linux/mem_encrypt.h>
 #include <linux/entry-kvm.h>
 #include <linux/suspend.h>
+#include <linux/smp.h>
 
+#include <trace/events/ipi.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 4dc109dd6214..d95907b8e4d3 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -389,7 +389,7 @@ void arch_send_call_function_single_ipi(int cpu)
 	send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC);
 }
 
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
 {
 	send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
 }
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a80ab58ae3f1..c036a2228d8d 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -125,8 +125,15 @@ extern void smp_send_stop(void);
 /*
  * sends a 'reschedule' event to another CPU:
  */
-extern void smp_send_reschedule(int cpu);
-
+extern void arch_smp_send_reschedule(int cpu);
+/*
+ * scheduler_ipi() is inline so can't be passed as callback reason, but the
+ * callsite IP should be sufficient for root-causing IPIs sent from here.
+ */
+#define smp_send_reschedule(cpu) ({				  \
+	trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, NULL);  \
+	arch_smp_send_reschedule(cpu);				  \
+})
 
 /*
  * Prepare machine for booting other CPUs.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d255964ec331..7d1889629bfe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -62,11 +62,14 @@
 #include "kvm_mm.h"
 #include "vfio.h"
 
+#include <trace/events/ipi.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
 #include <linux/kvm_dirty_ring.h>
 
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
-- 
cgit v1.2.3


From 68e2d17c9eb311ab59aeb6d0c38aad8985fa2596 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2023 11:28:36 +0100
Subject: trace: Add trace_ipi_send_cpu()

Because copying cpumasks around when targeting a single CPU is a bit
daft...

Tested-and-reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230322103004.GA571242%40hirez.programming.kicks-ass.net
---
 include/linux/smp.h        |  6 +++---
 include/trace/events/ipi.h | 22 ++++++++++++++++++++++
 kernel/irq_work.c          |  6 ++----
 kernel/sched/core.c        |  1 +
 kernel/smp.c               |  4 ++--
 5 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index c036a2228d8d..ed8f344ba627 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -130,9 +130,9 @@ extern void arch_smp_send_reschedule(int cpu);
  * scheduler_ipi() is inline so can't be passed as callback reason, but the
  * callsite IP should be sufficient for root-causing IPIs sent from here.
  */
-#define smp_send_reschedule(cpu) ({				  \
-	trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, NULL);  \
-	arch_smp_send_reschedule(cpu);				  \
+#define smp_send_reschedule(cpu) ({		  \
+	trace_ipi_send_cpu(cpu, _RET_IP_, NULL);  \
+	arch_smp_send_reschedule(cpu);		  \
 })
 
 /*
diff --git a/include/trace/events/ipi.h b/include/trace/events/ipi.h
index b1125dc27682..3de9bfc982ce 100644
--- a/include/trace/events/ipi.h
+++ b/include/trace/events/ipi.h
@@ -35,6 +35,28 @@ TRACE_EVENT(ipi_raise,
 	TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
 );
 
+TRACE_EVENT(ipi_send_cpu,
+
+	TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),
+
+	TP_ARGS(cpu, callsite, callback),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, cpu)
+		__field(void *, callsite)
+		__field(void *, callback)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+		__entry->callsite = (void *)callsite;
+		__entry->callback = callback;
+	),
+
+	TP_printk("cpu=%u callsite=%pS callback=%pS",
+		  __entry->cpu, __entry->callsite, __entry->callback)
+);
+
 TRACE_EVENT(ipi_send_cpumask,
 
 	TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c33e88e32a67..2f4fb336dda1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -78,10 +78,8 @@ void __weak arch_irq_work_raise(void)
 
 static __always_inline void irq_work_raise(struct irq_work *work)
 {
-	if (trace_ipi_send_cpumask_enabled() && arch_irq_work_has_interrupt())
-		trace_ipi_send_cpumask(cpumask_of(smp_processor_id()),
-				       _RET_IP_,
-				       work->func);
+	if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+		trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
 
 	arch_irq_work_raise();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b0a48cfc0a22..ad40755ddc11 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -96,6 +96,7 @@
 #include "../../io_uring/io-wq.h"
 #include "../smpboot.h"
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
 EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
 
 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index 37e9613a0889..43f0796ecdb2 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ static __always_inline void
 send_call_function_single_ipi(int cpu, smp_call_func_t func)
 {
 	if (call_function_single_prep_ipi(cpu)) {
-		trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, func);
+		trace_ipi_send_cpu(cpu, _RET_IP_, func);
 		arch_send_call_function_single_ipi(cpu);
 	}
 }
@@ -346,7 +346,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
 	 * even if we haven't sent the smp_call IPI yet (e.g. the stopper
 	 * executes migration_cpu_stop() on the remote CPU).
 	 */
-	if (trace_ipi_send_cpumask_enabled()) {
+	if (trace_ipi_send_cpu_enabled()) {
 		call_single_data_t *csd;
 		smp_call_func_t func;
 
-- 
cgit v1.2.3


From dbbb27e183b1568d5a907ace1cd144b0709ea52a Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Thu, 23 Mar 2023 04:38:00 -0700
Subject: cfg80211: support RNR for EMA AP

As per IEEE Std 802.11ax-2021, 11.1.3.8.3 Discovery of a nontransmitted
BSSID profile, an EMA AP that transmits a Beacon frame carrying a partial
list of nontransmitted BSSID profiles should include in the frame
a Reduced Neighbor Report element carrying information for at least the
nontransmitted BSSIDs that are not present in the Multiple BSSID element
carried in that frame.
Add new nested attribute NL80211_ATTR_EMA_RNR_ELEMS to support the above.
Number of RNR elements must be more than or equal to the number of
MBSSID elements. This attribute can be used only when EMA is enabled.
Userspace is responsible for splitting the RNR into multiple elements such
that each element excludes the non-transmitting profiles already included
in the MBSSID element (%NL80211_ATTR_MBSSID_ELEMS) at the same index.
Each EMA beacon will be generated by adding MBSSID and RNR elements
at the same index. If the userspace provides more RNR elements than the
number of MBSSID elements then these will be added in every EMA beacon.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Link: https://lore.kernel.org/r/20230323113801.6903-2-quic_alokad@quicinc.com
[Johannes: validate elements]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 19 +++++++++++
 include/uapi/linux/nl80211.h | 13 ++++++++
 net/wireless/nl80211.c       | 79 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 104 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 86cb048dc924..3cf236520288 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1178,6 +1178,23 @@ struct cfg80211_mbssid_elems {
 	} elem[];
 };
 
+/**
+ * struct cfg80211_rnr_elems - Reduced neighbor report (RNR) elements
+ *
+ * @cnt: Number of elements in array %elems.
+ *
+ * @elem: Array of RNR element(s) to be added into Beacon frames.
+ * @elem.data: Data for RNR elements.
+ * @elem.len: Length of data.
+ */
+struct cfg80211_rnr_elems {
+	u8 cnt;
+	struct {
+		const u8 *data;
+		size_t len;
+	} elem[];
+};
+
 /**
  * struct cfg80211_beacon_data - beacon data
  * @link_id: the link ID for the AP MLD link sending this beacon
@@ -1198,6 +1215,7 @@ struct cfg80211_mbssid_elems {
  * @probe_resp_len: length of probe response template (@probe_resp)
  * @probe_resp: probe response template (AP mode only)
  * @mbssid_ies: multiple BSSID elements
+ * @rnr_ies: reduced neighbor report elements
  * @ftm_responder: enable FTM responder functionality; -1 for no change
  *	(which also implies no change in LCI/civic location data)
  * @lci: Measurement Report element content, starting with Measurement Token
@@ -1221,6 +1239,7 @@ struct cfg80211_beacon_data {
 	const u8 *lci;
 	const u8 *civicloc;
 	struct cfg80211_mbssid_elems *mbssid_ies;
+	struct cfg80211_rnr_elems *rnr_ies;
 	s8 ftm_responder;
 
 	size_t head_len, tail_len;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 14e958a32b84..cf4fb981e131 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2794,6 +2794,17 @@ enum nl80211_commands {
  * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should
  *	be enabled or not (flag attribute).
  *
+ * @NL80211_ATTR_EMA_RNR_ELEMS: Optional nested attribute for
+ *	reduced neighbor report (RNR) elements. This attribute can be used
+ *	only when NL80211_MBSSID_CONFIG_ATTR_EMA is enabled.
+ *	Userspace is responsible for splitting the RNR into multiple
+ *	elements such that each element excludes the non-transmitting
+ *	profiles already included in the MBSSID element
+ *	(%NL80211_ATTR_MBSSID_ELEMS) at the same index. Each EMA beacon
+ *	will be generated by adding MBSSID and RNR elements at the same
+ *	index. If the userspace includes more RNR elements than number of
+ *	MBSSID elements then these will be added in every EMA beacon.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3328,6 +3339,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
 	NL80211_ATTR_HW_TIMESTAMP_ENABLED,
 
+	NL80211_ATTR_EMA_RNR_ELEMS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0a31b1d2845d..80a20d69f285 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -809,6 +809,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 
 	[NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 },
 	[NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG },
+	[NL80211_ATTR_EMA_RNR_ELEMS] = { .type = NLA_NESTED },
 };
 
 /* policy for the key attributes */
@@ -5425,6 +5426,38 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs)
 	return elems;
 }
 
+static struct cfg80211_rnr_elems *
+nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *nl_elems;
+	struct cfg80211_rnr_elems *elems;
+	int rem_elems;
+	u8 i = 0, num_elems = 0;
+
+	nla_for_each_nested(nl_elems, attrs, rem_elems) {
+		int ret;
+
+		ret = validate_ie_attr(nl_elems, extack);
+		if (ret)
+			return ERR_PTR(ret);
+
+		num_elems++;
+	}
+
+	elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL);
+	if (!elems)
+		return ERR_PTR(-ENOMEM);
+
+	nla_for_each_nested(nl_elems, attrs, rem_elems) {
+		elems->elem[i].data = nla_data(nl_elems);
+		elems->elem[i].len = nla_len(nl_elems);
+		i++;
+	}
+	elems->cnt = num_elems;
+	return elems;
+}
+
 static int nl80211_parse_he_bss_color(struct nlattr *attrs,
 				      struct cfg80211_he_bss_color *he_bss_color)
 {
@@ -5451,7 +5484,8 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs,
 
 static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
 				struct nlattr *attrs[],
-				struct cfg80211_beacon_data *bcn)
+				struct cfg80211_beacon_data *bcn,
+				struct netlink_ext_ack *extack)
 {
 	bool haveinfo = false;
 	int err;
@@ -5548,6 +5582,21 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
 			return PTR_ERR(mbssid);
 
 		bcn->mbssid_ies = mbssid;
+
+		if (bcn->mbssid_ies && attrs[NL80211_ATTR_EMA_RNR_ELEMS]) {
+			struct cfg80211_rnr_elems *rnr =
+				nl80211_parse_rnr_elems(&rdev->wiphy,
+							attrs[NL80211_ATTR_EMA_RNR_ELEMS],
+							extack);
+
+			if (IS_ERR(rnr))
+				return PTR_ERR(rnr);
+
+			if (rnr && rnr->cnt < bcn->mbssid_ies->cnt)
+				return -EINVAL;
+
+			bcn->rnr_ies = rnr;
+		}
 	}
 
 	return 0;
@@ -5866,7 +5915,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (!params)
 		return -ENOMEM;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon,
+				   info->extack);
 	if (err)
 		goto out;
 
@@ -6096,6 +6146,11 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			goto out_unlock;
 	}
 
+	if (!params->mbssid_config.ema && params->beacon.rnr_ies) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	err = nl80211_calculate_ap_params(params);
 	if (err)
 		goto out_unlock;
@@ -6137,6 +6192,7 @@ out:
 	    params->mbssid_config.tx_wdev->netdev &&
 	    params->mbssid_config.tx_wdev->netdev != dev)
 		dev_put(params->mbssid_config.tx_wdev->netdev);
+	kfree(params->beacon.rnr_ies);
 	kfree(params);
 
 	return err;
@@ -6161,7 +6217,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (!wdev->links[link_id].ap.beacon_interval)
 		return -EINVAL;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params, info->extack);
 	if (err)
 		goto out;
 
@@ -6171,6 +6227,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 
 out:
 	kfree(params.mbssid_ies);
+	kfree(params.rnr_ies);
 	return err;
 }
 
@@ -10030,7 +10087,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (!need_new_beacon)
 		goto skip_beacons;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after,
+				   info->extack);
 	if (err)
 		goto free;
 
@@ -10047,7 +10105,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto free;
 
-	err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa);
+	err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa,
+				   info->extack);
 	if (err)
 		goto free;
 
@@ -10167,6 +10226,8 @@ skip_beacons:
 free:
 	kfree(params.beacon_after.mbssid_ies);
 	kfree(params.beacon_csa.mbssid_ies);
+	kfree(params.beacon_after.rnr_ies);
+	kfree(params.beacon_csa.rnr_ies);
 	kfree(csa_attrs);
 	return err;
 }
@@ -15882,7 +15943,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 	params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]);
 	params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]);
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next,
+				   info->extack);
 	if (err)
 		return err;
 
@@ -15896,7 +15958,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto out;
 
-	err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change);
+	err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change,
+				   info->extack);
 	if (err)
 		goto out;
 
@@ -15952,6 +16015,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 out:
 	kfree(params.beacon_next.mbssid_ies);
 	kfree(params.beacon_color_change.mbssid_ies);
+	kfree(params.beacon_next.rnr_ies);
+	kfree(params.beacon_color_change.rnr_ies);
 	kfree(tb);
 	return err;
 }
-- 
cgit v1.2.3


From 083a7e87e1e45506420c96a2fadf2e66da6877e2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 14 Mar 2023 13:02:39 +0800
Subject: crypto: hash - Fix kdoc errors about HASH_ALG_COMMON

The HASH_ALG_COMMON macro cannot be parsed by kdoc so mark it as
a normal comment instead of kdoc.  Also add HASH_ALG_COMMON as a
structure member of shash_alg.

Fixes: 0e4e6d7094df ("crypto: hash - Count error stats differently")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/hash.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 2aa61e7679db..1ed674ba8429 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -41,7 +41,7 @@ struct crypto_istat_hash {
 #define HASH_ALG_COMMON_STAT
 #endif
 
-/**
+/*
  * struct hash_alg_common - define properties of message digest
  * @stat: Statistics for hash algorithm.
  * @digestsize: Size of the result of the transformation. A buffer of this size
@@ -219,6 +219,7 @@ struct shash_desc {
  * @stat: Statistics for hash algorithm.
  * @base: internally used
  * @halg: see struct hash_alg_common
+ * @HASH_ALG_COMMON: see struct hash_alg_common
  */
 struct shash_alg {
 	int (*init)(struct shash_desc *desc);
-- 
cgit v1.2.3


From 5097f84437c9bd50b2c65b5f85395c34b2d545db Mon Sep 17 00:00:00 2001
From: Jaewan Kim <jaewan@google.com>
Date: Wed, 22 Mar 2023 13:16:34 +0000
Subject: wifi: nl80211: make nl80211_send_chandef non-static

Expose nl80211_send_chandef functionality for mac80211_hwsim or vendor
netlink can use it where needed.

Signed-off-by: Jaewan Kim <jaewan@google.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://lore.kernel.org/r/20230322131637.2633968-3-jaewan@google.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 9 +++++++++
 net/wireless/nl80211.c | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3cf236520288..9e04f69712b1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -950,6 +950,15 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 				  const struct cfg80211_chan_def *chandef,
 				  enum nl80211_iftype iftype);
 
+/**
+ * nl80211_send_chandef - sends the channel definition.
+ * @msg: the msg to send channel definition
+ * @chandef: the channel definition to check
+ *
+ * Returns: 0 if sent the channel definition to msg, < 0 on error
+ **/
+int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);
+
 /**
  * ieee80211_chanwidth_rate_flags - return rate flags for channel width
  * @width: the channel width of the channel
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 80a20d69f285..f1cd3d9130dd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3765,8 +3765,7 @@ out:
 	return result;
 }
 
-static int nl80211_send_chandef(struct sk_buff *msg,
-				const struct cfg80211_chan_def *chandef)
+int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef)
 {
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
 		return -EINVAL;
@@ -3797,6 +3796,7 @@ static int nl80211_send_chandef(struct sk_buff *msg,
 		return -ENOBUFS;
 	return 0;
 }
+EXPORT_SYMBOL(nl80211_send_chandef);
 
 static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 			      struct cfg80211_registered_device *rdev,
-- 
cgit v1.2.3


From 0a392354dbc3ff748e0856a75592fe8d0fdc7674 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 24 Mar 2023 09:26:47 +0000
Subject: device property: constify fwnode_get_phy_mode() argument

fwnode_get_phy_mode() does not modify the fwnode argument, merely
using it to obtain the phy-mode property value. Therefore, it can
be made const.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/E1pfdh9-00EQ8t-HB@rmk-PC.armlinux.org.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/property.c  | 2 +-
 include/linux/property.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 3fc25e568598..35b8f3a4e24b 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -935,7 +935,7 @@ EXPORT_SYMBOL_GPL(device_get_dma_attr);
  * 'phy-connection-type', and return its index in phy_modes table, or errno in
  * error case.
  */
-int fwnode_get_phy_mode(struct fwnode_handle *fwnode)
+int fwnode_get_phy_mode(const struct fwnode_handle *fwnode)
 {
 	const char *pm;
 	int err, i;
diff --git a/include/linux/property.h b/include/linux/property.h
index 0a29db15ff34..110634f5db2e 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -396,7 +396,7 @@ enum dev_dma_attr device_get_dma_attr(const struct device *dev);
 const void *device_get_match_data(const struct device *dev);
 
 int device_get_phy_mode(struct device *dev);
-int fwnode_get_phy_mode(struct fwnode_handle *fwnode);
+int fwnode_get_phy_mode(const struct fwnode_handle *fwnode);
 
 void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index);
 
-- 
cgit v1.2.3


From 5b9ff0ba11042096bfb396e506fa9038e6a61de7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 24 Mar 2023 13:27:20 +0200
Subject: device property: Constify a few fwnode APIs

The fwnode parameter is not altered in the following APIs:

- fwnode_get_next_parent_dev()
- fwnode_is_ancestor_of()
- fwnode_graph_get_endpoint_count()

so constify them.

Reported-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230324112720.71315-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/property.c  | 6 +++---
 include/linux/property.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 35b8f3a4e24b..6a908e9c30f8 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(fwnode_get_next_parent);
  *
  * Return: a pointer to the device of the @fwnode's closest ancestor.
  */
-struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode)
+struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode)
 {
 	struct fwnode_handle *parent;
 	struct device *dev;
@@ -718,7 +718,7 @@ EXPORT_SYMBOL_GPL(fwnode_get_nth_parent);
  *
  * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false.
  */
-bool fwnode_is_ancestor_of(struct fwnode_handle *ancestor, struct fwnode_handle *child)
+bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child)
 {
 	struct fwnode_handle *parent;
 
@@ -1235,7 +1235,7 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id);
  * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints
  * and endpoints connected to disabled devices are counted.
  */
-unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode,
+unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode,
 					     unsigned long flags)
 {
 	struct fwnode_handle *ep;
diff --git a/include/linux/property.h b/include/linux/property.h
index 110634f5db2e..4a536548606b 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -105,11 +105,11 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode);
 	for (parent = fwnode_get_parent(fwnode); parent;	\
 	     parent = fwnode_get_next_parent(parent))
 
-struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode);
+struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode);
 unsigned int fwnode_count_parents(const struct fwnode_handle *fwn);
 struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn,
 					    unsigned int depth);
-bool fwnode_is_ancestor_of(struct fwnode_handle *ancestor, struct fwnode_handle *child);
+bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child);
 struct fwnode_handle *fwnode_get_next_child_node(
 	const struct fwnode_handle *fwnode, struct fwnode_handle *child);
 struct fwnode_handle *fwnode_get_next_available_child_node(
@@ -433,7 +433,7 @@ static inline bool fwnode_graph_is_endpoint(const struct fwnode_handle *fwnode)
 struct fwnode_handle *
 fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode,
 				u32 port, u32 endpoint, unsigned long flags);
-unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode,
+unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode,
 					     unsigned long flags);
 
 #define fwnode_graph_for_each_endpoint(fwnode, child)				\
-- 
cgit v1.2.3


From 0146878cf299174ec39cdd2340c19a528794d881 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 23 Mar 2023 07:52:35 +0100
Subject: ALSA: pcm: Improved XRUN handling for indirect PCM helpers

As PCM ack callback may handle the XRUN situation gracefully now,
change the indirect PCM helpers to give a proper error (-EPIPE).
Also, change the pointer callback helpers to deal with the XRUN error
properly, too.

This requires the PCM core change by the commit 8c721c53dda5
("ALSA: usb-audio: Fix recursive locking at XRUN during syncing").

Link: https://lore.kernel.org/r/20230323065237.5062-2-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/pcm-indirect.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/pcm-indirect.h b/include/sound/pcm-indirect.h
index 04127686e8d0..98e06ea73b2b 100644
--- a/include/sound/pcm-indirect.h
+++ b/include/sound/pcm-indirect.h
@@ -44,7 +44,7 @@ snd_pcm_indirect_playback_transfer(struct snd_pcm_substream *substream,
 		if (diff < -(snd_pcm_sframes_t) (runtime->boundary / 2))
 			diff += runtime->boundary;
 		if (diff < 0)
-			return -EINVAL;
+			return -EPIPE;
 		rec->sw_ready += (int)frames_to_bytes(runtime, diff);
 		rec->appl_ptr = appl_ptr;
 	}
@@ -83,6 +83,8 @@ snd_pcm_indirect_playback_pointer(struct snd_pcm_substream *substream,
 				  struct snd_pcm_indirect *rec, unsigned int ptr)
 {
 	int bytes = ptr - rec->hw_io;
+	int err;
+
 	if (bytes < 0)
 		bytes += rec->hw_buffer_size;
 	rec->hw_io = ptr;
@@ -90,8 +92,11 @@ snd_pcm_indirect_playback_pointer(struct snd_pcm_substream *substream,
 	rec->sw_io += bytes;
 	if (rec->sw_io >= rec->sw_buffer_size)
 		rec->sw_io -= rec->sw_buffer_size;
-	if (substream->ops->ack)
-		substream->ops->ack(substream);
+	if (substream->ops->ack) {
+		err = substream->ops->ack(substream);
+		if (err == -EPIPE)
+			return SNDRV_PCM_POS_XRUN;
+	}
 	return bytes_to_frames(substream->runtime, rec->sw_io);
 }
 
@@ -112,7 +117,7 @@ snd_pcm_indirect_capture_transfer(struct snd_pcm_substream *substream,
 		if (diff < -(snd_pcm_sframes_t) (runtime->boundary / 2))
 			diff += runtime->boundary;
 		if (diff < 0)
-			return -EINVAL;
+			return -EPIPE;
 		rec->sw_ready -= frames_to_bytes(runtime, diff);
 		rec->appl_ptr = appl_ptr;
 	}
@@ -152,6 +157,8 @@ snd_pcm_indirect_capture_pointer(struct snd_pcm_substream *substream,
 {
 	int qsize;
 	int bytes = ptr - rec->hw_io;
+	int err;
+
 	if (bytes < 0)
 		bytes += rec->hw_buffer_size;
 	rec->hw_io = ptr;
@@ -162,8 +169,11 @@ snd_pcm_indirect_capture_pointer(struct snd_pcm_substream *substream,
 	rec->sw_io += bytes;
 	if (rec->sw_io >= rec->sw_buffer_size)
 		rec->sw_io -= rec->sw_buffer_size;
-	if (substream->ops->ack)
-		substream->ops->ack(substream);
+	if (substream->ops->ack) {
+		err = substream->ops->ack(substream);
+		if (err == -EPIPE)
+			return SNDRV_PCM_POS_XRUN;
+	}
 	return bytes_to_frames(substream->runtime, rec->sw_io);
 }
 
-- 
cgit v1.2.3


From dcfbb67e48a2becfce7990386e985b9c45098ee5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 11:01:31 +0100
Subject: driver core: class: use lock_class_key already present in struct
 subsys_private

In commit 37e98d9bedb5 ("driver core: bus: move lock_class_key into
dynamic structure"), we moved the lock_class_key into the internal
structure shared by busses and classes, but only used it for buses.

Move the class code to use this structure as it is already present and
being allocated, instead of the statically allocated on-the-stack
variable that class_create() was using as part of a macro wrapper around
the core function call.

Reviewed-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324100132.1633647-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 15 +++++++++------
 include/linux/device/class.h | 36 ++----------------------------------
 2 files changed, 11 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 3d65221b0dcb..dbaeb79ae917 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -154,9 +154,10 @@ static void class_remove_groups(struct class *cls,
 	return sysfs_remove_groups(&cls->p->subsys.kobj, groups);
 }
 
-int __class_register(struct class *cls, struct lock_class_key *key)
+int class_register(struct class *cls)
 {
 	struct subsys_private *cp;
+	struct lock_class_key *key;
 	int error;
 
 	pr_debug("device class '%s': registering\n", cls->name);
@@ -167,6 +168,8 @@ int __class_register(struct class *cls, struct lock_class_key *key)
 	klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put);
 	INIT_LIST_HEAD(&cp->interfaces);
 	kset_init(&cp->glue_dirs);
+	key = &cp->lock_key;
+	lockdep_register_key(key);
 	__mutex_init(&cp->mutex, "subsys mutex", key);
 	error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name);
 	if (error) {
@@ -201,7 +204,7 @@ err_out:
 	cls->p = NULL;
 	return error;
 }
-EXPORT_SYMBOL_GPL(__class_register);
+EXPORT_SYMBOL_GPL(class_register);
 
 void class_unregister(struct class *cls)
 {
@@ -218,7 +221,7 @@ static void class_create_release(struct class *cls)
 }
 
 /**
- * __class_create - create a struct class structure
+ * class_create - create a struct class structure
  * @name: pointer to a string for the name of this class.
  * @key: the lock_class_key for this class; used by mutex lock debugging
  *
@@ -230,7 +233,7 @@ static void class_create_release(struct class *cls)
  * Note, the pointer created here is to be destroyed when finished by
  * making a call to class_destroy().
  */
-struct class *__class_create(const char *name, struct lock_class_key *key)
+struct class *class_create(const char *name)
 {
 	struct class *cls;
 	int retval;
@@ -244,7 +247,7 @@ struct class *__class_create(const char *name, struct lock_class_key *key)
 	cls->name = name;
 	cls->class_release = class_create_release;
 
-	retval = __class_register(cls, key);
+	retval = class_register(cls);
 	if (retval)
 		goto error;
 
@@ -254,7 +257,7 @@ error:
 	kfree(cls);
 	return ERR_PTR(retval);
 }
-EXPORT_SYMBOL_GPL(__class_create);
+EXPORT_SYMBOL_GPL(class_create);
 
 /**
  * class_destroy - destroys a struct class structure
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 75c1451fcc63..03d2f99f84c5 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -82,18 +82,9 @@ struct class_dev_iter {
 
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
-extern int __must_check __class_register(struct class *class,
-					 struct lock_class_key *key);
+extern int __must_check class_register(struct class *class);
 extern void class_unregister(struct class *class);
 
-/* This is a #define to keep the compiler from merging different
- * instances of the __key variable */
-#define class_register(class)			\
-({						\
-	static struct lock_class_key __key;	\
-	__class_register(class, &__key);	\
-})
-
 struct class_compat;
 struct class_compat *class_compat_register(const char *name);
 void class_compat_unregister(struct class_compat *cls);
@@ -246,30 +237,7 @@ struct class_interface {
 extern int __must_check class_interface_register(struct class_interface *);
 extern void class_interface_unregister(struct class_interface *);
 
-extern struct class * __must_check __class_create(const char *name,
-						  struct lock_class_key *key);
+extern struct class * __must_check class_create(const char *name);
 extern void class_destroy(struct class *cls);
 
-/* This is a #define to keep the compiler from merging different
- * instances of the __key variable */
-
-/**
- * class_create - create a struct class structure
- * @name: pointer to a string for the name of this class.
- *
- * This is used to create a struct class pointer that can then be used
- * in calls to device_create().
- *
- * Returns &struct class pointer on success, or ERR_PTR() on error.
- *
- * Note, the pointer created here is to be destroyed when finished by
- * making a call to class_destroy().
- */
-#define class_create(name)			\
-({						\
-	static struct lock_class_key __key;	\
-	__class_create(name, &__key);		\
-})
-
-
 #endif	/* _DEVICE_CLASS_H_ */
-- 
cgit v1.2.3


From 43718dca48429b9eb5b82cdd41e2a1f162a02fb9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 11:01:32 +0100
Subject: driver core: class.h: remove extern from function prototypes

The kernel coding style does not require 'extern' in function prototypes
in .h files, so remove them from include/linux/device/class.h as they
are not needed.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324100132.1633647-2-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/class.h | 46 +++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 03d2f99f84c5..1dc7706cb42d 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -82,8 +82,9 @@ struct class_dev_iter {
 
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
-extern int __must_check class_register(struct class *class);
-extern void class_unregister(struct class *class);
+
+int __must_check class_register(struct class *class);
+void class_unregister(struct class *class);
 
 struct class_compat;
 struct class_compat *class_compat_register(const char *name);
@@ -93,19 +94,15 @@ int class_compat_create_link(struct class_compat *cls, struct device *dev,
 void class_compat_remove_link(struct class_compat *cls, struct device *dev,
 			      struct device *device_link);
 
-extern void class_dev_iter_init(struct class_dev_iter *iter,
-				const struct class *class,
-				const struct device *start,
-				const struct device_type *type);
-extern struct device *class_dev_iter_next(struct class_dev_iter *iter);
-extern void class_dev_iter_exit(struct class_dev_iter *iter);
+void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class,
+			 const struct device *start, const struct device_type *type);
+struct device *class_dev_iter_next(struct class_dev_iter *iter);
+void class_dev_iter_exit(struct class_dev_iter *iter);
 
-extern int class_for_each_device(const struct class *class, const struct device *start,
-				 void *data,
-				 int (*fn)(struct device *dev, void *data));
-extern struct device *class_find_device(const struct class *class,
-					const struct device *start, const void *data,
-					int (*match)(struct device *, const void *));
+int class_for_each_device(const struct class *class, const struct device *start, void *data,
+			  int (*fn)(struct device *dev, void *data));
+struct device *class_find_device(const struct class *class, const struct device *start,
+				 const void *data, int (*match)(struct device *, const void *));
 
 /**
  * class_find_device_by_name - device iterator for locating a particular device
@@ -191,12 +188,10 @@ struct class_attribute {
 #define CLASS_ATTR_WO(_name) \
 	struct class_attribute class_attr_##_name = __ATTR_WO(_name)
 
-extern int __must_check class_create_file_ns(const struct class *class,
-					     const struct class_attribute *attr,
-					     const void *ns);
-extern void class_remove_file_ns(const struct class *class,
-				 const struct class_attribute *attr,
-				 const void *ns);
+int __must_check class_create_file_ns(const struct class *class, const struct class_attribute *attr,
+				      const void *ns);
+void class_remove_file_ns(const struct class *class, const struct class_attribute *attr,
+			  const void *ns);
 
 static inline int __must_check class_create_file(const struct class *class,
 						 const struct class_attribute *attr)
@@ -223,8 +218,7 @@ struct class_attribute_string {
 	struct class_attribute_string class_attr_##_name = \
 		_CLASS_ATTR_STRING(_name, _mode, _str)
 
-extern ssize_t show_class_attr_string(struct class *class, struct class_attribute *attr,
-                        char *buf);
+ssize_t show_class_attr_string(struct class *class, struct class_attribute *attr, char *buf);
 
 struct class_interface {
 	struct list_head	node;
@@ -234,10 +228,10 @@ struct class_interface {
 	void (*remove_dev)	(struct device *, struct class_interface *);
 };
 
-extern int __must_check class_interface_register(struct class_interface *);
-extern void class_interface_unregister(struct class_interface *);
+int __must_check class_interface_register(struct class_interface *);
+void class_interface_unregister(struct class_interface *);
 
-extern struct class * __must_check class_create(const char *name);
-extern void class_destroy(struct class *cls);
+struct class * __must_check class_create(const char *name);
+void class_destroy(struct class *cls);
 
 #endif	/* _DEVICE_CLASS_H_ */
-- 
cgit v1.2.3


From f43243c66e5e9ad839d235f82a58e73a7e7612af Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 13:27:06 +0100
Subject: driver core: device.h: remove extern from function prototypes

The kernel coding style does not require 'extern' in function prototypes
in .h files, so remove them from include/linux/device.h as they are not
needed.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324122711.2664537-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 3b23772d3bbb..472dd24d4823 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1088,8 +1088,7 @@ void device_link_remove(void *consumer, struct device *supplier);
 void device_links_supplier_sync_state_pause(void);
 void device_links_supplier_sync_state_resume(void);
 
-extern __printf(3, 4)
-int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);
+__printf(3, 4) int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);
 
 /* Create alias, so I can be autoloaded. */
 #define MODULE_ALIAS_CHARDEV(major,minor) \
-- 
cgit v1.2.3


From 0d62b79fd8083ae2400f31e38f8d63db3bd59bff Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 13:27:07 +0100
Subject: driver core: bus.h: remove extern from function prototypes

The kernel coding style does not require 'extern' in function prototypes
in .h files, so remove them from include/linux/device/bus.h as they are
not needed.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324122711.2664537-2-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/bus.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index b517b82d4723..65a5e2c0f04d 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -110,11 +110,11 @@ struct bus_type {
 	bool need_parent_lock;
 };
 
-extern int __must_check bus_register(const struct bus_type *bus);
+int __must_check bus_register(const struct bus_type *bus);
 
-extern void bus_unregister(const struct bus_type *bus);
+void bus_unregister(const struct bus_type *bus);
 
-extern int __must_check bus_rescan_devices(const struct bus_type *bus);
+int __must_check bus_rescan_devices(const struct bus_type *bus);
 
 struct bus_attribute {
 	struct attribute	attr;
@@ -244,10 +244,8 @@ void bus_sort_breadthfirst(struct bus_type *bus,
  */
 struct notifier_block;
 
-extern int bus_register_notifier(const struct bus_type *bus,
-				 struct notifier_block *nb);
-extern int bus_unregister_notifier(const struct bus_type *bus,
-				   struct notifier_block *nb);
+int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb);
+int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb);
 
 /**
  * enum bus_notifier_event - Bus Notifier events that have happened
@@ -279,7 +277,7 @@ enum bus_notifier_event {
 	BUS_NOTIFY_DRIVER_NOT_BOUND,
 };
 
-extern struct kset *bus_get_kset(const struct bus_type *bus);
+struct kset *bus_get_kset(const struct bus_type *bus);
 struct device *bus_get_dev_root(const struct bus_type *bus);
 
 #endif
-- 
cgit v1.2.3


From 8a2b9c84c708cf2a99499d5a685a642af7b68c37 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 13:27:08 +0100
Subject: driver core: driver.h: remove extern from function prototypes

The kernel coding style does not require 'extern' in function prototypes
in .h files, so remove them from include/linux/device/driver.h as they
are not needed.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324122711.2664537-3-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 0f22a6f46f8c..c244267a6744 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -122,13 +122,12 @@ struct device_driver {
 };
 
 
-extern int __must_check driver_register(struct device_driver *drv);
-extern void driver_unregister(struct device_driver *drv);
+int __must_check driver_register(struct device_driver *drv);
+void driver_unregister(struct device_driver *drv);
 
-extern struct device_driver *driver_find(const char *name,
-					 const struct bus_type *bus);
-extern int driver_probe_done(void);
-extern void wait_for_device_probe(void);
+struct device_driver *driver_find(const char *name, const struct bus_type *bus);
+int driver_probe_done(void);
+void wait_for_device_probe(void);
 void __init wait_for_init_devices_probe(void);
 
 /* sysfs interface for exporting driver attributes */
@@ -147,18 +146,15 @@ struct driver_attribute {
 #define DRIVER_ATTR_WO(_name) \
 	struct driver_attribute driver_attr_##_name = __ATTR_WO(_name)
 
-extern int __must_check driver_create_file(struct device_driver *driver,
-					const struct driver_attribute *attr);
-extern void driver_remove_file(struct device_driver *driver,
-			       const struct driver_attribute *attr);
+int __must_check driver_create_file(struct device_driver *driver,
+				    const struct driver_attribute *attr);
+void driver_remove_file(struct device_driver *driver,
+			const struct driver_attribute *attr);
 
 int driver_set_override(struct device *dev, const char **override,
 			const char *s, size_t len);
-extern int __must_check driver_for_each_device(struct device_driver *drv,
-					       struct device *start,
-					       void *data,
-					       int (*fn)(struct device *dev,
-							 void *));
+int __must_check driver_for_each_device(struct device_driver *drv, struct device *start,
+					void *data, int (*fn)(struct device *dev, void *));
 struct device *driver_find_device(struct device_driver *drv,
 				  struct device *start, const void *data,
 				  int (*match)(struct device *dev, const void *data));
-- 
cgit v1.2.3


From 44650f33d3bd76dd968cb8880c8eedd465030cd5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 13:27:11 +0100
Subject: kobject.h remove extern from function prototypes

The kernel coding style does not require 'extern' in function prototypes
in .h files, so remove them from include/linux/kobject.h as they are not
needed.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230324122711.2664537-6-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kobject.h | 59 ++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index bdab370a24f4..c392c811d9ad 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -79,43 +79,37 @@ struct kobject {
 	unsigned int uevent_suppress:1;
 };
 
-extern __printf(2, 3)
-int kobject_set_name(struct kobject *kobj, const char *name, ...);
-extern __printf(2, 0)
-int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
-			   va_list vargs);
+__printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...);
+__printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs);
 
 static inline const char *kobject_name(const struct kobject *kobj)
 {
 	return kobj->name;
 }
 
-extern void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
-extern __printf(3, 4) __must_check
-int kobject_add(struct kobject *kobj, struct kobject *parent,
-		const char *fmt, ...);
-extern __printf(4, 5) __must_check
-int kobject_init_and_add(struct kobject *kobj,
-			 const struct kobj_type *ktype, struct kobject *parent,
-			 const char *fmt, ...);
+void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
+__printf(3, 4) __must_check int kobject_add(struct kobject *kobj,
+					    struct kobject *parent,
+					    const char *fmt, ...);
+__printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj,
+						     const struct kobj_type *ktype,
+						     struct kobject *parent,
+						     const char *fmt, ...);
 
-extern void kobject_del(struct kobject *kobj);
+void kobject_del(struct kobject *kobj);
 
-extern struct kobject * __must_check kobject_create_and_add(const char *name,
-						struct kobject *parent);
+struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent);
 
-extern int __must_check kobject_rename(struct kobject *, const char *new_name);
-extern int __must_check kobject_move(struct kobject *, struct kobject *);
+int __must_check kobject_rename(struct kobject *, const char *new_name);
+int __must_check kobject_move(struct kobject *, struct kobject *);
 
-extern struct kobject *kobject_get(struct kobject *kobj);
-extern struct kobject * __must_check kobject_get_unless_zero(
-						struct kobject *kobj);
-extern void kobject_put(struct kobject *kobj);
+struct kobject *kobject_get(struct kobject *kobj);
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj);
+void kobject_put(struct kobject *kobj);
 
-extern const void *kobject_namespace(const struct kobject *kobj);
-extern void kobject_get_ownership(const struct kobject *kobj,
-				  kuid_t *uid, kgid_t *gid);
-extern char *kobject_get_path(const struct kobject *kobj, gfp_t flag);
+const void *kobject_namespace(const struct kobject *kobj);
+void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid);
+char *kobject_get_path(const struct kobject *kobj, gfp_t flag);
 
 struct kobj_type {
 	void (*release)(struct kobject *kobj);
@@ -176,12 +170,11 @@ struct kset {
 	const struct kset_uevent_ops *uevent_ops;
 } __randomize_layout;
 
-extern void kset_init(struct kset *kset);
-extern int __must_check kset_register(struct kset *kset);
-extern void kset_unregister(struct kset *kset);
-extern struct kset * __must_check kset_create_and_add(const char *name,
-						const struct kset_uevent_ops *u,
-						struct kobject *parent_kobj);
+void kset_init(struct kset *kset);
+int __must_check kset_register(struct kset *kset);
+void kset_unregister(struct kset *kset);
+struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u,
+					       struct kobject *parent_kobj);
 
 static inline struct kset *to_kset(struct kobject *kobj)
 {
@@ -203,7 +196,7 @@ static inline const struct kobj_type *get_ktype(const struct kobject *kobj)
 	return kobj->ktype;
 }
 
-extern struct kobject *kset_find_obj(struct kset *, const char *);
+struct kobject *kset_find_obj(struct kset *, const char *);
 
 /* The global /sys/kernel/ kobject for people to chain off of */
 extern struct kobject *kernel_kobj;
-- 
cgit v1.2.3


From b0d237087c674c43df76c1a0bc2737592f3038f4 Mon Sep 17 00:00:00 2001
From: Jun Miao <jun.miao@intel.com>
Date: Thu, 23 Feb 2023 13:28:51 +0800
Subject: KVM: Fix comments that refer to the non-existent
 install_new_memslots()

Fix stale comments that were left behind when install_new_memslots() was
replaced by kvm_swap_active_memslots() as part of the scalable memslots
rework.

Fixes: a54d806688fe ("KVM: Keep memslots in tree-based structures instead of array-based ones")
Signed-off-by: Jun Miao <jun.miao@intel.com>
Link: https://lore.kernel.org/r/20230223052851.1054799-1-jun.miao@intel.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/virt/kvm/locking.rst |  2 +-
 include/linux/kvm_host.h           |  4 ++--
 virt/kvm/kvm_main.c                | 14 +++++++-------
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 14c4e9fa501d..8c77554e4896 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -21,7 +21,7 @@ The acquisition orders for mutexes are as follows:
 - kvm->mn_active_invalidate_count ensures that pairs of
   invalidate_range_start() and invalidate_range_end() callbacks
   use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
-  are taken on the waiting side in install_new_memslots, so MMU notifiers
+  are taken on the waiting side when modifying memslots, so MMU notifiers
   must not take either kvm->slots_lock or kvm->slots_arch_lock.
 
 For SRCU:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 90edc16d37e5..9696c2fb30e9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -58,7 +58,7 @@
 
 /*
  * Bit 63 of the memslot generation number is an "update in-progress flag",
- * e.g. is temporarily set for the duration of install_new_memslots().
+ * e.g. is temporarily set for the duration of kvm_swap_active_memslots().
  * This flag effectively creates a unique generation number that is used to
  * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
  * i.e. may (or may not) have come from the previous memslots generation.
@@ -713,7 +713,7 @@ struct kvm {
 	 * use by the VM. To be used under the slots_lock (above) or in a
 	 * kvm->srcu critical section where acquiring the slots_lock would
 	 * lead to deadlock with the synchronize_srcu in
-	 * install_new_memslots.
+	 * kvm_swap_active_memslots().
 	 */
 	struct mutex slots_arch_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8f0a7da37e32..d1abb331ea68 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1298,7 +1298,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	 * At this point, pending calls to invalidate_range_start()
 	 * have completed but no more MMU notifiers will run, so
 	 * mn_active_invalidate_count may remain unbalanced.
-	 * No threads can be waiting in install_new_memslots as the
+	 * No threads can be waiting in kvm_swap_active_memslots() as the
 	 * last reference on KVM has been dropped, but freeing
 	 * memslots would deadlock without this manual intervention.
 	 */
@@ -1742,13 +1742,13 @@ static void kvm_invalidate_memslot(struct kvm *kvm,
 	kvm_arch_flush_shadow_memslot(kvm, old);
 	kvm_arch_guest_memory_reclaimed(kvm);
 
-	/* Was released by kvm_swap_active_memslots, reacquire. */
+	/* Was released by kvm_swap_active_memslots(), reacquire. */
 	mutex_lock(&kvm->slots_arch_lock);
 
 	/*
 	 * Copy the arch-specific field of the newly-installed slot back to the
 	 * old slot as the arch data could have changed between releasing
-	 * slots_arch_lock in install_new_memslots() and re-acquiring the lock
+	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
 	 * above.  Writers are required to retrieve memslots *after* acquiring
 	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
 	 */
@@ -1810,11 +1810,11 @@ static int kvm_set_memslot(struct kvm *kvm,
 	int r;
 
 	/*
-	 * Released in kvm_swap_active_memslots.
+	 * Released in kvm_swap_active_memslots().
 	 *
-	 * Must be held from before the current memslots are copied until
-	 * after the new memslots are installed with rcu_assign_pointer,
-	 * then released before the synchronize srcu in kvm_swap_active_memslots.
+	 * Must be held from before the current memslots are copied until after
+	 * the new memslots are installed with rcu_assign_pointer, then
+	 * released before the synchronize srcu in kvm_swap_active_memslots().
 	 *
 	 * When modifying memslots outside of the slots_lock, must be held
 	 * before reading the pointer to the current memslots until after all
-- 
cgit v1.2.3


From 340133318800b55784792d762c7713265c96336a Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Fri, 24 Mar 2023 10:21:14 +0100
Subject: selinux: clean up dead code after removing runtime disable

Commit f22f9aaf6c3d ("selinux: remove the runtime disable functionality")
removes the config SECURITY_SELINUX_DISABLE. This results in some dead code
in lsm_hooks.h.

Remove this dead code.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index af87b962f5f7..60cdc83bf8af 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1740,29 +1740,6 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
 		__used __section(".early_lsm_info.init")		\
 		__aligned(sizeof(unsigned long))
 
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-/*
- * Assuring the safety of deleting a security module is up to
- * the security module involved. This may entail ordering the
- * module's hook list in a particular way, refusing to disable
- * the module once a policy is loaded or any number of other
- * actions better imagined than described.
- *
- * The name of the configuration option reflects the only module
- * that currently uses the mechanism. Any developer who thinks
- * disabling their module is a good idea needs to be at least as
- * careful as the SELinux team.
- */
-static inline void security_delete_hooks(struct security_hook_list *hooks,
-						int count)
-{
-	int i;
-
-	for (i = 0; i < count; i++)
-		hlist_del_rcu(&hooks[i].list);
-}
-#endif /* CONFIG_SECURITY_SELINUX_DISABLE */
-
 extern int lsm_inode_alloc(struct inode *inode);
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
-- 
cgit v1.2.3


From 5c8c74ef20e7973c270498dbbf96170c9f92dae3 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 17 Mar 2023 10:59:48 -0600
Subject: scsi: target: uapi: Replace fake flex-array with flexible-array
 member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero-length arrays as fake flexible arrays are deprecated and we are moving
towards adopting C99 flexible-array members instead.

Address the following warning found with GCC-13 and -fstrict-flex-arrays=3
enabled:

CC      drivers/target/target_core_user.o
drivers/target/target_core_user.c: In function ‘queue_cmd_ring’:
drivers/target/target_core_user.c:1096:15: warning: array subscript 0 is outside array bounds of ‘struct iovec[0]’ [-Warray-bounds=]
 1096 |         iov = &entry->req.iov[0];
      |               ^~~~~~~~~~~~~~~~~~
In file included from drivers/target/target_core_user.c:31:
./include/uapi/linux/target_core_user.h:122:38: note: while referencing ‘iov’
  122 |                         struct iovec iov[0];
      |                                      ^~~

This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines
on memcpy() and help us make progress towards globally enabling
-fstrict-flex-arrays=3 [1].

Link: https://github.com/KSPP/linux/issues/21
Link: https://github.com/KSPP/linux/issues/270
Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1]
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/ZBSchMvTdl7VObKI@work
Reviewed-by: Bodo Stroesser <bostroesser@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/uapi/linux/target_core_user.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index fbd8ca67e107..f925a77f19ed 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -119,7 +119,7 @@ struct tcmu_cmd_entry {
 			__u64 cdb_off;
 			__u64 __pad1;
 			__u64 __pad2;
-			struct iovec iov[0];
+			__DECLARE_FLEX_ARRAY(struct iovec, iov);
 		} req;
 		struct {
 			__u8 scsi_status;
-- 
cgit v1.2.3


From becd9be6069e7b183c084f460f0eb363e43cc487 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 18 Mar 2023 20:56:12 -0500
Subject: scsi: target: Move sess cmd counter to new struct

iSCSI needs to wait on outstanding commands like how SRP and the FC/FCoE
drivers do. It can't use target_stop_session() because for MCS support we
can't stop the entire session during recovery because if other connections
are OK then we want to be able to continue to execute I/O on them.

Move the per session cmd counters to a new struct so iSCSI can allocate
them per connection. The xcopy code can also just not allocate in the
future since it doesn't need to track commands.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20230319015620.96006-2-michael.christie@oracle.com
Reviewed-by: Maurizio Lombardi <mlombard@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_tpg.c         |   2 +-
 drivers/target/target_core_transport.c   | 135 ++++++++++++++++++++++---------
 include/target/iscsi/iscsi_target_core.h |   1 +
 include/target/target_core_base.h        |  13 ++-
 4 files changed, 107 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index 736847c933e5..8ebccdbd94f0 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -328,7 +328,7 @@ static void target_shutdown_sessions(struct se_node_acl *acl)
 restart:
 	spin_lock_irqsave(&acl->nacl_sess_lock, flags);
 	list_for_each_entry(sess, &acl->acl_sess_list, sess_acl_list) {
-		if (atomic_read(&sess->stopped))
+		if (sess->cmd_cnt && atomic_read(&sess->cmd_cnt->stopped))
 			continue;
 
 		list_del_init(&sess->sess_acl_list);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 5926316252eb..3d6034f00dcd 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -220,11 +220,49 @@ void transport_subsystem_check_init(void)
 	sub_api_initialized = 1;
 }
 
-static void target_release_sess_cmd_refcnt(struct percpu_ref *ref)
+static void target_release_cmd_refcnt(struct percpu_ref *ref)
 {
-	struct se_session *sess = container_of(ref, typeof(*sess), cmd_count);
+	struct target_cmd_counter *cmd_cnt  = container_of(ref,
+							   typeof(*cmd_cnt),
+							   refcnt);
+	wake_up(&cmd_cnt->refcnt_wq);
+}
+
+static struct target_cmd_counter *target_alloc_cmd_counter(void)
+{
+	struct target_cmd_counter *cmd_cnt;
+	int rc;
+
+	cmd_cnt = kzalloc(sizeof(*cmd_cnt), GFP_KERNEL);
+	if (!cmd_cnt)
+		return NULL;
 
-	wake_up(&sess->cmd_count_wq);
+	init_completion(&cmd_cnt->stop_done);
+	init_waitqueue_head(&cmd_cnt->refcnt_wq);
+	atomic_set(&cmd_cnt->stopped, 0);
+
+	rc = percpu_ref_init(&cmd_cnt->refcnt, target_release_cmd_refcnt, 0,
+			     GFP_KERNEL);
+	if (rc)
+		goto free_cmd_cnt;
+
+	return cmd_cnt;
+
+free_cmd_cnt:
+	kfree(cmd_cnt);
+	return NULL;
+}
+
+static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
+{
+	/*
+	 * Drivers like loop do not call target_stop_session during session
+	 * shutdown so we have to drop the ref taken at init time here.
+	 */
+	if (!atomic_read(&cmd_cnt->stopped))
+		percpu_ref_put(&cmd_cnt->refcnt);
+
+	percpu_ref_exit(&cmd_cnt->refcnt);
 }
 
 /**
@@ -238,25 +276,17 @@ int transport_init_session(struct se_session *se_sess)
 	INIT_LIST_HEAD(&se_sess->sess_list);
 	INIT_LIST_HEAD(&se_sess->sess_acl_list);
 	spin_lock_init(&se_sess->sess_cmd_lock);
-	init_waitqueue_head(&se_sess->cmd_count_wq);
-	init_completion(&se_sess->stop_done);
-	atomic_set(&se_sess->stopped, 0);
-	return percpu_ref_init(&se_sess->cmd_count,
-			       target_release_sess_cmd_refcnt, 0, GFP_KERNEL);
+	se_sess->cmd_cnt = target_alloc_cmd_counter();
+	if (!se_sess->cmd_cnt)
+		return -ENOMEM;
+
+	return  0;
 }
 EXPORT_SYMBOL(transport_init_session);
 
 void transport_uninit_session(struct se_session *se_sess)
 {
-	/*
-	 * Drivers like iscsi and loop do not call target_stop_session
-	 * during session shutdown so we have to drop the ref taken at init
-	 * time here.
-	 */
-	if (!atomic_read(&se_sess->stopped))
-		percpu_ref_put(&se_sess->cmd_count);
-
-	percpu_ref_exit(&se_sess->cmd_count);
+	target_free_cmd_counter(se_sess->cmd_cnt);
 }
 
 /**
@@ -2970,9 +3000,16 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 		se_cmd->se_cmd_flags |= SCF_ACK_KREF;
 	}
 
-	if (!percpu_ref_tryget_live(&se_sess->cmd_count))
-		ret = -ESHUTDOWN;
-
+	/*
+	 * Users like xcopy do not use counters since they never do a stop
+	 * and wait.
+	 */
+	if (se_sess->cmd_cnt) {
+		if (!percpu_ref_tryget_live(&se_sess->cmd_cnt->refcnt))
+			ret = -ESHUTDOWN;
+		else
+			se_cmd->cmd_cnt = se_sess->cmd_cnt;
+	}
 	if (ret && ack_kref)
 		target_put_sess_cmd(se_cmd);
 
@@ -2993,7 +3030,7 @@ static void target_free_cmd_mem(struct se_cmd *cmd)
 static void target_release_cmd_kref(struct kref *kref)
 {
 	struct se_cmd *se_cmd = container_of(kref, struct se_cmd, cmd_kref);
-	struct se_session *se_sess = se_cmd->se_sess;
+	struct target_cmd_counter *cmd_cnt = se_cmd->cmd_cnt;
 	struct completion *free_compl = se_cmd->free_compl;
 	struct completion *abrt_compl = se_cmd->abrt_compl;
 
@@ -3004,7 +3041,8 @@ static void target_release_cmd_kref(struct kref *kref)
 	if (abrt_compl)
 		complete(abrt_compl);
 
-	percpu_ref_put(&se_sess->cmd_count);
+	if (cmd_cnt)
+		percpu_ref_put(&cmd_cnt->refcnt);
 }
 
 /**
@@ -3123,46 +3161,65 @@ void target_show_cmd(const char *pfx, struct se_cmd *cmd)
 }
 EXPORT_SYMBOL(target_show_cmd);
 
-static void target_stop_session_confirm(struct percpu_ref *ref)
+static void target_stop_cmd_counter_confirm(struct percpu_ref *ref)
+{
+	struct target_cmd_counter *cmd_cnt = container_of(ref,
+						struct target_cmd_counter,
+						refcnt);
+	complete_all(&cmd_cnt->stop_done);
+}
+
+/**
+ * target_stop_cmd_counter - Stop new IO from being added to the counter.
+ * @cmd_cnt: counter to stop
+ */
+static void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
-	struct se_session *se_sess = container_of(ref, struct se_session,
-						  cmd_count);
-	complete_all(&se_sess->stop_done);
+	pr_debug("Stopping command counter.\n");
+	if (!atomic_cmpxchg(&cmd_cnt->stopped, 0, 1))
+		percpu_ref_kill_and_confirm(&cmd_cnt->refcnt,
+					    target_stop_cmd_counter_confirm);
 }
 
 /**
  * target_stop_session - Stop new IO from being queued on the session.
- * @se_sess:    session to stop
+ * @se_sess: session to stop
  */
 void target_stop_session(struct se_session *se_sess)
 {
-	pr_debug("Stopping session queue.\n");
-	if (atomic_cmpxchg(&se_sess->stopped, 0, 1) == 0)
-		percpu_ref_kill_and_confirm(&se_sess->cmd_count,
-					    target_stop_session_confirm);
+	target_stop_cmd_counter(se_sess->cmd_cnt);
 }
 EXPORT_SYMBOL(target_stop_session);
 
 /**
- * target_wait_for_sess_cmds - Wait for outstanding commands
- * @se_sess:    session to wait for active I/O
+ * target_wait_for_cmds - Wait for outstanding cmds.
+ * @cmd_cnt: counter to wait for active I/O for.
  */
-void target_wait_for_sess_cmds(struct se_session *se_sess)
+static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 {
 	int ret;
 
-	WARN_ON_ONCE(!atomic_read(&se_sess->stopped));
+	WARN_ON_ONCE(!atomic_read(&cmd_cnt->stopped));
 
 	do {
 		pr_debug("Waiting for running cmds to complete.\n");
-		ret = wait_event_timeout(se_sess->cmd_count_wq,
-				percpu_ref_is_zero(&se_sess->cmd_count),
-				180 * HZ);
+		ret = wait_event_timeout(cmd_cnt->refcnt_wq,
+					 percpu_ref_is_zero(&cmd_cnt->refcnt),
+					 180 * HZ);
 	} while (ret <= 0);
 
-	wait_for_completion(&se_sess->stop_done);
+	wait_for_completion(&cmd_cnt->stop_done);
 	pr_debug("Waiting for cmds done.\n");
 }
+
+/**
+ * target_wait_for_sess_cmds - Wait for outstanding commands
+ * @se_sess: session to wait for active I/O
+ */
+void target_wait_for_sess_cmds(struct se_session *se_sess)
+{
+	target_wait_for_cmds(se_sess->cmd_cnt);
+}
 EXPORT_SYMBOL(target_wait_for_sess_cmds);
 
 /*
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 94d06ddfd80a..229118156a1f 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -600,6 +600,7 @@ struct iscsit_conn {
 	struct iscsi_tpg_np	*tpg_np;
 	/* Pointer to parent session */
 	struct iscsit_session	*sess;
+	struct target_cmd_counter *cmd_cnt;
 	int			bitmap_id;
 	int			rx_thread_active;
 	struct task_struct	*rx_thread;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 12c9ba16217e..bd299790e99c 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -494,6 +494,7 @@ struct se_cmd {
 	struct se_lun		*se_lun;
 	/* Only used for internal passthrough and legacy TCM fabric modules */
 	struct se_session	*se_sess;
+	struct target_cmd_counter *cmd_cnt;
 	struct se_tmr_req	*se_tmr_req;
 	struct llist_node	se_cmd_list;
 	struct completion	*free_compl;
@@ -619,22 +620,26 @@ static inline struct se_node_acl *fabric_stat_to_nacl(struct config_item *item)
 			acl_fabric_stat_group);
 }
 
-struct se_session {
+struct target_cmd_counter {
+	struct percpu_ref	refcnt;
+	wait_queue_head_t	refcnt_wq;
+	struct completion	stop_done;
 	atomic_t		stopped;
+};
+
+struct se_session {
 	u64			sess_bin_isid;
 	enum target_prot_op	sup_prot_ops;
 	enum target_prot_type	sess_prot_type;
 	struct se_node_acl	*se_node_acl;
 	struct se_portal_group *se_tpg;
 	void			*fabric_sess_ptr;
-	struct percpu_ref	cmd_count;
 	struct list_head	sess_list;
 	struct list_head	sess_acl_list;
 	spinlock_t		sess_cmd_lock;
-	wait_queue_head_t	cmd_count_wq;
-	struct completion	stop_done;
 	void			*sess_cmd_map;
 	struct sbitmap_queue	sess_tag_pool;
+	struct target_cmd_counter *cmd_cnt;
 };
 
 struct se_device;
-- 
cgit v1.2.3


From 4edba7e4a8f39112398d3cda94128a8e13a7d527 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 18 Mar 2023 20:56:13 -0500
Subject: scsi: target: Move cmd counter allocation

iSCSI needs to allocate its cmd counter per connection for MCS support
where we need to stop and wait on commands running on a connection instead
of per session. This moves the cmd counter allocation to
target_setup_session() which is used by drivers that need the stop+wait
behavior per session.

xcopy doesn't need stop+wait at all, so we will be OK moving the cmd
counter allocation outside of transport_init_session().

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20230319015620.96006-3-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/iscsi_target_login.c | 10 ++++++
 drivers/target/target_core_internal.h     |  1 -
 drivers/target/target_core_transport.c    | 55 +++++++++++++++----------------
 drivers/target/target_core_xcopy.c        | 15 ++-------
 include/target/target_core_fabric.h       |  4 ++-
 5 files changed, 42 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 27e448c2d066..8ab6c0107d89 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -324,8 +324,18 @@ static int iscsi_login_zero_tsih_s1(
 		goto free_ops;
 	}
 
+	/*
+	 * This is temp for iser. It will be moved to per conn in later
+	 * patches for iscsi.
+	 */
+	sess->se_sess->cmd_cnt = target_alloc_cmd_counter();
+	if (!sess->se_sess->cmd_cnt)
+		goto free_se_sess;
+
 	return 0;
 
+free_se_sess:
+	transport_free_session(sess->se_sess);
 free_ops:
 	kfree(sess->sess_ops);
 free_id:
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 38a6d08f75b3..85e35cf582e5 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -138,7 +138,6 @@ int	init_se_kmem_caches(void);
 void	release_se_kmem_caches(void);
 u32	scsi_get_new_index(scsi_index_t);
 void	transport_subsystem_check_init(void);
-void	transport_uninit_session(struct se_session *);
 unsigned char *transport_dump_cmd_direction(struct se_cmd *);
 void	transport_dump_dev_state(struct se_device *, char *, int *);
 void	transport_dump_dev_info(struct se_device *, struct se_lun *,
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 3d6034f00dcd..60647a49a1d3 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -228,7 +228,7 @@ static void target_release_cmd_refcnt(struct percpu_ref *ref)
 	wake_up(&cmd_cnt->refcnt_wq);
 }
 
-static struct target_cmd_counter *target_alloc_cmd_counter(void)
+struct target_cmd_counter *target_alloc_cmd_counter(void)
 {
 	struct target_cmd_counter *cmd_cnt;
 	int rc;
@@ -252,6 +252,7 @@ free_cmd_cnt:
 	kfree(cmd_cnt);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(target_alloc_cmd_counter);
 
 static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
@@ -271,24 +272,14 @@ static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
  *
  * The caller must have zero-initialized @se_sess before calling this function.
  */
-int transport_init_session(struct se_session *se_sess)
+void transport_init_session(struct se_session *se_sess)
 {
 	INIT_LIST_HEAD(&se_sess->sess_list);
 	INIT_LIST_HEAD(&se_sess->sess_acl_list);
 	spin_lock_init(&se_sess->sess_cmd_lock);
-	se_sess->cmd_cnt = target_alloc_cmd_counter();
-	if (!se_sess->cmd_cnt)
-		return -ENOMEM;
-
-	return  0;
 }
 EXPORT_SYMBOL(transport_init_session);
 
-void transport_uninit_session(struct se_session *se_sess)
-{
-	target_free_cmd_counter(se_sess->cmd_cnt);
-}
-
 /**
  * transport_alloc_session - allocate a session object and initialize it
  * @sup_prot_ops: bitmask that defines which T10-PI modes are supported.
@@ -296,7 +287,6 @@ void transport_uninit_session(struct se_session *se_sess)
 struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops)
 {
 	struct se_session *se_sess;
-	int ret;
 
 	se_sess = kmem_cache_zalloc(se_sess_cache, GFP_KERNEL);
 	if (!se_sess) {
@@ -304,11 +294,7 @@ struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops)
 				" se_sess_cache\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	ret = transport_init_session(se_sess);
-	if (ret < 0) {
-		kmem_cache_free(se_sess_cache, se_sess);
-		return ERR_PTR(ret);
-	}
+	transport_init_session(se_sess);
 	se_sess->sup_prot_ops = sup_prot_ops;
 
 	return se_sess;
@@ -474,8 +460,13 @@ target_setup_session(struct se_portal_group *tpg,
 		     int (*callback)(struct se_portal_group *,
 				     struct se_session *, void *))
 {
+	struct target_cmd_counter *cmd_cnt;
 	struct se_session *sess;
+	int rc;
 
+	cmd_cnt = target_alloc_cmd_counter();
+	if (!cmd_cnt)
+		return ERR_PTR(-ENOMEM);
 	/*
 	 * If the fabric driver is using percpu-ida based pre allocation
 	 * of I/O descriptor tags, go ahead and perform that setup now..
@@ -485,29 +476,36 @@ target_setup_session(struct se_portal_group *tpg,
 	else
 		sess = transport_alloc_session(prot_op);
 
-	if (IS_ERR(sess))
-		return sess;
+	if (IS_ERR(sess)) {
+		rc = PTR_ERR(sess);
+		goto free_cnt;
+	}
+	sess->cmd_cnt = cmd_cnt;
 
 	sess->se_node_acl = core_tpg_check_initiator_node_acl(tpg,
 					(unsigned char *)initiatorname);
 	if (!sess->se_node_acl) {
-		transport_free_session(sess);
-		return ERR_PTR(-EACCES);
+		rc = -EACCES;
+		goto free_sess;
 	}
 	/*
 	 * Go ahead and perform any remaining fabric setup that is
 	 * required before transport_register_session().
 	 */
 	if (callback != NULL) {
-		int rc = callback(tpg, sess, private);
-		if (rc) {
-			transport_free_session(sess);
-			return ERR_PTR(rc);
-		}
+		rc = callback(tpg, sess, private);
+		if (rc)
+			goto free_sess;
 	}
 
 	transport_register_session(tpg, sess->se_node_acl, sess, private);
 	return sess;
+
+free_sess:
+	transport_free_session(sess);
+free_cnt:
+	target_free_cmd_counter(cmd_cnt);
+	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL(target_setup_session);
 
@@ -632,7 +630,8 @@ void transport_free_session(struct se_session *se_sess)
 		sbitmap_queue_free(&se_sess->sess_tag_pool);
 		kvfree(se_sess->sess_cmd_map);
 	}
-	transport_uninit_session(se_sess);
+	if (se_sess->cmd_cnt)
+		target_free_cmd_counter(se_sess->cmd_cnt);
 	kmem_cache_free(se_sess_cache, se_sess);
 }
 EXPORT_SYMBOL(transport_free_session);
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 49eaee022ef1..49a83500c8b7 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -461,8 +461,6 @@ static const struct target_core_fabric_ops xcopy_pt_tfo = {
 
 int target_xcopy_setup_pt(void)
 {
-	int ret;
-
 	xcopy_wq = alloc_workqueue("xcopy_wq", WQ_MEM_RECLAIM, 0);
 	if (!xcopy_wq) {
 		pr_err("Unable to allocate xcopy_wq\n");
@@ -479,9 +477,7 @@ int target_xcopy_setup_pt(void)
 	INIT_LIST_HEAD(&xcopy_pt_nacl.acl_list);
 	INIT_LIST_HEAD(&xcopy_pt_nacl.acl_sess_list);
 	memset(&xcopy_pt_sess, 0, sizeof(struct se_session));
-	ret = transport_init_session(&xcopy_pt_sess);
-	if (ret < 0)
-		goto destroy_wq;
+	transport_init_session(&xcopy_pt_sess);
 
 	xcopy_pt_nacl.se_tpg = &xcopy_pt_tpg;
 	xcopy_pt_nacl.nacl_sess = &xcopy_pt_sess;
@@ -490,19 +486,12 @@ int target_xcopy_setup_pt(void)
 	xcopy_pt_sess.se_node_acl = &xcopy_pt_nacl;
 
 	return 0;
-
-destroy_wq:
-	destroy_workqueue(xcopy_wq);
-	xcopy_wq = NULL;
-	return ret;
 }
 
 void target_xcopy_release_pt(void)
 {
-	if (xcopy_wq) {
+	if (xcopy_wq)
 		destroy_workqueue(xcopy_wq);
-		transport_uninit_session(&xcopy_pt_sess);
-	}
 }
 
 /*
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 38f0662476d1..65527174b8bc 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -133,7 +133,9 @@ struct se_session *target_setup_session(struct se_portal_group *,
 				struct se_session *, void *));
 void target_remove_session(struct se_session *);
 
-int transport_init_session(struct se_session *se_sess);
+struct target_cmd_counter *target_alloc_cmd_counter(void);
+
+void transport_init_session(struct se_session *se_sess);
 struct se_session *transport_alloc_session(enum target_prot_op);
 int transport_alloc_session_tags(struct se_session *, unsigned int,
 		unsigned int);
-- 
cgit v1.2.3


From 8e288be8606ad87c1726618eacfb8fbd3ab4b806 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 18 Mar 2023 20:56:14 -0500
Subject: scsi: target: Pass in cmd counter to use during cmd setup

Allow target_get_sess_cmd() users to pass in the cmd counter they want to
use. Right now we pass in the session's cmd counter but in a subsequent
commit iSCSI will switch from per session to per conn.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20230319015620.96006-4-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c    | 10 ++++++----
 drivers/target/target_core_transport.c | 28 +++++++++++++---------------
 drivers/target/target_core_xcopy.c     |  8 ++++----
 drivers/usb/gadget/function/f_tcm.c    |  4 ++--
 include/target/target_core_fabric.h    |  8 +++++---
 5 files changed, 30 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index baf4da7bb3b4..87927a36f90d 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1190,9 +1190,10 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 	 * Initialize struct se_cmd descriptor from target_core_mod infrastructure
 	 */
 	__target_init_cmd(&cmd->se_cmd, &iscsi_ops,
-			 conn->sess->se_sess, be32_to_cpu(hdr->data_length),
-			 cmd->data_direction, sam_task_attr,
-			 cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun));
+			  conn->sess->se_sess, be32_to_cpu(hdr->data_length),
+			  cmd->data_direction, sam_task_attr,
+			  cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun),
+			  conn->sess->se_sess->cmd_cnt);
 
 	pr_debug("Got SCSI Command, ITT: 0x%08x, CmdSN: 0x%08x,"
 		" ExpXferLen: %u, Length: %u, CID: %hu\n", hdr->itt,
@@ -2055,7 +2056,8 @@ iscsit_handle_task_mgt_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 	__target_init_cmd(&cmd->se_cmd, &iscsi_ops,
 			  conn->sess->se_sess, 0, DMA_NONE,
 			  TCM_SIMPLE_TAG, cmd->sense_buffer + 2,
-			  scsilun_to_int(&hdr->lun));
+			  scsilun_to_int(&hdr->lun),
+			  conn->sess->se_sess->cmd_cnt);
 
 	target_get_sess_cmd(&cmd->se_cmd, true);
 
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 60647a49a1d3..c395606ab1a9 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1441,14 +1441,12 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
  *
  * Preserves the value of @cmd->tag.
  */
-void __target_init_cmd(
-	struct se_cmd *cmd,
-	const struct target_core_fabric_ops *tfo,
-	struct se_session *se_sess,
-	u32 data_length,
-	int data_direction,
-	int task_attr,
-	unsigned char *sense_buffer, u64 unpacked_lun)
+void __target_init_cmd(struct se_cmd *cmd,
+		       const struct target_core_fabric_ops *tfo,
+		       struct se_session *se_sess, u32 data_length,
+		       int data_direction, int task_attr,
+		       unsigned char *sense_buffer, u64 unpacked_lun,
+		       struct target_cmd_counter *cmd_cnt)
 {
 	INIT_LIST_HEAD(&cmd->se_delayed_node);
 	INIT_LIST_HEAD(&cmd->se_qf_node);
@@ -1468,6 +1466,7 @@ void __target_init_cmd(
 	cmd->sam_task_attr = task_attr;
 	cmd->sense_buffer = sense_buffer;
 	cmd->orig_fe_lun = unpacked_lun;
+	cmd->cmd_cnt = cmd_cnt;
 
 	if (!(cmd->se_cmd_flags & SCF_USE_CPUID))
 		cmd->cpuid = raw_smp_processor_id();
@@ -1687,7 +1686,8 @@ int target_init_cmd(struct se_cmd *se_cmd, struct se_session *se_sess,
 	 * target_core_fabric_ops->queue_status() callback
 	 */
 	__target_init_cmd(se_cmd, se_tpg->se_tpg_tfo, se_sess, data_length,
-			  data_dir, task_attr, sense, unpacked_lun);
+			  data_dir, task_attr, sense, unpacked_lun,
+			  se_sess->cmd_cnt);
 
 	/*
 	 * Obtain struct se_cmd->cmd_kref reference. A second kref_get here is
@@ -1982,7 +1982,8 @@ int target_submit_tmr(struct se_cmd *se_cmd, struct se_session *se_sess,
 	BUG_ON(!se_tpg);
 
 	__target_init_cmd(se_cmd, se_tpg->se_tpg_tfo, se_sess,
-			  0, DMA_NONE, TCM_SIMPLE_TAG, sense, unpacked_lun);
+			  0, DMA_NONE, TCM_SIMPLE_TAG, sense, unpacked_lun,
+			  se_sess->cmd_cnt);
 	/*
 	 * FIXME: Currently expect caller to handle se_cmd->se_tmr_req
 	 * allocation failure.
@@ -2986,7 +2987,6 @@ EXPORT_SYMBOL(transport_generic_free_cmd);
  */
 int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 {
-	struct se_session *se_sess = se_cmd->se_sess;
 	int ret = 0;
 
 	/*
@@ -3003,11 +3003,9 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 	 * Users like xcopy do not use counters since they never do a stop
 	 * and wait.
 	 */
-	if (se_sess->cmd_cnt) {
-		if (!percpu_ref_tryget_live(&se_sess->cmd_cnt->refcnt))
+	if (se_cmd->cmd_cnt) {
+		if (!percpu_ref_tryget_live(&se_cmd->cmd_cnt->refcnt))
 			ret = -ESHUTDOWN;
-		else
-			se_cmd->cmd_cnt = se_sess->cmd_cnt;
 	}
 	if (ret && ack_kref)
 		target_put_sess_cmd(se_cmd);
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 49a83500c8b7..91ed015b588c 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -591,8 +591,8 @@ static int target_xcopy_read_source(
 		(unsigned long long)src_lba, transfer_length_block, src_bytes);
 
 	__target_init_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, src_bytes,
-			  DMA_FROM_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0);
-
+			  DMA_FROM_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0,
+			  NULL);
 	rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, src_dev, &cdb[0],
 				remote_port);
 	if (rc < 0) {
@@ -636,8 +636,8 @@ static int target_xcopy_write_destination(
 		(unsigned long long)dst_lba, transfer_length_block, dst_bytes);
 
 	__target_init_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, dst_bytes,
-			  DMA_TO_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0);
-
+			  DMA_TO_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0,
+			  NULL);
 	rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, dst_dev, &cdb[0],
 				remote_port);
 	if (rc < 0) {
diff --git a/drivers/usb/gadget/function/f_tcm.c b/drivers/usb/gadget/function/f_tcm.c
index 658e2e21fdd0..c21acebe8aae 100644
--- a/drivers/usb/gadget/function/f_tcm.c
+++ b/drivers/usb/gadget/function/f_tcm.c
@@ -1054,7 +1054,7 @@ static void usbg_cmd_work(struct work_struct *work)
 				  tv_nexus->tvn_se_sess->se_tpg->se_tpg_tfo,
 				  tv_nexus->tvn_se_sess, cmd->data_len, DMA_NONE,
 				  cmd->prio_attr, cmd->sense_iu.sense,
-				  cmd->unpacked_lun);
+				  cmd->unpacked_lun, NULL);
 		goto out;
 	}
 
@@ -1183,7 +1183,7 @@ static void bot_cmd_work(struct work_struct *work)
 				  tv_nexus->tvn_se_sess->se_tpg->se_tpg_tfo,
 				  tv_nexus->tvn_se_sess, cmd->data_len, DMA_NONE,
 				  cmd->prio_attr, cmd->sense_iu.sense,
-				  cmd->unpacked_lun);
+				  cmd->unpacked_lun, NULL);
 		goto out;
 	}
 
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 65527174b8bc..d507e7885f17 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -151,9 +151,11 @@ void	transport_deregister_session_configfs(struct se_session *);
 void	transport_deregister_session(struct se_session *);
 
 
-void	__target_init_cmd(struct se_cmd *,
-		const struct target_core_fabric_ops *,
-		struct se_session *, u32, int, int, unsigned char *, u64);
+void	__target_init_cmd(struct se_cmd *cmd,
+		const struct target_core_fabric_ops *tfo,
+		struct se_session *sess, u32 data_length, int data_direction,
+		int task_attr, unsigned char *sense_buffer, u64 unpacked_lun,
+		struct target_cmd_counter *cmd_cnt);
 int	target_init_cmd(struct se_cmd *se_cmd, struct se_session *se_sess,
 		unsigned char *sense, u64 unpacked_lun, u32 data_length,
 		int task_attr, int data_dir, int flags);
-- 
cgit v1.2.3


From 6d256bee602b131bd4fbc92863b6a1210bcf6325 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 18 Mar 2023 20:56:15 -0500
Subject: scsi: target: iscsit: isert: Alloc per conn cmd counter

This has iscsit allocate a per conn cmd counter and converts iscsit/isert
to use it instead of the per session one.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20230319015620.96006-5-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c   |  4 ++--
 drivers/target/iscsi/iscsi_target.c       |  4 ++--
 drivers/target/iscsi/iscsi_target_login.c | 17 +++++++----------
 drivers/target/target_core_transport.c    |  9 ++++++---
 include/target/target_core_fabric.h       |  3 +++
 5 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 75404885cf98..f290cd49698e 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2506,8 +2506,8 @@ isert_wait4cmds(struct iscsit_conn *conn)
 	isert_info("iscsit_conn %p\n", conn);
 
 	if (conn->sess) {
-		target_stop_session(conn->sess->se_sess);
-		target_wait_for_sess_cmds(conn->sess->se_sess);
+		target_stop_cmd_counter(conn->cmd_cnt);
+		target_wait_for_cmds(conn->cmd_cnt);
 	}
 }
 
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 87927a36f90d..11115c207844 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1193,7 +1193,7 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 			  conn->sess->se_sess, be32_to_cpu(hdr->data_length),
 			  cmd->data_direction, sam_task_attr,
 			  cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun),
-			  conn->sess->se_sess->cmd_cnt);
+			  conn->cmd_cnt);
 
 	pr_debug("Got SCSI Command, ITT: 0x%08x, CmdSN: 0x%08x,"
 		" ExpXferLen: %u, Length: %u, CID: %hu\n", hdr->itt,
@@ -2057,7 +2057,7 @@ iscsit_handle_task_mgt_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 			  conn->sess->se_sess, 0, DMA_NONE,
 			  TCM_SIMPLE_TAG, cmd->sense_buffer + 2,
 			  scsilun_to_int(&hdr->lun),
-			  conn->sess->se_sess->cmd_cnt);
+			  conn->cmd_cnt);
 
 	target_get_sess_cmd(&cmd->se_cmd, true);
 
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 8ab6c0107d89..274bdd7845ca 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -324,18 +324,8 @@ static int iscsi_login_zero_tsih_s1(
 		goto free_ops;
 	}
 
-	/*
-	 * This is temp for iser. It will be moved to per conn in later
-	 * patches for iscsi.
-	 */
-	sess->se_sess->cmd_cnt = target_alloc_cmd_counter();
-	if (!sess->se_sess->cmd_cnt)
-		goto free_se_sess;
-
 	return 0;
 
-free_se_sess:
-	transport_free_session(sess->se_sess);
 free_ops:
 	kfree(sess->sess_ops);
 free_id:
@@ -1157,8 +1147,14 @@ static struct iscsit_conn *iscsit_alloc_conn(struct iscsi_np *np)
 		goto free_conn_cpumask;
 	}
 
+	conn->cmd_cnt = target_alloc_cmd_counter();
+	if (!conn->cmd_cnt)
+		goto free_conn_allowed_cpumask;
+
 	return conn;
 
+free_conn_allowed_cpumask:
+	free_cpumask_var(conn->allowed_cpumask);
 free_conn_cpumask:
 	free_cpumask_var(conn->conn_cpumask);
 free_conn_ops:
@@ -1172,6 +1168,7 @@ free_conn:
 
 void iscsit_free_conn(struct iscsit_conn *conn)
 {
+	target_free_cmd_counter(conn->cmd_cnt);
 	free_cpumask_var(conn->allowed_cpumask);
 	free_cpumask_var(conn->conn_cpumask);
 	kfree(conn->conn_ops);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index c395606ab1a9..86adff2a86ed 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -254,7 +254,7 @@ free_cmd_cnt:
 }
 EXPORT_SYMBOL_GPL(target_alloc_cmd_counter);
 
-static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
+void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
 	/*
 	 * Drivers like loop do not call target_stop_session during session
@@ -265,6 +265,7 @@ static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 
 	percpu_ref_exit(&cmd_cnt->refcnt);
 }
+EXPORT_SYMBOL_GPL(target_free_cmd_counter);
 
 /**
  * transport_init_session - initialize a session object
@@ -3170,13 +3171,14 @@ static void target_stop_cmd_counter_confirm(struct percpu_ref *ref)
  * target_stop_cmd_counter - Stop new IO from being added to the counter.
  * @cmd_cnt: counter to stop
  */
-static void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
+void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
 	pr_debug("Stopping command counter.\n");
 	if (!atomic_cmpxchg(&cmd_cnt->stopped, 0, 1))
 		percpu_ref_kill_and_confirm(&cmd_cnt->refcnt,
 					    target_stop_cmd_counter_confirm);
 }
+EXPORT_SYMBOL_GPL(target_stop_cmd_counter);
 
 /**
  * target_stop_session - Stop new IO from being queued on the session.
@@ -3192,7 +3194,7 @@ EXPORT_SYMBOL(target_stop_session);
  * target_wait_for_cmds - Wait for outstanding cmds.
  * @cmd_cnt: counter to wait for active I/O for.
  */
-static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
+void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 {
 	int ret;
 
@@ -3208,6 +3210,7 @@ static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 	wait_for_completion(&cmd_cnt->stop_done);
 	pr_debug("Waiting for cmds done.\n");
 }
+EXPORT_SYMBOL_GPL(target_wait_for_cmds);
 
 /**
  * target_wait_for_sess_cmds - Wait for outstanding commands
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index d507e7885f17..b188b1e90e1e 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -133,7 +133,10 @@ struct se_session *target_setup_session(struct se_portal_group *,
 				struct se_session *, void *));
 void target_remove_session(struct se_session *);
 
+void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt);
+void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt);
 struct target_cmd_counter *target_alloc_cmd_counter(void);
+void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt);
 
 void transport_init_session(struct se_session *se_sess);
 struct se_session *transport_alloc_session(enum target_prot_op);
-- 
cgit v1.2.3


From 673db054d7a2b5a470d7a25baf65956d005ad729 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 18 Mar 2023 20:56:18 -0500
Subject: scsi: target: Fix multiple LUN_RESET handling

This fixes a bug where an initiator thinks a LUN_RESET has cleaned up
running commands when it hasn't. The bug was added in commit 51ec502a3266
("target: Delete tmr from list before processing").

The problem occurs when:

 1. We have N I/O cmds running in the target layer spread over 2 sessions.

 2. The initiator sends a LUN_RESET for each session.

 3. session1's LUN_RESET loops over all the running commands from both
    sessions and moves them to its local drain_task_list.

 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
    the commit above has it remove itself. session2 also does not see any
    commands since the other reset moved them off the state lists.

 5. sessions2's LUN_RESET will then complete with a successful response.

 6. sessions2's inititor believes the running commands on its session are
    now cleaned up due to the successful response and cleans up the running
    commands from its side. It then restarts them.

 7. The commands do eventually complete on the backend and the target
    starts to return aborted task statuses for them. The initiator will
    either throw a invalid ITT error or might accidentally lookup a new
    task if the ITT has been reallocated already.

Fix the bug by reverting the patch, and serialize the execution of
LUN_RESETs and Preempt and Aborts.

Also prevent us from waiting on LUN_RESETs in core_tmr_drain_tmr_list,
because it turns out the original patch fixed a bug that was not
mentioned. For LUN_RESET1 core_tmr_drain_tmr_list can see a second
LUN_RESET and wait on it. Then the second reset will run
core_tmr_drain_tmr_list and see the first reset and wait on it resulting in
a deadlock.

Fixes: 51ec502a3266 ("target: Delete tmr from list before processing")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20230319015620.96006-8-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c |  1 +
 drivers/target/target_core_tmr.c    | 26 +++++++++++++++++++++++---
 include/target/target_core_base.h   |  1 +
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index f6e58410ec3f..aeb03136773d 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -782,6 +782,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 	spin_lock_init(&dev->t10_alua.lba_map_lock);
 
 	INIT_WORK(&dev->delayed_cmd_work, target_do_delayed_work);
+	mutex_init(&dev->lun_reset_mutex);
 
 	dev->t10_wwn.t10_dev = dev;
 	/*
diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index 2b95b4550a63..4718db628222 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -188,14 +188,23 @@ static void core_tmr_drain_tmr_list(
 	 * LUN_RESET tmr..
 	 */
 	spin_lock_irqsave(&dev->se_tmr_lock, flags);
-	if (tmr)
-		list_del_init(&tmr->tmr_list);
 	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
+		if (tmr_p == tmr)
+			continue;
+
 		cmd = tmr_p->task_cmd;
 		if (!cmd) {
 			pr_err("Unable to locate struct se_cmd for TMR\n");
 			continue;
 		}
+
+		/*
+		 * We only execute one LUN_RESET at a time so we can't wait
+		 * on them below.
+		 */
+		if (tmr_p->function == TMR_LUN_RESET)
+			continue;
+
 		/*
 		 * If this function was called with a valid pr_res_key
 		 * parameter (eg: for PROUT PREEMPT_AND_ABORT service action
@@ -379,14 +388,25 @@ int core_tmr_lun_reset(
 				tmr_nacl->initiatorname);
 		}
 	}
+
+
+	/*
+	 * We only allow one reset or preempt and abort to execute at a time
+	 * to prevent one call from claiming all the cmds causing a second
+	 * call from returning while cmds it should have waited on are still
+	 * running.
+	 */
+	mutex_lock(&dev->lun_reset_mutex);
+
 	pr_debug("LUN_RESET: %s starting for [%s], tas: %d\n",
 		(preempt_and_abort_list) ? "Preempt" : "TMR",
 		dev->transport->name, tas);
-
 	core_tmr_drain_tmr_list(dev, tmr, preempt_and_abort_list);
 	core_tmr_drain_state_list(dev, prout_cmd, tmr_sess, tas,
 				preempt_and_abort_list);
 
+	mutex_unlock(&dev->lun_reset_mutex);
+
 	/*
 	 * Clear any legacy SPC-2 reservation when called during
 	 * LOGICAL UNIT RESET
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index bd299790e99c..8cc42ad65c92 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -872,6 +872,7 @@ struct se_device {
 	struct rcu_head		rcu_head;
 	int			queue_cnt;
 	struct se_device_queue	*queues;
+	struct mutex		lun_reset_mutex;
 };
 
 struct target_opcode_descriptor {
-- 
cgit v1.2.3


From 9821d8d4628e630ab56f47a8e6b878a2576e069b Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 14 Feb 2023 09:29:46 +0200
Subject: lib: cpu_rmap: Use allocator for rmap entries

Use a proper allocator for rmap entries using a naive for loop. The
allocator relies on whether an entry is NULL to be considered free.
Remove the used field of rmap which is not needed.

Also, avoid crashing the kernel if an entry is not available.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 include/linux/cpu_rmap.h |  3 +--
 lib/cpu_rmap.c           | 43 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
index be8aea04d023..0ec745e6cd36 100644
--- a/include/linux/cpu_rmap.h
+++ b/include/linux/cpu_rmap.h
@@ -16,14 +16,13 @@
  * struct cpu_rmap - CPU affinity reverse-map
  * @refcount: kref for object
  * @size: Number of objects to be reverse-mapped
- * @used: Number of objects added
  * @obj: Pointer to array of object pointers
  * @near: For each CPU, the index and distance to the nearest object,
  *      based on affinity masks
  */
 struct cpu_rmap {
 	struct kref	refcount;
-	u16		size, used;
+	u16		size;
 	void		**obj;
 	struct {
 		u16	index;
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index e77f12bb3c77..5d4bf7a8b926 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -128,19 +128,31 @@ debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
 }
 #endif
 
+static int get_free_index(struct cpu_rmap *rmap)
+{
+	int i;
+
+	for (i = 0; i < rmap->size; i++)
+		if (!rmap->obj[i])
+			return i;
+
+	return -ENOSPC;
+}
+
 /**
  * cpu_rmap_add - add object to a rmap
  * @rmap: CPU rmap allocated with alloc_cpu_rmap()
  * @obj: Object to add to rmap
  *
- * Return index of object.
+ * Return index of object or -ENOSPC if no free entry was found
  */
 int cpu_rmap_add(struct cpu_rmap *rmap, void *obj)
 {
-	u16 index;
+	int index = get_free_index(rmap);
+
+	if (index < 0)
+		return index;
 
-	BUG_ON(rmap->used >= rmap->size);
-	index = rmap->used++;
 	rmap->obj[index] = obj;
 	return index;
 }
@@ -230,7 +242,7 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap)
 	if (!rmap)
 		return;
 
-	for (index = 0; index < rmap->used; index++) {
+	for (index = 0; index < rmap->size; index++) {
 		glue = rmap->obj[index];
 		if (glue)
 			irq_set_affinity_notifier(glue->notify.irq, NULL);
@@ -295,13 +307,22 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
 	glue->notify.release = irq_cpu_rmap_release;
 	glue->rmap = rmap;
 	cpu_rmap_get(rmap);
-	glue->index = cpu_rmap_add(rmap, glue);
+	rc = cpu_rmap_add(rmap, glue);
+	if (rc < 0)
+		goto err_add;
+
+	glue->index = rc;
 	rc = irq_set_affinity_notifier(irq, &glue->notify);
-	if (rc) {
-		cpu_rmap_put(glue->rmap);
-		rmap->obj[glue->index] = NULL;
-		kfree(glue);
-	}
+	if (rc)
+		goto err_set;
+
+	return rc;
+
+err_set:
+	rmap->obj[glue->index] = NULL;
+err_add:
+	cpu_rmap_put(glue->rmap);
+	kfree(glue);
 	return rc;
 }
 EXPORT_SYMBOL(irq_cpu_rmap_add);
-- 
cgit v1.2.3


From 71f0a2478605c100358a9f9e174849fa643bf8a7 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 14 Feb 2023 11:05:46 +0200
Subject: lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add

Add a function to complement irq_cpu_rmap_add(). It removes the irq from
the reverse mapping by setting the notifier to NULL. The function calls
irq_set_affinity_notifier() with NULL at the notify argument which then
cancel any pending notifier work and decrement reference on the
notifier. When ref count reaches zero, the glue pointer is kfree and the
rmap entry is set to NULL serving both to avoid second attempt to
release it and also making the rmap entry available for subsequent
mapping.

It should be noted the drivers usually creates the reverse mapping at
initialization time and remove it at unload time so we do not expect
failures in allocating rmap due to kref holding the glue entry.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 include/linux/cpu_rmap.h |  1 +
 lib/cpu_rmap.c           | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
index 0ec745e6cd36..cae324d10965 100644
--- a/include/linux/cpu_rmap.h
+++ b/include/linux/cpu_rmap.h
@@ -60,6 +60,7 @@ static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size)
 }
 extern void free_irq_cpu_rmap(struct cpu_rmap *rmap);
 
+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq);
 extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq);
 
 #endif /* __LINUX_CPU_RMAP_H */
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 5d4bf7a8b926..73c1636b927b 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -285,6 +285,17 @@ static void irq_cpu_rmap_release(struct kref *ref)
 	kfree(glue);
 }
 
+/**
+ * irq_cpu_rmap_remove - remove an IRQ from a CPU affinity reverse-map
+ * @rmap: The reverse-map
+ * @irq: The IRQ number
+ */
+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq)
+{
+	return irq_set_affinity_notifier(irq, NULL);
+}
+EXPORT_SYMBOL(irq_cpu_rmap_remove);
+
 /**
  * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map
  * @rmap: The reverse-map
-- 
cgit v1.2.3


From fb0a6a268dcd6fe144c99d60a1166e34c6991d5f Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 5 Jan 2023 11:31:46 +0200
Subject: net/mlx5: Provide external API for allocating vectors

Provide external API to be used by other drivers relying on mlx5_core,
for allocating MSIX vectors. An example for such a driver would be
mlx5_vdpa.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 52 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                       |  6 +++
 2 files changed, 58 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 7fa63d31ae5b..e12e528c09f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -483,6 +483,58 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 	return irq;
 }
 
+/**
+ * mlx5_msix_alloc - allocate msix interrupt
+ * @dev: mlx5 device from which to request
+ * @handler: interrupt handler
+ * @affdesc: affinity descriptor
+ * @name: interrupt name
+ *
+ * Returns: struct msi_map with result encoded.
+ * Note: the caller must make sure to release the irq by calling
+ *       mlx5_msix_free() if shutdown was initiated.
+ */
+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name)
+{
+	struct msi_map map;
+	int err;
+
+	if (!dev->pdev) {
+		map.virq = 0;
+		map.index = -EINVAL;
+		return map;
+	}
+
+	map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
+	if (!map.virq)
+		return map;
+
+	err = request_irq(map.virq, handler, 0, name, NULL);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		pci_msix_free_irq(dev->pdev, map);
+		map.virq = 0;
+		map.index = -ENOMEM;
+	}
+	return map;
+}
+EXPORT_SYMBOL(mlx5_msix_alloc);
+
+/**
+ * mlx5_msix_free - free a previously allocated msix interrupt
+ * @dev: mlx5 device associated with interrupt
+ * @map: map previously returned by mlx5_msix_alloc()
+ */
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
+{
+	free_irq(map.virq, NULL);
+	pci_msix_free_irq(dev->pdev, map);
+}
+EXPORT_SYMBOL(mlx5_msix_free);
+
 /**
  * mlx5_irqs_release_vectors - release one or more IRQs back to the system.
  * @irqs: IRQs to be released.
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209..df0f82110249 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1308,4 +1308,10 @@ enum {
 	MLX5_OCTWORD = 16,
 };
 
+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name);
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map);
+
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From e0d3f2c694e5db309af270a07fc570fe19902ba4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 22 Mar 2023 12:53:58 -0700
Subject: scsi: core: Declare SCSI host template pointer members const

Declare the SCSI host template pointer members const and also the remaining
SCSI host template pointers in the SCSI core.

Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20230322195515.1267197-4-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 Documentation/scsi/scsi_mid_low_api.rst | 2 +-
 drivers/scsi/hosts.c                    | 4 ++--
 include/linux/raid_class.h              | 2 +-
 include/scsi/libfc.h                    | 2 +-
 include/scsi/scsi_host.h                | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst
index a8c5bd15a440..6fa3a6279501 100644
--- a/Documentation/scsi/scsi_mid_low_api.rst
+++ b/Documentation/scsi/scsi_mid_low_api.rst
@@ -436,7 +436,7 @@ Details::
     *
     *      Defined in: drivers/scsi/hosts.c .
     **/
-    struct Scsi_Host * scsi_host_alloc(struct scsi_host_template * sht,
+    struct Scsi_Host * scsi_host_alloc(const struct scsi_host_template * sht,
 				    int privsize)
 
 
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index f7f62e56afca..0ac3289f6b09 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(scsi_remove_host);
 int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 			   struct device *dma_dev)
 {
-	struct scsi_host_template *sht = shost->hostt;
+	const struct scsi_host_template *sht = shost->hostt;
 	int error = -EINVAL;
 
 	shost_printk(KERN_INFO, shost, "%s\n",
@@ -392,7 +392,7 @@ static struct device_type scsi_host_type = {
  * Return value:
  * 	Pointer to a new Scsi_Host
  **/
-struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
+struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *sht, int privsize)
 {
 	struct Scsi_Host *shost;
 	int index;
diff --git a/include/linux/raid_class.h b/include/linux/raid_class.h
index 5cdfcb873a8f..6a9b177d5c41 100644
--- a/include/linux/raid_class.h
+++ b/include/linux/raid_class.h
@@ -11,7 +11,7 @@ struct raid_template {
 };
 
 struct raid_function_template {
-	void *cookie;
+	const void *cookie;
 	int (*is_raid)(struct device *);
 	void (*get_resync)(struct device *);
 	void (*get_state)(struct device *);
diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h
index 6e29e1719db1..eca6fd42d7f7 100644
--- a/include/scsi/libfc.h
+++ b/include/scsi/libfc.h
@@ -866,7 +866,7 @@ static inline void *lport_priv(const struct fc_lport *lport)
  * Returns: libfc lport
  */
 static inline struct fc_lport *
-libfc_host_alloc(struct scsi_host_template *sht, int priv_size)
+libfc_host_alloc(const struct scsi_host_template *sht, int priv_size)
 {
 	struct fc_lport *lport;
 	struct Scsi_Host *shost;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 587cc767bb67..0f29799efa02 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -554,7 +554,7 @@ struct Scsi_Host {
 	struct completion     * eh_action; /* Wait for specific actions on the
 					      host. */
 	wait_queue_head_t       host_wait;
-	struct scsi_host_template *hostt;
+	const struct scsi_host_template *hostt;
 	struct scsi_transport_template *transportt;
 
 	struct kref		tagset_refcnt;
@@ -747,7 +747,7 @@ static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
 extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
 extern void scsi_flush_work(struct Scsi_Host *);
 
-extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
+extern struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *, int);
 extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
 					       struct device *,
 					       struct device *);
-- 
cgit v1.2.3


From 25df73d93323e20c1f05700776fe3df75d3b27b2 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 22 Mar 2023 12:53:59 -0700
Subject: scsi: ata: Declare SCSI host templates const

Make it explicit that ATA host templates are not modified.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com> (for DWC AHCI SATA)
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jon Hunter <jonathanh@nvidia.com> (for Tegra)
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20230322195515.1267197-5-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/acard-ahci.c                |  2 +-
 drivers/ata/ahci.c                      |  2 +-
 drivers/ata/ahci.h                      |  2 +-
 drivers/ata/ahci_brcm.c                 |  2 +-
 drivers/ata/ahci_ceva.c                 |  2 +-
 drivers/ata/ahci_da850.c                |  2 +-
 drivers/ata/ahci_dm816.c                |  2 +-
 drivers/ata/ahci_dwc.c                  |  2 +-
 drivers/ata/ahci_imx.c                  |  2 +-
 drivers/ata/ahci_mtk.c                  |  2 +-
 drivers/ata/ahci_mvebu.c                |  2 +-
 drivers/ata/ahci_platform.c             |  2 +-
 drivers/ata/ahci_qoriq.c                |  2 +-
 drivers/ata/ahci_seattle.c              |  2 +-
 drivers/ata/ahci_st.c                   |  2 +-
 drivers/ata/ahci_sunxi.c                |  2 +-
 drivers/ata/ahci_tegra.c                |  2 +-
 drivers/ata/ahci_xgene.c                |  2 +-
 drivers/ata/ata_generic.c               |  2 +-
 drivers/ata/ata_piix.c                  |  6 +++---
 drivers/ata/libahci.c                   |  4 ++--
 drivers/ata/libahci_platform.c          |  2 +-
 drivers/ata/libata-core.c               |  4 ++--
 drivers/ata/libata-scsi.c               |  2 +-
 drivers/ata/libata-sff.c                |  8 ++++----
 drivers/ata/libata.h                    |  2 +-
 drivers/ata/pata_acpi.c                 |  2 +-
 drivers/ata/pata_ali.c                  |  2 +-
 drivers/ata/pata_amd.c                  |  2 +-
 drivers/ata/pata_arasan_cf.c            |  2 +-
 drivers/ata/pata_artop.c                |  2 +-
 drivers/ata/pata_atiixp.c               |  2 +-
 drivers/ata/pata_atp867x.c              |  2 +-
 drivers/ata/pata_buddha.c               |  2 +-
 drivers/ata/pata_cmd640.c               |  2 +-
 drivers/ata/pata_cmd64x.c               |  2 +-
 drivers/ata/pata_cs5520.c               |  2 +-
 drivers/ata/pata_cs5530.c               |  2 +-
 drivers/ata/pata_cs5535.c               |  2 +-
 drivers/ata/pata_cs5536.c               |  2 +-
 drivers/ata/pata_cypress.c              |  2 +-
 drivers/ata/pata_efar.c                 |  2 +-
 drivers/ata/pata_ep93xx.c               |  2 +-
 drivers/ata/pata_falcon.c               |  2 +-
 drivers/ata/pata_ftide010.c             |  2 +-
 drivers/ata/pata_gayle.c                |  2 +-
 drivers/ata/pata_hpt366.c               |  2 +-
 drivers/ata/pata_hpt37x.c               |  2 +-
 drivers/ata/pata_hpt3x2n.c              |  2 +-
 drivers/ata/pata_hpt3x3.c               |  2 +-
 drivers/ata/pata_icside.c               |  2 +-
 drivers/ata/pata_imx.c                  |  2 +-
 drivers/ata/pata_isapnp.c               |  2 +-
 drivers/ata/pata_it8213.c               |  2 +-
 drivers/ata/pata_it821x.c               |  2 +-
 drivers/ata/pata_ixp4xx_cf.c            |  2 +-
 drivers/ata/pata_jmicron.c              |  2 +-
 drivers/ata/pata_legacy.c               |  2 +-
 drivers/ata/pata_macio.c                |  2 +-
 drivers/ata/pata_marvell.c              |  2 +-
 drivers/ata/pata_mpc52xx.c              |  2 +-
 drivers/ata/pata_mpiix.c                |  2 +-
 drivers/ata/pata_netcell.c              |  2 +-
 drivers/ata/pata_ninja32.c              |  2 +-
 drivers/ata/pata_ns87410.c              |  2 +-
 drivers/ata/pata_ns87415.c              |  2 +-
 drivers/ata/pata_octeon_cf.c            |  2 +-
 drivers/ata/pata_of_platform.c          |  2 +-
 drivers/ata/pata_oldpiix.c              |  2 +-
 drivers/ata/pata_opti.c                 |  2 +-
 drivers/ata/pata_optidma.c              |  2 +-
 drivers/ata/pata_parport/pata_parport.c |  2 +-
 drivers/ata/pata_pcmcia.c               |  2 +-
 drivers/ata/pata_pdc2027x.c             |  2 +-
 drivers/ata/pata_pdc202xx_old.c         |  2 +-
 drivers/ata/pata_piccolo.c              |  2 +-
 drivers/ata/pata_platform.c             |  4 ++--
 drivers/ata/pata_pxa.c                  |  2 +-
 drivers/ata/pata_radisys.c              |  2 +-
 drivers/ata/pata_rb532_cf.c             |  2 +-
 drivers/ata/pata_rdc.c                  |  2 +-
 drivers/ata/pata_rz1000.c               |  2 +-
 drivers/ata/pata_sc1200.c               |  2 +-
 drivers/ata/pata_sch.c                  |  2 +-
 drivers/ata/pata_serverworks.c          |  6 +++---
 drivers/ata/pata_sil680.c               |  2 +-
 drivers/ata/pata_sis.c                  |  2 +-
 drivers/ata/pata_sl82c105.c             |  2 +-
 drivers/ata/pata_triflex.c              |  2 +-
 drivers/ata/pata_via.c                  |  2 +-
 drivers/ata/pdc_adma.c                  |  2 +-
 drivers/ata/sata_dwc_460ex.c            |  2 +-
 drivers/ata/sata_fsl.c                  |  2 +-
 drivers/ata/sata_highbank.c             |  2 +-
 drivers/ata/sata_inic162x.c             |  2 +-
 drivers/ata/sata_mv.c                   |  4 ++--
 drivers/ata/sata_nv.c                   |  8 ++++----
 drivers/ata/sata_promise.c              |  2 +-
 drivers/ata/sata_qstor.c                |  2 +-
 drivers/ata/sata_rcar.c                 |  2 +-
 drivers/ata/sata_sil.c                  |  2 +-
 drivers/ata/sata_sil24.c                |  2 +-
 drivers/ata/sata_sis.c                  |  2 +-
 drivers/ata/sata_svw.c                  |  2 +-
 drivers/ata/sata_sx4.c                  |  2 +-
 drivers/ata/sata_uli.c                  |  2 +-
 drivers/ata/sata_via.c                  |  2 +-
 drivers/ata/sata_vsc.c                  |  2 +-
 include/linux/ahci_platform.h           |  2 +-
 include/linux/ata_platform.h            |  2 +-
 include/linux/libata.h                  | 10 +++++-----
 111 files changed, 129 insertions(+), 129 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/acard-ahci.c b/drivers/ata/acard-ahci.c
index 993eadd173da..547f56341705 100644
--- a/drivers/ata/acard-ahci.c
+++ b/drivers/ata/acard-ahci.c
@@ -66,7 +66,7 @@ static int acard_ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg
 static int acard_ahci_pci_device_resume(struct pci_dev *pdev);
 #endif
 
-static struct scsi_host_template acard_ahci_sht = {
+static const struct scsi_host_template acard_ahci_sht = {
 	AHCI_SHT("acard-ahci"),
 };
 
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 14a1c0d14916..addba109406b 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -101,7 +101,7 @@ static int ahci_pci_device_resume(struct device *dev);
 #endif
 #endif /* CONFIG_PM */
 
-static struct scsi_host_template ahci_sht = {
+static const struct scsi_host_template ahci_sht = {
 	AHCI_SHT("ahci"),
 };
 
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index ff8e6ae1c636..4bae95b06ae3 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -430,7 +430,7 @@ void ahci_set_em_messages(struct ahci_host_priv *hpriv,
 			  struct ata_port_info *pi);
 int ahci_reset_em(struct ata_host *host);
 void ahci_print_info(struct ata_host *host, const char *scc_s);
-int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht);
+int ahci_host_activate(struct ata_host *host, const struct scsi_host_template *sht);
 void ahci_error_handler(struct ata_port *ap);
 u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked);
 
diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c
index 6f216eb25610..4e3dc2b6d67f 100644
--- a/drivers/ata/ahci_brcm.c
+++ b/drivers/ata/ahci_brcm.c
@@ -417,7 +417,7 @@ out_disable_clks:
 	return ret;
 }
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_ceva.c b/drivers/ata/ahci_ceva.c
index cb24ecf36faf..bc027468decb 100644
--- a/drivers/ata/ahci_ceva.c
+++ b/drivers/ata/ahci_ceva.c
@@ -185,7 +185,7 @@ static void ahci_ceva_setup(struct ahci_host_priv *hpriv)
 	}
 }
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c
index dc8a019b8340..ca0924dc5bd2 100644
--- a/drivers/ata/ahci_da850.c
+++ b/drivers/ata/ahci_da850.c
@@ -153,7 +153,7 @@ static const struct ata_port_info ahci_da850_port_info = {
 	.port_ops	= &ahci_da850_port_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_dm816.c b/drivers/ata/ahci_dm816.c
index d26efcd20f64..b08547b877a1 100644
--- a/drivers/ata/ahci_dm816.c
+++ b/drivers/ata/ahci_dm816.c
@@ -134,7 +134,7 @@ static const struct ata_port_info ahci_dm816_port_info = {
 	.port_ops	= &ahci_dm816_port_ops,
 };
 
-static struct scsi_host_template ahci_dm816_platform_sht = {
+static const struct scsi_host_template ahci_dm816_platform_sht = {
 	AHCI_SHT(AHCI_DM816_DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_dwc.c b/drivers/ata/ahci_dwc.c
index 8fb66860db31..4bfbb09cdc02 100644
--- a/drivers/ata/ahci_dwc.c
+++ b/drivers/ata/ahci_dwc.c
@@ -398,7 +398,7 @@ static const struct ata_port_info ahci_dwc_port_info = {
 	.port_ops	= &ahci_dwc_port_ops,
 };
 
-static struct scsi_host_template ahci_dwc_scsi_info = {
+static const struct scsi_host_template ahci_dwc_scsi_info = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index a950767f7948..1c1139dae29a 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -979,7 +979,7 @@ static u32 imx_ahci_parse_props(struct device *dev,
 	return reg_value;
 }
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_mtk.c b/drivers/ata/ahci_mtk.c
index c056378e3e72..f6a75341256f 100644
--- a/drivers/ata/ahci_mtk.c
+++ b/drivers/ata/ahci_mtk.c
@@ -37,7 +37,7 @@ static const struct ata_port_info ahci_port_info = {
 	.port_ops	= &ahci_platform_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c
index 22ecc4f3ae79..596cf017f427 100644
--- a/drivers/ata/ahci_mvebu.c
+++ b/drivers/ata/ahci_mvebu.c
@@ -178,7 +178,7 @@ static const struct ata_port_info ahci_mvebu_port_info = {
 	.port_ops  = &ahci_platform_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
index 8f5572a9f8f1..299ee686ac49 100644
--- a/drivers/ata/ahci_platform.c
+++ b/drivers/ata/ahci_platform.c
@@ -36,7 +36,7 @@ static const struct ata_port_info ahci_port_info_nolpm = {
 	.port_ops	= &ahci_platform_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c
index 9cf9bf36a874..0ba764d283c8 100644
--- a/drivers/ata/ahci_qoriq.c
+++ b/drivers/ata/ahci_qoriq.c
@@ -159,7 +159,7 @@ static const struct ata_port_info ahci_qoriq_port_info = {
 	.port_ops	= &ahci_qoriq_ops,
 };
 
-static struct scsi_host_template ahci_qoriq_sht = {
+static const struct scsi_host_template ahci_qoriq_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_seattle.c b/drivers/ata/ahci_seattle.c
index ced12705ed9d..9eda7bbd2151 100644
--- a/drivers/ata/ahci_seattle.c
+++ b/drivers/ata/ahci_seattle.c
@@ -72,7 +72,7 @@ static const struct ata_port_info ahci_port_seattle_info = {
 	.port_ops	= &ahci_seattle_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c
index 8607b68eee53..f2c1edb36986 100644
--- a/drivers/ata/ahci_st.c
+++ b/drivers/ata/ahci_st.c
@@ -138,7 +138,7 @@ static const struct ata_port_info st_ahci_port_info = {
 	.port_ops       = &st_ahci_port_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_sunxi.c b/drivers/ata/ahci_sunxi.c
index c7273c1cb0c7..076c12b4ba08 100644
--- a/drivers/ata/ahci_sunxi.c
+++ b/drivers/ata/ahci_sunxi.c
@@ -206,7 +206,7 @@ static const struct ata_port_info ahci_sunxi_port_info = {
 	.port_ops	= &ahci_platform_ops,
 };
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c
index 4fb94db1217d..8e5e2b359f2d 100644
--- a/drivers/ata/ahci_tegra.c
+++ b/drivers/ata/ahci_tegra.c
@@ -506,7 +506,7 @@ static const struct of_device_id tegra_ahci_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, tegra_ahci_of_match);
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index 1e08704d5117..83f5ff54ef5b 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -710,7 +710,7 @@ static int xgene_ahci_mux_select(struct xgene_ahci_context *ctx)
 	return val & CFG_SATA_ENET_SELECT_MASK ? -1 : 0;
 }
 
-static struct scsi_host_template ahci_platform_sht = {
+static const struct scsi_host_template ahci_platform_sht = {
 	AHCI_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
index 20a32e4d501d..2f57ec00ab82 100644
--- a/drivers/ata/ata_generic.c
+++ b/drivers/ata/ata_generic.c
@@ -95,7 +95,7 @@ static int generic_set_mode(struct ata_link *link, struct ata_device **unused)
 	return 0;
 }
 
-static struct scsi_host_template generic_sht = {
+static const struct scsi_host_template generic_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index ade5e894563b..ec3c5bd1f813 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1059,7 +1059,7 @@ static u8 piix_vmw_bmdma_status(struct ata_port *ap)
 	return ata_bmdma_status(ap) & ~ATA_DMA_ERR;
 }
 
-static struct scsi_host_template piix_sht = {
+static const struct scsi_host_template piix_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
@@ -1095,7 +1095,7 @@ static struct attribute *piix_sidpr_shost_attrs[] = {
 
 ATTRIBUTE_GROUPS(piix_sidpr_shost);
 
-static struct scsi_host_template piix_sidpr_sht = {
+static const struct scsi_host_template piix_sidpr_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 	.shost_groups		= piix_sidpr_shost_groups,
 };
@@ -1645,7 +1645,7 @@ static int piix_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	struct device *dev = &pdev->dev;
 	struct ata_port_info port_info[2];
 	const struct ata_port_info *ppi[] = { &port_info[0], &port_info[1] };
-	struct scsi_host_template *sht = &piix_sht;
+	const struct scsi_host_template *sht = &piix_sht;
 	unsigned long port_flags;
 	struct ata_host *host;
 	struct piix_host_priv *hpriv;
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 8f216de76648..9c2cb6cbea76 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -2692,7 +2692,7 @@ void ahci_set_em_messages(struct ahci_host_priv *hpriv,
 EXPORT_SYMBOL_GPL(ahci_set_em_messages);
 
 static int ahci_host_activate_multi_irqs(struct ata_host *host,
-					 struct scsi_host_template *sht)
+					 const struct scsi_host_template *sht)
 {
 	struct ahci_host_priv *hpriv = host->private_data;
 	int i, rc;
@@ -2736,7 +2736,7 @@ static int ahci_host_activate_multi_irqs(struct ata_host *host,
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht)
+int ahci_host_activate(struct ata_host *host, const struct scsi_host_template *sht)
 {
 	struct ahci_host_priv *hpriv = host->private_data;
 	int irq = hpriv->irq;
diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c
index b9e336bacf17..d6c3a6ffb0b3 100644
--- a/drivers/ata/libahci_platform.c
+++ b/drivers/ata/libahci_platform.c
@@ -680,7 +680,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources);
 int ahci_platform_init_host(struct platform_device *pdev,
 			    struct ahci_host_priv *hpriv,
 			    const struct ata_port_info *pi_template,
-			    struct scsi_host_template *sht)
+			    const struct scsi_host_template *sht)
 {
 	struct device *dev = &pdev->dev;
 	struct ata_port_info pi = *pi_template;
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 14c17c3bda4e..8bf612bdd61a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5775,7 +5775,7 @@ static void async_port_probe(void *data, async_cookie_t cookie)
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
+int ata_host_register(struct ata_host *host, const struct scsi_host_template *sht)
 {
 	int i, rc;
 
@@ -5883,7 +5883,7 @@ EXPORT_SYMBOL_GPL(ata_host_register);
  */
 int ata_host_activate(struct ata_host *host, int irq,
 		      irq_handler_t irq_handler, unsigned long irq_flags,
-		      struct scsi_host_template *sht)
+		      const struct scsi_host_template *sht)
 {
 	int i, rc;
 	char *irq_desc;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index e093c7a7deeb..7bb12deab70c 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -4186,7 +4186,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 	scsi_done(cmd);
 }
 
-int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
+int ata_scsi_add_hosts(struct ata_host *host, const struct scsi_host_template *sht)
 {
 	int i, rc;
 
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index cd82d3b5ed14..9d28badfe41d 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2281,7 +2281,7 @@ EXPORT_SYMBOL_GPL(ata_pci_sff_prepare_host);
  */
 int ata_pci_sff_activate_host(struct ata_host *host,
 			      irq_handler_t irq_handler,
-			      struct scsi_host_template *sht)
+			      const struct scsi_host_template *sht)
 {
 	struct device *dev = host->dev;
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -2378,7 +2378,7 @@ static const struct ata_port_info *ata_sff_find_valid_pi(
 
 static int ata_pci_init_one(struct pci_dev *pdev,
 		const struct ata_port_info * const *ppi,
-		struct scsi_host_template *sht, void *host_priv,
+		const struct scsi_host_template *sht, void *host_priv,
 		int hflags, bool bmdma)
 {
 	struct device *dev = &pdev->dev;
@@ -2452,7 +2452,7 @@ out:
  */
 int ata_pci_sff_init_one(struct pci_dev *pdev,
 		 const struct ata_port_info * const *ppi,
-		 struct scsi_host_template *sht, void *host_priv, int hflag)
+		 const struct scsi_host_template *sht, void *host_priv, int hflag)
 {
 	return ata_pci_init_one(pdev, ppi, sht, host_priv, hflag, 0);
 }
@@ -3175,7 +3175,7 @@ EXPORT_SYMBOL_GPL(ata_pci_bmdma_prepare_host);
  */
 int ata_pci_bmdma_init_one(struct pci_dev *pdev,
 			   const struct ata_port_info * const * ppi,
-			   struct scsi_host_template *sht, void *host_priv,
+			   const struct scsi_host_template *sht, void *host_priv,
 			   int hflags)
 {
 	return ata_pci_init_one(pdev, ppi, sht, host_priv, hflags, 1);
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 2cd6124a01e8..926d0d33cd29 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -111,7 +111,7 @@ static inline void ata_acpi_bind_dev(struct ata_device *dev) {}
 extern struct ata_device *ata_scsi_find_dev(struct ata_port *ap,
 					    const struct scsi_device *scsidev);
 extern int ata_scsi_add_hosts(struct ata_host *host,
-			      struct scsi_host_template *sht);
+			      const struct scsi_host_template *sht);
 extern void ata_scsi_scan_host(struct ata_port *ap, int sync);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
 extern bool ata_scsi_sense_is_valid(u8 sk, u8 asc, u8 ascq);
diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c
index f8706ee427d2..ab38871b5e00 100644
--- a/drivers/ata/pata_acpi.c
+++ b/drivers/ata/pata_acpi.c
@@ -205,7 +205,7 @@ static int pacpi_port_start(struct ata_port *ap)
 	return ata_bmdma_port_start(ap);
 }
 
-static struct scsi_host_template pacpi_sht = {
+static const struct scsi_host_template pacpi_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c
index 76ad0e73fe2a..bb790edd6036 100644
--- a/drivers/ata/pata_ali.c
+++ b/drivers/ata/pata_ali.c
@@ -355,7 +355,7 @@ static void ali_c2_c3_postreset(struct ata_link *link, unsigned int *classes)
 	ata_sff_postreset(link, classes);
 }
 
-static struct scsi_host_template ali_sht = {
+static const struct scsi_host_template ali_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c
index f216f9d7b9ec..5b02b89748b7 100644
--- a/drivers/ata/pata_amd.c
+++ b/drivers/ata/pata_amd.c
@@ -388,7 +388,7 @@ static void nv_host_stop(struct ata_host *host)
 	pci_write_config_dword(to_pci_dev(host->dev), 0x60, udma);
 }
 
-static struct scsi_host_template amd_sht = {
+static const struct scsi_host_template amd_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c
index e89617ed9175..6ab294322e79 100644
--- a/drivers/ata/pata_arasan_cf.c
+++ b/drivers/ata/pata_arasan_cf.c
@@ -218,7 +218,7 @@ struct arasan_cf_dev {
 	struct ata_queued_cmd *qc;
 };
 
-static struct scsi_host_template arasan_cf_sht = {
+static const struct scsi_host_template arasan_cf_sht = {
 	ATA_BASE_SHT(DRIVER_NAME),
 	.dma_boundary = 0xFFFFFFFFUL,
 };
diff --git a/drivers/ata/pata_artop.c b/drivers/ata/pata_artop.c
index 20a8f31a3f57..40544282f455 100644
--- a/drivers/ata/pata_artop.c
+++ b/drivers/ata/pata_artop.c
@@ -292,7 +292,7 @@ static int artop6210_qc_defer(struct ata_queued_cmd *qc)
 	return 0;
 }
 
-static struct scsi_host_template artop_sht = {
+static const struct scsi_host_template artop_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c
index efdb94cff68b..8c5cc803aab3 100644
--- a/drivers/ata/pata_atiixp.c
+++ b/drivers/ata/pata_atiixp.c
@@ -251,7 +251,7 @@ static void atiixp_bmdma_stop(struct ata_queued_cmd *qc)
 	ata_bmdma_stop(qc);
 }
 
-static struct scsi_host_template atiixp_sht = {
+static const struct scsi_host_template atiixp_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= LIBATA_DUMB_MAX_PRD,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/pata_atp867x.c b/drivers/ata/pata_atp867x.c
index 779d660415c8..aaef5924f636 100644
--- a/drivers/ata/pata_atp867x.c
+++ b/drivers/ata/pata_atp867x.c
@@ -259,7 +259,7 @@ static int atp867x_cable_detect(struct ata_port *ap)
 	return ATA_CBL_PATA_UNK;
 }
 
-static struct scsi_host_template atp867x_sht = {
+static const struct scsi_host_template atp867x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_buddha.c b/drivers/ata/pata_buddha.c
index 27d4c417fc60..49bc619b83e2 100644
--- a/drivers/ata/pata_buddha.c
+++ b/drivers/ata/pata_buddha.c
@@ -57,7 +57,7 @@ static unsigned int xsurf_bases[2] = {
 	XSURF_BASE1, XSURF_BASE2
 };
 
-static struct scsi_host_template pata_buddha_sht = {
+static const struct scsi_host_template pata_buddha_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_cmd640.c b/drivers/ata/pata_cmd640.c
index 1a3372a72213..45a7217b136e 100644
--- a/drivers/ata/pata_cmd640.c
+++ b/drivers/ata/pata_cmd640.c
@@ -172,7 +172,7 @@ static bool cmd640_sff_irq_check(struct ata_port *ap)
 	return irq_stat & irq_mask;
 }
 
-static struct scsi_host_template cmd640_sht = {
+static const struct scsi_host_template cmd640_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_cmd64x.c b/drivers/ata/pata_cmd64x.c
index 5baa4a7819c1..fafea2b79145 100644
--- a/drivers/ata/pata_cmd64x.c
+++ b/drivers/ata/pata_cmd64x.c
@@ -319,7 +319,7 @@ static void cmd646r1_bmdma_stop(struct ata_queued_cmd *qc)
 	ata_bmdma_stop(qc);
 }
 
-static struct scsi_host_template cmd64x_sht = {
+static const struct scsi_host_template cmd64x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_cs5520.c b/drivers/ata/pata_cs5520.c
index f4289a532f87..422d42761a1d 100644
--- a/drivers/ata/pata_cs5520.c
+++ b/drivers/ata/pata_cs5520.c
@@ -94,7 +94,7 @@ static void cs5520_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	cs5520_set_timings(ap, adev, adev->pio_mode);
 }
 
-static struct scsi_host_template cs5520_sht = {
+static const struct scsi_host_template cs5520_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= LIBATA_DUMB_MAX_PRD,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/pata_cs5530.c b/drivers/ata/pata_cs5530.c
index d5b7ac14e78f..1e67b0f8db43 100644
--- a/drivers/ata/pata_cs5530.c
+++ b/drivers/ata/pata_cs5530.c
@@ -146,7 +146,7 @@ static unsigned int cs5530_qc_issue(struct ata_queued_cmd *qc)
 	return ata_bmdma_qc_issue(qc);
 }
 
-static struct scsi_host_template cs5530_sht = {
+static const struct scsi_host_template cs5530_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize	= LIBATA_DUMB_MAX_PRD,
 	.dma_boundary	= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/pata_cs5535.c b/drivers/ata/pata_cs5535.c
index c2c3238ff84b..d793fc441b46 100644
--- a/drivers/ata/pata_cs5535.c
+++ b/drivers/ata/pata_cs5535.c
@@ -141,7 +141,7 @@ static void cs5535_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	wrmsr(ATAC_CH0D0_DMA + 2 * adev->devno, reg, 0);
 }
 
-static struct scsi_host_template cs5535_sht = {
+static const struct scsi_host_template cs5535_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c
index ab47aeb5587f..b811efd2cc34 100644
--- a/drivers/ata/pata_cs5536.c
+++ b/drivers/ata/pata_cs5536.c
@@ -217,7 +217,7 @@ static void cs5536_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	cs5536_write(pdev, ETC, etc);
 }
 
-static struct scsi_host_template cs5536_sht = {
+static const struct scsi_host_template cs5536_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_cypress.c b/drivers/ata/pata_cypress.c
index 3be5d52a777b..ae347b5c2871 100644
--- a/drivers/ata/pata_cypress.c
+++ b/drivers/ata/pata_cypress.c
@@ -115,7 +115,7 @@ static void cy82c693_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	outb(0x50, 0x23);
 }
 
-static struct scsi_host_template cy82c693_sht = {
+static const struct scsi_host_template cy82c693_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c
index 21da59f35b41..2e6eccf2902f 100644
--- a/drivers/ata/pata_efar.c
+++ b/drivers/ata/pata_efar.c
@@ -234,7 +234,7 @@ static void efar_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 	spin_unlock_irqrestore(&efar_lock, flags);
 }
 
-static struct scsi_host_template efar_sht = {
+static const struct scsi_host_template efar_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ep93xx.c b/drivers/ata/pata_ep93xx.c
index 47845d920075..c6e043e05d43 100644
--- a/drivers/ata/pata_ep93xx.c
+++ b/drivers/ata/pata_ep93xx.c
@@ -872,7 +872,7 @@ static int ep93xx_pata_port_start(struct ata_port *ap)
 	return 0;
 }
 
-static struct scsi_host_template ep93xx_pata_sht = {
+static const struct scsi_host_template ep93xx_pata_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	/* ep93xx dma implementation limit */
 	.sg_tablesize		= 32,
diff --git a/drivers/ata/pata_falcon.c b/drivers/ata/pata_falcon.c
index 823c88622e34..996516e64f13 100644
--- a/drivers/ata/pata_falcon.c
+++ b/drivers/ata/pata_falcon.c
@@ -33,7 +33,7 @@
 #define DRV_NAME "pata_falcon"
 #define DRV_VERSION "0.1.0"
 
-static struct scsi_host_template pata_falcon_sht = {
+static const struct scsi_host_template pata_falcon_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ftide010.c b/drivers/ata/pata_ftide010.c
index 88924b5daa1a..6f6734c09b11 100644
--- a/drivers/ata/pata_ftide010.c
+++ b/drivers/ata/pata_ftide010.c
@@ -84,7 +84,7 @@ struct ftide010 {
 #define FTIDE010_CLK_MOD_DEV0_UDMA_EN	BIT(4)
 #define FTIDE010_CLK_MOD_DEV1_UDMA_EN	BIT(5)
 
-static struct scsi_host_template pata_ftide010_sht = {
+static const struct scsi_host_template pata_ftide010_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_gayle.c b/drivers/ata/pata_gayle.c
index 65bc9f3042ce..e5aa07f92106 100644
--- a/drivers/ata/pata_gayle.c
+++ b/drivers/ata/pata_gayle.c
@@ -35,7 +35,7 @@
 
 #define GAYLE_CONTROL	0x101a
 
-static struct scsi_host_template pata_gayle_sht = {
+static const struct scsi_host_template pata_gayle_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index 7e441fb304d3..bdccd1ba1524 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -312,7 +312,7 @@ static int hpt366_prereset(struct ata_link *link, unsigned long deadline)
 	return ata_sff_prereset(link, deadline);
 }
 
-static struct scsi_host_template hpt36x_sht = {
+static const struct scsi_host_template hpt36x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c
index ce3c5eaa7e76..c0329cf01135 100644
--- a/drivers/ata/pata_hpt37x.c
+++ b/drivers/ata/pata_hpt37x.c
@@ -526,7 +526,7 @@ static void hpt37x_bmdma_stop(struct ata_queued_cmd *qc)
 }
 
 
-static struct scsi_host_template hpt37x_sht = {
+static const struct scsi_host_template hpt37x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_hpt3x2n.c b/drivers/ata/pata_hpt3x2n.c
index 617c95522f43..5b1ecccf3c83 100644
--- a/drivers/ata/pata_hpt3x2n.c
+++ b/drivers/ata/pata_hpt3x2n.c
@@ -337,7 +337,7 @@ static unsigned int hpt3x2n_qc_issue(struct ata_queued_cmd *qc)
 	return ata_bmdma_qc_issue(qc);
 }
 
-static struct scsi_host_template hpt3x2n_sht = {
+static const struct scsi_host_template hpt3x2n_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_hpt3x3.c b/drivers/ata/pata_hpt3x3.c
index 83974d5eb387..d65c586b5ad0 100644
--- a/drivers/ata/pata_hpt3x3.c
+++ b/drivers/ata/pata_hpt3x3.c
@@ -136,7 +136,7 @@ static int hpt3x3_atapi_dma(struct ata_queued_cmd *qc)
 
 #endif /* CONFIG_PATA_HPT3X3_DMA */
 
-static struct scsi_host_template hpt3x3_sht = {
+static const struct scsi_host_template hpt3x3_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index 498383cb6e29..9cfb064782c3 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -298,7 +298,7 @@ static int icside_dma_init(struct pata_icside_info *info)
 }
 
 
-static struct scsi_host_template pata_icside_sht = {
+static const struct scsi_host_template pata_icside_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= SG_MAX_SEGMENTS,
 	.dma_boundary		= IOMD_DMA_BOUNDARY,
diff --git a/drivers/ata/pata_imx.c b/drivers/ata/pata_imx.c
index 150939275b1b..4013f28679a9 100644
--- a/drivers/ata/pata_imx.c
+++ b/drivers/ata/pata_imx.c
@@ -97,7 +97,7 @@ static void pata_imx_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	__raw_writel(val, priv->host_regs + PATA_IMX_ATA_CONTROL);
 }
 
-static struct scsi_host_template pata_imx_sht = {
+static const struct scsi_host_template pata_imx_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_isapnp.c b/drivers/ata/pata_isapnp.c
index 43bb224430d3..25a63d043c8e 100644
--- a/drivers/ata/pata_isapnp.c
+++ b/drivers/ata/pata_isapnp.c
@@ -20,7 +20,7 @@
 #define DRV_NAME "pata_isapnp"
 #define DRV_VERSION "0.2.5"
 
-static struct scsi_host_template isapnp_sht = {
+static const struct scsi_host_template isapnp_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_it8213.c b/drivers/ata/pata_it8213.c
index 8a3e8778163c..b7ac56103c8a 100644
--- a/drivers/ata/pata_it8213.c
+++ b/drivers/ata/pata_it8213.c
@@ -228,7 +228,7 @@ static void it8213_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 	pci_write_config_byte(dev, 0x48, udma_enable);
 }
 
-static struct scsi_host_template it8213_sht = {
+static const struct scsi_host_template it8213_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c
index 8a5b4e0079ab..2fe3fb6102ce 100644
--- a/drivers/ata/pata_it821x.c
+++ b/drivers/ata/pata_it821x.c
@@ -800,7 +800,7 @@ static int it821x_rdc_cable(struct ata_port *ap)
 	return ATA_CBL_PATA80;
 }
 
-static struct scsi_host_template it821x_sht = {
+static const struct scsi_host_template it821x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c
index e225913a619d..9a2c1b6cd71f 100644
--- a/drivers/ata/pata_ixp4xx_cf.c
+++ b/drivers/ata/pata_ixp4xx_cf.c
@@ -173,7 +173,7 @@ static unsigned int ixp4xx_mmio_data_xfer(struct ata_queued_cmd *qc,
 	return words << 1;
 }
 
-static struct scsi_host_template ixp4xx_sht = {
+static const struct scsi_host_template ixp4xx_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_jmicron.c b/drivers/ata/pata_jmicron.c
index d1b3ce8958dd..f51fb8219762 100644
--- a/drivers/ata/pata_jmicron.c
+++ b/drivers/ata/pata_jmicron.c
@@ -107,7 +107,7 @@ static int jmicron_pre_reset(struct ata_link *link, unsigned long deadline)
 
 /* No PIO or DMA methods needed for this device */
 
-static struct scsi_host_template jmicron_sht = {
+static const struct scsi_host_template jmicron_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index 03c580625c2c..448a511cbc17 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -250,7 +250,7 @@ static int legacy_set_mode(struct ata_link *link, struct ata_device **unused)
 	return 0;
 }
 
-static struct scsi_host_template legacy_sht = {
+static const struct scsi_host_template legacy_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c
index 9ccaac9e2bc3..c4d86ea049f0 100644
--- a/drivers/ata/pata_macio.c
+++ b/drivers/ata/pata_macio.c
@@ -908,7 +908,7 @@ static int pata_macio_do_resume(struct pata_macio_priv *priv)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-static struct scsi_host_template pata_macio_sht = {
+static const struct scsi_host_template pata_macio_sht = {
 	__ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= MAX_DCMDS,
 	/* We may not need that strict one */
diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c
index 014ccb0f45dc..8119caaad605 100644
--- a/drivers/ata/pata_marvell.c
+++ b/drivers/ata/pata_marvell.c
@@ -92,7 +92,7 @@ static int marvell_cable_detect(struct ata_port *ap)
 
 /* No PIO or DMA methods needed for this device */
 
-static struct scsi_host_template marvell_sht = {
+static const struct scsi_host_template marvell_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_mpc52xx.c b/drivers/ata/pata_mpc52xx.c
index 3ebd6522a1fd..66c9dea4ea6e 100644
--- a/drivers/ata/pata_mpc52xx.c
+++ b/drivers/ata/pata_mpc52xx.c
@@ -606,7 +606,7 @@ mpc52xx_ata_task_irq(int irq, void *vpriv)
 	return IRQ_HANDLED;
 }
 
-static struct scsi_host_template mpc52xx_ata_sht = {
+static const struct scsi_host_template mpc52xx_ata_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_mpiix.c b/drivers/ata/pata_mpiix.c
index 8fda0e32c1ab..69e4baf27d72 100644
--- a/drivers/ata/pata_mpiix.c
+++ b/drivers/ata/pata_mpiix.c
@@ -136,7 +136,7 @@ static unsigned int mpiix_qc_issue(struct ata_queued_cmd *qc)
 	return ata_sff_qc_issue(qc);
 }
 
-static struct scsi_host_template mpiix_sht = {
+static const struct scsi_host_template mpiix_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_netcell.c b/drivers/ata/pata_netcell.c
index 06929e77c491..c0b2897fcf40 100644
--- a/drivers/ata/pata_netcell.c
+++ b/drivers/ata/pata_netcell.c
@@ -31,7 +31,7 @@ static unsigned int netcell_read_id(struct ata_device *adev,
 	return err_mask;
 }
 
-static struct scsi_host_template netcell_sht = {
+static const struct scsi_host_template netcell_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ninja32.c b/drivers/ata/pata_ninja32.c
index f9255d6fd194..76a91013d27d 100644
--- a/drivers/ata/pata_ninja32.c
+++ b/drivers/ata/pata_ninja32.c
@@ -77,7 +77,7 @@ static void ninja32_dev_select(struct ata_port *ap, unsigned int device)
 	}
 }
 
-static struct scsi_host_template ninja32_sht = {
+static const struct scsi_host_template ninja32_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ns87410.c b/drivers/ata/pata_ns87410.c
index ca3ab2736fef..44cc24d21d5f 100644
--- a/drivers/ata/pata_ns87410.c
+++ b/drivers/ata/pata_ns87410.c
@@ -114,7 +114,7 @@ static unsigned int ns87410_qc_issue(struct ata_queued_cmd *qc)
 	return ata_sff_qc_issue(qc);
 }
 
-static struct scsi_host_template ns87410_sht = {
+static const struct scsi_host_template ns87410_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_ns87415.c b/drivers/ata/pata_ns87415.c
index 9dd6bffefb48..d60e1f69d7b0 100644
--- a/drivers/ata/pata_ns87415.c
+++ b/drivers/ata/pata_ns87415.c
@@ -320,7 +320,7 @@ static struct ata_port_operations ns87560_pata_ops = {
 };
 #endif
 
-static struct scsi_host_template ns87415_sht = {
+static const struct scsi_host_template ns87415_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index 4cbcdc5da038..b1ce9f1761af 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -58,7 +58,7 @@ struct octeon_cf_port {
 	u64 dma_base;
 };
 
-static struct scsi_host_template octeon_cf_sht = {
+static const struct scsi_host_template octeon_cf_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c
index ac5a633c00a5..178b28eff170 100644
--- a/drivers/ata/pata_of_platform.c
+++ b/drivers/ata/pata_of_platform.c
@@ -15,7 +15,7 @@
 
 #define DRV_NAME "pata_of_platform"
 
-static struct scsi_host_template pata_platform_sht = {
+static const struct scsi_host_template pata_platform_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_oldpiix.c b/drivers/ata/pata_oldpiix.c
index 22a020374410..dca82d92b004 100644
--- a/drivers/ata/pata_oldpiix.c
+++ b/drivers/ata/pata_oldpiix.c
@@ -204,7 +204,7 @@ static unsigned int oldpiix_qc_issue(struct ata_queued_cmd *qc)
 }
 
 
-static struct scsi_host_template oldpiix_sht = {
+static const struct scsi_host_template oldpiix_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_opti.c b/drivers/ata/pata_opti.c
index 01976c4e4033..3d23f57eb128 100644
--- a/drivers/ata/pata_opti.c
+++ b/drivers/ata/pata_opti.c
@@ -148,7 +148,7 @@ static void opti_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	opti_write_reg(ap, 0x85, CNTRL_REG);
 }
 
-static struct scsi_host_template opti_sht = {
+static const struct scsi_host_template opti_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_optidma.c b/drivers/ata/pata_optidma.c
index ad1090b90e52..dfc36b4ec9c6 100644
--- a/drivers/ata/pata_optidma.c
+++ b/drivers/ata/pata_optidma.c
@@ -334,7 +334,7 @@ static int optidma_set_mode(struct ata_link *link, struct ata_device **r_failed)
 	return rc;
 }
 
-static struct scsi_host_template optidma_sht = {
+static const struct scsi_host_template optidma_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 294a266a0dda..02a62310845a 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -398,7 +398,7 @@ static struct device pata_parport_bus = {
 	.release = pata_parport_bus_release,
 };
 
-static struct scsi_host_template pata_parport_sht = {
+static const struct scsi_host_template pata_parport_sht = {
 	PATA_PARPORT_SHT("pata_parport")
 };
 
diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c
index 8eb066abbd9c..5b602206c522 100644
--- a/drivers/ata/pata_pcmcia.c
+++ b/drivers/ata/pata_pcmcia.c
@@ -132,7 +132,7 @@ static void pcmcia_8bit_drain_fifo(struct ata_queued_cmd *qc)
 
 }
 
-static struct scsi_host_template pcmcia_sht = {
+static const struct scsi_host_template pcmcia_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index 4191aa61c8e4..6820c5597b14 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -122,7 +122,7 @@ static struct pci_driver pdc2027x_pci_driver = {
 #endif
 };
 
-static struct scsi_host_template pdc2027x_sht = {
+static const struct scsi_host_template pdc2027x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_pdc202xx_old.c b/drivers/ata/pata_pdc202xx_old.c
index f894ff2de0a9..a32723e46357 100644
--- a/drivers/ata/pata_pdc202xx_old.c
+++ b/drivers/ata/pata_pdc202xx_old.c
@@ -289,7 +289,7 @@ static int pdc2026x_check_atapi_dma(struct ata_queued_cmd *qc)
 	return 1;
 }
 
-static struct scsi_host_template pdc202xx_sht = {
+static const struct scsi_host_template pdc202xx_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_piccolo.c b/drivers/ata/pata_piccolo.c
index 389b63b13c70..ced906bf56be 100644
--- a/drivers/ata/pata_piccolo.c
+++ b/drivers/ata/pata_piccolo.c
@@ -62,7 +62,7 @@ static void tosh_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 }
 
 
-static struct scsi_host_template tosh_sht = {
+static const struct scsi_host_template tosh_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index 21fb059859bd..87479bc893b2 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -45,7 +45,7 @@ static int pata_platform_set_mode(struct ata_link *link, struct ata_device **unu
 	return 0;
 }
 
-static struct scsi_host_template pata_platform_sht = {
+static const struct scsi_host_template pata_platform_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
@@ -97,7 +97,7 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr,
 int __pata_platform_probe(struct device *dev, struct resource *io_res,
 			  struct resource *ctl_res, struct resource *irq_res,
 			  unsigned int ioport_shift, int __pio_mask,
-			  struct scsi_host_template *sht, bool use16bit)
+			  const struct scsi_host_template *sht, bool use16bit)
 {
 	struct ata_host *host;
 	struct ata_port *ap;
diff --git a/drivers/ata/pata_pxa.c b/drivers/ata/pata_pxa.c
index 985f42c4fd70..ea402e02c46e 100644
--- a/drivers/ata/pata_pxa.c
+++ b/drivers/ata/pata_pxa.c
@@ -136,7 +136,7 @@ static int pxa_check_atapi_dma(struct ata_queued_cmd *qc)
 	return -EOPNOTSUPP;
 }
 
-static struct scsi_host_template pxa_ata_sht = {
+static const struct scsi_host_template pxa_ata_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_radisys.c b/drivers/ata/pata_radisys.c
index 3aca8fe3fdb6..84b001097093 100644
--- a/drivers/ata/pata_radisys.c
+++ b/drivers/ata/pata_radisys.c
@@ -183,7 +183,7 @@ static unsigned int radisys_qc_issue(struct ata_queued_cmd *qc)
 }
 
 
-static struct scsi_host_template radisys_sht = {
+static const struct scsi_host_template radisys_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c
index 2e110aefe59b..3974d294a341 100644
--- a/drivers/ata/pata_rb532_cf.c
+++ b/drivers/ata/pata_rb532_cf.c
@@ -73,7 +73,7 @@ static struct ata_port_operations rb532_pata_port_ops = {
 
 /* ------------------------------------------------------------------------ */
 
-static struct scsi_host_template rb532_pata_sht = {
+static const struct scsi_host_template rb532_pata_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c
index ecb229c2c1a2..0a9689862f71 100644
--- a/drivers/ata/pata_rdc.c
+++ b/drivers/ata/pata_rdc.c
@@ -288,7 +288,7 @@ static const struct ata_port_info rdc_port_info = {
 	.port_ops	= &rdc_pata_ops,
 };
 
-static struct scsi_host_template rdc_sht = {
+static const struct scsi_host_template rdc_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c
index fb00c3e5fd19..8e2606793091 100644
--- a/drivers/ata/pata_rz1000.c
+++ b/drivers/ata/pata_rz1000.c
@@ -50,7 +50,7 @@ static int rz1000_set_mode(struct ata_link *link, struct ata_device **unused)
 }
 
 
-static struct scsi_host_template rz1000_sht = {
+static const struct scsi_host_template rz1000_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_sc1200.c b/drivers/ata/pata_sc1200.c
index f28daf62a37d..a388dfb97ad8 100644
--- a/drivers/ata/pata_sc1200.c
+++ b/drivers/ata/pata_sc1200.c
@@ -192,7 +192,7 @@ static int sc1200_qc_defer(struct ata_queued_cmd *qc)
 	return 0;
 }
 
-static struct scsi_host_template sc1200_sht = {
+static const struct scsi_host_template sc1200_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize	= LIBATA_DUMB_MAX_PRD,
 	.dma_boundary	= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/pata_sch.c b/drivers/ata/pata_sch.c
index 4f9c2aefd807..8356f1f2a025 100644
--- a/drivers/ata/pata_sch.c
+++ b/drivers/ata/pata_sch.c
@@ -57,7 +57,7 @@ static struct pci_driver sch_pci_driver = {
 #endif
 };
 
-static struct scsi_host_template sch_sht = {
+static const struct scsi_host_template sch_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_serverworks.c b/drivers/ata/pata_serverworks.c
index c0bc4af0d196..549ff24a9823 100644
--- a/drivers/ata/pata_serverworks.c
+++ b/drivers/ata/pata_serverworks.c
@@ -252,13 +252,13 @@ static void serverworks_set_dmamode(struct ata_port *ap, struct ata_device *adev
 	pci_write_config_byte(pdev, 0x54, ultra_cfg);
 }
 
-static struct scsi_host_template serverworks_osb4_sht = {
+static const struct scsi_host_template serverworks_osb4_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize	= LIBATA_DUMB_MAX_PRD,
 	.dma_boundary	= ATA_DMA_BOUNDARY,
 };
 
-static struct scsi_host_template serverworks_csb_sht = {
+static const struct scsi_host_template serverworks_csb_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
@@ -413,7 +413,7 @@ static int serverworks_init_one(struct pci_dev *pdev, const struct pci_device_id
 		}
 	};
 	const struct ata_port_info *ppi[] = { &info[id->driver_data], NULL };
-	struct scsi_host_template *sht = &serverworks_csb_sht;
+	const struct scsi_host_template *sht = &serverworks_csb_sht;
 	int rc;
 
 	rc = pcim_enable_device(pdev);
diff --git a/drivers/ata/pata_sil680.c b/drivers/ata/pata_sil680.c
index 67ef2e26d7df..abe64b5f83cf 100644
--- a/drivers/ata/pata_sil680.c
+++ b/drivers/ata/pata_sil680.c
@@ -223,7 +223,7 @@ static bool sil680_sff_irq_check(struct ata_port *ap)
 	return val & 0x08;
 }
 
-static struct scsi_host_template sil680_sht = {
+static const struct scsi_host_template sil680_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c
index 92e4cf05de2c..31de06b66221 100644
--- a/drivers/ata/pata_sis.c
+++ b/drivers/ata/pata_sis.c
@@ -539,7 +539,7 @@ static unsigned int sis_133_mode_filter(struct ata_device *adev, unsigned int ma
 	return mask;
 }
 
-static struct scsi_host_template sis_sht = {
+static const struct scsi_host_template sis_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_sl82c105.c b/drivers/ata/pata_sl82c105.c
index 8487470e2e01..3b62ea482f1a 100644
--- a/drivers/ata/pata_sl82c105.c
+++ b/drivers/ata/pata_sl82c105.c
@@ -238,7 +238,7 @@ static bool sl82c105_sff_irq_check(struct ata_port *ap)
 	return val & mask;
 }
 
-static struct scsi_host_template sl82c105_sht = {
+static const struct scsi_host_template sl82c105_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_triflex.c b/drivers/ata/pata_triflex.c
index 782162d2f3f8..26d448a869e2 100644
--- a/drivers/ata/pata_triflex.c
+++ b/drivers/ata/pata_triflex.c
@@ -160,7 +160,7 @@ static void triflex_bmdma_stop(struct ata_queued_cmd *qc)
 	triflex_load_timing(qc->ap, qc->dev, qc->dev->pio_mode);
 }
 
-static struct scsi_host_template triflex_sht = {
+static const struct scsi_host_template triflex_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 34f00f389932..696b99720dcb 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -443,7 +443,7 @@ static int via_port_start(struct ata_port *ap)
 	return 0;
 }
 
-static struct scsi_host_template via_sht = {
+static const struct scsi_host_template via_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/pdc_adma.c b/drivers/ata/pdc_adma.c
index 35b823ac20c9..8e6b2599f0d5 100644
--- a/drivers/ata/pdc_adma.c
+++ b/drivers/ata/pdc_adma.c
@@ -123,7 +123,7 @@ static void adma_freeze(struct ata_port *ap);
 static void adma_thaw(struct ata_port *ap);
 static int adma_prereset(struct ata_link *link, unsigned long deadline);
 
-static struct scsi_host_template adma_ata_sht = {
+static const struct scsi_host_template adma_ata_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= LIBATA_MAX_PRD,
 	.dma_boundary		= ADMA_DMA_BOUNDARY,
diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 21d77633a98f..24334a8a3f0b 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -1076,7 +1076,7 @@ static void sata_dwc_dev_select(struct ata_port *ap, unsigned int device)
 /*
  * scsi mid-layer and libata interface structures
  */
-static struct scsi_host_template sata_dwc_sht = {
+static const struct scsi_host_template sata_dwc_sht = {
 	ATA_NCQ_SHT(DRV_NAME),
 	/*
 	 * test-only: Currently this driver doesn't handle NCQ
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index b052c5a65c17..ccd99b9aa9ff 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1376,7 +1376,7 @@ static void sata_fsl_host_stop(struct ata_host *host)
 /*
  * scsi mid-layer and libata interface structures
  */
-static struct scsi_host_template sata_fsl_sht = {
+static const struct scsi_host_template sata_fsl_sht = {
 	ATA_NCQ_SHT_QD("sata_fsl", SATA_FSL_QUEUE_DEPTH),
 	.sg_tablesize = SATA_FSL_MAX_PRD_USABLE,
 	.dma_boundary = ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index dfbf9493e451..8237ece4a46f 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -438,7 +438,7 @@ static const struct ata_port_info ahci_highbank_port_info = {
 	.port_ops       = &ahci_highbank_ops,
 };
 
-static struct scsi_host_template ahci_highbank_platform_sht = {
+static const struct scsi_host_template ahci_highbank_platform_sht = {
 	AHCI_SHT("sata_highbank"),
 };
 
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 2833c722118d..2c8c78ed86c1 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -242,7 +242,7 @@ struct inic_port_priv {
 	dma_addr_t	cpb_tbl_dma;
 };
 
-static struct scsi_host_template inic_sht = {
+static const struct scsi_host_template inic_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= LIBATA_MAX_PRD, /* maybe it can be larger? */
 
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index e3cff01201b8..d404e631d152 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -659,13 +659,13 @@ static u8 mv_sff_check_status(struct ata_port *ap);
  * PRDs for 64K boundaries in mv_fill_sg().
  */
 #ifdef CONFIG_PCI
-static struct scsi_host_template mv5_sht = {
+static const struct scsi_host_template mv5_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= MV_MAX_SG_CT / 2,
 	.dma_boundary		= MV_DMA_BOUNDARY,
 };
 #endif
-static struct scsi_host_template mv6_sht = {
+static const struct scsi_host_template mv6_sht = {
 	__ATA_BASE_SHT(DRV_NAME),
 	.can_queue		= MV_MAX_Q_DEPTH - 1,
 	.sg_tablesize		= MV_MAX_SG_CT / 2,
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 9b2d289e89e1..abf5651c87ab 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -371,11 +371,11 @@ static struct pci_driver nv_pci_driver = {
 	.remove			= ata_pci_remove_one,
 };
 
-static struct scsi_host_template nv_sht = {
+static const struct scsi_host_template nv_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
-static struct scsi_host_template nv_adma_sht = {
+static const struct scsi_host_template nv_adma_sht = {
 	__ATA_BASE_SHT(DRV_NAME),
 	.can_queue		= NV_ADMA_MAX_CPBS,
 	.sg_tablesize		= NV_ADMA_SGTBL_TOTAL_LEN,
@@ -386,7 +386,7 @@ static struct scsi_host_template nv_adma_sht = {
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,
 };
 
-static struct scsi_host_template nv_swncq_sht = {
+static const struct scsi_host_template nv_swncq_sht = {
 	__ATA_BASE_SHT(DRV_NAME),
 	.can_queue		= ATA_MAX_QUEUE - 1,
 	.sg_tablesize		= LIBATA_MAX_PRD,
@@ -520,7 +520,7 @@ static struct ata_port_operations nv_swncq_ops = {
 
 struct nv_pi_priv {
 	irq_handler_t			irq_handler;
-	struct scsi_host_template	*sht;
+	const struct scsi_host_template	*sht;
 };
 
 #define NV_PI_PRIV(_irq_handler, _sht) \
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 4e60e6c4c35a..2df1a070b25a 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -158,7 +158,7 @@ static void pdc_error_handler(struct ata_port *ap);
 static void pdc_post_internal_cmd(struct ata_queued_cmd *qc);
 static int pdc_pata_cable_detect(struct ata_port *ap);
 
-static struct scsi_host_template pdc_ata_sht = {
+static const struct scsi_host_template pdc_ata_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= PDC_MAX_PRD,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c
index 8ca0810aad26..8a6286159044 100644
--- a/drivers/ata/sata_qstor.c
+++ b/drivers/ata/sata_qstor.c
@@ -108,7 +108,7 @@ static void qs_thaw(struct ata_port *ap);
 static int qs_prereset(struct ata_link *link, unsigned long deadline);
 static void qs_error_handler(struct ata_port *ap);
 
-static struct scsi_host_template qs_ata_sht = {
+static const struct scsi_host_template qs_ata_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= QS_MAX_PRD,
 	.dma_boundary		= QS_DMA_BOUNDARY,
diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c
index 0195eb29f6c2..34790f15c1b8 100644
--- a/drivers/ata/sata_rcar.c
+++ b/drivers/ata/sata_rcar.c
@@ -608,7 +608,7 @@ static u8 sata_rcar_bmdma_status(struct ata_port *ap)
 	return host_stat;
 }
 
-static struct scsi_host_template sata_rcar_sht = {
+static const struct scsi_host_template sata_rcar_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	/*
 	 * This controller allows transfer chunks up to 512MB which cross 64KB
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 3b989a52879d..cc77c0248284 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -156,7 +156,7 @@ static struct pci_driver sil_pci_driver = {
 #endif
 };
 
-static struct scsi_host_template sil_sht = {
+static const struct scsi_host_template sil_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	/** These controllers support Large Block Transfer which allows
 	    transfer chunks up to 2GB and which cross 64KB boundaries,
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 22cc9e9789dd..e72a0257990d 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -373,7 +373,7 @@ static struct pci_driver sil24_pci_driver = {
 #endif
 };
 
-static struct scsi_host_template sil24_sht = {
+static const struct scsi_host_template sil24_sht = {
 	__ATA_BASE_SHT(DRV_NAME),
 	.can_queue		= SIL24_MAX_CMDS,
 	.sg_tablesize		= SIL24_MAX_SGE,
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index 316237362aa9..ef8724986de3 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -72,7 +72,7 @@ static struct pci_driver sis_pci_driver = {
 #endif
 };
 
-static struct scsi_host_template sis_sht = {
+static const struct scsi_host_template sis_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/sata_svw.c b/drivers/ata/sata_svw.c
index 2e3418a82b44..c47c3fb434d5 100644
--- a/drivers/ata/sata_svw.c
+++ b/drivers/ata/sata_svw.c
@@ -330,7 +330,7 @@ static int k2_sata_show_info(struct seq_file *m, struct Scsi_Host *shost)
 	return 0;
 }
 
-static struct scsi_host_template k2_sata_sht = {
+static const struct scsi_host_template k2_sata_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 	.show_info		= k2_sata_show_info,
 };
diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
index a92c60455b1d..ccc016072637 100644
--- a/drivers/ata/sata_sx4.c
+++ b/drivers/ata/sata_sx4.c
@@ -226,7 +226,7 @@ static void pdc_post_internal_cmd(struct ata_queued_cmd *qc);
 static int pdc_check_atapi_dma(struct ata_queued_cmd *qc);
 
 
-static struct scsi_host_template pdc_sata_sht = {
+static const struct scsi_host_template pdc_sata_sht = {
 	ATA_BASE_SHT(DRV_NAME),
 	.sg_tablesize		= LIBATA_MAX_PRD,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
diff --git a/drivers/ata/sata_uli.c b/drivers/ata/sata_uli.c
index 815e6af75310..60ea45926cd1 100644
--- a/drivers/ata/sata_uli.c
+++ b/drivers/ata/sata_uli.c
@@ -59,7 +59,7 @@ static struct pci_driver uli_pci_driver = {
 	.remove			= ata_pci_remove_one,
 };
 
-static struct scsi_host_template uli_sht = {
+static const struct scsi_host_template uli_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c
index c7891cc84ea0..57cbf2cef618 100644
--- a/drivers/ata/sata_via.c
+++ b/drivers/ata/sata_via.c
@@ -107,7 +107,7 @@ static struct pci_driver svia_pci_driver = {
 	.remove			= ata_pci_remove_one,
 };
 
-static struct scsi_host_template svia_sht = {
+static const struct scsi_host_template svia_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/drivers/ata/sata_vsc.c b/drivers/ata/sata_vsc.c
index 87e4ed66b306..d39b87537168 100644
--- a/drivers/ata/sata_vsc.c
+++ b/drivers/ata/sata_vsc.c
@@ -277,7 +277,7 @@ out:
 }
 
 
-static struct scsi_host_template vsc_sata_sht = {
+static const struct scsi_host_template vsc_sata_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h
index 17fa26215292..fe0760ce34c8 100644
--- a/include/linux/ahci_platform.h
+++ b/include/linux/ahci_platform.h
@@ -37,7 +37,7 @@ struct ahci_host_priv *ahci_platform_get_resources(
 int ahci_platform_init_host(struct platform_device *pdev,
 			    struct ahci_host_priv *hpriv,
 			    const struct ata_port_info *pi_template,
-			    struct scsi_host_template *sht);
+			    const struct scsi_host_template *sht);
 
 void ahci_platform_shutdown(struct platform_device *pdev);
 
diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h
index 9cafec92282d..b9745cc08e38 100644
--- a/include/linux/ata_platform.h
+++ b/include/linux/ata_platform.h
@@ -19,7 +19,7 @@ extern int __pata_platform_probe(struct device *dev,
 				 struct resource *irq_res,
 				 unsigned int ioport_shift,
 				 int __pio_mask,
-				 struct scsi_host_template *sht,
+				 const struct scsi_host_template *sht,
 				 bool use16bit);
 
 /*
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a759dfbdcc91..311cd93377c7 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1065,10 +1065,10 @@ extern void ata_host_get(struct ata_host *host);
 extern void ata_host_put(struct ata_host *host);
 extern int ata_host_start(struct ata_host *host);
 extern int ata_host_register(struct ata_host *host,
-			     struct scsi_host_template *sht);
+			     const struct scsi_host_template *sht);
 extern int ata_host_activate(struct ata_host *host, int irq,
 			     irq_handler_t irq_handler, unsigned long irq_flags,
-			     struct scsi_host_template *sht);
+			     const struct scsi_host_template *sht);
 extern void ata_host_detach(struct ata_host *host);
 extern void ata_host_init(struct ata_host *, struct device *, struct ata_port_operations *);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
@@ -1980,10 +1980,10 @@ extern int ata_pci_sff_prepare_host(struct pci_dev *pdev,
 				    struct ata_host **r_host);
 extern int ata_pci_sff_activate_host(struct ata_host *host,
 				     irq_handler_t irq_handler,
-				     struct scsi_host_template *sht);
+				     const struct scsi_host_template *sht);
 extern int ata_pci_sff_init_one(struct pci_dev *pdev,
 		const struct ata_port_info * const * ppi,
-		struct scsi_host_template *sht, void *host_priv, int hflags);
+		const struct scsi_host_template *sht, void *host_priv, int hflags);
 #endif /* CONFIG_PCI */
 
 #ifdef CONFIG_ATA_BMDMA
@@ -2019,7 +2019,7 @@ extern int ata_pci_bmdma_prepare_host(struct pci_dev *pdev,
 				      struct ata_host **r_host);
 extern int ata_pci_bmdma_init_one(struct pci_dev *pdev,
 				  const struct ata_port_info * const * ppi,
-				  struct scsi_host_template *sht,
+				  const struct scsi_host_template *sht,
 				  void *host_priv, int hflags);
 #endif /* CONFIG_PCI */
 #endif /* CONFIG_ATA_BMDMA */
-- 
cgit v1.2.3


From 80602aca4fcca2b0a90ab39fcce31ec591a3fba0 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 22 Mar 2023 12:54:44 -0700
Subject: scsi: iscsi: Declare SCSI host template const

Make it explicit that the SCSI host template is not modified.

Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20230322195515.1267197-50-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c | 4 ++--
 drivers/scsi/be2iscsi/be_main.c          | 2 +-
 drivers/scsi/bnx2i/bnx2i_iscsi.c         | 4 ++--
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c       | 2 +-
 drivers/scsi/cxgbi/libcxgbi.c            | 2 +-
 drivers/scsi/cxgbi/libcxgbi.h            | 2 +-
 drivers/scsi/iscsi_tcp.c                 | 4 ++--
 drivers/scsi/libiscsi.c                  | 2 +-
 drivers/scsi/qedi/qedi_gbl.h             | 2 +-
 drivers/scsi/qedi/qedi_iscsi.c           | 2 +-
 include/scsi/libiscsi.h                  | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 6b7603765383..bb9aaff92ca3 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -78,7 +78,7 @@ MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
 
-static struct scsi_host_template iscsi_iser_sht;
+static const struct scsi_host_template iscsi_iser_sht;
 static struct iscsi_transport iscsi_iser_transport;
 static struct scsi_transport_template *iscsi_iser_scsi_transport;
 static struct workqueue_struct *release_wq;
@@ -956,7 +956,7 @@ static umode_t iser_attr_is_visible(int param_type, int param)
 	return 0;
 }
 
-static struct scsi_host_template iscsi_iser_sht = {
+static const struct scsi_host_template iscsi_iser_sht = {
 	.module                 = THIS_MODULE,
 	.name                   = "iSCSI Initiator over iSER",
 	.queuecommand           = iscsi_queuecommand,
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 50a577ac3bb4..5d416507947b 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -398,7 +398,7 @@ static const struct pci_device_id beiscsi_pci_id_table[] = {
 MODULE_DEVICE_TABLE(pci, beiscsi_pci_id_table);
 
 
-static struct scsi_host_template beiscsi_sht = {
+static const struct scsi_host_template beiscsi_sht = {
 	.module = THIS_MODULE,
 	.name = "Emulex 10Gbe open-iscsi Initiator Driver",
 	.proc_name = DRV_NAME,
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index a3c800e04a2e..9971f32a663c 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -22,7 +22,7 @@
 
 struct scsi_transport_template *bnx2i_scsi_xport_template;
 struct iscsi_transport bnx2i_iscsi_transport;
-static struct scsi_host_template bnx2i_host_template;
+static const struct scsi_host_template bnx2i_host_template;
 
 /*
  * Global endpoint resource info
@@ -2250,7 +2250,7 @@ static umode_t bnx2i_attr_is_visible(int param_type, int param)
  * 'Scsi_Host_Template' structure and 'iscsi_tranport' structure template
  * used while registering with the scsi host and iSCSI transport module.
  */
-static struct scsi_host_template bnx2i_host_template = {
+static const struct scsi_host_template bnx2i_host_template = {
 	.module			= THIS_MODULE,
 	.name			= "QLogic Offload iSCSI Initiator",
 	.proc_name		= "bnx2i",
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index ff9d4287937a..ec6530240707 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -80,7 +80,7 @@ static struct cxgb3_client t3_client = {
 	.event_handler = cxgb3i_dev_event_handler,
 };
 
-static struct scsi_host_template cxgb3i_host_template = {
+static const struct scsi_host_template cxgb3i_host_template = {
 	.module		= THIS_MODULE,
 	.name		= DRV_MODULE_NAME,
 	.proc_name	= DRV_MODULE_NAME,
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index af281e271f88..eb47c8c96d0e 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -337,7 +337,7 @@ void cxgbi_hbas_remove(struct cxgbi_device *cdev)
 EXPORT_SYMBOL_GPL(cxgbi_hbas_remove);
 
 int cxgbi_hbas_add(struct cxgbi_device *cdev, u64 max_lun,
-		unsigned int max_conns, struct scsi_host_template *sht,
+		unsigned int max_conns, const struct scsi_host_template *sht,
 		struct scsi_transport_template *stt)
 {
 	struct cxgbi_hba *chba;
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
index d8fc7beafa20..d92cf1dccc2f 100644
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -591,7 +591,7 @@ struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *, int *);
 struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *,
 						     int *);
 int cxgbi_hbas_add(struct cxgbi_device *, u64, unsigned int,
-			struct scsi_host_template *,
+			const struct scsi_host_template *,
 			struct scsi_transport_template *);
 void cxgbi_hbas_remove(struct cxgbi_device *);
 
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index c76f82fb8b63..6df2f4041f12 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -47,7 +47,7 @@ MODULE_DESCRIPTION("iSCSI/TCP data-path");
 MODULE_LICENSE("GPL");
 
 static struct scsi_transport_template *iscsi_sw_tcp_scsi_transport;
-static struct scsi_host_template iscsi_sw_tcp_sht;
+static const struct scsi_host_template iscsi_sw_tcp_sht;
 static struct iscsi_transport iscsi_sw_tcp_transport;
 
 static unsigned int iscsi_max_lun = ~0;
@@ -1072,7 +1072,7 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev)
 	return 0;
 }
 
-static struct scsi_host_template iscsi_sw_tcp_sht = {
+static const struct scsi_host_template iscsi_sw_tcp_sht = {
 	.module			= THIS_MODULE,
 	.name			= "iSCSI Initiator over TCP/IP",
 	.queuecommand           = iscsi_queuecommand,
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 127f3d7f19dc..0fda8905eabd 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2895,7 +2895,7 @@ EXPORT_SYMBOL_GPL(iscsi_host_add);
  * This should be called by partial offload and software iscsi drivers.
  * To access the driver specific memory use the iscsi_host_priv() macro.
  */
-struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht,
+struct Scsi_Host *iscsi_host_alloc(const struct scsi_host_template *sht,
 				   int dd_data_size, bool xmit_can_sleep)
 {
 	struct Scsi_Host *shost;
diff --git a/drivers/scsi/qedi/qedi_gbl.h b/drivers/scsi/qedi/qedi_gbl.h
index 72942772b198..0e316cc24b19 100644
--- a/drivers/scsi/qedi/qedi_gbl.h
+++ b/drivers/scsi/qedi/qedi_gbl.h
@@ -17,7 +17,7 @@ extern int qedi_do_not_recover;
 
 extern uint qedi_io_tracing;
 
-extern struct scsi_host_template qedi_host_template;
+extern const struct scsi_host_template qedi_host_template;
 extern struct iscsi_transport qedi_iscsi_transport;
 extern const struct qed_iscsi_ops *qedi_ops;
 extern const struct qedi_debugfs_ops qedi_debugfs_ops[];
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 31ec429104e2..6ed8ef97642c 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -40,7 +40,7 @@ static int qedi_eh_host_reset(struct scsi_cmnd *cmd)
 	return qedi_recover_all_conns(qedi);
 }
 
-struct scsi_host_template qedi_host_template = {
+const struct scsi_host_template qedi_host_template = {
 	.module = THIS_MODULE,
 	.name = "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver",
 	.proc_name = QEDI_MODULE_NAME,
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index e39fb0736ade..7282555adfd5 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -407,7 +407,7 @@ extern int iscsi_host_set_param(struct Scsi_Host *shost,
 extern int iscsi_host_get_param(struct Scsi_Host *shost,
 				enum iscsi_host_param param, char *buf);
 extern int iscsi_host_add(struct Scsi_Host *shost, struct device *pdev);
-extern struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht,
+extern struct Scsi_Host *iscsi_host_alloc(const struct scsi_host_template *sht,
 					  int dd_data_size,
 					  bool xmit_can_sleep);
 extern void iscsi_host_remove(struct Scsi_Host *shost, bool is_shutdown);
-- 
cgit v1.2.3


From 009455205e68ef017a429dc818b92c4a84b867c4 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 24 Mar 2023 10:08:14 +0100
Subject: driver core: bus: move documentation for lock_key to proper location.

In commit 37e98d9bedb5 ("driver core: bus: move lock_class_key into
dynamic structure"), the lock_key variable moved out of struct bus_type
and into struct subsys_private, yet the documentation for it did not
move.  Fix that up and place the documentation comment in the correct
location.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Fixes: 37e98d9bedb5 ("driver core: bus: move lock_class_key into dynamic structure")
Link: https://lore.kernel.org/r/20230324090814.386654-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h        | 1 +
 include/linux/device/bus.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index a2d3a1d0fa9b..d3e081dc4b13 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -33,6 +33,7 @@
  *              avoid namespace conflicts
  * @class - pointer back to the struct class that this structure is associated
  *          with.
+ * @lock_key:	Lock class key for use by the lock validator
  *
  * This structure is the one that is the actual kobject allowing struct
  * bus_type/class to be statically allocated safely.  Nothing outside of the
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 65a5e2c0f04d..ae10c4322754 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -65,7 +65,6 @@ struct fwnode_handle;
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
  *              driver implementations to a bus and allow the driver to do
  *              bus-specific setup
- * @lock_key:	Lock class key for use by the lock validator
  * @need_parent_lock:	When probing or removing a device on this bus, the
  *			device core should lock the device's parent.
  *
-- 
cgit v1.2.3


From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:42 -0700
Subject: bpf: Add a few bpf mem allocator functions

This patch adds a few bpf mem allocator functions which will
be used in the bpf_local_storage in a later patch.

bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When the
flags == GFP_KERNEL, it will fallback to __alloc(..., GFP_KERNEL).
bpf_local_storage knows its running context is sleepable (GFP_KERNEL)
and provides a better guarantee on memory allocation.

bpf_local_storage has some uncommon cases that its selem
cannot be reused immediately. It handles its own
rcu_head and goes through a rcu_trace gp and then free it.
bpf_mem_cache_raw_free() is added for direct free purpose
without leaking the LLIST_NODE_SZ internal knowledge.
During free time, the 'struct bpf_mem_alloc *ma' is no longer
available. However, the caller should know if it is
percpu memory or not and it can call different raw_free functions.
bpf_local_storage does not support percpu value, so only
the non-percpu 'bpf_mem_cache_raw_free()' is added in
this patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  2 ++
 kernel/bpf/memalloc.c         | 59 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index a7104af61ab4..3929be5743f4 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_raw_free(void *ptr);
+void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
 #endif /* _BPF_MEM_ALLOC_H */
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 	return entry;
 }
 
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
 {
-	/* Allocate, but don't deplete atomic reserves that typical
-	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
-	 */
-	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
 	if (c->percpu_size) {
 		void **obj = kmalloc_node(c->percpu_size, flags, node);
 		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 		 */
 		obj = __llist_del_first(&c->free_by_rcu);
 		if (!obj) {
-			obj = __alloc(c, node);
+			/* Allocate, but don't deplete atomic reserves that typical
+			 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+			 * will allocate from the current numa node which is what we
+			 * want here.
+			 */
+			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
 			if (!obj)
 				break;
 		}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+	if (!ptr)
+		return;
+
+	kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+	struct bpf_mem_cache *c;
+	void *ret;
+
+	c = this_cpu_ptr(ma->cache);
+
+	ret = unit_alloc(c);
+	if (!ret && flags == GFP_KERNEL) {
+		struct mem_cgroup *memcg, *old_memcg;
+
+		memcg = get_memcg(c);
+		old_memcg = set_active_memcg(memcg);
+		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+		set_active_memcg(old_memcg);
+		mem_cgroup_put(memcg);
+	}
+
+	return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
-- 
cgit v1.2.3


From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:43 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem

This patch uses bpf_mem_alloc for the task and cgroup local storage that
the bpf prog can easily get a hold of the storage owner's PTR_TO_BTF_ID.
eg. bpf_get_current_task_btf() can be used in some of the kmalloc code
path which will cause deadlock/recursion. bpf_mem_cache_alloc is
deadlock free and will solve a legit use case in [1].

For sk storage, its batch creation benchmark shows a few percent
regression when the sk create/destroy batch size is larger than 32.
The sk creation/destruction happens much more often and
depends on external traffic. Considering it is hypothetical
to be able to cause deadlock with sk storage, it can cross
the bridge to use bpf_mem_alloc till a legit (ie. useful)
use case comes up.

For inode storage, bpf_local_storage_destroy() is called before
waiting for a rcu gp and its memory cannot be reused immediately.
inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp.

A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc().
Only task and cgroup storage have 'bpf_ma == true' which
means to use bpf_mem_cache_alloc/free(). This patch only changes
selem to use bpf_mem_alloc for task and cgroup. The next patch
will change the local_storage to use bpf_mem_alloc also for
task and cgroup.

Here is some more details on the changes:

* memory allocation:
After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because
bpf_mem_cache_alloc() could return a reused selem. It is to keep
the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data
is zero-ed. SDATA(selem)->data is the visible part to the bpf prog.
No need to use zero_map_value() to do the zeroing because
bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using
the selem before returning the selem through bpf_mem_cache_free().
For the internal fields of selem, they will be initialized when
linking to the new smap and the new local_storage.

When 'bpf_ma == false', nothing changes in this patch. It will
stay with the bpf_map_kzalloc().

* memory free:
The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle
the bpf_ma == true case.

For the common selem free path where its owner is also being destroyed,
the mem is freed in bpf_local_storage_destroy(), the owner (task
and cgroup) has gone through a rcu gp. The memory can be reused
immediately, so bpf_local_storage_destroy() will call
bpf_selem_free(..., reuse_now = true) which will do
bpf_mem_cache_free() for immediate reuse consideration.

An exception is the delete elem code path. The delete elem code path
is called from the helper bpf_*_storage_delete() and the syscall
bpf_map_delete_elem(). This path is an unusual case for local
storage because the common use case is to have the local storage
staying with its owner life time so that the bpf prog and the user
space does not have to monitor the owner's destruction. For the delete
elem path, the selem cannot be reused immediately because there could
be bpf prog using it. It will call bpf_selem_free(..., reuse_now = false)
and it will wait for a rcu tasks trace gp before freeing the elem. The
rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree().

When 'bpf_ma == false', it should be the same as before.
__bpf_selem_free() is added to do the kfree_rcu and call_tasks_trace_rcu().
A few words on the 'reuse_now == true'. When 'reuse_now == true',
it is still racing with bpf_local_storage_map_free which is under rcu
protection, so it still needs to wait for a rcu gp instead of kfree().
Otherwise, the selem may be reused by slab for a totally different struct
while the bpf_local_storage_map_free() is still using it (as a
rcu reader). For the inode case, there may be other rcu readers also.
In short, when bpf_ma == false and reuse_now == true => vanilla rcu.

[1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 ++-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 95 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 95 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index a34f61467a2f..30efbcab2798 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/types.h>
+#include <linux/bpf_mem_alloc.h>
 #include <uapi/linux/btf.h>
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
@@ -55,6 +56,8 @@ struct bpf_local_storage_map {
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
+	struct bpf_mem_alloc selem_ma;
+	bool bpf_ma;
 };
 
 struct bpf_local_storage_data {
@@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
 
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache);
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma);
 
 struct bpf_local_storage_data *
 bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index f5b016a5484d..d17d5b694668 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+	return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
 }
 
 static void cgroup_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 9a5f05151898..e17ad581b9be 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 
 static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &inode_cache);
+	return bpf_local_storage_map_alloc(attr, &inode_cache, false);
 }
 
 static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 351d991694cb..309ea727a5cb 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
-	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+		migrate_enable();
+		if (selem)
+			/* Keep the original bpf_map_kzalloc behavior
+			 * before started using the bpf_mem_cache_alloc.
+			 *
+			 * No need to use zero_map_value. The bpf_selem_free()
+			 * only does bpf_mem_cache_free when there is
+			 * no other bpf prog is using the selem.
+			 */
+			memset(SDATA(selem)->data, 0, smap->map.value_size);
+	} else {
+		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+					gfp_flags | __GFP_NOWARN);
+	}
+
 	if (selem) {
 		if (value)
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -124,12 +140,34 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(selem);
+	else
+		kfree_rcu(selem, rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+			     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(selem, rcu);
+	else
+		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	kfree(selem);
+	bpf_mem_cache_raw_free(selem);
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -145,10 +183,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    bool reuse_now)
 {
 	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
+
+	if (!smap->bpf_ma) {
+		__bpf_selem_free(selem, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	} else {
+		/* Instead of using the vanilla call_rcu(),
+		 * bpf_mem_cache_free will be able to reuse selem
+		 * immediately.
+		 */
+		migrate_disable();
+		bpf_mem_cache_free(&smap->selem_ma, selem);
+		migrate_enable();
+	}
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -654,13 +705,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
 	return usage;
 }
 
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache)
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma)
 {
 	struct bpf_local_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
+	int err;
 
 	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
 	if (!smap)
@@ -675,8 +738,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
 					 nbuckets, GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
-		bpf_map_area_free(smap);
-		return ERR_PTR(-ENOMEM);
+		err = -ENOMEM;
+		goto free_smap;
 	}
 
 	for (i = 0; i < nbuckets; i++) {
@@ -687,8 +750,20 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->elem_size = offsetof(struct bpf_local_storage_elem,
 				   sdata.data[attr->value_size]);
 
+	smap->bpf_ma = bpf_ma;
+	if (bpf_ma) {
+		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+		if (err)
+			goto free_smap;
+	}
+
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
+
+free_smap:
+	kvfree(smap->buckets);
+	bpf_map_area_free(smap);
+	return ERR_PTR(err);
 }
 
 void bpf_local_storage_map_free(struct bpf_map *map,
@@ -754,6 +829,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
+	if (smap->bpf_ma)
+		bpf_mem_alloc_destroy(&smap->selem_ma);
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ab5bd1ef58c4..d1af0c8f9ce4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &task_cache);
+	return bpf_local_storage_map_alloc(attr, &task_cache, true);
 }
 
 static void task_storage_map_free(struct bpf_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cb0f5a105b89..085025c7130a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &sk_cache);
+	return bpf_local_storage_map_alloc(attr, &sk_cache, false);
 }
 
 static int notsupp_get_next_key(struct bpf_map *map, void *key,
-- 
cgit v1.2.3


From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:44 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage

This patch uses bpf_mem_cache_alloc/free for allocating and freeing
bpf_local_storage for task and cgroup storage.

The changes are similar to the previous patch. A few things that
worth to mention for bpf_local_storage:

The local_storage is freed when the last selem is deleted.
Before deleting a selem from local_storage, it needs to retrieve the
local_storage->smap because the bpf_selem_unlink_storage_nolock()
may have set it to NULL. Note that local_storage->smap may have
already been NULL when the selem created this local_storage has
been removed. In this case, call_rcu will be used to free the
local_storage.
Also, the bpf_ma (true or false) value is needed before calling
bpf_local_storage_free(). The bpf_ma can either be obtained from
the local_storage->smap (if available) or any of its selem's smap.
A new helper check_storage_bpf_ma() is added to obtain
bpf_ma for a deleting bpf_local_storage.

When bpf_local_storage_alloc getting a reused memory, all
fields are either in the correct values or will be initialized.
'cache[]' must already be all NULLs. 'list' must be empty.
Others will be initialized.

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |   1 +
 kernel/bpf/bpf_local_storage.c    | 130 +++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 30efbcab2798..173ec7f43ed1 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -57,6 +57,7 @@ struct bpf_local_storage_map {
 	u16 elem_size;
 	u16 cache_idx;
 	struct bpf_mem_alloc selem_ma;
+	struct bpf_mem_alloc storage_ma;
 	bool bpf_ma;
 };
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 309ea727a5cb..dab2ff4c99d9 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -111,33 +111,74 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage *local_storage;
+
+	/* If RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree(), else do kfree_rcu().
+	 */
+	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(local_storage);
+	else
+		kfree_rcu(local_storage, rcu);
+}
+
 static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-	kfree(local_storage);
+	bpf_mem_cache_raw_free(local_storage);
 }
 
 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
-	/* If RCU Tasks Trace grace period implies RCU grace period, do
-	 * kfree(), else do kfree_rcu().
-	 */
 	if (rcu_trace_implies_rcu_gp())
 		bpf_local_storage_free_rcu(rcu);
 	else
 		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+				     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(local_storage, rcu);
+	else
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     __bpf_local_storage_free_trace_rcu);
+}
+
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
-				   bool reuse_now)
+				   struct bpf_local_storage_map *smap,
+				   bool bpf_ma, bool reuse_now)
 {
-	if (!reuse_now)
+	if (!bpf_ma) {
+		__bpf_local_storage_free(local_storage, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_trace_rcu);
-	else
+		return;
+	}
+
+	if (smap) {
+		migrate_disable();
+		bpf_mem_cache_free(&smap->storage_ma, local_storage);
+		migrate_enable();
+	} else {
+		/* smap could be NULL if the selem that triggered
+		 * this 'local_storage' creation had been long gone.
+		 * In this case, directly do call_rcu().
+		 */
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+	}
 }
 
 /* rcu tasks trace callback for bpf_ma == false */
@@ -260,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	return free_local_storage;
 }
 
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+				 struct bpf_local_storage_map *storage_smap,
+				 struct bpf_local_storage_elem *selem)
+{
+
+	struct bpf_local_storage_map *selem_smap;
+
+	/* local_storage->smap may be NULL. If it is, get the bpf_ma
+	 * from any selem in the local_storage->list. The bpf_ma of all
+	 * local_storage and selem should have the same value
+	 * for the same map type.
+	 *
+	 * If the local_storage->list is already empty, the caller will not
+	 * care about the bpf_ma value also because the caller is not
+	 * responsibile to free the local_storage.
+	 */
+
+	if (storage_smap)
+		return storage_smap->bpf_ma;
+
+	if (!selem) {
+		struct hlist_node *n;
+
+		n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+					  bpf_rcu_lock_held());
+		if (!n)
+			return false;
+
+		selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+	}
+	selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+	return selem_smap->bpf_ma;
+}
+
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 				     bool reuse_now)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
-	bool free_local_storage = false;
+	bool bpf_ma, free_local_storage = false;
 	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -273,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 
 	local_storage = rcu_dereference_check(selem->local_storage,
 					      bpf_rcu_lock_held());
+	storage_smap = rcu_dereference_check(local_storage->smap,
+					     bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -280,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, reuse_now);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -400,8 +481,15 @@ int bpf_local_storage_alloc(void *owner,
 	if (err)
 		return err;
 
-	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+		migrate_enable();
+	} else {
+		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+					  gfp_flags | __GFP_NOWARN);
+	}
+
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -447,7 +535,7 @@ int bpf_local_storage_alloc(void *owner,
 	return 0;
 
 uncharge:
-	bpf_local_storage_free(storage, true);
+	bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -660,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
-	bool free_storage = false;
+	bool bpf_ma, free_storage = false;
 	struct hlist_node *n;
 	unsigned long flags;
 
+	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
+
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added to or deleted from the
@@ -692,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_storage)
-		bpf_local_storage_free(local_storage, true);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -755,6 +847,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
 		if (err)
 			goto free_smap;
+
+		err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+		if (err) {
+			bpf_mem_alloc_destroy(&smap->selem_ma);
+			goto free_smap;
+		}
 	}
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
@@ -829,8 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
-	if (smap->bpf_ma)
+	if (smap->bpf_ma) {
 		bpf_mem_alloc_destroy(&smap->selem_ma);
+		bpf_mem_alloc_destroy(&smap->storage_ma);
+	}
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
-- 
cgit v1.2.3


From 46c30cb8f5393586c6ebc7b53a235c85bfac1de8 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@kernel.org>
Date: Sun, 18 Dec 2022 18:03:54 +0000
Subject: 9p: Add additional debug flags and open modes

Add some additional debug flags to assist with debugging
cache changes.  Also add some additional open modes so we
can track cache state in fids more directly.

Signed-off-by: Eric Van Hensbergen <ericvh@kernel.org>
---
 include/net/9p/9p.h | 6 ++++++
 net/9p/client.c     | 8 ++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 429adf6be29c..60cad0d200a4 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -42,6 +42,8 @@ enum p9_debug_flags {
 	P9_DEBUG_PKT =		(1<<10),
 	P9_DEBUG_FSC =		(1<<11),
 	P9_DEBUG_VPKT =		(1<<12),
+	P9_DEBUG_CACHE =	(1<<13),
+	P9_DEBUG_MMAP =		(1<<14),
 };
 
 #ifdef CONFIG_NET_9P_DEBUG
@@ -213,6 +215,10 @@ enum p9_open_mode_t {
 	P9_ORCLOSE = 0x40,
 	P9_OAPPEND = 0x80,
 	P9_OEXCL = 0x1000,
+	P9L_MODE_MASK = 0x1FFF, /* don't send anything under this to server */
+	P9L_DIRECT = 0x2000, /* cache disabled */
+	P9L_NOWRITECACHE = 0x4000, /* no write caching  */
+	P9L_LOOSE = 0x8000, /* loose cache */
 };
 
 /**
diff --git a/net/9p/client.c b/net/9p/client.c
index 2adcb5e7b0e2..a3340268ec8d 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1230,9 +1230,9 @@ int p9_client_open(struct p9_fid *fid, int mode)
 		return -EINVAL;
 
 	if (p9_is_proto_dotl(clnt))
-		req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode);
+		req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK);
 	else
-		req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
+		req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto error;
@@ -1277,7 +1277,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags,
 		return -EINVAL;
 
 	req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags,
-			    mode, gid);
+			    mode & P9L_MODE_MASK, gid);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto error;
@@ -1321,7 +1321,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
 		return -EINVAL;
 
 	req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
-			    mode, extension);
+			    mode & P9L_MODE_MASK, extension);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto error;
-- 
cgit v1.2.3


From a90ac762d345890b40d88a1385a34a2449c2d75e Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 24 Mar 2023 09:23:42 +0000
Subject: net: sfp: make sfp_bus_find_fwnode() take a const fwnode

sfp_bus_find_fwnode() does not write to the fwnode, so let's make it
const.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 2 +-
 include/linux/sfp.h       | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index daac293e8ede..8284f29b3644 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -593,7 +593,7 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
  *	- %-ENOMEM if we failed to allocate the bus.
  *	- an error from the upstream's connect_phy() method.
  */
-struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)
+struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode)
 {
 	struct fwnode_reference_args ref;
 	struct sfp_bus *bus;
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 52b98f9666a2..ef06a195b3c2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -557,7 +557,7 @@ int sfp_get_module_eeprom_by_page(struct sfp_bus *bus,
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
 void sfp_bus_put(struct sfp_bus *bus);
-struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode);
+struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
@@ -619,7 +619,8 @@ static inline void sfp_bus_put(struct sfp_bus *bus)
 {
 }
 
-static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)
+static inline struct sfp_bus *
+sfp_bus_find_fwnode(const struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 4a0faa02d419a6728abef0f1d8a32d8c35ef95e6 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 24 Mar 2023 09:23:53 +0000
Subject: net: phy: constify fwnode_get_phy_node() fwnode argument

fwnode_get_phy_node() does not motify the fwnode structure, so make
the argument const,

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 2 +-
 include/linux/phy.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1785f1cead97..1de3e339b31a 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3057,7 +3057,7 @@ EXPORT_SYMBOL_GPL(device_phy_find_device);
  * and "phy-device" are not supported in ACPI. DT supports all the three
  * named references to the phy node.
  */
-struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)
+struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode)
 {
 	struct fwnode_handle *phy_node;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 36bf0bbc8efa..db7c0bd67559 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1547,7 +1547,7 @@ int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id);
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
 struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode);
 struct phy_device *device_phy_find_device(struct device *dev);
-struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode);
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
 void phy_device_free(struct phy_device *phydev);
-- 
cgit v1.2.3


From 812b0597fb4043240724e4c7bed7ba1fe15c0e3f Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Sun, 26 Mar 2023 06:52:01 -0700
Subject: x86/hyperv: Change vTOM handling to use standard coco mechanisms

Hyper-V guests on AMD SEV-SNP hardware have the option of using the
"virtual Top Of Memory" (vTOM) feature specified by the SEV-SNP
architecture. With vTOM, shared vs. private memory accesses are
controlled by splitting the guest physical address space into two
halves.

vTOM is the dividing line where the uppermost bit of the physical
address space is set; e.g., with 47 bits of guest physical address
space, vTOM is 0x400000000000 (bit 46 is set).  Guest physical memory is
accessible at two parallel physical addresses -- one below vTOM and one
above vTOM.  Accesses below vTOM are private (encrypted) while accesses
above vTOM are shared (decrypted). In this sense, vTOM is like the
GPA.SHARED bit in Intel TDX.

Support for Hyper-V guests using vTOM was added to the Linux kernel in
two patch sets[1][2]. This support treats the vTOM bit as part of
the physical address. For accessing shared (decrypted) memory, these
patch sets create a second kernel virtual mapping that maps to physical
addresses above vTOM.

A better approach is to treat the vTOM bit as a protection flag, not
as part of the physical address. This new approach is like the approach
for the GPA.SHARED bit in Intel TDX. Rather than creating a second kernel
virtual mapping, the existing mapping is updated using recently added
coco mechanisms.

When memory is changed between private and shared using
set_memory_decrypted() and set_memory_encrypted(), the PTEs for the
existing kernel mapping are changed to add or remove the vTOM bit in the
guest physical address, just as with TDX. The hypercalls to change the
memory status on the host side are made using the existing callback
mechanism. Everything just works, with a minor tweak to map the IO-APIC
to use private accesses.

To accomplish the switch in approach, the following must be done:

* Update Hyper-V initialization to set the cc_mask based on vTOM
  and do other coco initialization.

* Update physical_mask so the vTOM bit is no longer treated as part
  of the physical address

* Remove CC_VENDOR_HYPERV and merge the associated vTOM functionality
  under CC_VENDOR_AMD. Update cc_mkenc() and cc_mkdec() to set/clear
  the vTOM bit as a protection flag.

* Code already exists to make hypercalls to inform Hyper-V about pages
  changing between shared and private.  Update this code to run as a
  callback from __set_memory_enc_pgtable().

* Remove the Hyper-V special case from __set_memory_enc_dec()

* Remove the Hyper-V specific call to swiotlb_update_mem_attributes()
  since mem_encrypt_init() will now do it.

* Add a Hyper-V specific implementation of the is_private_mmio()
  callback that returns true for the IO-APIC and vTPM MMIO addresses

  [1] https://lore.kernel.org/all/20211025122116.264793-1-ltykernel@gmail.com/
  [2] https://lore.kernel.org/all/20211213071407.314309-1-ltykernel@gmail.com/

  [ bp: Touchups. ]

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/1679838727-87310-7-git-send-email-mikelley@microsoft.com
---
 arch/x86/coco/core.c               | 40 +++++++++++++++------
 arch/x86/hyperv/hv_init.c          | 11 ------
 arch/x86/hyperv/ivm.c              | 72 ++++++++++++++++++++++++++++++++------
 arch/x86/include/asm/coco.h        |  1 -
 arch/x86/include/asm/mem_encrypt.h |  1 +
 arch/x86/include/asm/mshyperv.h    | 16 +++++----
 arch/x86/kernel/cpu/mshyperv.c     | 15 ++++----
 arch/x86/mm/pat/set_memory.c       |  3 --
 drivers/hv/vmbus_drv.c             |  1 -
 include/asm-generic/mshyperv.h     |  2 ++
 10 files changed, 111 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 49b44f881484..f4f0625691fd 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -29,6 +29,22 @@ static bool intel_cc_platform_has(enum cc_attr attr)
 	}
 }
 
+/*
+ * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and
+ * the other levels of SME/SEV functionality, including C-bit
+ * based SEV-SNP, are not enabled.
+ */
+static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr)
+{
+	switch (attr) {
+	case CC_ATTR_GUEST_MEM_ENCRYPT:
+	case CC_ATTR_MEM_ENCRYPT:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*
  * SME and SEV are very similar but they are not the same, so there are
  * times that the kernel will need to distinguish between SME and SEV. The
@@ -41,9 +57,14 @@ static bool intel_cc_platform_has(enum cc_attr attr)
  * up under SME the trampoline area cannot be encrypted, whereas under SEV
  * the trampoline area must be encrypted.
  */
+
 static bool amd_cc_platform_has(enum cc_attr attr)
 {
 #ifdef CONFIG_AMD_MEM_ENCRYPT
+
+	if (sev_status & MSR_AMD64_SNP_VTOM)
+		return amd_cc_platform_vtom(attr);
+
 	switch (attr) {
 	case CC_ATTR_MEM_ENCRYPT:
 		return sme_me_mask;
@@ -76,11 +97,6 @@ static bool amd_cc_platform_has(enum cc_attr attr)
 #endif
 }
 
-static bool hyperv_cc_platform_has(enum cc_attr attr)
-{
-	return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
-}
-
 bool cc_platform_has(enum cc_attr attr)
 {
 	switch (vendor) {
@@ -88,8 +104,6 @@ bool cc_platform_has(enum cc_attr attr)
 		return amd_cc_platform_has(attr);
 	case CC_VENDOR_INTEL:
 		return intel_cc_platform_has(attr);
-	case CC_VENDOR_HYPERV:
-		return hyperv_cc_platform_has(attr);
 	default:
 		return false;
 	}
@@ -103,11 +117,14 @@ u64 cc_mkenc(u64 val)
 	 * encryption status of the page.
 	 *
 	 * - for AMD, bit *set* means the page is encrypted
-	 * - for Intel *clear* means encrypted.
+	 * - for AMD with vTOM and for Intel, *clear* means encrypted
 	 */
 	switch (vendor) {
 	case CC_VENDOR_AMD:
-		return val | cc_mask;
+		if (sev_status & MSR_AMD64_SNP_VTOM)
+			return val & ~cc_mask;
+		else
+			return val | cc_mask;
 	case CC_VENDOR_INTEL:
 		return val & ~cc_mask;
 	default:
@@ -120,7 +137,10 @@ u64 cc_mkdec(u64 val)
 	/* See comment in cc_mkenc() */
 	switch (vendor) {
 	case CC_VENDOR_AMD:
-		return val & ~cc_mask;
+		if (sev_status & MSR_AMD64_SNP_VTOM)
+			return val | cc_mask;
+		else
+			return val & ~cc_mask;
 	case CC_VENDOR_INTEL:
 		return val | cc_mask;
 	default:
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 41ef036ebb7b..edbc67ec1f3e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -29,7 +29,6 @@
 #include <linux/syscore_ops.h>
 #include <clocksource/hyperv_timer.h>
 #include <linux/highmem.h>
-#include <linux/swiotlb.h>
 
 int hyperv_init_cpuhp;
 u64 hv_current_partition_id = ~0ull;
@@ -504,16 +503,6 @@ void __init hyperv_init(void)
 	/* Query the VMs extended capability once, so that it can be cached. */
 	hv_query_ext_cap(0);
 
-#ifdef CONFIG_SWIOTLB
-	/*
-	 * Swiotlb bounce buffer needs to be mapped in extra address
-	 * space. Map function doesn't work in the early place and so
-	 * call swiotlb_update_mem_attributes() here.
-	 */
-	if (hv_is_isolation_supported())
-		swiotlb_update_mem_attributes();
-#endif
-
 	return;
 
 clean_guest_os_id:
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 5648efb6c73e..f6a020cb1a24 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -13,6 +13,8 @@
 #include <asm/svm.h>
 #include <asm/sev.h>
 #include <asm/io.h>
+#include <asm/coco.h>
+#include <asm/mem_encrypt.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
@@ -233,7 +235,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
-#endif
 
 /*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -286,27 +287,25 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 }
 
 /*
- * hv_set_mem_host_visibility - Set specified memory visible to host.
+ * hv_vtom_set_host_visibility - Set specified memory visible to host.
  *
  * In Isolation VM, all guest memory is encrypted from host and guest
  * needs to set memory visible to host via hvcall before sharing memory
  * with host. This function works as wrap of hv_mark_gpa_visibility()
  * with memory base and size.
  */
-int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible)
+static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
 {
-	enum hv_mem_host_visibility visibility = visible ?
-			VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE;
+	enum hv_mem_host_visibility visibility = enc ?
+			VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
 	u64 *pfn_array;
 	int ret = 0;
+	bool result = true;
 	int i, pfn;
 
-	if (!hv_is_isolation_supported() || !hv_hypercall_pg)
-		return 0;
-
 	pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
 	if (!pfn_array)
-		return -ENOMEM;
+		return false;
 
 	for (i = 0, pfn = 0; i < pagecount; i++) {
 		pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
@@ -315,17 +314,68 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl
 		if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
 			ret = hv_mark_gpa_visibility(pfn, pfn_array,
 						     visibility);
-			if (ret)
+			if (ret) {
+				result = false;
 				goto err_free_pfn_array;
+			}
 			pfn = 0;
 		}
 	}
 
  err_free_pfn_array:
 	kfree(pfn_array);
-	return ret;
+	return result;
 }
 
+static bool hv_vtom_tlb_flush_required(bool private)
+{
+	return true;
+}
+
+static bool hv_vtom_cache_flush_required(void)
+{
+	return false;
+}
+
+static bool hv_is_private_mmio(u64 addr)
+{
+	/*
+	 * Hyper-V always provides a single IO-APIC in a guest VM.
+	 * When a paravisor is used, it is emulated by the paravisor
+	 * in the guest context and must be mapped private.
+	 */
+	if (addr >= HV_IOAPIC_BASE_ADDRESS &&
+	    addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE))
+		return true;
+
+	/* Same with a vTPM */
+	if (addr >= VTPM_BASE_ADDRESS &&
+	    addr < (VTPM_BASE_ADDRESS + PAGE_SIZE))
+		return true;
+
+	return false;
+}
+
+void __init hv_vtom_init(void)
+{
+	/*
+	 * By design, a VM using vTOM doesn't see the SEV setting,
+	 * so SEV initialization is bypassed and sev_status isn't set.
+	 * Set it here to indicate a vTOM VM.
+	 */
+	sev_status = MSR_AMD64_SNP_VTOM;
+	cc_set_vendor(CC_VENDOR_AMD);
+	cc_set_mask(ms_hyperv.shared_gpa_boundary);
+	physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
+
+	x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
+	x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
+	x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
+	x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+}
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
 /*
  * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
  */
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 3d98c3a60d34..d2c6a2e8d04d 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -7,7 +7,6 @@
 enum cc_vendor {
 	CC_VENDOR_NONE,
 	CC_VENDOR_AMD,
-	CC_VENDOR_HYPERV,
 	CC_VENDOR_INTEL,
 };
 
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 72ca90552b6a..b7126701574c 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -56,6 +56,7 @@ void __init sev_es_init_vc_handling(void);
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask	0ULL
+#define sev_status	0ULL
 
 static inline void __init sme_early_encrypt(resource_size_t paddr,
 					    unsigned long size) { }
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 4c4c0ec3b62e..e3cef98a0142 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,14 @@
 #include <asm/paravirt.h>
 #include <asm/mshyperv.h>
 
+/*
+ * Hyper-V always provides a single IO-APIC at this MMIO address.
+ * Ideally, the value should be looked up in ACPI tables, but it
+ * is needed for mapping the IO-APIC early in boot on Confidential
+ * VMs, before ACPI functions can be used.
+ */
+#define HV_IOAPIC_BASE_ADDRESS 0xfec00000
+
 union hv_ghcb;
 
 DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
@@ -206,18 +214,19 @@ struct irq_domain *hv_create_pci_msi_domain(void);
 int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
 		struct hv_interrupt_entry *entry);
 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
-int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 void hv_ghcb_msr_write(u64 msr, u64 value);
 void hv_ghcb_msr_read(u64 msr, u64 *value);
 bool hv_ghcb_negotiate_protocol(void);
 void hv_ghcb_terminate(unsigned int set, unsigned int reason);
+void hv_vtom_init(void);
 #else
 static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
 static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
 static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
 static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
+static inline void hv_vtom_init(void) {}
 #endif
 
 extern bool hv_isolation_type_snp(void);
@@ -259,11 +268,6 @@ static inline void hv_set_register(unsigned int reg, u64 value) { }
 static inline u64 hv_get_register(unsigned int reg) { return 0; }
 static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { }
 static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
-static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
-					     bool visible)
-{
-	return -1;
-}
 #endif /* CONFIG_HYPERV */
 
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f36dc2f796c5..ded7506217f2 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -33,7 +33,6 @@
 #include <asm/nmi.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/numa.h>
-#include <asm/coco.h>
 
 /* Is Linux running as the root partition? */
 bool hv_root_partition;
@@ -397,8 +396,10 @@ static void __init ms_hyperv_init_platform(void)
 	if (ms_hyperv.priv_high & HV_ISOLATION) {
 		ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
 		ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
-		ms_hyperv.shared_gpa_boundary =
-			BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+		if (ms_hyperv.shared_gpa_boundary_active)
+			ms_hyperv.shared_gpa_boundary =
+				BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
 
 		pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
 			ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
@@ -409,11 +410,6 @@ static void __init ms_hyperv_init_platform(void)
 			swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
 #endif
 		}
-		/* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
-		if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
-			if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
-				cc_set_vendor(CC_VENDOR_HYPERV);
-		}
 	}
 
 	if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -482,6 +478,9 @@ static void __init ms_hyperv_init_platform(void)
 	i8253_clear_counter_on_shutdown = false;
 
 #if IS_ENABLED(CONFIG_HYPERV)
+	if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+	    (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+		hv_vtom_init();
 	/*
 	 * Setup the hook to get control post apic initialization.
 	 */
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 356758b7d4b4..b037954e0f61 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2175,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
 
 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 {
-	if (hv_is_isolation_supported())
-		return hv_set_mem_host_visibility(addr, numpages, !enc);
-
 	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
 		return __set_memory_enc_pgtable(addr, numpages, enc);
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index d24dd65b33d4..e9e1c4139e0d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2156,7 +2156,6 @@ void vmbus_device_unregister(struct hv_device *device_obj)
  * VMBUS is an acpi enumerated device. Get the information we
  * need from DSDT.
  */
-#define VTPM_BASE_ADDRESS 0xfed40000
 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 {
 	resource_size_t start = 0;
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 8845a2eca339..90d7f68ed39d 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -26,6 +26,8 @@
 #include <asm/ptrace.h>
 #include <asm/hyperv-tlfs.h>
 
+#define VTPM_BASE_ADDRESS 0xfed40000
+
 struct ms_hyperv_info {
 	u32 features;
 	u32 priv_high;
-- 
cgit v1.2.3


From 7755cec63adeecea3cbbf4032047812c37cf7cc3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 17 Mar 2023 15:50:20 -0400
Subject: arm64: perf: Move PMUv3 driver to drivers/perf

Having the ARM PMUv3 driver sitting in arch/arm64/kernel is getting
in the way of being able to use perf on ARMv8 cores running a 32bit
kernel, such as 32bit KVM guests.

This patch moves it into drivers/perf/arm_pmuv3.c, with an include
file in include/linux/perf/arm_pmuv3.h. The only thing left in
arch/arm64 is some mundane perf stuff.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Zaid Al-Bassam <zalbassam@google.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20230317195027.3746949-2-zalbassam@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/perf_event.h |  249 ------
 arch/arm64/kernel/Makefile          |    1 -
 arch/arm64/kernel/perf_event.c      | 1467 ----------------------------------
 drivers/perf/Kconfig                |   11 +
 drivers/perf/Makefile               |    1 +
 drivers/perf/arm_pmuv3.c            | 1468 +++++++++++++++++++++++++++++++++++
 include/kvm/arm_pmu.h               |    2 +-
 include/linux/perf/arm_pmuv3.h      |  258 ++++++
 8 files changed, 1739 insertions(+), 1718 deletions(-)
 delete mode 100644 arch/arm64/kernel/perf_event.c
 create mode 100644 drivers/perf/arm_pmuv3.c
 create mode 100644 include/linux/perf/arm_pmuv3.h

(limited to 'include')

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 3eaf462f5752..eb7071c9eb34 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -9,255 +9,6 @@
 #include <asm/stack_pointer.h>
 #include <asm/ptrace.h>
 
-#define	ARMV8_PMU_MAX_COUNTERS	32
-#define	ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
-
-/*
- * Common architectural and microarchitectural event numbers.
- */
-#define ARMV8_PMUV3_PERFCTR_SW_INCR				0x0000
-#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL			0x0001
-#define ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL			0x0002
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL			0x0003
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE				0x0004
-#define ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL			0x0005
-#define ARMV8_PMUV3_PERFCTR_LD_RETIRED				0x0006
-#define ARMV8_PMUV3_PERFCTR_ST_RETIRED				0x0007
-#define ARMV8_PMUV3_PERFCTR_INST_RETIRED			0x0008
-#define ARMV8_PMUV3_PERFCTR_EXC_TAKEN				0x0009
-#define ARMV8_PMUV3_PERFCTR_EXC_RETURN				0x000A
-#define ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED			0x000B
-#define ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED			0x000C
-#define ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED			0x000D
-#define ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED			0x000E
-#define ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED		0x000F
-#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED				0x0010
-#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES				0x0011
-#define ARMV8_PMUV3_PERFCTR_BR_PRED				0x0012
-#define ARMV8_PMUV3_PERFCTR_MEM_ACCESS				0x0013
-#define ARMV8_PMUV3_PERFCTR_L1I_CACHE				0x0014
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB			0x0015
-#define ARMV8_PMUV3_PERFCTR_L2D_CACHE				0x0016
-#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL			0x0017
-#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB			0x0018
-#define ARMV8_PMUV3_PERFCTR_BUS_ACCESS				0x0019
-#define ARMV8_PMUV3_PERFCTR_MEMORY_ERROR			0x001A
-#define ARMV8_PMUV3_PERFCTR_INST_SPEC				0x001B
-#define ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED			0x001C
-#define ARMV8_PMUV3_PERFCTR_BUS_CYCLES				0x001D
-#define ARMV8_PMUV3_PERFCTR_CHAIN				0x001E
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE			0x001F
-#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE			0x0020
-#define ARMV8_PMUV3_PERFCTR_BR_RETIRED				0x0021
-#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED			0x0022
-#define ARMV8_PMUV3_PERFCTR_STALL_FRONTEND			0x0023
-#define ARMV8_PMUV3_PERFCTR_STALL_BACKEND			0x0024
-#define ARMV8_PMUV3_PERFCTR_L1D_TLB				0x0025
-#define ARMV8_PMUV3_PERFCTR_L1I_TLB				0x0026
-#define ARMV8_PMUV3_PERFCTR_L2I_CACHE				0x0027
-#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL			0x0028
-#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE			0x0029
-#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL			0x002A
-#define ARMV8_PMUV3_PERFCTR_L3D_CACHE				0x002B
-#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB			0x002C
-#define ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL			0x002D
-#define ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL			0x002E
-#define ARMV8_PMUV3_PERFCTR_L2D_TLB				0x002F
-#define ARMV8_PMUV3_PERFCTR_L2I_TLB				0x0030
-#define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS			0x0031
-#define ARMV8_PMUV3_PERFCTR_LL_CACHE				0x0032
-#define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS			0x0033
-#define ARMV8_PMUV3_PERFCTR_DTLB_WALK				0x0034
-#define ARMV8_PMUV3_PERFCTR_ITLB_WALK				0x0035
-#define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD				0x0036
-#define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD			0x0037
-#define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD			0x0038
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD			0x0039
-#define ARMV8_PMUV3_PERFCTR_OP_RETIRED				0x003A
-#define ARMV8_PMUV3_PERFCTR_OP_SPEC				0x003B
-#define ARMV8_PMUV3_PERFCTR_STALL				0x003C
-#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND			0x003D
-#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND			0x003E
-#define ARMV8_PMUV3_PERFCTR_STALL_SLOT				0x003F
-
-/* Statistical profiling extension microarchitectural events */
-#define	ARMV8_SPE_PERFCTR_SAMPLE_POP				0x4000
-#define	ARMV8_SPE_PERFCTR_SAMPLE_FEED				0x4001
-#define	ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE			0x4002
-#define	ARMV8_SPE_PERFCTR_SAMPLE_COLLISION			0x4003
-
-/* AMUv1 architecture events */
-#define	ARMV8_AMU_PERFCTR_CNT_CYCLES				0x4004
-#define	ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM			0x4005
-
-/* long-latency read miss events */
-#define	ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS			0x4006
-#define	ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD			0x4009
-#define	ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS			0x400A
-#define	ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD			0x400B
-
-/* Trace buffer events */
-#define ARMV8_PMUV3_PERFCTR_TRB_WRAP				0x400C
-#define ARMV8_PMUV3_PERFCTR_TRB_TRIG				0x400E
-
-/* Trace unit events */
-#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT0				0x4010
-#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT1				0x4011
-#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT2				0x4012
-#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT3				0x4013
-#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4			0x4018
-#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5			0x4019
-#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6			0x401A
-#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7			0x401B
-
-/* additional latency from alignment events */
-#define	ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT			0x4020
-#define	ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT			0x4021
-#define	ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT			0x4022
-
-/* Armv8.5 Memory Tagging Extension events */
-#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED			0x4024
-#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD			0x4025
-#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR			0x4026
-
-/* ARMv8 recommended implementation defined event types */
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD			0x0040
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR			0x0041
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD		0x0042
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR		0x0043
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_INNER		0x0044
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_OUTER		0x0045
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_VICTIM		0x0046
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_CLEAN			0x0047
-#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_INVAL			0x0048
-
-#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD			0x004C
-#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR			0x004D
-#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD				0x004E
-#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR				0x004F
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_RD			0x0050
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WR			0x0051
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_RD		0x0052
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_WR		0x0053
-
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_VICTIM		0x0056
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_CLEAN			0x0057
-#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_INVAL			0x0058
-
-#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_RD			0x005C
-#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_WR			0x005D
-#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_RD				0x005E
-#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_WR				0x005F
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD			0x0060
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR			0x0061
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_SHARED			0x0062
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NOT_SHARED		0x0063
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NORMAL			0x0064
-#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_PERIPH			0x0065
-#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_RD			0x0066
-#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_WR			0x0067
-#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LD_SPEC			0x0068
-#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_ST_SPEC			0x0069
-#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LDST_SPEC		0x006A
-
-#define ARMV8_IMPDEF_PERFCTR_LDREX_SPEC				0x006C
-#define ARMV8_IMPDEF_PERFCTR_STREX_PASS_SPEC			0x006D
-#define ARMV8_IMPDEF_PERFCTR_STREX_FAIL_SPEC			0x006E
-#define ARMV8_IMPDEF_PERFCTR_STREX_SPEC				0x006F
-#define ARMV8_IMPDEF_PERFCTR_LD_SPEC				0x0070
-#define ARMV8_IMPDEF_PERFCTR_ST_SPEC				0x0071
-#define ARMV8_IMPDEF_PERFCTR_LDST_SPEC				0x0072
-#define ARMV8_IMPDEF_PERFCTR_DP_SPEC				0x0073
-#define ARMV8_IMPDEF_PERFCTR_ASE_SPEC				0x0074
-#define ARMV8_IMPDEF_PERFCTR_VFP_SPEC				0x0075
-#define ARMV8_IMPDEF_PERFCTR_PC_WRITE_SPEC			0x0076
-#define ARMV8_IMPDEF_PERFCTR_CRYPTO_SPEC			0x0077
-#define ARMV8_IMPDEF_PERFCTR_BR_IMMED_SPEC			0x0078
-#define ARMV8_IMPDEF_PERFCTR_BR_RETURN_SPEC			0x0079
-#define ARMV8_IMPDEF_PERFCTR_BR_INDIRECT_SPEC			0x007A
-
-#define ARMV8_IMPDEF_PERFCTR_ISB_SPEC				0x007C
-#define ARMV8_IMPDEF_PERFCTR_DSB_SPEC				0x007D
-#define ARMV8_IMPDEF_PERFCTR_DMB_SPEC				0x007E
-
-#define ARMV8_IMPDEF_PERFCTR_EXC_UNDEF				0x0081
-#define ARMV8_IMPDEF_PERFCTR_EXC_SVC				0x0082
-#define ARMV8_IMPDEF_PERFCTR_EXC_PABORT				0x0083
-#define ARMV8_IMPDEF_PERFCTR_EXC_DABORT				0x0084
-
-#define ARMV8_IMPDEF_PERFCTR_EXC_IRQ				0x0086
-#define ARMV8_IMPDEF_PERFCTR_EXC_FIQ				0x0087
-#define ARMV8_IMPDEF_PERFCTR_EXC_SMC				0x0088
-
-#define ARMV8_IMPDEF_PERFCTR_EXC_HVC				0x008A
-#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_PABORT			0x008B
-#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_DABORT			0x008C
-#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_OTHER			0x008D
-#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_IRQ			0x008E
-#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_FIQ			0x008F
-#define ARMV8_IMPDEF_PERFCTR_RC_LD_SPEC				0x0090
-#define ARMV8_IMPDEF_PERFCTR_RC_ST_SPEC				0x0091
-
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_RD			0x00A0
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WR			0x00A1
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_RD		0x00A2
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_WR		0x00A3
-
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_VICTIM		0x00A6
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_CLEAN			0x00A7
-#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_INVAL			0x00A8
-
-/*
- * Per-CPU PMCR: config reg
- */
-#define ARMV8_PMU_PMCR_E	(1 << 0) /* Enable all counters */
-#define ARMV8_PMU_PMCR_P	(1 << 1) /* Reset all counters */
-#define ARMV8_PMU_PMCR_C	(1 << 2) /* Cycle counter reset */
-#define ARMV8_PMU_PMCR_D	(1 << 3) /* CCNT counts every 64th cpu cycle */
-#define ARMV8_PMU_PMCR_X	(1 << 4) /* Export to ETM */
-#define ARMV8_PMU_PMCR_DP	(1 << 5) /* Disable CCNT if non-invasive debug*/
-#define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
-#define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
-#define	ARMV8_PMU_PMCR_N_SHIFT	11	 /* Number of counters supported */
-#define	ARMV8_PMU_PMCR_N_MASK	0x1f
-#define	ARMV8_PMU_PMCR_MASK	0xff	 /* Mask for writable bits */
-
-/*
- * PMOVSR: counters overflow flag status reg
- */
-#define	ARMV8_PMU_OVSR_MASK		0xffffffff	/* Mask for writable bits */
-#define	ARMV8_PMU_OVERFLOWED_MASK	ARMV8_PMU_OVSR_MASK
-
-/*
- * PMXEVTYPER: Event selection reg
- */
-#define	ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
-#define	ARMV8_PMU_EVTYPE_EVENT	0xffff		/* Mask for EVENT bits */
-
-/*
- * Event filters for PMUv3
- */
-#define	ARMV8_PMU_EXCLUDE_EL1	(1U << 31)
-#define	ARMV8_PMU_EXCLUDE_EL0	(1U << 30)
-#define	ARMV8_PMU_INCLUDE_EL2	(1U << 27)
-
-/*
- * PMUSERENR: user enable reg
- */
-#define ARMV8_PMU_USERENR_MASK	0xf		/* Mask for writable bits */
-#define ARMV8_PMU_USERENR_EN	(1 << 0) /* PMU regs can be accessed at EL0 */
-#define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
-#define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
-#define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
-
-/* PMMIR_EL1.SLOTS mask */
-#define ARMV8_PMU_SLOTS_MASK	0xff
-
-#define ARMV8_PMU_BUS_SLOTS_SHIFT 8
-#define ARMV8_PMU_BUS_SLOTS_MASK 0xff
-#define ARMV8_PMU_BUS_WIDTH_SHIFT 16
-#define ARMV8_PMU_BUS_WIDTH_MASK 0xf
-
 #ifdef CONFIG_PERF_EVENTS
 struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ceba6792f5b3..7c2bb4e72476 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_FUNCTION_TRACER)		+= ftrace.o entry-ftrace.o
 obj-$(CONFIG_MODULES)			+= module.o
 obj-$(CONFIG_ARM64_MODULE_PLTS)		+= module-plts.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o perf_callchain.o
-obj-$(CONFIG_HW_PERF_EVENTS)		+= perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_CPU_PM)			+= sleep.o suspend.o
 obj-$(CONFIG_CPU_IDLE)			+= cpuidle.o
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
deleted file mode 100644
index dde06c0f97f3..000000000000
--- a/arch/arm64/kernel/perf_event.c
+++ /dev/null
@@ -1,1467 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * ARMv8 PMUv3 Performance Events handling code.
- *
- * Copyright (C) 2012 ARM Limited
- * Author: Will Deacon <will.deacon@arm.com>
- *
- * This code is based heavily on the ARMv7 perf event code.
- */
-
-#include <asm/irq_regs.h>
-#include <asm/perf_event.h>
-#include <asm/sysreg.h>
-#include <asm/virt.h>
-
-#include <clocksource/arm_arch_timer.h>
-
-#include <linux/acpi.h>
-#include <linux/clocksource.h>
-#include <linux/kvm_host.h>
-#include <linux/of.h>
-#include <linux/perf/arm_pmu.h>
-#include <linux/platform_device.h>
-#include <linux/sched_clock.h>
-#include <linux/smp.h>
-
-/* ARMv8 Cortex-A53 specific event types. */
-#define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
-
-/* ARMv8 Cavium ThunderX specific event types. */
-#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST			0xE9
-#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS		0xEA
-#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS		0xEB
-#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS		0xEC
-#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS		0xED
-
-/*
- * ARMv8 Architectural defined events, not all of these may
- * be supported on any given implementation. Unsupported events will
- * be disabled at run-time based on the PMCEID registers.
- */
-static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = {
-	PERF_MAP_ALL_UNSUPPORTED,
-	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV8_PMUV3_PERFCTR_CPU_CYCLES,
-	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV8_PMUV3_PERFCTR_INST_RETIRED,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE,
-	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED,
-	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
-	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV8_PMUV3_PERFCTR_BUS_CYCLES,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV8_PMUV3_PERFCTR_STALL_FRONTEND,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV8_PMUV3_PERFCTR_STALL_BACKEND,
-};
-
-static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-						[PERF_COUNT_HW_CACHE_OP_MAX]
-						[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE,
-	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
-
-	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE,
-	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL,
-
-	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL,
-	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1D_TLB,
-
-	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL,
-	[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB,
-
-	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD,
-	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_LL_CACHE_RD,
-
-	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_BR_PRED,
-	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
-};
-
-static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-					      [PERF_COUNT_HW_CACHE_OP_MAX]
-					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL,
-
-	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
-	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
-};
-
-static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-					      [PERF_COUNT_HW_CACHE_OP_MAX]
-					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
-	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
-
-	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
-	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
-
-	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
-	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
-};
-
-static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-					      [PERF_COUNT_HW_CACHE_OP_MAX]
-					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
-};
-
-static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-						   [PERF_COUNT_HW_CACHE_OP_MAX]
-						   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
-	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST,
-	[C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS,
-	[C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS,
-
-	[C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS,
-	[C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS,
-
-	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
-	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
-	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
-	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
-};
-
-static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
-					      [PERF_COUNT_HW_CACHE_OP_MAX]
-					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	PERF_CACHE_MAP_ALL_UNSUPPORTED,
-
-	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
-	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
-	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
-
-	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
-	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
-	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
-	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
-
-	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
-	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
-};
-
-static ssize_t
-armv8pmu_events_sysfs_show(struct device *dev,
-			   struct device_attribute *attr, char *page)
-{
-	struct perf_pmu_events_attr *pmu_attr;
-
-	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
-
-	return sprintf(page, "event=0x%04llx\n", pmu_attr->id);
-}
-
-#define ARMV8_EVENT_ATTR(name, config)						\
-	PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config)
-
-static struct attribute *armv8_pmuv3_event_attrs[] = {
-	ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR),
-	ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL),
-	ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL),
-	ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL),
-	ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE),
-	ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL),
-	ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED),
-	ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED),
-	ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED),
-	ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN),
-	ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN),
-	ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED),
-	ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED),
-	ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED),
-	ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED),
-	ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED),
-	ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED),
-	ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES),
-	ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED),
-	ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS),
-	ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE),
-	ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB),
-	ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE),
-	ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL),
-	ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB),
-	ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS),
-	ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR),
-	ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC),
-	ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED),
-	ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES),
-	/* Don't expose the chain event in /sys, since it's useless in isolation */
-	ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE),
-	ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE),
-	ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED),
-	ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED),
-	ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND),
-	ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND),
-	ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB),
-	ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB),
-	ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE),
-	ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL),
-	ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE),
-	ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL),
-	ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE),
-	ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB),
-	ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL),
-	ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL),
-	ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB),
-	ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB),
-	ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS),
-	ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE),
-	ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS),
-	ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK),
-	ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK),
-	ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD),
-	ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD),
-	ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD),
-	ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD),
-	ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED),
-	ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC),
-	ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL),
-	ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND),
-	ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND),
-	ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT),
-	ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP),
-	ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED),
-	ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE),
-	ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION),
-	ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES),
-	ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM),
-	ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS),
-	ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD),
-	ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS),
-	ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD),
-	ARMV8_EVENT_ATTR(trb_wrap, ARMV8_PMUV3_PERFCTR_TRB_WRAP),
-	ARMV8_EVENT_ATTR(trb_trig, ARMV8_PMUV3_PERFCTR_TRB_TRIG),
-	ARMV8_EVENT_ATTR(trcextout0, ARMV8_PMUV3_PERFCTR_TRCEXTOUT0),
-	ARMV8_EVENT_ATTR(trcextout1, ARMV8_PMUV3_PERFCTR_TRCEXTOUT1),
-	ARMV8_EVENT_ATTR(trcextout2, ARMV8_PMUV3_PERFCTR_TRCEXTOUT2),
-	ARMV8_EVENT_ATTR(trcextout3, ARMV8_PMUV3_PERFCTR_TRCEXTOUT3),
-	ARMV8_EVENT_ATTR(cti_trigout4, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4),
-	ARMV8_EVENT_ATTR(cti_trigout5, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5),
-	ARMV8_EVENT_ATTR(cti_trigout6, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6),
-	ARMV8_EVENT_ATTR(cti_trigout7, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7),
-	ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT),
-	ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT),
-	ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT),
-	ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED),
-	ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD),
-	ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR),
-	NULL,
-};
-
-static umode_t
-armv8pmu_event_attr_is_visible(struct kobject *kobj,
-			       struct attribute *attr, int unused)
-{
-	struct device *dev = kobj_to_dev(kobj);
-	struct pmu *pmu = dev_get_drvdata(dev);
-	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	struct perf_pmu_events_attr *pmu_attr;
-
-	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
-
-	if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
-	    test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap))
-		return attr->mode;
-
-	if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) {
-		u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE;
-
-		if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
-		    test_bit(id, cpu_pmu->pmceid_ext_bitmap))
-			return attr->mode;
-	}
-
-	return 0;
-}
-
-static const struct attribute_group armv8_pmuv3_events_attr_group = {
-	.name = "events",
-	.attrs = armv8_pmuv3_event_attrs,
-	.is_visible = armv8pmu_event_attr_is_visible,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-15");
-PMU_FORMAT_ATTR(long, "config1:0");
-PMU_FORMAT_ATTR(rdpmc, "config1:1");
-
-static int sysctl_perf_user_access __read_mostly;
-
-static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
-{
-	return event->attr.config1 & 0x1;
-}
-
-static inline bool armv8pmu_event_want_user_access(struct perf_event *event)
-{
-	return event->attr.config1 & 0x2;
-}
-
-static struct attribute *armv8_pmuv3_format_attrs[] = {
-	&format_attr_event.attr,
-	&format_attr_long.attr,
-	&format_attr_rdpmc.attr,
-	NULL,
-};
-
-static const struct attribute_group armv8_pmuv3_format_attr_group = {
-	.name = "format",
-	.attrs = armv8_pmuv3_format_attrs,
-};
-
-static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
-			  char *page)
-{
-	struct pmu *pmu = dev_get_drvdata(dev);
-	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
-
-	return sysfs_emit(page, "0x%08x\n", slots);
-}
-
-static DEVICE_ATTR_RO(slots);
-
-static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr,
-			      char *page)
-{
-	struct pmu *pmu = dev_get_drvdata(dev);
-	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT)
-			& ARMV8_PMU_BUS_SLOTS_MASK;
-
-	return sysfs_emit(page, "0x%08x\n", bus_slots);
-}
-
-static DEVICE_ATTR_RO(bus_slots);
-
-static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
-			      char *page)
-{
-	struct pmu *pmu = dev_get_drvdata(dev);
-	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT)
-			& ARMV8_PMU_BUS_WIDTH_MASK;
-	u32 val = 0;
-
-	/* Encoded as Log2(number of bytes), plus one */
-	if (bus_width > 2 && bus_width < 13)
-		val = 1 << (bus_width - 1);
-
-	return sysfs_emit(page, "0x%08x\n", val);
-}
-
-static DEVICE_ATTR_RO(bus_width);
-
-static struct attribute *armv8_pmuv3_caps_attrs[] = {
-	&dev_attr_slots.attr,
-	&dev_attr_bus_slots.attr,
-	&dev_attr_bus_width.attr,
-	NULL,
-};
-
-static const struct attribute_group armv8_pmuv3_caps_attr_group = {
-	.name = "caps",
-	.attrs = armv8_pmuv3_caps_attrs,
-};
-
-/*
- * Perf Events' indices
- */
-#define	ARMV8_IDX_CYCLE_COUNTER	0
-#define	ARMV8_IDX_COUNTER0	1
-#define	ARMV8_IDX_CYCLE_COUNTER_USER	32
-
-/*
- * We unconditionally enable ARMv8.5-PMU long event counter support
- * (64-bit events) where supported. Indicate if this arm_pmu has long
- * event counter support.
- */
-static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
-{
-	return (cpu_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5);
-}
-
-static inline bool armv8pmu_event_has_user_read(struct perf_event *event)
-{
-	return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT;
-}
-
-/*
- * We must chain two programmable counters for 64 bit events,
- * except when we have allocated the 64bit cycle counter (for CPU
- * cycles event) or when user space counter access is enabled.
- */
-static inline bool armv8pmu_event_is_chained(struct perf_event *event)
-{
-	int idx = event->hw.idx;
-	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
-
-	return !armv8pmu_event_has_user_read(event) &&
-	       armv8pmu_event_is_64bit(event) &&
-	       !armv8pmu_has_long_event(cpu_pmu) &&
-	       (idx != ARMV8_IDX_CYCLE_COUNTER);
-}
-
-/*
- * ARMv8 low level PMU access
- */
-
-/*
- * Perf Event to low level counters mapping
- */
-#define	ARMV8_IDX_TO_COUNTER(x)	\
-	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
-
-/*
- * This code is really good
- */
-
-#define PMEVN_CASE(n, case_macro) \
-	case n: case_macro(n); break
-
-#define PMEVN_SWITCH(x, case_macro)				\
-	do {							\
-		switch (x) {					\
-		PMEVN_CASE(0,  case_macro);			\
-		PMEVN_CASE(1,  case_macro);			\
-		PMEVN_CASE(2,  case_macro);			\
-		PMEVN_CASE(3,  case_macro);			\
-		PMEVN_CASE(4,  case_macro);			\
-		PMEVN_CASE(5,  case_macro);			\
-		PMEVN_CASE(6,  case_macro);			\
-		PMEVN_CASE(7,  case_macro);			\
-		PMEVN_CASE(8,  case_macro);			\
-		PMEVN_CASE(9,  case_macro);			\
-		PMEVN_CASE(10, case_macro);			\
-		PMEVN_CASE(11, case_macro);			\
-		PMEVN_CASE(12, case_macro);			\
-		PMEVN_CASE(13, case_macro);			\
-		PMEVN_CASE(14, case_macro);			\
-		PMEVN_CASE(15, case_macro);			\
-		PMEVN_CASE(16, case_macro);			\
-		PMEVN_CASE(17, case_macro);			\
-		PMEVN_CASE(18, case_macro);			\
-		PMEVN_CASE(19, case_macro);			\
-		PMEVN_CASE(20, case_macro);			\
-		PMEVN_CASE(21, case_macro);			\
-		PMEVN_CASE(22, case_macro);			\
-		PMEVN_CASE(23, case_macro);			\
-		PMEVN_CASE(24, case_macro);			\
-		PMEVN_CASE(25, case_macro);			\
-		PMEVN_CASE(26, case_macro);			\
-		PMEVN_CASE(27, case_macro);			\
-		PMEVN_CASE(28, case_macro);			\
-		PMEVN_CASE(29, case_macro);			\
-		PMEVN_CASE(30, case_macro);			\
-		default: WARN(1, "Invalid PMEV* index\n");	\
-		}						\
-	} while (0)
-
-#define RETURN_READ_PMEVCNTRN(n) \
-	return read_sysreg(pmevcntr##n##_el0)
-static unsigned long read_pmevcntrn(int n)
-{
-	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
-	return 0;
-}
-
-#define WRITE_PMEVCNTRN(n) \
-	write_sysreg(val, pmevcntr##n##_el0)
-static void write_pmevcntrn(int n, unsigned long val)
-{
-	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
-}
-
-#define WRITE_PMEVTYPERN(n) \
-	write_sysreg(val, pmevtyper##n##_el0)
-static void write_pmevtypern(int n, unsigned long val)
-{
-	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
-}
-
-static inline u32 armv8pmu_pmcr_read(void)
-{
-	return read_sysreg(pmcr_el0);
-}
-
-static inline void armv8pmu_pmcr_write(u32 val)
-{
-	val &= ARMV8_PMU_PMCR_MASK;
-	isb();
-	write_sysreg(val, pmcr_el0);
-}
-
-static inline int armv8pmu_has_overflowed(u32 pmovsr)
-{
-	return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
-}
-
-static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
-{
-	return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
-}
-
-static inline u64 armv8pmu_read_evcntr(int idx)
-{
-	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
-
-	return read_pmevcntrn(counter);
-}
-
-static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
-{
-	int idx = event->hw.idx;
-	u64 val = armv8pmu_read_evcntr(idx);
-
-	if (armv8pmu_event_is_chained(event))
-		val = (val << 32) | armv8pmu_read_evcntr(idx - 1);
-	return val;
-}
-
-/*
- * The cycle counter is always a 64-bit counter. When ARMV8_PMU_PMCR_LP
- * is set the event counters also become 64-bit counters. Unless the
- * user has requested a long counter (attr.config1) then we want to
- * interrupt upon 32-bit overflow - we achieve this by applying a bias.
- */
-static bool armv8pmu_event_needs_bias(struct perf_event *event)
-{
-	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	if (armv8pmu_event_is_64bit(event))
-		return false;
-
-	if (armv8pmu_has_long_event(cpu_pmu) ||
-	    idx == ARMV8_IDX_CYCLE_COUNTER)
-		return true;
-
-	return false;
-}
-
-static u64 armv8pmu_bias_long_counter(struct perf_event *event, u64 value)
-{
-	if (armv8pmu_event_needs_bias(event))
-		value |= GENMASK(63, 32);
-
-	return value;
-}
-
-static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value)
-{
-	if (armv8pmu_event_needs_bias(event))
-		value &= ~GENMASK(63, 32);
-
-	return value;
-}
-
-static u64 armv8pmu_read_counter(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-	u64 value;
-
-	if (idx == ARMV8_IDX_CYCLE_COUNTER)
-		value = read_sysreg(pmccntr_el0);
-	else
-		value = armv8pmu_read_hw_counter(event);
-
-	return  armv8pmu_unbias_long_counter(event, value);
-}
-
-static inline void armv8pmu_write_evcntr(int idx, u64 value)
-{
-	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
-
-	write_pmevcntrn(counter, value);
-}
-
-static inline void armv8pmu_write_hw_counter(struct perf_event *event,
-					     u64 value)
-{
-	int idx = event->hw.idx;
-
-	if (armv8pmu_event_is_chained(event)) {
-		armv8pmu_write_evcntr(idx, upper_32_bits(value));
-		armv8pmu_write_evcntr(idx - 1, lower_32_bits(value));
-	} else {
-		armv8pmu_write_evcntr(idx, value);
-	}
-}
-
-static void armv8pmu_write_counter(struct perf_event *event, u64 value)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	value = armv8pmu_bias_long_counter(event, value);
-
-	if (idx == ARMV8_IDX_CYCLE_COUNTER)
-		write_sysreg(value, pmccntr_el0);
-	else
-		armv8pmu_write_hw_counter(event, value);
-}
-
-static inline void armv8pmu_write_evtype(int idx, u32 val)
-{
-	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
-
-	val &= ARMV8_PMU_EVTYPE_MASK;
-	write_pmevtypern(counter, val);
-}
-
-static inline void armv8pmu_write_event_type(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	/*
-	 * For chained events, the low counter is programmed to count
-	 * the event of interest and the high counter is programmed
-	 * with CHAIN event code with filters set to count at all ELs.
-	 */
-	if (armv8pmu_event_is_chained(event)) {
-		u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN |
-				ARMV8_PMU_INCLUDE_EL2;
-
-		armv8pmu_write_evtype(idx - 1, hwc->config_base);
-		armv8pmu_write_evtype(idx, chain_evt);
-	} else {
-		if (idx == ARMV8_IDX_CYCLE_COUNTER)
-			write_sysreg(hwc->config_base, pmccfiltr_el0);
-		else
-			armv8pmu_write_evtype(idx, hwc->config_base);
-	}
-}
-
-static u32 armv8pmu_event_cnten_mask(struct perf_event *event)
-{
-	int counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
-	u32 mask = BIT(counter);
-
-	if (armv8pmu_event_is_chained(event))
-		mask |= BIT(counter - 1);
-	return mask;
-}
-
-static inline void armv8pmu_enable_counter(u32 mask)
-{
-	/*
-	 * Make sure event configuration register writes are visible before we
-	 * enable the counter.
-	 * */
-	isb();
-	write_sysreg(mask, pmcntenset_el0);
-}
-
-static inline void armv8pmu_enable_event_counter(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	u32 mask = armv8pmu_event_cnten_mask(event);
-
-	kvm_set_pmu_events(mask, attr);
-
-	/* We rely on the hypervisor switch code to enable guest counters */
-	if (!kvm_pmu_counter_deferred(attr))
-		armv8pmu_enable_counter(mask);
-}
-
-static inline void armv8pmu_disable_counter(u32 mask)
-{
-	write_sysreg(mask, pmcntenclr_el0);
-	/*
-	 * Make sure the effects of disabling the counter are visible before we
-	 * start configuring the event.
-	 */
-	isb();
-}
-
-static inline void armv8pmu_disable_event_counter(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	u32 mask = armv8pmu_event_cnten_mask(event);
-
-	kvm_clr_pmu_events(mask);
-
-	/* We rely on the hypervisor switch code to disable guest counters */
-	if (!kvm_pmu_counter_deferred(attr))
-		armv8pmu_disable_counter(mask);
-}
-
-static inline void armv8pmu_enable_intens(u32 mask)
-{
-	write_sysreg(mask, pmintenset_el1);
-}
-
-static inline void armv8pmu_enable_event_irq(struct perf_event *event)
-{
-	u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
-	armv8pmu_enable_intens(BIT(counter));
-}
-
-static inline void armv8pmu_disable_intens(u32 mask)
-{
-	write_sysreg(mask, pmintenclr_el1);
-	isb();
-	/* Clear the overflow flag in case an interrupt is pending. */
-	write_sysreg(mask, pmovsclr_el0);
-	isb();
-}
-
-static inline void armv8pmu_disable_event_irq(struct perf_event *event)
-{
-	u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
-	armv8pmu_disable_intens(BIT(counter));
-}
-
-static inline u32 armv8pmu_getreset_flags(void)
-{
-	u32 value;
-
-	/* Read */
-	value = read_sysreg(pmovsclr_el0);
-
-	/* Write to clear flags */
-	value &= ARMV8_PMU_OVSR_MASK;
-	write_sysreg(value, pmovsclr_el0);
-
-	return value;
-}
-
-static void armv8pmu_disable_user_access(void)
-{
-	write_sysreg(0, pmuserenr_el0);
-}
-
-static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
-{
-	int i;
-	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
-
-	/* Clear any unused counters to avoid leaking their contents */
-	for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
-		if (i == ARMV8_IDX_CYCLE_COUNTER)
-			write_sysreg(0, pmccntr_el0);
-		else
-			armv8pmu_write_evcntr(i, 0);
-	}
-
-	write_sysreg(0, pmuserenr_el0);
-	write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
-}
-
-static void armv8pmu_enable_event(struct perf_event *event)
-{
-	/*
-	 * Enable counter and interrupt, and set the counter to count
-	 * the event that we're interested in.
-	 */
-
-	/*
-	 * Disable counter
-	 */
-	armv8pmu_disable_event_counter(event);
-
-	/*
-	 * Set event.
-	 */
-	armv8pmu_write_event_type(event);
-
-	/*
-	 * Enable interrupt for this counter
-	 */
-	armv8pmu_enable_event_irq(event);
-
-	/*
-	 * Enable counter
-	 */
-	armv8pmu_enable_event_counter(event);
-}
-
-static void armv8pmu_disable_event(struct perf_event *event)
-{
-	/*
-	 * Disable counter
-	 */
-	armv8pmu_disable_event_counter(event);
-
-	/*
-	 * Disable interrupt for this counter
-	 */
-	armv8pmu_disable_event_irq(event);
-}
-
-static void armv8pmu_start(struct arm_pmu *cpu_pmu)
-{
-	struct perf_event_context *ctx;
-	int nr_user = 0;
-
-	ctx = perf_cpu_task_ctx();
-	if (ctx)
-		nr_user = ctx->nr_user;
-
-	if (sysctl_perf_user_access && nr_user)
-		armv8pmu_enable_user_access(cpu_pmu);
-	else
-		armv8pmu_disable_user_access();
-
-	/* Enable all counters */
-	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
-}
-
-static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
-{
-	/* Disable all counters */
-	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
-}
-
-static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
-{
-	u32 pmovsr;
-	struct perf_sample_data data;
-	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
-	struct pt_regs *regs;
-	int idx;
-
-	/*
-	 * Get and reset the IRQ flags
-	 */
-	pmovsr = armv8pmu_getreset_flags();
-
-	/*
-	 * Did an overflow occur?
-	 */
-	if (!armv8pmu_has_overflowed(pmovsr))
-		return IRQ_NONE;
-
-	/*
-	 * Handle the counter(s) overflow(s)
-	 */
-	regs = get_irq_regs();
-
-	/*
-	 * Stop the PMU while processing the counter overflows
-	 * to prevent skews in group events.
-	 */
-	armv8pmu_stop(cpu_pmu);
-	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
-		struct perf_event *event = cpuc->events[idx];
-		struct hw_perf_event *hwc;
-
-		/* Ignore if we don't have an event. */
-		if (!event)
-			continue;
-
-		/*
-		 * We have a single interrupt for all counters. Check that
-		 * each counter has overflowed before we process it.
-		 */
-		if (!armv8pmu_counter_has_overflowed(pmovsr, idx))
-			continue;
-
-		hwc = &event->hw;
-		armpmu_event_update(event);
-		perf_sample_data_init(&data, 0, hwc->last_period);
-		if (!armpmu_event_set_period(event))
-			continue;
-
-		/*
-		 * Perf event overflow will queue the processing of the event as
-		 * an irq_work which will be taken care of in the handling of
-		 * IPI_IRQ_WORK.
-		 */
-		if (perf_event_overflow(event, &data, regs))
-			cpu_pmu->disable(event);
-	}
-	armv8pmu_start(cpu_pmu);
-
-	return IRQ_HANDLED;
-}
-
-static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
-				    struct arm_pmu *cpu_pmu)
-{
-	int idx;
-
-	for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) {
-		if (!test_and_set_bit(idx, cpuc->used_mask))
-			return idx;
-	}
-	return -EAGAIN;
-}
-
-static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
-				   struct arm_pmu *cpu_pmu)
-{
-	int idx;
-
-	/*
-	 * Chaining requires two consecutive event counters, where
-	 * the lower idx must be even.
-	 */
-	for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) {
-		if (!test_and_set_bit(idx, cpuc->used_mask)) {
-			/* Check if the preceding even counter is available */
-			if (!test_and_set_bit(idx - 1, cpuc->used_mask))
-				return idx;
-			/* Release the Odd counter */
-			clear_bit(idx, cpuc->used_mask);
-		}
-	}
-	return -EAGAIN;
-}
-
-static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
-				  struct perf_event *event)
-{
-	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event *hwc = &event->hw;
-	unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
-
-	/* Always prefer to place a cycle counter into the cycle counter. */
-	if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
-		if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
-			return ARMV8_IDX_CYCLE_COUNTER;
-		else if (armv8pmu_event_is_64bit(event) &&
-			   armv8pmu_event_want_user_access(event) &&
-			   !armv8pmu_has_long_event(cpu_pmu))
-				return -EAGAIN;
-	}
-
-	/*
-	 * Otherwise use events counters
-	 */
-	if (armv8pmu_event_is_chained(event))
-		return	armv8pmu_get_chain_idx(cpuc, cpu_pmu);
-	else
-		return armv8pmu_get_single_idx(cpuc, cpu_pmu);
-}
-
-static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
-				     struct perf_event *event)
-{
-	int idx = event->hw.idx;
-
-	clear_bit(idx, cpuc->used_mask);
-	if (armv8pmu_event_is_chained(event))
-		clear_bit(idx - 1, cpuc->used_mask);
-}
-
-static int armv8pmu_user_event_idx(struct perf_event *event)
-{
-	if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event))
-		return 0;
-
-	/*
-	 * We remap the cycle counter index to 32 to
-	 * match the offset applied to the rest of
-	 * the counter indices.
-	 */
-	if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER)
-		return ARMV8_IDX_CYCLE_COUNTER_USER;
-
-	return event->hw.idx;
-}
-
-/*
- * Add an event filter to a given event.
- */
-static int armv8pmu_set_event_filter(struct hw_perf_event *event,
-				     struct perf_event_attr *attr)
-{
-	unsigned long config_base = 0;
-
-	if (attr->exclude_idle)
-		return -EPERM;
-
-	/*
-	 * If we're running in hyp mode, then we *are* the hypervisor.
-	 * Therefore we ignore exclude_hv in this configuration, since
-	 * there's no hypervisor to sample anyway. This is consistent
-	 * with other architectures (x86 and Power).
-	 */
-	if (is_kernel_in_hyp_mode()) {
-		if (!attr->exclude_kernel && !attr->exclude_host)
-			config_base |= ARMV8_PMU_INCLUDE_EL2;
-		if (attr->exclude_guest)
-			config_base |= ARMV8_PMU_EXCLUDE_EL1;
-		if (attr->exclude_host)
-			config_base |= ARMV8_PMU_EXCLUDE_EL0;
-	} else {
-		if (!attr->exclude_hv && !attr->exclude_host)
-			config_base |= ARMV8_PMU_INCLUDE_EL2;
-	}
-
-	/*
-	 * Filter out !VHE kernels and guest kernels
-	 */
-	if (attr->exclude_kernel)
-		config_base |= ARMV8_PMU_EXCLUDE_EL1;
-
-	if (attr->exclude_user)
-		config_base |= ARMV8_PMU_EXCLUDE_EL0;
-
-	/*
-	 * Install the filter into config_base as this is used to
-	 * construct the event type.
-	 */
-	event->config_base = config_base;
-
-	return 0;
-}
-
-static void armv8pmu_reset(void *info)
-{
-	struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
-	u32 pmcr;
-
-	/* The counter and interrupt enable registers are unknown at reset. */
-	armv8pmu_disable_counter(U32_MAX);
-	armv8pmu_disable_intens(U32_MAX);
-
-	/* Clear the counters we flip at guest entry/exit */
-	kvm_clr_pmu_events(U32_MAX);
-
-	/*
-	 * Initialize & Reset PMNC. Request overflow interrupt for
-	 * 64 bit cycle counter but cheat in armv8pmu_write_counter().
-	 */
-	pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC;
-
-	/* Enable long event counter support where available */
-	if (armv8pmu_has_long_event(cpu_pmu))
-		pmcr |= ARMV8_PMU_PMCR_LP;
-
-	armv8pmu_pmcr_write(pmcr);
-}
-
-static int __armv8_pmuv3_map_event(struct perf_event *event,
-				   const unsigned (*extra_event_map)
-						  [PERF_COUNT_HW_MAX],
-				   const unsigned (*extra_cache_map)
-						  [PERF_COUNT_HW_CACHE_MAX]
-						  [PERF_COUNT_HW_CACHE_OP_MAX]
-						  [PERF_COUNT_HW_CACHE_RESULT_MAX])
-{
-	int hw_event_id;
-	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-
-	hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map,
-				       &armv8_pmuv3_perf_cache_map,
-				       ARMV8_PMU_EVTYPE_EVENT);
-
-	/*
-	 * CHAIN events only work when paired with an adjacent counter, and it
-	 * never makes sense for a user to open one in isolation, as they'll be
-	 * rotated arbitrarily.
-	 */
-	if (hw_event_id == ARMV8_PMUV3_PERFCTR_CHAIN)
-		return -EINVAL;
-
-	if (armv8pmu_event_is_64bit(event))
-		event->hw.flags |= ARMPMU_EVT_64BIT;
-
-	/*
-	 * User events must be allocated into a single counter, and so
-	 * must not be chained.
-	 *
-	 * Most 64-bit events require long counter support, but 64-bit
-	 * CPU_CYCLES events can be placed into the dedicated cycle
-	 * counter when this is free.
-	 */
-	if (armv8pmu_event_want_user_access(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			return -EINVAL;
-		if (armv8pmu_event_is_64bit(event) &&
-		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
-		    !armv8pmu_has_long_event(armpmu))
-			return -EOPNOTSUPP;
-
-		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
-	}
-
-	/* Only expose micro/arch events supported by this PMU */
-	if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
-	    && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {
-		return hw_event_id;
-	}
-
-	return armpmu_map_event(event, extra_event_map, extra_cache_map,
-				ARMV8_PMU_EVTYPE_EVENT);
-}
-
-static int armv8_pmuv3_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL, NULL);
-}
-
-static int armv8_a53_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map);
-}
-
-static int armv8_a57_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map);
-}
-
-static int armv8_a73_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map);
-}
-
-static int armv8_thunder_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL,
-				       &armv8_thunder_perf_cache_map);
-}
-
-static int armv8_vulcan_map_event(struct perf_event *event)
-{
-	return __armv8_pmuv3_map_event(event, NULL,
-				       &armv8_vulcan_perf_cache_map);
-}
-
-struct armv8pmu_probe_info {
-	struct arm_pmu *pmu;
-	bool present;
-};
-
-static void __armv8pmu_probe_pmu(void *info)
-{
-	struct armv8pmu_probe_info *probe = info;
-	struct arm_pmu *cpu_pmu = probe->pmu;
-	u64 dfr0;
-	u64 pmceid_raw[2];
-	u32 pmceid[2];
-	int pmuver;
-
-	dfr0 = read_sysreg(id_aa64dfr0_el1);
-	pmuver = cpuid_feature_extract_unsigned_field(dfr0,
-			ID_AA64DFR0_EL1_PMUVer_SHIFT);
-	if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF ||
-	    pmuver == ID_AA64DFR0_EL1_PMUVer_NI)
-		return;
-
-	cpu_pmu->pmuver = pmuver;
-	probe->present = true;
-
-	/* Read the nb of CNTx counters supported from PMNC */
-	cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT)
-		& ARMV8_PMU_PMCR_N_MASK;
-
-	/* Add the CPU cycles counter */
-	cpu_pmu->num_events += 1;
-
-	pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0);
-	pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0);
-
-	bitmap_from_arr32(cpu_pmu->pmceid_bitmap,
-			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
-
-	pmceid[0] = pmceid_raw[0] >> 32;
-	pmceid[1] = pmceid_raw[1] >> 32;
-
-	bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap,
-			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
-
-	/* store PMMIR_EL1 register for sysfs */
-	if (pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4 && (pmceid_raw[1] & BIT(31)))
-		cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1);
-	else
-		cpu_pmu->reg_pmmir = 0;
-}
-
-static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
-{
-	struct armv8pmu_probe_info probe = {
-		.pmu = cpu_pmu,
-		.present = false,
-	};
-	int ret;
-
-	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
-				    __armv8pmu_probe_pmu,
-				    &probe, 1);
-	if (ret)
-		return ret;
-
-	return probe.present ? 0 : -ENODEV;
-}
-
-static void armv8pmu_disable_user_access_ipi(void *unused)
-{
-	armv8pmu_disable_user_access();
-}
-
-static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (ret || !write || sysctl_perf_user_access)
-		return ret;
-
-	on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);
-	return 0;
-}
-
-static struct ctl_table armv8_pmu_sysctl_table[] = {
-	{
-		.procname       = "perf_user_access",
-		.data		= &sysctl_perf_user_access,
-		.maxlen		= sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler	= armv8pmu_proc_user_access_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-	{ }
-};
-
-static void armv8_pmu_register_sysctl_table(void)
-{
-	static u32 tbl_registered = 0;
-
-	if (!cmpxchg_relaxed(&tbl_registered, 0, 1))
-		register_sysctl("kernel", armv8_pmu_sysctl_table);
-}
-
-static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
-			  int (*map_event)(struct perf_event *event),
-			  const struct attribute_group *events,
-			  const struct attribute_group *format,
-			  const struct attribute_group *caps)
-{
-	int ret = armv8pmu_probe_pmu(cpu_pmu);
-	if (ret)
-		return ret;
-
-	cpu_pmu->handle_irq		= armv8pmu_handle_irq;
-	cpu_pmu->enable			= armv8pmu_enable_event;
-	cpu_pmu->disable		= armv8pmu_disable_event;
-	cpu_pmu->read_counter		= armv8pmu_read_counter;
-	cpu_pmu->write_counter		= armv8pmu_write_counter;
-	cpu_pmu->get_event_idx		= armv8pmu_get_event_idx;
-	cpu_pmu->clear_event_idx	= armv8pmu_clear_event_idx;
-	cpu_pmu->start			= armv8pmu_start;
-	cpu_pmu->stop			= armv8pmu_stop;
-	cpu_pmu->reset			= armv8pmu_reset;
-	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
-
-	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
-
-	cpu_pmu->name			= name;
-	cpu_pmu->map_event		= map_event;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?
-			events : &armv8_pmuv3_events_attr_group;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ?
-			format : &armv8_pmuv3_format_attr_group;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
-			caps : &armv8_pmuv3_caps_attr_group;
-
-	armv8_pmu_register_sysctl_table();
-	return 0;
-}
-
-static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
-				   int (*map_event)(struct perf_event *event))
-{
-	return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
-}
-
-#define PMUV3_INIT_SIMPLE(name)						\
-static int name##_pmu_init(struct arm_pmu *cpu_pmu)			\
-{									\
-	return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\
-}
-
-PMUV3_INIT_SIMPLE(armv8_pmuv3)
-
-PMUV3_INIT_SIMPLE(armv8_cortex_a34)
-PMUV3_INIT_SIMPLE(armv8_cortex_a55)
-PMUV3_INIT_SIMPLE(armv8_cortex_a65)
-PMUV3_INIT_SIMPLE(armv8_cortex_a75)
-PMUV3_INIT_SIMPLE(armv8_cortex_a76)
-PMUV3_INIT_SIMPLE(armv8_cortex_a77)
-PMUV3_INIT_SIMPLE(armv8_cortex_a78)
-PMUV3_INIT_SIMPLE(armv9_cortex_a510)
-PMUV3_INIT_SIMPLE(armv9_cortex_a710)
-PMUV3_INIT_SIMPLE(armv8_cortex_x1)
-PMUV3_INIT_SIMPLE(armv9_cortex_x2)
-PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
-PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
-PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
-PMUV3_INIT_SIMPLE(armv8_neoverse_v1)
-
-PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
-PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
-
-static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35",
-				       armv8_a53_map_event);
-}
-
-static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53",
-				       armv8_a53_map_event);
-}
-
-static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
-				       armv8_a57_map_event);
-}
-
-static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
-				       armv8_a57_map_event);
-}
-
-static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73",
-				       armv8_a73_map_event);
-}
-
-static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
-				       armv8_thunder_map_event);
-}
-
-static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan",
-				       armv8_vulcan_map_event);
-}
-
-static const struct of_device_id armv8_pmu_of_device_ids[] = {
-	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_pmu_init},
-	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_cortex_a34_pmu_init},
-	{.compatible = "arm,cortex-a35-pmu",	.data = armv8_a35_pmu_init},
-	{.compatible = "arm,cortex-a53-pmu",	.data = armv8_a53_pmu_init},
-	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_cortex_a55_pmu_init},
-	{.compatible = "arm,cortex-a57-pmu",	.data = armv8_a57_pmu_init},
-	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_cortex_a65_pmu_init},
-	{.compatible = "arm,cortex-a72-pmu",	.data = armv8_a72_pmu_init},
-	{.compatible = "arm,cortex-a73-pmu",	.data = armv8_a73_pmu_init},
-	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_cortex_a75_pmu_init},
-	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_cortex_a76_pmu_init},
-	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_cortex_a77_pmu_init},
-	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_cortex_a78_pmu_init},
-	{.compatible = "arm,cortex-a510-pmu",	.data = armv9_cortex_a510_pmu_init},
-	{.compatible = "arm,cortex-a710-pmu",	.data = armv9_cortex_a710_pmu_init},
-	{.compatible = "arm,cortex-x1-pmu",	.data = armv8_cortex_x1_pmu_init},
-	{.compatible = "arm,cortex-x2-pmu",	.data = armv9_cortex_x2_pmu_init},
-	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_neoverse_e1_pmu_init},
-	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_neoverse_n1_pmu_init},
-	{.compatible = "arm,neoverse-n2-pmu",	.data = armv9_neoverse_n2_pmu_init},
-	{.compatible = "arm,neoverse-v1-pmu",	.data = armv8_neoverse_v1_pmu_init},
-	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
-	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
-	{.compatible = "nvidia,carmel-pmu",	.data = armv8_nvidia_carmel_pmu_init},
-	{.compatible = "nvidia,denver-pmu",	.data = armv8_nvidia_denver_pmu_init},
-	{},
-};
-
-static int armv8_pmu_device_probe(struct platform_device *pdev)
-{
-	return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL);
-}
-
-static struct platform_driver armv8_pmu_driver = {
-	.driver		= {
-		.name	= ARMV8_PMU_PDEV_NAME,
-		.of_match_table = armv8_pmu_of_device_ids,
-		.suppress_bind_attrs = true,
-	},
-	.probe		= armv8_pmu_device_probe,
-};
-
-static int __init armv8_pmu_driver_init(void)
-{
-	if (acpi_disabled)
-		return platform_driver_register(&armv8_pmu_driver);
-	else
-		return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init);
-}
-device_initcall(armv8_pmu_driver_init)
-
-void arch_perf_update_userpage(struct perf_event *event,
-			       struct perf_event_mmap_page *userpg, u64 now)
-{
-	struct clock_read_data *rd;
-	unsigned int seq;
-	u64 ns;
-
-	userpg->cap_user_time = 0;
-	userpg->cap_user_time_zero = 0;
-	userpg->cap_user_time_short = 0;
-	userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);
-
-	if (userpg->cap_user_rdpmc) {
-		if (event->hw.flags & ARMPMU_EVT_64BIT)
-			userpg->pmc_width = 64;
-		else
-			userpg->pmc_width = 32;
-	}
-
-	do {
-		rd = sched_clock_read_begin(&seq);
-
-		if (rd->read_sched_clock != arch_timer_read_counter)
-			return;
-
-		userpg->time_mult = rd->mult;
-		userpg->time_shift = rd->shift;
-		userpg->time_zero = rd->epoch_ns;
-		userpg->time_cycles = rd->epoch_cyc;
-		userpg->time_mask = rd->sched_clock_mask;
-
-		/*
-		 * Subtract the cycle base, such that software that
-		 * doesn't know about cap_user_time_short still 'works'
-		 * assuming no wraps.
-		 */
-		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
-		userpg->time_zero -= ns;
-
-	} while (sched_clock_read_retry(seq));
-
-	userpg->time_offset = userpg->time_zero - now;
-
-	/*
-	 * time_shift is not expected to be greater than 31 due to
-	 * the original published conversion algorithm shifting a
-	 * 32-bit value (now specifies a 64-bit value) - refer
-	 * perf_event_mmap_page documentation in perf_event.h.
-	 */
-	if (userpg->time_shift == 32) {
-		userpg->time_shift = 31;
-		userpg->time_mult >>= 1;
-	}
-
-	/*
-	 * Internal timekeeping for enabled/running/stopped times
-	 * is always computed with the sched_clock.
-	 */
-	userpg->cap_user_time = 1;
-	userpg->cap_user_time_zero = 1;
-	userpg->cap_user_time_short = 1;
-}
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 66c259000a44..defe6b47854b 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -100,6 +100,17 @@ config ARM_SMMU_V3_PMU
 	   through the SMMU and allow the resulting information to be filtered
 	   based on the Stream ID of the corresponding master.
 
+config ARM_PMUV3
+	depends on HW_PERF_EVENTS && ARM64
+	bool "ARM PMUv3 support" if !ARM64
+	default y
+	  help
+	  Say y if you want to use the ARM performance monitor unit (PMU)
+	  version 3. The PMUv3 is the CPU performance monitors on ARMv8
+	  (aarch32 and aarch64) systems that implement the PMUv3
+	  architecture.
+	  Currently, PMUv3 is only supported on aarch64 (arm64)
+
 config ARM_DSU_PMU
 	tristate "ARM DynamIQ Shared Unit (DSU) PMU"
 	depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 13e45da61100..dabc859540ce 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_ARM_CMN) += arm-cmn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_PMUV3) += arm_pmuv3.o
 obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
new file mode 100644
index 000000000000..c25203f8b940
--- /dev/null
+++ b/drivers/perf/arm_pmuv3.c
@@ -0,0 +1,1468 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ARMv8 PMUv3 Performance Events handling code.
+ *
+ * Copyright (C) 2012 ARM Limited
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This code is based heavily on the ARMv7 perf event code.
+ */
+
+#include <asm/irq_regs.h>
+#include <asm/perf_event.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+#include <clocksource/arm_arch_timer.h>
+
+#include <linux/acpi.h>
+#include <linux/clocksource.h>
+#include <linux/kvm_host.h>
+#include <linux/of.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/perf/arm_pmuv3.h>
+#include <linux/platform_device.h>
+#include <linux/sched_clock.h>
+#include <linux/smp.h>
+
+/* ARMv8 Cortex-A53 specific event types. */
+#define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
+
+/* ARMv8 Cavium ThunderX specific event types. */
+#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST			0xE9
+#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS		0xEA
+#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS		0xEB
+#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS		0xEC
+#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS		0xED
+
+/*
+ * ARMv8 Architectural defined events, not all of these may
+ * be supported on any given implementation. Unsupported events will
+ * be disabled at run-time based on the PMCEID registers.
+ */
+static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
+	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV8_PMUV3_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV8_PMUV3_PERFCTR_INST_RETIRED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE,
+	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED,
+	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
+	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV8_PMUV3_PERFCTR_BUS_CYCLES,
+	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV8_PMUV3_PERFCTR_STALL_FRONTEND,
+	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV8_PMUV3_PERFCTR_STALL_BACKEND,
+};
+
+static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+						[PERF_COUNT_HW_CACHE_OP_MAX]
+						[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL,
+	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1D_TLB,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL,
+	[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB,
+
+	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD,
+	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_LL_CACHE_RD,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_BR_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
+};
+
+static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					      [PERF_COUNT_HW_CACHE_OP_MAX]
+					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL,
+
+	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
+	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
+};
+
+static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					      [PERF_COUNT_HW_CACHE_OP_MAX]
+					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
+
+	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
+	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
+};
+
+static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					      [PERF_COUNT_HW_CACHE_OP_MAX]
+					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
+};
+
+static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+						   [PERF_COUNT_HW_CACHE_OP_MAX]
+						   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST,
+	[C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS,
+	[C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS,
+
+	[C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS,
+	[C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
+};
+
+static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					      [PERF_COUNT_HW_CACHE_OP_MAX]
+					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
+
+	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
+	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
+};
+
+static ssize_t
+armv8pmu_events_sysfs_show(struct device *dev,
+			   struct device_attribute *attr, char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sprintf(page, "event=0x%04llx\n", pmu_attr->id);
+}
+
+#define ARMV8_EVENT_ATTR(name, config)						\
+	PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config)
+
+static struct attribute *armv8_pmuv3_event_attrs[] = {
+	ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR),
+	ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL),
+	ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL),
+	ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL),
+	ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE),
+	ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL),
+	ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED),
+	ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED),
+	ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED),
+	ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN),
+	ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN),
+	ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED),
+	ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED),
+	ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED),
+	ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED),
+	ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED),
+	ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED),
+	ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES),
+	ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED),
+	ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS),
+	ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE),
+	ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB),
+	ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE),
+	ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL),
+	ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB),
+	ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS),
+	ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR),
+	ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC),
+	ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED),
+	ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES),
+	/* Don't expose the chain event in /sys, since it's useless in isolation */
+	ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE),
+	ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE),
+	ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED),
+	ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED),
+	ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND),
+	ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND),
+	ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB),
+	ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB),
+	ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE),
+	ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL),
+	ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE),
+	ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL),
+	ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE),
+	ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB),
+	ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL),
+	ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL),
+	ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB),
+	ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB),
+	ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS),
+	ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE),
+	ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS),
+	ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK),
+	ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK),
+	ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD),
+	ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD),
+	ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD),
+	ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED),
+	ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC),
+	ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL),
+	ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND),
+	ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND),
+	ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT),
+	ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP),
+	ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED),
+	ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE),
+	ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION),
+	ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES),
+	ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM),
+	ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS),
+	ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS),
+	ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(trb_wrap, ARMV8_PMUV3_PERFCTR_TRB_WRAP),
+	ARMV8_EVENT_ATTR(trb_trig, ARMV8_PMUV3_PERFCTR_TRB_TRIG),
+	ARMV8_EVENT_ATTR(trcextout0, ARMV8_PMUV3_PERFCTR_TRCEXTOUT0),
+	ARMV8_EVENT_ATTR(trcextout1, ARMV8_PMUV3_PERFCTR_TRCEXTOUT1),
+	ARMV8_EVENT_ATTR(trcextout2, ARMV8_PMUV3_PERFCTR_TRCEXTOUT2),
+	ARMV8_EVENT_ATTR(trcextout3, ARMV8_PMUV3_PERFCTR_TRCEXTOUT3),
+	ARMV8_EVENT_ATTR(cti_trigout4, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4),
+	ARMV8_EVENT_ATTR(cti_trigout5, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5),
+	ARMV8_EVENT_ATTR(cti_trigout6, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6),
+	ARMV8_EVENT_ATTR(cti_trigout7, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7),
+	ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED),
+	ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD),
+	ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR),
+	NULL,
+};
+
+static umode_t
+armv8pmu_event_attr_is_visible(struct kobject *kobj,
+			       struct attribute *attr, int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
+
+	if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
+	    test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap))
+		return attr->mode;
+
+	if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) {
+		u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE;
+
+		if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
+		    test_bit(id, cpu_pmu->pmceid_ext_bitmap))
+			return attr->mode;
+	}
+
+	return 0;
+}
+
+static const struct attribute_group armv8_pmuv3_events_attr_group = {
+	.name = "events",
+	.attrs = armv8_pmuv3_event_attrs,
+	.is_visible = armv8pmu_event_attr_is_visible,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-15");
+PMU_FORMAT_ATTR(long, "config1:0");
+PMU_FORMAT_ATTR(rdpmc, "config1:1");
+
+static int sysctl_perf_user_access __read_mostly;
+
+static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
+{
+	return event->attr.config1 & 0x1;
+}
+
+static inline bool armv8pmu_event_want_user_access(struct perf_event *event)
+{
+	return event->attr.config1 & 0x2;
+}
+
+static struct attribute *armv8_pmuv3_format_attrs[] = {
+	&format_attr_event.attr,
+	&format_attr_long.attr,
+	&format_attr_rdpmc.attr,
+	NULL,
+};
+
+static const struct attribute_group armv8_pmuv3_format_attr_group = {
+	.name = "format",
+	.attrs = armv8_pmuv3_format_attrs,
+};
+
+static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
+			  char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
+
+	return sysfs_emit(page, "0x%08x\n", slots);
+}
+
+static DEVICE_ATTR_RO(slots);
+
+static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr,
+			      char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT)
+			& ARMV8_PMU_BUS_SLOTS_MASK;
+
+	return sysfs_emit(page, "0x%08x\n", bus_slots);
+}
+
+static DEVICE_ATTR_RO(bus_slots);
+
+static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
+			      char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT)
+			& ARMV8_PMU_BUS_WIDTH_MASK;
+	u32 val = 0;
+
+	/* Encoded as Log2(number of bytes), plus one */
+	if (bus_width > 2 && bus_width < 13)
+		val = 1 << (bus_width - 1);
+
+	return sysfs_emit(page, "0x%08x\n", val);
+}
+
+static DEVICE_ATTR_RO(bus_width);
+
+static struct attribute *armv8_pmuv3_caps_attrs[] = {
+	&dev_attr_slots.attr,
+	&dev_attr_bus_slots.attr,
+	&dev_attr_bus_width.attr,
+	NULL,
+};
+
+static const struct attribute_group armv8_pmuv3_caps_attr_group = {
+	.name = "caps",
+	.attrs = armv8_pmuv3_caps_attrs,
+};
+
+/*
+ * Perf Events' indices
+ */
+#define	ARMV8_IDX_CYCLE_COUNTER	0
+#define	ARMV8_IDX_COUNTER0	1
+#define	ARMV8_IDX_CYCLE_COUNTER_USER	32
+
+/*
+ * We unconditionally enable ARMv8.5-PMU long event counter support
+ * (64-bit events) where supported. Indicate if this arm_pmu has long
+ * event counter support.
+ */
+static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
+{
+	return (cpu_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5);
+}
+
+static inline bool armv8pmu_event_has_user_read(struct perf_event *event)
+{
+	return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT;
+}
+
+/*
+ * We must chain two programmable counters for 64 bit events,
+ * except when we have allocated the 64bit cycle counter (for CPU
+ * cycles event) or when user space counter access is enabled.
+ */
+static inline bool armv8pmu_event_is_chained(struct perf_event *event)
+{
+	int idx = event->hw.idx;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+
+	return !armv8pmu_event_has_user_read(event) &&
+	       armv8pmu_event_is_64bit(event) &&
+	       !armv8pmu_has_long_event(cpu_pmu) &&
+	       (idx != ARMV8_IDX_CYCLE_COUNTER);
+}
+
+/*
+ * ARMv8 low level PMU access
+ */
+
+/*
+ * Perf Event to low level counters mapping
+ */
+#define	ARMV8_IDX_TO_COUNTER(x)	\
+	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
+
+/*
+ * This code is really good
+ */
+
+#define PMEVN_CASE(n, case_macro) \
+	case n: case_macro(n); break
+
+#define PMEVN_SWITCH(x, case_macro)				\
+	do {							\
+		switch (x) {					\
+		PMEVN_CASE(0,  case_macro);			\
+		PMEVN_CASE(1,  case_macro);			\
+		PMEVN_CASE(2,  case_macro);			\
+		PMEVN_CASE(3,  case_macro);			\
+		PMEVN_CASE(4,  case_macro);			\
+		PMEVN_CASE(5,  case_macro);			\
+		PMEVN_CASE(6,  case_macro);			\
+		PMEVN_CASE(7,  case_macro);			\
+		PMEVN_CASE(8,  case_macro);			\
+		PMEVN_CASE(9,  case_macro);			\
+		PMEVN_CASE(10, case_macro);			\
+		PMEVN_CASE(11, case_macro);			\
+		PMEVN_CASE(12, case_macro);			\
+		PMEVN_CASE(13, case_macro);			\
+		PMEVN_CASE(14, case_macro);			\
+		PMEVN_CASE(15, case_macro);			\
+		PMEVN_CASE(16, case_macro);			\
+		PMEVN_CASE(17, case_macro);			\
+		PMEVN_CASE(18, case_macro);			\
+		PMEVN_CASE(19, case_macro);			\
+		PMEVN_CASE(20, case_macro);			\
+		PMEVN_CASE(21, case_macro);			\
+		PMEVN_CASE(22, case_macro);			\
+		PMEVN_CASE(23, case_macro);			\
+		PMEVN_CASE(24, case_macro);			\
+		PMEVN_CASE(25, case_macro);			\
+		PMEVN_CASE(26, case_macro);			\
+		PMEVN_CASE(27, case_macro);			\
+		PMEVN_CASE(28, case_macro);			\
+		PMEVN_CASE(29, case_macro);			\
+		PMEVN_CASE(30, case_macro);			\
+		default: WARN(1, "Invalid PMEV* index\n");	\
+		}						\
+	} while (0)
+
+#define RETURN_READ_PMEVCNTRN(n) \
+	return read_sysreg(pmevcntr##n##_el0)
+static unsigned long read_pmevcntrn(int n)
+{
+	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
+	return 0;
+}
+
+#define WRITE_PMEVCNTRN(n) \
+	write_sysreg(val, pmevcntr##n##_el0)
+static void write_pmevcntrn(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
+}
+
+#define WRITE_PMEVTYPERN(n) \
+	write_sysreg(val, pmevtyper##n##_el0)
+static void write_pmevtypern(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
+}
+
+static inline u32 armv8pmu_pmcr_read(void)
+{
+	return read_sysreg(pmcr_el0);
+}
+
+static inline void armv8pmu_pmcr_write(u32 val)
+{
+	val &= ARMV8_PMU_PMCR_MASK;
+	isb();
+	write_sysreg(val, pmcr_el0);
+}
+
+static inline int armv8pmu_has_overflowed(u32 pmovsr)
+{
+	return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
+}
+
+static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
+{
+	return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
+}
+
+static inline u64 armv8pmu_read_evcntr(int idx)
+{
+	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
+	return read_pmevcntrn(counter);
+}
+
+static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
+{
+	int idx = event->hw.idx;
+	u64 val = armv8pmu_read_evcntr(idx);
+
+	if (armv8pmu_event_is_chained(event))
+		val = (val << 32) | armv8pmu_read_evcntr(idx - 1);
+	return val;
+}
+
+/*
+ * The cycle counter is always a 64-bit counter. When ARMV8_PMU_PMCR_LP
+ * is set the event counters also become 64-bit counters. Unless the
+ * user has requested a long counter (attr.config1) then we want to
+ * interrupt upon 32-bit overflow - we achieve this by applying a bias.
+ */
+static bool armv8pmu_event_needs_bias(struct perf_event *event)
+{
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (armv8pmu_event_is_64bit(event))
+		return false;
+
+	if (armv8pmu_has_long_event(cpu_pmu) ||
+	    idx == ARMV8_IDX_CYCLE_COUNTER)
+		return true;
+
+	return false;
+}
+
+static u64 armv8pmu_bias_long_counter(struct perf_event *event, u64 value)
+{
+	if (armv8pmu_event_needs_bias(event))
+		value |= GENMASK(63, 32);
+
+	return value;
+}
+
+static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value)
+{
+	if (armv8pmu_event_needs_bias(event))
+		value &= ~GENMASK(63, 32);
+
+	return value;
+}
+
+static u64 armv8pmu_read_counter(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	u64 value;
+
+	if (idx == ARMV8_IDX_CYCLE_COUNTER)
+		value = read_sysreg(pmccntr_el0);
+	else
+		value = armv8pmu_read_hw_counter(event);
+
+	return  armv8pmu_unbias_long_counter(event, value);
+}
+
+static inline void armv8pmu_write_evcntr(int idx, u64 value)
+{
+	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
+	write_pmevcntrn(counter, value);
+}
+
+static inline void armv8pmu_write_hw_counter(struct perf_event *event,
+					     u64 value)
+{
+	int idx = event->hw.idx;
+
+	if (armv8pmu_event_is_chained(event)) {
+		armv8pmu_write_evcntr(idx, upper_32_bits(value));
+		armv8pmu_write_evcntr(idx - 1, lower_32_bits(value));
+	} else {
+		armv8pmu_write_evcntr(idx, value);
+	}
+}
+
+static void armv8pmu_write_counter(struct perf_event *event, u64 value)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	value = armv8pmu_bias_long_counter(event, value);
+
+	if (idx == ARMV8_IDX_CYCLE_COUNTER)
+		write_sysreg(value, pmccntr_el0);
+	else
+		armv8pmu_write_hw_counter(event, value);
+}
+
+static inline void armv8pmu_write_evtype(int idx, u32 val)
+{
+	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
+	val &= ARMV8_PMU_EVTYPE_MASK;
+	write_pmevtypern(counter, val);
+}
+
+static inline void armv8pmu_write_event_type(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	/*
+	 * For chained events, the low counter is programmed to count
+	 * the event of interest and the high counter is programmed
+	 * with CHAIN event code with filters set to count at all ELs.
+	 */
+	if (armv8pmu_event_is_chained(event)) {
+		u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN |
+				ARMV8_PMU_INCLUDE_EL2;
+
+		armv8pmu_write_evtype(idx - 1, hwc->config_base);
+		armv8pmu_write_evtype(idx, chain_evt);
+	} else {
+		if (idx == ARMV8_IDX_CYCLE_COUNTER)
+			write_sysreg(hwc->config_base, pmccfiltr_el0);
+		else
+			armv8pmu_write_evtype(idx, hwc->config_base);
+	}
+}
+
+static u32 armv8pmu_event_cnten_mask(struct perf_event *event)
+{
+	int counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
+	u32 mask = BIT(counter);
+
+	if (armv8pmu_event_is_chained(event))
+		mask |= BIT(counter - 1);
+	return mask;
+}
+
+static inline void armv8pmu_enable_counter(u32 mask)
+{
+	/*
+	 * Make sure event configuration register writes are visible before we
+	 * enable the counter.
+	 * */
+	isb();
+	write_sysreg(mask, pmcntenset_el0);
+}
+
+static inline void armv8pmu_enable_event_counter(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u32 mask = armv8pmu_event_cnten_mask(event);
+
+	kvm_set_pmu_events(mask, attr);
+
+	/* We rely on the hypervisor switch code to enable guest counters */
+	if (!kvm_pmu_counter_deferred(attr))
+		armv8pmu_enable_counter(mask);
+}
+
+static inline void armv8pmu_disable_counter(u32 mask)
+{
+	write_sysreg(mask, pmcntenclr_el0);
+	/*
+	 * Make sure the effects of disabling the counter are visible before we
+	 * start configuring the event.
+	 */
+	isb();
+}
+
+static inline void armv8pmu_disable_event_counter(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u32 mask = armv8pmu_event_cnten_mask(event);
+
+	kvm_clr_pmu_events(mask);
+
+	/* We rely on the hypervisor switch code to disable guest counters */
+	if (!kvm_pmu_counter_deferred(attr))
+		armv8pmu_disable_counter(mask);
+}
+
+static inline void armv8pmu_enable_intens(u32 mask)
+{
+	write_sysreg(mask, pmintenset_el1);
+}
+
+static inline void armv8pmu_enable_event_irq(struct perf_event *event)
+{
+	u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
+	armv8pmu_enable_intens(BIT(counter));
+}
+
+static inline void armv8pmu_disable_intens(u32 mask)
+{
+	write_sysreg(mask, pmintenclr_el1);
+	isb();
+	/* Clear the overflow flag in case an interrupt is pending. */
+	write_sysreg(mask, pmovsclr_el0);
+	isb();
+}
+
+static inline void armv8pmu_disable_event_irq(struct perf_event *event)
+{
+	u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx);
+	armv8pmu_disable_intens(BIT(counter));
+}
+
+static inline u32 armv8pmu_getreset_flags(void)
+{
+	u32 value;
+
+	/* Read */
+	value = read_sysreg(pmovsclr_el0);
+
+	/* Write to clear flags */
+	value &= ARMV8_PMU_OVSR_MASK;
+	write_sysreg(value, pmovsclr_el0);
+
+	return value;
+}
+
+static void armv8pmu_disable_user_access(void)
+{
+	write_sysreg(0, pmuserenr_el0);
+}
+
+static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
+{
+	int i;
+	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
+
+	/* Clear any unused counters to avoid leaking their contents */
+	for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
+		if (i == ARMV8_IDX_CYCLE_COUNTER)
+			write_sysreg(0, pmccntr_el0);
+		else
+			armv8pmu_write_evcntr(i, 0);
+	}
+
+	write_sysreg(0, pmuserenr_el0);
+	write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
+}
+
+static void armv8pmu_enable_event(struct perf_event *event)
+{
+	/*
+	 * Enable counter and interrupt, and set the counter to count
+	 * the event that we're interested in.
+	 */
+
+	/*
+	 * Disable counter
+	 */
+	armv8pmu_disable_event_counter(event);
+
+	/*
+	 * Set event.
+	 */
+	armv8pmu_write_event_type(event);
+
+	/*
+	 * Enable interrupt for this counter
+	 */
+	armv8pmu_enable_event_irq(event);
+
+	/*
+	 * Enable counter
+	 */
+	armv8pmu_enable_event_counter(event);
+}
+
+static void armv8pmu_disable_event(struct perf_event *event)
+{
+	/*
+	 * Disable counter
+	 */
+	armv8pmu_disable_event_counter(event);
+
+	/*
+	 * Disable interrupt for this counter
+	 */
+	armv8pmu_disable_event_irq(event);
+}
+
+static void armv8pmu_start(struct arm_pmu *cpu_pmu)
+{
+	struct perf_event_context *ctx;
+	int nr_user = 0;
+
+	ctx = perf_cpu_task_ctx();
+	if (ctx)
+		nr_user = ctx->nr_user;
+
+	if (sysctl_perf_user_access && nr_user)
+		armv8pmu_enable_user_access(cpu_pmu);
+	else
+		armv8pmu_disable_user_access();
+
+	/* Enable all counters */
+	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
+}
+
+static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
+{
+	/* Disable all counters */
+	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
+}
+
+static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
+{
+	u32 pmovsr;
+	struct perf_sample_data data;
+	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
+	struct pt_regs *regs;
+	int idx;
+
+	/*
+	 * Get and reset the IRQ flags
+	 */
+	pmovsr = armv8pmu_getreset_flags();
+
+	/*
+	 * Did an overflow occur?
+	 */
+	if (!armv8pmu_has_overflowed(pmovsr))
+		return IRQ_NONE;
+
+	/*
+	 * Handle the counter(s) overflow(s)
+	 */
+	regs = get_irq_regs();
+
+	/*
+	 * Stop the PMU while processing the counter overflows
+	 * to prevent skews in group events.
+	 */
+	armv8pmu_stop(cpu_pmu);
+	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+
+		/* Ignore if we don't have an event. */
+		if (!event)
+			continue;
+
+		/*
+		 * We have a single interrupt for all counters. Check that
+		 * each counter has overflowed before we process it.
+		 */
+		if (!armv8pmu_counter_has_overflowed(pmovsr, idx))
+			continue;
+
+		hwc = &event->hw;
+		armpmu_event_update(event);
+		perf_sample_data_init(&data, 0, hwc->last_period);
+		if (!armpmu_event_set_period(event))
+			continue;
+
+		/*
+		 * Perf event overflow will queue the processing of the event as
+		 * an irq_work which will be taken care of in the handling of
+		 * IPI_IRQ_WORK.
+		 */
+		if (perf_event_overflow(event, &data, regs))
+			cpu_pmu->disable(event);
+	}
+	armv8pmu_start(cpu_pmu);
+
+	return IRQ_HANDLED;
+}
+
+static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
+				    struct arm_pmu *cpu_pmu)
+{
+	int idx;
+
+	for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) {
+		if (!test_and_set_bit(idx, cpuc->used_mask))
+			return idx;
+	}
+	return -EAGAIN;
+}
+
+static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
+				   struct arm_pmu *cpu_pmu)
+{
+	int idx;
+
+	/*
+	 * Chaining requires two consecutive event counters, where
+	 * the lower idx must be even.
+	 */
+	for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) {
+		if (!test_and_set_bit(idx, cpuc->used_mask)) {
+			/* Check if the preceding even counter is available */
+			if (!test_and_set_bit(idx - 1, cpuc->used_mask))
+				return idx;
+			/* Release the Odd counter */
+			clear_bit(idx, cpuc->used_mask);
+		}
+	}
+	return -EAGAIN;
+}
+
+static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
+				  struct perf_event *event)
+{
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
+
+	/* Always prefer to place a cycle counter into the cycle counter. */
+	if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
+		if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
+			return ARMV8_IDX_CYCLE_COUNTER;
+		else if (armv8pmu_event_is_64bit(event) &&
+			   armv8pmu_event_want_user_access(event) &&
+			   !armv8pmu_has_long_event(cpu_pmu))
+				return -EAGAIN;
+	}
+
+	/*
+	 * Otherwise use events counters
+	 */
+	if (armv8pmu_event_is_chained(event))
+		return	armv8pmu_get_chain_idx(cpuc, cpu_pmu);
+	else
+		return armv8pmu_get_single_idx(cpuc, cpu_pmu);
+}
+
+static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
+				     struct perf_event *event)
+{
+	int idx = event->hw.idx;
+
+	clear_bit(idx, cpuc->used_mask);
+	if (armv8pmu_event_is_chained(event))
+		clear_bit(idx - 1, cpuc->used_mask);
+}
+
+static int armv8pmu_user_event_idx(struct perf_event *event)
+{
+	if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event))
+		return 0;
+
+	/*
+	 * We remap the cycle counter index to 32 to
+	 * match the offset applied to the rest of
+	 * the counter indices.
+	 */
+	if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER)
+		return ARMV8_IDX_CYCLE_COUNTER_USER;
+
+	return event->hw.idx;
+}
+
+/*
+ * Add an event filter to a given event.
+ */
+static int armv8pmu_set_event_filter(struct hw_perf_event *event,
+				     struct perf_event_attr *attr)
+{
+	unsigned long config_base = 0;
+
+	if (attr->exclude_idle)
+		return -EPERM;
+
+	/*
+	 * If we're running in hyp mode, then we *are* the hypervisor.
+	 * Therefore we ignore exclude_hv in this configuration, since
+	 * there's no hypervisor to sample anyway. This is consistent
+	 * with other architectures (x86 and Power).
+	 */
+	if (is_kernel_in_hyp_mode()) {
+		if (!attr->exclude_kernel && !attr->exclude_host)
+			config_base |= ARMV8_PMU_INCLUDE_EL2;
+		if (attr->exclude_guest)
+			config_base |= ARMV8_PMU_EXCLUDE_EL1;
+		if (attr->exclude_host)
+			config_base |= ARMV8_PMU_EXCLUDE_EL0;
+	} else {
+		if (!attr->exclude_hv && !attr->exclude_host)
+			config_base |= ARMV8_PMU_INCLUDE_EL2;
+	}
+
+	/*
+	 * Filter out !VHE kernels and guest kernels
+	 */
+	if (attr->exclude_kernel)
+		config_base |= ARMV8_PMU_EXCLUDE_EL1;
+
+	if (attr->exclude_user)
+		config_base |= ARMV8_PMU_EXCLUDE_EL0;
+
+	/*
+	 * Install the filter into config_base as this is used to
+	 * construct the event type.
+	 */
+	event->config_base = config_base;
+
+	return 0;
+}
+
+static void armv8pmu_reset(void *info)
+{
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
+	u32 pmcr;
+
+	/* The counter and interrupt enable registers are unknown at reset. */
+	armv8pmu_disable_counter(U32_MAX);
+	armv8pmu_disable_intens(U32_MAX);
+
+	/* Clear the counters we flip at guest entry/exit */
+	kvm_clr_pmu_events(U32_MAX);
+
+	/*
+	 * Initialize & Reset PMNC. Request overflow interrupt for
+	 * 64 bit cycle counter but cheat in armv8pmu_write_counter().
+	 */
+	pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC;
+
+	/* Enable long event counter support where available */
+	if (armv8pmu_has_long_event(cpu_pmu))
+		pmcr |= ARMV8_PMU_PMCR_LP;
+
+	armv8pmu_pmcr_write(pmcr);
+}
+
+static int __armv8_pmuv3_map_event(struct perf_event *event,
+				   const unsigned (*extra_event_map)
+						  [PERF_COUNT_HW_MAX],
+				   const unsigned (*extra_cache_map)
+						  [PERF_COUNT_HW_CACHE_MAX]
+						  [PERF_COUNT_HW_CACHE_OP_MAX]
+						  [PERF_COUNT_HW_CACHE_RESULT_MAX])
+{
+	int hw_event_id;
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+
+	hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map,
+				       &armv8_pmuv3_perf_cache_map,
+				       ARMV8_PMU_EVTYPE_EVENT);
+
+	/*
+	 * CHAIN events only work when paired with an adjacent counter, and it
+	 * never makes sense for a user to open one in isolation, as they'll be
+	 * rotated arbitrarily.
+	 */
+	if (hw_event_id == ARMV8_PMUV3_PERFCTR_CHAIN)
+		return -EINVAL;
+
+	if (armv8pmu_event_is_64bit(event))
+		event->hw.flags |= ARMPMU_EVT_64BIT;
+
+	/*
+	 * User events must be allocated into a single counter, and so
+	 * must not be chained.
+	 *
+	 * Most 64-bit events require long counter support, but 64-bit
+	 * CPU_CYCLES events can be placed into the dedicated cycle
+	 * counter when this is free.
+	 */
+	if (armv8pmu_event_want_user_access(event)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			return -EINVAL;
+		if (armv8pmu_event_is_64bit(event) &&
+		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
+		    !armv8pmu_has_long_event(armpmu))
+			return -EOPNOTSUPP;
+
+		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+	}
+
+	/* Only expose micro/arch events supported by this PMU */
+	if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
+	    && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {
+		return hw_event_id;
+	}
+
+	return armpmu_map_event(event, extra_event_map, extra_cache_map,
+				ARMV8_PMU_EVTYPE_EVENT);
+}
+
+static int armv8_pmuv3_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL, NULL);
+}
+
+static int armv8_a53_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map);
+}
+
+static int armv8_a57_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map);
+}
+
+static int armv8_a73_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map);
+}
+
+static int armv8_thunder_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL,
+				       &armv8_thunder_perf_cache_map);
+}
+
+static int armv8_vulcan_map_event(struct perf_event *event)
+{
+	return __armv8_pmuv3_map_event(event, NULL,
+				       &armv8_vulcan_perf_cache_map);
+}
+
+struct armv8pmu_probe_info {
+	struct arm_pmu *pmu;
+	bool present;
+};
+
+static void __armv8pmu_probe_pmu(void *info)
+{
+	struct armv8pmu_probe_info *probe = info;
+	struct arm_pmu *cpu_pmu = probe->pmu;
+	u64 dfr0;
+	u64 pmceid_raw[2];
+	u32 pmceid[2];
+	int pmuver;
+
+	dfr0 = read_sysreg(id_aa64dfr0_el1);
+	pmuver = cpuid_feature_extract_unsigned_field(dfr0,
+			ID_AA64DFR0_EL1_PMUVer_SHIFT);
+	if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF ||
+	    pmuver == ID_AA64DFR0_EL1_PMUVer_NI)
+		return;
+
+	cpu_pmu->pmuver = pmuver;
+	probe->present = true;
+
+	/* Read the nb of CNTx counters supported from PMNC */
+	cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT)
+		& ARMV8_PMU_PMCR_N_MASK;
+
+	/* Add the CPU cycles counter */
+	cpu_pmu->num_events += 1;
+
+	pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0);
+	pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0);
+
+	bitmap_from_arr32(cpu_pmu->pmceid_bitmap,
+			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+
+	pmceid[0] = pmceid_raw[0] >> 32;
+	pmceid[1] = pmceid_raw[1] >> 32;
+
+	bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap,
+			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+
+	/* store PMMIR_EL1 register for sysfs */
+	if (pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4 && (pmceid_raw[1] & BIT(31)))
+		cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1);
+	else
+		cpu_pmu->reg_pmmir = 0;
+}
+
+static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
+{
+	struct armv8pmu_probe_info probe = {
+		.pmu = cpu_pmu,
+		.present = false,
+	};
+	int ret;
+
+	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
+				    __armv8pmu_probe_pmu,
+				    &probe, 1);
+	if (ret)
+		return ret;
+
+	return probe.present ? 0 : -ENODEV;
+}
+
+static void armv8pmu_disable_user_access_ipi(void *unused)
+{
+	armv8pmu_disable_user_access();
+}
+
+static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write || sysctl_perf_user_access)
+		return ret;
+
+	on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);
+	return 0;
+}
+
+static struct ctl_table armv8_pmu_sysctl_table[] = {
+	{
+		.procname       = "perf_user_access",
+		.data		= &sysctl_perf_user_access,
+		.maxlen		= sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= armv8pmu_proc_user_access_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static void armv8_pmu_register_sysctl_table(void)
+{
+	static u32 tbl_registered = 0;
+
+	if (!cmpxchg_relaxed(&tbl_registered, 0, 1))
+		register_sysctl("kernel", armv8_pmu_sysctl_table);
+}
+
+static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
+			  int (*map_event)(struct perf_event *event),
+			  const struct attribute_group *events,
+			  const struct attribute_group *format,
+			  const struct attribute_group *caps)
+{
+	int ret = armv8pmu_probe_pmu(cpu_pmu);
+	if (ret)
+		return ret;
+
+	cpu_pmu->handle_irq		= armv8pmu_handle_irq;
+	cpu_pmu->enable			= armv8pmu_enable_event;
+	cpu_pmu->disable		= armv8pmu_disable_event;
+	cpu_pmu->read_counter		= armv8pmu_read_counter;
+	cpu_pmu->write_counter		= armv8pmu_write_counter;
+	cpu_pmu->get_event_idx		= armv8pmu_get_event_idx;
+	cpu_pmu->clear_event_idx	= armv8pmu_clear_event_idx;
+	cpu_pmu->start			= armv8pmu_start;
+	cpu_pmu->stop			= armv8pmu_stop;
+	cpu_pmu->reset			= armv8pmu_reset;
+	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
+
+	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+
+	cpu_pmu->name			= name;
+	cpu_pmu->map_event		= map_event;
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?
+			events : &armv8_pmuv3_events_attr_group;
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ?
+			format : &armv8_pmuv3_format_attr_group;
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
+			caps : &armv8_pmuv3_caps_attr_group;
+
+	armv8_pmu_register_sysctl_table();
+	return 0;
+}
+
+static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
+				   int (*map_event)(struct perf_event *event))
+{
+	return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
+}
+
+#define PMUV3_INIT_SIMPLE(name)						\
+static int name##_pmu_init(struct arm_pmu *cpu_pmu)			\
+{									\
+	return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\
+}
+
+PMUV3_INIT_SIMPLE(armv8_pmuv3)
+
+PMUV3_INIT_SIMPLE(armv8_cortex_a34)
+PMUV3_INIT_SIMPLE(armv8_cortex_a55)
+PMUV3_INIT_SIMPLE(armv8_cortex_a65)
+PMUV3_INIT_SIMPLE(armv8_cortex_a75)
+PMUV3_INIT_SIMPLE(armv8_cortex_a76)
+PMUV3_INIT_SIMPLE(armv8_cortex_a77)
+PMUV3_INIT_SIMPLE(armv8_cortex_a78)
+PMUV3_INIT_SIMPLE(armv9_cortex_a510)
+PMUV3_INIT_SIMPLE(armv9_cortex_a710)
+PMUV3_INIT_SIMPLE(armv8_cortex_x1)
+PMUV3_INIT_SIMPLE(armv9_cortex_x2)
+PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
+PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
+PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
+PMUV3_INIT_SIMPLE(armv8_neoverse_v1)
+
+PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
+PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
+
+static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35",
+				       armv8_a53_map_event);
+}
+
+static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53",
+				       armv8_a53_map_event);
+}
+
+static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
+				       armv8_a57_map_event);
+}
+
+static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
+				       armv8_a57_map_event);
+}
+
+static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73",
+				       armv8_a73_map_event);
+}
+
+static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
+				       armv8_thunder_map_event);
+}
+
+static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan",
+				       armv8_vulcan_map_event);
+}
+
+static const struct of_device_id armv8_pmu_of_device_ids[] = {
+	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_pmu_init},
+	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_cortex_a34_pmu_init},
+	{.compatible = "arm,cortex-a35-pmu",	.data = armv8_a35_pmu_init},
+	{.compatible = "arm,cortex-a53-pmu",	.data = armv8_a53_pmu_init},
+	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_cortex_a55_pmu_init},
+	{.compatible = "arm,cortex-a57-pmu",	.data = armv8_a57_pmu_init},
+	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_cortex_a65_pmu_init},
+	{.compatible = "arm,cortex-a72-pmu",	.data = armv8_a72_pmu_init},
+	{.compatible = "arm,cortex-a73-pmu",	.data = armv8_a73_pmu_init},
+	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_cortex_a75_pmu_init},
+	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_cortex_a76_pmu_init},
+	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_cortex_a77_pmu_init},
+	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_cortex_a78_pmu_init},
+	{.compatible = "arm,cortex-a510-pmu",	.data = armv9_cortex_a510_pmu_init},
+	{.compatible = "arm,cortex-a710-pmu",	.data = armv9_cortex_a710_pmu_init},
+	{.compatible = "arm,cortex-x1-pmu",	.data = armv8_cortex_x1_pmu_init},
+	{.compatible = "arm,cortex-x2-pmu",	.data = armv9_cortex_x2_pmu_init},
+	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_neoverse_e1_pmu_init},
+	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_neoverse_n1_pmu_init},
+	{.compatible = "arm,neoverse-n2-pmu",	.data = armv9_neoverse_n2_pmu_init},
+	{.compatible = "arm,neoverse-v1-pmu",	.data = armv8_neoverse_v1_pmu_init},
+	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
+	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
+	{.compatible = "nvidia,carmel-pmu",	.data = armv8_nvidia_carmel_pmu_init},
+	{.compatible = "nvidia,denver-pmu",	.data = armv8_nvidia_denver_pmu_init},
+	{},
+};
+
+static int armv8_pmu_device_probe(struct platform_device *pdev)
+{
+	return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL);
+}
+
+static struct platform_driver armv8_pmu_driver = {
+	.driver		= {
+		.name	= ARMV8_PMU_PDEV_NAME,
+		.of_match_table = armv8_pmu_of_device_ids,
+		.suppress_bind_attrs = true,
+	},
+	.probe		= armv8_pmu_device_probe,
+};
+
+static int __init armv8_pmu_driver_init(void)
+{
+	if (acpi_disabled)
+		return platform_driver_register(&armv8_pmu_driver);
+	else
+		return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init);
+}
+device_initcall(armv8_pmu_driver_init)
+
+void arch_perf_update_userpage(struct perf_event *event,
+			       struct perf_event_mmap_page *userpg, u64 now)
+{
+	struct clock_read_data *rd;
+	unsigned int seq;
+	u64 ns;
+
+	userpg->cap_user_time = 0;
+	userpg->cap_user_time_zero = 0;
+	userpg->cap_user_time_short = 0;
+	userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);
+
+	if (userpg->cap_user_rdpmc) {
+		if (event->hw.flags & ARMPMU_EVT_64BIT)
+			userpg->pmc_width = 64;
+		else
+			userpg->pmc_width = 32;
+	}
+
+	do {
+		rd = sched_clock_read_begin(&seq);
+
+		if (rd->read_sched_clock != arch_timer_read_counter)
+			return;
+
+		userpg->time_mult = rd->mult;
+		userpg->time_shift = rd->shift;
+		userpg->time_zero = rd->epoch_ns;
+		userpg->time_cycles = rd->epoch_cyc;
+		userpg->time_mask = rd->sched_clock_mask;
+
+		/*
+		 * Subtract the cycle base, such that software that
+		 * doesn't know about cap_user_time_short still 'works'
+		 * assuming no wraps.
+		 */
+		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+		userpg->time_zero -= ns;
+
+	} while (sched_clock_read_retry(seq));
+
+	userpg->time_offset = userpg->time_zero - now;
+
+	/*
+	 * time_shift is not expected to be greater than 31 due to
+	 * the original published conversion algorithm shifting a
+	 * 32-bit value (now specifies a 64-bit value) - refer
+	 * perf_event_mmap_page documentation in perf_event.h.
+	 */
+	if (userpg->time_shift == 32) {
+		userpg->time_shift = 31;
+		userpg->time_mult >>= 1;
+	}
+
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always computed with the sched_clock.
+	 */
+	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
+	userpg->cap_user_time_short = 1;
+}
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 628775334d5e..1a6a695ca67a 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -8,7 +8,7 @@
 #define __ASM_ARM_KVM_PMU_H
 
 #include <linux/perf_event.h>
-#include <asm/perf_event.h>
+#include <linux/perf/arm_pmuv3.h>
 
 #define ARMV8_PMU_CYCLE_IDX		(ARMV8_PMU_MAX_COUNTERS - 1)
 
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
new file mode 100644
index 000000000000..53173abfc022
--- /dev/null
+++ b/include/linux/perf/arm_pmuv3.h
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __PERF_ARM_PMUV3_H
+#define __PERF_ARM_PMUV3_H
+
+#define ARMV8_PMU_MAX_COUNTERS	32
+#define ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
+
+/*
+ * Common architectural and microarchitectural event numbers.
+ */
+#define ARMV8_PMUV3_PERFCTR_SW_INCR				0x0000
+#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL			0x0001
+#define ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL			0x0002
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL			0x0003
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE				0x0004
+#define ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL			0x0005
+#define ARMV8_PMUV3_PERFCTR_LD_RETIRED				0x0006
+#define ARMV8_PMUV3_PERFCTR_ST_RETIRED				0x0007
+#define ARMV8_PMUV3_PERFCTR_INST_RETIRED			0x0008
+#define ARMV8_PMUV3_PERFCTR_EXC_TAKEN				0x0009
+#define ARMV8_PMUV3_PERFCTR_EXC_RETURN				0x000A
+#define ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED			0x000B
+#define ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED			0x000C
+#define ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED			0x000D
+#define ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED			0x000E
+#define ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED		0x000F
+#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED				0x0010
+#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES				0x0011
+#define ARMV8_PMUV3_PERFCTR_BR_PRED				0x0012
+#define ARMV8_PMUV3_PERFCTR_MEM_ACCESS				0x0013
+#define ARMV8_PMUV3_PERFCTR_L1I_CACHE				0x0014
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB			0x0015
+#define ARMV8_PMUV3_PERFCTR_L2D_CACHE				0x0016
+#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL			0x0017
+#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB			0x0018
+#define ARMV8_PMUV3_PERFCTR_BUS_ACCESS				0x0019
+#define ARMV8_PMUV3_PERFCTR_MEMORY_ERROR			0x001A
+#define ARMV8_PMUV3_PERFCTR_INST_SPEC				0x001B
+#define ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED			0x001C
+#define ARMV8_PMUV3_PERFCTR_BUS_CYCLES				0x001D
+#define ARMV8_PMUV3_PERFCTR_CHAIN				0x001E
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE			0x001F
+#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE			0x0020
+#define ARMV8_PMUV3_PERFCTR_BR_RETIRED				0x0021
+#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED			0x0022
+#define ARMV8_PMUV3_PERFCTR_STALL_FRONTEND			0x0023
+#define ARMV8_PMUV3_PERFCTR_STALL_BACKEND			0x0024
+#define ARMV8_PMUV3_PERFCTR_L1D_TLB				0x0025
+#define ARMV8_PMUV3_PERFCTR_L1I_TLB				0x0026
+#define ARMV8_PMUV3_PERFCTR_L2I_CACHE				0x0027
+#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL			0x0028
+#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE			0x0029
+#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL			0x002A
+#define ARMV8_PMUV3_PERFCTR_L3D_CACHE				0x002B
+#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB			0x002C
+#define ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL			0x002D
+#define ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL			0x002E
+#define ARMV8_PMUV3_PERFCTR_L2D_TLB				0x002F
+#define ARMV8_PMUV3_PERFCTR_L2I_TLB				0x0030
+#define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS			0x0031
+#define ARMV8_PMUV3_PERFCTR_LL_CACHE				0x0032
+#define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS			0x0033
+#define ARMV8_PMUV3_PERFCTR_DTLB_WALK				0x0034
+#define ARMV8_PMUV3_PERFCTR_ITLB_WALK				0x0035
+#define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD				0x0036
+#define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD			0x0037
+#define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD			0x0038
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD			0x0039
+#define ARMV8_PMUV3_PERFCTR_OP_RETIRED				0x003A
+#define ARMV8_PMUV3_PERFCTR_OP_SPEC				0x003B
+#define ARMV8_PMUV3_PERFCTR_STALL				0x003C
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND			0x003D
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND			0x003E
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT				0x003F
+
+/* Statistical profiling extension microarchitectural events */
+#define ARMV8_SPE_PERFCTR_SAMPLE_POP				0x4000
+#define ARMV8_SPE_PERFCTR_SAMPLE_FEED				0x4001
+#define ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE			0x4002
+#define ARMV8_SPE_PERFCTR_SAMPLE_COLLISION			0x4003
+
+/* AMUv1 architecture events */
+#define ARMV8_AMU_PERFCTR_CNT_CYCLES				0x4004
+#define ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM			0x4005
+
+/* long-latency read miss events */
+#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS			0x4006
+#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD			0x4009
+#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS			0x400A
+#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD			0x400B
+
+/* Trace buffer events */
+#define ARMV8_PMUV3_PERFCTR_TRB_WRAP				0x400C
+#define ARMV8_PMUV3_PERFCTR_TRB_TRIG				0x400E
+
+/* Trace unit events */
+#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT0				0x4010
+#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT1				0x4011
+#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT2				0x4012
+#define ARMV8_PMUV3_PERFCTR_TRCEXTOUT3				0x4013
+#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4			0x4018
+#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5			0x4019
+#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6			0x401A
+#define ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7			0x401B
+
+/* additional latency from alignment events */
+#define ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT			0x4020
+#define ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT			0x4021
+#define ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT			0x4022
+
+/* Armv8.5 Memory Tagging Extension events */
+#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED			0x4024
+#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD			0x4025
+#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR			0x4026
+
+/* ARMv8 recommended implementation defined event types */
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD			0x0040
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR			0x0041
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD		0x0042
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR		0x0043
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_INNER		0x0044
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_OUTER		0x0045
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_VICTIM		0x0046
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_CLEAN			0x0047
+#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_INVAL			0x0048
+
+#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD			0x004C
+#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR			0x004D
+#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD				0x004E
+#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR				0x004F
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_RD			0x0050
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WR			0x0051
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_RD		0x0052
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_WR		0x0053
+
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_VICTIM		0x0056
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_CLEAN			0x0057
+#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_INVAL			0x0058
+
+#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_RD			0x005C
+#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_WR			0x005D
+#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_RD				0x005E
+#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_WR				0x005F
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD			0x0060
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR			0x0061
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_SHARED			0x0062
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NOT_SHARED		0x0063
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NORMAL			0x0064
+#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_PERIPH			0x0065
+#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_RD			0x0066
+#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_WR			0x0067
+#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LD_SPEC			0x0068
+#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_ST_SPEC			0x0069
+#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LDST_SPEC		0x006A
+
+#define ARMV8_IMPDEF_PERFCTR_LDREX_SPEC				0x006C
+#define ARMV8_IMPDEF_PERFCTR_STREX_PASS_SPEC			0x006D
+#define ARMV8_IMPDEF_PERFCTR_STREX_FAIL_SPEC			0x006E
+#define ARMV8_IMPDEF_PERFCTR_STREX_SPEC				0x006F
+#define ARMV8_IMPDEF_PERFCTR_LD_SPEC				0x0070
+#define ARMV8_IMPDEF_PERFCTR_ST_SPEC				0x0071
+#define ARMV8_IMPDEF_PERFCTR_LDST_SPEC				0x0072
+#define ARMV8_IMPDEF_PERFCTR_DP_SPEC				0x0073
+#define ARMV8_IMPDEF_PERFCTR_ASE_SPEC				0x0074
+#define ARMV8_IMPDEF_PERFCTR_VFP_SPEC				0x0075
+#define ARMV8_IMPDEF_PERFCTR_PC_WRITE_SPEC			0x0076
+#define ARMV8_IMPDEF_PERFCTR_CRYPTO_SPEC			0x0077
+#define ARMV8_IMPDEF_PERFCTR_BR_IMMED_SPEC			0x0078
+#define ARMV8_IMPDEF_PERFCTR_BR_RETURN_SPEC			0x0079
+#define ARMV8_IMPDEF_PERFCTR_BR_INDIRECT_SPEC			0x007A
+
+#define ARMV8_IMPDEF_PERFCTR_ISB_SPEC				0x007C
+#define ARMV8_IMPDEF_PERFCTR_DSB_SPEC				0x007D
+#define ARMV8_IMPDEF_PERFCTR_DMB_SPEC				0x007E
+
+#define ARMV8_IMPDEF_PERFCTR_EXC_UNDEF				0x0081
+#define ARMV8_IMPDEF_PERFCTR_EXC_SVC				0x0082
+#define ARMV8_IMPDEF_PERFCTR_EXC_PABORT				0x0083
+#define ARMV8_IMPDEF_PERFCTR_EXC_DABORT				0x0084
+
+#define ARMV8_IMPDEF_PERFCTR_EXC_IRQ				0x0086
+#define ARMV8_IMPDEF_PERFCTR_EXC_FIQ				0x0087
+#define ARMV8_IMPDEF_PERFCTR_EXC_SMC				0x0088
+
+#define ARMV8_IMPDEF_PERFCTR_EXC_HVC				0x008A
+#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_PABORT			0x008B
+#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_DABORT			0x008C
+#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_OTHER			0x008D
+#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_IRQ			0x008E
+#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_FIQ			0x008F
+#define ARMV8_IMPDEF_PERFCTR_RC_LD_SPEC				0x0090
+#define ARMV8_IMPDEF_PERFCTR_RC_ST_SPEC				0x0091
+
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_RD			0x00A0
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WR			0x00A1
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_RD		0x00A2
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_WR		0x00A3
+
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_VICTIM		0x00A6
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_CLEAN			0x00A7
+#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_INVAL			0x00A8
+
+/*
+ * Per-CPU PMCR: config reg
+ */
+#define ARMV8_PMU_PMCR_E	(1 << 0) /* Enable all counters */
+#define ARMV8_PMU_PMCR_P	(1 << 1) /* Reset all counters */
+#define ARMV8_PMU_PMCR_C	(1 << 2) /* Cycle counter reset */
+#define ARMV8_PMU_PMCR_D	(1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV8_PMU_PMCR_X	(1 << 4) /* Export to ETM */
+#define ARMV8_PMU_PMCR_DP	(1 << 5) /* Disable CCNT if non-invasive debug*/
+#define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
+#define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
+#define ARMV8_PMU_PMCR_N_SHIFT	11  /* Number of counters supported */
+#define ARMV8_PMU_PMCR_N_MASK	0x1f
+#define ARMV8_PMU_PMCR_MASK	0xff    /* Mask for writable bits */
+
+/*
+ * PMOVSR: counters overflow flag status reg
+ */
+#define ARMV8_PMU_OVSR_MASK		0xffffffff	/* Mask for writable bits */
+#define ARMV8_PMU_OVERFLOWED_MASK	ARMV8_PMU_OVSR_MASK
+
+/*
+ * PMXEVTYPER: Event selection reg
+ */
+#define ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
+#define ARMV8_PMU_EVTYPE_EVENT	0xffff		/* Mask for EVENT bits */
+
+/*
+ * Event filters for PMUv3
+ */
+#define ARMV8_PMU_EXCLUDE_EL1	(1U << 31)
+#define ARMV8_PMU_EXCLUDE_EL0	(1U << 30)
+#define ARMV8_PMU_INCLUDE_EL2	(1U << 27)
+
+/*
+ * PMUSERENR: user enable reg
+ */
+#define ARMV8_PMU_USERENR_MASK	0xf		/* Mask for writable bits */
+#define ARMV8_PMU_USERENR_EN	(1 << 0) /* PMU regs can be accessed at EL0 */
+#define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
+#define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
+#define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
+
+/* PMMIR_EL1.SLOTS mask */
+#define ARMV8_PMU_SLOTS_MASK	0xff
+
+#define ARMV8_PMU_BUS_SLOTS_SHIFT 8
+#define ARMV8_PMU_BUS_SLOTS_MASK 0xff
+#define ARMV8_PMU_BUS_WIDTH_SHIFT 16
+#define ARMV8_PMU_BUS_WIDTH_MASK 0xf
+
+#endif
-- 
cgit v1.2.3


From df29ddf4f04b00cf60669686dc7f534c3ed7c433 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 17 Mar 2023 15:50:21 -0400
Subject: arm64: perf: Abstract system register accesses away

As we want to enable 32bit support, we need to distanciate the
PMUv3 driver from the AArch64 system register names.

This patch moves all system register accesses to an architecture
specific include file, allowing the 32bit counterpart to be
slotted in at a later time.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Co-developed-by: Zaid Al-Bassam <zalbassam@google.com>
Signed-off-by: Zaid Al-Bassam <zalbassam@google.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20230317195027.3746949-3-zalbassam@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/arm_pmuv3.h | 137 +++++++++++++++++++++++++++++++++++++
 drivers/perf/arm_pmuv3.c           | 115 +++++++------------------------
 include/linux/perf/arm_pmuv3.h     |  45 ++++++++++++
 3 files changed, 205 insertions(+), 92 deletions(-)
 create mode 100644 arch/arm64/include/asm/arm_pmuv3.h

(limited to 'include')

diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h
new file mode 100644
index 000000000000..c444cbfb3acd
--- /dev/null
+++ b/arch/arm64/include/asm/arm_pmuv3.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __ASM_PMUV3_H
+#define __ASM_PMUV3_H
+
+#include <asm/cpufeature.h>
+#include <asm/sysreg.h>
+
+#define RETURN_READ_PMEVCNTRN(n) \
+	return read_sysreg(pmevcntr##n##_el0)
+static unsigned long read_pmevcntrn(int n)
+{
+	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
+	return 0;
+}
+
+#define WRITE_PMEVCNTRN(n) \
+	write_sysreg(val, pmevcntr##n##_el0)
+static void write_pmevcntrn(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
+}
+
+#define WRITE_PMEVTYPERN(n) \
+	write_sysreg(val, pmevtyper##n##_el0)
+static void write_pmevtypern(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
+}
+
+static inline unsigned long read_pmmir(void)
+{
+	return read_cpuid(PMMIR_EL1);
+}
+
+static inline u32 read_pmuver(void)
+{
+	u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
+
+	return cpuid_feature_extract_unsigned_field(dfr0,
+			ID_AA64DFR0_EL1_PMUVer_SHIFT);
+}
+
+static inline void write_pmcr(u32 val)
+{
+	write_sysreg(val, pmcr_el0);
+}
+
+static inline u32 read_pmcr(void)
+{
+	return read_sysreg(pmcr_el0);
+}
+
+static inline void write_pmselr(u32 val)
+{
+	write_sysreg(val, pmselr_el0);
+}
+
+static inline void write_pmccntr(u64 val)
+{
+	write_sysreg(val, pmccntr_el0);
+}
+
+static inline u64 read_pmccntr(void)
+{
+	return read_sysreg(pmccntr_el0);
+}
+
+static inline void write_pmxevcntr(u32 val)
+{
+	write_sysreg(val, pmxevcntr_el0);
+}
+
+static inline u32 read_pmxevcntr(void)
+{
+	return read_sysreg(pmxevcntr_el0);
+}
+
+static inline void write_pmxevtyper(u32 val)
+{
+	write_sysreg(val, pmxevtyper_el0);
+}
+
+static inline void write_pmcntenset(u32 val)
+{
+	write_sysreg(val, pmcntenset_el0);
+}
+
+static inline void write_pmcntenclr(u32 val)
+{
+	write_sysreg(val, pmcntenclr_el0);
+}
+
+static inline void write_pmintenset(u32 val)
+{
+	write_sysreg(val, pmintenset_el1);
+}
+
+static inline void write_pmintenclr(u32 val)
+{
+	write_sysreg(val, pmintenclr_el1);
+}
+
+static inline void write_pmccfiltr(u32 val)
+{
+	write_sysreg(val, pmccfiltr_el0);
+}
+
+static inline void write_pmovsclr(u32 val)
+{
+	write_sysreg(val, pmovsclr_el0);
+}
+
+static inline u32 read_pmovsclr(void)
+{
+	return read_sysreg(pmovsclr_el0);
+}
+
+static inline void write_pmuserenr(u32 val)
+{
+	write_sysreg(val, pmuserenr_el0);
+}
+
+static inline u32 read_pmceid0(void)
+{
+	return read_sysreg(pmceid0_el0);
+}
+
+static inline u32 read_pmceid1(void)
+{
+	return read_sysreg(pmceid1_el0);
+}
+
+#endif
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index c25203f8b940..f783f068d612 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -10,7 +10,6 @@
 
 #include <asm/irq_regs.h>
 #include <asm/perf_event.h>
-#include <asm/sysreg.h>
 #include <asm/virt.h>
 
 #include <clocksource/arm_arch_timer.h>
@@ -25,6 +24,8 @@
 #include <linux/sched_clock.h>
 #include <linux/smp.h>
 
+#include <asm/arm_pmuv3.h>
+
 /* ARMv8 Cortex-A53 specific event types. */
 #define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
 
@@ -425,83 +426,16 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
 #define	ARMV8_IDX_TO_COUNTER(x)	\
 	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
 
-/*
- * This code is really good
- */
-
-#define PMEVN_CASE(n, case_macro) \
-	case n: case_macro(n); break
-
-#define PMEVN_SWITCH(x, case_macro)				\
-	do {							\
-		switch (x) {					\
-		PMEVN_CASE(0,  case_macro);			\
-		PMEVN_CASE(1,  case_macro);			\
-		PMEVN_CASE(2,  case_macro);			\
-		PMEVN_CASE(3,  case_macro);			\
-		PMEVN_CASE(4,  case_macro);			\
-		PMEVN_CASE(5,  case_macro);			\
-		PMEVN_CASE(6,  case_macro);			\
-		PMEVN_CASE(7,  case_macro);			\
-		PMEVN_CASE(8,  case_macro);			\
-		PMEVN_CASE(9,  case_macro);			\
-		PMEVN_CASE(10, case_macro);			\
-		PMEVN_CASE(11, case_macro);			\
-		PMEVN_CASE(12, case_macro);			\
-		PMEVN_CASE(13, case_macro);			\
-		PMEVN_CASE(14, case_macro);			\
-		PMEVN_CASE(15, case_macro);			\
-		PMEVN_CASE(16, case_macro);			\
-		PMEVN_CASE(17, case_macro);			\
-		PMEVN_CASE(18, case_macro);			\
-		PMEVN_CASE(19, case_macro);			\
-		PMEVN_CASE(20, case_macro);			\
-		PMEVN_CASE(21, case_macro);			\
-		PMEVN_CASE(22, case_macro);			\
-		PMEVN_CASE(23, case_macro);			\
-		PMEVN_CASE(24, case_macro);			\
-		PMEVN_CASE(25, case_macro);			\
-		PMEVN_CASE(26, case_macro);			\
-		PMEVN_CASE(27, case_macro);			\
-		PMEVN_CASE(28, case_macro);			\
-		PMEVN_CASE(29, case_macro);			\
-		PMEVN_CASE(30, case_macro);			\
-		default: WARN(1, "Invalid PMEV* index\n");	\
-		}						\
-	} while (0)
-
-#define RETURN_READ_PMEVCNTRN(n) \
-	return read_sysreg(pmevcntr##n##_el0)
-static unsigned long read_pmevcntrn(int n)
-{
-	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
-	return 0;
-}
-
-#define WRITE_PMEVCNTRN(n) \
-	write_sysreg(val, pmevcntr##n##_el0)
-static void write_pmevcntrn(int n, unsigned long val)
-{
-	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
-}
-
-#define WRITE_PMEVTYPERN(n) \
-	write_sysreg(val, pmevtyper##n##_el0)
-static void write_pmevtypern(int n, unsigned long val)
-{
-	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
-}
-
 static inline u32 armv8pmu_pmcr_read(void)
 {
-	return read_sysreg(pmcr_el0);
+	return read_pmcr();
 }
 
 static inline void armv8pmu_pmcr_write(u32 val)
 {
 	val &= ARMV8_PMU_PMCR_MASK;
 	isb();
-	write_sysreg(val, pmcr_el0);
+	write_pmcr(val);
 }
 
 static inline int armv8pmu_has_overflowed(u32 pmovsr)
@@ -576,7 +510,7 @@ static u64 armv8pmu_read_counter(struct perf_event *event)
 	u64 value;
 
 	if (idx == ARMV8_IDX_CYCLE_COUNTER)
-		value = read_sysreg(pmccntr_el0);
+		value = read_pmccntr();
 	else
 		value = armv8pmu_read_hw_counter(event);
 
@@ -611,7 +545,7 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
 	value = armv8pmu_bias_long_counter(event, value);
 
 	if (idx == ARMV8_IDX_CYCLE_COUNTER)
-		write_sysreg(value, pmccntr_el0);
+		write_pmccntr(value);
 	else
 		armv8pmu_write_hw_counter(event, value);
 }
@@ -642,7 +576,7 @@ static inline void armv8pmu_write_event_type(struct perf_event *event)
 		armv8pmu_write_evtype(idx, chain_evt);
 	} else {
 		if (idx == ARMV8_IDX_CYCLE_COUNTER)
-			write_sysreg(hwc->config_base, pmccfiltr_el0);
+			write_pmccfiltr(hwc->config_base);
 		else
 			armv8pmu_write_evtype(idx, hwc->config_base);
 	}
@@ -665,7 +599,7 @@ static inline void armv8pmu_enable_counter(u32 mask)
 	 * enable the counter.
 	 * */
 	isb();
-	write_sysreg(mask, pmcntenset_el0);
+	write_pmcntenset(mask);
 }
 
 static inline void armv8pmu_enable_event_counter(struct perf_event *event)
@@ -682,7 +616,7 @@ static inline void armv8pmu_enable_event_counter(struct perf_event *event)
 
 static inline void armv8pmu_disable_counter(u32 mask)
 {
-	write_sysreg(mask, pmcntenclr_el0);
+	write_pmcntenclr(mask);
 	/*
 	 * Make sure the effects of disabling the counter are visible before we
 	 * start configuring the event.
@@ -704,7 +638,7 @@ static inline void armv8pmu_disable_event_counter(struct perf_event *event)
 
 static inline void armv8pmu_enable_intens(u32 mask)
 {
-	write_sysreg(mask, pmintenset_el1);
+	write_pmintenset(mask);
 }
 
 static inline void armv8pmu_enable_event_irq(struct perf_event *event)
@@ -715,10 +649,10 @@ static inline void armv8pmu_enable_event_irq(struct perf_event *event)
 
 static inline void armv8pmu_disable_intens(u32 mask)
 {
-	write_sysreg(mask, pmintenclr_el1);
+	write_pmintenclr(mask);
 	isb();
 	/* Clear the overflow flag in case an interrupt is pending. */
-	write_sysreg(mask, pmovsclr_el0);
+	write_pmovsclr(mask);
 	isb();
 }
 
@@ -733,18 +667,18 @@ static inline u32 armv8pmu_getreset_flags(void)
 	u32 value;
 
 	/* Read */
-	value = read_sysreg(pmovsclr_el0);
+	value = read_pmovsclr();
 
 	/* Write to clear flags */
 	value &= ARMV8_PMU_OVSR_MASK;
-	write_sysreg(value, pmovsclr_el0);
+	write_pmovsclr(value);
 
 	return value;
 }
 
 static void armv8pmu_disable_user_access(void)
 {
-	write_sysreg(0, pmuserenr_el0);
+	write_pmuserenr(0);
 }
 
 static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
@@ -755,13 +689,13 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
 	/* Clear any unused counters to avoid leaking their contents */
 	for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
 		if (i == ARMV8_IDX_CYCLE_COUNTER)
-			write_sysreg(0, pmccntr_el0);
+			write_pmccntr(0);
 		else
 			armv8pmu_write_evcntr(i, 0);
 	}
 
-	write_sysreg(0, pmuserenr_el0);
-	write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
+	write_pmuserenr(0);
+	write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR);
 }
 
 static void armv8pmu_enable_event(struct perf_event *event)
@@ -1145,14 +1079,11 @@ static void __armv8pmu_probe_pmu(void *info)
 {
 	struct armv8pmu_probe_info *probe = info;
 	struct arm_pmu *cpu_pmu = probe->pmu;
-	u64 dfr0;
 	u64 pmceid_raw[2];
 	u32 pmceid[2];
 	int pmuver;
 
-	dfr0 = read_sysreg(id_aa64dfr0_el1);
-	pmuver = cpuid_feature_extract_unsigned_field(dfr0,
-			ID_AA64DFR0_EL1_PMUVer_SHIFT);
+	pmuver = read_pmuver();
 	if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF ||
 	    pmuver == ID_AA64DFR0_EL1_PMUVer_NI)
 		return;
@@ -1167,8 +1098,8 @@ static void __armv8pmu_probe_pmu(void *info)
 	/* Add the CPU cycles counter */
 	cpu_pmu->num_events += 1;
 
-	pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0);
-	pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0);
+	pmceid[0] = pmceid_raw[0] = read_pmceid0();
+	pmceid[1] = pmceid_raw[1] = read_pmceid1();
 
 	bitmap_from_arr32(cpu_pmu->pmceid_bitmap,
 			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
@@ -1179,9 +1110,9 @@ static void __armv8pmu_probe_pmu(void *info)
 	bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap,
 			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
 
-	/* store PMMIR_EL1 register for sysfs */
+	/* store PMMIR register for sysfs */
 	if (pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4 && (pmceid_raw[1] & BIT(31)))
-		cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1);
+		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
 }
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 53173abfc022..e3899bd77f5c 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -255,4 +255,49 @@
 #define ARMV8_PMU_BUS_WIDTH_SHIFT 16
 #define ARMV8_PMU_BUS_WIDTH_MASK 0xf
 
+/*
+ * This code is really good
+ */
+
+#define PMEVN_CASE(n, case_macro) \
+	case n: case_macro(n); break
+
+#define PMEVN_SWITCH(x, case_macro)				\
+	do {							\
+		switch (x) {					\
+		PMEVN_CASE(0,  case_macro);			\
+		PMEVN_CASE(1,  case_macro);			\
+		PMEVN_CASE(2,  case_macro);			\
+		PMEVN_CASE(3,  case_macro);			\
+		PMEVN_CASE(4,  case_macro);			\
+		PMEVN_CASE(5,  case_macro);			\
+		PMEVN_CASE(6,  case_macro);			\
+		PMEVN_CASE(7,  case_macro);			\
+		PMEVN_CASE(8,  case_macro);			\
+		PMEVN_CASE(9,  case_macro);			\
+		PMEVN_CASE(10, case_macro);			\
+		PMEVN_CASE(11, case_macro);			\
+		PMEVN_CASE(12, case_macro);			\
+		PMEVN_CASE(13, case_macro);			\
+		PMEVN_CASE(14, case_macro);			\
+		PMEVN_CASE(15, case_macro);			\
+		PMEVN_CASE(16, case_macro);			\
+		PMEVN_CASE(17, case_macro);			\
+		PMEVN_CASE(18, case_macro);			\
+		PMEVN_CASE(19, case_macro);			\
+		PMEVN_CASE(20, case_macro);			\
+		PMEVN_CASE(21, case_macro);			\
+		PMEVN_CASE(22, case_macro);			\
+		PMEVN_CASE(23, case_macro);			\
+		PMEVN_CASE(24, case_macro);			\
+		PMEVN_CASE(25, case_macro);			\
+		PMEVN_CASE(26, case_macro);			\
+		PMEVN_CASE(27, case_macro);			\
+		PMEVN_CASE(28, case_macro);			\
+		PMEVN_CASE(29, case_macro);			\
+		PMEVN_CASE(30, case_macro);			\
+		default: WARN(1, "Invalid PMEV* index\n");	\
+		}						\
+	} while (0)
+
 #endif
-- 
cgit v1.2.3


From d583fbd7066a2dea43050521a95d9770f7d7593e Mon Sep 17 00:00:00 2001
From: Dmytro Maluka <dmy@semihalf.com>
Date: Wed, 22 Mar 2023 21:43:43 +0100
Subject: KVM: irqfd: Make resampler_list an RCU list

It is useful to be able to do read-only traversal of the list of all the
registered irqfd resamplers without locking the resampler_lock mutex.
In particular, we are going to traverse it to search for a resampler
registered for the given irq of an irqchip, and that will be done with
an irqchip spinlock (ioapic->lock) held, so it is undesirable to lock a
mutex in this context. So turn this list into an RCU list.

For protecting the read side, reuse kvm->irq_srcu which is already used
for protecting a number of irq related things (kvm->irq_routing,
irqfd->resampler->list, kvm->irq_ack_notifier_list,
kvm->arch.mask_notifier_list).

Signed-off-by: Dmytro Maluka <dmy@semihalf.com>
Message-Id: <20230322204344.50138-2-dmy@semihalf.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  | 1 +
 include/linux/kvm_irqfd.h | 2 +-
 virt/kvm/eventfd.c        | 8 ++++++--
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8ada23756b0e..9f508c8e66e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -755,6 +755,7 @@ struct kvm {
 	struct {
 		spinlock_t        lock;
 		struct list_head  items;
+		/* resampler_list update side is protected by resampler_lock. */
 		struct list_head  resampler_list;
 		struct mutex      resampler_lock;
 	} irqfds;
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index dac047abdba7..8ad43692e3bb 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -31,7 +31,7 @@ struct kvm_kernel_irqfd_resampler {
 	/*
 	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing
 	 * resamplers among irqfds on the same gsi.
-	 * Accessed and modified under kvm->irqfds.resampler_lock
+	 * RCU list modified under kvm->irqfds.resampler_lock
 	 */
 	struct list_head link;
 };
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2a3ed401ce46..61aea70dd888 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -96,8 +96,12 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
 	synchronize_srcu(&kvm->irq_srcu);
 
 	if (list_empty(&resampler->list)) {
-		list_del(&resampler->link);
+		list_del_rcu(&resampler->link);
 		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+		/*
+		 * synchronize_srcu(&kvm->irq_srcu) already called
+		 * in kvm_unregister_irq_ack_notifier().
+		 */
 		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
 			    resampler->notifier.gsi, 0, false);
 		kfree(resampler);
@@ -369,7 +373,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 			resampler->notifier.irq_acked = irqfd_resampler_ack;
 			INIT_LIST_HEAD(&resampler->link);
 
-			list_add(&resampler->link, &kvm->irqfds.resampler_list);
+			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
 			kvm_register_irq_ack_notifier(kvm,
 						      &resampler->notifier);
 			irqfd->resampler = resampler;
-- 
cgit v1.2.3


From fef8f2b90edbd7089a4278021314f11f056b0cbb Mon Sep 17 00:00:00 2001
From: Dmytro Maluka <dmy@semihalf.com>
Date: Wed, 22 Mar 2023 21:43:44 +0100
Subject: KVM: x86/ioapic: Resample the pending state of an IRQ when unmasking

KVM irqfd based emulation of level-triggered interrupts doesn't work
quite correctly in some cases, particularly in the case of interrupts
that are handled in a Linux guest as oneshot interrupts (IRQF_ONESHOT).
Such an interrupt is acked to the device in its threaded irq handler,
i.e. later than it is acked to the interrupt controller (EOI at the end
of hardirq), not earlier.

Linux keeps such interrupt masked until its threaded handler finishes,
to prevent the EOI from re-asserting an unacknowledged interrupt.
However, with KVM + vfio (or whatever is listening on the resamplefd)
we always notify resamplefd at the EOI, so vfio prematurely unmasks the
host physical IRQ, thus a new physical interrupt is fired in the host.
This extra interrupt in the host is not a problem per se. The problem is
that it is unconditionally queued for injection into the guest, so the
guest sees an extra bogus interrupt. [*]

There are observed at least 2 user-visible issues caused by those
extra erroneous interrupts for a oneshot irq in the guest:

1. System suspend aborted due to a pending wakeup interrupt from
   ChromeOS EC (drivers/platform/chrome/cros_ec.c).
2. Annoying "invalid report id data" errors from ELAN0000 touchpad
   (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
   every time the touchpad is touched.

The core issue here is that by the time when the guest unmasks the IRQ,
the physical IRQ line is no longer asserted (since the guest has
acked the interrupt to the device in the meantime), yet we
unconditionally inject the interrupt queued into the guest by the
previous resampling. So to fix the issue, we need a way to detect that
the IRQ is no longer pending, and cancel the queued interrupt in this
case.

With IOAPIC we are not able to probe the physical IRQ line state
directly (at least not if the underlying physical interrupt controller
is an IOAPIC too), so in this patch we use irqfd resampler for that.
Namely, instead of injecting the queued interrupt, we just notify the
resampler that this interrupt is done. If the IRQ line is actually
already deasserted, we are done. If it is still asserted, a new
interrupt will be shortly triggered through irqfd and injected into the
guest.

In the case if there is no irqfd resampler registered for this IRQ, we
cannot fix the issue, so we keep the existing behavior: immediately
unconditionally inject the queued interrupt.

This patch fixes the issue for x86 IOAPIC only. In the long run, we can
fix it for other irqchips and other architectures too, possibly taking
advantage of reading the physical state of the IRQ line, which is
possible with some other irqchips (e.g. with arm64 GIC, maybe even with
the legacy x86 PIC).

[*] In this description we assume that the interrupt is a physical host
    interrupt forwarded to the guest e.g. by vfio. Potentially the same
    issue may occur also with a purely virtual interrupt from an
    emulated device, e.g. if the guest handles this interrupt, again, as
    a oneshot interrupt.

Signed-off-by: Dmytro Maluka <dmy@semihalf.com>
Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-d2fde2700083@semihalf.com/
Link: https://lore.kernel.org/lkml/87o7wrug0w.wl-maz@kernel.org/
Message-Id: <20230322204344.50138-3-dmy@semihalf.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/ioapic.c    | 36 +++++++++++++++++++++++++++++++++---
 include/linux/kvm_host.h | 10 ++++++++++
 virt/kvm/eventfd.c       | 41 +++++++++++++++++++++++++++++++++++------
 3 files changed, 78 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 042dee556125..995eb5054360 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		mask_after = e->fields.mask;
 		if (mask_before != mask_after)
 			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
-		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
-		    && ioapic->irr & (1 << index))
-			ioapic_service(ioapic, index, false);
+		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+		    ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+			/*
+			 * Pending status in irr may be outdated: the IRQ line may have
+			 * already been deasserted by a device while the IRQ was masked.
+			 * This occurs, for instance, if the interrupt is handled in a
+			 * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+			 * case the guest acknowledges the interrupt to the device in
+			 * its threaded irq handler, i.e. after the EOI but before
+			 * unmasking, so at the time of unmasking the IRQ line is
+			 * already down but our pending irr bit is still set. In such
+			 * cases, injecting this pending interrupt to the guest is
+			 * buggy: the guest will receive an extra unwanted interrupt.
+			 *
+			 * So we need to check here if the IRQ is actually still pending.
+			 * As we are generally not able to probe the IRQ line status
+			 * directly, we do it through irqfd resampler. Namely, we clear
+			 * the pending status and notify the resampler that this interrupt
+			 * is done, without actually injecting it into the guest. If the
+			 * IRQ line is actually already deasserted, we are done. If it is
+			 * still asserted, a new interrupt will be shortly triggered
+			 * through irqfd and injected into the guest.
+			 *
+			 * If, however, it's not possible to resample (no irqfd resampler
+			 * registered for this irq), then unconditionally inject this
+			 * pending interrupt into the guest, so the guest will not miss
+			 * an interrupt, although may get an extra unwanted interrupt.
+			 */
+			if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+				ioapic->irr &= ~(1 << index);
+			else
+				ioapic_service(ioapic, index, false);
+		}
 		if (e->fields.delivery_mode == APIC_DM_FIXED) {
 			struct kvm_lapic_irq irq;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9f508c8e66e1..a9adf75344be 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1987,6 +1987,9 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 #ifdef CONFIG_HAVE_KVM_IRQFD
 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
+bool kvm_notify_irqfd_resampler(struct kvm *kvm,
+				unsigned int irqchip,
+				unsigned int pin);
 void kvm_irq_routing_update(struct kvm *);
 #else
 static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
@@ -1995,6 +1998,13 @@ static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 }
 
 static inline void kvm_irqfd_release(struct kvm *kvm) {}
+
+static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
+					      unsigned int irqchip,
+					      unsigned int pin)
+{
+	return false;
+}
 #endif
 
 #else
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 61aea70dd888..b0af834ffa95 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -55,6 +55,15 @@ irqfd_inject(struct work_struct *work)
 			    irqfd->gsi, 1, false);
 }
 
+static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
+{
+	struct kvm_kernel_irqfd *irqfd;
+
+	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
+				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
+		eventfd_signal(irqfd->resamplefd, 1);
+}
+
 /*
  * Since resampler irqfds share an IRQ source ID, we de-assert once
  * then notify all of the resampler irqfds using this GSI.  We can't
@@ -65,7 +74,6 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kernel_irqfd_resampler *resampler;
 	struct kvm *kvm;
-	struct kvm_kernel_irqfd *irqfd;
 	int idx;
 
 	resampler = container_of(kian,
@@ -76,11 +84,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 		    resampler->notifier.gsi, 0, false);
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
-
-	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
-	    srcu_read_lock_held(&kvm->irq_srcu))
-		eventfd_signal(irqfd->resamplefd, 1);
-
+	irqfd_resampler_notify(resampler);
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
@@ -648,6 +652,31 @@ void kvm_irq_routing_update(struct kvm *kvm)
 	spin_unlock_irq(&kvm->irqfds.lock);
 }
 
+bool kvm_notify_irqfd_resampler(struct kvm *kvm,
+				unsigned int irqchip,
+				unsigned int pin)
+{
+	struct kvm_kernel_irqfd_resampler *resampler;
+	int gsi, idx;
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+	if (gsi != -1) {
+		list_for_each_entry_srcu(resampler,
+					 &kvm->irqfds.resampler_list, link,
+					 srcu_read_lock_held(&kvm->irq_srcu)) {
+			if (resampler->notifier.gsi == gsi) {
+				irqfd_resampler_notify(resampler);
+				srcu_read_unlock(&kvm->irq_srcu, idx);
+				return true;
+			}
+		}
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+
+	return false;
+}
+
 /*
  * create a host-wide workqueue for issuing deferred shutdown requests
  * aggregated from all vm* instances. We need our own isolated
-- 
cgit v1.2.3


From 579d472b379983498a6a7d9f7b1dd74c20afdc50 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 27 Mar 2023 16:01:50 +0300
Subject: device property: Remove unused struct net_device forward declaration

There is no users in the property.h for the struct net_device.
Remove the latter for good.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230327130150.84114-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/property.h b/include/linux/property.h
index 4a536548606b..59f452198c64 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -16,7 +16,6 @@
 #include <linux/types.h>
 
 struct device;
-struct net_device;
 
 enum dev_prop_type {
 	DEV_PROP_U8,
-- 
cgit v1.2.3


From 5c9a27df4eb9a402770d5547af255a765e1c10ac Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 27 Mar 2023 18:03:19 +0200
Subject: driver core: move sysfs_dev_char_kobj out of class.h

The structure sysfs_dev_char_kobj is local only to the driver core code,
so move it out of the global class.h file and into the internal base.h
file as no one else should be touching this symbol.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230327160319.513974-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h          | 3 +++
 include/linux/device/class.h | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index d3e081dc4b13..2867ca4ee4ce 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -189,6 +189,9 @@ const char *device_get_devnode(const struct device *dev, umode_t *mode,
 extern struct kset *devices_kset;
 void devices_kset_move_last(struct device *dev);
 
+/* /sys/dev/char directory */
+extern struct kobject *sysfs_dev_char_kobj;
+
 #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
 void module_add_driver(struct module *mod, struct device_driver *drv);
 void module_remove_driver(struct device_driver *drv);
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 1dc7706cb42d..d3960733c0fa 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -81,7 +81,6 @@ struct class_dev_iter {
 };
 
 extern struct kobject *sysfs_dev_block_kobj;
-extern struct kobject *sysfs_dev_char_kobj;
 
 int __must_check class_register(struct class *class);
 void class_unregister(struct class *class);
-- 
cgit v1.2.3


From 2f1f043e7bea3fbf4c1869df2f7a0312bc8ca2bf Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 12 Jan 2023 22:59:53 -0800
Subject: locking/lockdep: Introduce lock_sync()

Currently, functions like synchronize_srcu() do not have lockdep
annotations resembling those of other write-side locking primitives.
Such annotations might look as follows:

	lock_acquire();
	lock_release();

Such annotations would tell lockdep that synchronize_srcu() acts like
an empty critical section that waits for other (read-side) critical
sections to finish.  This would definitely catch some deadlock, but
as pointed out by Paul Mckenney [1], this could also introduce false
positives because of irq-safe/unsafe detection.  Of course, there are
tricks could help with this:

	might_sleep(); // Existing statement in __synchronize_srcu().
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		local_irq_disable();
		lock_acquire();
		lock_release();
		local_irq_enable();
	}

But it would be better for lockdep to provide a separate annonation for
functions like synchronize_srcu(), so that people won't need to repeat
the ugly tricks above.

Therefore introduce lock_sync(), which is simply an lock+unlock
pair with no irq safe/unsafe deadlock check.  This works because the
to-be-annontated functions do not create real critical sections, and
there is therefore no way that irq can create extra dependencies.

[1]: https://lore.kernel.org/lkml/20180412021233.ewncg5jjuzjw3x62@tardis/

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
[ boqun: Fix typos reported by Davidlohr Bueso and Paul E. Mckenney ]
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/lockdep.h  |  5 +++++
 kernel/locking/lockdep.c | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1023f349af71..14d9dbedc6c1 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -268,6 +268,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 extern void lock_release(struct lockdep_map *lock, unsigned long ip);
 
+extern void lock_sync(struct lockdep_map *lock, unsigned int subclass,
+		      int read, int check, struct lockdep_map *nest_lock,
+		      unsigned long ip);
+
 /* lock_is_held_type() returns */
 #define LOCK_STATE_UNKNOWN	-1
 #define LOCK_STATE_NOT_HELD	0
@@ -554,6 +558,7 @@ do {									\
 #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
 #define lock_map_release(l)			lock_release(l, _THIS_IP_)
+#define lock_map_sync(l)			lock_sync(l, 0, 0, 1, NULL, _THIS_IP_)
 
 #ifdef CONFIG_PROVE_LOCKING
 # define might_lock(lock)						\
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..3ee3b278789d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5693,6 +5693,40 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ *
+ * This annotation acts as an acquire+release annotation pair with hardirqoff
+ * being 1. Since there's no critical section, no interrupt can create extra
+ * dependencies "inside" the annotation, hardirqoff == 1 allows us to avoid
+ * false positives.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+	       int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(!lockdep_enabled()))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	lockdep_recursion_inc();
+	__lock_acquire(lock, subclass, 0, read, check, 1, nest_lock, ip, 0, 0);
+
+	if (__lock_release(lock, ip))
+		check_chain_key(current);
+	lockdep_recursion_finish();
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
 noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From f0f44752f5f61ee4e3bd88ae033fdb888320aafe Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 12 Jan 2023 22:59:54 -0800
Subject: rcu: Annotate SRCU's update-side lockdep dependencies

Although all flavors of RCU readers are annotated correctly with
lockdep as recursive read locks, they do not set the lock_acquire
'check' parameter.  This means that RCU read locks are not added to
the lockdep dependency graph, which in turn means that lockdep cannot
detect RCU-based deadlocks.  This is not a problem for RCU flavors having
atomic read-side critical sections because context-based annotations can
catch these deadlocks, see for example the RCU_LOCKDEP_WARN() statement
in synchronize_rcu().  But context-based annotations are not helpful
for sleepable RCU, especially given that it is perfectly legal to do
synchronize_srcu(&srcu1) within an srcu_read_lock(&srcu2).

However, we can detect SRCU-based by: (1) Making srcu_read_lock() a
'check'ed recursive read lock and (2) Making synchronize_srcu() a empty
write lock critical section.  Even better, with the newly introduced
lock_sync(), we can avoid false positives about irq-unsafe/safe.
This commit therefore makes it so.

Note that NMI-safe SRCU read side critical sections are currently not
annotated, but might be annotated in the future.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
[ boqun: Add comments for annotation per Waiman's suggestion ]
[ boqun: Fix comment warning reported by Stephen Rothwell ]
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/srcu.h  | 34 ++++++++++++++++++++++++++++++++--
 kernel/rcu/srcutiny.c |  2 ++
 kernel/rcu/srcutree.c |  2 ++
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 74796cd7e7a9..41c4b26fb1c1 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -102,6 +102,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 	return lock_is_held(&ssp->dep_map);
 }
 
+/*
+ * Annotations provide deadlock detection for SRCU.
+ *
+ * Similar to other lockdep annotations, except there is an additional
+ * srcu_lock_sync(), which is basically an empty *write*-side critical section,
+ * see lock_sync() for more information.
+ */
+
+/* Annotates a srcu_read_lock() */
+static inline void srcu_lock_acquire(struct lockdep_map *map)
+{
+	lock_map_acquire_read(map);
+}
+
+/* Annotates a srcu_read_lock() */
+static inline void srcu_lock_release(struct lockdep_map *map)
+{
+	lock_map_release(map);
+}
+
+/* Annotates a synchronize_srcu() */
+static inline void srcu_lock_sync(struct lockdep_map *map)
+{
+	lock_map_sync(map);
+}
+
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
@@ -109,6 +135,10 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 	return 1;
 }
 
+#define srcu_lock_acquire(m) do { } while (0)
+#define srcu_lock_release(m) do { } while (0)
+#define srcu_lock_sync(m) do { } while (0)
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #define SRCU_NMI_UNKNOWN	0x0
@@ -182,7 +212,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 
 	srcu_check_nmi_safety(ssp, false);
 	retval = __srcu_read_lock(ssp);
-	rcu_lock_acquire(&(ssp)->dep_map);
+	srcu_lock_acquire(&(ssp)->dep_map);
 	return retval;
 }
 
@@ -254,7 +284,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
 	WARN_ON_ONCE(idx & ~0x1);
 	srcu_check_nmi_safety(ssp, false);
-	rcu_lock_release(&(ssp)->dep_map);
+	srcu_lock_release(&(ssp)->dep_map);
 	__srcu_read_unlock(ssp, idx);
 }
 
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b12fb0cec44d..336af24e0fe3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp)
 {
 	struct rcu_synchronize rs;
 
+	srcu_lock_sync(&ssp->dep_map);
+
 	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
 			lock_is_held(&rcu_bh_lock_map) ||
 			lock_is_held(&rcu_lock_map) ||
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ab4ee58af84b..c541b82646b6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1307,6 +1307,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 {
 	struct rcu_synchronize rcu;
 
+	srcu_lock_sync(&ssp->dep_map);
+
 	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
 			 lock_is_held(&rcu_bh_lock_map) ||
 			 lock_is_held(&rcu_lock_map) ||
-- 
cgit v1.2.3


From 0471db447cb7de56bbe2fedd9256b4d2b8ef642a Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Fri, 13 Jan 2023 15:57:22 -0800
Subject: locking/lockdep: Improve the deadlock scenario print for sync and
 read lock

Lock scenario print is always a weak spot of lockdep splats. Improvement
can be made if we rework the dependency search and the error printing.

However without touching the graph search, we can improve a little for
the circular deadlock case, since we have the to-be-added lock
dependency, and know whether these two locks are read/write/sync.

In order to know whether a held_lock is sync or not, a bit was
"stolen" from ->references, which reduce our limit for the same lock
class nesting from 2^12 to 2^11, and it should still be good enough.

Besides, since we now have bit in held_lock for sync, we don't need the
"hardirqoffs being 1" trick, and also we can avoid the __lock_release()
if we jump out of __lock_acquire() before the held_lock stored.

With these changes, a deadlock case evolved with read lock and sync gets
a better print-out from:

	[...]  Possible unsafe locking scenario:
	[...]
	[...]        CPU0                    CPU1
	[...]        ----                    ----
	[...]   lock(srcuA);
	[...]                                lock(srcuB);
	[...]                                lock(srcuA);
	[...]   lock(srcuB);

to

	[...]  Possible unsafe locking scenario:
	[...]
	[...]        CPU0                    CPU1
	[...]        ----                    ----
	[...]   rlock(srcuA);
	[...]                                lock(srcuB);
	[...]                                lock(srcuA);
	[...]   sync(srcuB);

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/lockdep.h  |  3 ++-
 kernel/locking/lockdep.c | 48 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 14d9dbedc6c1..b32256e9e944 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -134,7 +134,8 @@ struct held_lock {
 	unsigned int read:2;        /* see lock_acquire() comment */
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
-	unsigned int references:12;					/* 32 bits */
+	unsigned int sync:1;
+	unsigned int references:11;					/* 32 bits */
 	unsigned int pin_count;
 };
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 3ee3b278789d..dcd1d5bfc1e0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src,
 	struct lock_class *source = hlock_class(src);
 	struct lock_class *target = hlock_class(tgt);
 	struct lock_class *parent = prt->class;
+	int src_read = src->read;
+	int tgt_read = tgt->read;
 
 	/*
 	 * A direct locking problem where unsafe_class lock is taken
@@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src,
 	printk(" Possible unsafe locking scenario:\n\n");
 	printk("       CPU0                    CPU1\n");
 	printk("       ----                    ----\n");
-	printk("  lock(");
+	if (tgt_read != 0)
+		printk("  rlock(");
+	else
+		printk("  lock(");
 	__print_lock_name(target);
 	printk(KERN_CONT ");\n");
 	printk("                               lock(");
@@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src,
 	printk("                               lock(");
 	__print_lock_name(target);
 	printk(KERN_CONT ");\n");
-	printk("  lock(");
+	if (src_read != 0)
+		printk("  rlock(");
+	else if (src->sync)
+		printk("  sync(");
+	else
+		printk("  lock(");
 	__print_lock_name(source);
 	printk(KERN_CONT ");\n");
 	printk("\n *** DEADLOCK ***\n\n");
@@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 					return 0;
 		}
 	}
-	if (!hlock->hardirqs_off) {
+
+	/*
+	 * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+	 * creates no critical section and no extra dependency can be introduced
+	 * by interrupts
+	 */
+	if (!hlock->hardirqs_off && !hlock->sync) {
 		if (hlock->read) {
 			if (!mark_lock(curr, hlock,
 					LOCK_ENABLED_HARDIRQ_READ))
@@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
 			  struct lockdep_map *nest_lock, unsigned long ip,
-			  int references, int pin_count)
+			  int references, int pin_count, int sync)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	class_idx = class - lock_classes;
 
-	if (depth) { /* we're holding locks */
+	if (depth && !sync) {
+		/* we're holding locks and the new held lock is not a sync */
 		hlock = curr->held_locks + depth - 1;
 		if (hlock->class_idx == class_idx && nest_lock) {
 			if (!references)
@@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->trylock = trylock;
 	hlock->read = read;
 	hlock->check = check;
+	hlock->sync = !!sync;
 	hlock->hardirqs_off = !!hardirqs_off;
 	hlock->references = references;
 #ifdef CONFIG_LOCK_STAT
@@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (!validate_chain(curr, hlock, chain_head, chain_key))
 		return 0;
 
+	/* For lock_sync(), we are done here since no actual critical section */
+	if (hlock->sync)
+		return 1;
+
 	curr->curr_chain_key = chain_key;
 	curr->lockdep_depth++;
 	check_chain_key(curr);
@@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
 				    hlock->read, hlock->check,
 				    hlock->hardirqs_off,
 				    hlock->nest_lock, hlock->acquire_ip,
-				    hlock->references, hlock->pin_count)) {
+				    hlock->references, hlock->pin_count, 0)) {
 		case 0:
 			return 1;
 		case 1:
@@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	lockdep_recursion_inc();
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
 	lockdep_recursion_finish();
 	raw_local_irq_restore(flags);
 }
@@ -5700,11 +5722,6 @@ EXPORT_SYMBOL_GPL(lock_release);
  * APIs are used to wait for one or multiple critical sections (on other CPUs
  * or threads), and it means that calling these APIs inside these critical
  * sections is potential deadlock.
- *
- * This annotation acts as an acquire+release annotation pair with hardirqoff
- * being 1. Since there's no critical section, no interrupt can create extra
- * dependencies "inside" the annotation, hardirqoff == 1 allows us to avoid
- * false positives.
  */
 void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
 	       int check, struct lockdep_map *nest_lock, unsigned long ip)
@@ -5718,10 +5735,9 @@ void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
 	check_flags(flags);
 
 	lockdep_recursion_inc();
-	__lock_acquire(lock, subclass, 0, read, check, 1, nest_lock, ip, 0, 0);
-
-	if (__lock_release(lock, ip))
-		check_chain_key(current);
+	__lock_acquire(lock, subclass, 0, read, check,
+		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+	check_chain_key(current);
 	lockdep_recursion_finish();
 	raw_local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From d8c0ee307a6086299157ce8eddec9aef478ef475 Mon Sep 17 00:00:00 2001
From: Yinbo Zhu <zhuyinbo@loongson.cn>
Date: Thu, 23 Mar 2023 10:52:28 +0800
Subject: dt-bindings: clock: add loongson-2 boot clock index

The Loongson-2 boot clock was used to spi and lio peripheral and
this patch was to add boot clock index number.

Signed-off-by: Yinbo Zhu <zhuyinbo@loongson.cn>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230323025229.2971-1-zhuyinbo@loongson.cn
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/loongson,ls2k-clk.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/loongson,ls2k-clk.h b/include/dt-bindings/clock/loongson,ls2k-clk.h
index db1e27e792ff..3bc4dfc193c2 100644
--- a/include/dt-bindings/clock/loongson,ls2k-clk.h
+++ b/include/dt-bindings/clock/loongson,ls2k-clk.h
@@ -24,6 +24,7 @@
 #define LOONGSON2_SATA_CLK				14
 #define LOONGSON2_PIX0_CLK				15
 #define LOONGSON2_PIX1_CLK				16
-#define LOONGSON2_CLK_END				17
+#define LOONGSON2_BOOT_CLK				17
+#define LOONGSON2_CLK_END				18
 
 #endif
-- 
cgit v1.2.3


From 0ca6a0970073a14f5608f0add0e431697316cbca Mon Sep 17 00:00:00 2001
From: Álvaro Fernández Rojas <noltari@gmail.com>
Date: Wed, 22 Mar 2023 18:15:12 +0100
Subject: dt-bindings: clk: add BCM63268 timer clock definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing timer clock definitions for BCM63268.

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20230322171515.120353-2-noltari@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/bcm63268-clock.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/bcm63268-clock.h b/include/dt-bindings/clock/bcm63268-clock.h
index da23e691d359..dea8adc8510e 100644
--- a/include/dt-bindings/clock/bcm63268-clock.h
+++ b/include/dt-bindings/clock/bcm63268-clock.h
@@ -27,4 +27,17 @@
 #define BCM63268_CLK_TBUS	27
 #define BCM63268_CLK_ROBOSW250	31
 
+#define BCM63268_TCLK_EPHY1		0
+#define BCM63268_TCLK_EPHY2		1
+#define BCM63268_TCLK_EPHY3		2
+#define BCM63268_TCLK_GPHY1		3
+#define BCM63268_TCLK_DSL		4
+#define BCM63268_TCLK_WAKEON_EPHY	6
+#define BCM63268_TCLK_WAKEON_DSL	7
+#define BCM63268_TCLK_FAP1		11
+#define BCM63268_TCLK_FAP2		15
+#define BCM63268_TCLK_UTO_50		16
+#define BCM63268_TCLK_UTO_EXTIN		17
+#define BCM63268_TCLK_USB_REF		18
+
 #endif /* __DT_BINDINGS_CLOCK_BCM63268_H */
-- 
cgit v1.2.3


From 2a67e196bb5197bdca89332d2a71e708ee4d897d Mon Sep 17 00:00:00 2001
From: Álvaro Fernández Rojas <noltari@gmail.com>
Date: Wed, 22 Mar 2023 18:15:13 +0100
Subject: dt-bindings: reset: add BCM63268 timer reset definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing timer reset definitions for BCM63268.

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20230322171515.120353-3-noltari@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/reset/bcm63268-reset.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/reset/bcm63268-reset.h b/include/dt-bindings/reset/bcm63268-reset.h
index 6a6403a4c2d5..d87a7882782a 100644
--- a/include/dt-bindings/reset/bcm63268-reset.h
+++ b/include/dt-bindings/reset/bcm63268-reset.h
@@ -23,4 +23,8 @@
 #define BCM63268_RST_PCIE_HARD	17
 #define BCM63268_RST_GPHY	18
 
+#define BCM63268_TRST_SW	29
+#define BCM63268_TRST_HW	30
+#define BCM63268_TRST_POR	31
+
 #endif /* __DT_BINDINGS_RESET_BCM63268_H */
-- 
cgit v1.2.3


From ffaf886e249ee269f9084c21fbd8617ce9b83817 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Mon, 27 Mar 2023 00:34:26 +0000
Subject: ASoC: soc-core.c: add snd_soc_add_pcm_runtimes()

Current ASoC supports snd_soc_add_pcm_runtime(), but user need to
call it one-by-one if it has multi dai_links.
This patch adds snd_soc_add_pcm_runtimes() which supports multi
dai_links.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87h6u76nhq.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h                  |  5 +++--
 sound/soc/intel/avs/boards/hdaudio.c | 10 ++++------
 sound/soc/soc-core.c                 | 31 +++++++++++++++++++++----------
 sound/soc/soc-topology.c             |  2 +-
 4 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index e58b43b5da7c..57c5786a625b 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1302,8 +1302,9 @@ int snd_soc_of_get_dai_link_cpus(struct device *dev,
 				 struct snd_soc_dai_link *dai_link);
 void snd_soc_of_put_dai_link_cpus(struct snd_soc_dai_link *dai_link);
 
-int snd_soc_add_pcm_runtime(struct snd_soc_card *card,
-			    struct snd_soc_dai_link *dai_link);
+int snd_soc_add_pcm_runtimes(struct snd_soc_card *card,
+			     struct snd_soc_dai_link *dai_link,
+			     int num_dai_link);
 void snd_soc_remove_pcm_runtime(struct snd_soc_card *card,
 				struct snd_soc_pcm_runtime *rtd);
 
diff --git a/sound/soc/intel/avs/boards/hdaudio.c b/sound/soc/intel/avs/boards/hdaudio.c
index e68c4c7aa2ba..a542a67e21d0 100644
--- a/sound/soc/intel/avs/boards/hdaudio.c
+++ b/sound/soc/intel/avs/boards/hdaudio.c
@@ -194,12 +194,10 @@ static int avs_probing_link_init(struct snd_soc_pcm_runtime *rtm)
 		return ret;
 	}
 
-	for (n = 0; n < pcm_count; n++) {
-		ret = snd_soc_add_pcm_runtime(card, &links[n]);
-		if (ret < 0) {
-			dev_err(card->dev, "add links failed: %d\n", ret);
-			return ret;
-		}
+	ret = snd_soc_add_pcm_runtimes(card, links, pcm_count);
+	if (ret < 0) {
+		dev_err(card->dev, "add links failed: %d\n", ret);
+		return ret;
 	}
 
 	ret = avs_create_dapm_routes(card->dev, codec, pcm_count, &routes, &n);
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 2faa0d8d0d8e..9bbcff492c1e 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -959,8 +959,8 @@ EXPORT_SYMBOL_GPL(snd_soc_remove_pcm_runtime);
  * topology component. And machine drivers can still define static
  * DAI links in dai_link array.
  */
-int snd_soc_add_pcm_runtime(struct snd_soc_card *card,
-			    struct snd_soc_dai_link *dai_link)
+static int snd_soc_add_pcm_runtime(struct snd_soc_card *card,
+				   struct snd_soc_dai_link *dai_link)
 {
 	struct snd_soc_pcm_runtime *rtd;
 	struct snd_soc_dai_link_component *codec, *platform, *cpu;
@@ -1027,7 +1027,21 @@ _err_defer:
 	snd_soc_remove_pcm_runtime(card, rtd);
 	return -EPROBE_DEFER;
 }
-EXPORT_SYMBOL_GPL(snd_soc_add_pcm_runtime);
+
+int snd_soc_add_pcm_runtimes(struct snd_soc_card *card,
+			     struct snd_soc_dai_link *dai_link,
+			     int num_dai_link)
+{
+	for (int i = 0; i < num_dai_link; i++) {
+		int ret = snd_soc_add_pcm_runtime(card, dai_link + i);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(snd_soc_add_pcm_runtimes);
 
 static void snd_soc_runtime_get_dai_fmt(struct snd_soc_pcm_runtime *rtd)
 {
@@ -1921,8 +1935,7 @@ static int snd_soc_bind_card(struct snd_soc_card *card)
 {
 	struct snd_soc_pcm_runtime *rtd;
 	struct snd_soc_component *component;
-	struct snd_soc_dai_link *dai_link;
-	int ret, i;
+	int ret;
 
 	mutex_lock(&client_mutex);
 	mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT);
@@ -1939,11 +1952,9 @@ static int snd_soc_bind_card(struct snd_soc_card *card)
 
 	/* add predefined DAI links to the list */
 	card->num_rtd = 0;
-	for_each_card_prelinks(card, i, dai_link) {
-		ret = snd_soc_add_pcm_runtime(card, dai_link);
-		if (ret < 0)
-			goto probe_end;
-	}
+	ret = snd_soc_add_pcm_runtimes(card, card->dai_link, card->num_links);
+	if (ret < 0)
+		goto probe_end;
 
 	/* card bind complete so register a sound card */
 	ret = snd_card_new(card->dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 7f6424fa59ab..be9849749713 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -1741,7 +1741,7 @@ static int soc_tplg_fe_link_create(struct soc_tplg *tplg,
 		goto err;
 	}
 
-	ret = snd_soc_add_pcm_runtime(tplg->comp->card, link);
+	ret = snd_soc_add_pcm_runtimes(tplg->comp->card, link, 1);
 	if (ret < 0) {
 		dev_err(tplg->dev, "ASoC: adding FE link failed\n");
 		goto err;
-- 
cgit v1.2.3


From 3948b05950fdd64002a5f182c65ba5cf2d53cf71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Mar 2023 09:28:42 -0700
Subject: net: introduce a config option to tweak MAX_SKB_FRAGS

Currently, MAX_SKB_FRAGS value is 17.

For standard tcp sendmsg() traffic, no big deal because tcp_sendmsg()
attempts order-3 allocations, stuffing 32768 bytes per frag.

But with zero copy, we use order-0 pages.

For BIG TCP to show its full potential, we add a config option
to be able to fit up to 45 segments per skb.

This is also needed for BIG TCP rx zerocopy, as zerocopy currently
does not support skbs with frag list.

We have used MAX_SKB_FRAGS=45 value for years at Google before
we deployed 4K MTU, with no adverse effect, other than
a recent issue in mlx4, fixed in commit 26782aad00cc
("net/mlx4: MLX4_TX_BOUNCE_BUFFER_SIZE depends on MAX_SKB_FRAGS")

Back then, goal was to be able to receive full size (64KB) GRO
packets without the frag_list overhead.

Note that /proc/sys/net/core/max_skb_frags can also be used to limit
the number of fragments TCP can use in tx packets.

By default we keep the old/legacy value of 17 until we get
more coverage for the updated values.

Sizes of struct skb_shared_info on 64bit arches

MAX_SKB_FRAGS | sizeof(struct skb_shared_info):
==============================================
         17     320
         21     320+64  = 384
         25     320+128 = 448
         29     320+192 = 512
         33     320+256 = 576
         37     320+320 = 640
         41     320+384 = 704
         45     320+448 = 768

This inflation might cause problems for drivers assuming they could pack
both the incoming packet (for MTU=1500) and skb_shared_info in half a page,
using build_skb().

v3: fix build error when CONFIG_NET=n
v2: fix two build errors assuming MAX_SKB_FRAGS was "unsigned long"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/20230323162842.1935061-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/scsi/cxgbi/libcxgbi.c |  4 ++--
 include/linux/skbuff.h        | 16 +++++-----------
 net/Kconfig                   | 12 ++++++++++++
 net/packet/af_packet.c        |  4 ++--
 4 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index af281e271f88..3e1de4c842cc 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2314,9 +2314,9 @@ static int cxgbi_sock_tx_queue_up(struct cxgbi_sock *csk, struct sk_buff *skb)
 		frags++;
 
 	if (frags >= SKB_WR_LIST_SIZE) {
-		pr_err("csk 0x%p, frags %u, %u,%u >%lu.\n",
+		pr_err("csk 0x%p, frags %u, %u,%u >%u.\n",
 		       csk, skb_shinfo(skb)->nr_frags, skb->len,
-		       skb->data_len, SKB_WR_LIST_SIZE);
+		       skb->data_len, (unsigned int)SKB_WR_LIST_SIZE);
 		return -EINVAL;
 	}
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..82511b2f61ea 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -345,18 +345,12 @@ struct sk_buff_head {
 
 struct sk_buff;
 
-/* To allow 64K frame to be packed as single skb without frag_list we
- * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
- * buffers which do not start on a page boundary.
- *
- * Since GRO uses frags we allocate at least 16 regardless of page
- * size.
- */
-#if (65536/PAGE_SIZE + 1) < 16
-#define MAX_SKB_FRAGS 16UL
-#else
-#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
+#ifndef CONFIG_MAX_SKB_FRAGS
+# define CONFIG_MAX_SKB_FRAGS 17
 #endif
+
+#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS
+
 extern int sysctl_max_skb_frags;
 
 /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
diff --git a/net/Kconfig b/net/Kconfig
index 48c33c222199..f806722bccf4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -251,6 +251,18 @@ config PCPU_DEV_REFCNT
 	  network device refcount are using per cpu variables if this option is set.
 	  This can be forced to N to detect underflows (with a performance drop).
 
+config MAX_SKB_FRAGS
+	int "Maximum number of fragments per skb_shared_info"
+	range 17 45
+	default 17
+	help
+	  Having more fragments per skb_shared_info can help GRO efficiency.
+	  This helps BIG TCP workloads, but might expose bugs in some
+	  legacy drivers.
+	  This also increases memory overhead of small packets,
+	  and in drivers using build_skb().
+	  If unsure, say 17.
+
 config RPS
 	bool
 	depends on SMP && SYSFS
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 497193f73030..568f8d76e3c1 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2622,8 +2622,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		nr_frags = skb_shinfo(skb)->nr_frags;
 
 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
-			pr_err("Packet exceed the number of skb frags(%lu)\n",
-			       MAX_SKB_FRAGS);
+			pr_err("Packet exceed the number of skb frags(%u)\n",
+			       (unsigned int)MAX_SKB_FRAGS);
 			return -EFAULT;
 		}
 
-- 
cgit v1.2.3


From 3e4d5ba9a3f85f21f1ebdee5a5901bb43389abc5 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Thu, 23 Mar 2023 18:36:04 +0200
Subject: netlink: Add a macro to set policy message with format string

Similar to NL_SET_ERR_MSG_FMT, add a macro which sets netlink policy
error message with a format string.

Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 3e8743252167..19c0791ed9d5 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -161,9 +161,31 @@ struct netlink_ext_ack {
 	}							\
 } while (0)
 
+#define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do {	\
+	struct netlink_ext_ack *__extack = (extack);				\
+										\
+	if (!__extack)								\
+		break;								\
+										\
+	if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN,		\
+		     "%s" fmt "%s", "", ##args, "") >=				\
+	    NETLINK_MAX_FMTMSG_LEN)						\
+		net_warn_ratelimited("%s" fmt "%s", "truncated extack: ",       \
+				     ##args, "\n");				\
+										\
+	do_trace_netlink_extack(__extack->_msg_buf);				\
+										\
+	__extack->_msg = __extack->_msg_buf;					\
+	__extack->bad_attr = (attr);						\
+	__extack->policy = (pol);						\
+} while (0)
+
 #define NL_SET_ERR_MSG_ATTR(extack, attr, msg)		\
 	NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg)
 
+#define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \
+	NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args)
+
 #define NL_SET_ERR_ATTR_MISS(extack, nest, type)  do {	\
 	struct netlink_ext_ack *__extack = (extack);	\
 							\
-- 
cgit v1.2.3


From 233eb4e786b57ea686b51c13a04cc2839fd682fc Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Thu, 23 Mar 2023 18:36:05 +0200
Subject: ethtool: Add support for configuring tx_push_buf_len

This attribute, which is part of ethtool's ring param configuration
allows the user to specify the maximum number of the packet's payload
that can be written directly to the device.

Example usage:
    # ethtool -G [interface] tx-push-buf-len [number of bytes]

Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml     |  8 +++++
 Documentation/networking/ethtool-netlink.rst | 47 ++++++++++++++++++----------
 include/linux/ethtool.h                      | 14 ++++++---
 include/uapi/linux/ethtool_netlink.h         |  2 ++
 net/ethtool/netlink.h                        |  2 +-
 net/ethtool/rings.c                          | 34 ++++++++++++++++++--
 6 files changed, 84 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 4727c067e2ba..6d8ae3d9a680 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -165,6 +165,12 @@ attribute-sets:
       -
         name: rx-push
         type: u8
+      -
+        name: tx-push-buf-len
+        type: u32
+      -
+        name: tx-push-buf-len-max
+        type: u32
 
   -
     name: mm-stat
@@ -311,6 +317,8 @@ operations:
             - cqe-size
             - tx-push
             - rx-push
+            - tx-push-buf-len
+            - tx-push-buf-len-max
       dump: *ring-get-op
     -
       name: rings-set
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index e1bc6186d7ea..cd0973d4ba01 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -860,22 +860,24 @@ Request contents:
 
 Kernel response contents:
 
-  ====================================  ======  ===========================
-  ``ETHTOOL_A_RINGS_HEADER``            nested  reply header
-  ``ETHTOOL_A_RINGS_RX_MAX``            u32     max size of RX ring
-  ``ETHTOOL_A_RINGS_RX_MINI_MAX``       u32     max size of RX mini ring
-  ``ETHTOOL_A_RINGS_RX_JUMBO_MAX``      u32     max size of RX jumbo ring
-  ``ETHTOOL_A_RINGS_TX_MAX``            u32     max size of TX ring
-  ``ETHTOOL_A_RINGS_RX``                u32     size of RX ring
-  ``ETHTOOL_A_RINGS_RX_MINI``           u32     size of RX mini ring
-  ``ETHTOOL_A_RINGS_RX_JUMBO``          u32     size of RX jumbo ring
-  ``ETHTOOL_A_RINGS_TX``                u32     size of TX ring
-  ``ETHTOOL_A_RINGS_RX_BUF_LEN``        u32     size of buffers on the ring
-  ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT``    u8      TCP header / data split
-  ``ETHTOOL_A_RINGS_CQE_SIZE``          u32     Size of TX/RX CQE
-  ``ETHTOOL_A_RINGS_TX_PUSH``           u8      flag of TX Push mode
-  ``ETHTOOL_A_RINGS_RX_PUSH``           u8      flag of RX Push mode
-  ====================================  ======  ===========================
+  =======================================   ======  ===========================
+  ``ETHTOOL_A_RINGS_HEADER``                nested  reply header
+  ``ETHTOOL_A_RINGS_RX_MAX``                u32     max size of RX ring
+  ``ETHTOOL_A_RINGS_RX_MINI_MAX``           u32     max size of RX mini ring
+  ``ETHTOOL_A_RINGS_RX_JUMBO_MAX``          u32     max size of RX jumbo ring
+  ``ETHTOOL_A_RINGS_TX_MAX``                u32     max size of TX ring
+  ``ETHTOOL_A_RINGS_RX``                    u32     size of RX ring
+  ``ETHTOOL_A_RINGS_RX_MINI``               u32     size of RX mini ring
+  ``ETHTOOL_A_RINGS_RX_JUMBO``              u32     size of RX jumbo ring
+  ``ETHTOOL_A_RINGS_TX``                    u32     size of TX ring
+  ``ETHTOOL_A_RINGS_RX_BUF_LEN``            u32     size of buffers on the ring
+  ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT``        u8      TCP header / data split
+  ``ETHTOOL_A_RINGS_CQE_SIZE``              u32     Size of TX/RX CQE
+  ``ETHTOOL_A_RINGS_TX_PUSH``               u8      flag of TX Push mode
+  ``ETHTOOL_A_RINGS_RX_PUSH``               u8      flag of RX Push mode
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN``       u32     size of TX push buffer
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX``   u32     max size of TX push buffer
+  =======================================   ======  ===========================
 
 ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with
 page-flipping TCP zero-copy receive (``getsockopt(TCP_ZEROCOPY_RECEIVE)``).
@@ -891,6 +893,18 @@ through MMIO writes, thus reducing the latency. However, enabling this feature
 may increase the CPU cost. Drivers may enforce additional per-packet
 eligibility checks (e.g. on packet size).
 
+``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN`` specifies the maximum number of bytes of a
+transmitted packet a driver can push directly to the underlying device
+('push' mode). Pushing some of the payload bytes to the device has the
+advantages of reducing latency for small packets by avoiding DMA mapping (same
+as ``ETHTOOL_A_RINGS_TX_PUSH`` parameter) as well as allowing the underlying
+device to process packet headers ahead of fetching its payload.
+This can help the device to make fast actions based on the packet's headers.
+This is similar to the "tx-copybreak" parameter, which copies the packet to a
+preallocated DMA memory area instead of mapping new memory. However,
+tx-push-buff parameter copies the packet directly to the device to allow the
+device to take faster actions on the packet.
+
 RINGS_SET
 =========
 
@@ -908,6 +922,7 @@ Request contents:
   ``ETHTOOL_A_RINGS_CQE_SIZE``          u32     Size of TX/RX CQE
   ``ETHTOOL_A_RINGS_TX_PUSH``           u8      flag of TX Push mode
   ``ETHTOOL_A_RINGS_RX_PUSH``           u8      flag of RX Push mode
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN``   u32     size of TX push buffer
   ====================================  ======  ===========================
 
 Kernel checks that requested ring sizes do not exceed limits reported by
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 2792185dda22..798d35890118 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -75,6 +75,8 @@ enum {
  * @tx_push: The flag of tx push mode
  * @rx_push: The flag of rx push mode
  * @cqe_size: Size of TX/RX completion queue event
+ * @tx_push_buf_len: Size of TX push buffer
+ * @tx_push_buf_max_len: Maximum allowed size of TX push buffer
  */
 struct kernel_ethtool_ringparam {
 	u32	rx_buf_len;
@@ -82,6 +84,8 @@ struct kernel_ethtool_ringparam {
 	u8	tx_push;
 	u8	rx_push;
 	u32	cqe_size;
+	u32	tx_push_buf_len;
+	u32	tx_push_buf_max_len;
 };
 
 /**
@@ -90,12 +94,14 @@ struct kernel_ethtool_ringparam {
  * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size
  * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push
  * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push
+ * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len
  */
 enum ethtool_supported_ring_param {
-	ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0),
-	ETHTOOL_RING_USE_CQE_SIZE   = BIT(1),
-	ETHTOOL_RING_USE_TX_PUSH    = BIT(2),
-	ETHTOOL_RING_USE_RX_PUSH    = BIT(3),
+	ETHTOOL_RING_USE_RX_BUF_LEN		= BIT(0),
+	ETHTOOL_RING_USE_CQE_SIZE		= BIT(1),
+	ETHTOOL_RING_USE_TX_PUSH		= BIT(2),
+	ETHTOOL_RING_USE_RX_PUSH		= BIT(3),
+	ETHTOOL_RING_USE_TX_PUSH_BUF_LEN	= BIT(4),
 };
 
 #define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index d39ce21381c5..1ebf8d455f07 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -357,6 +357,8 @@ enum {
 	ETHTOOL_A_RINGS_CQE_SIZE,			/* u32 */
 	ETHTOOL_A_RINGS_TX_PUSH,			/* u8 */
 	ETHTOOL_A_RINGS_RX_PUSH,			/* u8 */
+	ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,		/* u32 */
+	ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_RINGS_CNT,
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index f7b189ed96b2..79424b34b553 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -413,7 +413,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT
 extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
 extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
 extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
-extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_PUSH + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX + 1];
 extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
 extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
 extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index f358cd57d094..1c4972526142 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -11,6 +11,7 @@ struct rings_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_ringparam	ringparam;
 	struct kernel_ethtool_ringparam	kernel_ringparam;
+	u32				supported_ring_params;
 };
 
 #define RINGS_REPDATA(__reply_base) \
@@ -32,6 +33,8 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base,
 
 	if (!dev->ethtool_ops->get_ringparam)
 		return -EOPNOTSUPP;
+
+	data->supported_ring_params = dev->ethtool_ops->supported_ring_params;
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
@@ -57,7 +60,9 @@ static int rings_reply_size(const struct ethnl_req_info *req_base,
 	       nla_total_size(sizeof(u8))  +	/* _RINGS_TCP_DATA_SPLIT */
 	       nla_total_size(sizeof(u32)  +	/* _RINGS_CQE_SIZE */
 	       nla_total_size(sizeof(u8))  +	/* _RINGS_TX_PUSH */
-	       nla_total_size(sizeof(u8)));	/* _RINGS_RX_PUSH */
+	       nla_total_size(sizeof(u8))) +	/* _RINGS_RX_PUSH */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX_PUSH_BUF_LEN */
+	       nla_total_size(sizeof(u32));	/* _RINGS_TX_PUSH_BUF_LEN_MAX */
 }
 
 static int rings_fill_reply(struct sk_buff *skb,
@@ -67,6 +72,7 @@ static int rings_fill_reply(struct sk_buff *skb,
 	const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
 	const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
 	const struct ethtool_ringparam *ringparam = &data->ringparam;
+	u32 supported_ring_params = data->supported_ring_params;
 
 	WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
 
@@ -98,7 +104,12 @@ static int rings_fill_reply(struct sk_buff *skb,
 	    (kr->cqe_size &&
 	     (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) ||
 	    nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) ||
-	    nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push))
+	    nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) ||
+	    ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) &&
+	     (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,
+			  kr->tx_push_buf_max_len) ||
+	      nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,
+			  kr->tx_push_buf_len))))
 		return -EMSGSIZE;
 
 	return 0;
@@ -117,6 +128,7 @@ const struct nla_policy ethnl_rings_set_policy[] = {
 	[ETHTOOL_A_RINGS_CQE_SIZE]		= NLA_POLICY_MIN(NLA_U32, 1),
 	[ETHTOOL_A_RINGS_TX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
 	[ETHTOOL_A_RINGS_RX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
+	[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN]	= { .type = NLA_U32 },
 };
 
 static int
@@ -158,6 +170,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info,
 		return -EOPNOTSUPP;
 	}
 
+	if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] &&
+	    !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+				    "setting tx push buf len is not supported");
+		return -EOPNOTSUPP;
+	}
+
 	return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP;
 }
 
@@ -189,6 +209,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
 			tb[ETHTOOL_A_RINGS_TX_PUSH], &mod);
 	ethnl_update_u8(&kernel_ringparam.rx_push,
 			tb[ETHTOOL_A_RINGS_RX_PUSH], &mod);
+	ethnl_update_u32(&kernel_ringparam.tx_push_buf_len,
+			 tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod);
 	if (!mod)
 		return 0;
 
@@ -209,6 +231,14 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
 		return -EINVAL;
 	}
 
+	if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) {
+		NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+					"Requested TX push buffer exceeds the maximum of %u",
+					kernel_ringparam.tx_push_buf_max_len);
+
+		return -EINVAL;
+	}
+
 	ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
 					      &kernel_ringparam, info->extack);
 	return ret < 0 ? ret : 1;
-- 
cgit v1.2.3


From 6f2656eab290f0dd437cd4b2ec956c3519172d3b Mon Sep 17 00:00:00 2001
From: Luís Henriques <lhenriques@suse.de>
Date: Thu, 16 Mar 2023 18:14:11 +0000
Subject: fscrypt: new helper function - fscrypt_prepare_lookup_partial()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a new helper function which can be used both in
lookups and in atomic_open operations by filesystems that want to handle
filename encryption and no-key dentries themselves.

The reason for this function to be used in atomic open is that this
operation can act as a lookup if handed a dentry that is negative.  And in
this case we may need to set DCACHE_NOKEY_NAME.

Signed-off-by: Luís Henriques <lhenriques@suse.de>
Tested-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
[ebiggers: improved the function comment, and moved the function to just
           below __fscrypt_prepare_lookup()]
Link: https://lore.kernel.org/r/20230320220149.21863-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/hooks.c       | 30 ++++++++++++++++++++++++++++++
 include/linux/fscrypt.h |  7 +++++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 7b8c5a1104b5..9151934c5086 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -111,6 +111,36 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
 }
 EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
 
+/**
+ * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup
+ * @dir: the encrypted directory being searched
+ * @dentry: the dentry being looked up in @dir
+ *
+ * This function should be used by the ->lookup and ->atomic_open methods of
+ * filesystems that handle filename encryption and no-key name encoding
+ * themselves and thus can't use fscrypt_prepare_lookup().  Like
+ * fscrypt_prepare_lookup(), this will try to set up the directory's encryption
+ * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable.
+ * However, this function doesn't set up a struct fscrypt_name for the filename.
+ *
+ * Return: 0 on success; -errno on error.  Note that the encryption key being
+ *	   unavailable is not considered an error.  It is also not an error if
+ *	   the encryption policy is unsupported by this kernel; that is treated
+ *	   like the key being unavailable, so that files can still be deleted.
+ */
+int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
+{
+	int err = fscrypt_get_encryption_info(dir, true);
+
+	if (!err && !fscrypt_has_encryption_key(dir)) {
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags |= DCACHE_NOKEY_NAME;
+		spin_unlock(&dentry->d_lock);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);
+
 int __fscrypt_prepare_readdir(struct inode *dir)
 {
 	return fscrypt_get_encryption_info(dir, true);
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index e0a49c3125eb..a69f1302051d 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -359,6 +359,7 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
 			     unsigned int flags);
 int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
 			     struct fscrypt_name *fname);
+int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry);
 int __fscrypt_prepare_readdir(struct inode *dir);
 int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr);
 int fscrypt_prepare_setflags(struct inode *inode,
@@ -673,6 +674,12 @@ static inline int __fscrypt_prepare_lookup(struct inode *dir,
 	return -EOPNOTSUPP;
 }
 
+static inline int fscrypt_prepare_lookup_partial(struct inode *dir,
+						 struct dentry *dentry)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int __fscrypt_prepare_readdir(struct inode *dir)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 517d4927aabe488144863e72b52bb3e506fecd34 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 25 Mar 2023 09:45:26 +0100
Subject: driver core: bus: constify class_unregister/destroy()

The class_unregister() and class_destroy() function should be taking a
const * to struct class, not just a *, so fix that up.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230325084526.3622123-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 8 ++++----
 include/linux/device/class.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 0f8938a17144..8ae91e118827 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -142,13 +142,13 @@ static void klist_class_dev_put(struct klist_node *n)
 	put_device(dev);
 }
 
-static int class_add_groups(struct class *cls,
+static int class_add_groups(const struct class *cls,
 			    const struct attribute_group **groups)
 {
 	return sysfs_create_groups(&cls->p->subsys.kobj, groups);
 }
 
-static void class_remove_groups(struct class *cls,
+static void class_remove_groups(const struct class *cls,
 				const struct attribute_group **groups)
 {
 	return sysfs_remove_groups(&cls->p->subsys.kobj, groups);
@@ -206,7 +206,7 @@ err_out:
 }
 EXPORT_SYMBOL_GPL(class_register);
 
-void class_unregister(struct class *cls)
+void class_unregister(const struct class *cls)
 {
 	pr_debug("device class '%s': unregistering\n", cls->name);
 	class_remove_groups(cls, cls->class_groups);
@@ -265,7 +265,7 @@ EXPORT_SYMBOL_GPL(class_create);
  * Note, the pointer to be destroyed must have been created with a call
  * to class_create().
  */
-void class_destroy(struct class *cls)
+void class_destroy(const struct class *cls)
 {
 	if (IS_ERR_OR_NULL(cls))
 		return;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index d3960733c0fa..73436c578d37 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -83,7 +83,7 @@ struct class_dev_iter {
 extern struct kobject *sysfs_dev_block_kobj;
 
 int __must_check class_register(struct class *class);
-void class_unregister(struct class *class);
+void class_unregister(const struct class *class);
 
 struct class_compat;
 struct class_compat *class_compat_register(const char *name);
@@ -231,6 +231,6 @@ int __must_check class_interface_register(struct class_interface *);
 void class_interface_unregister(struct class_interface *);
 
 struct class * __must_check class_create(const char *name);
-void class_destroy(struct class *cls);
+void class_destroy(const struct class *cls);
 
 #endif	/* _DEVICE_CLASS_H_ */
-- 
cgit v1.2.3


From e5ab9eff46b04c5a04778e40d7092fed3fda52ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:30 +0100
Subject: atomics: Provide atomic_add_negative() variants

atomic_add_negative() does not provide the relaxed/acquire/release
variants.

Provide them in preparation for a new scalable reference count algorithm.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de
---
 include/linux/atomic/atomic-arch-fallback.h | 208 ++++++++++++++++++++++++++--
 include/linux/atomic/atomic-instrumented.h  |  68 ++++++++-
 include/linux/atomic/atomic-long.h          |  38 ++++-
 scripts/atomic/atomics.tbl                  |   2 +-
 scripts/atomic/fallbacks/add_negative       |  11 +-
 5 files changed, 309 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 77bc5522e61c..4226379a232d 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v)
 #define arch_atomic_inc_and_test arch_atomic_inc_and_test
 #endif
 
+#ifndef arch_atomic_add_negative_relaxed
+#ifdef arch_atomic_add_negative
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative
+#define arch_atomic_add_negative_release arch_atomic_add_negative
+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative
+#endif /* arch_atomic_add_negative */
+
 #ifndef arch_atomic_add_negative
 /**
- * arch_atomic_add_negative - add and test if negative
+ * arch_atomic_add_negative - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type atomic_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
 arch_atomic_add_negative(int i, atomic_t *v)
@@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v)
 #define arch_atomic_add_negative arch_atomic_add_negative
 #endif
 
+#ifndef arch_atomic_add_negative_acquire
+/**
+ * arch_atomic_add_negative_acquire - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_acquire(i, v) < 0;
+}
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
+#endif
+
+#ifndef arch_atomic_add_negative_release
+/**
+ * arch_atomic_add_negative_release - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_release(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_release(i, v) < 0;
+}
+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
+#endif
+
+#ifndef arch_atomic_add_negative_relaxed
+/**
+ * arch_atomic_add_negative_relaxed - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_relaxed(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_relaxed(i, v) < 0;
+}
+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed
+#endif
+
+#else /* arch_atomic_add_negative_relaxed */
+
+#ifndef arch_atomic_add_negative_acquire
+static __always_inline bool
+arch_atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	bool ret = arch_atomic_add_negative_relaxed(i, v);
+	__atomic_acquire_fence();
+	return ret;
+}
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
+#endif
+
+#ifndef arch_atomic_add_negative_release
+static __always_inline bool
+arch_atomic_add_negative_release(int i, atomic_t *v)
+{
+	__atomic_release_fence();
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
+#endif
+
+#ifndef arch_atomic_add_negative
+static __always_inline bool
+arch_atomic_add_negative(int i, atomic_t *v)
+{
+	bool ret;
+	__atomic_pre_full_fence();
+	ret = arch_atomic_add_negative_relaxed(i, v);
+	__atomic_post_full_fence();
+	return ret;
+}
+#define arch_atomic_add_negative arch_atomic_add_negative
+#endif
+
+#endif /* arch_atomic_add_negative_relaxed */
+
 #ifndef arch_atomic_fetch_add_unless
 /**
  * arch_atomic_fetch_add_unless - add unless the number is already a given value
@@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v)
 #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
 #endif
 
+#ifndef arch_atomic64_add_negative_relaxed
+#ifdef arch_atomic64_add_negative
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative
+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative
+#endif /* arch_atomic64_add_negative */
+
 #ifndef arch_atomic64_add_negative
 /**
- * arch_atomic64_add_negative - add and test if negative
+ * arch_atomic64_add_negative - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type atomic64_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
 arch_atomic64_add_negative(s64 i, atomic64_t *v)
@@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v)
 #define arch_atomic64_add_negative arch_atomic64_add_negative
 #endif
 
+#ifndef arch_atomic64_add_negative_acquire
+/**
+ * arch_atomic64_add_negative_acquire - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_acquire(i, v) < 0;
+}
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
+#endif
+
+#ifndef arch_atomic64_add_negative_release
+/**
+ * arch_atomic64_add_negative_release - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_release(i, v) < 0;
+}
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
+#endif
+
+#ifndef arch_atomic64_add_negative_relaxed
+/**
+ * arch_atomic64_add_negative_relaxed - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_relaxed(i, v) < 0;
+}
+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed
+#endif
+
+#else /* arch_atomic64_add_negative_relaxed */
+
+#ifndef arch_atomic64_add_negative_acquire
+static __always_inline bool
+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	bool ret = arch_atomic64_add_negative_relaxed(i, v);
+	__atomic_acquire_fence();
+	return ret;
+}
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
+#endif
+
+#ifndef arch_atomic64_add_negative_release
+static __always_inline bool
+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	__atomic_release_fence();
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
+#endif
+
+#ifndef arch_atomic64_add_negative
+static __always_inline bool
+arch_atomic64_add_negative(s64 i, atomic64_t *v)
+{
+	bool ret;
+	__atomic_pre_full_fence();
+	ret = arch_atomic64_add_negative_relaxed(i, v);
+	__atomic_post_full_fence();
+	return ret;
+}
+#define arch_atomic64_add_negative arch_atomic64_add_negative
+#endif
+
+#endif /* arch_atomic64_add_negative_relaxed */
+
 #ifndef arch_atomic64_fetch_add_unless
 /**
  * arch_atomic64_fetch_add_unless - add unless the number is already a given value
@@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v)
 #endif
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// b5e87bdd5ede61470c29f7a7e4de781af3770f09
+// 00071fffa021cec66f6290d706d69c91df87bade
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 7a139ec030b0..0496816738ca 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v)
 	return arch_atomic_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic_add_negative_release(int i, atomic_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic_add_negative_relaxed(int i, atomic_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+
 static __always_inline int
 atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
@@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v)
 	return arch_atomic64_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+
 static __always_inline s64
 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 {
@@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic_long_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
+// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a
diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
index 800b8c35992d..2fc51ba66beb 100644
--- a/include/linux/atomic/atomic-long.h
+++ b/include/linux/atomic/atomic-long.h
@@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic64_add_negative(i, v);
 }
 
+static __always_inline bool
+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_release(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic_add_negative(i, v);
 }
 
+static __always_inline bool
+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_release(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v)
 
 #endif /* CONFIG_64BIT */
 #endif /* _LINUX_ATOMIC_LONG_H */
-// e8f0e08ff072b74d180eabe2ad001282b38c2c88
+// a194c07d7d2f4b0e178d3c118c919775d5d65f50
diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl
index fbee2f6190d9..85ca8d9b5c27 100644
--- a/scripts/atomic/atomics.tbl
+++ b/scripts/atomic/atomics.tbl
@@ -33,7 +33,7 @@ try_cmpxchg		B	v	p:old	i:new
 sub_and_test		b	i	v
 dec_and_test		b	v
 inc_and_test		b	v
-add_negative		b	i	v
+add_negative		B	i	v
 add_unless		fb	v	i:a	i:u
 inc_not_zero		b	v
 inc_unless_negative	b	v
diff --git a/scripts/atomic/fallbacks/add_negative b/scripts/atomic/fallbacks/add_negative
index 15caa2eb2371..e5980abf5904 100755
--- a/scripts/atomic/fallbacks/add_negative
+++ b/scripts/atomic/fallbacks/add_negative
@@ -1,16 +1,15 @@
 cat <<EOF
 /**
- * arch_${atomic}_add_negative - add and test if negative
+ * arch_${atomic}_add_negative${order} - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type ${atomic}_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
-arch_${atomic}_add_negative(${int} i, ${atomic}_t *v)
+arch_${atomic}_add_negative${order}(${int} i, ${atomic}_t *v)
 {
-	return arch_${atomic}_add_return(i, v) < 0;
+	return arch_${atomic}_add_return${order}(i, v) < 0;
 }
 EOF
-- 
cgit v1.2.3


From ee1ee6db07795d9637bc5e8993a8ddcf886541ef Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:31 +0100
Subject: atomics: Provide rcuref - scalable reference counting

atomic_t based reference counting, including refcount_t, uses
atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
implemented with a atomic_try_cmpxchg() loop. High contention of the
reference count leads to retry loops and scales badly. There is nothing to
improve on this implementation as the semantics have to be preserved.

Provide rcuref as a scalable alternative solution which is suitable for RCU
managed objects. Similar to refcount_t it comes with overflow and underflow
detection and mitigation.

rcuref treats the underlying atomic_t as an unsigned integer and partitions
this space into zones:

  0x00000000 - 0x7FFFFFFF	valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF	saturation zone
  0xC0000000 - 0xFFFFFFFE	dead zone
  0xFFFFFFFF   			no reference

rcuref_get() unconditionally increments the reference count with
atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
reference count with atomic_add_negative_release().

This unconditional increment avoids the inc_not_zero() problem, but
requires a more complex implementation on the put() side when the count
drops from 0 to -1.

When this transition is detected then it is attempted to mark the reference
count dead, by setting it to the midpoint of the dead zone with a single
atomic_cmpxchg_release() operation. This operation can fail due to a
concurrent rcuref_get() elevating the reference count from -1 to 0 again.

If the unconditional increment in rcuref_get() hits a reference count which
is marked dead (or saturated) it will detect it after the fact and bring
back the reference count to the midpoint of the respective zone. The zones
provide enough tolerance which makes it practically impossible to escape
from a zone.

The racy implementation of rcuref_put() requires to protect rcuref_put()
against a grace period ending in order to prevent a subtle use after
free. As RCU is the only mechanism which allows to protect against that, it
is not possible to fully replace the atomic_inc_not_zero() based
implementation of refcount_t with this scheme.

The final drop is slightly more expensive than the atomic_dec_return()
counterpart, but that's not the case which this is optimized for. The
optimization is on the high frequeunt get()/put() pairs and their
scalability.

The performance of an uncontended rcuref_get()/put() pair where the put()
is not dropping the last reference is still on par with the plain atomic
operations, while at the same time providing overflow and underflow
detection and mitigation.

The performance of rcuref compared to plain atomic_inc_not_zero() and
atomic_dec_return() based reference counting under contention:

 -  Micro benchmark: All CPUs running a increment/decrement loop on an
    elevated reference count, which means the 0 to -1 transition never
    happens.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.3X to 4.7X

 - Conversion of dst_entry::__refcnt to rcuref and testing with the
    localhost memtier/memcached benchmark. That benchmark shows the
    reference count contention prominently.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.1X to 2.6X over the
    previous fix for the false sharing issue vs. struct
    dst_entry::__refcnt.

    When memtier is run over a real 1Gb network connection, there is a
    small gain on top of the false sharing fix. The two changes combined
    result in a 2%-5% total gain for that networked test.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
---
 include/linux/rcuref.h | 155 +++++++++++++++++++++++++++
 include/linux/types.h  |   6 ++
 lib/Makefile           |   2 +-
 lib/rcuref.c           | 281 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 443 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/rcuref.h
 create mode 100644 lib/rcuref.c

(limited to 'include')

diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
new file mode 100644
index 000000000000..2c8bfd0f1b6b
--- /dev/null
+++ b/include/linux/rcuref.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RCUREF_H
+#define _LINUX_RCUREF_H
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/limits.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+
+#define RCUREF_ONEREF		0x00000000U
+#define RCUREF_MAXREF		0x7FFFFFFFU
+#define RCUREF_SATURATED	0xA0000000U
+#define RCUREF_RELEASED		0xC0000000U
+#define RCUREF_DEAD		0xE0000000U
+#define RCUREF_NOREF		0xFFFFFFFFU
+
+/**
+ * rcuref_init - Initialize a rcuref reference count with the given reference count
+ * @ref:	Pointer to the reference count
+ * @cnt:	The initial reference count typically '1'
+ */
+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
+{
+	atomic_set(&ref->refcnt, cnt - 1);
+}
+
+/**
+ * rcuref_read - Read the number of held reference counts of a rcuref
+ * @ref:	Pointer to the reference count
+ *
+ * Return: The number of held references (0 ... N)
+ */
+static inline unsigned int rcuref_read(rcuref_t *ref)
+{
+	unsigned int c = atomic_read(&ref->refcnt);
+
+	/* Return 0 if within the DEAD zone. */
+	return c >= RCUREF_RELEASED ? 0 : c + 1;
+}
+
+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
+
+/**
+ * rcuref_get - Acquire one reference on a rcuref reference count
+ * @ref:	Pointer to the reference count
+ *
+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See documentation in lib/rcuref.c
+ *
+ * Return:
+ *	False if the attempt to acquire a reference failed. This happens
+ *	when the last reference has been put already
+ *
+ *	True if a reference was successfully acquired
+ */
+static inline __must_check bool rcuref_get(rcuref_t *ref)
+{
+	/*
+	 * Unconditionally increase the reference count. The saturation and
+	 * dead zones provide enough tolerance for this.
+	 */
+	if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
+		return true;
+
+	/* Handle the cases inside the saturation and dead zones */
+	return rcuref_get_slowpath(ref);
+}
+
+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
+
+/*
+ * Internal helper. Do not invoke directly.
+ */
+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
+{
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
+			 "suspicious rcuref_put_rcusafe() usage");
+	/*
+	 * Unconditionally decrease the reference count. The saturation and
+	 * dead zones provide enough tolerance for this.
+	 */
+	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
+		return false;
+
+	/*
+	 * Handle the last reference drop and cases inside the saturation
+	 * and dead zones.
+	 */
+	return rcuref_put_slowpath(ref);
+}
+
+/**
+ * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
+ * @ref:	Pointer to the reference count
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Can be invoked from contexts, which guarantee that no grace period can
+ * happen which would free the object concurrently if the decrement drops
+ * the last reference and the slowpath races against a concurrent get() and
+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely release the
+ *	object which is protected by the reference counter.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	release the protected object.
+ */
+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
+{
+	return __rcuref_put(ref);
+}
+
+/**
+ * rcuref_put -- Release one reference for a rcuref reference count
+ * @ref:	Pointer to the reference count
+ *
+ * Can be invoked from any context.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Return:
+ *
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+static inline __must_check bool rcuref_put(rcuref_t *ref)
+{
+	bool released;
+
+	preempt_disable();
+	released = __rcuref_put(ref);
+	preempt_enable();
+	return released;
+}
+
+#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index ea8cf60a8a79..688fb943556a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -175,6 +175,12 @@ typedef struct {
 } atomic64_t;
 #endif
 
+typedef struct {
+	atomic_t refcnt;
+} rcuref_t;
+
+#define RCUREF_INIT(i)	{ .refcnt = ATOMIC_INIT(i - 1) }
+
 struct list_head {
 	struct list_head *next, *prev;
 };
diff --git a/lib/Makefile b/lib/Makefile
index baf2821f7a00..31a3a257fd49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o base64.o \
-	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
+	 once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
diff --git a/lib/rcuref.c b/lib/rcuref.c
new file mode 100644
index 000000000000..5ec00a4a64d1
--- /dev/null
+++ b/lib/rcuref.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * rcuref - A scalable reference count implementation for RCU managed objects
+ *
+ * rcuref is provided to replace open coded reference count implementations
+ * based on atomic_t. It protects explicitely RCU managed objects which can
+ * be visible even after the last reference has been dropped and the object
+ * is heading towards destruction.
+ *
+ * A common usage pattern is:
+ *
+ * get()
+ *	rcu_read_lock();
+ *	p = get_ptr();
+ *	if (p && !atomic_inc_not_zero(&p->refcnt))
+ *		p = NULL;
+ *	rcu_read_unlock();
+ *	return p;
+ *
+ * put()
+ *	if (!atomic_dec_return(&->refcnt)) {
+ *		remove_ptr(p);
+ *		kfree_rcu((p, rcu);
+ *	}
+ *
+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
+ * O(N^2) behaviour under contention with N concurrent operations.
+ *
+ * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
+ * better under contention.
+ *
+ * Why not refcount?
+ * =================
+ *
+ * In principle it should be possible to make refcount use the rcuref
+ * scheme, but the destruction race described below cannot be prevented
+ * unless the protected object is RCU managed.
+ *
+ * Theory of operation
+ * ===================
+ *
+ * rcuref uses an unsigned integer reference counter. As long as the
+ * counter value is greater than or equal to RCUREF_ONEREF and not larger
+ * than RCUREF_MAXREF the reference is alive:
+ *
+ * ONEREF   MAXREF               SATURATED             RELEASED      DEAD    NOREF
+ * 0        0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
+ * <---valid --------> <-------saturation zone-------> <-----dead zone----->
+ *
+ * The get() and put() operations do unconditional increments and
+ * decrements. The result is checked after the operation. This optimizes
+ * for the fast path.
+ *
+ * If the reference count is saturated or dead, then the increments and
+ * decrements are not harmful as the reference count still stays in the
+ * respective zones and is always set back to STATURATED resp. DEAD. The
+ * zones have room for 2^28 racing operations in each direction, which
+ * makes it practically impossible to escape the zones.
+ *
+ * Once the last reference is dropped the reference count becomes
+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
+ * slowpath then tries to set the reference count from RCUREF_NOREF to
+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
+ * concurrent rcuref_get() can acquire the reference count and bring it
+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
+ *
+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
+ * DEAD + 1, which is inside the dead zone. If that happens the reference
+ * count is put back to DEAD.
+ *
+ * The actual race is possible due to the unconditional increment and
+ * decrements in rcuref_get() and rcuref_put():
+ *
+ *	T1				T2
+ *	get()				put()
+ *					if (atomic_add_negative(-1, &ref->refcnt))
+ *		succeeds->			atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
+ *
+ *	atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
+ *
+ * As the result of T1's add is negative, the get() goes into the slow path
+ * and observes refcnt being in the dead zone which makes the operation fail.
+ *
+ * Possible critical states:
+ *
+ *	Context Counter	References	Operation
+ *	T1	0	1		init()
+ *	T2	1	2		get()
+ *	T1	0	1		put()
+ *	T2     -1	0		put() tries to mark dead
+ *	T1	0	1		get()
+ *	T2	0	1		put() mark dead fails
+ *	T1     -1	0		put() tries to mark dead
+ *	T1    DEAD	0		put() mark dead succeeds
+ *	T2    DEAD+1	0		get() fails and puts it back to DEAD
+ *
+ * Of course there are more complex scenarios, but the above illustrates
+ * the working principle. The rest is left to the imagination of the
+ * reader.
+ *
+ * Deconstruction race
+ * ===================
+ *
+ * The release operation must be protected by prohibiting a grace period in
+ * order to prevent a possible use after free:
+ *
+ *	T1				T2
+ *	put()				get()
+ *	// ref->refcnt = ONEREF
+ *	if (!atomic_add_negative(-1, &ref->refcnt))
+ *		return false;				<- Not taken
+ *
+ *	// ref->refcnt == NOREF
+ *	--> preemption
+ *					// Elevates ref->refcnt to ONEREF
+ *					if (!atomic_add_negative(1, &ref->refcnt))
+ *						return true;			<- taken
+ *
+ *					if (put(&p->ref)) { <-- Succeeds
+ *						remove_pointer(p);
+ *						kfree_rcu(p, rcu);
+ *					}
+ *
+ *		RCU grace period ends, object is freed
+ *
+ *	atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
+ *
+ * This is prevented by disabling preemption around the put() operation as
+ * that's in most kernel configurations cheaper than a rcu_read_lock() /
+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
+ * prevents the grace period which keeps the object alive until all put()
+ * operations complete.
+ *
+ * Saturation protection
+ * =====================
+ *
+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
+ * Once this is exceedded the reference count becomes stale by setting it
+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
+ * wrap arounds which obviously cause worse problems than a memory
+ * leak. When saturation is reached a warning is emitted.
+ *
+ * Race conditions
+ * ===============
+ *
+ * All reference count increment/decrement operations are unconditional and
+ * only verified after the fact. This optimizes for the good case and takes
+ * the occasional race vs. a dead or already saturated refcount into
+ * account. The saturation and dead zones are large enough to accomodate
+ * for that.
+ *
+ * Memory ordering
+ * ===============
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object to increase the
+ * reference count on will provide the ordering. For locked data
+ * structures, its the lock acquire, for RCU/lockless data structures its
+ * the dependent load.
+ *
+ * rcuref_get() provides a control dependency ordering future stores which
+ * ensures that the object is not modified when acquiring a reference
+ * fails.
+ *
+ * rcuref_put() provides release order, i.e. all prior loads and stores
+ * will be issued before. It also provides a control dependency ordering
+ * against the subsequent destruction of the object.
+ *
+ * If rcuref_put() successfully dropped the last reference and marked the
+ * object DEAD it also provides acquire ordering.
+ */
+
+#include <linux/export.h>
+#include <linux/rcuref.h>
+
+/**
+ * rcuref_get_slowpath - Slowpath of rcuref_get()
+ * @ref:	Pointer to the reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	False if the reference count was already marked dead
+ *
+ *	True if the reference count is saturated, which prevents the
+ *	object from being deconstructed ever.
+ */
+bool rcuref_get_slowpath(rcuref_t *ref)
+{
+	unsigned int cnt = atomic_read(&ref->refcnt);
+
+	/*
+	 * If the reference count was already marked dead, undo the
+	 * increment so it stays in the middle of the dead zone and return
+	 * fail.
+	 */
+	if (cnt >= RCUREF_RELEASED) {
+		atomic_set(&ref->refcnt, RCUREF_DEAD);
+		return false;
+	}
+
+	/*
+	 * If it was saturated, warn and mark it so. In case the increment
+	 * was already on a saturated value restore the saturation
+	 * marker. This keeps it in the middle of the saturation zone and
+	 * prevents the reference count from overflowing. This leaks the
+	 * object memory, but prevents the obvious reference count overflow
+	 * damage.
+	 */
+	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
+		atomic_set(&ref->refcnt, RCUREF_SATURATED);
+	return true;
+}
+EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
+
+/**
+ * rcuref_put_slowpath - Slowpath of __rcuref_put()
+ * @ref:	Pointer to the reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+bool rcuref_put_slowpath(rcuref_t *ref)
+{
+	unsigned int cnt = atomic_read(&ref->refcnt);
+
+	/* Did this drop the last reference? */
+	if (likely(cnt == RCUREF_NOREF)) {
+		/*
+		 * Carefully try to set the reference count to RCUREF_DEAD.
+		 *
+		 * This can fail if a concurrent get() operation has
+		 * elevated it again or the corresponding put() even marked
+		 * it dead already. Both are valid situations and do not
+		 * require a retry. If this fails the caller is not
+		 * allowed to deconstruct the object.
+		 */
+		if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
+			return false;
+
+		/*
+		 * The caller can safely schedule the object for
+		 * deconstruction. Provide acquire ordering.
+		 */
+		smp_acquire__after_ctrl_dep();
+		return true;
+	}
+
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
+		atomic_set(&ref->refcnt, RCUREF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > RCUREF_MAXREF)
+		atomic_set(&ref->refcnt, RCUREF_SATURATED);
+	return false;
+}
+EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
-- 
cgit v1.2.3


From d2c48b2387eb89e0bf2a2e06e30987cf410acad4 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 16 Feb 2023 09:49:19 +0100
Subject: firmware: arm_sdei: Fix sleep from invalid context BUG

Running a preempt-rt (v6.2-rc3-rt1) based kernel on an Ampere Altra
triggers:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
  in_atomic(): 0, irqs_disabled(): 128, non_block: 0, pid: 24, name: cpuhp/0
  preempt_count: 0, expected: 0
  RCU nest depth: 0, expected: 0
  3 locks held by cpuhp/0/24:
    #0: ffffda30217c70d0 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x5c/0x248
    #1: ffffda30217c7120 (cpuhp_state-up){+.+.}-{0:0}, at: cpuhp_thread_fun+0x5c/0x248
    #2: ffffda3021c711f0 (sdei_list_lock){....}-{3:3}, at: sdei_cpuhp_up+0x3c/0x130
  irq event stamp: 36
  hardirqs last  enabled at (35): [<ffffda301e85b7bc>] finish_task_switch+0xb4/0x2b0
  hardirqs last disabled at (36): [<ffffda301e812fec>] cpuhp_thread_fun+0x21c/0x248
  softirqs last  enabled at (0): [<ffffda301e80b184>] copy_process+0x63c/0x1ac0
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  CPU: 0 PID: 24 Comm: cpuhp/0 Not tainted 5.19.0-rc3-rt5-[...]
  Hardware name: WIWYNN Mt.Jade Server [...]
  Call trace:
    dump_backtrace+0x114/0x120
    show_stack+0x20/0x70
    dump_stack_lvl+0x9c/0xd8
    dump_stack+0x18/0x34
    __might_resched+0x188/0x228
    rt_spin_lock+0x70/0x120
    sdei_cpuhp_up+0x3c/0x130
    cpuhp_invoke_callback+0x250/0xf08
    cpuhp_thread_fun+0x120/0x248
    smpboot_thread_fn+0x280/0x320
    kthread+0x130/0x140
    ret_from_fork+0x10/0x20

sdei_cpuhp_up() is called in the STARTING hotplug section,
which runs with interrupts disabled. Use a CPUHP_AP_ONLINE_DYN entry
instead to execute the cpuhp cb later, with preemption enabled.

SDEI originally got its own cpuhp slot to allow interacting
with perf. It got superseded by pNMI and this early slot is not
relevant anymore. [1]

Some SDEI calls (e.g. SDEI_1_0_FN_SDEI_PE_MASK) take actions on the
calling CPU. It is checked that preemption is disabled for them.
_ONLINE cpuhp cb are executed in the 'per CPU hotplug thread'.
Preemption is enabled in those threads, but their cpumask is limited
to 1 CPU.
Move 'WARN_ON_ONCE(preemptible())' statements so that SDEI cpuhp cb
don't trigger them.

Also add a check for the SDEI_1_0_FN_SDEI_PRIVATE_RESET SDEI call
which acts on the calling CPU.

[1]:
https://lore.kernel.org/all/5813b8c5-ae3e-87fd-fccc-94c9cd08816d@arm.com/

Suggested-by: James Morse <james.morse@arm.com>
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: James Morse <james.morse@arm.com>
Link: https://lore.kernel.org/r/20230216084920.144064-1-pierre.gondois@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/firmware/arm_sdei.c | 37 ++++++++++++++++++++-----------------
 include/linux/cpuhotplug.h  |  1 -
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index 1e1a51510e83..f9040bd61081 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -43,6 +43,8 @@ static asmlinkage void (*sdei_firmware_call)(unsigned long function_id,
 /* entry point from firmware to arch asm code */
 static unsigned long sdei_entry_point;
 
+static int sdei_hp_state;
+
 struct sdei_event {
 	/* These three are protected by the sdei_list_lock */
 	struct list_head	list;
@@ -301,8 +303,6 @@ int sdei_mask_local_cpu(void)
 {
 	int err;
 
-	WARN_ON_ONCE(preemptible());
-
 	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_MASK, 0, 0, 0, 0, 0, NULL);
 	if (err && err != -EIO) {
 		pr_warn_once("failed to mask CPU[%u]: %d\n",
@@ -315,6 +315,7 @@ int sdei_mask_local_cpu(void)
 
 static void _ipi_mask_cpu(void *ignored)
 {
+	WARN_ON_ONCE(preemptible());
 	sdei_mask_local_cpu();
 }
 
@@ -322,8 +323,6 @@ int sdei_unmask_local_cpu(void)
 {
 	int err;
 
-	WARN_ON_ONCE(preemptible());
-
 	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0, NULL);
 	if (err && err != -EIO) {
 		pr_warn_once("failed to unmask CPU[%u]: %d\n",
@@ -336,6 +335,7 @@ int sdei_unmask_local_cpu(void)
 
 static void _ipi_unmask_cpu(void *ignored)
 {
+	WARN_ON_ONCE(preemptible());
 	sdei_unmask_local_cpu();
 }
 
@@ -343,6 +343,8 @@ static void _ipi_private_reset(void *ignored)
 {
 	int err;
 
+	WARN_ON_ONCE(preemptible());
+
 	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0,
 			     NULL);
 	if (err && err != -EIO)
@@ -389,8 +391,6 @@ static void _local_event_enable(void *data)
 	int err;
 	struct sdei_crosscall_args *arg = data;
 
-	WARN_ON_ONCE(preemptible());
-
 	err = sdei_api_event_enable(arg->event->event_num);
 
 	sdei_cross_call_return(arg, err);
@@ -479,8 +479,6 @@ static void _local_event_unregister(void *data)
 	int err;
 	struct sdei_crosscall_args *arg = data;
 
-	WARN_ON_ONCE(preemptible());
-
 	err = sdei_api_event_unregister(arg->event->event_num);
 
 	sdei_cross_call_return(arg, err);
@@ -561,8 +559,6 @@ static void _local_event_register(void *data)
 	struct sdei_registered_event *reg;
 	struct sdei_crosscall_args *arg = data;
 
-	WARN_ON(preemptible());
-
 	reg = per_cpu_ptr(arg->event->private_registered, smp_processor_id());
 	err = sdei_api_event_register(arg->event->event_num, sdei_entry_point,
 				      reg, 0, 0);
@@ -717,6 +713,8 @@ static int sdei_pm_notifier(struct notifier_block *nb, unsigned long action,
 {
 	int rv;
 
+	WARN_ON_ONCE(preemptible());
+
 	switch (action) {
 	case CPU_PM_ENTER:
 		rv = sdei_mask_local_cpu();
@@ -765,7 +763,7 @@ static int sdei_device_freeze(struct device *dev)
 	int err;
 
 	/* unregister private events */
-	cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING);
+	cpuhp_remove_state(sdei_entry_point);
 
 	err = sdei_unregister_shared();
 	if (err)
@@ -786,12 +784,15 @@ static int sdei_device_thaw(struct device *dev)
 		return err;
 	}
 
-	err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI",
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SDEI",
 				&sdei_cpuhp_up, &sdei_cpuhp_down);
-	if (err)
+	if (err < 0) {
 		pr_warn("Failed to re-register CPU hotplug notifier...\n");
+		return err;
+	}
 
-	return err;
+	sdei_hp_state = err;
+	return 0;
 }
 
 static int sdei_device_restore(struct device *dev)
@@ -823,7 +824,7 @@ static int sdei_reboot_notifier(struct notifier_block *nb, unsigned long action,
 	 * We are going to reset the interface, after this there is no point
 	 * doing work when we take CPUs offline.
 	 */
-	cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING);
+	cpuhp_remove_state(sdei_hp_state);
 
 	sdei_platform_reset();
 
@@ -1003,13 +1004,15 @@ static int sdei_probe(struct platform_device *pdev)
 		goto remove_cpupm;
 	}
 
-	err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI",
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SDEI",
 				&sdei_cpuhp_up, &sdei_cpuhp_down);
-	if (err) {
+	if (err < 0) {
 		pr_warn("Failed to register CPU hotplug notifier...\n");
 		goto remove_reboot;
 	}
 
+	sdei_hp_state = err;
+
 	return 0;
 
 remove_reboot:
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index c6fab004104a..5066fa20e3e9 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -163,7 +163,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_CSTATE_STARTING,
 	CPUHP_AP_PERF_XTENSA_STARTING,
 	CPUHP_AP_MIPS_OP_LOONGSON3_STARTING,
-	CPUHP_AP_ARM_SDEI_STARTING,
 	CPUHP_AP_ARM_VFP_STARTING,
 	CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
 	CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,
-- 
cgit v1.2.3


From 76d0de5729c0569c4071e7f21fcab394e502f03a Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 2 Feb 2023 00:56:01 +0900
Subject: fprobe: Pass entry_data to handlers

Pass the private entry_data to the entry and exit handlers so that
they can share the context data, something like saved function
arguments etc.
User must specify the private entry_data size by @entry_data_size
field before registering the fprobe.

Link: https://lkml.kernel.org/r/167526696173.433354.17408372048319432574.stgit@mhiramat.roam.corp.google.com

Cc: Florent Revest <revest@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/fprobe.h          |  8 ++++++--
 kernel/trace/bpf_trace.c        |  2 +-
 kernel/trace/fprobe.c           | 21 ++++++++++++++-------
 lib/test_fprobe.c               |  6 ++++--
 samples/fprobe/fprobe_example.c |  6 ++++--
 5 files changed, 29 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 1c2bde0ead73..e0d4e6136249 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -13,6 +13,7 @@
  * @nmissed: The counter for missing events.
  * @flags: The status flag.
  * @rethook: The rethook data structure. (internal data)
+ * @entry_data_size: The private data storage size.
  * @entry_handler: The callback function for function entry.
  * @exit_handler: The callback function for function exit.
  */
@@ -29,9 +30,12 @@ struct fprobe {
 	unsigned long		nmissed;
 	unsigned int		flags;
 	struct rethook		*rethook;
+	size_t			entry_data_size;
 
-	void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
-	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
+	void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
+			      struct pt_regs *regs, void *entry_data);
+	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
+			     struct pt_regs *regs, void *entry_data);
 };
 
 /* This fprobe is soft-disabled. */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e8da032bb6fc..fa403c323501 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2646,7 +2646,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 
 static void
 kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
-			  struct pt_regs *regs)
+			  struct pt_regs *regs, void *data)
 {
 	struct bpf_kprobe_multi_link *link;
 
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index e8143e368074..fa25d09c9d57 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -17,14 +17,16 @@
 struct fprobe_rethook_node {
 	struct rethook_node node;
 	unsigned long entry_ip;
+	char data[];
 };
 
 static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 			   struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
 	struct fprobe_rethook_node *fpr;
-	struct rethook_node *rh;
+	struct rethook_node *rh = NULL;
 	struct fprobe *fp;
+	void *entry_data = NULL;
 	int bit;
 
 	fp = container_of(ops, struct fprobe, ops);
@@ -37,9 +39,6 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 		return;
 	}
 
-	if (fp->entry_handler)
-		fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
-
 	if (fp->exit_handler) {
 		rh = rethook_try_get(fp->rethook);
 		if (!rh) {
@@ -48,9 +47,16 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 		}
 		fpr = container_of(rh, struct fprobe_rethook_node, node);
 		fpr->entry_ip = ip;
-		rethook_hook(rh, ftrace_get_regs(fregs), true);
+		if (fp->entry_data_size)
+			entry_data = fpr->data;
 	}
 
+	if (fp->entry_handler)
+		fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
+
+	if (rh)
+		rethook_hook(rh, ftrace_get_regs(fregs), true);
+
 out:
 	ftrace_test_recursion_unlock(bit);
 }
@@ -81,7 +87,8 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
 
 	fpr = container_of(rh, struct fprobe_rethook_node, node);
 
-	fp->exit_handler(fp, fpr->entry_ip, regs);
+	fp->exit_handler(fp, fpr->entry_ip, regs,
+			 fp->entry_data_size ? (void *)fpr->data : NULL);
 }
 NOKPROBE_SYMBOL(fprobe_exit_handler);
 
@@ -146,7 +153,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 	for (i = 0; i < size; i++) {
 		struct fprobe_rethook_node *node;
 
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
 		if (!node) {
 			rethook_free(fp->rethook);
 			fp->rethook = NULL;
diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
index 1fb56cf5e5ce..e4f65d114ed2 100644
--- a/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@ -30,7 +30,8 @@ static noinline u32 fprobe_selftest_target2(u32 value)
 	return (value / div_factor) + 1;
 }
 
-static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip,
+				     struct pt_regs *regs, void *data)
 {
 	KUNIT_EXPECT_FALSE(current_test, preemptible());
 	/* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */
@@ -39,7 +40,8 @@ static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct
 	entry_val = (rand1 / div_factor);
 }
 
-static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
+				    struct pt_regs *regs, void *data)
 {
 	unsigned long ret = regs_return_value(regs);
 
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index e22da8573116..dd794990ad7e 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -48,7 +48,8 @@ static void show_backtrace(void)
 	stack_trace_print(stacks, len, 24);
 }
 
-static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static void sample_entry_handler(struct fprobe *fp, unsigned long ip,
+				 struct pt_regs *regs, void *data)
 {
 	if (use_trace)
 		/*
@@ -63,7 +64,8 @@ static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_
 		show_backtrace();
 }
 
-static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs,
+				void *data)
 {
 	unsigned long rip = instruction_pointer(regs);
 
-- 
cgit v1.2.3


From 59a7a298565aa0ce44ce8e4fbcbb89a19730013a Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 2 Feb 2023 00:56:19 +0900
Subject: fprobe: Add nr_maxactive to specify rethook_node pool size

Add nr_maxactive to specify rethook_node pool size. This means
the maximum number of actively running target functions concurrently
for probing by exit_handler. Note that if the running function is
preempted or sleep, it is still counted as 'active'.

Link: https://lkml.kernel.org/r/167526697917.433354.17779774988245113106.stgit@mhiramat.roam.corp.google.com

Cc: Florent Revest <revest@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/fprobe.h | 2 ++
 kernel/trace/fprobe.c  | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index e0d4e6136249..678f741a7b33 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -14,6 +14,7 @@
  * @flags: The status flag.
  * @rethook: The rethook data structure. (internal data)
  * @entry_data_size: The private data storage size.
+ * @nr_maxactive: The max number of active functions.
  * @entry_handler: The callback function for function entry.
  * @exit_handler: The callback function for function exit.
  */
@@ -31,6 +32,7 @@ struct fprobe {
 	unsigned int		flags;
 	struct rethook		*rethook;
 	size_t			entry_data_size;
+	int			nr_maxactive;
 
 	void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
 			      struct pt_regs *regs, void *entry_data);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index fa25d09c9d57..f222848571f2 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -143,7 +143,10 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 	}
 
 	/* Initialize rethook if needed */
-	size = num * num_possible_cpus() * 2;
+	if (fp->nr_maxactive)
+		size = fp->nr_maxactive;
+	else
+		size = num * num_possible_cpus() * 2;
 	if (size < 0)
 		return -E2BIG;
 
-- 
cgit v1.2.3


From 39d954200bf6ad503c722e44d0be80c7b826fa42 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 2 Feb 2023 00:56:38 +0900
Subject: fprobe: Skip exit_handler if entry_handler returns !0

Skip hooking function return and calling exit_handler if the
entry_handler() returns !0.

Link: https://lkml.kernel.org/r/167526699798.433354.10998365726830117303.stgit@mhiramat.roam.corp.google.com

Cc: Florent Revest <revest@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/fprobe.h          |  4 ++--
 kernel/trace/bpf_trace.c        | 15 +++++++++++++--
 kernel/trace/fprobe.c           | 14 +++++++++-----
 lib/test_fprobe.c               |  7 +++++--
 samples/fprobe/fprobe_example.c |  5 +++--
 5 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 678f741a7b33..47fefc7f363b 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -34,8 +34,8 @@ struct fprobe {
 	size_t			entry_data_size;
 	int			nr_maxactive;
 
-	void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
-			      struct pt_regs *regs, void *entry_data);
+	int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
+			     struct pt_regs *regs, void *entry_data);
 	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
 			     struct pt_regs *regs, void *entry_data);
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa403c323501..d804172b709c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2644,12 +2644,23 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	return err;
 }
 
-static void
+static int
 kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
 			  struct pt_regs *regs, void *data)
 {
 	struct bpf_kprobe_multi_link *link;
 
+	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+	kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+	return 0;
+}
+
+static void
+kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+			       struct pt_regs *regs, void *data)
+{
+	struct bpf_kprobe_multi_link *link;
+
 	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
 	kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
 }
@@ -2848,7 +2859,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		goto error;
 
 	if (flags & BPF_F_KPROBE_MULTI_RETURN)
-		link->fp.exit_handler = kprobe_multi_link_handler;
+		link->fp.exit_handler = kprobe_multi_link_exit_handler;
 	else
 		link->fp.entry_handler = kprobe_multi_link_handler;
 
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index f222848571f2..9abb3905bc8e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -27,7 +27,7 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 	struct rethook_node *rh = NULL;
 	struct fprobe *fp;
 	void *entry_data = NULL;
-	int bit;
+	int bit, ret;
 
 	fp = container_of(ops, struct fprobe, ops);
 	if (fprobe_disabled(fp))
@@ -52,11 +52,15 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 	}
 
 	if (fp->entry_handler)
-		fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
-
-	if (rh)
-		rethook_hook(rh, ftrace_get_regs(fregs), true);
+		ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
 
+	/* If entry_handler returns !0, nmissed is not counted. */
+	if (rh) {
+		if (ret)
+			rethook_recycle(rh);
+		else
+			rethook_hook(rh, ftrace_get_regs(fregs), true);
+	}
 out:
 	ftrace_test_recursion_unlock(bit);
 }
diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
index 4b37d7022f35..9fa2ac9eda83 100644
--- a/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@ -37,7 +37,7 @@ static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
 	return nest(value + 2);
 }
 
-static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip,
+static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
 				     struct pt_regs *regs, void *data)
 {
 	KUNIT_EXPECT_FALSE(current_test, preemptible());
@@ -51,6 +51,8 @@ static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip,
 			*(u32 *)data = entry_val;
 	} else
 		KUNIT_EXPECT_NULL(current_test, data);
+
+	return 0;
 }
 
 static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
@@ -74,10 +76,11 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
 		KUNIT_EXPECT_NULL(current_test, data);
 }
 
-static notrace void nest_entry_handler(struct fprobe *fp, unsigned long ip,
+static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
 				     struct pt_regs *regs, void *data)
 {
 	KUNIT_EXPECT_FALSE(current_test, preemptible());
+	return 0;
 }
 
 static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index dd794990ad7e..4efc8feb6277 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -48,8 +48,8 @@ static void show_backtrace(void)
 	stack_trace_print(stacks, len, 24);
 }
 
-static void sample_entry_handler(struct fprobe *fp, unsigned long ip,
-				 struct pt_regs *regs, void *data)
+static int sample_entry_handler(struct fprobe *fp, unsigned long ip,
+				struct pt_regs *regs, void *data)
 {
 	if (use_trace)
 		/*
@@ -62,6 +62,7 @@ static void sample_entry_handler(struct fprobe *fp, unsigned long ip,
 	nhit++;
 	if (stackdump)
 		show_backtrace();
+	return 0;
 }
 
 static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs,
-- 
cgit v1.2.3


From d155df53f31068c3340733d586eb9b3ddfd70fc5 Mon Sep 17 00:00:00 2001
From: Ma Wupeng <mawupeng1@huawei.com>
Date: Fri, 17 Feb 2023 10:56:15 +0800
Subject: x86/mm/pat: clear VM_PAT if copy_p4d_range failed

Syzbot reports a warning in untrack_pfn().  Digging into the root we found
that this is due to memory allocation failure in pmd_alloc_one.  And this
failure is produced due to failslab.

In copy_page_range(), memory alloaction for pmd failed.  During the error
handling process in copy_page_range(), mmput() is called to remove all
vmas.  While untrack_pfn this empty pfn, warning happens.

Here's a simplified flow:

dup_mm
  dup_mmap
    copy_page_range
      copy_p4d_range
        copy_pud_range
          copy_pmd_range
            pmd_alloc
              __pmd_alloc
                pmd_alloc_one
                  page = alloc_pages(gfp, 0);
                    if (!page)
                      return NULL;
    mmput
        exit_mmap
          unmap_vmas
            unmap_single_vma
              untrack_pfn
                follow_phys
                  WARN_ON_ONCE(1);

Since this vma is not generate successfully, we can clear flag VM_PAT.  In
this case, untrack_pfn() will not be called while cleaning this vma.

Function untrack_pfn_moved() has also been renamed to fit the new logic.

Link: https://lkml.kernel.org/r/20230217025615.1595558-1-mawupeng1@huawei.com
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reported-by: <syzbot+5f488e922d047d8f00cc@syzkaller.appspotmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/pat/memtype.c | 12 ++++++++----
 include/linux/pgtable.h   |  7 ++++---
 mm/memory.c               |  1 +
 mm/mremap.c               |  2 +-
 4 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 46a00aa858b6..de10800cd4dd 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1073,11 +1073,15 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 }
 
 /*
- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
- * with the old vma after its pfnmap page table has been removed.  The new
- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ * untrack_pfn_clear is called if the following situation fits:
+ *
+ * 1) while mremapping a pfnmap for a new region,  with the old vma after
+ * its pfnmap page table has been removed.  The new vma has a new pfnmap
+ * to the same pfn & cache type with VM_PAT set.
+ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+ * old vma.
  */
-void untrack_pfn_moved(struct vm_area_struct *vma)
+void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);
 }
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c63cd44777ec..9dc936bc77d1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1191,9 +1191,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
 }
 
 /*
- * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
+ * untrack_pfn_clear is called while mremapping a pfnmap for a new region
+ * or fails to copy pgtable during duplicate vm area.
  */
-static inline void untrack_pfn_moved(struct vm_area_struct *vma)
+static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 }
 #else
@@ -1205,7 +1206,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 extern int track_pfn_copy(struct vm_area_struct *vma);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 			unsigned long size, bool mm_wr_locked);
-extern void untrack_pfn_moved(struct vm_area_struct *vma);
+extern void untrack_pfn_clear(struct vm_area_struct *vma);
 #endif
 
 #ifdef CONFIG_MMU
diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049c..bfa3100ec5a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,6 +1290,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 			continue;
 		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
 					    addr, next))) {
+			untrack_pfn_clear(dst_vma);
 			ret = -ENOMEM;
 			break;
 		}
diff --git a/mm/mremap.c b/mm/mremap.c
index 411a85682b58..1ddf7beb62e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -683,7 +683,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
 	/* Tell pfnmap has moved from this vma */
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_moved(vma);
+		untrack_pfn_clear(vma);
 
 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
-- 
cgit v1.2.3


From 9a52b2f32a0942047348b30f866b846da5fcf4e3 Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" <talumbau@google.com>
Date: Tue, 14 Feb 2023 03:54:44 +0000
Subject: mm: multi-gen LRU: clean up sysfs code

This patch cleans up the sysfs code. Specifically,
  1. use sysfs_emit(),
  2. use __ATTR_RW(), and
  3. constify multi-gen LRU struct attribute_group.

Link: https://lkml.kernel.org/r/20230214035445.1250139-1-talumbau@google.com
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  2 +-
 mm/vmscan.c            | 22 +++++++++-------------
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..bf8786d45b31 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1369,7 +1369,7 @@ typedef struct pglist_data {
 
 #ifdef CONFIG_LRU_GEN
 	/* kswap mm walk data */
-	struct lru_gen_mm_walk	mm_walk;
+	struct lru_gen_mm_walk mm_walk;
 	/* lru_gen_folio list */
 	struct lru_gen_memcg memcg_lru;
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..7ac7d1ec13e0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5663,14 +5663,14 @@ unlock:
  *                          sysfs interface
  ******************************************************************************/
 
-static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
 }
 
 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
-static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
-			     const char *buf, size_t len)
+static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+				const char *buf, size_t len)
 {
 	unsigned int msecs;
 
@@ -5682,11 +5682,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
 	return len;
 }
 
-static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
-	min_ttl_ms, 0644, show_min_ttl, store_min_ttl
-);
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
 
-static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	unsigned int caps = 0;
 
@@ -5703,7 +5701,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 }
 
 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
-static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
 			     const char *buf, size_t len)
 {
 	int i;
@@ -5730,9 +5728,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
 	return len;
 }
 
-static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
-	enabled, 0644, show_enabled, store_enabled
-);
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
 
 static struct attribute *lru_gen_attrs[] = {
 	&lru_gen_min_ttl_attr.attr,
@@ -5740,7 +5736,7 @@ static struct attribute *lru_gen_attrs[] = {
 	NULL
 };
 
-static struct attribute_group lru_gen_attr_group = {
+static const struct attribute_group lru_gen_attr_group = {
 	.name = "lru_gen",
 	.attrs = lru_gen_attrs,
 };
-- 
cgit v1.2.3


From aa464ba9a1e444d5ef95bb63ee3b2ef26fc96ed7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Feb 2023 17:18:34 +1000
Subject: lazy tlb: introduce lazy tlb mm refcount helper functions

Add explicit _lazy_tlb annotated functions for lazy tlb mm refcounting.
This makes the lazy tlb mm references more obvious, and allows the
refcounting scheme to be modified in later changes.  There is no
functional change with this patch.

Link: https://lkml.kernel.org/r/20230203071837.1136453-3-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  2 +-
 include/linux/sched/mm.h             | 16 ++++++++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 12 ++++++++++--
 kernel/sched/core.c                  | 15 ++++++++-------
 9 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 53813f9464a2..c30df1097c52 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6b90f10a6c81..7db6b3faea65 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1611,7 +1611,7 @@ void start_secondary(void *unused)
 	if (IS_ENABLED(CONFIG_PPC32))
 		setup_kup();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index e50bc5fc7ddf..ce804b7bf84e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -797,10 +797,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	/*
diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..87cf3a2f0e9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1034,7 +1034,7 @@ static int exec_mmap(struct mm_struct *mm)
 		mmput(old_mm);
 		return 0;
 	}
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..5376caf6fcf3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -79,6 +79,22 @@ static inline void mmdrop_sched(struct mm_struct *mm)
 }
 #endif
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
+static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
+{
+	mmdrop_sched(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6c0a92ca6bb5..189895288d9d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -623,7 +623,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f2afdb0add7c..86902cb5ab78 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -537,7 +537,7 @@ static void exit_mm(void)
 		return;
 	sync_mm_rss(mm);
 	mmap_read_lock(mm);
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1f1b60f1a746..470708c205e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1415,6 +1415,11 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	/*
+	 * It is possible for mm to be the same as tsk->active_mm, but
+	 * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+	 * because these references are not equivalent.
+	 */
 	mmgrab(mm);
 
 	task_lock(tsk);
@@ -1438,9 +1443,9 @@ void kthread_use_mm(struct mm_struct *mm)
 	 * memory barrier after storing to tsk->mm, before accessing
 	 * user-space memory. A full memory barrier for membarrier
 	 * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
-	 * mmdrop().
+	 * mmdrop_lazy_tlb().
 	 */
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 }
 EXPORT_SYMBOL_GPL(kthread_use_mm);
 
@@ -1468,10 +1473,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	local_irq_disable();
 	tsk->mm = NULL;
 	membarrier_update_current_mm(NULL);
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..143e46bd2a68 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5203,13 +5203,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, so provide them here:
 	 *
 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
+	 *   provided by mmdrop_lazy_tlb(),
 	 * - a sync_core for SYNC_CORE.
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop_sched(mm);
+		mmdrop_lazy_tlb_sched(mm);
 	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -5266,9 +5267,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -5276,7 +5277,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -5293,7 +5294,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		lru_gen_use_mm(next->mm);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -9935,7 +9936,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
cgit v1.2.3


From 88e3009b5283bbd41447f0352d0b9df16cf6f183 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Feb 2023 17:18:35 +1000
Subject: lazy tlb: allow lazy tlb mm refcounting to be configurable

Add CONFIG_MMU_TLB_REFCOUNT which enables refcounting of the lazy tlb mm
when it is context switched.  This can be disabled by architectures that
don't require this refcounting if they clean up lazy tlb mms when the last
refcount is dropped.  Currently this is always enabled, so the patch
introduces no functional change.

Link: https://lkml.kernel.org/r/20230203071837.1136453-4-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/active_mm.rst |  6 ++++++
 arch/Kconfig                   | 17 +++++++++++++++++
 include/linux/sched/mm.h       | 18 +++++++++++++++---
 3 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst
index 45d89f8fb3a8..d096fc091e23 100644
--- a/Documentation/mm/active_mm.rst
+++ b/Documentation/mm/active_mm.rst
@@ -2,6 +2,12 @@
 Active MM
 =========
 
+Note, the mm_count refcount may no longer include the "lazy" users
+(running tasks with ->active_mm == mm && ->mm == NULL) on kernels
+with CONFIG_MMU_LAZY_TLB_REFCOUNT=n. Taking and releasing these lazy
+references must be done with mmgrab_lazy_tlb() and mmdrop_lazy_tlb()
+helpers, which abstract this config option.
+
 ::
 
  List:       linux-kernel
diff --git a/arch/Kconfig b/arch/Kconfig
index e3511afbb7f2..643f8a04141e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -465,6 +465,23 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# MMU_LAZY_TLB_REFCOUNT=n can improve the scalability of context switching
+# to/from kernel threads when the same mm is running on a lot of CPUs (a large
+# multi-threaded application), by reducing contention on the mm refcount.
+#
+# This can be disabled if the architecture ensures no CPUs are using an mm as a
+# "lazy tlb" beyond its final refcount (i.e., by the time __mmdrop frees the mm
+# or its kernel page tables). This could be arranged by arch_exit_mmap(), or
+# final exit(2) TLB flush, for example.
+#
+# To implement this, an arch *must*:
+# Ensure the _lazy_tlb variants of mmgrab/mmdrop are used when manipulating
+# the lazy tlb reference of a kthread's ->active_mm (non-arch code has been
+# converted already).
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5376caf6fcf3..689dbe812563 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -82,17 +82,29 @@ static inline void mmdrop_sched(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment finish_task_switch which relies on this.
+		 */
+		smp_mb();
+	}
 }
 
 static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
 {
-	mmdrop_sched(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmdrop_sched(mm);
+	else
+		smp_mb(); /* see mmdrop_lazy_tlb() above */
 }
 
 /**
-- 
cgit v1.2.3


From 739100c88f49a67c6487bb2d826b0b5a2ddc80e2 Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Fri, 10 Feb 2023 13:46:45 -0800
Subject: mm: add tracepoints to ksm

This adds the following tracepoints to ksm:
- start / stop scan
- ksm enter / exit
- merge a page
- merge a page with ksm
- remove a page
- remove a rmap item

This patch has been split off from the RFC patch series "mm:
process/cgroup ksm support".

Link: https://lkml.kernel.org/r/20230210214645.2720847-1-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                |   1 +
 include/trace/events/ksm.h | 251 +++++++++++++++++++++++++++++++++++++++++++++
 mm/ksm.c                   |  21 +++-
 3 files changed, 271 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/ksm.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 1dc8bd26b6cf..a62ffb710b47 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13409,6 +13409,7 @@ F:	include/linux/memory_hotplug.h
 F:	include/linux/mm.h
 F:	include/linux/mmzone.h
 F:	include/linux/pagewalk.h
+F:	include/trace/events/ksm.h
 F:	mm/
 F:	tools/mm/
 F:	tools/testing/selftests/mm/
diff --git a/include/trace/events/ksm.h b/include/trace/events/ksm.h
new file mode 100644
index 000000000000..b5ac35c1d0e8
--- /dev/null
+++ b/include/trace/events/ksm.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ksm
+
+#if !defined(_TRACE_KSM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KSM_H
+
+#include <linux/tracepoint.h>
+
+/**
+ * ksm_scan_template - called for start / stop scan
+ *
+ * @seq:		sequence number of scan
+ * @rmap_entries:	actual number of rmap entries
+ *
+ * Allows to trace the start / stop of a ksm scan.
+ */
+DECLARE_EVENT_CLASS(ksm_scan_template,
+
+	TP_PROTO(int seq, u32 rmap_entries),
+
+	TP_ARGS(seq, rmap_entries),
+
+	TP_STRUCT__entry(
+		__field(int,	seq)
+		__field(u32,	rmap_entries)
+	),
+
+	TP_fast_assign(
+		__entry->seq		= seq;
+		__entry->rmap_entries	= rmap_entries;
+	),
+
+	TP_printk("seq %d rmap size %d",
+			__entry->seq, __entry->rmap_entries)
+);
+
+/**
+ * ksm_start_scan - called after a new ksm scan is started
+ *
+ * @seq:		sequence number of scan
+ * @rmap_entries:	actual number of rmap entries
+ *
+ * Allows to trace the start of a ksm scan.
+ */
+DEFINE_EVENT(ksm_scan_template, ksm_start_scan,
+
+	TP_PROTO(int seq, u32 rmap_entries),
+
+	TP_ARGS(seq, rmap_entries)
+);
+
+/**
+ * ksm_stop_scan - called after a new ksm scan has completed
+ *
+ * @seq:		sequence number of scan
+ * @rmap_entries:	actual number of rmap entries
+ *
+ * Allows to trace the completion of a ksm scan.
+ */
+DEFINE_EVENT(ksm_scan_template, ksm_stop_scan,
+
+	TP_PROTO(int seq, u32 rmap_entries),
+
+	TP_ARGS(seq, rmap_entries)
+);
+
+/**
+ * ksm_enter - called after a new process has been added / removed from ksm
+ *
+ * @mm:			address of the mm object of the process
+ *
+ * Allows to trace the when a process has been added or removed from ksm.
+ */
+DECLARE_EVENT_CLASS(ksm_enter_exit_template,
+
+	TP_PROTO(void *mm),
+
+	TP_ARGS(mm),
+
+	TP_STRUCT__entry(
+		__field(void *,		mm)
+	),
+
+	TP_fast_assign(
+		__entry->mm	= mm;
+	),
+
+	TP_printk("mm %p", __entry->mm)
+);
+
+/**
+ * ksm_enter - called after a new process has been added to ksm
+ *
+ * @mm:			address of the mm object of the process
+ *
+ * Allows to trace the when a process has been added to ksm.
+ */
+DEFINE_EVENT(ksm_enter_exit_template, ksm_enter,
+
+	TP_PROTO(void *mm),
+
+	TP_ARGS(mm)
+);
+
+/**
+ * ksm_exit - called after a new process has been removed from ksm
+ *
+ * @mm:			address of the mm object of the process
+ *
+ * Allows to trace the when a process has been removed from ksm.
+ */
+DEFINE_EVENT(ksm_enter_exit_template, ksm_exit,
+
+	TP_PROTO(void *mm),
+
+	TP_ARGS(mm)
+);
+
+/**
+ * ksm_merge_one_page - called after a page has been merged
+ *
+ * @pfn:		page frame number of ksm page
+ * @rmap_item:		address of rmap_item  object
+ * @mm:			address of the process mm struct
+ * @err:		success
+ *
+ * Allows to trace the ksm merging of individual pages.
+ */
+TRACE_EVENT(ksm_merge_one_page,
+
+	TP_PROTO(unsigned long pfn, void *rmap_item, void *mm, int err),
+
+	TP_ARGS(pfn, rmap_item, mm, err),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	pfn)
+		__field(void *,		rmap_item)
+		__field(void *,		mm)
+		__field(int,		err)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= pfn;
+		__entry->rmap_item	= rmap_item;
+		__entry->mm		= mm;
+		__entry->err		= err;
+	),
+
+	TP_printk("ksm pfn %lu rmap_item %p mm %p error %d",
+			__entry->pfn, __entry->rmap_item, __entry->mm, __entry->err)
+);
+
+/**
+ * ksm_merge_with_ksm_page - called after a page has been merged with a ksm page
+ *
+ * @ksm_page:		address ksm page
+ * @pfn:		page frame number of ksm page
+ * @rmap_item:		address of rmap_item  object
+ * @mm:			address of the mm object of the process
+ * @err:		success
+ *
+ * Allows to trace the merging of a page with a ksm page.
+ */
+TRACE_EVENT(ksm_merge_with_ksm_page,
+
+	TP_PROTO(void *ksm_page, unsigned long pfn, void *rmap_item, void *mm, int err),
+
+	TP_ARGS(ksm_page, pfn, rmap_item, mm, err),
+
+	TP_STRUCT__entry(
+		__field(void *,		ksm_page)
+		__field(unsigned long,	pfn)
+		__field(void *,		rmap_item)
+		__field(void *,		mm)
+		__field(int,		err)
+	),
+
+	TP_fast_assign(
+		__entry->ksm_page	= ksm_page;
+		__entry->pfn		= pfn;
+		__entry->rmap_item	= rmap_item;
+		__entry->mm		= mm;
+		__entry->err		= err;
+	),
+
+	TP_printk("%spfn %lu rmap_item %p mm %p error %d",
+		  (__entry->ksm_page ? "ksm " : ""),
+		  __entry->pfn, __entry->rmap_item, __entry->mm, __entry->err)
+);
+
+/**
+ * ksm_remove_ksm_page - called after a ksm page has been removed
+ *
+ * @pfn:		page frame number of ksm page
+ *
+ * Allows to trace the removing of stable ksm pages.
+ */
+TRACE_EVENT(ksm_remove_ksm_page,
+
+	TP_PROTO(unsigned long pfn),
+
+	TP_ARGS(pfn),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, pfn)
+	),
+
+	TP_fast_assign(
+		__entry->pfn = pfn;
+	),
+
+	TP_printk("pfn %lu", __entry->pfn)
+);
+
+/**
+ * ksm_remove_rmap_item - called after a rmap_item has been removed from the
+ *                        stable tree
+ *
+ * @pfn:		page frame number of ksm page
+ * @rmap_item:		address of rmap_item  object
+ * @mm:			address of the process mm struct
+ *
+ * Allows to trace the removal of pages from the stable tree list.
+ */
+TRACE_EVENT(ksm_remove_rmap_item,
+
+	TP_PROTO(unsigned long pfn, void *rmap_item, void *mm),
+
+	TP_ARGS(pfn, rmap_item, mm),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	pfn)
+		__field(void *,		rmap_item)
+		__field(void *,		mm)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= pfn;
+		__entry->rmap_item	= rmap_item;
+		__entry->mm		= mm;
+	),
+
+	TP_printk("pfn %lu rmap_item %p mm %p",
+			__entry->pfn, __entry->rmap_item, __entry->mm)
+);
+
+#endif /* _TRACE_KSM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 2b8d30068cbb..290a3eb6d8de 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -45,6 +45,9 @@
 #include "internal.h"
 #include "mm_slot.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ksm.h>
+
 #ifdef CONFIG_NUMA
 #define NUMA(x)		(x)
 #define DO_NUMA(x)	do { (x); } while (0)
@@ -633,10 +636,12 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
 	BUG_ON(stable_node->rmap_hlist_len < 0);
 
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
-		if (rmap_item->hlist.next)
+		if (rmap_item->hlist.next) {
 			ksm_pages_sharing--;
-		else
+			trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
+		} else {
 			ksm_pages_shared--;
+		}
 
 		rmap_item->mm->ksm_merging_pages--;
 
@@ -657,6 +662,7 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
 	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
 	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
 
+	trace_ksm_remove_ksm_page(stable_node->kpfn);
 	if (stable_node->head == &migrate_nodes)
 		list_del(&stable_node->list);
 	else
@@ -1324,6 +1330,8 @@ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
 	get_anon_vma(vma->anon_vma);
 out:
 	mmap_read_unlock(mm);
+	trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
+				rmap_item, mm, err);
 	return err;
 }
 
@@ -2142,6 +2150,9 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
 		if (vma) {
 			err = try_to_merge_one_page(vma, page,
 					ZERO_PAGE(rmap_item->address));
+			trace_ksm_merge_one_page(
+				page_to_pfn(ZERO_PAGE(rmap_item->address)),
+				rmap_item, mm, err);
 		} else {
 			/*
 			 * If the vma is out of date, we do not need to
@@ -2264,6 +2275,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	mm_slot = ksm_scan.mm_slot;
 	if (mm_slot == &ksm_mm_head) {
+		trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
+
 		/*
 		 * A number of pages can hang around indefinitely on per-cpu
 		 * pagevecs, raised page count preventing write_protect_page
@@ -2414,6 +2427,7 @@ no_vmas:
 	if (mm_slot != &ksm_mm_head)
 		goto next_mm;
 
+	trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
 	ksm_scan.seqnr++;
 	return NULL;
 }
@@ -2565,6 +2579,7 @@ int __ksm_enter(struct mm_struct *mm)
 	if (needs_wakeup)
 		wake_up_interruptible(&ksm_thread_wait);
 
+	trace_ksm_enter(mm);
 	return 0;
 }
 
@@ -2606,6 +2621,8 @@ void __ksm_exit(struct mm_struct *mm)
 		mmap_write_lock(mm);
 		mmap_write_unlock(mm);
 	}
+
+	trace_ksm_exit(mm);
 }
 
 struct page *ksm_might_need_to_copy(struct page *page,
-- 
cgit v1.2.3


From e26fcc02c7f6c76cba42d043756e35a78ce9ac11 Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Mon, 30 Jan 2023 13:25:12 +0900
Subject: mmflags.h: use less error prone method to define pageflag_names

Patch series "mm, printk: introduce new format for page_type", v4.

This series moves PG_slab page flag to page_type, freeing one bit in
page->flags and introduces %pGt format that prints human-readable
page_type like %pGp for printing page flags.

See changelog of patch 2 for more implementation details.

Thanks everyone that gave valuable comments.


This patch (of 3):

Use helper macro to decrease chances of typo when defining pageflag_names.

Link: https://lkml.kernel.org/r/20230130042514.2418-1-42.hyeyoo@gmail.com
Link: https://lore.kernel.org/lkml/Y6AycLbpjVzXM5I9@smile.fi.intel.com
Link: https://lkml.kernel.org/r/20230130042514.2418-2-42.hyeyoo@gmail.com
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/mmflags.h | 85 ++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 9db52bc4ce19..f453babe29b3 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -67,71 +67,74 @@
 	) : "none"
 
 #ifdef CONFIG_MMU
-#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_MLOCK(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_MLOCK(flag,string)
+#define IF_HAVE_PG_MLOCK(_name)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
-#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_UNCACHED(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_UNCACHED(flag,string)
+#define IF_HAVE_PG_UNCACHED(_name)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
-#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_HWPOISON(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_HWPOISON(flag,string)
+#define IF_HAVE_PG_HWPOISON(_name)
 #endif
 
 #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
-#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_IDLE(flag,string)
+#define IF_HAVE_PG_IDLE(_name)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_ARCH_X
-#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_ARCH_X(flag,string)
+#define IF_HAVE_PG_ARCH_X(_name)
 #endif
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_SKIP_KASAN_POISON(_name) \
+	,{1UL << PG_##_name, __stringify(_name)}
 #else
-#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string)
+#define IF_HAVE_PG_SKIP_KASAN_POISON(_name)
 #endif
 
+#define DEF_PAGEFLAG_NAME(_name) { 1UL <<  PG_##_name, __stringify(_name) }
+
 #define __def_pageflag_names						\
-	{1UL << PG_locked,		"locked"	},		\
-	{1UL << PG_waiters,		"waiters"	},		\
-	{1UL << PG_error,		"error"		},		\
-	{1UL << PG_referenced,		"referenced"	},		\
-	{1UL << PG_uptodate,		"uptodate"	},		\
-	{1UL << PG_dirty,		"dirty"		},		\
-	{1UL << PG_lru,			"lru"		},		\
-	{1UL << PG_active,		"active"	},		\
-	{1UL << PG_workingset,		"workingset"	},		\
-	{1UL << PG_slab,		"slab"		},		\
-	{1UL << PG_owner_priv_1,	"owner_priv_1"	},		\
-	{1UL << PG_arch_1,		"arch_1"	},		\
-	{1UL << PG_reserved,		"reserved"	},		\
-	{1UL << PG_private,		"private"	},		\
-	{1UL << PG_private_2,		"private_2"	},		\
-	{1UL << PG_writeback,		"writeback"	},		\
-	{1UL << PG_head,		"head"		},		\
-	{1UL << PG_mappedtodisk,	"mappedtodisk"	},		\
-	{1UL << PG_reclaim,		"reclaim"	},		\
-	{1UL << PG_swapbacked,		"swapbacked"	},		\
-	{1UL << PG_unevictable,		"unevictable"	}		\
-IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
-IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
-IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
-IF_HAVE_PG_IDLE(PG_young,		"young"		)		\
-IF_HAVE_PG_IDLE(PG_idle,		"idle"		)		\
-IF_HAVE_PG_ARCH_X(PG_arch_2,		"arch_2"	)		\
-IF_HAVE_PG_ARCH_X(PG_arch_3,		"arch_3"	)		\
-IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
+	DEF_PAGEFLAG_NAME(locked),					\
+	DEF_PAGEFLAG_NAME(waiters),					\
+	DEF_PAGEFLAG_NAME(error),					\
+	DEF_PAGEFLAG_NAME(referenced),					\
+	DEF_PAGEFLAG_NAME(uptodate),					\
+	DEF_PAGEFLAG_NAME(dirty),					\
+	DEF_PAGEFLAG_NAME(lru),						\
+	DEF_PAGEFLAG_NAME(active),					\
+	DEF_PAGEFLAG_NAME(workingset),					\
+	DEF_PAGEFLAG_NAME(slab),					\
+	DEF_PAGEFLAG_NAME(owner_priv_1),				\
+	DEF_PAGEFLAG_NAME(arch_1),					\
+	DEF_PAGEFLAG_NAME(reserved),					\
+	DEF_PAGEFLAG_NAME(private),					\
+	DEF_PAGEFLAG_NAME(private_2),					\
+	DEF_PAGEFLAG_NAME(writeback),					\
+	DEF_PAGEFLAG_NAME(head),					\
+	DEF_PAGEFLAG_NAME(mappedtodisk),				\
+	DEF_PAGEFLAG_NAME(reclaim),					\
+	DEF_PAGEFLAG_NAME(swapbacked),					\
+	DEF_PAGEFLAG_NAME(unevictable)					\
+IF_HAVE_PG_MLOCK(mlocked)						\
+IF_HAVE_PG_UNCACHED(uncached)						\
+IF_HAVE_PG_HWPOISON(hwpoison)						\
+IF_HAVE_PG_IDLE(idle)							\
+IF_HAVE_PG_IDLE(young)							\
+IF_HAVE_PG_ARCH_X(arch_2)						\
+IF_HAVE_PG_ARCH_X(arch_3)						\
+IF_HAVE_PG_SKIP_KASAN_POISON(skip_kasan_poison)
 
 #define show_page_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
-- 
cgit v1.2.3


From 4c85c0be3d7a9a7ffe48bfe0954eacc0ba9d3c75 Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Mon, 30 Jan 2023 13:25:13 +0900
Subject: mm, printk: introduce new format %pGt for page_type

%pGp format is used to display 'flags' field of a struct page.  However,
some page flags (i.e.  PG_buddy, see page-flags.h for more details) are
stored in page_type field.  To display human-readable output of page_type,
introduce %pGt format.

It is important to note the meaning of bits are different in page_type.
if page_type is 0xffffffff, no flags are set.  Setting PG_buddy
(0x00000080) flag results in a page_type of 0xffffff7f.  Clearing a bit
actually means setting a flag.  Bits in page_type are inverted when
displaying type names.

Only values for which page_type_has_type() returns true are considered as
page_type, to avoid confusion with mapcount values.  if it returns false,
only raw values are displayed and not page type names.

Link: https://lkml.kernel.org/r/20230130042514.2418-3-42.hyeyoo@gmail.com
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>	[vsprintf part]
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/printk-formats.rst | 16 +++++++++++-----
 include/linux/page-flags.h                |  7 ++++++-
 include/trace/events/mmflags.h            |  8 ++++++++
 lib/test_printf.c                         | 26 ++++++++++++++++++++++++++
 lib/vsprintf.c                            | 21 +++++++++++++++++++++
 mm/debug.c                                |  5 +++++
 mm/internal.h                             |  1 +
 7 files changed, 78 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index dbe1aacc79d0..dfe7e75a71de 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -575,20 +575,26 @@ The field width is passed by value, the bitmap is passed by reference.
 Helper macros cpumask_pr_args() and nodemask_pr_args() are available to ease
 printing cpumask and nodemask.
 
-Flags bitfields such as page flags, gfp_flags
----------------------------------------------
+Flags bitfields such as page flags, page_type, gfp_flags
+--------------------------------------------------------
 
 ::
 
 	%pGp	0x17ffffc0002036(referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff)
+	%pGt	0xffffff7f(buddy)
 	%pGg	GFP_USER|GFP_DMA32|GFP_NOWARN
 	%pGv	read|exec|mayread|maywrite|mayexec|denywrite
 
 For printing flags bitfields as a collection of symbolic constants that
 would construct the value. The type of flags is given by the third
-character. Currently supported are [p]age flags, [v]ma_flags (both
-expect ``unsigned long *``) and [g]fp_flags (expects ``gfp_t *``). The flag
-names and print order depends on the particular	type.
+character. Currently supported are:
+
+        - p - [p]age flags, expects value of type (``unsigned long *``)
+        - t - page [t]ype, expects value of type (``unsigned int *``)
+        - v - [v]ma_flags, expects value of type (``unsigned long *``)
+        - g - [g]fp_flags, expects value of type (``gfp_t *``)
+
+The flag names and print order depends on the particular type.
 
 Note that this format should not be used directly in the
 :c:func:`TP_printk()` part of a tracepoint. Instead, use the show_*_flags()
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a7e3a3405520..57287102c5bd 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -926,9 +926,14 @@ static inline bool is_page_hwpoison(struct page *page)
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
 
+static inline int page_type_has_type(unsigned int page_type)
+{
+	return (int)page_type < PAGE_MAPCOUNT_RESERVE;
+}
+
 static inline int page_has_type(struct page *page)
 {
-	return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
+	return page_type_has_type(page->page_type);
 }
 
 #define PAGE_TYPE_OPS(uname, lname)					\
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index f453babe29b3..b28218b7998e 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -141,6 +141,14 @@ IF_HAVE_PG_SKIP_KASAN_POISON(skip_kasan_poison)
 	__def_pageflag_names						\
 	) : "none"
 
+#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
+
+#define __def_pagetype_names						\
+	DEF_PAGETYPE_NAME(offline),					\
+	DEF_PAGETYPE_NAME(guard),					\
+	DEF_PAGETYPE_NAME(table),					\
+	DEF_PAGETYPE_NAME(buddy)
+
 #if defined(CONFIG_X86)
 #define __VM_ARCH_SPECIFIC_1 {VM_PAT,     "pat"           }
 #elif defined(CONFIG_PPC)
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 46b4e6c414a3..7677ebccf3c3 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -642,12 +642,26 @@ page_flags_test(int section, int node, int zone, int last_cpupid,
 	test(cmp_buf, "%pGp", &flags);
 }
 
+static void __init page_type_test(unsigned int page_type, const char *name,
+				  char *cmp_buf)
+{
+	unsigned long size;
+
+	size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type);
+	if (page_type_has_type(page_type))
+		size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name);
+
+	snprintf(cmp_buf + size, BUF_SIZE - size, ")");
+	test(cmp_buf, "%pGt", &page_type);
+}
+
 static void __init
 flags(void)
 {
 	unsigned long flags;
 	char *cmp_buffer;
 	gfp_t gfp;
+	unsigned int page_type;
 
 	cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
 	if (!cmp_buffer)
@@ -687,6 +701,18 @@ flags(void)
 	gfp |= __GFP_HIGH;
 	test(cmp_buffer, "%pGg", &gfp);
 
+	page_type = ~0;
+	page_type_test(page_type, "", cmp_buffer);
+
+	page_type = 10;
+	page_type_test(page_type, "", cmp_buffer);
+
+	page_type = ~PG_buddy;
+	page_type_test(page_type, "buddy", cmp_buffer);
+
+	page_type = ~(PG_table | PG_buddy);
+	page_type_test(page_type, "table|buddy", cmp_buffer);
+
 	kfree(cmp_buffer);
 }
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index be71a03c936a..fbe320b5e89f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2052,6 +2052,25 @@ char *format_page_flags(char *buf, char *end, unsigned long flags)
 	return buf;
 }
 
+static
+char *format_page_type(char *buf, char *end, unsigned int page_type)
+{
+	buf = number(buf, end, page_type, default_flag_spec);
+
+	if (buf < end)
+		*buf = '(';
+	buf++;
+
+	if (page_type_has_type(page_type))
+		buf = format_flags(buf, end, ~page_type, pagetype_names);
+
+	if (buf < end)
+		*buf = ')';
+	buf++;
+
+	return buf;
+}
+
 static noinline_for_stack
 char *flags_string(char *buf, char *end, void *flags_ptr,
 		   struct printf_spec spec, const char *fmt)
@@ -2065,6 +2084,8 @@ char *flags_string(char *buf, char *end, void *flags_ptr,
 	switch (fmt[1]) {
 	case 'p':
 		return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
+	case 't':
+		return format_page_type(buf, end, *(unsigned int *)flags_ptr);
 	case 'v':
 		flags = *(unsigned long *)flags_ptr;
 		names = vmaflag_names;
diff --git a/mm/debug.c b/mm/debug.c
index 96d594e16292..01cf0435723b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -36,6 +36,11 @@ const struct trace_print_flags pageflag_names[] = {
 	{0, NULL}
 };
 
+const struct trace_print_flags pagetype_names[] = {
+	__def_pagetype_names,
+	{0, NULL}
+};
+
 const struct trace_print_flags gfpflag_names[] = {
 	__def_gfpflag_names,
 	{0, NULL}
diff --git a/mm/internal.h b/mm/internal.h
index 7920a8b7982e..2a7ffd9962c4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -802,6 +802,7 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm)
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 
 extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags pagetype_names[];
 extern const struct trace_print_flags vmaflag_names[];
 extern const struct trace_print_flags gfpflag_names[];
 
-- 
cgit v1.2.3


From 16d91faf09be148d9c1cf4999f4d15fb24a7cd72 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Thu, 16 Feb 2023 11:59:24 -0800
Subject: kasan: call clear_page with a match-all tag instead of changing page
 tag

Instead of changing the page's tag solely in order to obtain a pointer
with a match-all tag and then changing it back again, just convert the
pointer that we get from kmap_atomic() into one with a match-all tag
before passing it to clear_page().

On a certain microarchitecture, this has been observed to cause a
measurable improvement in microbenchmark performance, presumably as a
result of being able to avoid the atomic operations on the page tag.

Link: https://lkml.kernel.org/r/20230216195924.3287772-1-pcc@google.com
Signed-off-by: Peter Collingbourne <pcc@google.com>
Link: https://linux-review.googlesource.com/id/I0249822cc29097ca7a04ad48e8eb14871f80e711
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 8fc10089e19e..9c7cdaa3de8c 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,12 +243,10 @@ static inline void clear_highpage(struct page *page)
 
 static inline void clear_highpage_kasan_tagged(struct page *page)
 {
-	u8 tag;
+	void *kaddr = kmap_local_page(page);
 
-	tag = page_kasan_tag(page);
-	page_kasan_tag_reset(page);
-	clear_highpage(page);
-	page_kasan_tag_set(page, tag);
+	clear_page(kasan_reset_tag(kaddr));
+	kunmap_local(kaddr);
 }
 
 #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
-- 
cgit v1.2.3


From 3e4fb13ac34bad94cf390415b1e6bc3ca9520d65 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 2 Mar 2023 19:58:35 +0800
Subject: mm: swap: remove unneeded cgroup_throttle_swaprate()

All the callers of cgroup_throttle_swaprate() are converted to
folio_throttle_swaprate(), so make __cgroup_throttle_swaprate() to take a
folio, and rename it to __folio_throttle_swaprate(), also rename gfp_mask
to gfp and drop redundant extern keyword.  finally, drop unused
cgroup_throttle_swaprate().

Link: https://lkml.kernel.org/r/20230302115835.105364-8-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 12 ++++--------
 mm/swapfile.c        |  6 +++---
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 209a425739a9..d5d0b54e90e8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -641,22 +641,18 @@ extern atomic_t zswap_stored_pages;
 #endif
 
 #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
-static inline  void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
+static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 {
 	if (mem_cgroup_disabled())
 		return;
-	__cgroup_throttle_swaprate(page, gfp_mask);
+	__folio_throttle_swaprate(folio, gfp);
 }
 #else
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
-{
-}
-#endif
 static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 {
-	cgroup_throttle_swaprate(&folio->page, gfp);
 }
+#endif
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62ba2bf577d7..c1b97436f811 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3635,12 +3635,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 {
 	struct swap_info_struct *si, *next;
-	int nid = page_to_nid(page);
+	int nid = folio_nid(folio);
 
-	if (!(gfp_mask & __GFP_IO))
+	if (!(gfp & __GFP_IO))
 		return;
 
 	if (!blk_cgroup_congested())
-- 
cgit v1.2.3


From 99c29133639a29fa803ea27ec79bf9e732efd062 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Date: Mon, 6 Mar 2023 17:15:48 +0100
Subject: mm: add PTE pointer parameter to flush_tlb_fix_spurious_fault()

s390 can do more fine-grained handling of spurious TLB protection faults,
when there also is the PTE pointer available.

Therefore, pass on the PTE pointer to flush_tlb_fix_spurious_fault() as an
additional parameter.

This will add no functional change to other architectures, but those with
private flush_tlb_fix_spurious_fault() implementations need to be made
aware of the new parameter.

Link: https://lkml.kernel.org/r/20230306161548.661740-1-gerald.schaefer@linux.ibm.com
Signed-off-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>	[arm64]
Acked-by: Michael Ellerman <mpe@ellerman.id.au>		[powerpc]
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/pgtable.h              |  2 +-
 arch/mips/include/asm/pgtable.h               |  3 ++-
 arch/powerpc/include/asm/book3s/64/tlbflush.h |  3 ++-
 arch/s390/include/asm/pgtable.h               | 12 +++++++-----
 arch/x86/include/asm/pgtable.h                |  2 +-
 include/linux/pgtable.h                       |  2 +-
 mm/memory.c                                   |  3 ++-
 mm/pgtable-generic.c                          |  2 +-
 8 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b6ba466e2e8a..0bd18de9fd97 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
  * fault on one CPU which has been handled concurrently by another CPU
  * does not need to perform additional invalidation.
  */
-#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 791389bf3c12..574fa14ac8b2 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -469,7 +469,8 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
 }
 
 static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
-						unsigned long address)
+						unsigned long address,
+						pte_t *ptep)
 {
 }
 
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 2bbc0fcce04a..ff7f0ee179e5 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -121,7 +121,8 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 
 #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
 static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
-						unsigned long address)
+						unsigned long address,
+						pte_t *ptep)
 {
 	/*
 	 * Book3S 64 does not require spurious fault flushes because the PTE
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2c70b4d1263d..c1f6b46ec555 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1239,7 +1239,8 @@ static inline int pte_allow_rdp(pte_t old, pte_t new)
 }
 
 static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
-						unsigned long address)
+						unsigned long address,
+						pte_t *ptep)
 {
 	/*
 	 * RDP might not have propagated the PTE protection reset to all CPUs,
@@ -1247,11 +1248,12 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
 	 * NOTE: This will also be called when a racing pagetable update on
 	 * another thread already installed the correct PTE. Both cases cannot
 	 * really be distinguished.
-	 * Therefore, only do the local TLB flush when RDP can be used, to avoid
-	 * unnecessary overhead.
+	 * Therefore, only do the local TLB flush when RDP can be used, and the
+	 * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead.
+	 * A local RDP can be used to do the flush.
 	 */
-	if (MACHINE_HAS_RDP)
-		asm volatile("ptlb" : : : "memory");
+	if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT))
+		__ptep_rdp(address, ptep, 0, 0, 1);
 }
 #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 7425f32e5293..15ae4d6ba476 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1097,7 +1097,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
 }
 
-#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
 #define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9dc936bc77d1..c5a51481bbb9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -817,7 +817,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 #endif
 
 #ifndef flush_tlb_fix_spurious_fault
-#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
 #endif
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index fdc903825981..6285cad1f4fb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4944,7 +4944,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 * with threads.
 		 */
 		if (vmf->flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+						     vmf->pte);
 	}
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 90ab721a12a8..d2fc52bffafc 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -69,7 +69,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	int changed = !pte_same(*ptep, entry);
 	if (changed) {
 		set_pte_at(vma->vm_mm, address, ptep, entry);
-		flush_tlb_fix_spurious_fault(vma, address);
+		flush_tlb_fix_spurious_fault(vma, address, ptep);
 	}
 	return changed;
 }
-- 
cgit v1.2.3


From 82b3aa2681ca92421c4b508e7c390000273c53fe Mon Sep 17 00:00:00 2001
From: Yue Zhao <findns94@gmail.com>
Date: Mon, 6 Mar 2023 23:41:36 +0800
Subject: mm, memcg: Prevent memory.swappiness load/store tearing

The knob for cgroup v1 memory controller: memory.swappiness is not
protected by any locking so it can be modified while it is used.  This is
not an actual problem because races are unlikely.  But it is better to use
[READ|WRITE]_ONCE to prevent compiler from doing anything funky.

The access of memcg->swappiness and vm_swappiness is lockless, so both of
them can be concurrently set at the same time as we are trying to read
them.  All occurrences of memcg->swappiness and vm_swappiness are updated
with [READ|WRITE]_ONCE.

[findns94@gmail.com: v3]
  Link: https://lkml.kernel.org/r/20230308162555.14195-3-findns94@gmail.com
Link: https://lkml.kernel.org/r/20230306154138.3775-3-findns94@gmail.com
Signed-off-by: Yue Zhao <findns94@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Tang Yizhou <tangyeechou@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 8 ++++----
 mm/memcontrol.c      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d5d0b54e90e8..bfc3b06b5f8f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -620,18 +620,18 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
 	/* Cgroup2 doesn't have per-cgroup swappiness */
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return vm_swappiness;
+		return READ_ONCE(vm_swappiness);
 
 	/* root ? */
 	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
-		return vm_swappiness;
+		return READ_ONCE(vm_swappiness);
 
-	return memcg->swappiness;
+	return READ_ONCE(memcg->swappiness);
 }
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
-	return vm_swappiness;
+	return READ_ONCE(vm_swappiness);
 }
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06821e5f7604..1b0112afcad3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4179,9 +4179,9 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 		return -EINVAL;
 
 	if (!mem_cgroup_is_root(memcg))
-		memcg->swappiness = val;
+		WRITE_ONCE(memcg->swappiness, val);
 	else
-		vm_swappiness = val;
+		WRITE_ONCE(vm_swappiness, val);
 
 	return 0;
 }
@@ -5353,7 +5353,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #endif
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
 	if (parent) {
-		memcg->swappiness = mem_cgroup_swappiness(parent);
+		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 		memcg->oom_kill_disable = parent->oom_kill_disable;
 
 		page_counter_init(&memcg->memory, &parent->memory);
-- 
cgit v1.2.3


From 452a8f40728065800b5a5b81f1152e9a16d39656 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 15:31:25 +0100
Subject: mm,jfs: move write_one_page/folio_write_one to jfs

The last remaining user of folio_write_one through the write_one_page
wrapper is jfs, so move the functionality there and hard code the call to
metapage_writepage.

Note that the use of the pagecache by the JFS 'metapage' buffer cache is a
bit odd, and we could probably do without VM-level dirty tracking at all,
but that's a change for another time.

Link: https://lkml.kernel.org/r/20230307143125.27778-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Gang He <ghe@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jan Kara via Ocfs2-devel <ocfs2-devel@oss.oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/jfs/jfs_metapage.c   | 39 ++++++++++++++++++++++++++++++++++-----
 include/linux/pagemap.h |  6 ------
 mm/page-writeback.c     | 40 ----------------------------------------
 3 files changed, 34 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2e8461ce74de..961569c11159 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp)
 	unlock_page(mp->page);
 }
 
+static int metapage_write_one(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+	struct address_space *mapping = folio->mapping;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = folio_nr_pages(folio),
+	};
+	int ret = 0;
+
+	BUG_ON(!folio_test_locked(folio));
+
+	folio_wait_writeback(folio);
+
+	if (folio_clear_dirty_for_io(folio)) {
+		folio_get(folio);
+		ret = metapage_writepage(page, &wbc);
+		if (ret == 0)
+			folio_wait_writeback(folio);
+		folio_put(folio);
+	} else {
+		folio_unlock(folio);
+	}
+
+	if (!ret)
+		ret = filemap_check_errors(mapping);
+	return ret;
+}
+
 void force_metapage(struct metapage *mp)
 {
 	struct page *page = mp->page;
@@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp)
 	get_page(page);
 	lock_page(page);
 	set_page_dirty(page);
-	if (write_one_page(page))
-		jfs_error(mp->sb, "write_one_page() failed\n");
+	if (metapage_write_one(page))
+		jfs_error(mp->sb, "metapage_write_one() failed\n");
 	clear_bit(META_forcewrite, &mp->flag);
 	put_page(page);
 }
@@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp)
 		set_page_dirty(page);
 		if (test_bit(META_sync, &mp->flag)) {
 			clear_bit(META_sync, &mp->flag);
-			if (write_one_page(page))
-				jfs_error(mp->sb, "write_one_page() failed\n");
-			lock_page(page); /* write_one_page unlocks the page */
+			if (metapage_write_one(page))
+				jfs_error(mp->sb, "metapage_write_one() failed\n");
+			lock_page(page);
 		}
 	} else if (mp->lsn)	/* discard_metapage doesn't remove it */
 		remove_from_logsync(mp);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..853184a46411 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1066,12 +1066,6 @@ static inline void folio_cancel_dirty(struct folio *folio)
 bool folio_clear_dirty_for_io(struct folio *folio);
 bool clear_page_dirty_for_io(struct page *page);
 void folio_invalidate(struct folio *folio, size_t offset, size_t length);
-int __must_check folio_write_one(struct folio *folio);
-static inline int __must_check write_one_page(struct page *page)
-{
-	return folio_write_one(page_folio(page));
-}
-
 int __set_page_dirty_nobuffers(struct page *page);
 bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 516b1aa247e8..db7943999007 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2583,46 +2583,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return ret;
 }
 
-/**
- * folio_write_one - write out a single folio and wait on I/O.
- * @folio: The folio to write.
- *
- * The folio must be locked by the caller and will be unlocked upon return.
- *
- * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
- * function returns.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int folio_write_one(struct folio *folio)
-{
-	struct address_space *mapping = folio->mapping;
-	int ret = 0;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = folio_nr_pages(folio),
-	};
-
-	BUG_ON(!folio_test_locked(folio));
-
-	folio_wait_writeback(folio);
-
-	if (folio_clear_dirty_for_io(folio)) {
-		folio_get(folio);
-		ret = mapping->a_ops->writepage(&folio->page, &wbc);
-		if (ret == 0)
-			folio_wait_writeback(folio);
-		folio_put(folio);
-	} else {
-		folio_unlock(folio);
-	}
-
-	if (!ret)
-		ret = filemap_check_errors(mapping);
-	return ret;
-}
-EXPORT_SYMBOL(folio_write_one);
-
 /*
  * For address_spaces which do not use buffers nor write back.
  */
-- 
cgit v1.2.3


From 2c6efe9cf2d7841b75fe38ed1adbd41a90f51ba0 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 9 Mar 2023 15:05:45 -0800
Subject: shmem: add support to ignore swap

In doing experimentations with shmem having the option to avoid swap
becomes a useful mechanism.  One of the *raves* about brd over shmem is
you can avoid swap, but that's not really a good reason to use brd if we
can instead use shmem.  Using brd has its own good reasons to exist, but
just because "tmpfs" doesn't let you do that is not a great reason to
avoid it if we can easily add support for it.

I don't add support for reconfiguring incompatible options, but if we
really wanted to we can add support for that.

To avoid swap we use mapping_set_unevictable() upon inode creation, and
put a WARN_ON_ONCE() stop-gap on writepages() for reclaim.

Link: https://lkml.kernel.org/r/20230309230545.2930737-7-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Adam Manzanares <a.manzanares@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/filesystems/tmpfs.rst  |  9 ++++++---
 Documentation/mm/unevictable-lru.rst |  2 ++
 include/linux/shmem_fs.h             |  1 +
 mm/shmem.c                           | 28 +++++++++++++++++++++++++++-
 4 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 1ec9a9f8196b..f18f46be5c0c 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -13,7 +13,8 @@ everything stored therein is lost.
 
 tmpfs puts everything into the kernel internal caches and grows and
 shrinks to accommodate the files it contains and is able to swap
-unneeded pages out to swap space, and supports THP.
+unneeded pages out to swap space, if swap was enabled for the tmpfs
+mount. tmpfs also supports THP.
 
 tmpfs extends ramfs with a few userspace configurable options listed and
 explained further below, some of which can be reconfigured dynamically on the
@@ -33,8 +34,8 @@ configured in size at initialization and you cannot dynamically resize them.
 Contrary to brd ramdisks, tmpfs has its own filesystem, it does not rely on the
 block layer at all.
 
-Since tmpfs lives completely in the page cache and on swap, all tmpfs
-pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
+Since tmpfs lives completely in the page cache and optionally on swap,
+all tmpfs pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
 free(1). Notice that these counters also include shared memory
 (shmem, see ipcs(1)). The most reliable way to get the count is
 using df(1) and du(1).
@@ -83,6 +84,8 @@ nr_inodes  The maximum number of inodes for this instance. The default
            is half of the number of your physical RAM pages, or (on a
            machine with highmem) the number of lowmem RAM pages,
            whichever is the lower.
+noswap     Disables swap. Remounts must respect the original settings.
+           By default swap is enabled.
 =========  ============================================================
 
 These parameters accept a suffix k, m or g for kilo, mega and giga and
diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
index 92ac5dca420c..d5ac8511eb67 100644
--- a/Documentation/mm/unevictable-lru.rst
+++ b/Documentation/mm/unevictable-lru.rst
@@ -42,6 +42,8 @@ The unevictable list addresses the following classes of unevictable pages:
 
  * Those owned by ramfs.
 
+ * Those owned by tmpfs with the noswap mount option.
+
  * Those mapped into SHM_LOCK'd shared memory regions.
 
  * Those mapped into VM_LOCKED [mlock()ed] VMAs.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 103d1000a5a2..50bf82b36995 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -45,6 +45,7 @@ struct shmem_sb_info {
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
 	bool full_inums;	    /* If i_ino should be uint or ino_t */
+	bool noswap;		    /* ignores VM reclaim / swap requests */
 	ino_t next_ino;		    /* The next per-sb inode number to use */
 	ino_t __percpu *ino_batch;  /* The next per-cpu inode number to use */
 	struct mempolicy *mpol;     /* default memory policy for mappings */
diff --git a/mm/shmem.c b/mm/shmem.c
index 7751c136eadb..787e83791eb5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -116,10 +116,12 @@ struct shmem_options {
 	bool full_inums;
 	int huge;
 	int seen;
+	bool noswap;
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
 #define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_NOSWAP 16
 };
 
 #ifdef CONFIG_TMPFS
@@ -1334,6 +1336,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	struct address_space *mapping = folio->mapping;
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	swp_entry_t swap;
 	pgoff_t index;
 
@@ -1347,7 +1350,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (WARN_ON_ONCE(!wbc->for_reclaim))
 		goto redirty;
 
-	if (WARN_ON_ONCE(info->flags & VM_LOCKED))
+	if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
 		goto redirty;
 
 	if (!total_swap_pages)
@@ -2372,6 +2375,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
 			shmem_set_inode_flags(inode, info->fsflags);
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
+		if (sbinfo->noswap)
+			mapping_set_unevictable(inode->i_mapping);
 		simple_xattrs_init(&info->xattrs);
 		cache_no_acl(inode);
 		mapping_set_large_folios(inode->i_mapping);
@@ -3459,6 +3464,7 @@ enum shmem_param {
 	Opt_uid,
 	Opt_inode32,
 	Opt_inode64,
+	Opt_noswap,
 };
 
 static const struct constant_table shmem_param_enums_huge[] = {
@@ -3480,6 +3486,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
 	fsparam_u32   ("uid",		Opt_uid),
 	fsparam_flag  ("inode32",	Opt_inode32),
 	fsparam_flag  ("inode64",	Opt_inode64),
+	fsparam_flag  ("noswap",	Opt_noswap),
 	{}
 };
 
@@ -3563,6 +3570,10 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 		ctx->full_inums = true;
 		ctx->seen |= SHMEM_SEEN_INUMS;
 		break;
+	case Opt_noswap:
+		ctx->noswap = true;
+		ctx->seen |= SHMEM_SEEN_NOSWAP;
+		break;
 	}
 	return 0;
 
@@ -3661,6 +3672,14 @@ static int shmem_reconfigure(struct fs_context *fc)
 		err = "Current inum too high to switch to 32-bit inums";
 		goto out;
 	}
+	if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+		err = "Cannot disable swap on remount";
+		goto out;
+	}
+	if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
+		err = "Cannot enable swap on remount if it was disabled on first mount";
+		goto out;
+	}
 
 	if (ctx->seen & SHMEM_SEEN_HUGE)
 		sbinfo->huge = ctx->huge;
@@ -3681,6 +3700,10 @@ static int shmem_reconfigure(struct fs_context *fc)
 		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
 		ctx->mpol = NULL;
 	}
+
+	if (ctx->noswap)
+		sbinfo->noswap = true;
+
 	raw_spin_unlock(&sbinfo->stat_lock);
 	mpol_put(mpol);
 	return 0;
@@ -3735,6 +3758,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
 #endif
 	shmem_show_mpol(seq, sbinfo->mpol);
+	if (sbinfo->noswap)
+		seq_printf(seq, ",noswap");
 	return 0;
 }
 
@@ -3778,6 +3803,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 			ctx->inodes = shmem_default_max_inodes();
 		if (!(ctx->seen & SHMEM_SEEN_INUMS))
 			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+		sbinfo->noswap = ctx->noswap;
 	} else {
 		sb->s_flags |= SB_NOUSER;
 	}
-- 
cgit v1.2.3


From 7eb16f23b9a415f062db22739e59bb144e0b24ab Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 10 Mar 2023 17:29:05 +0100
Subject: io-mapping: don't disable preempt on RT in
 io_mapping_map_atomic_wc().

io_mapping_map_atomic_wc() disables preemption and pagefaults for
historical reasons.  The conversion to io_mapping_map_local_wc(), which
only disables migration, cannot be done wholesale because quite some call
sites need to be updated to accommodate with the changed semantics.

On PREEMPT_RT enabled kernels the io_mapping_map_atomic_wc() semantics are
problematic due to the implicit disabling of preemption which makes it
impossible to acquire 'sleeping' spinlocks within the mapped atomic
sections.

PREEMPT_RT replaces the preempt_disable() with a migrate_disable() for
more than a decade.  It could be argued that this is a justification to do
this unconditionally, but PREEMPT_RT covers only a limited number of
architectures and it disables some functionality which limits the coverage
further.

Limit the replacement to PREEMPT_RT for now.  This is also done
kmap_atomic().

Link: https://lkml.kernel.org/r/20230310162905.O57Pj7hh@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Richard Weinberger <richard.weinberger@gmail.com>
  Link: https://lore.kernel.org/CAFLxGvw0WMxaMqYqJ5WgvVSbKHq2D2xcXTOgMCpgq9nDC-MWTQ@mail.gmail.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/io-mapping.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 09d4f17c8d3b..7376c1df9c90 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping,
 
 	BUG_ON(offset >= mapping->size);
 	phys_addr = mapping->base + offset;
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		migrate_disable();
 	pagefault_disable();
 	return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
 }
@@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	kunmap_local_indexed((void __force *)vaddr);
 	pagefault_enable();
-	preempt_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		migrate_enable();
 }
 
 static inline void __iomem *
@@ -162,7 +168,10 @@ static inline void __iomem *
 io_mapping_map_atomic_wc(struct io_mapping *mapping,
 			 unsigned long offset)
 {
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		migrate_disable();
 	pagefault_disable();
 	return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
 }
@@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	io_mapping_unmap(vaddr);
 	pagefault_enable();
-	preempt_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		migrate_enable();
 }
 
 static inline void __iomem *
-- 
cgit v1.2.3


From 0a54864f8dfb64b64c84c9db6ff70e0e93690a33 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Thu, 9 Mar 2023 20:29:14 -0800
Subject: kasan: remove PG_skip_kasan_poison flag

Code inspection reveals that PG_skip_kasan_poison is redundant with
kasantag, because the former is intended to be set iff the latter is the
match-all tag.  It can also be observed that it's basically pointless to
poison pages which have kasantag=0, because any pages with this tag would
have been pointed to by pointers with match-all tags, so poisoning the
pages would have little to no effect in terms of bug detection.
Therefore, change the condition in should_skip_kasan_poison() to check
kasantag instead, and remove PG_skip_kasan_poison and associated flags.

Link: https://lkml.kernel.org/r/20230310042914.3805818-3-pcc@google.com
Link: https://linux-review.googlesource.com/id/I57f825f2eaeaf7e8389d6cf4597c8a5821359838
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h      | 30 +++++++---------
 include/linux/page-flags.h     |  9 -----
 include/trace/events/mmflags.h | 13 ++-----
 mm/kasan/hw_tags.c             |  2 +-
 mm/page_alloc.c                | 81 ++++++++++++++++--------------------------
 mm/vmalloc.c                   |  2 +-
 6 files changed, 47 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 5088637fe5c2..6583a58670c5 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -47,16 +47,14 @@ typedef unsigned int __bitwise gfp_t;
 #define ___GFP_ACCOUNT		0x400000u
 #define ___GFP_ZEROTAGS		0x800000u
 #ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO		0x1000000u
-#define ___GFP_SKIP_KASAN_UNPOISON	0x2000000u
-#define ___GFP_SKIP_KASAN_POISON	0x4000000u
+#define ___GFP_SKIP_ZERO	0x1000000u
+#define ___GFP_SKIP_KASAN	0x2000000u
 #else
-#define ___GFP_SKIP_ZERO		0
-#define ___GFP_SKIP_KASAN_UNPOISON	0
-#define ___GFP_SKIP_KASAN_POISON	0
+#define ___GFP_SKIP_ZERO	0
+#define ___GFP_SKIP_KASAN	0
 #endif
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x8000000u
+#define ___GFP_NOLOCKDEP	0x4000000u
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
@@ -234,25 +232,24 @@ typedef unsigned int __bitwise gfp_t;
  * memory tags at the same time as zeroing memory has minimal additional
  * performace impact.
  *
- * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
- * Only effective in HW_TAGS mode.
- *
- * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
- * Typically, used for userspace pages. Only effective in HW_TAGS mode.
+ * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
+ * Used for userspace and vmalloc pages; the latter are unpoisoned by
+ * kasan_unpoison_vmalloc instead. For userspace pages, results in
+ * poisoning being skipped as well, see should_skip_kasan_poison for
+ * details. Only effective in HW_TAGS mode.
  */
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
 #define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
 #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
-#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
-#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
@@ -335,8 +332,7 @@ typedef unsigned int __bitwise gfp_t;
 #define GFP_DMA		__GFP_DMA
 #define GFP_DMA32	__GFP_DMA32
 #define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
-			 __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON)
+#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | __GFP_SKIP_KASAN)
 #define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
 			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
 #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 57287102c5bd..dcda20c47b8f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -135,9 +135,6 @@ enum pageflags {
 #ifdef CONFIG_ARCH_USES_PG_ARCH_X
 	PG_arch_2,
 	PG_arch_3,
-#endif
-#ifdef CONFIG_KASAN_HW_TAGS
-	PG_skip_kasan_poison,
 #endif
 	__NR_PAGEFLAGS,
 
@@ -594,12 +591,6 @@ TESTCLEARFLAG(Young, young, PF_ANY)
 PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
-#ifdef CONFIG_KASAN_HW_TAGS
-PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
-#else
-PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
-#endif
-
 /*
  * PageReported() is used to track reported free pages within the Buddy
  * allocator. We can use the non-atomic version of the test and set
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index b28218b7998e..b63e7c0fbbe5 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -55,8 +55,7 @@
 #ifdef CONFIG_KASAN_HW_TAGS
 #define __def_gfpflag_names_kasan ,			\
 	gfpflag_string(__GFP_SKIP_ZERO),		\
-	gfpflag_string(__GFP_SKIP_KASAN_POISON),	\
-	gfpflag_string(__GFP_SKIP_KASAN_UNPOISON)
+	gfpflag_string(__GFP_SKIP_KASAN)
 #else
 #define __def_gfpflag_names_kasan
 #endif
@@ -96,13 +95,6 @@
 #define IF_HAVE_PG_ARCH_X(_name)
 #endif
 
-#ifdef CONFIG_KASAN_HW_TAGS
-#define IF_HAVE_PG_SKIP_KASAN_POISON(_name) \
-	,{1UL << PG_##_name, __stringify(_name)}
-#else
-#define IF_HAVE_PG_SKIP_KASAN_POISON(_name)
-#endif
-
 #define DEF_PAGEFLAG_NAME(_name) { 1UL <<  PG_##_name, __stringify(_name) }
 
 #define __def_pageflag_names						\
@@ -133,8 +125,7 @@ IF_HAVE_PG_HWPOISON(hwpoison)						\
 IF_HAVE_PG_IDLE(idle)							\
 IF_HAVE_PG_IDLE(young)							\
 IF_HAVE_PG_ARCH_X(arch_2)						\
-IF_HAVE_PG_ARCH_X(arch_3)						\
-IF_HAVE_PG_SKIP_KASAN_POISON(skip_kasan_poison)
+IF_HAVE_PG_ARCH_X(arch_3)
 
 #define show_page_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index d1bcb0205327..bb4f56e5bdec 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -318,7 +318,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
 	 * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
 	 * the first virtual mapping, which is created by vmalloc().
 	 * Tagging the page_alloc memory backing that vmalloc() allocation is
-	 * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+	 * skipped, see ___GFP_SKIP_KASAN.
 	 *
 	 * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a109444e9f44..3737f9d58f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,17 +112,6 @@ typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
-/*
- * Don't poison memory with KASAN (only for the tag-based modes).
- * During boot, all non-reserved memblock memory is exposed to page_alloc.
- * Poisoning all that memory lengthens boot time, especially on systems with
- * large amount of RAM. This flag is used to skip that poisoning.
- * This is only done for the tag-based KASAN modes, as those are able to
- * detect memory corruptions with the memory tags assigned by default.
- * All memory allocated normally after boot gets poisoned as usual.
- */
-#define FPI_SKIP_KASAN_POISON	((__force fpi_t)BIT(2))
-
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1370,13 +1359,19 @@ out:
 /*
  * Skip KASAN memory poisoning when either:
  *
- * 1. Deferred memory initialization has not yet completed,
- *    see the explanation below.
- * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
- *    see the comment next to it.
- * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
- *    see the comment next to it.
- * 4. The allocation is excluded from being checked due to sampling,
+ * 1. For generic KASAN: deferred memory initialization has not yet completed.
+ *    Tag-based KASAN modes skip pages freed via deferred memory initialization
+ *    using page tags instead (see below).
+ * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
+ *    that error detection is disabled for accesses via the page address.
+ *
+ * Pages will have match-all tags in the following circumstances:
+ *
+ * 1. Pages are being initialized for the first time, including during deferred
+ *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
+ * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
+ *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
+ * 3. The allocation was excluded from being checked due to sampling,
  *    see the call to kasan_unpoison_pages.
  *
  * Poisoning pages during deferred memory init will greatly lengthen the
@@ -1392,10 +1387,10 @@ out:
  */
 static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 {
-	return deferred_pages_enabled() ||
-	       (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
-	       PageSkipKASanPoison(page);
+	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+		return deferred_pages_enabled();
+
+	return page_kasan_tag(page) == 0xff;
 }
 
 static void kernel_init_pages(struct page *page, int numpages)
@@ -1730,7 +1725,7 @@ void __free_pages_core(struct page *page, unsigned int order)
 	 * Bypass PCP and place fresh pages right to the tail, primarily
 	 * relevant for memory onlining.
 	 */
-	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
+	__free_pages_ok(page, order, FPI_TO_TAIL);
 }
 
 #ifdef CONFIG_NUMA
@@ -2396,9 +2391,9 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags)
 
 	/*
 	 * With hardware tag-based KASAN enabled, skip if this has been
-	 * requested via __GFP_SKIP_KASAN_UNPOISON.
+	 * requested via __GFP_SKIP_KASAN.
 	 */
-	return flags & __GFP_SKIP_KASAN_UNPOISON;
+	return flags & __GFP_SKIP_KASAN;
 }
 
 static inline bool should_skip_init(gfp_t flags)
@@ -2417,7 +2412,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
 			!should_skip_init(gfp_flags);
 	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
-	bool reset_tags = true;
 	int i;
 
 	set_page_private(page, 0);
@@ -2451,37 +2445,22 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 		/* Take note that memory was initialized by the loop above. */
 		init = false;
 	}
-	if (!should_skip_kasan_unpoison(gfp_flags)) {
-		/* Try unpoisoning (or setting tags) and initializing memory. */
-		if (kasan_unpoison_pages(page, order, init)) {
-			/* Take note that memory was initialized by KASAN. */
-			if (kasan_has_integrated_init())
-				init = false;
-			/* Take note that memory tags were set by KASAN. */
-			reset_tags = false;
-		} else {
-			/*
-			 * KASAN decided to exclude this allocation from being
-			 * (un)poisoned due to sampling. Make KASAN skip
-			 * poisoning when the allocation is freed.
-			 */
-			SetPageSkipKASanPoison(page);
-		}
-	}
-	/*
-	 * If memory tags have not been set by KASAN, reset the page tags to
-	 * ensure page_address() dereferencing does not fault.
-	 */
-	if (reset_tags) {
+	if (!should_skip_kasan_unpoison(gfp_flags) &&
+	    kasan_unpoison_pages(page, order, init)) {
+		/* Take note that memory was initialized by KASAN. */
+		if (kasan_has_integrated_init())
+			init = false;
+	} else {
+		/*
+		 * If memory tags have not been set by KASAN, reset the page
+		 * tags to ensure page_address() dereferencing does not fault.
+		 */
 		for (i = 0; i != 1 << order; ++i)
 			page_kasan_tag_reset(page + i);
 	}
 	/* If memory is still not initialized, initialize it now. */
 	if (init)
 		kernel_init_pages(page, 1 << order);
-	/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
-	if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
-		SetPageSkipKASanPoison(page);
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bef6cf2b4d46..5e60e9792cbf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3188,7 +3188,7 @@ again:
 			 * pages backing VM_ALLOC mapping. Memory is instead
 			 * poisoned and zeroed by kasan_unpoison_vmalloc().
 			 */
-			gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+			gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
 		}
 
 		/* Take note that the mapping is PAGE_KERNEL. */
-- 
cgit v1.2.3


From dcc1be119071f034f3123d3c618d2ef70c80125e Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Mon, 13 Mar 2023 12:27:14 +0000
Subject: mm: prefer xxx_page() alloc/free functions for order-0 pages

Update instances of alloc_pages(..., 0), __get_free_pages(..., 0) and
__free_pages(..., 0) to use alloc_page(), __get_free_page() and
__free_page() respectively in core code.

Link: https://lkml.kernel.org/r/50c48ca4789f1da2a65795f2346f5ae3eff7d665.1678710232.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/pgalloc.h | 4 ++--
 mm/debug_vm_pgtable.c         | 4 ++--
 mm/hugetlb_vmemmap.c          | 2 +-
 mm/mmu_gather.c               | 2 +-
 mm/page_alloc.c               | 2 +-
 mm/vmalloc.c                  | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 977bea16cf1b..a7cf825befae 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -123,11 +123,11 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
-	page = alloc_pages(gfp, 0);
+	page = alloc_page(gfp);
 	if (!page)
 		return NULL;
 	if (!pgtable_pmd_page_ctor(page)) {
-		__free_pages(page, 0);
+		__free_page(page);
 		return NULL;
 	}
 	return (pmd_t *)page_address(page);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 7887cc2b75bf..4362021b1ce7 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1048,7 +1048,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)
 
 	if (args->pte_pfn != ULONG_MAX) {
 		page = pfn_to_page(args->pte_pfn);
-		__free_pages(page, 0);
+		__free_page(page);
 
 		args->pte_pfn = ULONG_MAX;
 	}
@@ -1290,7 +1290,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 		}
 	}
 
-	page = alloc_pages(GFP_KERNEL, 0);
+	page = alloc_page(GFP_KERNEL);
 	if (page)
 		args->pte_pfn = page_to_pfn(page);
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index a15cc56cf70a..1198064f80eb 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -400,7 +400,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 	return 0;
 out:
 	list_for_each_entry_safe(page, next, list, lru)
-		__free_pages(page, 0);
+		__free_page(page);
 	return -ENOMEM;
 }
 
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2b93cf6ac9ae..ea9683e12936 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
 	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
 		return false;
 
-	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+	batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 	if (!batch)
 		return false;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3737f9d58f5f..0936bde1d486 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5558,7 +5558,7 @@ EXPORT_SYMBOL(__get_free_pages);
 
 unsigned long get_zeroed_page(gfp_t gfp_mask)
 {
-	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+	return __get_free_page(gfp_mask | __GFP_ZERO);
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5e60e9792cbf..978194dc2bb8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2739,7 +2739,7 @@ void vfree(const void *addr)
 		 * High-order allocs for huge vmallocs are split, so
 		 * can be freed as an array of order-0 allocations
 		 */
-		__free_pages(page, 0);
+		__free_page(page);
 		cond_resched();
 	}
 	atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
-- 
cgit v1.2.3


From 42c9db39704839eeb77b27db4c1d57bfa2a54a5b Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 13 Mar 2023 19:28:12 +0800
Subject: mm: vmscan: add a map_nr_max field to shrinker_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "make slab shrink lockless", v5.

This patch series aims to make slab shrink lockless.

1. Background
=============

On our servers, we often find the following system cpu hotspots:

  52.22% [kernel]        [k] down_read_trylock
  19.60% [kernel]        [k] up_read
   8.86% [kernel]        [k] shrink_slab
   2.44% [kernel]        [k] idr_find
   1.25% [kernel]        [k] count_shadow_nodes
   1.18% [kernel]        [k] shrink lruvec
   0.71% [kernel]        [k] mem_cgroup_iter
   0.71% [kernel]        [k] shrink_node
   0.55% [kernel]        [k] find_next_bit

And we used bpftrace to capture its calltrace as follows:

@[
    down_read_trylock+1
    shrink_slab+128
    shrink_node+371
    do_try_to_free_pages+232
    try_to_free_pages+243
    _alloc_pages_slowpath+771
    _alloc_pages_nodemask+702
    pagecache_get_page+255
    filemap_fault+1361
    ext4_filemap_fault+44
    __do_fault+76
    handle_mm_fault+3543
    do_user_addr_fault+442
    do_page_fault+48
    page_fault+62
]: 1161690
@[
    down_read_trylock+1
    shrink_slab+128
    shrink_node+371
    balance_pgdat+690
    kswapd+389
    kthread+246
    ret_from_fork+31
]: 8424884
@[
    down_read_trylock+1
    shrink_slab+128
    shrink_node+371
    do_try_to_free_pages+232
    try_to_free_pages+243
    __alloc_pages_slowpath+771
    __alloc_pages_nodemask+702
    __do_page_cache_readahead+244
    filemap_fault+1674
    ext4_filemap_fault+44
    __do_fault+76
    handle_mm_fault+3543
    do_user_addr_fault+442
    do_page_fault+48
    page_fault+62
]: 20917631

We can see that down_read_trylock() of shrinker_rwsem is being called with
high frequency at that time.  Because of the poor multicore scalability of
atomic operations, this can lead to a significant drop in IPC
(instructions per cycle).

And more, the shrinker_rwsem is a global read-write lock in shrinkers
subsystem, which protects most operations such as slab shrink,
registration and unregistration of shrinkers, etc.  This can easily cause
problems in the following cases.

1) When the memory pressure is high and there are many filesystems
   mounted or unmounted at the same time, slab shrink will be affected
   (down_read_trylock() failed).

   Such as the real workload mentioned by Kirill Tkhai:

   ```
   One of the real workloads from my experience is start of an
   overcommitted node containing many starting containers after node crash
   (or many resuming containers after reboot for kernel update).  In these
   cases memory pressure is huge, and the node goes round in long reclaim.
   ```

2) If a shrinker is blocked (such as the case mentioned in [1]) and a
   writer comes in (such as mount a fs), then this writer will be blocked
   and cause all subsequent shrinker-related operations to be blocked.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/

All the above cases can be solved by replacing the shrinker_rwsem trylocks
with SRCU.

2. Survey
=========

Before doing the code implementation, I found that there were many similar
submissions in the community:

a. Davidlohr Bueso submitted a patch in 2015.
   Subject: [PATCH -next v2] mm: srcu-ify shrinkers
   Link: https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
   Result: It was finally merged into the linux-next branch,
           but failed on arm allnoconfig (without CONFIG_SRCU)

b. Tetsuo Handa submitted a patchset in 2017.
   Subject: [PATCH 1/2] mm,vmscan: Kill global shrinker lock.
   Link: https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
   Result: Finally chose to use the current simple way (break
           when rwsem_is_contended()).  And Christoph Hellwig suggested to
           using SRCU, but SRCU was not unconditionally enabled at the
           time.

c. Kirill Tkhai submitted a patchset in 2018.
   Subject: [PATCH RFC 00/10] Introduce lockless shrink_slab()
   Link: https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
   Result: At that time, SRCU was not unconditionally enabled,
           and there were some objections to enabling SRCU.  Later,
           because Kirill's focus was moved to other things, this patchset
           was not continued to be updated.

d. Sultan Alsawaf submitted a patch in 2021.
   Subject: [PATCH] mm: vmscan: Replace shrinker_rwsem trylocks with SRCU protection
   Link: https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
   Result: Rejected because SRCU was not unconditionally enabled.

We can find that almost all these historical commits were abandoned
because SRCU was not unconditionally enabled.  But now SRCU has been
unconditionally enable by Paul E.  McKenney in 2023 [2], so it's time to
replace shrinker_rwsem trylocks with SRCU.

[2] https://lore.kernel.org/lkml/20230105003759.GA1769545@paulmck-ThinkPad-P17-Gen-1/

3. Reproduction and testing
===========================

We can reproduce the down_read_trylock() hotspot through the following script:

```
#!/bin/bash

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    mkdir -p /sys/fs/cgroup/perf_event/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
            dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
      do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run test and touch commands.  Then we can use
the following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  32.31%  [kernel]           [k] down_read_trylock
  19.40%  [kernel]           [k] pv_native_safe_halt
  16.24%  [kernel]           [k] up_read
  15.70%  [kernel]           [k] shrink_slab
   4.69%  [kernel]           [k] _find_next_bit
   2.62%  [kernel]           [k] shrink_node
   1.78%  [kernel]           [k] shrink_lruvec
   0.76%  [kernel]           [k] do_shrink_slab

2) After applying this patchset:

  27.83%  [kernel]           [k] _find_next_bit
  16.97%  [kernel]           [k] shrink_slab
  15.82%  [kernel]           [k] pv_native_safe_halt
   9.58%  [kernel]           [k] shrink_node
   8.31%  [kernel]           [k] shrink_lruvec
   5.64%  [kernel]           [k] do_shrink_slab
   3.88%  [kernel]           [k] mem_cgroup_iter

At the same time, we use the following perf command to capture IPC
information:

perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10

1) Before applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      454187219766      cycles                    test                    ( +-  1.84% )
       78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )

        10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )

2) After applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
      527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)

          10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )

We can see that IPC drops very seriously when calling down_read_trylock()
at high frequency.  After using SRCU, the IPC is at a normal level.


This patch (of 8):

To prepare for the subsequent lockless memcg slab shrink, add a map_nr_max
field to struct shrinker_info to records its own real shrinker_nr_max.

Link: https://lkml.kernel.org/r/20230313112819.38938-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230313112819.38938-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Suggested-by: Kirill Tkhai <tkhai@ya.ru>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill Tkhai <tkhai@ya.ru>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 +
 mm/vmscan.c                | 35 ++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6eda2ab205d..aa69ea98e2d8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -97,6 +97,7 @@ struct shrinker_info {
 	struct rcu_head rcu;
 	atomic_long_t *nr_deferred;
 	unsigned long *map;
+	int map_nr_max;
 };
 
 struct lruvec_stats_percpu {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9414226218f0..9a2a6301052c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -226,7 +226,8 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 				    int map_size, int defer_size,
-				    int old_map_size, int old_defer_size)
+				    int old_map_size, int old_defer_size,
+				    int new_nr_max)
 {
 	struct shrinker_info *new, *old;
 	struct mem_cgroup_per_node *pn;
@@ -240,12 +241,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 		if (!old)
 			return 0;
 
+		/* Already expanded this shrinker_info */
+		if (new_nr_max <= old->map_nr_max)
+			continue;
+
 		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 		if (!new)
 			return -ENOMEM;
 
 		new->nr_deferred = (atomic_long_t *)(new + 1);
 		new->map = (void *)new->nr_deferred + defer_size;
+		new->map_nr_max = new_nr_max;
 
 		/* map: set all old bits, clear all new bits */
 		memset(new->map, (int)0xff, old_map_size);
@@ -295,6 +301,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
 		}
 		info->nr_deferred = (atomic_long_t *)(info + 1);
 		info->map = (void *)info->nr_deferred + defer_size;
+		info->map_nr_max = shrinker_nr_max;
 		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
 	}
 	up_write(&shrinker_rwsem);
@@ -302,23 +309,14 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
 	return ret;
 }
 
-static inline bool need_expand(int nr_max)
-{
-	return round_up(nr_max, BITS_PER_LONG) >
-	       round_up(shrinker_nr_max, BITS_PER_LONG);
-}
-
 static int expand_shrinker_info(int new_id)
 {
 	int ret = 0;
-	int new_nr_max = new_id + 1;
+	int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
 	int map_size, defer_size = 0;
 	int old_map_size, old_defer_size = 0;
 	struct mem_cgroup *memcg;
 
-	if (!need_expand(new_nr_max))
-		goto out;
-
 	if (!root_mem_cgroup)
 		goto out;
 
@@ -332,7 +330,8 @@ static int expand_shrinker_info(int new_id)
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
-					       old_map_size, old_defer_size);
+					       old_map_size, old_defer_size,
+					       new_nr_max);
 		if (ret) {
 			mem_cgroup_iter_break(NULL, memcg);
 			goto out;
@@ -352,9 +351,11 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 
 		rcu_read_lock();
 		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
-		/* Pairs with smp mb in shrink_slab() */
-		smp_mb__before_atomic();
-		set_bit(shrinker_id, info->map);
+		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
+			/* Pairs with smp mb in shrink_slab() */
+			smp_mb__before_atomic();
+			set_bit(shrinker_id, info->map);
+		}
 		rcu_read_unlock();
 	}
 }
@@ -432,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 	for_each_node(nid) {
 		child_info = shrinker_info_protected(memcg, nid);
 		parent_info = shrinker_info_protected(parent, nid);
-		for (i = 0; i < shrinker_nr_max; i++) {
+		for (i = 0; i < child_info->map_nr_max; i++) {
 			nr = atomic_long_read(&child_info->nr_deferred[i]);
 			atomic_long_add(nr, &parent_info->nr_deferred[i]);
 		}
@@ -899,7 +900,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, info->map, shrinker_nr_max) {
+	for_each_set_bit(i, info->map, info->map_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
-- 
cgit v1.2.3


From 3c556d2425b04054e22045d4ef7d34f163b7a71a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 15 Mar 2023 13:16:42 -0400
Subject: mm/thp: rename TRANSPARENT_HUGEPAGE_NEVER_DAX to _UNSUPPORTED

TRANSPARENT_HUGEPAGE_NEVER_DAX has nothing to do with DAX.  It's set when
has_transparent_hugepage() returns false, checked in hugepage_vma_check()
and will disable THP completely if false.  Rename it to
TRANSPARENT_HUGEPAGE_UNSUPPORTED to reflect its real purpose.

[peterx@redhat.com: fix comment, per David]
  Link: https://lkml.kernel.org/r/ZBMzQW674oHQJV7F@x1n
Link: https://lkml.kernel.org/r/20230315171642.1244625-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 2 +-
 mm/huge_memory.c        | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 70bd867eba94..9a3a3af2dd80 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -79,7 +79,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
 }
 
 enum transparent_hugepage_flag {
-	TRANSPARENT_HUGEPAGE_NEVER_DAX,
+	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c1df321ad64..cbf095e7f059 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -88,7 +88,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 	/*
 	 * If the hardware/firmware marked hugepage support disabled.
 	 */
-	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
 		return false;
 
 	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -460,11 +460,7 @@ static int __init hugepage_init(void)
 	struct kobject *hugepage_kobj;
 
 	if (!has_transparent_hugepage()) {
-		/*
-		 * Hardware doesn't support hugepages, hence disable
-		 * DAX PMD support.
-		 */
-		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
+		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From d288a162dd1c73507da582966f17dd226e34a0c0 Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Thu, 23 Mar 2023 21:55:29 +0100
Subject: net: dst: Prevent false sharing vs. dst_entry:: __refcnt

dst_entry::__refcnt is highly contended in scenarios where many connections
happen from and to the same IP. The reference count is an atomic_t, so the
reference count operations have to take the cache-line exclusive.

Aside of the unavoidable reference count contention there is another
significant problem which is caused by that: False sharing.

perf top identified two affected read accesses. dst_entry::lwtstate and
rtable::rt_genid.

dst_entry:__refcnt is located at offset 64 of dst_entry, which puts it into
a seperate cacheline vs. the read mostly members located at the beginning
of the struct.

That prevents false sharing vs. the struct members in the first 64
bytes of the structure, but there is also

  dst_entry::lwtstate

which is located after the reference count and in the same cache line. This
member is read after a reference count has been acquired.

struct rtable embeds a struct dst_entry at offset 0. struct dst_entry has a
size of 112 bytes, which means that the struct members of rtable which
follow the dst member share the same cache line as dst_entry::__refcnt.
Especially

  rtable::rt_genid

is also read by the contexts which have a reference count acquired
already.

When dst_entry:__refcnt is incremented or decremented via an atomic
operation these read accesses stall. This was found when analysing the
memtier benchmark in 1:100 mode, which amplifies the problem extremly.

Move the rt[6i]_uncached[_list] members out of struct rtable and struct
rt6_info into struct dst_entry to provide padding and move the lwtstate
member after that so it ends up in the same cache line.

The resulting improvement depends on the micro-architecture and the number
of CPUs. It ranges from +20% to +120% with a localhost memtier/memcached
benchmark.

[ tglx: Rearrange struct ]

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230323102800.042297517@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h       | 15 ++++++++++++++-
 include/net/ip6_fib.h   |  3 ---
 include/net/ip6_route.h |  2 +-
 include/net/route.h     |  3 ---
 net/ipv4/route.c        | 20 ++++++++++----------
 net/ipv4/xfrm4_policy.c |  4 ++--
 net/ipv6/route.c        | 26 +++++++++++++-------------
 net/ipv6/xfrm6_policy.c |  4 ++--
 8 files changed, 42 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index d67fda89cd0f..81f2279ea911 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -69,15 +69,28 @@ struct dst_entry {
 #endif
 	int			__use;
 	unsigned long		lastuse;
-	struct lwtunnel_state   *lwtstate;
 	struct rcu_head		rcu_head;
 	short			error;
 	short			__pad;
 	__u32			tclassid;
 #ifndef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
 	atomic_t		__refcnt;	/* 32-bit offset 64 */
 #endif
 	netdevice_tracker	dev_tracker;
+
+	/*
+	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
+	 * line on 64bit so that lwtstate does not cause false sharing with
+	 * __refcnt under contention of __refcnt. This also puts the
+	 * frequently accessed members of rtable and rt6_info out of the
+	 * __refcnt cache line.
+	 */
+	struct list_head	rt_uncached;
+	struct uncached_list	*rt_uncached_list;
+#ifdef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
+#endif
 };
 
 struct dst_metrics {
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6268963d9599..79570cb4ea9c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -217,9 +217,6 @@ struct rt6_info {
 	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
 
-	struct list_head		rt6i_uncached;
-	struct uncached_list		*rt6i_uncached_list;
-
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 };
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 81ee387a1fc4..3556595ce59a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -100,7 +100,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
 static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
 {
 	if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
-	    !list_empty(&rt->rt6i_uncached))
+	    !list_empty(&rt->dst.rt_uncached))
 		ip6_rt_put(rt);
 }
 
diff --git a/include/net/route.h b/include/net/route.h
index fe00b0a2e475..bcc367cf3aa2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -78,9 +78,6 @@ struct rtable {
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
 				rt_pmtu:31;
-
-	struct list_head	rt_uncached;
-	struct uncached_list	*rt_uncached_list;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a0a0bb452e9..2a3d14d95ada 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1508,20 +1508,20 @@ void rt_add_uncached_list(struct rtable *rt)
 {
 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
 
-	rt->rt_uncached_list = ul;
+	rt->dst.rt_uncached_list = ul;
 
 	spin_lock_bh(&ul->lock);
-	list_add_tail(&rt->rt_uncached, &ul->head);
+	list_add_tail(&rt->dst.rt_uncached, &ul->head);
 	spin_unlock_bh(&ul->lock);
 }
 
 void rt_del_uncached_list(struct rtable *rt)
 {
-	if (!list_empty(&rt->rt_uncached)) {
-		struct uncached_list *ul = rt->rt_uncached_list;
+	if (!list_empty(&rt->dst.rt_uncached)) {
+		struct uncached_list *ul = rt->dst.rt_uncached_list;
 
 		spin_lock_bh(&ul->lock);
-		list_del_init(&rt->rt_uncached);
+		list_del_init(&rt->dst.rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
 }
@@ -1546,13 +1546,13 @@ void rt_flush_dev(struct net_device *dev)
 			continue;
 
 		spin_lock_bh(&ul->lock);
-		list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
+		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			if (rt->dst.dev != dev)
 				continue;
 			rt->dst.dev = blackhole_netdev;
 			netdev_ref_replace(dev, blackhole_netdev,
 					   &rt->dst.dev_tracker, GFP_ATOMIC);
-			list_move(&rt->rt_uncached, &ul->quarantine);
+			list_move(&rt->dst.rt_uncached, &ul->quarantine);
 		}
 		spin_unlock_bh(&ul->lock);
 	}
@@ -1644,7 +1644,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_uses_gateway = 0;
 		rt->rt_gw_family = 0;
 		rt->rt_gw4 = 0;
-		INIT_LIST_HEAD(&rt->rt_uncached);
+		INIT_LIST_HEAD(&rt->dst.rt_uncached);
 
 		rt->dst.output = ip_output;
 		if (flags & RTCF_LOCAL)
@@ -1675,7 +1675,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
 			new_rt->rt_gw4 = rt->rt_gw4;
 		else if (rt->rt_gw_family == AF_INET6)
 			new_rt->rt_gw6 = rt->rt_gw6;
-		INIT_LIST_HEAD(&new_rt->rt_uncached);
+		INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
 
 		new_rt->dst.input = rt->dst.input;
 		new_rt->dst.output = rt->dst.output;
@@ -2859,7 +2859,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		else if (rt->rt_gw_family == AF_INET6)
 			rt->rt_gw6 = ort->rt_gw6;
 
-		INIT_LIST_HEAD(&rt->rt_uncached);
+		INIT_LIST_HEAD(&rt->dst.rt_uncached);
 	}
 
 	dst_release(dst_orig);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3d0dfa6cf9f9..47861c8b7340 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 		xdst->u.rt.rt_gw6 = rt->rt_gw6;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+	INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
 	rt_add_uncached_list(&xdst->u.rt);
 
 	return 0;
@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
 	dst_destroy_metrics_generic(dst);
-	if (xdst->u.rt.rt_uncached_list)
+	if (xdst->u.rt.dst.rt_uncached_list)
 		rt_del_uncached_list(&xdst->u.rt);
 	xfrm_dst_destroy(xdst);
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 244df77fac87..12e163dec34e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt)
 {
 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 
-	rt->rt6i_uncached_list = ul;
+	rt->dst.rt_uncached_list = ul;
 
 	spin_lock_bh(&ul->lock);
-	list_add_tail(&rt->rt6i_uncached, &ul->head);
+	list_add_tail(&rt->dst.rt_uncached, &ul->head);
 	spin_unlock_bh(&ul->lock);
 }
 
 void rt6_uncached_list_del(struct rt6_info *rt)
 {
-	if (!list_empty(&rt->rt6i_uncached)) {
-		struct uncached_list *ul = rt->rt6i_uncached_list;
+	if (!list_empty(&rt->dst.rt_uncached)) {
+		struct uncached_list *ul = rt->dst.rt_uncached_list;
 
 		spin_lock_bh(&ul->lock);
-		list_del_init(&rt->rt6i_uncached);
+		list_del_init(&rt->dst.rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
 }
@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
 			continue;
 
 		spin_lock_bh(&ul->lock);
-		list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
+		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			struct inet6_dev *rt_idev = rt->rt6i_idev;
 			struct net_device *rt_dev = rt->dst.dev;
 			bool handled = false;
@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
 				handled = true;
 			}
 			if (handled)
-				list_move(&rt->rt6i_uncached,
+				list_move(&rt->dst.rt_uncached,
 					  &ul->quarantine);
 		}
 		spin_unlock_bh(&ul->lock);
@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 static void rt6_info_init(struct rt6_info *rt)
 {
 	memset_after(rt, 0, dst);
-	INIT_LIST_HEAD(&rt->rt6i_uncached);
+	INIT_LIST_HEAD(&rt->dst.rt_uncached);
 }
 
 /* allocate dst with ip6_dst_ops */
@@ -2638,7 +2638,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
 	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 	rt6 = (struct rt6_info *)dst;
 	/* For dst cached in uncached_list, refcnt is already taken. */
-	if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
 		dst = &net->ipv6.ip6_null_entry->dst;
 		dst_hold(dst);
 	}
@@ -2748,7 +2748,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
 	from = rcu_dereference(rt->from);
 
 	if (from && (rt->rt6i_flags & RTF_PCPU ||
-	    unlikely(!list_empty(&rt->rt6i_uncached))))
+	    unlikely(!list_empty(&rt->dst.rt_uncached))))
 		dst_ret = rt6_dst_from_check(rt, from, cookie);
 	else
 		dst_ret = rt6_check(rt, from, cookie);
@@ -6477,7 +6477,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	net->ipv6.fib6_has_custom_rules = false;
@@ -6489,7 +6489,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
 
 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -6499,7 +6499,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
 #ifdef CONFIG_IPV6_SUBTREES
 	net->ipv6.fib6_routes_require_src = 0;
 #endif
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ea435eba3053..2b493f8d0091 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
 	xdst->u.rt6.rt6i_src = rt->rt6i_src;
-	INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
+	INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
 	rt6_uncached_list_add(&xdst->u.rt6);
 
 	return 0;
@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
 	if (likely(xdst->u.rt6.rt6i_idev))
 		in6_dev_put(xdst->u.rt6.rt6i_idev);
 	dst_destroy_metrics_generic(dst);
-	if (xdst->u.rt6.rt6i_uncached_list)
+	if (xdst->u.rt6.dst.rt_uncached_list)
 		rt6_uncached_list_del(&xdst->u.rt6);
 	xfrm_dst_destroy(xdst);
 }
-- 
cgit v1.2.3


From bc9d3a9f2afca189a6ae40225b6985e3c775375e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:32 +0100
Subject: net: dst: Switch to rcuref_t reference counting

Under high contention dst_entry::__refcnt becomes a significant bottleneck.

atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
high retry rates on contention.

Switch the reference count to rcuref_t which results in a significant
performance gain. Rename the reference count member to __rcuref to reflect
the change.

The gain depends on the micro-architecture and the number of concurrent
operations and has been measured in the range of +25% to +130% with a
localhost memtier/memcached benchmark which amplifies the problem
massively.

Running the memtier/memcached benchmark over a real (1Gb) network
connection the conversion on top of the false sharing fix for struct
dst_entry::__refcnt results in a total gain in the 2%-5% range over the
upstream baseline.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h               | 19 ++++++++++---------
 include/net/sock.h              |  2 +-
 net/bridge/br_nf_core.c         |  2 +-
 net/core/dst.c                  | 26 +++++---------------------
 net/core/rtnetlink.c            |  2 +-
 net/ipv6/route.c                |  6 +++---
 net/netfilter/ipvs/ip_vs_xmit.c |  4 ++--
 7 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 81f2279ea911..78884429deed 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -16,6 +16,7 @@
 #include <linux/bug.h>
 #include <linux/jiffies.h>
 #include <linux/refcount.h>
+#include <linux/rcuref.h>
 #include <net/neighbour.h>
 #include <asm/processor.h>
 #include <linux/indirect_call_wrapper.h>
@@ -61,11 +62,11 @@ struct dst_entry {
 	unsigned short		trailer_len;	/* space to reserve at tail */
 
 	/*
-	 * __refcnt wants to be on a different cache line from
+	 * __rcuref wants to be on a different cache line from
 	 * input/output/ops or performance tanks badly
 	 */
 #ifdef CONFIG_64BIT
-	atomic_t		__refcnt;	/* 64-bit offset 64 */
+	rcuref_t		__rcuref;	/* 64-bit offset 64 */
 #endif
 	int			__use;
 	unsigned long		lastuse;
@@ -75,16 +76,16 @@ struct dst_entry {
 	__u32			tclassid;
 #ifndef CONFIG_64BIT
 	struct lwtunnel_state   *lwtstate;
-	atomic_t		__refcnt;	/* 32-bit offset 64 */
+	rcuref_t		__rcuref;	/* 32-bit offset 64 */
 #endif
 	netdevice_tracker	dev_tracker;
 
 	/*
 	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
 	 * line on 64bit so that lwtstate does not cause false sharing with
-	 * __refcnt under contention of __refcnt. This also puts the
+	 * __rcuref under contention of __rcuref. This also puts the
 	 * frequently accessed members of rtable and rt6_info out of the
-	 * __refcnt cache line.
+	 * __rcuref cache line.
 	 */
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst)
 {
 	/*
 	 * If your kernel compilation stops here, please check
-	 * the placement of __refcnt in struct dst_entry
+	 * the placement of __rcuref in struct dst_entry
 	 */
-	BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
-	WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
+	BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
+	WARN_ON(!rcuref_get(&dst->__rcuref));
 }
 
 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
  */
 static inline bool dst_hold_safe(struct dst_entry *dst)
 {
-	return atomic_inc_not_zero(&dst->__refcnt);
+	return rcuref_get(&dst->__rcuref);
 }
 
 /**
diff --git a/include/net/sock.h b/include/net/sock.h
index 573f2bf7e0de..5edf0038867c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2131,7 +2131,7 @@ sk_dst_get(struct sock *sk)
 
 	rcu_read_lock();
 	dst = rcu_dereference(sk->sk_dst_cache);
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+	if (dst && !rcuref_get(&dst->__rcuref))
 		dst = NULL;
 	rcu_read_unlock();
 	return dst;
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 8c69f0c95a8e..98aea5485aae 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 {
 	struct rtable *rt = &br->fake_rtable;
 
-	atomic_set(&rt->dst.__refcnt, 1);
+	rcuref_init(&rt->dst.__rcuref, 1);
 	rt->dst.dev = br->dev;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
 	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
diff --git a/net/core/dst.c b/net/core/dst.c
index 31c08a3386d3..3247e84045ca 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->tclassid = 0;
 #endif
 	dst->lwtstate = NULL;
-	atomic_set(&dst->__refcnt, initial_ref);
+	rcuref_init(&dst->__rcuref, initial_ref);
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
@@ -162,31 +162,15 @@ EXPORT_SYMBOL(dst_dev_put);
 
 void dst_release(struct dst_entry *dst)
 {
-	if (dst) {
-		int newrefcnt;
-
-		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
-			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-					     __func__, dst, newrefcnt);
-		if (!newrefcnt)
-			call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
-	}
+	if (dst && rcuref_put(&dst->__rcuref))
+		call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
 }
 EXPORT_SYMBOL(dst_release);
 
 void dst_release_immediate(struct dst_entry *dst)
 {
-	if (dst) {
-		int newrefcnt;
-
-		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
-			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-					     __func__, dst, newrefcnt);
-		if (!newrefcnt)
-			dst_destroy(dst);
-	}
+	if (dst && rcuref_put(&dst->__rcuref))
+		dst_destroy(dst);
 }
 EXPORT_SYMBOL(dst_release_immediate);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b7b1661d0d56..906aebdc566b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -843,7 +843,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 	if (dst) {
 		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
 		ci.rta_used = dst->__use;
-		ci.rta_clntref = atomic_read(&dst->__refcnt);
+		ci.rta_clntref = rcuref_read(&dst->__rcuref);
 	}
 	if (expires) {
 		unsigned long clock;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 12e163dec34e..35085fc0cf15 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = {
 
 static const struct rt6_info ip6_null_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -ENETUNREACH,
@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = {
 
 static const struct rt6_info ip6_prohibit_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -EACCES,
@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 
 static const struct rt6_info ip6_blk_hole_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -EINVAL,
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 80448885c3d7..99c349c0d968 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
 				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
-				  atomic_read(&rt->dst.__refcnt));
+				  rcuref_read(&rt->dst.__rcuref));
 		}
 		if (ret_saddr)
 			*ret_saddr = dest_dst->dst_saddr.ip;
@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
 				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
-				  atomic_read(&rt->dst.__refcnt));
+				  rcuref_read(&rt->dst.__rcuref));
 		}
 		if (ret_saddr)
 			*ret_saddr = dest_dst->dst_saddr.in6;
-- 
cgit v1.2.3


From 75a2d4226b53710380d1017b3f4c88f937ddba78 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 25 Mar 2023 09:45:37 +0100
Subject: driver core: class: mark the struct class for sysfs callbacks as
 constant

struct class should never be modified in a sysfs callback as there is
nothing in the structure to modify, and frankly, the structure is almost
never used in a sysfs callback, so mark it as constant to allow struct
class to be moved to read-only memory.

While we are touching all class sysfs callbacks also mark the attribute
as constant as it can not be modified.  The bonding code still uses this
structure so it can not be removed from the function callbacks.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Russ Weight <russell.h.weight@intel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steve French <sfrench@samba.org>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-cifs@vger.kernel.org
Cc: linux-gpio@vger.kernel.org
Cc: linux-mtd@lists.infradead.org
Cc: linux-rdma@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: netdev@vger.kernel.org
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20230325084537.3622280-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/platforms/pseries/dlpar.c    |  4 ++--
 arch/powerpc/platforms/pseries/mobility.c |  4 ++--
 drivers/base/class.c                      |  4 ++--
 drivers/base/devcoredump.c                |  4 ++--
 drivers/base/firmware_loader/sysfs.c      |  4 ++--
 drivers/block/pktcdvd.c                   |  6 +++---
 drivers/block/zram/zram_drv.c             | 11 +++++------
 drivers/gpio/gpiolib-sysfs.c              |  8 ++++----
 drivers/infiniband/core/user_mad.c        |  4 ++--
 drivers/mtd/ubi/build.c                   |  2 +-
 drivers/net/bonding/bond_sysfs.c          | 18 +++++++++---------
 drivers/s390/crypto/zcrypt_api.c          |  8 ++++----
 fs/ksmbd/server.c                         | 10 +++++-----
 include/linux/device/class.h              |  9 +++++----
 14 files changed, 48 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 75ffdbcd2865..719c97a155ed 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -512,7 +512,7 @@ static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog)
 	return 0;
 }
 
-static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
+static ssize_t dlpar_store(const struct class *class, const struct class_attribute *attr,
 			   const char *buf, size_t count)
 {
 	struct pseries_hp_errorlog hp_elog;
@@ -551,7 +551,7 @@ dlpar_store_out:
 	return rc ? rc : count;
 }
 
-static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,
+static ssize_t dlpar_show(const struct class *class, const struct class_attribute *attr,
 			  char *buf)
 {
 	return sprintf(buf, "%s\n", "memory,cpu");
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 643d309d1bd0..6b25642adfa0 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -787,8 +787,8 @@ int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
 	return pseries_migrate_partition(handle);
 }
 
-static ssize_t migration_store(struct class *class,
-			       struct class_attribute *attr, const char *buf,
+static ssize_t migration_store(const struct class *class,
+			       const struct class_attribute *attr, const char *buf,
 			       size_t count)
 {
 	u64 streamid;
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 8ae91e118827..04de20e0dba8 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -482,8 +482,8 @@ void class_interface_unregister(struct class_interface *class_intf)
 }
 EXPORT_SYMBOL_GPL(class_interface_unregister);
 
-ssize_t show_class_attr_string(struct class *class,
-			       struct class_attribute *attr, char *buf)
+ssize_t show_class_attr_string(const struct class *class,
+			       const struct class_attribute *attr, char *buf)
 {
 	struct class_attribute_string *cs;
 
diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 59aaf2e1375a..91536ee05f14 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -167,7 +167,7 @@ static int devcd_free(struct device *dev, void *data)
 	return 0;
 }
 
-static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
+static ssize_t disabled_show(const struct class *class, const struct class_attribute *attr,
 			     char *buf)
 {
 	return sysfs_emit(buf, "%d\n", devcd_disabled);
@@ -197,7 +197,7 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
  * so, above situation would not occur.
  */
 
-static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
+static ssize_t disabled_store(const struct class *class, const struct class_attribute *attr,
 			      const char *buf, size_t count)
 {
 	long tmp = simple_strtol(buf, NULL, 10);
diff --git a/drivers/base/firmware_loader/sysfs.c b/drivers/base/firmware_loader/sysfs.c
index 56911d75b90a..c9c93b47d9a5 100644
--- a/drivers/base/firmware_loader/sysfs.c
+++ b/drivers/base/firmware_loader/sysfs.c
@@ -25,7 +25,7 @@ void __fw_load_abort(struct fw_priv *fw_priv)
 }
 
 #ifdef CONFIG_FW_LOADER_USER_HELPER
-static ssize_t timeout_show(struct class *class, struct class_attribute *attr,
+static ssize_t timeout_show(const struct class *class, const struct class_attribute *attr,
 			    char *buf)
 {
 	return sysfs_emit(buf, "%d\n", __firmware_loading_timeout());
@@ -44,7 +44,7 @@ static ssize_t timeout_show(struct class *class, struct class_attribute *attr,
  *
  *	Note: zero means 'wait forever'.
  **/
-static ssize_t timeout_store(struct class *class, struct class_attribute *attr,
+static ssize_t timeout_store(const struct class *class, const struct class_attribute *attr,
 			     const char *buf, size_t count)
 {
 	int tmp_loading_timeout = simple_strtol(buf, NULL, 10);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 642e3377441a..ba9bbdef9ef5 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -343,7 +343,7 @@ static void class_pktcdvd_release(struct class *cls)
 	kfree(cls);
 }
 
-static ssize_t device_map_show(struct class *c, struct class_attribute *attr,
+static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr,
 			       char *data)
 {
 	int n = 0;
@@ -364,7 +364,7 @@ static ssize_t device_map_show(struct class *c, struct class_attribute *attr,
 }
 static CLASS_ATTR_RO(device_map);
 
-static ssize_t add_store(struct class *c, struct class_attribute *attr,
+static ssize_t add_store(const struct class *c, const struct class_attribute *attr,
 			 const char *buf, size_t count)
 {
 	unsigned int major, minor;
@@ -385,7 +385,7 @@ static ssize_t add_store(struct class *c, struct class_attribute *attr,
 }
 static CLASS_ATTR_WO(add);
 
-static ssize_t remove_store(struct class *c, struct class_attribute *attr,
+static ssize_t remove_store(const struct class *c, const struct class_attribute *attr,
 			    const char *buf, size_t count)
 {
 	unsigned int major, minor;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index b7bb52f8dfbd..3feadfb96114 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2424,8 +2424,8 @@ static int zram_remove(struct zram *zram)
  * creates a new un-initialized zram device and returns back this device's
  * device_id (or an error code if it fails to create a new device).
  */
-static ssize_t hot_add_show(struct class *class,
-			struct class_attribute *attr,
+static ssize_t hot_add_show(const struct class *class,
+			const struct class_attribute *attr,
 			char *buf)
 {
 	int ret;
@@ -2438,11 +2438,10 @@ static ssize_t hot_add_show(struct class *class,
 		return ret;
 	return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
 }
-static struct class_attribute class_attr_hot_add =
-	__ATTR(hot_add, 0400, hot_add_show, NULL);
+static CLASS_ATTR_RO(hot_add);
 
-static ssize_t hot_remove_store(struct class *class,
-			struct class_attribute *attr,
+static ssize_t hot_remove_store(const struct class *class,
+			const struct class_attribute *attr,
 			const char *buf,
 			size_t count)
 {
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index 774755052618..a895915affa5 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -426,8 +426,8 @@ ATTRIBUTE_GROUPS(gpiochip);
  * /sys/class/gpio/unexport ... write-only
  *	integer N ... number of GPIO to unexport
  */
-static ssize_t export_store(struct class *class,
-				struct class_attribute *attr,
+static ssize_t export_store(const struct class *class,
+				const struct class_attribute *attr,
 				const char *buf, size_t len)
 {
 	long			gpio;
@@ -478,8 +478,8 @@ done:
 }
 static CLASS_ATTR_WO(export);
 
-static ssize_t unexport_store(struct class *class,
-				struct class_attribute *attr,
+static ssize_t unexport_store(const struct class *class,
+				const struct class_attribute *attr,
 				const char *buf, size_t len)
 {
 	long			gpio;
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index f83954180a33..0e9e04f8c685 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1229,8 +1229,8 @@ static char *umad_devnode(const struct device *dev, umode_t *mode)
 	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
 }
 
-static ssize_t abi_version_show(struct class *class,
-				struct class_attribute *attr, char *buf)
+static ssize_t abi_version_show(const struct class *class,
+				const struct class_attribute *attr, char *buf)
 {
 	return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
 }
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index ae6d35e3da9c..32105bd35831 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -95,7 +95,7 @@ static DEFINE_SPINLOCK(ubi_devices_lock);
 
 /* "Show" method for files in '/<sysfs>/class/ubi/' */
 /* UBI version attribute ('/<sysfs>/class/ubi/version') */
-static ssize_t version_show(struct class *class, struct class_attribute *attr,
+static ssize_t version_show(const struct class *class, const struct class_attribute *attr,
 			    char *buf)
 {
 	return sprintf(buf, "%d\n", UBI_VERSION);
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 8996bd0a194a..0bb59da24922 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -31,12 +31,12 @@
 /* "show" function for the bond_masters attribute.
  * The class parameter is ignored.
  */
-static ssize_t bonding_show_bonds(struct class *cls,
-				  struct class_attribute *attr,
+static ssize_t bonding_show_bonds(const struct class *cls,
+				  const struct class_attribute *attr,
 				  char *buf)
 {
-	struct bond_net *bn =
-		container_of(attr, struct bond_net, class_attr_bonding_masters);
+	const struct bond_net *bn =
+		container_of_const(attr, struct bond_net, class_attr_bonding_masters);
 	int res = 0;
 	struct bonding *bond;
 
@@ -59,7 +59,7 @@ static ssize_t bonding_show_bonds(struct class *cls,
 	return res;
 }
 
-static struct net_device *bond_get_by_name(struct bond_net *bn, const char *ifname)
+static struct net_device *bond_get_by_name(const struct bond_net *bn, const char *ifname)
 {
 	struct bonding *bond;
 
@@ -75,12 +75,12 @@ static struct net_device *bond_get_by_name(struct bond_net *bn, const char *ifna
  *
  * The class parameter is ignored.
  */
-static ssize_t bonding_store_bonds(struct class *cls,
-				   struct class_attribute *attr,
+static ssize_t bonding_store_bonds(const struct class *cls,
+				   const struct class_attribute *attr,
 				   const char *buffer, size_t count)
 {
-	struct bond_net *bn =
-		container_of(attr, struct bond_net, class_attr_bonding_masters);
+	const struct bond_net *bn =
+		container_of_const(attr, struct bond_net, class_attr_bonding_masters);
 	char command[IFNAMSIZ + 1] = {0, };
 	char *ifname;
 	int rv, res = count;
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 582ac301d315..cff2eea88f98 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -340,8 +340,8 @@ static const struct attribute_group *zcdn_dev_attr_groups[] = {
 	NULL
 };
 
-static ssize_t zcdn_create_store(struct class *class,
-				 struct class_attribute *attr,
+static ssize_t zcdn_create_store(const struct class *class,
+				 const struct class_attribute *attr,
 				 const char *buf, size_t count)
 {
 	int rc;
@@ -357,8 +357,8 @@ static ssize_t zcdn_create_store(struct class *class,
 static const struct class_attribute class_attr_zcdn_create =
 	__ATTR(create, 0600, NULL, zcdn_create_store);
 
-static ssize_t zcdn_destroy_store(struct class *class,
-				  struct class_attribute *attr,
+static ssize_t zcdn_destroy_store(const struct class *class,
+				  const struct class_attribute *attr,
 				  const char *buf, size_t count)
 {
 	int rc;
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index b5af3e43e677..c2c958a5423e 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -418,7 +418,7 @@ int server_queue_ctrl_reset_work(void)
 	return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET);
 }
 
-static ssize_t stats_show(struct class *class, struct class_attribute *attr,
+static ssize_t stats_show(const struct class *class, const struct class_attribute *attr,
 			  char *buf)
 {
 	/*
@@ -437,8 +437,8 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
 			  server_conf.ipc_last_active / HZ);
 }
 
-static ssize_t kill_server_store(struct class *class,
-				 struct class_attribute *attr, const char *buf,
+static ssize_t kill_server_store(const struct class *class,
+				 const struct class_attribute *attr, const char *buf,
 				 size_t len)
 {
 	if (!sysfs_streq(buf, "hard"))
@@ -458,7 +458,7 @@ static const char * const debug_type_strings[] = {"smb", "auth", "vfs",
 						  "oplock", "ipc", "conn",
 						  "rdma"};
 
-static ssize_t debug_show(struct class *class, struct class_attribute *attr,
+static ssize_t debug_show(const struct class *class, const struct class_attribute *attr,
 			  char *buf)
 {
 	ssize_t sz = 0;
@@ -476,7 +476,7 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
 	return sz;
 }
 
-static ssize_t debug_store(struct class *class, struct class_attribute *attr,
+static ssize_t debug_store(const struct class *class, const struct class_attribute *attr,
 			   const char *buf, size_t len)
 {
 	int i;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 73436c578d37..b53728ca56fb 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -174,10 +174,10 @@ static inline struct device *class_find_device_by_acpi_dev(const struct class *c
 
 struct class_attribute {
 	struct attribute attr;
-	ssize_t (*show)(struct class *class, struct class_attribute *attr,
+	ssize_t (*show)(const struct class *class, const struct class_attribute *attr,
 			char *buf);
-	ssize_t (*store)(struct class *class, struct class_attribute *attr,
-			const char *buf, size_t count);
+	ssize_t (*store)(const struct class *class, const struct class_attribute *attr,
+			 const char *buf, size_t count);
 };
 
 #define CLASS_ATTR_RW(_name) \
@@ -217,7 +217,8 @@ struct class_attribute_string {
 	struct class_attribute_string class_attr_##_name = \
 		_CLASS_ATTR_STRING(_name, _mode, _str)
 
-ssize_t show_class_attr_string(struct class *class, struct class_attribute *attr, char *buf);
+ssize_t show_class_attr_string(const struct class *class, const struct class_attribute *attr,
+			       char *buf);
 
 struct class_interface {
 	struct list_head	node;
-- 
cgit v1.2.3


From 130eac4170859fb368681e00d390f20f44bbf27b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 28 Mar 2023 15:10:43 +0200
Subject: xhci: use pm_ptr() instead of #ifdef for CONFIG_PM conditionals

A recent patch caused an unused-function warning in builds with
CONFIG_PM disabled, after the function became marked 'static':

drivers/usb/host/xhci-pci.c:91:13: error: 'xhci_msix_sync_irqs' defined but not used [-Werror=unused-function]
   91 | static void xhci_msix_sync_irqs(struct xhci_hcd *xhci)
      |             ^~~~~~~~~~~~~~~~~~~

This could be solved by adding another #ifdef, but as there is
a trend towards removing CONFIG_PM checks in favor of helper
macros, do the same conversion here and use pm_ptr() to get
either a function pointer or NULL but avoid the warning.

As the hidden functions reference some other symbols, make
sure those are visible at compile time, at the minimal cost of
a few extra bytes for 'struct usb_device'.

Fixes: 9abe15d55dcc ("xhci: Move xhci MSI sync function to to xhci-pci")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20230328131114.1296430-1-arnd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-pci.c | 16 +++++-----------
 include/linux/usb.h         |  3 +--
 include/linux/usb/hcd.h     |  2 --
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index a53ecc8ff8c5..bbbb01282038 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -728,7 +728,6 @@ static void xhci_pci_remove(struct pci_dev *dev)
 	usb_hcd_pci_remove(dev);
 }
 
-#ifdef CONFIG_PM
 /*
  * In some Intel xHCI controllers, in order to get D3 working,
  * through a vendor specific SSIC CONFIG register at offset 0x883c,
@@ -927,7 +926,6 @@ static void xhci_pci_shutdown(struct usb_hcd *hcd)
 	if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
 		pci_set_power_state(pdev, PCI_D3hot);
 }
-#endif /* CONFIG_PM */
 
 /*-------------------------------------------------------------------------*/
 
@@ -970,9 +968,7 @@ static struct pci_driver xhci_pci_driver = {
 
 	.shutdown = 	usb_hcd_pci_shutdown,
 	.driver = {
-#ifdef CONFIG_PM
-		.pm = &usb_hcd_pci_pm_ops,
-#endif
+		.pm = pm_ptr(&usb_hcd_pci_pm_ops),
 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
 	},
 };
@@ -980,12 +976,10 @@ static struct pci_driver xhci_pci_driver = {
 static int __init xhci_pci_init(void)
 {
 	xhci_init_driver(&xhci_pci_hc_driver, &xhci_pci_overrides);
-#ifdef CONFIG_PM
-	xhci_pci_hc_driver.pci_suspend = xhci_pci_suspend;
-	xhci_pci_hc_driver.pci_resume = xhci_pci_resume;
-	xhci_pci_hc_driver.pci_poweroff_late = xhci_pci_poweroff_late;
-	xhci_pci_hc_driver.shutdown = xhci_pci_shutdown;
-#endif
+	xhci_pci_hc_driver.pci_suspend = pm_ptr(xhci_pci_suspend);
+	xhci_pci_hc_driver.pci_resume = pm_ptr(xhci_pci_resume);
+	xhci_pci_hc_driver.pci_poweroff_late = pm_ptr(xhci_pci_poweroff_late);
+	xhci_pci_hc_driver.shutdown = pm_ptr(xhci_pci_shutdown);
 	xhci_pci_hc_driver.stop = xhci_pci_stop;
 	return pci_register_driver(&xhci_pci_driver);
 }
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 9642ee02d713..d510fabcafa2 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -704,13 +704,12 @@ struct usb_device {
 
 	unsigned long active_duration;
 
-#ifdef CONFIG_PM
 	unsigned long connect_time;
 
 	unsigned do_remote_wakeup:1;
 	unsigned reset_resume:1;
 	unsigned port_is_suspended:1;
-#endif
+
 	struct wusb_dev *wusb_dev;
 	int slot_id;
 	struct usb2_lpm_parameters l1_params;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index b51c07111729..094c77eaf455 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -488,9 +488,7 @@ extern void usb_hcd_pci_shutdown(struct pci_dev *dev);
 
 extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev);
 
-#ifdef CONFIG_PM
 extern const struct dev_pm_ops usb_hcd_pci_pm_ops;
-#endif
 #endif /* CONFIG_USB_PCI */
 
 /* pci-ish (pdev null is ok) buffer alloc/mapping support */
-- 
cgit v1.2.3


From d54bd5abf4d26e1b6722238f75e36069ea91def9 Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Wed, 15 Mar 2023 01:16:55 -0700
Subject: RDMA/bnxt_re: Add resize_cq support

Add resize_cq verb support for user space CQs. Resize operation for
kernel CQs are not supported now.

Driver should free the current CQ only after user library polls
for all the completions and switch to new CQ. So after the resize_cq
is returned from the driver, user library polls for existing completions
and store it as temporary data. Once library reaps all completions in the
current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about
the resize_cq completion. Adding a check for user CQs in driver's
poll_cq and complete the resize operation for user CQs.
Updating uverbs_cmd_mask with poll_cq to support this.

Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Link: https://lore.kernel.org/r/1678868215-23626-1-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++
 drivers/infiniband/hw/bnxt_re/ib_verbs.h |   3 +
 drivers/infiniband/hw/bnxt_re/main.c     |   2 +
 drivers/infiniband/hw/bnxt_re/qplib_fp.c |  44 +++++++++++++
 drivers/infiniband/hw/bnxt_re/qplib_fp.h |   5 ++
 include/uapi/rdma/bnxt_re-abi.h          |   4 ++
 6 files changed, 167 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 989edc789633..e86afecfbe46 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -2912,6 +2912,106 @@ fail:
 	return rc;
 }
 
+static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq)
+{
+	struct bnxt_re_dev *rdev = cq->rdev;
+
+	bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq);
+
+	cq->qplib_cq.max_wqe = cq->resize_cqe;
+	if (cq->resize_umem) {
+		ib_umem_release(cq->umem);
+		cq->umem = cq->resize_umem;
+		cq->resize_umem = NULL;
+		cq->resize_cqe = 0;
+	}
+}
+
+int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+	struct bnxt_qplib_sg_info sg_info = {};
+	struct bnxt_qplib_dpi *orig_dpi = NULL;
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_re_ucontext *uctx = NULL;
+	struct bnxt_re_resize_cq_req req;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_cq *cq;
+	int rc, entries;
+
+	cq =  container_of(ibcq, struct bnxt_re_cq, ib_cq);
+	rdev = cq->rdev;
+	dev_attr = &rdev->dev_attr;
+	if (!ibcq->uobject) {
+		ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (cq->resize_umem) {
+		ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy",
+			  cq->qplib_cq.id);
+		return -EBUSY;
+	}
+
+	/* Check the requested cq depth out of supported depth */
+	if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
+		ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - out of range cqe %d",
+			  cq->qplib_cq.id, cqe);
+		return -EINVAL;
+	}
+
+	entries = roundup_pow_of_two(cqe + 1);
+	if (entries > dev_attr->max_cq_wqes + 1)
+		entries = dev_attr->max_cq_wqes + 1;
+
+	uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext,
+					 ib_uctx);
+	/* uverbs consumer */
+	if (ib_copy_from_udata(&req, udata, sizeof(req))) {
+		rc = -EFAULT;
+		goto fail;
+	}
+
+	cq->resize_umem = ib_umem_get(&rdev->ibdev, req.cq_va,
+				      entries * sizeof(struct cq_base),
+				      IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(cq->resize_umem)) {
+		rc = PTR_ERR(cq->resize_umem);
+		cq->resize_umem = NULL;
+		ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n",
+			  __func__, rc);
+		goto fail;
+	}
+	cq->resize_cqe = entries;
+	memcpy(&sg_info, &cq->qplib_cq.sg_info, sizeof(sg_info));
+	orig_dpi = cq->qplib_cq.dpi;
+
+	cq->qplib_cq.sg_info.umem = cq->resize_umem;
+	cq->qplib_cq.sg_info.pgsize = PAGE_SIZE;
+	cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT;
+	cq->qplib_cq.dpi = &uctx->dpi;
+
+	rc = bnxt_qplib_resize_cq(&rdev->qplib_res, &cq->qplib_cq, entries);
+	if (rc) {
+		ibdev_err(&rdev->ibdev, "Resize HW CQ %#x failed!",
+			  cq->qplib_cq.id);
+		goto fail;
+	}
+
+	cq->ib_cq.cqe = cq->resize_cqe;
+
+	return 0;
+
+fail:
+	if (cq->resize_umem) {
+		ib_umem_release(cq->resize_umem);
+		cq->resize_umem = NULL;
+		cq->resize_cqe = 0;
+		memcpy(&cq->qplib_cq.sg_info, &sg_info, sizeof(sg_info));
+		cq->qplib_cq.dpi = orig_dpi;
+	}
+	return rc;
+}
+
 static u8 __req_to_ib_wc_status(u8 qstatus)
 {
 	switch (qstatus) {
@@ -3425,6 +3525,15 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 	struct bnxt_re_sqp_entries *sqp_entry = NULL;
 	unsigned long flags;
 
+	/* User CQ; the only processing we do is to
+	 * complete any pending CQ resize operation.
+	 */
+	if (cq->umem) {
+		if (cq->resize_umem)
+			bnxt_re_resize_cq_complete(cq);
+		return 0;
+	}
+
 	spin_lock_irqsave(&cq->cq_lock, flags);
 	budget = min_t(u32, num_entries, cq->max_cql);
 	num_entries = budget;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 94326267f9bb..31f7e34040f7 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -104,6 +104,8 @@ struct bnxt_re_cq {
 #define MAX_CQL_PER_POLL	1024
 	u32			max_cql;
 	struct ib_umem		*umem;
+	struct ib_umem		*resize_umem;
+	int			resize_cqe;
 };
 
 struct bnxt_re_mr {
@@ -191,6 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
 		      const struct ib_recv_wr **bad_recv_wr);
 int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata);
+int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index c5867e78f231..48bbba7df22e 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -553,6 +553,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
 	.query_srq = bnxt_re_query_srq,
 	.reg_user_mr = bnxt_re_reg_user_mr,
 	.req_notify_cq = bnxt_re_req_notify_cq,
+	.resize_cq = bnxt_re_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah),
 	INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
@@ -584,6 +585,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 		return ret;
 
 	dma_set_max_seg_size(&rdev->en_dev->pdev->dev, UINT_MAX);
+	ibdev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ);
 	return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev);
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 96e581ced50e..1d769a3106f6 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -2100,6 +2100,50 @@ exit:
 	return rc;
 }
 
+void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res,
+				   struct bnxt_qplib_cq *cq)
+{
+	bnxt_qplib_free_hwq(res, &cq->hwq);
+	memcpy(&cq->hwq, &cq->resize_hwq, sizeof(cq->hwq));
+}
+
+int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq,
+			 int new_cqes)
+{
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct creq_resize_cq_resp resp = {};
+	struct cmdq_resize_cq req = {};
+	struct bnxt_qplib_pbl *pbl;
+	u32 pg_sz, lvl, new_sz;
+	u16 cmd_flags = 0;
+	int rc;
+
+	RCFW_CMD_PREP(req, RESIZE_CQ, cmd_flags);
+	hwq_attr.sginfo = &cq->sg_info;
+	hwq_attr.res = res;
+	hwq_attr.depth = new_cqes;
+	hwq_attr.stride = sizeof(struct cq_base);
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&cq->resize_hwq, &hwq_attr);
+	if (rc)
+		return rc;
+
+	req.cq_cid = cpu_to_le32(cq->id);
+	pbl = &cq->resize_hwq.pbl[PBL_LVL_0];
+	pg_sz = bnxt_qplib_base_pg_size(&cq->resize_hwq);
+	lvl = (cq->resize_hwq.level << CMDQ_RESIZE_CQ_LVL_SFT) &
+				       CMDQ_RESIZE_CQ_LVL_MASK;
+	new_sz = (new_cqes << CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT) &
+		  CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK;
+	req.new_cq_size_pg_size_lvl = cpu_to_le32(new_sz | pg_sz | lvl);
+	req.new_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	return rc;
+}
+
 int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 037501952543..d74d5ead2e32 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -400,6 +400,7 @@ struct bnxt_qplib_cq {
 	u16				count;
 	u16				period;
 	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_hwq		resize_hwq;
 	u32				cnq_hw_ring_id;
 	struct bnxt_qplib_nq		*nq;
 	bool				resize_in_progress;
@@ -532,6 +533,10 @@ void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp);
 int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 			 struct bnxt_qplib_swqe *wqe);
 int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
+int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq,
+			 int new_cqes);
+void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res,
+				   struct bnxt_qplib_cq *cq);
 int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
 int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 		       int num, struct bnxt_qplib_qp **qp);
diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h
index b1de99bf56ce..c4e90775da0c 100644
--- a/include/uapi/rdma/bnxt_re-abi.h
+++ b/include/uapi/rdma/bnxt_re-abi.h
@@ -96,6 +96,10 @@ struct bnxt_re_cq_resp {
 	__u32 rsvd;
 };
 
+struct bnxt_re_resize_cq_req {
+	__aligned_u64 cq_va;
+};
+
 struct bnxt_re_qp_req {
 	__aligned_u64 qpsva;
 	__aligned_u64 qprva;
-- 
cgit v1.2.3


From 77f7eb9f3416aace703971156133926e44e2195b Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 23 Mar 2023 12:13:51 +0200
Subject: net/mlx5: Introduce other vport query for Q-counters

These new fields in QUERY_Q_COUNTER command allow us to access
another vport counters during the query command, which is specially
useful to query representor vports.

In addition also add the required caps to check if this capability
is actually supported.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://lore.kernel.org/r/75c73a4a0e60f18c37b35a4a11ca2e2415e4a6f3.1679566038.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 62039b147432..e4306cd87cd7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1729,7 +1729,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_transport_domain[0x5];
 	u8         reserved_at_328[0x3];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_330[0xb];
+	u8         reserved_at_330[0x9];
+	u8         q_counter_aggregation[0x1];
+	u8         q_counter_other_vport[0x1];
 	u8         log_max_xrcd[0x5];
 
 	u8         nic_receive_steering_discard[0x1];
@@ -5603,10 +5605,15 @@ struct mlx5_ifc_query_q_counter_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x80];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x60];
 
 	u8         clear[0x1];
-	u8         reserved_at_c1[0x1f];
+	u8         aggregate[0x1];
+	u8         reserved_at_c2[0x1e];
 
 	u8         reserved_at_e0[0x18];
 	u8         counter_set_id[0x8];
-- 
cgit v1.2.3


From 634f1a7110b439c65fd8a809171c1d2d28bcea6f Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobby.eshleman@bytedance.com>
Date: Mon, 27 Mar 2023 19:11:51 +0000
Subject: vsock: support sockmap

This patch adds sockmap support for vsock sockets. It is intended to be
usable by all transports, but only the virtio and loopback transports
are implemented.

SOCK_STREAM, SOCK_DGRAM, and SOCK_SEQPACKET are all supported.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |   1 +
 include/linux/virtio_vsock.h            |   1 +
 include/net/af_vsock.h                  |  17 ++++
 net/vmw_vsock/Makefile                  |   1 +
 net/vmw_vsock/af_vsock.c                |  64 ++++++++++--
 net/vmw_vsock/virtio_transport.c        |   2 +
 net/vmw_vsock/virtio_transport_common.c |  25 +++++
 net/vmw_vsock/vsock_bpf.c               | 174 ++++++++++++++++++++++++++++++++
 net/vmw_vsock/vsock_loopback.c          |   2 +
 9 files changed, 281 insertions(+), 6 deletions(-)
 create mode 100644 net/vmw_vsock/vsock_bpf.c

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c8e6087769a1..6578db78f0ae 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 3f9c16611306..c58453699ee9 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
 void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 568a87c5e0d0..0e7504a42925 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -75,6 +75,7 @@ struct vsock_sock {
 	void *trans;
 };
 
+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_space(struct vsock_sock *vsk);
 struct sock *vsock_create_connected(struct sock *parent);
@@ -173,6 +174,9 @@ struct vsock_transport {
 
 	/* Addressing. */
 	u32 (*get_local_cid)(void);
+
+	/* Read a single skb */
+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
 };
 
 /**** CORE ****/
@@ -225,5 +229,18 @@ int vsock_init_tap(void);
 int vsock_add_tap(struct vsock_tap *vt);
 int vsock_remove_tap(struct vsock_tap *vt);
 void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+			      int flags);
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern struct proto vsock_proto;
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+void __init vsock_bpf_build_proto(void);
+#else
+static inline void __init vsock_bpf_build_proto(void)
+{}
+#endif
 
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 6a943ec95c4a..5da74c4a9f1d 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
 obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
 
 vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
+vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
 
 vsock_diag-y += diag.o
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 19aea7cba26e..5f2dda35c980 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 /* Protocol family. */
-static struct proto vsock_proto = {
+struct proto vsock_proto = {
 	.name = "AF_VSOCK",
 	.owner = THIS_MODULE,
 	.obj_size = sizeof(struct vsock_sock),
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot = vsock_bpf_update_proto,
+#endif
 };
 
 /* The default peer timeout indicates how long we will wait for a peer response
@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_data);
 
-static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
+s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 {
 	struct sock *sk = sk_vsock(vsk);
 
@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 	else
 		return vsock_stream_has_data(vsk);
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
 
 s64 vsock_stream_has_space(struct vsock_sock *vsk)
 {
@@ -1131,6 +1135,13 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 	return mask;
 }
 
+static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+
+	return vsk->transport->read_skb(vsk, read_actor);
+}
+
 static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len)
 {
@@ -1242,18 +1253,42 @@ static int vsock_dgram_connect(struct socket *sock,
 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
 	sock->state = SS_CONNECTED;
 
+	/* sock map disallows redirection of non-TCP sockets with sk_state !=
+	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
+	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
+	 *
+	 * This doesn't seem to be abnormal state for datagram sockets, as the
+	 * same approach can be see in other datagram socket types as well
+	 * (such as unix sockets).
+	 */
+	sk->sk_state = TCP_ESTABLISHED;
+
 out:
 	release_sock(sk);
 	return err;
 }
 
-static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			       size_t len, int flags)
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags)
 {
-	struct vsock_sock *vsk = vsock_sk(sock->sk);
+#ifdef CONFIG_BPF_SYSCALL
+	const struct proto *prot;
+#endif
+	struct vsock_sock *vsk;
+	struct sock *sk;
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto)
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+#endif
 
 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
 }
+EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
 
 static const struct proto_ops vsock_dgram_ops = {
 	.family = PF_VSOCK,
@@ -1272,6 +1307,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.recvmsg = vsock_dgram_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
@@ -2086,13 +2122,16 @@ out:
 	return err;
 }
 
-static int
+int
 vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			  int flags)
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
 	const struct vsock_transport *transport;
+#ifdef CONFIG_BPF_SYSCALL
+	const struct proto *prot;
+#endif
 	int err;
 
 	sk = sock->sk;
@@ -2139,6 +2178,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		goto out;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto) {
+		release_sock(sk);
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+	}
+#endif
+
 	if (sk->sk_type == SOCK_STREAM)
 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
 	else
@@ -2148,6 +2195,7 @@ out:
 	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
 
 static int vsock_set_rcvlowat(struct sock *sk, int val)
 {
@@ -2188,6 +2236,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
 	.set_rcvlowat = vsock_set_rcvlowat,
+	.read_skb = vsock_read_skb,
 };
 
 static const struct proto_ops vsock_seqpacket_ops = {
@@ -2209,6 +2258,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
 	.recvmsg = vsock_connectible_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_create(struct net *net, struct socket *sock,
@@ -2348,6 +2398,8 @@ static int __init vsock_init(void)
 		goto err_unregister_proto;
 	}
 
+	vsock_bpf_build_proto();
+
 	return 0;
 
 err_unregister_proto:
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 28b5a8e8e094..e95df847176b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7fc178c3ee07..f39639dd6eb5 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1418,6 +1418,31 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
 
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct sock *sk = sk_vsock(vsk);
+	struct sk_buff *skb;
+	int off = 0;
+	int copied;
+	int err;
+
+	spin_lock_bh(&vvs->rx_lock);
+	/* Use __skb_recv_datagram() for race-free handling of the receive. It
+	 * works for types other than dgrams.
+	 */
+	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
+	spin_unlock_bh(&vvs->rx_lock);
+
+	if (!skb)
+		return err;
+
+	copied = recv_actor(sk, skb);
+	kfree_skb(skb);
+	return copied;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
new file mode 100644
index 000000000000..a3c97546ab84
--- /dev/null
+++ b/net/vmw_vsock/vsock_bpf.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
+ *
+ * Based off of net/unix/unix_bpf.c
+ */
+
+#include <linux/bpf.h>
+#include <linux/module.h>
+#include <linux/skmsg.h>
+#include <linux/socket.h>
+#include <linux/wait.h>
+#include <net/af_vsock.h>
+#include <net/sock.h>
+
+#define vsock_sk_has_data(__sk, __psock)				\
+		({	!skb_queue_empty(&(__sk)->sk_receive_queue) ||	\
+			!skb_queue_empty(&(__psock)->ingress_skb) ||	\
+			!list_empty(&(__psock)->ingress_msg);		\
+		})
+
+static struct proto *vsock_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(vsock_prot_lock);
+static struct proto vsock_bpf_prot;
+
+static bool vsock_has_data(struct sock *sk, struct sk_psock *psock)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	s64 ret;
+
+	ret = vsock_connectible_has_data(vsk);
+	if (ret > 0)
+		return true;
+
+	return vsock_sk_has_data(sk, psock);
+}
+
+static bool vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
+{
+	bool ret;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return true;
+
+	if (!timeo)
+		return false;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = vsock_has_data(sk, psock);
+	if (!ret) {
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		ret = vsock_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+	int err;
+
+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
+		err = vsock_connectible_recvmsg(sock, msg, len, flags);
+	else if (sk->sk_type == SOCK_DGRAM)
+		err = vsock_dgram_recvmsg(sock, msg, len, flags);
+	else
+		err = -EPROTOTYPE;
+
+	return err;
+}
+
+static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+			     size_t len, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return __vsock_recvmsg(sk, msg, len, flags);
+
+	lock_sock(sk);
+	if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) {
+		release_sock(sk);
+		sk_psock_put(sk, psock);
+		return __vsock_recvmsg(sk, msg, len, flags);
+	}
+
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	while (copied == 0) {
+		long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+		if (!vsock_msg_wait_data(sk, psock, timeo)) {
+			copied = -EAGAIN;
+			break;
+		}
+
+		if (sk_psock_queue_empty(psock)) {
+			release_sock(sk);
+			sk_psock_put(sk, psock);
+			return __vsock_recvmsg(sk, msg, len, flags);
+		}
+
+		copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	}
+
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+
+	return copied;
+}
+
+/* Copy of original proto with updated sock_map methods */
+static struct proto vsock_bpf_prot = {
+	.close = sock_map_close,
+	.recvmsg = vsock_bpf_recvmsg,
+	.sock_is_readable = sk_msg_is_readable,
+	.unhash = sock_map_unhash,
+};
+
+static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = vsock_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void vsock_bpf_check_needs_rebuild(struct proto *ops)
+{
+	/* Paired with the smp_store_release() below. */
+	if (unlikely(ops != smp_load_acquire(&vsock_prot_saved))) {
+		spin_lock_bh(&vsock_prot_lock);
+		if (likely(ops != vsock_prot_saved)) {
+			vsock_bpf_rebuild_protos(&vsock_bpf_prot, ops);
+			/* Make sure proto function pointers are updated before publishing the
+			 * pointer to the struct.
+			 */
+			smp_store_release(&vsock_prot_saved, ops);
+		}
+		spin_unlock_bh(&vsock_prot_lock);
+	}
+}
+
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	struct vsock_sock *vsk;
+
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		sock_replace_proto(sk, psock->sk_proto);
+		return 0;
+	}
+
+	vsk = vsock_sk(sk);
+	if (!vsk->transport)
+		return -ENODEV;
+
+	if (!vsk->transport->read_skb)
+		return -EOPNOTSUPP;
+
+	vsock_bpf_check_needs_rebuild(psock->sk_proto);
+	sock_replace_proto(sk, &vsock_bpf_prot);
+	return 0;
+}
+
+void __init vsock_bpf_build_proto(void)
+{
+	vsock_bpf_rebuild_protos(&vsock_bpf_prot, &vsock_proto);
+}
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 671e03240fc5..40753b661c13 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -94,6 +94,8 @@ static struct virtio_transport loopback_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vsock_loopback_send_pkt,
-- 
cgit v1.2.3


From 8cdc3223e78c43e1b60ea1c536a103e32fdca3c5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 27 Mar 2023 16:54:54 -0700
Subject: ipv6: Remove in6addr_any alternatives.

Some code defines the IPv6 wildcard address as a local variable and
use it with memcmp() or ipv6_addr_equal().

Let's use in6addr_any and ipv6_addr_any() instead.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c |  5 ++---
 include/net/ip6_fib.h                                     |  9 +++------
 include/trace/events/fib.h                                |  5 ++---
 include/trace/events/fib6.h                               |  5 +----
 net/ethtool/ioctl.c                                       | 10 +++++-----
 net/ipv4/inet_hashtables.c                                | 11 ++++-------
 6 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index a108e73c9f66..20c2d2ecaf93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -98,7 +98,6 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
 	else if (ip_version == 6) {
 		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
-		struct in6_addr zerov6 = {};
 
 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
@@ -106,8 +105,8 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
 				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
 		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
 		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
-		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
-		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
+		if (ipv6_addr_any(&tun_attr->dst_ip.v6) ||
+		    ipv6_addr_any(&tun_attr->src_ip.v6))
 			return 0;
 	}
 #endif
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 79570cb4ea9c..05e6f756feaf 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -469,13 +469,10 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 	rcu_read_lock();
 
 	from = rcu_dereference(rt->from);
-	if (from) {
+	if (from)
 		*addr = from->fib6_prefsrc.addr;
-	} else {
-		struct in6_addr in6_zero = {};
-
-		*addr = in6_zero;
-	}
+	else
+		*addr = in6addr_any;
 
 	rcu_read_unlock();
 }
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index c2300c407f58..76297ecd4935 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -36,7 +36,6 @@ TRACE_EVENT(fib_table_lookup,
 	),
 
 	TP_fast_assign(
-		struct in6_addr in6_zero = {};
 		struct net_device *dev;
 		struct in6_addr *in6;
 		__be32 *p32;
@@ -74,7 +73,7 @@ TRACE_EVENT(fib_table_lookup,
 				*p32 = nhc->nhc_gw.ipv4;
 
 				in6 = (struct in6_addr *)__entry->gw6;
-				*in6 = in6_zero;
+				*in6 = in6addr_any;
 			} else if (nhc->nhc_gw_family == AF_INET6) {
 				p32 = (__be32 *) __entry->gw4;
 				*p32 = 0;
@@ -87,7 +86,7 @@ TRACE_EVENT(fib_table_lookup,
 			*p32 = 0;
 
 			in6 = (struct in6_addr *)__entry->gw6;
-			*in6 = in6_zero;
+			*in6 = in6addr_any;
 		}
 	),
 
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 6e821eb79450..4d3e607b3cde 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -68,11 +68,8 @@ TRACE_EVENT(fib6_table_lookup,
 			strcpy(__entry->name, "-");
 		}
 		if (res->f6i == net->ipv6.fib6_null_entry) {
-			struct in6_addr in6_zero = {};
-
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = in6_zero;
-
+			*in6 = in6addr_any;
 		} else if (res->nh) {
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = res->nh->fib_nh_gw6;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 646b3e490c71..59adc4e6e9ee 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -27,6 +27,7 @@
 #include <linux/net.h>
 #include <linux/pm_runtime.h>
 #include <net/devlink.h>
+#include <net/ipv6.h>
 #include <net/xdp_sock_drv.h>
 #include <net/flow_offload.h>
 #include <linux/ethtool_netlink.h>
@@ -3127,7 +3128,6 @@ struct ethtool_rx_flow_rule *
 ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 {
 	const struct ethtool_rx_flow_spec *fs = input->fs;
-	static struct in6_addr zero_addr = {};
 	struct ethtool_rx_flow_match *match;
 	struct ethtool_rx_flow_rule *flow;
 	struct flow_action_entry *act;
@@ -3233,20 +3233,20 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 
 		v6_spec = &fs->h_u.tcp_ip6_spec;
 		v6_m_spec = &fs->m_u.tcp_ip6_spec;
-		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) {
 			memcpy(&match->key.ipv6.src, v6_spec->ip6src,
 			       sizeof(match->key.ipv6.src));
 			memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src,
 			       sizeof(match->mask.ipv6.src));
 		}
-		if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
 			memcpy(&match->key.ipv6.dst, v6_spec->ip6dst,
 			       sizeof(match->key.ipv6.dst));
 			memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst,
 			       sizeof(match->mask.ipv6.dst));
 		}
-		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) ||
-		    memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) ||
+		    !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
 			match->dissector.used_keys |=
 				BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS);
 			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] =
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6edae3886885..e7391bf310a7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -826,13 +826,11 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const
 				      unsigned short port, int l3mdev, const struct sock *sk)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr addr_any = {};
-
 	if (sk->sk_family != tb->family) {
 		if (sk->sk_family == AF_INET)
 			return net_eq(ib2_net(tb), net) && tb->port == port &&
 				tb->l3mdev == l3mdev &&
-				ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any);
+				ipv6_addr_any(&tb->v6_rcv_saddr);
 
 		return false;
 	}
@@ -840,7 +838,7 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const
 	if (sk->sk_family == AF_INET6)
 		return net_eq(ib2_net(tb), net) && tb->port == port &&
 			tb->l3mdev == l3mdev &&
-			ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any);
+			ipv6_addr_any(&tb->v6_rcv_saddr);
 	else
 #endif
 		return net_eq(ib2_net(tb), net) && tb->port == port &&
@@ -866,11 +864,10 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
 {
 	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
 	u32 hash;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr addr_any = {};
 
+#if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
-		hash = ipv6_portaddr_hash(net, &addr_any, port);
+		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
 	else
 #endif
 		hash = ipv4_portaddr_hash(net, 0, port);
-- 
cgit v1.2.3


From 954d1fa1ac93aa8a66f7d9a9ba545cf7f020d348 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Mar 2023 10:57:59 +0800
Subject: macvlan: Add netlink attribute for broadcast cutoff

Make the broadcast cutoff configurable through netlink.  Note
that macvlan is weird because there is no central device for
us to configure (the lowerdev could be anything).  So all the
options are duplicated over what could be thousands of child
devices.

IFLA_MACVLAN_BC_QUEUE_LEN took the approach of taking the maximum
of all child device settings.  This is unnecessary as we could
simply store the option in the port device and take the last
child device that gets updated as the value to use.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c              | 31 +++++++++++++++++++++++++++++--
 include/uapi/linux/if_link.h       |  1 +
 tools/include/uapi/linux/if_link.h |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 62b4748d3836..4215106adc40 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -47,6 +47,7 @@ struct macvlan_port {
 	struct sk_buff_head	bc_queue;
 	struct work_struct	bc_work;
 	u32			bc_queue_len_used;
+	int			bc_cutoff;
 	u32			flags;
 	int			count;
 	struct hlist_head	vlan_source_hash[MACVLAN_HASH_SIZE];
@@ -814,6 +815,12 @@ static void macvlan_compute_filter(unsigned long *mc_filter,
 	}
 }
 
+static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan)
+{
+	macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL,
+			       vlan->port->bc_cutoff);
+}
+
 static void macvlan_set_mac_lists(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
@@ -838,8 +845,16 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 	 */
 	macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL,
 			       0);
-	macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL,
-			       1);
+	macvlan_recompute_bc_filter(vlan);
+}
+
+static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff)
+{
+	if (vlan->port->bc_cutoff == cutoff)
+		return;
+
+	vlan->port->bc_cutoff = cutoff;
+	macvlan_recompute_bc_filter(vlan);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
@@ -1254,6 +1269,7 @@ static int macvlan_port_create(struct net_device *dev)
 		INIT_HLIST_HEAD(&port->vlan_source_hash[i]);
 
 	port->bc_queue_len_used = 0;
+	port->bc_cutoff = 1;
 	skb_queue_head_init(&port->bc_queue);
 	INIT_WORK(&port->bc_work, macvlan_process_broadcast);
 
@@ -1527,6 +1543,10 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN])
 		vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);
 
+	if (data && data[IFLA_MACVLAN_BC_CUTOFF])
+		update_port_bc_cutoff(
+			vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));
+
 	err = register_netdevice(dev);
 	if (err < 0)
 		goto destroy_macvlan_port;
@@ -1623,6 +1643,10 @@ static int macvlan_changelink(struct net_device *dev,
 		update_port_bc_queue_len(vlan->port);
 	}
 
+	if (data && data[IFLA_MACVLAN_BC_CUTOFF])
+		update_port_bc_cutoff(
+			vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));
+
 	if (set_mode)
 		vlan->mode = mode;
 	if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
@@ -1703,6 +1727,9 @@ static int macvlan_fill_info(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used))
 		goto nla_put_failure;
+	if (port->bc_cutoff != 1 &&
+	    nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 57ceb788250f..8d679688efe0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -635,6 +635,7 @@ enum {
 	IFLA_MACVLAN_MACADDR_COUNT,
 	IFLA_MACVLAN_BC_QUEUE_LEN,
 	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
+	IFLA_MACVLAN_BC_CUTOFF,
 	__IFLA_MACVLAN_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 901d98b865a1..39e659c83cfd 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -605,6 +605,7 @@ enum {
 	IFLA_MACVLAN_MACADDR_COUNT,
 	IFLA_MACVLAN_BC_QUEUE_LEN,
 	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
+	IFLA_MACVLAN_BC_CUTOFF,
 	__IFLA_MACVLAN_MAX,
 };
 
-- 
cgit v1.2.3


From b93c2a68f3d9dc98ec30dcb342ae47c1c8d09d18 Mon Sep 17 00:00:00 2001
From: Elson Roy Serrao <quic_eserrao@quicinc.com>
Date: Fri, 24 Mar 2023 14:47:57 -0700
Subject: usb: gadget: Properly configure the device for remote wakeup

The wakeup bit in the bmAttributes field indicates whether the device
is configured for remote wakeup. But this field should be allowed to
set only if the UDC supports such wakeup mechanism. So configure this
field based on UDC capability. Also inform the UDC whether the device
is configured for remote wakeup by implementing a gadget op.

Reviewed-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Signed-off-by: Elson Roy Serrao <quic_eserrao@quicinc.com>
Link: https://lore.kernel.org/r/1679694482-16430-2-git-send-email-quic_eserrao@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/composite.c | 18 ++++++++++++++++++
 drivers/usb/gadget/configfs.c  |  3 +++
 drivers/usb/gadget/udc/core.c  | 27 +++++++++++++++++++++++++++
 drivers/usb/gadget/udc/trace.h |  5 +++++
 include/linux/usb/composite.h  |  2 ++
 include/linux/usb/gadget.h     |  8 ++++++++
 6 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index f1ecd12b62c3..00c89571de2a 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -513,6 +513,19 @@ static u8 encode_bMaxPower(enum usb_device_speed speed,
 		return min(val, 900U) / 8;
 }
 
+void check_remote_wakeup_config(struct usb_gadget *g,
+				struct usb_configuration *c)
+{
+	if (USB_CONFIG_ATT_WAKEUP & c->bmAttributes) {
+		/* Reset the rw bit if gadget is not capable of it */
+		if (!g->wakeup_capable && g->ops->set_remote_wakeup) {
+			WARN(c->cdev, "Clearing wakeup bit for config c.%d\n",
+			     c->bConfigurationValue);
+			c->bmAttributes &= ~USB_CONFIG_ATT_WAKEUP;
+		}
+	}
+}
+
 static int config_buf(struct usb_configuration *config,
 		enum usb_device_speed speed, void *buf, u8 type)
 {
@@ -994,6 +1007,11 @@ static int set_config(struct usb_composite_dev *cdev,
 		power = min(power, 500U);
 	else
 		power = min(power, 900U);
+
+	if (USB_CONFIG_ATT_WAKEUP & c->bmAttributes)
+		usb_gadget_set_remote_wakeup(gadget, 1);
+	else
+		usb_gadget_set_remote_wakeup(gadget, 0);
 done:
 	if (power <= USB_SELF_POWER_VBUS_MAX_DRAW)
 		usb_gadget_set_selfpowered(gadget);
diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c
index b9f1136aa0a2..4c639e9ddedc 100644
--- a/drivers/usb/gadget/configfs.c
+++ b/drivers/usb/gadget/configfs.c
@@ -1761,6 +1761,9 @@ static int configfs_composite_bind(struct usb_gadget *gadget,
 		if (gadget_is_otg(gadget))
 			c->descriptors = otg_desc;
 
+		/* Properly configure the bmAttributes wakeup bit */
+		check_remote_wakeup_config(gadget, c);
+
 		cfg = container_of(c, struct config_usb_cfg, c);
 		if (!list_empty(&cfg->string_list)) {
 			i = 0;
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 23b0629a8774..3dcbba739db6 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -513,6 +513,33 @@ out:
 }
 EXPORT_SYMBOL_GPL(usb_gadget_wakeup);
 
+/**
+ * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature.
+ * @gadget:the device being configured for remote wakeup
+ * @set:value to be configured.
+ *
+ * set to one to enable remote wakeup feature and zero to disable it.
+ *
+ * returns zero on success, else negative errno.
+ */
+int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set)
+{
+	int ret = 0;
+
+	if (!gadget->ops->set_remote_wakeup) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	ret = gadget->ops->set_remote_wakeup(gadget, set);
+
+out:
+	trace_usb_gadget_set_remote_wakeup(gadget, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup);
+
 /**
  * usb_gadget_set_selfpowered - sets the device selfpowered feature.
  * @gadget:the device being declared as self-powered
diff --git a/drivers/usb/gadget/udc/trace.h b/drivers/usb/gadget/udc/trace.h
index abdbcb1bacb0..a5ed26fbc2da 100644
--- a/drivers/usb/gadget/udc/trace.h
+++ b/drivers/usb/gadget/udc/trace.h
@@ -91,6 +91,11 @@ DEFINE_EVENT(udc_log_gadget, usb_gadget_wakeup,
 	TP_ARGS(g, ret)
 );
 
+DEFINE_EVENT(udc_log_gadget, usb_gadget_set_remote_wakeup,
+	TP_PROTO(struct usb_gadget *g, int ret),
+	TP_ARGS(g, ret)
+);
+
 DEFINE_EVENT(udc_log_gadget, usb_gadget_set_selfpowered,
 	TP_PROTO(struct usb_gadget *g, int ret),
 	TP_ARGS(g, ret)
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 608dc962748b..d949e91ceb48 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -413,6 +413,8 @@ extern int composite_dev_prepare(struct usb_composite_driver *composite,
 extern int composite_os_desc_req_prepare(struct usb_composite_dev *cdev,
 					 struct usb_ep *ep0);
 void composite_dev_cleanup(struct usb_composite_dev *cdev);
+void check_remote_wakeup_config(struct usb_gadget *g,
+				struct usb_configuration *c);
 
 static inline struct usb_composite_driver *to_cdriver(
 		struct usb_gadget_driver *gdrv)
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 00750f7020f3..1d796128030b 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -310,6 +310,7 @@ struct usb_udc;
 struct usb_gadget_ops {
 	int	(*get_frame)(struct usb_gadget *);
 	int	(*wakeup)(struct usb_gadget *);
+	int	(*set_remote_wakeup)(struct usb_gadget *, int set);
 	int	(*set_selfpowered) (struct usb_gadget *, int is_selfpowered);
 	int	(*vbus_session) (struct usb_gadget *, int is_active);
 	int	(*vbus_draw) (struct usb_gadget *, unsigned mA);
@@ -384,6 +385,8 @@ struct usb_gadget_ops {
  * @connected: True if gadget is connected.
  * @lpm_capable: If the gadget max_speed is FULL or HIGH, this flag
  *	indicates that it supports LPM as per the LPM ECN & errata.
+ * @wakeup_capable: True if gadget is capable of sending remote wakeup.
+ * @wakeup_armed: True if gadget is armed by the host for remote wakeup.
  * @irq: the interrupt number for device controller.
  * @id_number: a unique ID number for ensuring that gadget names are distinct
  *
@@ -445,6 +448,8 @@ struct usb_gadget {
 	unsigned			deactivated:1;
 	unsigned			connected:1;
 	unsigned			lpm_capable:1;
+	unsigned			wakeup_capable:1;
+	unsigned			wakeup_armed:1;
 	int				irq;
 	int				id_number;
 };
@@ -601,6 +606,7 @@ static inline int gadget_is_otg(struct usb_gadget *g)
 #if IS_ENABLED(CONFIG_USB_GADGET)
 int usb_gadget_frame_number(struct usb_gadget *gadget);
 int usb_gadget_wakeup(struct usb_gadget *gadget);
+int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set);
 int usb_gadget_set_selfpowered(struct usb_gadget *gadget);
 int usb_gadget_clear_selfpowered(struct usb_gadget *gadget);
 int usb_gadget_vbus_connect(struct usb_gadget *gadget);
@@ -616,6 +622,8 @@ static inline int usb_gadget_frame_number(struct usb_gadget *gadget)
 { return 0; }
 static inline int usb_gadget_wakeup(struct usb_gadget *gadget)
 { return 0; }
+static inline int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set)
+{ return 0; }
 static inline int usb_gadget_set_selfpowered(struct usb_gadget *gadget)
 { return 0; }
 static inline int usb_gadget_clear_selfpowered(struct usb_gadget *gadget)
-- 
cgit v1.2.3


From f0db885fb05d35befa81896db6b19eb3ee9ccdfe Mon Sep 17 00:00:00 2001
From: Elson Roy Serrao <quic_eserrao@quicinc.com>
Date: Fri, 24 Mar 2023 14:47:59 -0700
Subject: usb: gadget: Add function wakeup support

USB3.2 spec section 9.2.5.4 quotes that a function may signal that
it wants to exit from Function Suspend by sending a Function
Wake Notification to the host if it is enabled for function
remote wakeup. Add an api in composite layer that can be used
by the function drivers to support this feature. Also expose
a gadget op so that composite layer can trigger a wakeup request
to the UDC driver.

Reviewed-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Signed-off-by: Elson Roy Serrao <quic_eserrao@quicinc.com>
Link: https://lore.kernel.org/r/1679694482-16430-4-git-send-email-quic_eserrao@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/composite.c | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/composite.h  |  6 ++++++
 include/linux/usb/gadget.h     |  1 +
 3 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 00c89571de2a..d3fd2d1d8096 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -492,6 +492,46 @@ int usb_interface_id(struct usb_configuration *config,
 }
 EXPORT_SYMBOL_GPL(usb_interface_id);
 
+/**
+ * usb_func_wakeup - sends function wake notification to the host.
+ * @func: function that sends the remote wakeup notification.
+ *
+ * Applicable to devices operating at enhanced superspeed when usb
+ * functions are put in function suspend state and armed for function
+ * remote wakeup. On completion, function wake notification is sent. If
+ * the device is in low power state it tries to bring the device to active
+ * state before sending the wake notification. Since it is a synchronous
+ * call, caller must take care of not calling it in interrupt context.
+ * For devices operating at lower speeds  returns negative errno.
+ *
+ * Returns zero on success, else negative errno.
+ */
+int usb_func_wakeup(struct usb_function *func)
+{
+	struct usb_gadget	*gadget = func->config->cdev->gadget;
+	int			id;
+
+	if (!gadget->ops->func_wakeup)
+		return -EOPNOTSUPP;
+
+	if (!func->func_wakeup_armed) {
+		ERROR(func->config->cdev, "not armed for func remote wakeup\n");
+		return -EINVAL;
+	}
+
+	for (id = 0; id < MAX_CONFIG_INTERFACES; id++)
+		if (func->config->interface[id] == func)
+			break;
+
+	if (id == MAX_CONFIG_INTERFACES) {
+		ERROR(func->config->cdev, "Invalid function\n");
+		return -EINVAL;
+	}
+
+	return gadget->ops->func_wakeup(gadget, id);
+}
+EXPORT_SYMBOL_GPL(usb_func_wakeup);
+
 static u8 encode_bMaxPower(enum usb_device_speed speed,
 		struct usb_configuration *c)
 {
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index d949e91ceb48..a2448e98854f 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -150,6 +150,9 @@ struct usb_os_desc_table {
  *	GetStatus() request when the recipient is Interface.
  * @func_suspend: callback to be called when
  *	SetFeature(FUNCTION_SUSPEND) is reseived
+ * @func_suspended: Indicates whether the function is in function suspend state.
+ * @func_wakeup_armed: Indicates whether the function is armed by the host for
+ *	wakeup signaling.
  *
  * A single USB function uses one or more interfaces, and should in most
  * cases support operation at both full and high speeds.  Each function is
@@ -220,6 +223,8 @@ struct usb_function {
 	int			(*get_status)(struct usb_function *);
 	int			(*func_suspend)(struct usb_function *,
 						u8 suspend_opt);
+	bool			func_suspended;
+	bool			func_wakeup_armed;
 	/* private: */
 	/* internals */
 	struct list_head		list;
@@ -241,6 +246,7 @@ int config_ep_by_speed_and_alt(struct usb_gadget *g, struct usb_function *f,
 
 int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f,
 			struct usb_ep *_ep);
+int usb_func_wakeup(struct usb_function *func);
 
 #define	MAX_CONFIG_INTERFACES		16	/* arbitrary; max 255 */
 
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 1d796128030b..75bda0783395 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -310,6 +310,7 @@ struct usb_udc;
 struct usb_gadget_ops {
 	int	(*get_frame)(struct usb_gadget *);
 	int	(*wakeup)(struct usb_gadget *);
+	int	(*func_wakeup)(struct usb_gadget *gadget, int intf_id);
 	int	(*set_remote_wakeup)(struct usb_gadget *, int set);
 	int	(*set_selfpowered) (struct usb_gadget *, int is_selfpowered);
 	int	(*vbus_session) (struct usb_gadget *, int is_active);
-- 
cgit v1.2.3


From c4ba69f00c18524eb89990e9afda9d2a9b54de2f Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 28 Feb 2023 10:59:54 +0100
Subject: mm, page_flags: remove PG_slob_free

With SLOB removed we no longer need the PG_slob_free alias for
PG_private. Also update tools/mm/page-types.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Acked-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
---
 include/linux/page-flags.h | 4 ----
 tools/mm/page-types.c      | 6 +-----
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a7e3a3405520..2bdc41cb0594 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -174,9 +174,6 @@ enum pageflags {
 	/* Remapped by swiotlb-xen. */
 	PG_xen_remapped = PG_owner_priv_1,
 
-	/* SLOB */
-	PG_slob_free = PG_private,
-
 #ifdef CONFIG_MEMORY_FAILURE
 	/*
 	 * Compound pages. Stored in first tail page's flags.
@@ -483,7 +480,6 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
 PAGEFLAG(Workingset, workingset, PF_HEAD)
 	TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
 __PAGEFLAG(Slab, slab, PF_NO_TAIL)
-__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
 PAGEFLAG(Checked, checked, PF_NO_COMPOUND)	   /* Used by some filesystems */
 
 /* Xen */
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index 381dcc00cb62..8d5595b6c59f 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -85,7 +85,6 @@
  */
 #define KPF_ANON_EXCLUSIVE	47
 #define KPF_READAHEAD		48
-#define KPF_SLOB_FREE		49
 #define KPF_SLUB_FROZEN		50
 #define KPF_SLUB_DEBUG		51
 #define KPF_FILE		61
@@ -141,7 +140,6 @@ static const char * const page_flag_names[] = {
 
 	[KPF_ANON_EXCLUSIVE]	= "d:anon_exclusive",
 	[KPF_READAHEAD]		= "I:readahead",
-	[KPF_SLOB_FREE]		= "P:slob_free",
 	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
 	[KPF_SLUB_DEBUG]	= "E:slub_debug",
 
@@ -478,10 +476,8 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
 	if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
 		flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
 
-	/* SLOB/SLUB overload several page flags */
+	/* SLUB overloads several page flags */
 	if (flags & BIT(SLAB)) {
-		if (flags & BIT(PRIVATE))
-			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
 		if (flags & BIT(ACTIVE))
 			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
 		if (flags & BIT(ERROR))
-- 
cgit v1.2.3


From de4d6089b9271ed172e92148a04f31578b375525 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 28 Feb 2023 15:34:17 +0100
Subject: mm/slab: remove CONFIG_SLOB code from slab common code

CONFIG_SLOB has been removed from Kconfig. Remove code and #ifdef's
specific to SLOB in the slab headers and common code.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Acked-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
---
 include/linux/slab.h | 39 ---------------------------------
 mm/slab.h            | 61 ----------------------------------------------------
 mm/slab_common.c     |  2 --
 3 files changed, 102 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 45af70315a94..7f645a4c1298 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -298,19 +298,6 @@ static inline unsigned int arch_slab_minalign(void)
 #endif
 #endif
 
-#ifdef CONFIG_SLOB
-/*
- * SLOB passes all requests larger than one page to the page allocator.
- * No kmalloc array is necessary since objects of different sizes can
- * be allocated from the same page.
- */
-#define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
-#ifndef KMALLOC_SHIFT_LOW
-#define KMALLOC_SHIFT_LOW	3
-#endif
-#endif
-
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
 /* Maximum size for which we actually use a slab cache */
@@ -366,7 +353,6 @@ enum kmalloc_cache_type {
 	NR_KMALLOC_TYPES
 };
 
-#ifndef CONFIG_SLOB
 extern struct kmem_cache *
 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
 
@@ -458,7 +444,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 }
 static_assert(PAGE_SHIFT <= 20);
 #define kmalloc_index(s) __kmalloc_index(s, true)
-#endif /* !CONFIG_SLOB */
 
 void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
 
@@ -487,10 +472,6 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
 
-/*
- * Caller must not use kfree_bulk() on memory not originally allocated
- * by kmalloc(), because the SLOB allocator cannot handle this.
- */
 static __always_inline void kfree_bulk(size_t size, void **p)
 {
 	kmem_cache_free_bulk(NULL, size, p);
@@ -567,7 +548,6 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
  *	Try really hard to succeed the allocation but fail
  *	eventually.
  */
-#ifndef CONFIG_SLOB
 static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size) && size) {
@@ -583,17 +563,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
 	}
 	return __kmalloc(size, flags);
 }
-#else
-static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
-{
-	if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE)
-		return kmalloc_large(size, flags);
-
-	return __kmalloc(size, flags);
-}
-#endif
 
-#ifndef CONFIG_SLOB
 static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	if (__builtin_constant_p(size) && size) {
@@ -609,15 +579,6 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla
 	}
 	return __kmalloc_node(size, flags, node);
 }
-#else
-static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE)
-		return kmalloc_large_node(size, flags, node);
-
-	return __kmalloc_node(size, flags, node);
-}
-#endif
 
 /**
  * kmalloc_array - allocate memory for an array.
diff --git a/mm/slab.h b/mm/slab.h
index 43966aa5fadf..399966b3ce52 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -51,14 +51,6 @@ struct slab {
 	};
 	unsigned int __unused;
 
-#elif defined(CONFIG_SLOB)
-
-	struct list_head slab_list;
-	void *__unused_1;
-	void *freelist;		/* first free block */
-	long units;
-	unsigned int __unused_2;
-
 #else
 #error "Unexpected slab allocator configured"
 #endif
@@ -72,11 +64,7 @@ struct slab {
 #define SLAB_MATCH(pg, sl)						\
 	static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
 SLAB_MATCH(flags, __page_flags);
-#ifndef CONFIG_SLOB
 SLAB_MATCH(compound_head, slab_cache);	/* Ensure bit 0 is clear */
-#else
-SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
-#endif
 SLAB_MATCH(_refcount, __page_refcount);
 #ifdef CONFIG_MEMCG
 SLAB_MATCH(memcg_data, memcg_data);
@@ -200,31 +188,6 @@ static inline size_t slab_size(const struct slab *slab)
 	return PAGE_SIZE << slab_order(slab);
 }
 
-#ifdef CONFIG_SLOB
-/*
- * Common fields provided in kmem_cache by all slab allocators
- * This struct is either used directly by the allocator (SLOB)
- * or the allocator must include definitions for all fields
- * provided in kmem_cache_common in their definition of kmem_cache.
- *
- * Once we can do anonymous structs (C11 standard) we could put a
- * anonymous struct definition in these allocators so that the
- * separate allocations in the kmem_cache structure of SLAB and
- * SLUB is no longer needed.
- */
-struct kmem_cache {
-	unsigned int object_size;/* The original size of the object */
-	unsigned int size;	/* The aligned/padded/added on size  */
-	unsigned int align;	/* Alignment as calculated */
-	slab_flags_t flags;	/* Active flags on the slab */
-	const char *name;	/* Slab name for sysfs */
-	int refcount;		/* Use counter */
-	void (*ctor)(void *);	/* Called on object slot creation */
-	struct list_head list;	/* List of all slab caches on the system */
-};
-
-#endif /* CONFIG_SLOB */
-
 #ifdef CONFIG_SLAB
 #include <linux/slab_def.h>
 #endif
@@ -274,7 +237,6 @@ extern const struct kmalloc_info_struct {
 	unsigned int size;
 } kmalloc_info[];
 
-#ifndef CONFIG_SLOB
 /* Kmalloc array related functions */
 void setup_kmalloc_cache_index_table(void);
 void create_kmalloc_caches(slab_flags_t);
@@ -286,7 +248,6 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
 			      int node, size_t orig_size,
 			      unsigned long caller);
 void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
-#endif
 
 gfp_t kmalloc_fix_flags(gfp_t flags);
 
@@ -303,33 +264,16 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
 int slab_unmergeable(struct kmem_cache *s);
 struct kmem_cache *find_mergeable(unsigned size, unsigned align,
 		slab_flags_t flags, const char *name, void (*ctor)(void *));
-#ifndef CONFIG_SLOB
 struct kmem_cache *
 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
 		   slab_flags_t flags, void (*ctor)(void *));
 
 slab_flags_t kmem_cache_flags(unsigned int object_size,
 	slab_flags_t flags, const char *name);
-#else
-static inline struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
-		   slab_flags_t flags, void (*ctor)(void *))
-{ return NULL; }
-
-static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
-	slab_flags_t flags, const char *name)
-{
-	return flags;
-}
-#endif
 
 static inline bool is_kmalloc_cache(struct kmem_cache *s)
 {
-#ifndef CONFIG_SLOB
 	return (s->flags & SLAB_KMALLOC);
-#else
-	return false;
-#endif
 }
 
 /* Legal flag mask for kmem_cache_create(), for various configurations */
@@ -634,7 +578,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
-#ifndef CONFIG_SLOB
 static inline struct kmem_cache *virt_to_cache(const void *obj)
 {
 	struct slab *slab;
@@ -684,8 +627,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 
 void free_large_kmalloc(struct folio *folio, void *object);
 
-#endif /* CONFIG_SLOB */
-
 size_t __ksize(const void *objp);
 
 static inline size_t slab_ksize(const struct kmem_cache *s)
@@ -777,7 +718,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
 }
 
-#ifndef CONFIG_SLOB
 /*
  * The slab lists for all objects.
  */
@@ -824,7 +764,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 	for (__node = 0; __node < nr_node_ids; __node++) \
 		 if ((__n = get_node(__s, __node)))
 
-#endif
 
 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
 void dump_unreclaimable_slab(void);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bf4e777cfe90..1522693295f5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -625,7 +625,6 @@ void kmem_dump_obj(void *object)
 EXPORT_SYMBOL_GPL(kmem_dump_obj);
 #endif
 
-#ifndef CONFIG_SLOB
 /* Create a cache during boot when no slab services are available yet */
 void __init create_boot_cache(struct kmem_cache *s, const char *name,
 		unsigned int size, slab_flags_t flags,
@@ -1079,7 +1078,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_node_trace);
-#endif /* !CONFIG_SLOB */
 
 gfp_t kmalloc_fix_flags(gfp_t flags)
 {
-- 
cgit v1.2.3


From ae65a5211d90e54ae604012ce9cf234c48780929 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 2 Mar 2023 16:01:00 +0100
Subject: mm/slab: document kfree() as allowed for kmem_cache_alloc() objects

This will make it easier to free objects in situations when they can
come from either kmalloc() or kmem_cache_alloc(), and also allow
kfree_rcu() for freeing objects from kmem_cache_alloc().

For the SLAB and SLUB allocators this was always possible so with SLOB
gone, we can document it as supported.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
---
 Documentation/core-api/memory-allocation.rst | 17 +++++++++++++----
 include/linux/rcupdate.h                     |  6 ++++--
 mm/slab_common.c                             |  5 +----
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst
index 5954ddf6ee13..1c58d883b273 100644
--- a/Documentation/core-api/memory-allocation.rst
+++ b/Documentation/core-api/memory-allocation.rst
@@ -170,7 +170,16 @@ should be used if a part of the cache might be copied to the userspace.
 After the cache is created kmem_cache_alloc() and its convenience
 wrappers can allocate memory from that cache.
 
-When the allocated memory is no longer needed it must be freed. You can
-use kvfree() for the memory allocated with `kmalloc`, `vmalloc` and
-`kvmalloc`. The slab caches should be freed with kmem_cache_free(). And
-don't forget to destroy the cache with kmem_cache_destroy().
+When the allocated memory is no longer needed it must be freed.
+
+Objects allocated by `kmalloc` can be freed by `kfree` or `kvfree`. Objects
+allocated by `kmem_cache_alloc` can be freed with `kmem_cache_free`, `kfree`
+or `kvfree`, where the latter two might be more convenient thanks to not
+needing the kmem_cache pointer.
+
+The same rules apply to _bulk and _rcu flavors of freeing functions.
+
+Memory allocated by `vmalloc` can be freed with `vfree` or `kvfree`.
+Memory allocated by `kvmalloc` can be freed with `kvfree`.
+Caches created by `kmem_cache_create` should be freed with
+`kmem_cache_destroy` only after freeing all the allocated objects first.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 094321c17e48..dcd2cf1e8326 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -976,8 +976,10 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * either fall back to use of call_rcu() or rearrange the structure to
  * position the rcu_head structure into the first 4096 bytes.
  *
- * Note that the allowable offset might decrease in the future, for example,
- * to allow something like kmem_cache_free_rcu().
+ * The object to be freed can be allocated either by kmalloc() or
+ * kmem_cache_alloc().
+ *
+ * Note that the allowable offset might decrease in the future.
  *
  * The BUILD_BUG_ON check must not involve any function calls, hence the
  * checks are done in macros here.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1522693295f5..607249785c07 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -989,12 +989,9 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
 
 /**
  * kfree - free previously allocated memory
- * @object: pointer returned by kmalloc.
+ * @object: pointer returned by kmalloc() or kmem_cache_alloc()
  *
  * If @object is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
  */
 void kfree(const void *object)
 {
-- 
cgit v1.2.3


From 2b76ffe81e32afd6d318dc4547e2ba8c46207b77 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 28 Mar 2023 19:15:29 -0700
Subject: linux/vt_buffer.h: allow either builtin or modular for macros

Fix build errors on ARCH=alpha when CONFIG_MDA_CONSOLE=m.
This allows the ARCH macros to be the only ones defined.

In file included from ../drivers/video/console/mdacon.c:37:
../arch/alpha/include/asm/vga.h:17:40: error: expected identifier or '(' before 'volatile'
   17 | static inline void scr_writew(u16 val, volatile u16 *addr)
      |                                        ^~~~~~~~
../include/linux/vt_buffer.h:24:34: note: in definition of macro 'scr_writew'
   24 | #define scr_writew(val, addr) (*(addr) = (val))
      |                                  ^~~~
../include/linux/vt_buffer.h:24:40: error: expected ')' before '=' token
   24 | #define scr_writew(val, addr) (*(addr) = (val))
      |                                        ^
../arch/alpha/include/asm/vga.h:17:20: note: in expansion of macro 'scr_writew'
   17 | static inline void scr_writew(u16 val, volatile u16 *addr)
      |                    ^~~~~~~~~~
../arch/alpha/include/asm/vga.h:25:29: error: expected identifier or '(' before 'volatile'
   25 | static inline u16 scr_readw(volatile const u16 *addr)
      |                             ^~~~~~~~

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-fbdev@vger.kernel.org
Link: https://lore.kernel.org/r/20230329021529.16188-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/vt_buffer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index 848db1b1569f..919d999a8c1d 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -16,7 +16,7 @@
 
 #include <linux/string.h>
 
-#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE)
+#if IS_ENABLED(CONFIG_VGA_CONSOLE) || IS_ENABLED(CONFIG_MDA_CONSOLE)
 #include <asm/vga.h>
 #endif
 
-- 
cgit v1.2.3


From 4ca589661d964840d0d5de4b3baabbef78f453e3 Mon Sep 17 00:00:00 2001
From: Daniel Starke <daniel.starke@siemens.com>
Date: Wed, 15 Mar 2023 11:53:52 +0100
Subject: tty: n_gsm: add ioctl for DLC specific parameter configuration

Parameter negotiation has been introduced with
commit 92f1f0c3290d ("tty: n_gsm: add parameter negotiation support")

However, means to set individual parameters per DLCI are not yet
implemented. Furthermore, it is currently not possible to keep a DLCI half
open until the user application sets the right parameters for it. This is
required to allow a user application to set its specific parameters before
the underlying link is established. Otherwise, the link is opened and
re-established right afterwards if the user application sets incompatible
parameters. This may be an unexpected behavior for the peer.

Add parameter 'wait_config' to 'gsm_config' to support setups where the
DLCI specific user application sets its specific parameters after open()
and before the link gets fully established. Setting this to zero disables
the user application specific DLCI configuration option.

Add the ioctls 'GSMIOC_GETCONF_DLCI' and 'GSMIOC_SETCONF_DLCI' for the
ldisc and virtual ttys. This gets/sets the DLCI specific parameters and may
trigger a reconnect of the DLCI if incompatible values have been set. Only
the parameters for the DLCI associated with the virtual tty can be set or
retrieved if called on these.

Add remark within the documentation to introduce the new ioctls.

Link: https://lore.kernel.org/oe-kbuild-all/202302281856.S9Lz4gHB-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Daniel Starke <daniel.starke@siemens.com>
Link: https://lore.kernel.org/r/20230315105354.6234-1-daniel.starke@siemens.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/tty/n_gsm.rst |   4 +
 drivers/tty/n_gsm.c                    | 192 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/gsmmux.h            |  17 ++-
 3 files changed, 207 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/tty/n_gsm.rst b/Documentation/driver-api/tty/n_gsm.rst
index 9447b8a3b8e2..c2624546fb8f 100644
--- a/Documentation/driver-api/tty/n_gsm.rst
+++ b/Documentation/driver-api/tty/n_gsm.rst
@@ -29,6 +29,8 @@ Config Initiator
 
 #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl.
 
+#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults.
+
 #. Obtain base gsmtty number for the used serial port.
 
    Major parts of the initialization program
@@ -120,6 +122,8 @@ Config Requester
 
 #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl.
 
+#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults.
+
 #. Obtain base gsmtty number for the used serial port::
 
         #include <stdio.h>
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 7aa05ebed8e1..9d0e7701ffd4 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -128,6 +128,7 @@ struct gsm_msg {
 
 enum gsm_dlci_state {
 	DLCI_CLOSED,
+	DLCI_WAITING_CONFIG,	/* Waiting for DLCI configuration from user */
 	DLCI_CONFIGURE,		/* Sending PN (for adaption > 1) */
 	DLCI_OPENING,		/* Sending SABM not seen UA */
 	DLCI_OPEN,		/* SABM/UA complete */
@@ -330,6 +331,7 @@ struct gsm_mux {
 	unsigned int t3;	/* Power wake-up timer in seconds. */
 	int n2;			/* Retry count */
 	u8 k;			/* Window size */
+	bool wait_config;	/* Wait for configuration by ioctl before DLCI open */
 	u32 keep_alive;		/* Control channel keep-alive in 10ms */
 
 	/* Statistics (not currently exposed) */
@@ -451,6 +453,7 @@ static int gsm_modem_update(struct gsm_dlci *dlci, u8 brk);
 static struct gsm_msg *gsm_data_alloc(struct gsm_mux *gsm, u8 addr, int len,
 								u8 ctrl);
 static int gsm_send_packet(struct gsm_mux *gsm, struct gsm_msg *msg);
+static struct gsm_dlci *gsm_dlci_alloc(struct gsm_mux *gsm, int addr);
 static void gsmld_write_trigger(struct gsm_mux *gsm);
 static void gsmld_write_task(struct work_struct *work);
 
@@ -2280,6 +2283,7 @@ static void gsm_dlci_begin_open(struct gsm_dlci *dlci)
 
 	switch (dlci->state) {
 	case DLCI_CLOSED:
+	case DLCI_WAITING_CONFIG:
 	case DLCI_CLOSING:
 		dlci->retries = gsm->n2;
 		if (!need_pn) {
@@ -2311,6 +2315,7 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci)
 {
 	switch (dlci->state) {
 	case DLCI_CLOSED:
+	case DLCI_WAITING_CONFIG:
 	case DLCI_CLOSING:
 		dlci->state = DLCI_OPENING;
 		break;
@@ -2319,6 +2324,24 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci)
 	}
 }
 
+/**
+ * gsm_dlci_set_wait_config	-	wait for channel configuration
+ * @dlci: DLCI to configure
+ *
+ * Wait for a DLCI configuration from the application.
+ */
+static void gsm_dlci_set_wait_config(struct gsm_dlci *dlci)
+{
+	switch (dlci->state) {
+	case DLCI_CLOSED:
+	case DLCI_CLOSING:
+		dlci->state = DLCI_WAITING_CONFIG;
+		break;
+	default:
+		break;
+	}
+}
+
 /**
  *	gsm_dlci_begin_close	-	start channel open procedure
  *	@dlci: DLCI to open
@@ -2453,6 +2476,128 @@ static void gsm_kick_timer(struct timer_list *t)
 		pr_info("%s TX queue stalled\n", __func__);
 }
 
+/**
+ * gsm_dlci_copy_config_values	-	copy DLCI configuration
+ * @dlci: source DLCI
+ * @dc: configuration structure to fill
+ */
+static void gsm_dlci_copy_config_values(struct gsm_dlci *dlci, struct gsm_dlci_config *dc)
+{
+	memset(dc, 0, sizeof(*dc));
+	dc->channel = (u32)dlci->addr;
+	dc->adaption = (u32)dlci->adaption;
+	dc->mtu = (u32)dlci->mtu;
+	dc->priority = (u32)dlci->prio;
+	if (dlci->ftype == UIH)
+		dc->i = 1;
+	else
+		dc->i = 2;
+	dc->k = (u32)dlci->k;
+}
+
+/**
+ * gsm_dlci_config	-	configure DLCI from configuration
+ * @dlci: DLCI to configure
+ * @dc: DLCI configuration
+ * @open: open DLCI after configuration?
+ */
+static int gsm_dlci_config(struct gsm_dlci *dlci, struct gsm_dlci_config *dc, int open)
+{
+	struct gsm_mux *gsm;
+	bool need_restart = false;
+	bool need_open = false;
+	unsigned int i;
+
+	/*
+	 * Check that userspace doesn't put stuff in here to prevent breakages
+	 * in the future.
+	 */
+	for (i = 0; i < ARRAY_SIZE(dc->reserved); i++)
+		if (dc->reserved[i])
+			return -EINVAL;
+
+	if (!dlci)
+		return -EINVAL;
+	gsm = dlci->gsm;
+
+	/* Stuff we don't support yet - I frame transport */
+	if (dc->adaption != 1 && dc->adaption != 2)
+		return -EOPNOTSUPP;
+	if (dc->mtu > MAX_MTU || dc->mtu < MIN_MTU || dc->mtu > gsm->mru)
+		return -EINVAL;
+	if (dc->priority >= 64)
+		return -EINVAL;
+	if (dc->i == 0 || dc->i > 2)  /* UIH and UI only */
+		return -EINVAL;
+	if (dc->k > 7)
+		return -EINVAL;
+
+	/*
+	 * See what is needed for reconfiguration
+	 */
+	/* Framing fields */
+	if (dc->adaption != dlci->adaption)
+		need_restart = true;
+	if (dc->mtu != dlci->mtu)
+		need_restart = true;
+	if (dc->i != dlci->ftype)
+		need_restart = true;
+	/* Requires care */
+	if (dc->priority != dlci->prio)
+		need_restart = true;
+
+	if ((open && gsm->wait_config) || need_restart)
+		need_open = true;
+	if (dlci->state == DLCI_WAITING_CONFIG) {
+		need_restart = false;
+		need_open = true;
+	}
+
+	/*
+	 * Close down what is needed, restart and initiate the new
+	 * configuration.
+	 */
+	if (need_restart) {
+		gsm_dlci_begin_close(dlci);
+		wait_event_interruptible(gsm->event, dlci->state == DLCI_CLOSED);
+		if (signal_pending(current))
+			return -EINTR;
+	}
+	/*
+	 * Setup the new configuration values
+	 */
+	dlci->adaption = (int)dc->adaption;
+
+	if (dc->mtu)
+		dlci->mtu = (unsigned int)dc->mtu;
+	else
+		dlci->mtu = gsm->mtu;
+
+	if (dc->priority)
+		dlci->prio = (u8)dc->priority;
+	else
+		dlci->prio = roundup(dlci->addr + 1, 8) - 1;
+
+	if (dc->i == 1)
+		dlci->ftype = UIH;
+	else if (dc->i == 2)
+		dlci->ftype = UI;
+
+	if (dc->k)
+		dlci->k = (u8)dc->k;
+	else
+		dlci->k = gsm->k;
+
+	if (need_open) {
+		if (gsm->initiator)
+			gsm_dlci_begin_open(dlci);
+		else
+			gsm_dlci_set_opening(dlci);
+	}
+
+	return 0;
+}
+
 /*
  *	Allocate/Free DLCI channels
  */
@@ -3078,6 +3223,7 @@ static struct gsm_mux *gsm_alloc_mux(void)
 	gsm->mru = 64;	/* Default to encoding 1 so these should be 64 */
 	gsm->mtu = 64;
 	gsm->dead = true;	/* Avoid early tty opens */
+	gsm->wait_config = false; /* Disabled */
 	gsm->keep_alive = 0;	/* Disabled */
 
 	/* Store the instance to the mux array or abort if no space is
@@ -3218,6 +3364,7 @@ static void gsm_copy_config_ext_values(struct gsm_mux *gsm,
 				       struct gsm_config_ext *ce)
 {
 	memset(ce, 0, sizeof(*ce));
+	ce->wait_config = gsm->wait_config ? 1 : 0;
 	ce->keep_alive = gsm->keep_alive;
 }
 
@@ -3233,7 +3380,12 @@ static int gsm_config_ext(struct gsm_mux *gsm, struct gsm_config_ext *ce)
 		if (ce->reserved[i])
 			return -EINVAL;
 
+	/*
+	 * Setup the new configuration values
+	 */
+	gsm->wait_config = ce->wait_config ? true : false;
 	gsm->keep_alive = ce->keep_alive;
+
 	return 0;
 }
 
@@ -3436,6 +3588,14 @@ static int gsmld_open(struct tty_struct *tty)
 	gsm->encoding = GSM_ADV_OPT;
 	gsmld_attach_gsm(tty, gsm);
 
+	/* The mux will not be activated yet, we wait for correct
+	 * configuration first.
+	 */
+	if (gsm->encoding == GSM_BASIC_OPT)
+		gsm->receive = gsm0_receive;
+	else
+		gsm->receive = gsm1_receive;
+
 	return 0;
 }
 
@@ -4008,7 +4168,6 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp)
 {
 	struct gsm_dlci *dlci = tty->driver_data;
 	struct tty_port *port = &dlci->port;
-	struct gsm_mux *gsm = dlci->gsm;
 
 	port->count++;
 	tty_port_tty_set(port, tty);
@@ -4018,10 +4177,15 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp)
 	   a DM straight back. This is ok as that will have caused a hangup */
 	tty_port_set_initialized(port, true);
 	/* Start sending off SABM messages */
-	if (gsm->initiator)
-		gsm_dlci_begin_open(dlci);
-	else
-		gsm_dlci_set_opening(dlci);
+	if (!dlci->gsm->wait_config) {
+		/* Start sending off SABM messages */
+		if (dlci->gsm->initiator)
+			gsm_dlci_begin_open(dlci);
+		else
+			gsm_dlci_set_opening(dlci);
+	} else {
+		gsm_dlci_set_wait_config(dlci);
+	}
 	/* And wait for virtual carrier */
 	return tty_port_block_til_ready(port, tty, filp);
 }
@@ -4142,6 +4306,7 @@ static int gsmtty_ioctl(struct tty_struct *tty,
 {
 	struct gsm_dlci *dlci = tty->driver_data;
 	struct gsm_netconfig nc;
+	struct gsm_dlci_config dc;
 	int index;
 
 	if (dlci->state == DLCI_CLOSED)
@@ -4165,6 +4330,23 @@ static int gsmtty_ioctl(struct tty_struct *tty,
 		gsm_destroy_network(dlci);
 		mutex_unlock(&dlci->mutex);
 		return 0;
+	case GSMIOC_GETCONF_DLCI:
+		if (copy_from_user(&dc, (void __user *)arg, sizeof(dc)))
+			return -EFAULT;
+		if (dc.channel != dlci->addr)
+			return -EPERM;
+		gsm_dlci_copy_config_values(dlci, &dc);
+		if (copy_to_user((void __user *)arg, &dc, sizeof(dc)))
+			return -EFAULT;
+		return 0;
+	case GSMIOC_SETCONF_DLCI:
+		if (copy_from_user(&dc, (void __user *)arg, sizeof(dc)))
+			return -EFAULT;
+		if (dc.channel >= NUM_DLCI)
+			return -EINVAL;
+		if (dc.channel != 0 && dc.channel != dlci->addr)
+			return -EPERM;
+		return gsm_dlci_config(dlci, &dc, 1);
 	case TIOCMIWAIT:
 		return gsm_wait_modem_change(dlci, (u32)arg);
 	default:
diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h
index a703780aa095..eb67884e5f38 100644
--- a/include/uapi/linux/gsmmux.h
+++ b/include/uapi/linux/gsmmux.h
@@ -43,10 +43,25 @@ struct gsm_config_ext {
 	__u32 keep_alive;	/* Control channel keep-alive in 1/100th of a
 				 * second (0 to disable)
 				 */
-	__u32 reserved[7];	/* For future use, must be initialized to zero */
+	__u32 wait_config;	/* Wait for DLCI config before opening virtual link? */
+	__u32 reserved[6];	/* For future use, must be initialized to zero */
 };
 
 #define GSMIOC_GETCONF_EXT	_IOR('G', 5, struct gsm_config_ext)
 #define GSMIOC_SETCONF_EXT	_IOW('G', 6, struct gsm_config_ext)
 
+/* Set channel accordingly before calling GSMIOC_GETCONF_DLCI. */
+struct gsm_dlci_config {
+	__u32 channel;		/* DLCI (0 for the associated DLCI) */
+	__u32 adaption;		/* Convergence layer type */
+	__u32 mtu;		/* Maximum transfer unit */
+	__u32 priority;		/* Priority (0 for default value) */
+	__u32 i;		/* Frame type (1 = UIH, 2 = UI) */
+	__u32 k;		/* Window size (0 for default value) */
+	__u32 reserved[8];	/* For future use, must be initialized to zero */
+};
+
+#define GSMIOC_GETCONF_DLCI	_IOWR('G', 7, struct gsm_dlci_config)
+#define GSMIOC_SETCONF_DLCI	_IOW('G', 8, struct gsm_dlci_config)
+
 #endif
-- 
cgit v1.2.3


From 2959ab247061e67485d83b6af8feb3761ec08cb9 Mon Sep 17 00:00:00 2001
From: Nipun Gupta <nipun.gupta@amd.com>
Date: Mon, 13 Mar 2023 18:56:30 +0530
Subject: cdx: add the cdx bus driver

Introduce AMD CDX bus, which provides a mechanism for scanning
and probing CDX devices. These devices are memory mapped on
system bus for Application Processors(APUs).

CDX devices can be changed dynamically in the Fabric and CDX
bus interacts with CDX controller to rescan the bus and
rediscover the devices.

Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Tested-by: Nikhil Agarwal <nikhil.agarwal@amd.com>
Link: https://lore.kernel.org/r/20230313132636.31850-2-nipun.gupta@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-cdx |  12 +
 MAINTAINERS                             |   7 +
 drivers/Kconfig                         |   2 +
 drivers/Makefile                        |   1 +
 drivers/cdx/Kconfig                     |  17 ++
 drivers/cdx/Makefile                    |   8 +
 drivers/cdx/cdx.c                       | 408 ++++++++++++++++++++++++++++++++
 drivers/cdx/cdx.h                       |  62 +++++
 include/linux/cdx/cdx_bus.h             | 147 ++++++++++++
 include/linux/mod_devicetable.h         |  15 ++
 scripts/mod/devicetable-offsets.c       |   4 +
 scripts/mod/file2alias.c                |  12 +
 12 files changed, 695 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-cdx
 create mode 100644 drivers/cdx/Kconfig
 create mode 100644 drivers/cdx/Makefile
 create mode 100644 drivers/cdx/cdx.c
 create mode 100644 drivers/cdx/cdx.h
 create mode 100644 include/linux/cdx/cdx_bus.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
new file mode 100644
index 000000000000..43b4e161f226
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -0,0 +1,12 @@
+What:		/sys/bus/cdx/rescan
+Date:		March 2023
+Contact:	nipun.gupta@amd.com
+Description:
+		Writing y/1/on to this file will cause rescan of the bus
+		and devices on the CDX bus. Any new devices are scanned and
+		added to the list of Linux devices and any devices removed are
+		also deleted from Linux.
+
+		For example::
+
+		  # echo 1 > /sys/bus/cdx/rescan
diff --git a/MAINTAINERS b/MAINTAINERS
index 619c30ec36dc..973cfbd3e3e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -964,6 +964,13 @@ Q:	https://patchwork.kernel.org/project/linux-rdma/list/
 F:	drivers/infiniband/hw/efa/
 F:	include/uapi/rdma/efa-abi.h
 
+AMD CDX BUS DRIVER
+M:	Nipun Gupta <nipun.gupta@amd.com>
+M:	Nikhil Agarwal <nikhil.agarwal@amd.com>
+S:	Maintained
+F:	drivers/cdx/*
+F:	include/linux/cdx/*
+
 AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER
 M:	Tom Lendacky <thomas.lendacky@amd.com>
 M:	John Allen <john.allen@amd.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 968bd0a6fd78..514ae6b24cb2 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -241,4 +241,6 @@ source "drivers/peci/Kconfig"
 
 source "drivers/hte/Kconfig"
 
+source "drivers/cdx/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 20b118dca999..7241d80a7b29 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -194,3 +194,4 @@ obj-$(CONFIG_MOST)		+= most/
 obj-$(CONFIG_PECI)		+= peci/
 obj-$(CONFIG_HTE)		+= hte/
 obj-$(CONFIG_DRM_ACCEL)		+= accel/
+obj-$(CONFIG_CDX_BUS)		+= cdx/
diff --git a/drivers/cdx/Kconfig b/drivers/cdx/Kconfig
new file mode 100644
index 000000000000..1aaee0ec5bd9
--- /dev/null
+++ b/drivers/cdx/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CDX bus configuration
+#
+# Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+#
+
+config CDX_BUS
+	bool "CDX Bus driver"
+	depends on OF && ARM64
+	help
+	  Driver to enable Composable DMA Transfer(CDX) Bus. CDX bus
+	  exposes Fabric devices which uses composable DMA IP to the
+	  APU. CDX bus provides a mechanism for scanning and probing
+	  of CDX devices. CDX devices are memory mapped on system bus
+	  for embedded CPUs. CDX bus uses CDX controller and firmware
+	  to scan these CDX devices.
diff --git a/drivers/cdx/Makefile b/drivers/cdx/Makefile
new file mode 100644
index 000000000000..c5feb11fc718
--- /dev/null
+++ b/drivers/cdx/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for CDX
+#
+# Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+#
+
+obj-$(CONFIG_CDX_BUS) += cdx.o
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
new file mode 100644
index 000000000000..6b162bbb9b25
--- /dev/null
+++ b/drivers/cdx/cdx.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CDX bus driver.
+ *
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+/*
+ * Architecture Overview
+ * =====================
+ * CDX is a Hardware Architecture designed for AMD FPGA devices. It
+ * consists of sophisticated mechanism for interaction between FPGA,
+ * Firmware and the APUs (Application CPUs).
+ *
+ * Firmware resides on RPU (Realtime CPUs) which interacts with
+ * the FPGA program manager and the APUs. The RPU provides memory-mapped
+ * interface (RPU if) which is used to communicate with APUs.
+ *
+ * The diagram below shows an overview of the CDX architecture:
+ *
+ *          +--------------------------------------+
+ *          |    Application CPUs (APU)            |
+ *          |                                      |
+ *          |                    CDX device drivers|
+ *          |     Linux OS                |        |
+ *          |                        CDX bus       |
+ *          |                             |        |
+ *          |                     CDX controller   |
+ *          |                             |        |
+ *          +-----------------------------|--------+
+ *                                        | (discover, config,
+ *                                        |  reset, rescan)
+ *                                        |
+ *          +------------------------| RPU if |----+
+ *          |                             |        |
+ *          |                             V        |
+ *          |          Realtime CPUs (RPU)         |
+ *          |                                      |
+ *          +--------------------------------------+
+ *                                |
+ *          +---------------------|----------------+
+ *          |  FPGA               |                |
+ *          |      +-----------------------+       |
+ *          |      |           |           |       |
+ *          | +-------+    +-------+   +-------+   |
+ *          | | dev 1 |    | dev 2 |   | dev 3 |   |
+ *          | +-------+    +-------+   +-------+   |
+ *          +--------------------------------------+
+ *
+ * The RPU firmware extracts the device information from the loaded FPGA
+ * image and implements a mechanism that allows the APU drivers to
+ * enumerate such devices (device personality and resource details) via
+ * a dedicated communication channel. RPU mediates operations such as
+ * discover, reset and rescan of the FPGA devices for the APU. This is
+ * done using memory mapped interface provided by the RPU to APU.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/xarray.h>
+#include <linux/cdx/cdx_bus.h>
+#include "cdx.h"
+
+/* Default DMA mask for devices on a CDX bus */
+#define CDX_DEFAULT_DMA_MASK	(~0ULL)
+#define MAX_CDX_CONTROLLERS 16
+
+/* CDX controllers registered with the CDX bus */
+static DEFINE_XARRAY_ALLOC(cdx_controllers);
+
+/**
+ * cdx_unregister_device - Unregister a CDX device
+ * @dev: CDX device
+ * @data: This is always passed as NULL, and is not used in this API,
+ *	  but is required here as the bus_for_each_dev() API expects
+ *	  the passed function (cdx_unregister_device) to have this
+ *	  as an argument.
+ *
+ * Return: 0 on success.
+ */
+static int cdx_unregister_device(struct device *dev,
+				 void *data)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	kfree(cdx_dev->driver_override);
+	cdx_dev->driver_override = NULL;
+	/*
+	 * Do not free cdx_dev here as it would be freed in
+	 * cdx_device_release() called from within put_device().
+	 */
+	device_del(&cdx_dev->dev);
+	put_device(&cdx_dev->dev);
+
+	return 0;
+}
+
+static void cdx_unregister_devices(struct bus_type *bus)
+{
+	/* Reset all the devices attached to cdx bus */
+	bus_for_each_dev(bus, NULL, NULL, cdx_unregister_device);
+}
+
+/**
+ * cdx_match_one_device - Tell if a CDX device structure has a matching
+ *			  CDX device id structure
+ * @id: single CDX device id structure to match
+ * @dev: the CDX device structure to match against
+ *
+ * Return: matching cdx_device_id structure or NULL if there is no match.
+ */
+static inline const struct cdx_device_id *
+cdx_match_one_device(const struct cdx_device_id *id,
+		     const struct cdx_device *dev)
+{
+	/* Use vendor ID and device ID for matching */
+	if ((id->vendor == CDX_ANY_ID || id->vendor == dev->vendor) &&
+	    (id->device == CDX_ANY_ID || id->device == dev->device))
+		return id;
+	return NULL;
+}
+
+/**
+ * cdx_match_id - See if a CDX device matches a given cdx_id table
+ * @ids: array of CDX device ID structures to search in
+ * @dev: the CDX device structure to match against.
+ *
+ * Used by a driver to check whether a CDX device is in its list of
+ * supported devices. Returns the matching cdx_device_id structure or
+ * NULL if there is no match.
+ *
+ * Return: matching cdx_device_id structure or NULL if there is no match.
+ */
+static inline const struct cdx_device_id *
+cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev)
+{
+	if (ids) {
+		while (ids->vendor || ids->device) {
+			if (cdx_match_one_device(ids, dev))
+				return ids;
+			ids++;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * cdx_bus_match - device to driver matching callback
+ * @dev: the cdx device to match against
+ * @drv: the device driver to search for matching cdx device
+ * structures
+ *
+ * Return: true on success, false otherwise.
+ */
+static int cdx_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	struct cdx_driver *cdx_drv = to_cdx_driver(drv);
+	const struct cdx_device_id *found_id = NULL;
+	const struct cdx_device_id *ids;
+
+	ids = cdx_drv->match_id_table;
+
+	/* When driver_override is set, only bind to the matching driver */
+	if (cdx_dev->driver_override && strcmp(cdx_dev->driver_override, drv->name))
+		return false;
+
+	found_id = cdx_match_id(ids, cdx_dev);
+	if (!found_id)
+		return false;
+
+	do {
+		/*
+		 * In case override_only was set, enforce driver_override
+		 * matching.
+		 */
+		if (!found_id->override_only)
+			return true;
+		if (cdx_dev->driver_override)
+			return true;
+
+		ids = found_id + 1;
+		found_id = cdx_match_id(ids, cdx_dev);
+	} while (found_id);
+
+	return false;
+}
+
+static int cdx_probe(struct device *dev)
+{
+	struct cdx_driver *cdx_drv = to_cdx_driver(dev->driver);
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	int error;
+
+	error = cdx_drv->probe(cdx_dev);
+	if (error) {
+		dev_err_probe(dev, error, "%s failed\n", __func__);
+		return error;
+	}
+
+	return 0;
+}
+
+static void cdx_remove(struct device *dev)
+{
+	struct cdx_driver *cdx_drv = to_cdx_driver(dev->driver);
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	if (cdx_drv && cdx_drv->remove)
+		cdx_drv->remove(cdx_dev);
+}
+
+static void cdx_shutdown(struct device *dev)
+{
+	struct cdx_driver *cdx_drv = to_cdx_driver(dev->driver);
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	if (cdx_drv && cdx_drv->shutdown)
+		cdx_drv->shutdown(cdx_dev);
+}
+
+static int cdx_dma_configure(struct device *dev)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	u32 input_id = cdx_dev->req_id;
+	int ret;
+
+	ret = of_dma_configure_id(dev, dev->parent->of_node, 0, &input_id);
+	if (ret && ret != -EPROBE_DEFER) {
+		dev_err(dev, "of_dma_configure_id() failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static ssize_t rescan_store(struct bus_type *bus,
+			    const char *buf, size_t count)
+{
+	struct cdx_controller *cdx;
+	unsigned long index;
+	bool val;
+
+	if (kstrtobool(buf, &val) < 0)
+		return -EINVAL;
+
+	if (!val)
+		return -EINVAL;
+
+	/* Unregister all the devices on the bus */
+	cdx_unregister_devices(&cdx_bus_type);
+
+	/* Rescan all the devices */
+	xa_for_each(&cdx_controllers, index, cdx) {
+		int ret;
+
+		ret = cdx->ops->scan(cdx);
+		if (ret)
+			dev_err(cdx->dev, "cdx bus scanning failed\n");
+	}
+
+	return count;
+}
+static BUS_ATTR_WO(rescan);
+
+static struct attribute *cdx_bus_attrs[] = {
+	&bus_attr_rescan.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(cdx_bus);
+
+struct bus_type cdx_bus_type = {
+	.name		= "cdx",
+	.match		= cdx_bus_match,
+	.probe		= cdx_probe,
+	.remove		= cdx_remove,
+	.shutdown	= cdx_shutdown,
+	.dma_configure	= cdx_dma_configure,
+	.bus_groups	= cdx_bus_groups,
+};
+EXPORT_SYMBOL_GPL(cdx_bus_type);
+
+int __cdx_driver_register(struct cdx_driver *cdx_driver,
+			  struct module *owner)
+{
+	int error;
+
+	cdx_driver->driver.owner = owner;
+	cdx_driver->driver.bus = &cdx_bus_type;
+
+	error = driver_register(&cdx_driver->driver);
+	if (error) {
+		pr_err("driver_register() failed for %s: %d\n",
+		       cdx_driver->driver.name, error);
+		return error;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__cdx_driver_register);
+
+void cdx_driver_unregister(struct cdx_driver *cdx_driver)
+{
+	driver_unregister(&cdx_driver->driver);
+}
+EXPORT_SYMBOL_GPL(cdx_driver_unregister);
+
+static void cdx_device_release(struct device *dev)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	kfree(cdx_dev);
+}
+
+int cdx_device_add(struct cdx_dev_params *dev_params)
+{
+	struct cdx_controller *cdx = dev_params->cdx;
+	struct device *parent = cdx->dev;
+	struct cdx_device *cdx_dev;
+	int ret;
+
+	cdx_dev = kzalloc(sizeof(*cdx_dev), GFP_KERNEL);
+	if (!cdx_dev)
+		return -ENOMEM;
+
+	/* Populate resource */
+	memcpy(cdx_dev->res, dev_params->res, sizeof(struct resource) *
+		dev_params->res_count);
+	cdx_dev->res_count = dev_params->res_count;
+
+	/* Populate CDX dev params */
+	cdx_dev->req_id = dev_params->req_id;
+	cdx_dev->vendor = dev_params->vendor;
+	cdx_dev->device = dev_params->device;
+	cdx_dev->bus_num = dev_params->bus_num;
+	cdx_dev->dev_num = dev_params->dev_num;
+	cdx_dev->cdx = dev_params->cdx;
+	cdx_dev->dma_mask = CDX_DEFAULT_DMA_MASK;
+
+	/* Initialize generic device */
+	device_initialize(&cdx_dev->dev);
+	cdx_dev->dev.parent = parent;
+	cdx_dev->dev.bus = &cdx_bus_type;
+	cdx_dev->dev.dma_mask = &cdx_dev->dma_mask;
+	cdx_dev->dev.release = cdx_device_release;
+
+	/* Set Name */
+	dev_set_name(&cdx_dev->dev, "cdx-%02x:%02x",
+		     ((cdx->id << CDX_CONTROLLER_ID_SHIFT) | (cdx_dev->bus_num & CDX_BUS_NUM_MASK)),
+		     cdx_dev->dev_num);
+
+	ret = device_add(&cdx_dev->dev);
+	if (ret) {
+		dev_err(&cdx_dev->dev,
+			"cdx device add failed: %d", ret);
+		goto fail;
+	}
+
+	return 0;
+fail:
+	/*
+	 * Do not free cdx_dev here as it would be freed in
+	 * cdx_device_release() called from put_device().
+	 */
+	put_device(&cdx_dev->dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_device_add);
+
+int cdx_register_controller(struct cdx_controller *cdx)
+{
+	int ret;
+
+	ret = xa_alloc(&cdx_controllers, &cdx->id, cdx,
+		       XA_LIMIT(0, MAX_CDX_CONTROLLERS - 1), GFP_KERNEL);
+	if (ret) {
+		dev_err(cdx->dev,
+			"No free index available. Maximum controllers already registered\n");
+		cdx->id = (u8)MAX_CDX_CONTROLLERS;
+		return ret;
+	}
+
+	/* Scan all the devices */
+	cdx->ops->scan(cdx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cdx_register_controller);
+
+void cdx_unregister_controller(struct cdx_controller *cdx)
+{
+	if (cdx->id >= MAX_CDX_CONTROLLERS)
+		return;
+
+	device_for_each_child(cdx->dev, NULL, cdx_unregister_device);
+	xa_erase(&cdx_controllers, cdx->id);
+}
+EXPORT_SYMBOL_GPL(cdx_unregister_controller);
+
+static int __init cdx_bus_init(void)
+{
+	return bus_register(&cdx_bus_type);
+}
+postcore_initcall(cdx_bus_init);
diff --git a/drivers/cdx/cdx.h b/drivers/cdx/cdx.h
new file mode 100644
index 000000000000..c436ac7ac86f
--- /dev/null
+++ b/drivers/cdx/cdx.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Header file for the CDX Bus
+ *
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _CDX_H_
+#define _CDX_H_
+
+#include <linux/cdx/cdx_bus.h>
+
+/**
+ * struct cdx_dev_params - CDX device parameters
+ * @cdx: CDX controller associated with the device
+ * @parent: Associated CDX controller
+ * @vendor: Vendor ID for CDX device
+ * @device: Device ID for CDX device
+ * @bus_num: Bus number for this CDX device
+ * @dev_num: Device number for this device
+ * @res: array of MMIO region entries
+ * @res_count: number of valid MMIO regions
+ * @req_id: Requestor ID associated with CDX device
+ */
+struct cdx_dev_params {
+	struct cdx_controller *cdx;
+	u16 vendor;
+	u16 device;
+	u8 bus_num;
+	u8 dev_num;
+	struct resource res[MAX_CDX_DEV_RESOURCES];
+	u8 res_count;
+	u32 req_id;
+};
+
+/**
+ * cdx_register_controller - Register a CDX controller and its ports
+ *		on the CDX bus.
+ * @cdx: The CDX controller to register
+ *
+ * Return: -errno on failure, 0 on success.
+ */
+int cdx_register_controller(struct cdx_controller *cdx);
+
+/**
+ * cdx_unregister_controller - Unregister a CDX controller
+ * @cdx: The CDX controller to unregister
+ */
+void cdx_unregister_controller(struct cdx_controller *cdx);
+
+/**
+ * cdx_device_add - Add a CDX device. This function adds a CDX device
+ *		on the CDX bus as per the device parameters provided
+ *		by caller. It also creates and registers an associated
+ *		Linux generic device.
+ * @dev_params: device parameters associated with the device to be created.
+ *
+ * Return: -errno on failure, 0 on success.
+ */
+int cdx_device_add(struct cdx_dev_params *dev_params);
+
+#endif /* _CDX_H_ */
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
new file mode 100644
index 000000000000..d134e0104724
--- /dev/null
+++ b/include/linux/cdx/cdx_bus.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * CDX bus public interface
+ *
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ *
+ */
+
+#ifndef _CDX_BUS_H_
+#define _CDX_BUS_H_
+
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/mod_devicetable.h>
+
+#define MAX_CDX_DEV_RESOURCES	4
+#define CDX_ANY_ID (0xFFFF)
+#define CDX_CONTROLLER_ID_SHIFT 4
+#define CDX_BUS_NUM_MASK 0xF
+
+/* Forward declaration for CDX controller */
+struct cdx_controller;
+
+typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
+
+/**
+ * CDX_DEVICE_DRIVER_OVERRIDE - macro used to describe a CDX device with
+ *                              override_only flags.
+ * @vend: the 16 bit CDX Vendor ID
+ * @dev: the 16 bit CDX Device ID
+ * @driver_override: the 32 bit CDX Device override_only
+ *
+ * This macro is used to create a struct cdx_device_id that matches only a
+ * driver_override device.
+ */
+#define CDX_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \
+	.vendor = (vend), .device = (dev), .override_only = (driver_override)
+
+/**
+ * struct cdx_ops - Callbacks supported by CDX controller.
+ * @scan: scan the devices on the controller
+ */
+struct cdx_ops {
+	cdx_scan_cb scan;
+};
+
+/**
+ * struct cdx_controller: CDX controller object
+ * @dev: Linux device associated with the CDX controller.
+ * @priv: private data
+ * @id: Controller ID
+ * @ops: CDX controller ops
+ */
+struct cdx_controller {
+	struct device *dev;
+	void *priv;
+	u32 id;
+	struct cdx_ops *ops;
+};
+
+/**
+ * struct cdx_device - CDX device object
+ * @dev: Linux driver model device object
+ * @cdx: CDX controller associated with the device
+ * @vendor: Vendor ID for CDX device
+ * @device: Device ID for CDX device
+ * @bus_num: Bus number for this CDX device
+ * @dev_num: Device number for this device
+ * @res: array of MMIO region entries
+ * @res_attr: resource binary attribute
+ * @res_count: number of valid MMIO regions
+ * @dma_mask: Default DMA mask
+ * @flags: CDX device flags
+ * @req_id: Requestor ID associated with CDX device
+ * @driver_override: driver name to force a match; do not set directly,
+ *                   because core frees it; use driver_set_override() to
+ *                   set or clear it.
+ */
+struct cdx_device {
+	struct device dev;
+	struct cdx_controller *cdx;
+	u16 vendor;
+	u16 device;
+	u8 bus_num;
+	u8 dev_num;
+	struct resource res[MAX_CDX_DEV_RESOURCES];
+	u8 res_count;
+	u64 dma_mask;
+	u16 flags;
+	u32 req_id;
+	const char *driver_override;
+};
+
+#define to_cdx_device(_dev) \
+	container_of(_dev, struct cdx_device, dev)
+
+/**
+ * struct cdx_driver - CDX device driver
+ * @driver: Generic device driver
+ * @match_id_table: table of supported device matching Ids
+ * @probe: Function called when a device is added
+ * @remove: Function called when a device is removed
+ * @shutdown: Function called at shutdown time to quiesce the device
+ * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA.
+ *		For most device drivers, no need to care about this flag
+ *		as long as all DMAs are handled through the kernel DMA API.
+ *		For some special ones, for example VFIO drivers, they know
+ *		how to manage the DMA themselves and set this flag so that
+ *		the IOMMU layer will allow them to setup and manage their
+ *		own I/O address space.
+ */
+struct cdx_driver {
+	struct device_driver driver;
+	const struct cdx_device_id *match_id_table;
+	int (*probe)(struct cdx_device *dev);
+	int (*remove)(struct cdx_device *dev);
+	void (*shutdown)(struct cdx_device *dev);
+	bool driver_managed_dma;
+};
+
+#define to_cdx_driver(_drv) \
+	container_of(_drv, struct cdx_driver, driver)
+
+/* Macro to avoid include chaining to get THIS_MODULE */
+#define cdx_driver_register(drv) \
+	__cdx_driver_register(drv, THIS_MODULE)
+
+/**
+ * __cdx_driver_register - registers a CDX device driver
+ * @cdx_driver: CDX driver to register
+ * @owner: module owner
+ *
+ * Return: -errno on failure, 0 on success.
+ */
+int __must_check __cdx_driver_register(struct cdx_driver *cdx_driver,
+				       struct module *owner);
+
+/**
+ * cdx_driver_unregister - unregisters a device driver from the
+ * CDX bus.
+ * @cdx_driver: CDX driver to register
+ */
+void cdx_driver_unregister(struct cdx_driver *cdx_driver);
+
+extern struct bus_type cdx_bus_type;
+
+#endif /* _CDX_BUS_H_ */
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 35203ca3fc37..ccaaeda792c0 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -912,4 +912,19 @@ struct ishtp_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/**
+ * struct cdx_device_id - CDX device identifier
+ * @vendor: Vendor ID
+ * @device: Device ID
+ * @override_only: Match only when dev->driver_override is this driver.
+ *
+ * Type of entries in the "device Id" table for CDX devices supported by
+ * a CDX device driver.
+ */
+struct cdx_device_id {
+	__u16 vendor;
+	__u16 device;
+	__u32 override_only;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index c0d3bcb99138..62dc988df84d 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -262,5 +262,9 @@ int main(void)
 	DEVID(ishtp_device_id);
 	DEVID_FIELD(ishtp_device_id, guid);
 
+	DEVID(cdx_device_id);
+	DEVID_FIELD(cdx_device_id, vendor);
+	DEVID_FIELD(cdx_device_id, device);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 91c2e7ba5e52..28da34ba4359 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1452,6 +1452,17 @@ static int do_dfl_entry(const char *filename, void *symval, char *alias)
 	return 1;
 }
 
+/* Looks like: cdx:vNdN */
+static int do_cdx_entry(const char *filename, void *symval,
+			char *alias)
+{
+	DEF_FIELD(symval, cdx_device_id, vendor);
+	DEF_FIELD(symval, cdx_device_id, device);
+
+	sprintf(alias, "cdx:v%08Xd%08Xd", vendor, device);
+	return 1;
+}
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
@@ -1531,6 +1542,7 @@ static const struct devtable devtable[] = {
 	{"ssam", SIZE_ssam_device_id, do_ssam_entry},
 	{"dfl", SIZE_dfl_device_id, do_dfl_entry},
 	{"ishtp", SIZE_ishtp_device_id, do_ishtp_entry},
+	{"cdx", SIZE_cdx_device_id, do_cdx_entry},
 };
 
 /* Create MODULE_ALIAS() statements.
-- 
cgit v1.2.3


From 48a6c7bced2a781c911497f073347bec5ab4d7a5 Mon Sep 17 00:00:00 2001
From: Nipun Gupta <nipun.gupta@amd.com>
Date: Mon, 13 Mar 2023 18:56:36 +0530
Subject: cdx: add device attributes

Create sysfs entry for CDX devices.

Sysfs entries provided in each of the CDX device detected by
the CDX controller
 - vendor id
 - device id
 - remove
 - reset of the device.
 - driver override

Signed-off-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Tarak Reddy <tarak.reddy@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Tested-by: Nikhil Agarwal <nikhil.agarwal@amd.com>
Link: https://lore.kernel.org/r/20230313132636.31850-8-nipun.gupta@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-cdx |  44 +++++++++++
 drivers/cdx/cdx.c                       | 127 ++++++++++++++++++++++++++++++++
 drivers/cdx/controller/cdx_controller.c |  18 +++++
 drivers/cdx/controller/mcdi_functions.c |  14 ++++
 drivers/cdx/controller/mcdi_functions.h |  11 +++
 include/linux/cdx/cdx_bus.h             |  27 +++++++
 6 files changed, 241 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
index 43b4e161f226..7af477f49998 100644
--- a/Documentation/ABI/testing/sysfs-bus-cdx
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -10,3 +10,47 @@ Description:
 		For example::
 
 		  # echo 1 > /sys/bus/cdx/rescan
+
+What:		/sys/bus/cdx/devices/.../vendor
+Date:		March 2023
+Contact:	nipun.gupta@amd.com
+Description:
+		Vendor ID for this CDX device, in hexadecimal. Vendor ID is
+		16 bit identifier which is specific to the device manufacturer.
+		Combination of Vendor ID and Device ID identifies a device.
+
+What:		/sys/bus/cdx/devices/.../device
+Date:		March 2023
+Contact:	nipun.gupta@amd.com
+Description:
+		Device ID for this CDX device, in hexadecimal. Device ID is
+		16 bit identifier to identify a device type within the range
+		of a device manufacturer.
+		Combination of Vendor ID and Device ID identifies a device.
+
+What:		/sys/bus/cdx/devices/.../reset
+Date:		March 2023
+Contact:	nipun.gupta@amd.com
+Description:
+		Writing y/1/on to this file resets the CDX device.
+		On resetting the device, the corresponding driver is notified
+		twice, once before the device is being reset, and again after
+		the reset has been complete.
+
+		For example::
+
+		  # echo 1 > /sys/bus/cdx/.../reset
+
+What:		/sys/bus/cdx/devices/.../remove
+Date:		March 2023
+Contact:	tarak.reddy@amd.com
+Description:
+		Writing y/1/on to this file removes the corresponding
+		device from the CDX bus. If the device is to be reconfigured
+		reconfigured in the Hardware, the device can be removed, so
+		that the device driver does not access the device while it is
+		being reconfigured.
+
+		For example::
+
+		  # echo 1 > /sys/bus/cdx/devices/.../remove
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index 6b162bbb9b25..67c32cb2c006 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -71,6 +71,39 @@
 /* CDX controllers registered with the CDX bus */
 static DEFINE_XARRAY_ALLOC(cdx_controllers);
 
+/**
+ * cdx_dev_reset - Reset a CDX device
+ * @dev: CDX device
+ *
+ * Return: -errno on failure, 0 on success.
+ */
+int cdx_dev_reset(struct device *dev)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	struct cdx_controller *cdx = cdx_dev->cdx;
+	struct cdx_device_config dev_config = {0};
+	struct cdx_driver *cdx_drv;
+	int ret;
+
+	cdx_drv = to_cdx_driver(dev->driver);
+	/* Notify driver that device is being reset */
+	if (cdx_drv && cdx_drv->reset_prepare)
+		cdx_drv->reset_prepare(cdx_dev);
+
+	dev_config.type = CDX_DEV_RESET_CONF;
+	ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
+				      cdx_dev->dev_num, &dev_config);
+	if (ret)
+		dev_err(dev, "cdx device reset failed\n");
+
+	/* Notify driver that device reset is complete */
+	if (cdx_drv && cdx_drv->reset_done)
+		cdx_drv->reset_done(cdx_dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_dev_reset);
+
 /**
  * cdx_unregister_device - Unregister a CDX device
  * @dev: CDX device
@@ -237,6 +270,99 @@ static int cdx_dma_configure(struct device *dev)
 	return 0;
 }
 
+/* show configuration fields */
+#define cdx_config_attr(field, format_string)	\
+static ssize_t	\
+field##_show(struct device *dev, struct device_attribute *attr, char *buf)	\
+{	\
+	struct cdx_device *cdx_dev = to_cdx_device(dev);	\
+	return sysfs_emit(buf, format_string, cdx_dev->field);	\
+}	\
+static DEVICE_ATTR_RO(field)
+
+cdx_config_attr(vendor, "0x%04x\n");
+cdx_config_attr(device, "0x%04x\n");
+
+static ssize_t remove_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	bool val;
+
+	if (kstrtobool(buf, &val) < 0)
+		return -EINVAL;
+
+	if (!val)
+		return -EINVAL;
+
+	if (device_remove_file_self(dev, attr)) {
+		int ret;
+
+		ret = cdx_unregister_device(dev, NULL);
+		if (ret)
+			return ret;
+	}
+
+	return count;
+}
+static DEVICE_ATTR_WO(remove);
+
+static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	bool val;
+	int ret;
+
+	if (kstrtobool(buf, &val) < 0)
+		return -EINVAL;
+
+	if (!val)
+		return -EINVAL;
+
+	ret = cdx_dev_reset(dev);
+	if (ret)
+		return ret;
+
+	return count;
+}
+static DEVICE_ATTR_WO(reset);
+
+static ssize_t driver_override_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	int ret;
+
+	if (WARN_ON(dev->bus != &cdx_bus_type))
+		return -EINVAL;
+
+	ret = driver_set_override(dev, &cdx_dev->driver_override, buf, count);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static ssize_t driver_override_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	return sysfs_emit(buf, "%s\n", cdx_dev->driver_override);
+}
+static DEVICE_ATTR_RW(driver_override);
+
+static struct attribute *cdx_dev_attrs[] = {
+	&dev_attr_remove.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_device.attr,
+	&dev_attr_driver_override.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(cdx_dev);
+
 static ssize_t rescan_store(struct bus_type *bus,
 			    const char *buf, size_t count)
 {
@@ -280,6 +406,7 @@ struct bus_type cdx_bus_type = {
 	.shutdown	= cdx_shutdown,
 	.dma_configure	= cdx_dma_configure,
 	.bus_groups	= cdx_bus_groups,
+	.dev_groups	= cdx_dev_groups,
 };
 EXPORT_SYMBOL_GPL(cdx_bus_type);
 
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index 0d1826980935..dc52f95f8978 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -45,6 +45,23 @@ void cdx_rpmsg_pre_remove(struct cdx_controller *cdx)
 	cdx_mcdi_wait_for_quiescence(cdx->priv, MCDI_RPC_TIMEOUT);
 }
 
+static int cdx_configure_device(struct cdx_controller *cdx,
+				u8 bus_num, u8 dev_num,
+				struct cdx_device_config *dev_config)
+{
+	int ret = 0;
+
+	switch (dev_config->type) {
+	case CDX_DEV_RESET_CONF:
+		ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 static int cdx_scan_devices(struct cdx_controller *cdx)
 {
 	struct cdx_mcdi *cdx_mcdi = cdx->priv;
@@ -104,6 +121,7 @@ static int cdx_scan_devices(struct cdx_controller *cdx)
 
 static struct cdx_ops cdx_ops = {
 	.scan		= cdx_scan_devices,
+	.dev_configure	= cdx_configure_device,
 };
 
 static int xlnx_cdx_probe(struct platform_device *pdev)
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 012b52881dd5..0158f26533dd 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -123,3 +123,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 
 	return 0;
 }
+
+int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_RESET_IN_LEN);
+	int ret;
+
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_RESET_IN_BUS, bus_num);
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_RESET_IN_DEVICE, dev_num);
+
+	ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_RESET, inbuf, sizeof(inbuf),
+			   NULL, 0, NULL);
+
+	return ret;
+}
diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h
index 6bf5a4a0778f..7440ace5539a 100644
--- a/drivers/cdx/controller/mcdi_functions.h
+++ b/drivers/cdx/controller/mcdi_functions.h
@@ -47,4 +47,15 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 			    u8 bus_num, u8 dev_num,
 			    struct cdx_dev_params *dev_params);
 
+/**
+ * cdx_mcdi_reset_device - Reset cdx device represented by bus_num:dev_num
+ * @cdx: pointer to MCDI interface.
+ * @bus_num: Bus number.
+ * @dev_num: Device number.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int cdx_mcdi_reset_device(struct cdx_mcdi *cdx,
+			  u8 bus_num, u8 dev_num);
+
 #endif /* CDX_MCDI_FUNCTIONS_H */
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index d134e0104724..35ef41d8a61a 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -21,8 +21,20 @@
 /* Forward declaration for CDX controller */
 struct cdx_controller;
 
+enum {
+	CDX_DEV_RESET_CONF,
+};
+
+struct cdx_device_config {
+	u8 type;
+};
+
 typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
 
+typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
+				    u8 bus_num, u8 dev_num,
+				    struct cdx_device_config *dev_config);
+
 /**
  * CDX_DEVICE_DRIVER_OVERRIDE - macro used to describe a CDX device with
  *                              override_only flags.
@@ -39,9 +51,12 @@ typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
 /**
  * struct cdx_ops - Callbacks supported by CDX controller.
  * @scan: scan the devices on the controller
+ * @dev_configure: configuration like reset, master_enable,
+ *		   msi_config etc for a CDX device
  */
 struct cdx_ops {
 	cdx_scan_cb scan;
+	cdx_dev_configure_cb dev_configure;
 };
 
 /**
@@ -101,6 +116,8 @@ struct cdx_device {
  * @probe: Function called when a device is added
  * @remove: Function called when a device is removed
  * @shutdown: Function called at shutdown time to quiesce the device
+ * @reset_prepare: Function called before is reset to notify driver
+ * @reset_done: Function called after reset is complete to notify driver
  * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA.
  *		For most device drivers, no need to care about this flag
  *		as long as all DMAs are handled through the kernel DMA API.
@@ -115,6 +132,8 @@ struct cdx_driver {
 	int (*probe)(struct cdx_device *dev);
 	int (*remove)(struct cdx_device *dev);
 	void (*shutdown)(struct cdx_device *dev);
+	void (*reset_prepare)(struct cdx_device *dev);
+	void (*reset_done)(struct cdx_device *dev);
 	bool driver_managed_dma;
 };
 
@@ -144,4 +163,12 @@ void cdx_driver_unregister(struct cdx_driver *cdx_driver);
 
 extern struct bus_type cdx_bus_type;
 
+/**
+ * cdx_dev_reset - Reset CDX device
+ * @dev: device pointer
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_dev_reset(struct device *dev);
+
 #endif /* _CDX_BUS_H_ */
-- 
cgit v1.2.3


From 74df43b3f626a3594a4de50556048852bf2753f7 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 28 Mar 2023 17:41:04 +0200
Subject: mtd: spi-nor: Enhance locking to support reads while writes

On devices featuring several banks, the Read While Write (RWW) feature
is here to improve the overall performance when performing parallel
reads and writes at different locations (different banks). The following
constraints have to be taken into account:
1#: A single operation can be performed in a given bank.
2#: Only a single program or erase operation can happen on the entire
    chip (common hardware limitation to limit costs)
3#: Reads must remain serialized even though reads crossing bank
    boundaries are allowed.
4#: The I/O bus is unique and thus is the most constrained resource, all
    spi-nor operations requiring access to the spi bus (through the spi
    controller) must be serialized until the bus exchanges are over. So
    we must ensure a single operation can be "sent" at a time.
5#: Any other operation that would not be either a read or a write or an
    erase is considered requiring access to the full chip and cannot be
    parallelized, we then need to ensure the full chip is in the idle
    state when this occurs.

All these constraints can easily be managed with a proper locking model:
1#: Is enforced by a bitfield of the in-use banks, so that only a single
    operation can happen in a specific bank at any time.
2#: Is handled by the ongoing_pe boolean which is set before any write
    or erase, and is released only at the very end of the
    operation. This way, no other destructive operation on the chip can
    start during this time frame.
3#: An ongoing_rd boolean allows to track the ongoing reads, so that
    only one can be performed at a time.
4#: An ongoing_io boolean is introduced in order to capture and serialize
    bus accessed. This is the one being released "sooner" than before,
    because we only need to protect the chip against other SPI accesses
    during the I/O phase, which for the destructive operations is the
    beginning of the operation (when we send the command cycles and
    possibly the data), while the second part of the operation (the
    erase delay or the programmation delay) is when we can do something
    else in another bank.
5#: Is handled by the three booleans presented above, if any of them is
    set, the chip is not yet ready for the operation and must wait.

All these internal variables are protected by the existing lock, so that
changes in this structure are atomic. The serialization is handled with
a wait queue.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20230328154105.448540-8-miquel.raynal@bootlin.com
Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
---
 drivers/mtd/spi-nor/core.c  | 337 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/mtd/spi-nor.h |  13 ++
 2 files changed, 334 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 75ec7a3d0b43..9e6a0730cdb8 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -588,6 +588,65 @@ int spi_nor_sr_ready(struct spi_nor *nor)
 	return !(nor->bouncebuf[0] & SR_WIP);
 }
 
+/**
+ * spi_nor_use_parallel_locking() - Checks if RWW locking scheme shall be used
+ * @nor:	pointer to 'struct spi_nor'.
+ *
+ * Return: true if parallel locking is enabled, false otherwise.
+ */
+static bool spi_nor_use_parallel_locking(struct spi_nor *nor)
+{
+	return nor->flags & SNOR_F_RWW;
+}
+
+/* Locking helpers for status read operations */
+static int spi_nor_rww_start_rdst(struct spi_nor *nor)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	int ret = -EAGAIN;
+
+	mutex_lock(&nor->lock);
+
+	if (rww->ongoing_io || rww->ongoing_rd)
+		goto busy;
+
+	rww->ongoing_io = true;
+	rww->ongoing_rd = true;
+	ret = 0;
+
+busy:
+	mutex_unlock(&nor->lock);
+	return ret;
+}
+
+static void spi_nor_rww_end_rdst(struct spi_nor *nor)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+
+	mutex_lock(&nor->lock);
+
+	rww->ongoing_io = false;
+	rww->ongoing_rd = false;
+
+	mutex_unlock(&nor->lock);
+}
+
+static int spi_nor_lock_rdst(struct spi_nor *nor)
+{
+	if (spi_nor_use_parallel_locking(nor))
+		return spi_nor_rww_start_rdst(nor);
+
+	return 0;
+}
+
+static void spi_nor_unlock_rdst(struct spi_nor *nor)
+{
+	if (spi_nor_use_parallel_locking(nor)) {
+		spi_nor_rww_end_rdst(nor);
+		wake_up(&nor->rww.wait);
+	}
+}
+
 /**
  * spi_nor_ready() - Query the flash to see if it is ready for new commands.
  * @nor:	pointer to 'struct spi_nor'.
@@ -596,11 +655,21 @@ int spi_nor_sr_ready(struct spi_nor *nor)
  */
 static int spi_nor_ready(struct spi_nor *nor)
 {
+	int ret;
+
+	ret = spi_nor_lock_rdst(nor);
+	if (ret)
+		return 0;
+
 	/* Flashes might override the standard routine. */
 	if (nor->params->ready)
-		return nor->params->ready(nor);
+		ret = nor->params->ready(nor);
+	else
+		ret = spi_nor_sr_ready(nor);
 
-	return spi_nor_sr_ready(nor);
+	spi_nor_unlock_rdst(nor);
+
+	return ret;
 }
 
 /**
@@ -1086,7 +1155,88 @@ static void spi_nor_unprep(struct spi_nor *nor)
 		nor->controller_ops->unprepare(nor);
 }
 
+static void spi_nor_offset_to_banks(u64 bank_size, loff_t start, size_t len,
+				    u8 *first, u8 *last)
+{
+	/* This is currently safe, the number of banks being very small */
+	*first = DIV_ROUND_DOWN_ULL(start, bank_size);
+	*last = DIV_ROUND_DOWN_ULL(start + len - 1, bank_size);
+}
+
 /* Generic helpers for internal locking and serialization */
+static bool spi_nor_rww_start_io(struct spi_nor *nor)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	bool start = false;
+
+	mutex_lock(&nor->lock);
+
+	if (rww->ongoing_io)
+		goto busy;
+
+	rww->ongoing_io = true;
+	start = true;
+
+busy:
+	mutex_unlock(&nor->lock);
+	return start;
+}
+
+static void spi_nor_rww_end_io(struct spi_nor *nor)
+{
+	mutex_lock(&nor->lock);
+	nor->rww.ongoing_io = false;
+	mutex_unlock(&nor->lock);
+}
+
+static int spi_nor_lock_device(struct spi_nor *nor)
+{
+	if (!spi_nor_use_parallel_locking(nor))
+		return 0;
+
+	return wait_event_killable(nor->rww.wait, spi_nor_rww_start_io(nor));
+}
+
+static void spi_nor_unlock_device(struct spi_nor *nor)
+{
+	if (spi_nor_use_parallel_locking(nor)) {
+		spi_nor_rww_end_io(nor);
+		wake_up(&nor->rww.wait);
+	}
+}
+
+/* Generic helpers for internal locking and serialization */
+static bool spi_nor_rww_start_exclusive(struct spi_nor *nor)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	bool start = false;
+
+	mutex_lock(&nor->lock);
+
+	if (rww->ongoing_io || rww->ongoing_rd || rww->ongoing_pe)
+		goto busy;
+
+	rww->ongoing_io = true;
+	rww->ongoing_rd = true;
+	rww->ongoing_pe = true;
+	start = true;
+
+busy:
+	mutex_unlock(&nor->lock);
+	return start;
+}
+
+static void spi_nor_rww_end_exclusive(struct spi_nor *nor)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+
+	mutex_lock(&nor->lock);
+	rww->ongoing_io = false;
+	rww->ongoing_rd = false;
+	rww->ongoing_pe = false;
+	mutex_unlock(&nor->lock);
+}
+
 int spi_nor_prep_and_lock(struct spi_nor *nor)
 {
 	int ret;
@@ -1095,19 +1245,75 @@ int spi_nor_prep_and_lock(struct spi_nor *nor)
 	if (ret)
 		return ret;
 
-	mutex_lock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor))
+		mutex_lock(&nor->lock);
+	else
+		ret = wait_event_killable(nor->rww.wait,
+					  spi_nor_rww_start_exclusive(nor));
 
-	return 0;
+	return ret;
 }
 
 void spi_nor_unlock_and_unprep(struct spi_nor *nor)
 {
-	mutex_unlock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor)) {
+		mutex_unlock(&nor->lock);
+	} else {
+		spi_nor_rww_end_exclusive(nor);
+		wake_up(&nor->rww.wait);
+	}
 
 	spi_nor_unprep(nor);
 }
 
 /* Internal locking helpers for program and erase operations */
+static bool spi_nor_rww_start_pe(struct spi_nor *nor, loff_t start, size_t len)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	unsigned int used_banks = 0;
+	bool started = false;
+	u8 first, last;
+	int bank;
+
+	mutex_lock(&nor->lock);
+
+	if (rww->ongoing_io || rww->ongoing_rd || rww->ongoing_pe)
+		goto busy;
+
+	spi_nor_offset_to_banks(nor->params->bank_size, start, len, &first, &last);
+	for (bank = first; bank <= last; bank++) {
+		if (rww->used_banks & BIT(bank))
+			goto busy;
+
+		used_banks |= BIT(bank);
+	}
+
+	rww->used_banks |= used_banks;
+	rww->ongoing_pe = true;
+	started = true;
+
+busy:
+	mutex_unlock(&nor->lock);
+	return started;
+}
+
+static void spi_nor_rww_end_pe(struct spi_nor *nor, loff_t start, size_t len)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	u8 first, last;
+	int bank;
+
+	mutex_lock(&nor->lock);
+
+	spi_nor_offset_to_banks(nor->params->bank_size, start, len, &first, &last);
+	for (bank = first; bank <= last; bank++)
+		rww->used_banks &= ~BIT(bank);
+
+	rww->ongoing_pe = false;
+
+	mutex_unlock(&nor->lock);
+}
+
 static int spi_nor_prep_and_lock_pe(struct spi_nor *nor, loff_t start, size_t len)
 {
 	int ret;
@@ -1116,19 +1322,77 @@ static int spi_nor_prep_and_lock_pe(struct spi_nor *nor, loff_t start, size_t le
 	if (ret)
 		return ret;
 
-	mutex_lock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor))
+		mutex_lock(&nor->lock);
+	else
+		ret = wait_event_killable(nor->rww.wait,
+					  spi_nor_rww_start_pe(nor, start, len));
 
-	return 0;
+	return ret;
 }
 
 static void spi_nor_unlock_and_unprep_pe(struct spi_nor *nor, loff_t start, size_t len)
 {
-	mutex_unlock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor)) {
+		mutex_unlock(&nor->lock);
+	} else {
+		spi_nor_rww_end_pe(nor, start, len);
+		wake_up(&nor->rww.wait);
+	}
 
 	spi_nor_unprep(nor);
 }
 
 /* Internal locking helpers for read operations */
+static bool spi_nor_rww_start_rd(struct spi_nor *nor, loff_t start, size_t len)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	unsigned int used_banks = 0;
+	bool started = false;
+	u8 first, last;
+	int bank;
+
+	mutex_lock(&nor->lock);
+
+	if (rww->ongoing_io || rww->ongoing_rd)
+		goto busy;
+
+	spi_nor_offset_to_banks(nor->params->bank_size, start, len, &first, &last);
+	for (bank = first; bank <= last; bank++) {
+		if (rww->used_banks & BIT(bank))
+			goto busy;
+
+		used_banks |= BIT(bank);
+	}
+
+	rww->used_banks |= used_banks;
+	rww->ongoing_io = true;
+	rww->ongoing_rd = true;
+	started = true;
+
+busy:
+	mutex_unlock(&nor->lock);
+	return started;
+}
+
+static void spi_nor_rww_end_rd(struct spi_nor *nor, loff_t start, size_t len)
+{
+	struct spi_nor_rww *rww = &nor->rww;
+	u8 first, last;
+	int bank;
+
+	mutex_lock(&nor->lock);
+
+	spi_nor_offset_to_banks(nor->params->bank_size, start, len, &first, &last);
+	for (bank = first; bank <= last; bank++)
+		nor->rww.used_banks &= ~BIT(bank);
+
+	rww->ongoing_io = false;
+	rww->ongoing_rd = false;
+
+	mutex_unlock(&nor->lock);
+}
+
 static int spi_nor_prep_and_lock_rd(struct spi_nor *nor, loff_t start, size_t len)
 {
 	int ret;
@@ -1137,14 +1401,23 @@ static int spi_nor_prep_and_lock_rd(struct spi_nor *nor, loff_t start, size_t le
 	if (ret)
 		return ret;
 
-	mutex_lock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor))
+		mutex_lock(&nor->lock);
+	else
+		ret = wait_event_killable(nor->rww.wait,
+					  spi_nor_rww_start_rd(nor, start, len));
 
-	return 0;
+	return ret;
 }
 
 static void spi_nor_unlock_and_unprep_rd(struct spi_nor *nor, loff_t start, size_t len)
 {
-	mutex_unlock(&nor->lock);
+	if (!spi_nor_use_parallel_locking(nor)) {
+		mutex_unlock(&nor->lock);
+	} else {
+		spi_nor_rww_end_rd(nor, start, len);
+		wake_up(&nor->rww.wait);
+	}
 
 	spi_nor_unprep(nor);
 }
@@ -1453,11 +1726,18 @@ static int spi_nor_erase_multi_sectors(struct spi_nor *nor, u64 addr, u32 len)
 			dev_vdbg(nor->dev, "erase_cmd->size = 0x%08x, erase_cmd->opcode = 0x%02x, erase_cmd->count = %u\n",
 				 cmd->size, cmd->opcode, cmd->count);
 
-			ret = spi_nor_write_enable(nor);
+			ret = spi_nor_lock_device(nor);
 			if (ret)
 				goto destroy_erase_cmd_list;
 
+			ret = spi_nor_write_enable(nor);
+			if (ret) {
+				spi_nor_unlock_device(nor);
+				goto destroy_erase_cmd_list;
+			}
+
 			ret = spi_nor_erase_sector(nor, addr);
+			spi_nor_unlock_device(nor);
 			if (ret)
 				goto destroy_erase_cmd_list;
 
@@ -1510,11 +1790,18 @@ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr)
 	if (len == mtd->size && !(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) {
 		unsigned long timeout;
 
-		ret = spi_nor_write_enable(nor);
+		ret = spi_nor_lock_device(nor);
 		if (ret)
 			goto erase_err;
 
+		ret = spi_nor_write_enable(nor);
+		if (ret) {
+			spi_nor_unlock_device(nor);
+			goto erase_err;
+		}
+
 		ret = spi_nor_erase_chip(nor);
+		spi_nor_unlock_device(nor);
 		if (ret)
 			goto erase_err;
 
@@ -1539,11 +1826,18 @@ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr)
 	/* "sector"-at-a-time erase */
 	} else if (spi_nor_has_uniform_erase(nor)) {
 		while (len) {
-			ret = spi_nor_write_enable(nor);
+			ret = spi_nor_lock_device(nor);
 			if (ret)
 				goto erase_err;
 
+			ret = spi_nor_write_enable(nor);
+			if (ret) {
+				spi_nor_unlock_device(nor);
+				goto erase_err;
+			}
+
 			ret = spi_nor_erase_sector(nor, addr);
+			spi_nor_unlock_device(nor);
 			if (ret)
 				goto erase_err;
 
@@ -1836,11 +2130,18 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 		addr = spi_nor_convert_addr(nor, addr);
 
-		ret = spi_nor_write_enable(nor);
+		ret = spi_nor_lock_device(nor);
 		if (ret)
 			goto write_err;
 
+		ret = spi_nor_write_enable(nor);
+		if (ret) {
+			spi_nor_unlock_device(nor);
+			goto write_err;
+		}
+
 		ret = spi_nor_write_data(nor, addr, page_remain, buf + i);
+		spi_nor_unlock_device(nor);
 		if (ret < 0)
 			goto write_err;
 		written = ret;
@@ -2531,7 +2832,8 @@ static void spi_nor_init_flags(struct spi_nor *nor)
 	if (flags & NO_CHIP_ERASE)
 		nor->flags |= SNOR_F_NO_OP_CHIP_ERASE;
 
-	if (flags & SPI_NOR_RWW)
+	if (flags & SPI_NOR_RWW && nor->info->n_banks > 1 &&
+	    !nor->controller_ops)
 		nor->flags |= SNOR_F_RWW;
 }
 
@@ -3128,6 +3430,9 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
 	if (ret)
 		return ret;
 
+	if (spi_nor_use_parallel_locking(nor))
+		init_waitqueue_head(&nor->rww.wait);
+
 	/*
 	 * Configure the SPI memory:
 	 * - select op codes for (Fast) Read, Page Program and Sector Erase.
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index a3f8cdca90c8..82547b4b3708 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -343,6 +343,12 @@ struct spi_nor_flash_parameter;
  * struct spi_nor - Structure for defining the SPI NOR layer
  * @mtd:		an mtd_info structure
  * @lock:		the lock for the read/write/erase/lock/unlock operations
+ * @rww:		Read-While-Write (RWW) sync lock
+ * @rww.wait:		wait queue for the RWW sync
+ * @rww.ongoing_io:	the bus is busy
+ * @rww.ongoing_rd:	a read is ongoing on the chip
+ * @rww.ongoing_pe:	a program/erase is ongoing on the chip
+ * @rww.used_banks:	bitmap of the banks in use
  * @dev:		pointer to an SPI device or an SPI NOR controller device
  * @spimem:		pointer to the SPI memory device
  * @bouncebuf:		bounce buffer used when the buffer passed by the MTD
@@ -376,6 +382,13 @@ struct spi_nor_flash_parameter;
 struct spi_nor {
 	struct mtd_info		mtd;
 	struct mutex		lock;
+	struct spi_nor_rww {
+		wait_queue_head_t wait;
+		bool		ongoing_io;
+		bool		ongoing_rd;
+		bool		ongoing_pe;
+		unsigned int	used_banks;
+	} rww;
 	struct device		*dev;
 	struct spi_mem		*spimem;
 	u8			*bouncebuf;
-- 
cgit v1.2.3


From e5a26a4048eeb9558e5c84f340a989c78db4adf4 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:08 -0700
Subject: tracing/user_events: Split header into uapi and kernel

The UAPI parts need to be split out from the kernel parts of user_events
now that other parts of the kernel will reference it. Do so by moving
the existing include/linux/user_events.h into
include/uapi/linux/user_events.h.

Link: https://lkml.kernel.org/r/20230328235219.203-2-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/user_events.h      | 52 +++++-----------------------------------
 include/uapi/linux/user_events.h | 48 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_user.c |  5 ----
 3 files changed, 54 insertions(+), 51 deletions(-)
 create mode 100644 include/uapi/linux/user_events.h

(limited to 'include')

diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 592a3fbed98e..13689589d36e 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -1,54 +1,14 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2021, Microsoft Corporation.
+ * Copyright (c) 2022, Microsoft Corporation.
  *
  * Authors:
  *   Beau Belgrave <beaub@linux.microsoft.com>
  */
-#ifndef _UAPI_LINUX_USER_EVENTS_H
-#define _UAPI_LINUX_USER_EVENTS_H
 
-#include <linux/types.h>
-#include <linux/ioctl.h>
+#ifndef _LINUX_USER_EVENTS_H
+#define _LINUX_USER_EVENTS_H
 
-#ifdef __KERNEL__
-#include <linux/uio.h>
-#else
-#include <sys/uio.h>
-#endif
+#include <uapi/linux/user_events.h>
 
-#define USER_EVENTS_SYSTEM "user_events"
-#define USER_EVENTS_PREFIX "u:"
-
-/* Create dynamic location entry within a 32-bit value */
-#define DYN_LOC(offset, size) ((size) << 16 | (offset))
-
-/*
- * Describes an event registration and stores the results of the registration.
- * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
- * must set the size and name_args before invocation.
- */
-struct user_reg {
-
-	/* Input: Size of the user_reg structure being used */
-	__u32 size;
-
-	/* Input: Pointer to string with event name, description and flags */
-	__u64 name_args;
-
-	/* Output: Bitwise index of the event within the status page */
-	__u32 status_bit;
-
-	/* Output: Index of the event to use when writing data */
-	__u32 write_index;
-} __attribute__((__packed__));
-
-#define DIAG_IOC_MAGIC '*'
-
-/* Requests to register a user_event */
-#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*)
-
-/* Requests to delete a user_event */
-#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*)
-
-#endif /* _UAPI_LINUX_USER_EVENTS_H */
+#endif /* _LINUX_USER_EVENTS_H */
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
new file mode 100644
index 000000000000..03f92366068d
--- /dev/null
+++ b/include/uapi/linux/user_events.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2021-2022, Microsoft Corporation.
+ *
+ * Authors:
+ *   Beau Belgrave <beaub@linux.microsoft.com>
+ */
+#ifndef _UAPI_LINUX_USER_EVENTS_H
+#define _UAPI_LINUX_USER_EVENTS_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_PREFIX "u:"
+
+/* Create dynamic location entry within a 32-bit value */
+#define DYN_LOC(offset, size) ((size) << 16 | (offset))
+
+/*
+ * Describes an event registration and stores the results of the registration.
+ * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
+ * must set the size and name_args before invocation.
+ */
+struct user_reg {
+
+	/* Input: Size of the user_reg structure being used */
+	__u32 size;
+
+	/* Input: Pointer to string with event name, description and flags */
+	__u64 name_args;
+
+	/* Output: Bitwise index of the event within the status page */
+	__u32 status_bit;
+
+	/* Output: Index of the event to use when writing data */
+	__u32 write_index;
+} __attribute__((__packed__));
+
+#define DIAG_IOC_MAGIC '*'
+
+/* Request to register a user_event */
+#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg *)
+
+/* Request to delete a user_event */
+#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *)
+
+#endif /* _UAPI_LINUX_USER_EVENTS_H */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 908e8a13c675..070551480747 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -19,12 +19,7 @@
 #include <linux/tracefs.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
-/* Reminder to move to uapi when everything works */
-#ifdef CONFIG_COMPILE_TEST
 #include <linux/user_events.h>
-#else
-#include <uapi/linux/user_events.h>
-#endif
 #include "trace.h"
 #include "trace_dynevent.h"
 
-- 
cgit v1.2.3


From fd593511cdfc0b0e38af2eb21c99f5154a1d7acf Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:09 -0700
Subject: tracing/user_events: Track fork/exec/exit for mm lifetime

During tracefs discussions it was decided instead of requiring a mapping
within a user-process to track the lifetime of memory descriptors we
should hook the appropriate calls. Do this by adding the minimal stubs
required for task fork, exec, and exit. Currently this is just a NOP.
Future patches will implement these calls fully.

Link: https://lkml.kernel.org/r/20230328235219.203-3-beaub@linux.microsoft.com

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/exec.c                   |  2 ++
 include/linux/sched.h       |  5 +++++
 include/linux/user_events.h | 18 ++++++++++++++++++
 kernel/exit.c               |  2 ++
 kernel/fork.c               |  2 ++
 5 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..2b0042f8deec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
 #include <linux/syscall_user_dispatch.h>
 #include <linux/coredump.h>
 #include <linux/time_namespace.h>
+#include <linux/user_events.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	user_events_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..bf37846e90c2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -69,6 +69,7 @@ struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct user_event_mm;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1528,6 +1529,10 @@ struct task_struct {
 	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
 #endif
 
+#ifdef CONFIG_USER_EVENTS
+	struct user_event_mm		*user_event_mm;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 13689589d36e..3d747c45d2fa 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -11,4 +11,22 @@
 
 #include <uapi/linux/user_events.h>
 
+#ifdef CONFIG_USER_EVENTS
+struct user_event_mm {
+};
+#endif
+
+static inline void user_events_fork(struct task_struct *t,
+				    unsigned long clone_flags)
+{
+}
+
+static inline void user_events_execve(struct task_struct *t)
+{
+}
+
+static inline void user_events_exit(struct task_struct *t)
+{
+}
+
 #endif /* _LINUX_USER_EVENTS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index f2afdb0add7c..875d6a134df8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
 #include <linux/kprobes.h>
 #include <linux/rethook.h>
 #include <linux/sysfs.h>
+#include <linux/user_events.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -818,6 +819,7 @@ void __noreturn do_exit(long code)
 
 	coredump_task_exit(tsk);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
+	user_events_exit(tsk);
 
 	validate_creds_for_do_exit(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index d8cda4c6de6c..efb1f2257772 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
 #include <linux/stackprotector.h>
+#include <linux/user_events.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2505,6 +2506,7 @@ static __latent_entropy struct task_struct *copy_process(
 
 	trace_task_newtask(p, clone_flags);
 	uprobe_copy_process(p, clone_flags);
+	user_events_fork(p, clone_flags);
 
 	copy_oom_score_adj(clone_flags, p);
 
-- 
cgit v1.2.3


From 7235759084a4f8524a46bd2638885ff3b34ce279 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:10 -0700
Subject: tracing/user_events: Use remote writes for event enablement

As part of the discussions for user_events aligned with user space
tracers, it was determined that user programs should register a aligned
value to set or clear a bit when an event becomes enabled. Currently a
shared page is being used that requires mmap(). Remove the shared page
implementation and move to a user registered address implementation.

In this new model during the event registration from user programs 3 new
values are specified. The first is the address to update when the event
is either enabled or disabled. The second is the bit to set/clear to
reflect the event being enabled. The third is the size of the value at
the specified address.

This allows for a local 32/64-bit value in user programs to support
both kernel and user tracers. As an example, setting bit 31 for kernel
tracers when the event becomes enabled allows for user tracers to use
the other bits for ref counts or other flags. The kernel side updates
the bit atomically, user programs need to also update these values
atomically.

User provided addresses must be aligned on a natural boundary, this
allows for single page checking and prevents odd behaviors such as a
enable value straddling 2 pages instead of a single page. Currently
page faults are only logged, future patches will handle these.

Link: https://lkml.kernel.org/r/20230328235219.203-4-beaub@linux.microsoft.com

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/user_events.h      |  53 +++-
 include/uapi/linux/user_events.h |  15 +-
 kernel/trace/Kconfig             |   5 +-
 kernel/trace/trace_events_user.c | 586 ++++++++++++++++++++++++++++++---------
 4 files changed, 517 insertions(+), 142 deletions(-)

(limited to 'include')

diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 3d747c45d2fa..0120b3dd5b03 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -9,13 +9,63 @@
 #ifndef _LINUX_USER_EVENTS_H
 #define _LINUX_USER_EVENTS_H
 
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/mm_types.h>
+#include <linux/workqueue.h>
 #include <uapi/linux/user_events.h>
 
 #ifdef CONFIG_USER_EVENTS
 struct user_event_mm {
+	struct list_head link;
+	struct list_head enablers;
+	struct mm_struct *mm;
+	struct user_event_mm *next;
+	refcount_t refcnt;
+	refcount_t tasks;
+	struct rcu_work put_rwork;
 };
-#endif
 
+extern void user_event_mm_dup(struct task_struct *t,
+			      struct user_event_mm *old_mm);
+
+extern void user_event_mm_remove(struct task_struct *t);
+
+static inline void user_events_fork(struct task_struct *t,
+				    unsigned long clone_flags)
+{
+	struct user_event_mm *old_mm;
+
+	if (!t || !current->user_event_mm)
+		return;
+
+	old_mm = current->user_event_mm;
+
+	if (clone_flags & CLONE_VM) {
+		t->user_event_mm = old_mm;
+		refcount_inc(&old_mm->tasks);
+		return;
+	}
+
+	user_event_mm_dup(t, old_mm);
+}
+
+static inline void user_events_execve(struct task_struct *t)
+{
+	if (!t || !t->user_event_mm)
+		return;
+
+	user_event_mm_remove(t);
+}
+
+static inline void user_events_exit(struct task_struct *t)
+{
+	if (!t || !t->user_event_mm)
+		return;
+
+	user_event_mm_remove(t);
+}
+#else
 static inline void user_events_fork(struct task_struct *t,
 				    unsigned long clone_flags)
 {
@@ -28,5 +78,6 @@ static inline void user_events_execve(struct task_struct *t)
 static inline void user_events_exit(struct task_struct *t)
 {
 }
+#endif /* CONFIG_USER_EVENTS */
 
 #endif /* _LINUX_USER_EVENTS_H */
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 03f92366068d..22521bc622db 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -27,12 +27,21 @@ struct user_reg {
 	/* Input: Size of the user_reg structure being used */
 	__u32 size;
 
+	/* Input: Bit in enable address to use */
+	__u8 enable_bit;
+
+	/* Input: Enable size in bytes at address */
+	__u8 enable_size;
+
+	/* Input: Flags for future use, set to 0 */
+	__u16 flags;
+
+	/* Input: Address to update when enabled */
+	__u64 enable_addr;
+
 	/* Input: Pointer to string with event name, description and flags */
 	__u64 name_args;
 
-	/* Output: Bitwise index of the event within the status page */
-	__u32 status_bit;
-
 	/* Output: Index of the event to use when writing data */
 	__u32 write_index;
 } __attribute__((__packed__));
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5b1e7fa41ca8..c7020e071bf9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -798,9 +798,10 @@ config USER_EVENTS
 	  can be used like an existing kernel trace event.  User trace
 	  events are generated by writing to a tracefs file.  User
 	  processes can determine if their tracing events should be
-	  generated by memory mapping a tracefs file and checking for
-	  an associated byte being non-zero.
+	  generated by registering a value and bit with the kernel
+	  that reflects when it is enabled or not.
 
+	  See Documentation/trace/user_events.rst.
 	  If in doubt, say N.
 
 config HIST_TRIGGERS
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 070551480747..553a82ee7aeb 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -19,6 +19,7 @@
 #include <linux/tracefs.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+#include <linux/highmem.h>
 #include <linux/user_events.h>
 #include "trace.h"
 #include "trace_dynevent.h"
@@ -29,34 +30,11 @@
 #define FIELD_DEPTH_NAME 1
 #define FIELD_DEPTH_SIZE 2
 
-/*
- * Limits how many trace_event calls user processes can create:
- * Must be a power of two of PAGE_SIZE.
- */
-#define MAX_PAGE_ORDER 0
-#define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
-#define MAX_EVENTS (MAX_BYTES * 8)
-
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
-/*
- * The MAP_STATUS_* macros are used for taking a index and determining the
- * appropriate byte and the bit in the byte to set/reset for an event.
- *
- * The lower 3 bits of the index decide which bit to set.
- * The remaining upper bits of the index decide which byte to use for the bit.
- *
- * This is used when an event has a probe attached/removed to reflect live
- * status of the event wanting tracing or not to user-programs via shared
- * memory maps.
- */
-#define MAP_STATUS_BYTE(index) ((index) >> 3)
-#define MAP_STATUS_MASK(index) BIT((index) & 7)
-
 /*
  * Internal bits (kernel side only) to keep track of connected probes:
  * These are used when status is requested in text form about an event. These
@@ -70,20 +48,14 @@
 #define EVENT_STATUS_OTHER BIT(7)
 
 /*
- * Stores the pages, tables, and locks for a group of events.
- * Each logical grouping of events has its own group, with a
- * matching page for status checks within user programs. This
- * allows for isolation of events to user programs by various
- * means.
+ * Stores the system name, tables, and locks for a group of events. This
+ * allows isolation for events by various means.
  */
 struct user_event_group {
-	struct page *pages;
-	char *register_page_data;
 	char *system_name;
 	struct hlist_node node;
 	struct mutex reg_mutex;
 	DECLARE_HASHTABLE(register_table, 8);
-	DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
 };
 
 /* Group for init_user_ns mapping, top-most group */
@@ -106,12 +78,34 @@ struct user_event {
 	struct list_head fields;
 	struct list_head validators;
 	refcount_t refcnt;
-	int index;
-	int flags;
 	int min_size;
 	char status;
 };
 
+/*
+ * Stores per-mm/event properties that enable an address to be
+ * updated properly for each task. As tasks are forked, we use
+ * these to track enablement sites that are tied to an event.
+ */
+struct user_event_enabler {
+	struct list_head link;
+	struct user_event *event;
+	unsigned long addr;
+
+	/* Track enable bit, flags, etc. Aligned for bitops. */
+	unsigned int values;
+};
+
+/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
+#define ENABLE_VAL_BIT_MASK 0x3F
+
+/* Only duplicate the bit value */
+#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+
+/* Global list of memory descriptors using user_events */
+static LIST_HEAD(user_event_mms);
+static DEFINE_SPINLOCK(user_event_mms_lock);
+
 /*
  * Stores per-file events references, as users register events
  * within a file this structure is modified and freed via RCU.
@@ -145,33 +139,17 @@ static int user_event_parse(struct user_event_group *group, char *name,
 			    char *args, char *flags,
 			    struct user_event **newuser);
 
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
+static void user_event_mm_put(struct user_event_mm *mm);
+
 static u32 user_event_key(char *name)
 {
 	return jhash(name, strlen(name), 0);
 }
 
-static void set_page_reservations(char *pages, bool set)
-{
-	int page;
-
-	for (page = 0; page < MAX_PAGES; ++page) {
-		void *addr = pages + (PAGE_SIZE * page);
-
-		if (set)
-			SetPageReserved(virt_to_page(addr));
-		else
-			ClearPageReserved(virt_to_page(addr));
-	}
-}
-
 static void user_event_group_destroy(struct user_event_group *group)
 {
-	if (group->register_page_data)
-		set_page_reservations(group->register_page_data, false);
-
-	if (group->pages)
-		__free_pages(group->pages, MAX_PAGE_ORDER);
-
 	kfree(group->system_name);
 	kfree(group);
 }
@@ -242,19 +220,6 @@ static struct user_event_group
 	if (!group->system_name)
 		goto error;
 
-	group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-
-	if (!group->pages)
-		goto error;
-
-	group->register_page_data = page_address(group->pages);
-
-	set_page_reservations(group->register_page_data, true);
-
-	/* Zero all bits beside 0 (which is reserved for failures) */
-	bitmap_zero(group->page_bitmap, MAX_EVENTS);
-	set_bit(0, group->page_bitmap);
-
 	mutex_init(&group->reg_mutex);
 	hash_init(group->register_table);
 
@@ -266,20 +231,367 @@ error:
 	return NULL;
 };
 
-static __always_inline
-void user_event_register_set(struct user_event *user)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+{
+	list_del_rcu(&enabler->link);
+
+	/* No longer tracking the event via the enabler */
+	refcount_dec(&enabler->event->refcnt);
+
+	kfree(enabler);
+}
+
+static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr)
+{
+	bool unlocked;
+	int ret;
+
+	mmap_read_lock(mm->mm);
+
+	/* Ensure MM has tasks, cannot use after exit_mm() */
+	if (refcount_read(&mm->tasks) == 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+			       &unlocked);
+out:
+	mmap_read_unlock(mm->mm);
+
+	return ret;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+				    struct user_event_enabler *enabler)
+{
+	unsigned long uaddr = enabler->addr;
+	unsigned long *ptr;
+	struct page *page;
+	void *kaddr;
+	int ret;
+
+	lockdep_assert_held(&event_mutex);
+	mmap_assert_locked(mm->mm);
+
+	/* Ensure MM has tasks, cannot use after exit_mm() */
+	if (refcount_read(&mm->tasks) == 0)
+		return -ENOENT;
+
+	ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
+				    &page, NULL, NULL);
+
+	if (ret <= 0) {
+		pr_warn("user_events: Enable write failed\n");
+		return -EFAULT;
+	}
+
+	kaddr = kmap_local_page(page);
+	ptr = kaddr + (uaddr & ~PAGE_MASK);
+
+	/* Update bit atomically, user tracers must be atomic as well */
+	if (enabler->event && enabler->event->status)
+		set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+	else
+		clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+
+	kunmap_local(kaddr);
+	unpin_user_pages_dirty_lock(&page, 1, true);
+
+	return 0;
+}
+
+static void user_event_enabler_update(struct user_event *user)
+{
+	struct user_event_enabler *enabler;
+	struct user_event_mm *mm = user_event_mm_get_all(user);
+	struct user_event_mm *next;
+
+	while (mm) {
+		next = mm->next;
+		mmap_read_lock(mm->mm);
+		rcu_read_lock();
+
+		list_for_each_entry_rcu(enabler, &mm->enablers, link)
+			if (enabler->event == user)
+				user_event_enabler_write(mm, enabler);
+
+		rcu_read_unlock();
+		mmap_read_unlock(mm->mm);
+		user_event_mm_put(mm);
+		mm = next;
+	}
+}
+
+static bool user_event_enabler_dup(struct user_event_enabler *orig,
+				   struct user_event_mm *mm)
+{
+	struct user_event_enabler *enabler;
+
+	enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT);
+
+	if (!enabler)
+		return false;
+
+	enabler->event = orig->event;
+	enabler->addr = orig->addr;
+
+	/* Only dup part of value (ignore future flags, etc) */
+	enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
+
+	refcount_inc(&enabler->event->refcnt);
+	list_add_rcu(&enabler->link, &mm->enablers);
+
+	return true;
+}
+
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm)
+{
+	refcount_inc(&mm->refcnt);
+
+	return mm;
+}
+
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
+{
+	struct user_event_mm *found = NULL;
+	struct user_event_enabler *enabler;
+	struct user_event_mm *mm;
+
+	/*
+	 * We do not want to block fork/exec while enablements are being
+	 * updated, so we use RCU to walk the current tasks that have used
+	 * user_events ABI for 1 or more events. Each enabler found in each
+	 * task that matches the event being updated has a write to reflect
+	 * the kernel state back into the process. Waits/faults must not occur
+	 * during this. So we scan the list under RCU for all the mm that have
+	 * the event within it. This is needed because mm_read_lock() can wait.
+	 * Each user mm returned has a ref inc to handle remove RCU races.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(mm, &user_event_mms, link)
+		list_for_each_entry_rcu(enabler, &mm->enablers, link)
+			if (enabler->event == user) {
+				mm->next = found;
+				found = user_event_mm_get(mm);
+				break;
+			}
+
+	rcu_read_unlock();
+
+	return found;
+}
+
+static struct user_event_mm *user_event_mm_create(struct task_struct *t)
+{
+	struct user_event_mm *user_mm;
+	unsigned long flags;
+
+	user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL);
+
+	if (!user_mm)
+		return NULL;
+
+	user_mm->mm = t->mm;
+	INIT_LIST_HEAD(&user_mm->enablers);
+	refcount_set(&user_mm->refcnt, 1);
+	refcount_set(&user_mm->tasks, 1);
+
+	spin_lock_irqsave(&user_event_mms_lock, flags);
+	list_add_rcu(&user_mm->link, &user_event_mms);
+	spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+	t->user_event_mm = user_mm;
+
+	/*
+	 * The lifetime of the memory descriptor can slightly outlast
+	 * the task lifetime if a ref to the user_event_mm is taken
+	 * between list_del_rcu() and call_rcu(). Therefore we need
+	 * to take a reference to it to ensure it can live this long
+	 * under this corner case. This can also occur in clones that
+	 * outlast the parent.
+	 */
+	mmgrab(user_mm->mm);
+
+	return user_mm;
+}
+
+static struct user_event_mm *current_user_event_mm(void)
+{
+	struct user_event_mm *user_mm = current->user_event_mm;
+
+	if (user_mm)
+		goto inc;
+
+	user_mm = user_event_mm_create(current);
+
+	if (!user_mm)
+		goto error;
+inc:
+	refcount_inc(&user_mm->refcnt);
+error:
+	return user_mm;
+}
+
+static void user_event_mm_destroy(struct user_event_mm *mm)
+{
+	struct user_event_enabler *enabler, *next;
+
+	list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+		user_event_enabler_destroy(enabler);
+
+	mmdrop(mm->mm);
+	kfree(mm);
+}
+
+static void user_event_mm_put(struct user_event_mm *mm)
+{
+	if (mm && refcount_dec_and_test(&mm->refcnt))
+		user_event_mm_destroy(mm);
+}
+
+static void delayed_user_event_mm_put(struct work_struct *work)
+{
+	struct user_event_mm *mm;
+
+	mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork);
+	user_event_mm_put(mm);
+}
+
+void user_event_mm_remove(struct task_struct *t)
 {
-	int i = user->index;
+	struct user_event_mm *mm;
+	unsigned long flags;
+
+	might_sleep();
+
+	mm = t->user_event_mm;
+	t->user_event_mm = NULL;
+
+	/* Clone will increment the tasks, only remove if last clone */
+	if (!refcount_dec_and_test(&mm->tasks))
+		return;
+
+	/* Remove the mm from the list, so it can no longer be enabled */
+	spin_lock_irqsave(&user_event_mms_lock, flags);
+	list_del_rcu(&mm->link);
+	spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+	/*
+	 * We need to wait for currently occurring writes to stop within
+	 * the mm. This is required since exit_mm() snaps the current rss
+	 * stats and clears them. On the final mmdrop(), check_mm() will
+	 * report a bug if these increment.
+	 *
+	 * All writes/pins are done under mmap_read lock, take the write
+	 * lock to ensure in-progress faults have completed. Faults that
+	 * are pending but yet to run will check the task count and skip
+	 * the fault since the mm is going away.
+	 */
+	mmap_write_lock(mm->mm);
+	mmap_write_unlock(mm->mm);
 
-	user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+	/*
+	 * Put for mm must be done after RCU delay to handle new refs in
+	 * between the list_del_rcu() and now. This ensures any get refs
+	 * during rcu_read_lock() are accounted for during list removal.
+	 *
+	 * CPU A			|	CPU B
+	 * ---------------------------------------------------------------
+	 * user_event_mm_remove()	|	rcu_read_lock();
+	 * list_del_rcu()		|	list_for_each_entry_rcu();
+	 * call_rcu()			|	refcount_inc();
+	 * .				|	rcu_read_unlock();
+	 * schedule_work()		|	.
+	 * user_event_mm_put()		|	.
+	 *
+	 * mmdrop() cannot be called in the softirq context of call_rcu()
+	 * so we use a work queue after call_rcu() to run within.
+	 */
+	INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
+	queue_rcu_work(system_wq, &mm->put_rwork);
 }
 
-static __always_inline
-void user_event_register_clear(struct user_event *user)
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
 {
-	int i = user->index;
+	struct user_event_mm *mm = user_event_mm_create(t);
+	struct user_event_enabler *enabler;
 
-	user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+	if (!mm)
+		return;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+		if (!user_event_enabler_dup(enabler, mm))
+			goto error;
+
+	rcu_read_unlock();
+
+	return;
+error:
+	rcu_read_unlock();
+	user_event_mm_remove(t);
+}
+
+static struct user_event_enabler
+*user_event_enabler_create(struct user_reg *reg, struct user_event *user,
+			   int *write_result)
+{
+	struct user_event_enabler *enabler;
+	struct user_event_mm *user_mm;
+	unsigned long uaddr = (unsigned long)reg->enable_addr;
+
+	user_mm = current_user_event_mm();
+
+	if (!user_mm)
+		return NULL;
+
+	enabler = kzalloc(sizeof(*enabler), GFP_KERNEL);
+
+	if (!enabler)
+		goto out;
+
+	enabler->event = user;
+	enabler->addr = uaddr;
+	enabler->values = reg->enable_bit;
+retry:
+	/* Prevents state changes from racing with new enablers */
+	mutex_lock(&event_mutex);
+
+	/* Attempt to reflect the current state within the process */
+	mmap_read_lock(user_mm->mm);
+	*write_result = user_event_enabler_write(user_mm, enabler);
+	mmap_read_unlock(user_mm->mm);
+
+	/*
+	 * If the write works, then we will track the enabler. A ref to the
+	 * underlying user_event is held by the enabler to prevent it going
+	 * away while the enabler is still in use by a process. The ref is
+	 * removed when the enabler is destroyed. This means a event cannot
+	 * be forcefully deleted from the system until all tasks using it
+	 * exit or run exec(), which includes forks and clones.
+	 */
+	if (!*write_result) {
+		refcount_inc(&enabler->event->refcnt);
+		list_add_rcu(&enabler->link, &user_mm->enablers);
+	}
+
+	mutex_unlock(&event_mutex);
+
+	if (*write_result) {
+		/* Attempt to fault-in and retry if it worked */
+		if (!user_event_mm_fault_in(user_mm, uaddr))
+			goto retry;
+
+		kfree(enabler);
+		enabler = NULL;
+	}
+out:
+	user_event_mm_put(user_mm);
+
+	return enabler;
 }
 
 static __always_inline __must_check
@@ -824,9 +1136,6 @@ static int destroy_user_event(struct user_event *user)
 		return ret;
 
 	dyn_event_remove(&user->devent);
-
-	user_event_register_clear(user);
-	clear_bit(user->index, user->group->page_bitmap);
 	hash_del(&user->node);
 
 	user_event_destroy_validators(user);
@@ -972,9 +1281,9 @@ discard:
 #endif
 
 /*
- * Update the register page that is shared between user processes.
+ * Update the enabled bit among all user processes.
  */
-static void update_reg_page_for(struct user_event *user)
+static void update_enable_bit_for(struct user_event *user)
 {
 	struct tracepoint *tp = &user->tracepoint;
 	char status = 0;
@@ -1005,12 +1314,9 @@ static void update_reg_page_for(struct user_event *user)
 		rcu_read_unlock_sched();
 	}
 
-	if (status)
-		user_event_register_set(user);
-	else
-		user_event_register_clear(user);
-
 	user->status = status;
+
+	user_event_enabler_update(user);
 }
 
 /*
@@ -1067,10 +1373,10 @@ static int user_event_reg(struct trace_event_call *call,
 	return ret;
 inc:
 	refcount_inc(&user->refcnt);
-	update_reg_page_for(user);
+	update_enable_bit_for(user);
 	return 0;
 dec:
-	update_reg_page_for(user);
+	update_enable_bit_for(user);
 	refcount_dec(&user->refcnt);
 	return 0;
 }
@@ -1266,7 +1572,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
 			    struct user_event **newuser)
 {
 	int ret;
-	int index;
 	u32 key;
 	struct user_event *user;
 
@@ -1285,11 +1590,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
 		return 0;
 	}
 
-	index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
-
-	if (index == MAX_EVENTS)
-		return -EMFILE;
-
 	user = kzalloc(sizeof(*user), GFP_KERNEL);
 
 	if (!user)
@@ -1335,14 +1635,11 @@ static int user_event_parse(struct user_event_group *group, char *name,
 	if (ret)
 		goto put_user_lock;
 
-	user->index = index;
-
 	/* Ensure we track self ref and caller ref (2) */
 	refcount_set(&user->refcnt, 2);
 
 	dyn_event_init(&user->devent, &user_event_dops);
 	dyn_event_add(&user->devent, &user->call);
-	set_bit(user->index, group->page_bitmap);
 	hash_add(group->register_table, &user->node, key);
 
 	mutex_unlock(&event_mutex);
@@ -1559,6 +1856,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
 	if (ret)
 		return ret;
 
+	/* Ensure no flags, since we don't support any yet */
+	if (kreg->flags != 0)
+		return -EINVAL;
+
+	/* Ensure supported size */
+	switch (kreg->enable_size) {
+	case 4:
+		/* 32-bit */
+		break;
+#if BITS_PER_LONG >= 64
+	case 8:
+		/* 64-bit */
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Ensure natural alignment */
+	if (kreg->enable_addr % kreg->enable_size)
+		return -EINVAL;
+
+	/* Ensure bit range for size */
+	if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1)
+		return -EINVAL;
+
+	/* Ensure accessible */
+	if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr,
+		       kreg->enable_size))
+		return -EFAULT;
+
 	kreg->size = size;
 
 	return 0;
@@ -1573,8 +1901,10 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
 	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
 	struct user_reg reg;
 	struct user_event *user;
+	struct user_event_enabler *enabler;
 	char *name;
 	long ret;
+	int write_result;
 
 	ret = user_reg_get(ureg, &reg);
 
@@ -1605,8 +1935,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
 	if (ret < 0)
 		return ret;
 
+	/*
+	 * user_events_ref_add succeeded:
+	 * At this point we have a user_event, it's lifetime is bound by the
+	 * reference count, not this file. If anything fails, the user_event
+	 * still has a reference until the file is released. During release
+	 * any remaining references (from user_events_ref_add) are decremented.
+	 *
+	 * Attempt to create an enabler, which too has a lifetime tied in the
+	 * same way for the event. Once the task that caused the enabler to be
+	 * created exits or issues exec() then the enablers it has created
+	 * will be destroyed and the ref to the event will be decremented.
+	 */
+	enabler = user_event_enabler_create(&reg, user, &write_result);
+
+	if (!enabler)
+		return -ENOMEM;
+
+	/* Write failed/faulted, give error back to caller */
+	if (write_result)
+		return write_result;
+
 	put_user((u32)ret, &ureg->write_index);
-	put_user(user->index, &ureg->status_bit);
 
 	return 0;
 }
@@ -1720,38 +2070,6 @@ static const struct file_operations user_data_fops = {
 	.release = user_events_release,
 };
 
-static struct user_event_group *user_status_group(struct file *file)
-{
-	struct seq_file *m = file->private_data;
-
-	if (!m)
-		return NULL;
-
-	return m->private;
-}
-
-/*
- * Maps the shared page into the user process for checking if event is enabled.
- */
-static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	char *pages;
-	struct user_event_group *group = user_status_group(file);
-	unsigned long size = vma->vm_end - vma->vm_start;
-
-	if (size != MAX_BYTES)
-		return -EINVAL;
-
-	if (!group)
-		return -EINVAL;
-
-	pages = group->register_page_data;
-
-	return remap_pfn_range(vma, vma->vm_start,
-			       virt_to_phys(pages) >> PAGE_SHIFT,
-			       size, vm_get_page_prot(VM_READ));
-}
-
 static void *user_seq_start(struct seq_file *m, loff_t *pos)
 {
 	if (*pos)
@@ -1775,7 +2093,7 @@ static int user_seq_show(struct seq_file *m, void *p)
 	struct user_event_group *group = m->private;
 	struct user_event *user;
 	char status;
-	int i, active = 0, busy = 0, flags;
+	int i, active = 0, busy = 0;
 
 	if (!group)
 		return -EINVAL;
@@ -1784,11 +2102,10 @@ static int user_seq_show(struct seq_file *m, void *p)
 
 	hash_for_each(group->register_table, i, user, node) {
 		status = user->status;
-		flags = user->flags;
 
-		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+		seq_printf(m, "%s", EVENT_NAME(user));
 
-		if (flags != 0 || status != 0)
+		if (status != 0)
 			seq_puts(m, " #");
 
 		if (status != 0) {
@@ -1811,7 +2128,6 @@ static int user_seq_show(struct seq_file *m, void *p)
 	seq_puts(m, "\n");
 	seq_printf(m, "Active: %d\n", active);
 	seq_printf(m, "Busy: %d\n", busy);
-	seq_printf(m, "Max: %ld\n", MAX_EVENTS);
 
 	return 0;
 }
@@ -1847,7 +2163,6 @@ static int user_status_open(struct inode *node, struct file *file)
 
 static const struct file_operations user_status_fops = {
 	.open = user_status_open,
-	.mmap = user_status_mmap,
 	.read = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release,
@@ -1868,8 +2183,7 @@ static int create_user_tracefs(void)
 		goto err;
 	}
 
-	/* mmap with MAP_SHARED requires writable fd */
-	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+	emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ,
 				    NULL, NULL, &user_status_fops);
 
 	if (!emmap) {
-- 
cgit v1.2.3


From dcb8177c13953872c9e5ce4a99b63a87a3c2f683 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:12 -0700
Subject: tracing/user_events: Add ioctl for disabling addresses

Enablements are now tracked by the lifetime of the task/mm. User
processes need to be able to disable their addresses if tracing is
requested to be turned off. Before unmapping the page would suffice.
However, we now need a stronger contract. Add an ioctl to enable this.

A new flag bit is added, freeing, to user_event_enabler to ensure that
if the event is attempted to be removed while a fault is being handled
that the remove is delayed until after the fault is reattempted.

Link: https://lkml.kernel.org/r/20230328235219.203-6-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/uapi/linux/user_events.h | 24 ++++++++++
 kernel/trace/trace_events_user.c | 97 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 22521bc622db..3e7275e3234a 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -46,6 +46,27 @@ struct user_reg {
 	__u32 write_index;
 } __attribute__((__packed__));
 
+/*
+ * Describes an event unregister, callers must set the size, address and bit.
+ * This structure is passed to the DIAG_IOCSUNREG ioctl to disable bit updates.
+ */
+struct user_unreg {
+	/* Input: Size of the user_unreg structure being used */
+	__u32 size;
+
+	/* Input: Bit to unregister */
+	__u8 disable_bit;
+
+	/* Input: Reserved, set to 0 */
+	__u8 __reserved;
+
+	/* Input: Reserved, set to 0 */
+	__u16 __reserved2;
+
+	/* Input: Address to unregister */
+	__u64 disable_addr;
+} __attribute__((__packed__));
+
 #define DIAG_IOC_MAGIC '*'
 
 /* Request to register a user_event */
@@ -54,4 +75,7 @@ struct user_reg {
 /* Request to delete a user_event */
 #define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *)
 
+/* Requests to unregister a user_event */
+#define DIAG_IOCSUNREG _IOW(DIAG_IOC_MAGIC, 2, struct user_unreg*)
+
 #endif /* _UAPI_LINUX_USER_EVENTS_H */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 86bda1660536..f88bab3f1fe1 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -102,6 +102,9 @@ struct user_event_enabler {
 /* Bit 6 is for faulting status of enablement */
 #define ENABLE_VAL_FAULTING_BIT 6
 
+/* Bit 7 is for freeing status of enablement */
+#define ENABLE_VAL_FREEING_BIT 7
+
 /* Only duplicate the bit value */
 #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
 
@@ -301,6 +304,12 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
 	/* Prevent state changes from racing */
 	mutex_lock(&event_mutex);
 
+	/* User asked for enabler to be removed during fault */
+	if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
+		user_event_enabler_destroy(enabler);
+		goto out;
+	}
+
 	/*
 	 * If we managed to get the page, re-issue the write. We do not
 	 * want to get into a possible infinite loop, which is why we only
@@ -315,7 +324,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
 		user_event_enabler_write(mm, enabler, true);
 		mmap_read_unlock(mm->mm);
 	}
-
+out:
 	mutex_unlock(&event_mutex);
 
 	/* In all cases we no longer need the mm or fault */
@@ -370,7 +379,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 	if (refcount_read(&mm->tasks) == 0)
 		return -ENOENT;
 
-	if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))))
+	if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) ||
+		     test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
 		return -EBUSY;
 
 	ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
@@ -428,6 +438,10 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig,
 {
 	struct user_event_enabler *enabler;
 
+	/* Skip pending frees */
+	if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
+		return true;
+
 	enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT);
 
 	if (!enabler)
@@ -2086,6 +2100,79 @@ static long user_events_ioctl_del(struct user_event_file_info *info,
 	return ret;
 }
 
+static long user_unreg_get(struct user_unreg __user *ureg,
+			   struct user_unreg *kreg)
+{
+	u32 size;
+	long ret;
+
+	ret = get_user(size, &ureg->size);
+
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	if (size < offsetofend(struct user_unreg, disable_addr))
+		return -EINVAL;
+
+	ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+	/* Ensure no reserved values, since we don't support any yet */
+	if (kreg->__reserved || kreg->__reserved2)
+		return -EINVAL;
+
+	return ret;
+}
+
+/*
+ * Unregisters an enablement address/bit within a task/user mm.
+ */
+static long user_events_ioctl_unreg(unsigned long uarg)
+{
+	struct user_unreg __user *ureg = (struct user_unreg __user *)uarg;
+	struct user_event_mm *mm = current->user_event_mm;
+	struct user_event_enabler *enabler, *next;
+	struct user_unreg reg;
+	long ret;
+
+	ret = user_unreg_get(ureg, &reg);
+
+	if (ret)
+		return ret;
+
+	if (!mm)
+		return -ENOENT;
+
+	ret = -ENOENT;
+
+	/*
+	 * Flags freeing and faulting are used to indicate if the enabler is in
+	 * use at all. When faulting is set a page-fault is occurring asyncly.
+	 * During async fault if freeing is set, the enabler will be destroyed.
+	 * If no async fault is happening, we can destroy it now since we hold
+	 * the event_mutex during these checks.
+	 */
+	mutex_lock(&event_mutex);
+
+	list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+		if (enabler->addr == reg.disable_addr &&
+		    (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) {
+			set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
+
+			if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
+				user_event_enabler_destroy(enabler);
+
+			/* Removed at least one */
+			ret = 0;
+		}
+
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
 /*
  * Handles the ioctl from user mode to register or alter operations.
  */
@@ -2108,6 +2195,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
 		ret = user_events_ioctl_del(info, uarg);
 		mutex_unlock(&group->reg_mutex);
 		break;
+
+	case DIAG_IOCSUNREG:
+		mutex_lock(&group->reg_mutex);
+		ret = user_events_ioctl_unreg(uarg);
+		mutex_unlock(&group->reg_mutex);
+		break;
 	}
 
 	return ret;
-- 
cgit v1.2.3


From a4c40c1349e32f9510707ed09e0961626980d8cb Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:19 -0700
Subject: tracing/user_events: Align structs with tabs for readability

Add tabs to make struct members easier to read and unify the style of
the code.

Link: https://lkml.kernel.org/r/20230328235219.203-13-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/user_events.h      | 14 +++----
 include/uapi/linux/user_events.h | 24 ++++++------
 kernel/trace/trace_events_user.c | 82 ++++++++++++++++++++--------------------
 3 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 0120b3dd5b03..2847f5a18a86 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -17,13 +17,13 @@
 
 #ifdef CONFIG_USER_EVENTS
 struct user_event_mm {
-	struct list_head link;
-	struct list_head enablers;
-	struct mm_struct *mm;
-	struct user_event_mm *next;
-	refcount_t refcnt;
-	refcount_t tasks;
-	struct rcu_work put_rwork;
+	struct list_head	link;
+	struct list_head	enablers;
+	struct mm_struct	*mm;
+	struct user_event_mm	*next;
+	refcount_t		refcnt;
+	refcount_t		tasks;
+	struct rcu_work		put_rwork;
 };
 
 extern void user_event_mm_dup(struct task_struct *t,
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 3e7275e3234a..2984aae4a2b4 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -25,25 +25,25 @@
 struct user_reg {
 
 	/* Input: Size of the user_reg structure being used */
-	__u32 size;
+	__u32	size;
 
 	/* Input: Bit in enable address to use */
-	__u8 enable_bit;
+	__u8	enable_bit;
 
 	/* Input: Enable size in bytes at address */
-	__u8 enable_size;
+	__u8	enable_size;
 
 	/* Input: Flags for future use, set to 0 */
-	__u16 flags;
+	__u16	flags;
 
 	/* Input: Address to update when enabled */
-	__u64 enable_addr;
+	__u64	enable_addr;
 
 	/* Input: Pointer to string with event name, description and flags */
-	__u64 name_args;
+	__u64	name_args;
 
 	/* Output: Index of the event to use when writing data */
-	__u32 write_index;
+	__u32	write_index;
 } __attribute__((__packed__));
 
 /*
@@ -52,19 +52,19 @@ struct user_reg {
  */
 struct user_unreg {
 	/* Input: Size of the user_unreg structure being used */
-	__u32 size;
+	__u32	size;
 
 	/* Input: Bit to unregister */
-	__u8 disable_bit;
+	__u8	disable_bit;
 
 	/* Input: Reserved, set to 0 */
-	__u8 __reserved;
+	__u8	__reserved;
 
 	/* Input: Reserved, set to 0 */
-	__u16 __reserved2;
+	__u16	__reserved2;
 
 	/* Input: Address to unregister */
-	__u64 disable_addr;
+	__u64	disable_addr;
 } __attribute__((__packed__));
 
 #define DIAG_IOC_MAGIC '*'
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9b43a02e1597..67cb7b53caf6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -53,9 +53,9 @@
  * allows isolation for events by various means.
  */
 struct user_event_group {
-	char *system_name;
-	struct hlist_node node;
-	struct mutex reg_mutex;
+	char		*system_name;
+	struct		hlist_node node;
+	struct		mutex reg_mutex;
 	DECLARE_HASHTABLE(register_table, 8);
 };
 
@@ -76,17 +76,17 @@ static unsigned int current_user_events;
  * refcnt reaches one.
  */
 struct user_event {
-	struct user_event_group *group;
-	struct tracepoint tracepoint;
-	struct trace_event_call call;
-	struct trace_event_class class;
-	struct dyn_event devent;
-	struct hlist_node node;
-	struct list_head fields;
-	struct list_head validators;
-	refcount_t refcnt;
-	int min_size;
-	char status;
+	struct user_event_group		*group;
+	struct tracepoint		tracepoint;
+	struct trace_event_call		call;
+	struct trace_event_class	class;
+	struct dyn_event		devent;
+	struct hlist_node		node;
+	struct list_head		fields;
+	struct list_head		validators;
+	refcount_t			refcnt;
+	int				min_size;
+	char				status;
 };
 
 /*
@@ -95,12 +95,12 @@ struct user_event {
  * these to track enablement sites that are tied to an event.
  */
 struct user_event_enabler {
-	struct list_head link;
-	struct user_event *event;
-	unsigned long addr;
+	struct list_head	link;
+	struct user_event	*event;
+	unsigned long		addr;
 
 	/* Track enable bit, flags, etc. Aligned for bitops. */
-	unsigned int values;
+	unsigned int		values;
 };
 
 /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
@@ -119,9 +119,9 @@ struct user_event_enabler {
 
 /* Used for asynchronous faulting in of pages */
 struct user_event_enabler_fault {
-	struct work_struct work;
-	struct user_event_mm *mm;
-	struct user_event_enabler *enabler;
+	struct work_struct		work;
+	struct user_event_mm		*mm;
+	struct user_event_enabler	*enabler;
 };
 
 static struct kmem_cache *fault_cache;
@@ -137,23 +137,23 @@ static DEFINE_SPINLOCK(user_event_mms_lock);
  * These are not shared and only accessible by the file that created it.
  */
 struct user_event_refs {
-	struct rcu_head rcu;
-	int count;
-	struct user_event *events[];
+	struct rcu_head		rcu;
+	int			count;
+	struct user_event	*events[];
 };
 
 struct user_event_file_info {
-	struct user_event_group *group;
-	struct user_event_refs *refs;
+	struct user_event_group	*group;
+	struct user_event_refs	*refs;
 };
 
 #define VALIDATOR_ENSURE_NULL (1 << 0)
 #define VALIDATOR_REL (1 << 1)
 
 struct user_event_validator {
-	struct list_head link;
-	int offset;
-	int flags;
+	struct list_head	link;
+	int			offset;
+	int			flags;
 };
 
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
@@ -2276,11 +2276,11 @@ out:
 }
 
 static const struct file_operations user_data_fops = {
-	.open = user_events_open,
-	.write = user_events_write,
-	.write_iter = user_events_write_iter,
+	.open		= user_events_open,
+	.write		= user_events_write,
+	.write_iter	= user_events_write_iter,
 	.unlocked_ioctl	= user_events_ioctl,
-	.release = user_events_release,
+	.release	= user_events_release,
 };
 
 static void *user_seq_start(struct seq_file *m, loff_t *pos)
@@ -2346,10 +2346,10 @@ static int user_seq_show(struct seq_file *m, void *p)
 }
 
 static const struct seq_operations user_seq_ops = {
-	.start = user_seq_start,
-	.next  = user_seq_next,
-	.stop  = user_seq_stop,
-	.show  = user_seq_show,
+	.start	= user_seq_start,
+	.next	= user_seq_next,
+	.stop	= user_seq_stop,
+	.show	= user_seq_show,
 };
 
 static int user_status_open(struct inode *node, struct file *file)
@@ -2375,10 +2375,10 @@ static int user_status_open(struct inode *node, struct file *file)
 }
 
 static const struct file_operations user_status_fops = {
-	.open = user_status_open,
-	.read = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
+	.open		= user_status_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
 };
 
 /*
-- 
cgit v1.2.3


From 1e2bae6ae8f6b404b295edd5ba11a0bb1566544c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 24 Mar 2023 16:21:49 +0000
Subject: regmap: Removed compressed cache support

The compressed register cache support has assumptions that make it hard to
cover in testing, mainly that it requires raw registers defaults be
provided. Rather than either address these assumptions or leave it untested
by the forthcoming KUnit tests let's remove it, the use case is quite thin
and there are no current users.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230324-regcache-lzo-v1-1-08c5d63e2a5e@kernel.org
---
 drivers/base/regmap/Kconfig        |   5 -
 drivers/base/regmap/Makefile       |   1 -
 drivers/base/regmap/internal.h     |   1 -
 drivers/base/regmap/regcache-lzo.c | 368 -------------------------------------
 drivers/base/regmap/regcache.c     |   3 -
 include/linux/regmap.h             |   1 -
 6 files changed, 379 deletions(-)
 delete mode 100644 drivers/base/regmap/regcache-lzo.c

(limited to 'include')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index cd4bb642b9de..9f8dcb32c24b 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -9,11 +9,6 @@ config REGMAP
 	select MDIO_BUS if REGMAP_MDIO
 	bool
 
-config REGCACHE_COMPRESSED
-	select LZO_COMPRESS
-	select LZO_DECOMPRESS
-	bool
-
 config REGMAP_AC97
 	tristate
 
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 6990de7ca9a9..8900c340e352 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -4,7 +4,6 @@ CFLAGS_regmap.o := -I$(src)
 
 obj-$(CONFIG_REGMAP) += regmap.o regcache.o
 obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o
-obj-$(CONFIG_REGCACHE_COMPRESSED) += regcache-lzo.o
 obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o
 obj-$(CONFIG_REGMAP_AC97) += regmap-ac97.o
 obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index da8996e7a1f1..919d790a01b7 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -281,7 +281,6 @@ enum regmap_endian regmap_get_val_endian(struct device *dev,
 					 const struct regmap_config *config);
 
 extern struct regcache_ops regcache_rbtree_ops;
-extern struct regcache_ops regcache_lzo_ops;
 extern struct regcache_ops regcache_flat_ops;
 
 static inline const char *regmap_name(const struct regmap *map)
diff --git a/drivers/base/regmap/regcache-lzo.c b/drivers/base/regmap/regcache-lzo.c
deleted file mode 100644
index 7886303eb026..000000000000
--- a/drivers/base/regmap/regcache-lzo.c
+++ /dev/null
@@ -1,368 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-//
-// Register cache access API - LZO caching support
-//
-// Copyright 2011 Wolfson Microelectronics plc
-//
-// Author: Dimitris Papastamos <dp@opensource.wolfsonmicro.com>
-
-#include <linux/device.h>
-#include <linux/lzo.h>
-#include <linux/slab.h>
-
-#include "internal.h"
-
-static int regcache_lzo_exit(struct regmap *map);
-
-struct regcache_lzo_ctx {
-	void *wmem;
-	void *dst;
-	const void *src;
-	size_t src_len;
-	size_t dst_len;
-	size_t decompressed_size;
-	unsigned long *sync_bmp;
-	int sync_bmp_nbits;
-};
-
-#define LZO_BLOCK_NUM 8
-static int regcache_lzo_block_count(struct regmap *map)
-{
-	return LZO_BLOCK_NUM;
-}
-
-static int regcache_lzo_prepare(struct regcache_lzo_ctx *lzo_ctx)
-{
-	lzo_ctx->wmem = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
-	if (!lzo_ctx->wmem)
-		return -ENOMEM;
-	return 0;
-}
-
-static int regcache_lzo_compress(struct regcache_lzo_ctx *lzo_ctx)
-{
-	size_t compress_size;
-	int ret;
-
-	ret = lzo1x_1_compress(lzo_ctx->src, lzo_ctx->src_len,
-			       lzo_ctx->dst, &compress_size, lzo_ctx->wmem);
-	if (ret != LZO_E_OK || compress_size > lzo_ctx->dst_len)
-		return -EINVAL;
-	lzo_ctx->dst_len = compress_size;
-	return 0;
-}
-
-static int regcache_lzo_decompress(struct regcache_lzo_ctx *lzo_ctx)
-{
-	size_t dst_len;
-	int ret;
-
-	dst_len = lzo_ctx->dst_len;
-	ret = lzo1x_decompress_safe(lzo_ctx->src, lzo_ctx->src_len,
-				    lzo_ctx->dst, &dst_len);
-	if (ret != LZO_E_OK || dst_len != lzo_ctx->dst_len)
-		return -EINVAL;
-	return 0;
-}
-
-static int regcache_lzo_compress_cache_block(struct regmap *map,
-		struct regcache_lzo_ctx *lzo_ctx)
-{
-	int ret;
-
-	lzo_ctx->dst_len = lzo1x_worst_compress(PAGE_SIZE);
-	lzo_ctx->dst = kmalloc(lzo_ctx->dst_len, GFP_KERNEL);
-	if (!lzo_ctx->dst) {
-		lzo_ctx->dst_len = 0;
-		return -ENOMEM;
-	}
-
-	ret = regcache_lzo_compress(lzo_ctx);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
-static int regcache_lzo_decompress_cache_block(struct regmap *map,
-		struct regcache_lzo_ctx *lzo_ctx)
-{
-	int ret;
-
-	lzo_ctx->dst_len = lzo_ctx->decompressed_size;
-	lzo_ctx->dst = kmalloc(lzo_ctx->dst_len, GFP_KERNEL);
-	if (!lzo_ctx->dst) {
-		lzo_ctx->dst_len = 0;
-		return -ENOMEM;
-	}
-
-	ret = regcache_lzo_decompress(lzo_ctx);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
-static inline int regcache_lzo_get_blkindex(struct regmap *map,
-					    unsigned int reg)
-{
-	return ((reg / map->reg_stride) * map->cache_word_size) /
-		DIV_ROUND_UP(map->cache_size_raw,
-			     regcache_lzo_block_count(map));
-}
-
-static inline int regcache_lzo_get_blkpos(struct regmap *map,
-					  unsigned int reg)
-{
-	return (reg / map->reg_stride) %
-		    (DIV_ROUND_UP(map->cache_size_raw,
-				  regcache_lzo_block_count(map)) /
-		     map->cache_word_size);
-}
-
-static inline int regcache_lzo_get_blksize(struct regmap *map)
-{
-	return DIV_ROUND_UP(map->cache_size_raw,
-			    regcache_lzo_block_count(map));
-}
-
-static int regcache_lzo_init(struct regmap *map)
-{
-	struct regcache_lzo_ctx **lzo_blocks;
-	size_t bmp_size;
-	int ret, i, blksize, blkcount;
-	const char *p, *end;
-	unsigned long *sync_bmp;
-
-	ret = 0;
-
-	blkcount = regcache_lzo_block_count(map);
-	map->cache = kcalloc(blkcount, sizeof(*lzo_blocks),
-			     GFP_KERNEL);
-	if (!map->cache)
-		return -ENOMEM;
-	lzo_blocks = map->cache;
-
-	/*
-	 * allocate a bitmap to be used when syncing the cache with
-	 * the hardware.  Each time a register is modified, the corresponding
-	 * bit is set in the bitmap, so we know that we have to sync
-	 * that register.
-	 */
-	bmp_size = map->num_reg_defaults_raw;
-	sync_bmp = bitmap_zalloc(bmp_size, GFP_KERNEL);
-	if (!sync_bmp) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	/* allocate the lzo blocks and initialize them */
-	for (i = 0; i < blkcount; i++) {
-		lzo_blocks[i] = kzalloc(sizeof **lzo_blocks,
-					GFP_KERNEL);
-		if (!lzo_blocks[i]) {
-			bitmap_free(sync_bmp);
-			ret = -ENOMEM;
-			goto err;
-		}
-		lzo_blocks[i]->sync_bmp = sync_bmp;
-		lzo_blocks[i]->sync_bmp_nbits = bmp_size;
-		/* alloc the working space for the compressed block */
-		ret = regcache_lzo_prepare(lzo_blocks[i]);
-		if (ret < 0)
-			goto err;
-	}
-
-	blksize = regcache_lzo_get_blksize(map);
-	p = map->reg_defaults_raw;
-	end = map->reg_defaults_raw + map->cache_size_raw;
-	/* compress the register map and fill the lzo blocks */
-	for (i = 0; i < blkcount; i++, p += blksize) {
-		lzo_blocks[i]->src = p;
-		if (p + blksize > end)
-			lzo_blocks[i]->src_len = end - p;
-		else
-			lzo_blocks[i]->src_len = blksize;
-		ret = regcache_lzo_compress_cache_block(map,
-						       lzo_blocks[i]);
-		if (ret < 0)
-			goto err;
-		lzo_blocks[i]->decompressed_size =
-			lzo_blocks[i]->src_len;
-	}
-
-	return 0;
-err:
-	regcache_lzo_exit(map);
-	return ret;
-}
-
-static int regcache_lzo_exit(struct regmap *map)
-{
-	struct regcache_lzo_ctx **lzo_blocks;
-	int i, blkcount;
-
-	lzo_blocks = map->cache;
-	if (!lzo_blocks)
-		return 0;
-
-	blkcount = regcache_lzo_block_count(map);
-	/*
-	 * the pointer to the bitmap used for syncing the cache
-	 * is shared amongst all lzo_blocks.  Ensure it is freed
-	 * only once.
-	 */
-	if (lzo_blocks[0])
-		bitmap_free(lzo_blocks[0]->sync_bmp);
-	for (i = 0; i < blkcount; i++) {
-		if (lzo_blocks[i]) {
-			kfree(lzo_blocks[i]->wmem);
-			kfree(lzo_blocks[i]->dst);
-		}
-		/* each lzo_block is a pointer returned by kmalloc or NULL */
-		kfree(lzo_blocks[i]);
-	}
-	kfree(lzo_blocks);
-	map->cache = NULL;
-	return 0;
-}
-
-static int regcache_lzo_read(struct regmap *map,
-			     unsigned int reg, unsigned int *value)
-{
-	struct regcache_lzo_ctx *lzo_block, **lzo_blocks;
-	int ret, blkindex, blkpos;
-	size_t tmp_dst_len;
-	void *tmp_dst;
-
-	/* index of the compressed lzo block */
-	blkindex = regcache_lzo_get_blkindex(map, reg);
-	/* register index within the decompressed block */
-	blkpos = regcache_lzo_get_blkpos(map, reg);
-	lzo_blocks = map->cache;
-	lzo_block = lzo_blocks[blkindex];
-
-	/* save the pointer and length of the compressed block */
-	tmp_dst = lzo_block->dst;
-	tmp_dst_len = lzo_block->dst_len;
-
-	/* prepare the source to be the compressed block */
-	lzo_block->src = lzo_block->dst;
-	lzo_block->src_len = lzo_block->dst_len;
-
-	/* decompress the block */
-	ret = regcache_lzo_decompress_cache_block(map, lzo_block);
-	if (ret >= 0)
-		/* fetch the value from the cache */
-		*value = regcache_get_val(map, lzo_block->dst, blkpos);
-
-	kfree(lzo_block->dst);
-	/* restore the pointer and length of the compressed block */
-	lzo_block->dst = tmp_dst;
-	lzo_block->dst_len = tmp_dst_len;
-
-	return ret;
-}
-
-static int regcache_lzo_write(struct regmap *map,
-			      unsigned int reg, unsigned int value)
-{
-	struct regcache_lzo_ctx *lzo_block, **lzo_blocks;
-	int ret, blkindex, blkpos;
-	size_t tmp_dst_len;
-	void *tmp_dst;
-
-	/* index of the compressed lzo block */
-	blkindex = regcache_lzo_get_blkindex(map, reg);
-	/* register index within the decompressed block */
-	blkpos = regcache_lzo_get_blkpos(map, reg);
-	lzo_blocks = map->cache;
-	lzo_block = lzo_blocks[blkindex];
-
-	/* save the pointer and length of the compressed block */
-	tmp_dst = lzo_block->dst;
-	tmp_dst_len = lzo_block->dst_len;
-
-	/* prepare the source to be the compressed block */
-	lzo_block->src = lzo_block->dst;
-	lzo_block->src_len = lzo_block->dst_len;
-
-	/* decompress the block */
-	ret = regcache_lzo_decompress_cache_block(map, lzo_block);
-	if (ret < 0) {
-		kfree(lzo_block->dst);
-		goto out;
-	}
-
-	/* write the new value to the cache */
-	if (regcache_set_val(map, lzo_block->dst, blkpos, value)) {
-		kfree(lzo_block->dst);
-		goto out;
-	}
-
-	/* prepare the source to be the decompressed block */
-	lzo_block->src = lzo_block->dst;
-	lzo_block->src_len = lzo_block->dst_len;
-
-	/* compress the block */
-	ret = regcache_lzo_compress_cache_block(map, lzo_block);
-	if (ret < 0) {
-		kfree(lzo_block->dst);
-		kfree(lzo_block->src);
-		goto out;
-	}
-
-	/* set the bit so we know we have to sync this register */
-	set_bit(reg / map->reg_stride, lzo_block->sync_bmp);
-	kfree(tmp_dst);
-	kfree(lzo_block->src);
-	return 0;
-out:
-	lzo_block->dst = tmp_dst;
-	lzo_block->dst_len = tmp_dst_len;
-	return ret;
-}
-
-static int regcache_lzo_sync(struct regmap *map, unsigned int min,
-			     unsigned int max)
-{
-	struct regcache_lzo_ctx **lzo_blocks;
-	unsigned int val;
-	int i;
-	int ret;
-
-	lzo_blocks = map->cache;
-	i = min;
-	for_each_set_bit_from(i, lzo_blocks[0]->sync_bmp,
-			      lzo_blocks[0]->sync_bmp_nbits) {
-		if (i > max)
-			continue;
-
-		ret = regcache_read(map, i, &val);
-		if (ret)
-			return ret;
-
-		/* Is this the hardware default?  If so skip. */
-		ret = regcache_lookup_reg(map, i);
-		if (ret > 0 && val == map->reg_defaults[ret].def)
-			continue;
-
-		map->cache_bypass = true;
-		ret = _regmap_write(map, i, val);
-		map->cache_bypass = false;
-		if (ret)
-			return ret;
-		dev_dbg(map->dev, "Synced register %#x, value %#x\n",
-			i, val);
-	}
-
-	return 0;
-}
-
-struct regcache_ops regcache_lzo_ops = {
-	.type = REGCACHE_COMPRESSED,
-	.name = "lzo",
-	.init = regcache_lzo_init,
-	.exit = regcache_lzo_exit,
-	.read = regcache_lzo_read,
-	.write = regcache_lzo_write,
-	.sync = regcache_lzo_sync
-};
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 7a337e8d08a9..3f19f68c6a6c 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -17,9 +17,6 @@
 
 static const struct regcache_ops *cache_types[] = {
 	&regcache_rbtree_ops,
-#if IS_ENABLED(CONFIG_REGCACHE_COMPRESSED)
-	&regcache_lzo_ops,
-#endif
 	&regcache_flat_ops,
 };
 
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 19d54f019cc4..b63204d51b35 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -50,7 +50,6 @@ struct sdw_slave;
 enum regcache_type {
 	REGCACHE_NONE,
 	REGCACHE_RBTREE,
-	REGCACHE_COMPRESSED,
 	REGCACHE_FLAT,
 };
 
-- 
cgit v1.2.3


From 27a2195efa8d26447c40dd4a6299ea0247786d75 Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sre@kernel.org>
Date: Fri, 17 Mar 2023 23:56:55 +0100
Subject: power: supply: core: auto-exposure of simple-battery data

Automatically expose data from the simple-battery firmware
node for all battery drivers.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Matti Vaittinen <mazziesaccount@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/power_supply_core.c  | 179 +++++++++++++++++++++++++++---
 drivers/power/supply/power_supply_sysfs.c |  23 +++-
 include/linux/power_supply.h              |   8 ++
 3 files changed, 191 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f3d7c1da299f..b2504d205035 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -388,7 +388,7 @@ static int __power_supply_get_supplier_property(struct device *dev, void *_data)
 	struct psy_get_supplier_prop_data *data = _data;
 
 	if (__power_supply_is_supplied_by(epsy, data->psy))
-		if (!epsy->desc->get_property(epsy, data->psp, data->val))
+		if (!power_supply_get_property(epsy, data->psp, data->val))
 			return 1; /* Success */
 
 	return 0; /* Continue iterating */
@@ -832,6 +832,133 @@ void power_supply_put_battery_info(struct power_supply *psy,
 }
 EXPORT_SYMBOL_GPL(power_supply_put_battery_info);
 
+const enum power_supply_property power_supply_battery_info_properties[] = {
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+	POWER_SUPPLY_PROP_PRECHARGE_CURRENT,
+	POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX,
+	POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN,
+	POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX,
+	POWER_SUPPLY_PROP_TEMP_ALERT_MIN,
+	POWER_SUPPLY_PROP_TEMP_ALERT_MAX,
+	POWER_SUPPLY_PROP_TEMP_MIN,
+	POWER_SUPPLY_PROP_TEMP_MAX,
+};
+EXPORT_SYMBOL_GPL(power_supply_battery_info_properties);
+
+const size_t power_supply_battery_info_properties_size = ARRAY_SIZE(power_supply_battery_info_properties);
+EXPORT_SYMBOL_GPL(power_supply_battery_info_properties_size);
+
+bool power_supply_battery_info_has_prop(struct power_supply_battery_info *info,
+				        enum power_supply_property psp)
+{
+	if (!info)
+		return false;
+
+	switch (psp) {
+		case POWER_SUPPLY_PROP_TECHNOLOGY:
+			return info->technology != POWER_SUPPLY_TECHNOLOGY_UNKNOWN;
+		case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN:
+			return info->energy_full_design_uwh >= 0;
+		case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
+			return info->charge_full_design_uah >= 0;
+		case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+			return info->voltage_min_design_uv >= 0;
+		case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+			return info->voltage_max_design_uv >= 0;
+		case POWER_SUPPLY_PROP_PRECHARGE_CURRENT:
+			return info->precharge_current_ua >= 0;
+		case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT:
+			return info->charge_term_current_ua >= 0;
+		case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+			return info->constant_charge_current_max_ua >= 0;
+		case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+			return info->constant_charge_voltage_max_uv >= 0;
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN:
+			return info->temp_ambient_alert_min > INT_MIN;
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX:
+			return info->temp_ambient_alert_max < INT_MAX;
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+			return info->temp_alert_min > INT_MIN;
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+			return info->temp_alert_max < INT_MAX;
+		case POWER_SUPPLY_PROP_TEMP_MIN:
+			return info->temp_min > INT_MIN;
+		case POWER_SUPPLY_PROP_TEMP_MAX:
+			return info->temp_max < INT_MAX;
+		default:
+			return false;
+	}
+}
+EXPORT_SYMBOL_GPL(power_supply_battery_info_has_prop);
+
+int power_supply_battery_info_get_prop(struct power_supply_battery_info *info,
+				       enum power_supply_property psp,
+				       union power_supply_propval *val)
+{
+	if (!info)
+		return -EINVAL;
+
+	if (!power_supply_battery_info_has_prop(info, psp))
+		return -EINVAL;
+
+	switch (psp) {
+		case POWER_SUPPLY_PROP_TECHNOLOGY:
+			val->intval = info->technology;
+			return 0;
+		case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN:
+			val->intval = info->energy_full_design_uwh;
+			return 0;
+		case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
+			val->intval = info->charge_full_design_uah;
+			return 0;
+		case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+			val->intval = info->voltage_min_design_uv;
+			return 0;
+		case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+			val->intval = info->voltage_max_design_uv;
+			return 0;
+		case POWER_SUPPLY_PROP_PRECHARGE_CURRENT:
+			val->intval = info->precharge_current_ua;
+			return 0;
+		case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT:
+			val->intval = info->charge_term_current_ua;
+			return 0;
+		case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+			val->intval = info->constant_charge_current_max_ua;
+			return 0;
+		case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+			val->intval = info->constant_charge_voltage_max_uv;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN:
+			val->intval = info->temp_ambient_alert_min;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX:
+			val->intval = info->temp_ambient_alert_max;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+			val->intval = info->temp_alert_min;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+			val->intval = info->temp_alert_max;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_MIN:
+			val->intval = info->temp_min;
+			return 0;
+		case POWER_SUPPLY_PROP_TEMP_MAX:
+			val->intval = info->temp_max;
+			return 0;
+		default:
+			return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(power_supply_battery_info_get_prop);
+
 /**
  * power_supply_temp2resist_simple() - find the battery internal resistance
  * percent from temperature
@@ -1046,6 +1173,22 @@ bool power_supply_battery_bti_in_range(struct power_supply_battery_info *info,
 }
 EXPORT_SYMBOL_GPL(power_supply_battery_bti_in_range);
 
+static bool psy_has_property(const struct power_supply_desc *psy_desc,
+			     enum power_supply_property psp)
+{
+	bool found = false;
+	int i;
+
+	for (i = 0; i < psy_desc->num_properties; i++) {
+		if (psy_desc->properties[i] == psp) {
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
 int power_supply_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
@@ -1056,7 +1199,12 @@ int power_supply_get_property(struct power_supply *psy,
 		return -ENODEV;
 	}
 
-	return psy->desc->get_property(psy, psp, val);
+	if (psy_has_property(psy->desc, psp))
+		return psy->desc->get_property(psy, psp, val);
+	else if (power_supply_battery_info_has_prop(psy->battery_info, psp))
+		return power_supply_battery_info_get_prop(psy->battery_info, psp, val);
+	else
+		return -EINVAL;
 }
 EXPORT_SYMBOL_GPL(power_supply_get_property);
 
@@ -1117,22 +1265,6 @@ void power_supply_unreg_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(power_supply_unreg_notifier);
 
-static bool psy_has_property(const struct power_supply_desc *psy_desc,
-			     enum power_supply_property psp)
-{
-	bool found = false;
-	int i;
-
-	for (i = 0; i < psy_desc->num_properties; i++) {
-		if (psy_desc->properties[i] == psp) {
-			found = true;
-			break;
-		}
-	}
-
-	return found;
-}
-
 #ifdef CONFIG_THERMAL
 static int power_supply_read_temp(struct thermal_zone_device *tzd,
 		int *temp)
@@ -1255,6 +1387,17 @@ __power_supply_register(struct device *parent,
 		goto check_supplies_failed;
 	}
 
+	/*
+	 * Expose constant battery info, if it is available. While there are
+	 * some chargers accessing constant battery data, we only want to
+	 * expose battery data to userspace for battery devices.
+	 */
+	if (desc->type == POWER_SUPPLY_TYPE_BATTERY) {
+		rc = power_supply_get_battery_info(psy, &psy->battery_info);
+		if (rc && rc != -ENODEV && rc != -ENOENT)
+			goto check_supplies_failed;
+	}
+
 	spin_lock_init(&psy->changed_lock);
 	rc = device_add(dev);
 	if (rc)
diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c
index c228205e0953..ba3b125cd66e 100644
--- a/drivers/power/supply/power_supply_sysfs.c
+++ b/drivers/power/supply/power_supply_sysfs.c
@@ -221,9 +221,10 @@ static struct power_supply_attr power_supply_attrs[] = {
 	POWER_SUPPLY_ATTR(MANUFACTURER),
 	POWER_SUPPLY_ATTR(SERIAL_NUMBER),
 };
+#define POWER_SUPPLY_ATTR_CNT ARRAY_SIZE(power_supply_attrs)
 
 static struct attribute *
-__power_supply_attrs[ARRAY_SIZE(power_supply_attrs) + 1];
+__power_supply_attrs[POWER_SUPPLY_ATTR_CNT + 1];
 
 static struct power_supply_attr *to_ps_attr(struct device_attribute *attr)
 {
@@ -380,6 +381,9 @@ static umode_t power_supply_attr_is_visible(struct kobject *kobj,
 		}
 	}
 
+	if (power_supply_battery_info_has_prop(psy->battery_info, attrno))
+		return mode;
+
 	return 0;
 }
 
@@ -461,6 +465,10 @@ static int add_prop_uevent(const struct device *dev, struct kobj_uevent_env *env
 int power_supply_uevent(const struct device *dev, struct kobj_uevent_env *env)
 {
 	const struct power_supply *psy = dev_get_drvdata(dev);
+	const enum power_supply_property *battery_props =
+		power_supply_battery_info_properties;
+	unsigned long psy_drv_properties[POWER_SUPPLY_ATTR_CNT /
+					 sizeof(unsigned long) + 1] = {0};
 	int ret = 0, j;
 	char *prop_buf;
 
@@ -482,12 +490,25 @@ int power_supply_uevent(const struct device *dev, struct kobj_uevent_env *env)
 		goto out;
 
 	for (j = 0; j < psy->desc->num_properties; j++) {
+		set_bit(psy->desc->properties[j], psy_drv_properties);
 		ret = add_prop_uevent(dev, env, psy->desc->properties[j],
 				      prop_buf);
 		if (ret)
 			goto out;
 	}
 
+	for (j = 0; j < power_supply_battery_info_properties_size; j++) {
+		if (test_bit(battery_props[j], psy_drv_properties))
+			continue;
+		if (!power_supply_battery_info_has_prop(psy->battery_info,
+				battery_props[j]))
+			continue;
+		ret = add_prop_uevent(dev, env, battery_props[j],
+			      prop_buf);
+		if (ret)
+			goto out;
+	}
+
 out:
 	free_page((unsigned long)prop_buf);
 
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index aa2c4a7c4826..a427f13c757f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -301,6 +301,7 @@ struct power_supply {
 	bool initialized;
 	bool removing;
 	atomic_t use_cnt;
+	struct power_supply_battery_info *battery_info;
 #ifdef CONFIG_THERMAL
 	struct thermal_zone_device *tzd;
 	struct thermal_cooling_device *tcd;
@@ -791,10 +792,17 @@ devm_power_supply_get_by_phandle(struct device *dev, const char *property)
 { return NULL; }
 #endif /* CONFIG_OF */
 
+extern const enum power_supply_property power_supply_battery_info_properties[];
+extern const size_t power_supply_battery_info_properties_size;
 extern int power_supply_get_battery_info(struct power_supply *psy,
 					 struct power_supply_battery_info **info_out);
 extern void power_supply_put_battery_info(struct power_supply *psy,
 					  struct power_supply_battery_info *info);
+extern bool power_supply_battery_info_has_prop(struct power_supply_battery_info *info,
+					       enum power_supply_property psp);
+extern int power_supply_battery_info_get_prop(struct power_supply_battery_info *info,
+					      enum power_supply_property psp,
+					      union power_supply_propval *val);
 extern int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table,
 				       int table_len, int ocv);
 extern struct power_supply_battery_ocv_table *
-- 
cgit v1.2.3


From c8f573f312f36861db8e40d9953b6d4b84f1321b Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sre@kernel.org>
Date: Fri, 17 Mar 2023 23:56:58 +0100
Subject: power: supply: generic-adc-battery: drop jitter delay support

Drop support for configuring IRQ jitter delay by using big
enough fixed value.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/generic-adc-battery.c | 13 ++++---------
 include/linux/power/generic-adc-battery.h  |  3 ---
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/generic-adc-battery.c b/drivers/power/supply/generic-adc-battery.c
index 535972a332b3..e20894460d7f 100644
--- a/drivers/power/supply/generic-adc-battery.c
+++ b/drivers/power/supply/generic-adc-battery.c
@@ -227,12 +227,10 @@ static void gab_work(struct work_struct *work)
 static irqreturn_t gab_charged(int irq, void *dev_id)
 {
 	struct gab *adc_bat = dev_id;
-	struct gab_platform_data *pdata = adc_bat->pdata;
-	int delay;
 
-	delay = pdata->jitter_delay ? pdata->jitter_delay : JITTER_DEFAULT;
 	schedule_delayed_work(&adc_bat->bat_work,
-			msecs_to_jiffies(delay));
+			      msecs_to_jiffies(JITTER_DEFAULT));
+
 	return IRQ_HANDLED;
 }
 
@@ -358,14 +356,11 @@ static int __maybe_unused gab_suspend(struct device *dev)
 static int __maybe_unused gab_resume(struct device *dev)
 {
 	struct gab *adc_bat = dev_get_drvdata(dev);
-	struct gab_platform_data *pdata = adc_bat->pdata;
-	int delay;
-
-	delay = pdata->jitter_delay ? pdata->jitter_delay : JITTER_DEFAULT;
 
 	/* Schedule timer to check current status */
 	schedule_delayed_work(&adc_bat->bat_work,
-			msecs_to_jiffies(delay));
+			      msecs_to_jiffies(JITTER_DEFAULT));
+
 	return 0;
 }
 
diff --git a/include/linux/power/generic-adc-battery.h b/include/linux/power/generic-adc-battery.h
index c68cbf34cd34..50eb4bf28286 100644
--- a/include/linux/power/generic-adc-battery.h
+++ b/include/linux/power/generic-adc-battery.h
@@ -11,13 +11,10 @@
  * @battery_info:         recommended structure to specify static power supply
  *			   parameters
  * @cal_charge:           calculate charge level.
- * @jitter_delay:         delay required after the interrupt to check battery
- *			  status.Default set is 10ms.
  */
 struct gab_platform_data {
 	struct power_supply_info battery_info;
 	int	(*cal_charge)(long value);
-	int     jitter_delay;
 };
 
 #endif /* GENERIC_ADC_BATTERY_H */
-- 
cgit v1.2.3


From 3b6fd262bfcd696aa98af99d71dcf952f289cbee Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sre@kernel.org>
Date: Fri, 17 Mar 2023 23:56:59 +0100
Subject: power: supply: generic-adc-battery: drop charge now support

Drop CHARGE_NOW support, which requires a platform specific
calculation method.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Matti Vaittinen <mazziesaccount@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/generic-adc-battery.c | 4 ----
 include/linux/power/generic-adc-battery.h  | 2 --
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/generic-adc-battery.c b/drivers/power/supply/generic-adc-battery.c
index e20894460d7f..d07eeb7d46d3 100644
--- a/drivers/power/supply/generic-adc-battery.c
+++ b/drivers/power/supply/generic-adc-battery.c
@@ -72,7 +72,6 @@ static const enum power_supply_property gab_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
 	POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN,
-	POWER_SUPPLY_PROP_CHARGE_NOW,
 	POWER_SUPPLY_PROP_VOLTAGE_NOW,
 	POWER_SUPPLY_PROP_CURRENT_NOW,
 	POWER_SUPPLY_PROP_TECHNOLOGY,
@@ -166,9 +165,6 @@ static int gab_get_property(struct power_supply *psy,
 	case POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN:
 		val->intval = 0;
 		break;
-	case POWER_SUPPLY_PROP_CHARGE_NOW:
-		val->intval = pdata->cal_charge(result);
-		break;
 	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
 	case POWER_SUPPLY_PROP_CURRENT_NOW:
 	case POWER_SUPPLY_PROP_POWER_NOW:
diff --git a/include/linux/power/generic-adc-battery.h b/include/linux/power/generic-adc-battery.h
index 50eb4bf28286..54434e4304d3 100644
--- a/include/linux/power/generic-adc-battery.h
+++ b/include/linux/power/generic-adc-battery.h
@@ -10,11 +10,9 @@
  * struct gab_platform_data - platform_data for generic adc iio battery driver.
  * @battery_info:         recommended structure to specify static power supply
  *			   parameters
- * @cal_charge:           calculate charge level.
  */
 struct gab_platform_data {
 	struct power_supply_info battery_info;
-	int	(*cal_charge)(long value);
 };
 
 #endif /* GENERIC_ADC_BATTERY_H */
-- 
cgit v1.2.3


From 1b27bf793fd46219c882b8e545ec736cfadc0841 Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sre@kernel.org>
Date: Fri, 17 Mar 2023 23:57:01 +0100
Subject: power: supply: generic-adc-battery: use simple-battery API

Constant battery data is available through power-supply's simple-battery
API. This works automatically, so the manual handling can be removed
without loosing any feature :)

Note, that the POWER_SUPPLY_STATUS_FULL check for the level variable can
be dropped, since the variable is never written. It can be re-introduced
properly once the driver gets functionality to calculate the current
charge level. Apart from that the check must be done fuzzy anyways,
since charge estimation usually is not precise enough to always return
exactly the full charge capacity for a full battery.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Matti Vaittinen <mazziesaccount@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/generic-adc-battery.c | 60 ++----------------------------
 include/linux/power/generic-adc-battery.h  | 18 ---------
 2 files changed, 4 insertions(+), 74 deletions(-)
 delete mode 100644 include/linux/power/generic-adc-battery.h

(limited to 'include')

diff --git a/drivers/power/supply/generic-adc-battery.c b/drivers/power/supply/generic-adc-battery.c
index 771e5cfc49c3..42765cbf7518 100644
--- a/drivers/power/supply/generic-adc-battery.c
+++ b/drivers/power/supply/generic-adc-battery.c
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <linux/iio/consumer.h>
 #include <linux/iio/types.h>
-#include <linux/power/generic-adc-battery.h>
 #include <linux/devm-helpers.h>
 
 #define JITTER_DEFAULT 10 /* hope 10ms is enough */
@@ -48,9 +47,7 @@ struct gab {
 	struct power_supply		*psy;
 	struct power_supply_desc	psy_desc;
 	struct iio_channel	*channel[GAB_MAX_CHAN_TYPE];
-	struct gab_platform_data	*pdata;
 	struct delayed_work bat_work;
-	int	level;
 	int	status;
 	bool cable_plugged;
 	struct gpio_desc *charge_finished;
@@ -70,14 +67,6 @@ static void gab_ext_power_changed(struct power_supply *psy)
 
 static const enum power_supply_property gab_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
-	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
-	POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN,
-	POWER_SUPPLY_PROP_VOLTAGE_NOW,
-	POWER_SUPPLY_PROP_CURRENT_NOW,
-	POWER_SUPPLY_PROP_TECHNOLOGY,
-	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
-	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
-	POWER_SUPPLY_PROP_MODEL_NAME,
 };
 
 /*
@@ -97,17 +86,6 @@ static bool gab_charge_finished(struct gab *adc_bat)
 	return gpiod_get_value(adc_bat->charge_finished);
 }
 
-static int gab_get_status(struct gab *adc_bat)
-{
-	struct gab_platform_data *pdata = adc_bat->pdata;
-	struct power_supply_info *bat_info;
-
-	bat_info = &pdata->battery_info;
-	if (adc_bat->level == bat_info->charge_full_design)
-		return POWER_SUPPLY_STATUS_FULL;
-	return adc_bat->status;
-}
-
 static enum gab_chan_type gab_prop_to_chan(enum power_supply_property psp)
 {
 	switch (psp) {
@@ -144,27 +122,14 @@ static int read_channel(struct gab *adc_bat, enum power_supply_property psp,
 static int gab_get_property(struct power_supply *psy,
 		enum power_supply_property psp, union power_supply_propval *val)
 {
-	struct gab *adc_bat;
-	struct gab_platform_data *pdata;
-	struct power_supply_info *bat_info;
+	struct gab *adc_bat = to_generic_bat(psy);
 	int result = 0;
 	int ret = 0;
 
-	adc_bat = to_generic_bat(psy);
-	if (!adc_bat) {
-		dev_err(&psy->dev, "no battery infos ?!\n");
-		return -EINVAL;
-	}
-	pdata = adc_bat->pdata;
-	bat_info = &pdata->battery_info;
-
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
-		val->intval = gab_get_status(adc_bat);
-		break;
-	case POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN:
-		val->intval = 0;
-		break;
+		val->intval = adc_bat->status;
+		return 0;
 	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
 	case POWER_SUPPLY_PROP_CURRENT_NOW:
 	case POWER_SUPPLY_PROP_POWER_NOW:
@@ -173,21 +138,6 @@ static int gab_get_property(struct power_supply *psy,
 			goto err;
 		val->intval = result;
 		break;
-	case POWER_SUPPLY_PROP_TECHNOLOGY:
-		val->intval = bat_info->technology;
-		break;
-	case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
-		val->intval = bat_info->voltage_min_design;
-		break;
-	case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
-		val->intval = bat_info->voltage_max_design;
-		break;
-	case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
-		val->intval = bat_info->charge_full_design;
-		break;
-	case POWER_SUPPLY_PROP_MODEL_NAME:
-		val->strval = bat_info->name;
-		break;
 	default:
 		return -EINVAL;
 	}
@@ -235,7 +185,6 @@ static int gab_probe(struct platform_device *pdev)
 	struct gab *adc_bat;
 	struct power_supply_desc *psy_desc;
 	struct power_supply_config psy_cfg = {};
-	struct gab_platform_data *pdata = pdev->dev.platform_data;
 	enum power_supply_property *properties;
 	int ret = 0;
 	int chan;
@@ -248,7 +197,7 @@ static int gab_probe(struct platform_device *pdev)
 
 	psy_cfg.drv_data = adc_bat;
 	psy_desc = &adc_bat->psy_desc;
-	psy_desc->name = pdata->battery_info.name;
+	psy_desc->name = dev_name(&pdev->dev);
 
 	/* bootup default values for the battery */
 	adc_bat->cable_plugged = false;
@@ -256,7 +205,6 @@ static int gab_probe(struct platform_device *pdev)
 	psy_desc->type = POWER_SUPPLY_TYPE_BATTERY;
 	psy_desc->get_property = gab_get_property;
 	psy_desc->external_power_changed = gab_ext_power_changed;
-	adc_bat->pdata = pdata;
 
 	/*
 	 * copying the static properties and allocating extra memory for holding
diff --git a/include/linux/power/generic-adc-battery.h b/include/linux/power/generic-adc-battery.h
deleted file mode 100644
index 54434e4304d3..000000000000
--- a/include/linux/power/generic-adc-battery.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012, Anish Kumar <anish198519851985@gmail.com>
- */
-
-#ifndef GENERIC_ADC_BATTERY_H
-#define GENERIC_ADC_BATTERY_H
-
-/**
- * struct gab_platform_data - platform_data for generic adc iio battery driver.
- * @battery_info:         recommended structure to specify static power supply
- *			   parameters
- */
-struct gab_platform_data {
-	struct power_supply_info battery_info;
-};
-
-#endif /* GENERIC_ADC_BATTERY_H */
-- 
cgit v1.2.3


From 447286ebadaafa551550704ff0b42eb08b1d1cb2 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank.li@vivo.com>
Date: Thu, 16 Feb 2023 21:53:24 +0800
Subject: f2fs: convert to use bitmap API

Let's use BIT() and GENMASK() instead of open it.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c    |  2 +-
 fs/f2fs/compress.c      |  4 ++--
 fs/f2fs/data.c          | 12 ++++++------
 fs/f2fs/dir.c           |  2 +-
 fs/f2fs/f2fs.h          | 26 +++++++++++++-------------
 fs/f2fs/file.c          |  2 +-
 fs/f2fs/inode.c         |  4 ++--
 fs/f2fs/node.h          | 20 +++++++++-----------
 fs/f2fs/super.c         | 16 ++++++++--------
 fs/f2fs/sysfs.c         |  2 +-
 include/linux/f2fs_fs.h |  9 ++++-----
 11 files changed, 48 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 57e8ca187dc7..1e0164cde23d 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -982,7 +982,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
 
 	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
 	if (cur_page == cp2)
-		cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+		cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));
 
 	for (i = 1; i < cp_blks; i++) {
 		void *sit_bitmap_ptr;
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index b40dec3d7f79..93fec1d37899 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -670,7 +670,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
 
 	cc->cbuf->clen = cpu_to_le32(cc->clen);
 
-	if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)
+	if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))
 		chksum = f2fs_crc32(F2FS_I_SB(cc->inode),
 					cc->cbuf->cdata, cc->clen);
 	cc->cbuf->chksum = cpu_to_le32(chksum);
@@ -761,7 +761,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
 
 	ret = cops->decompress_pages(dic);
 
-	if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) {
+	if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) {
 		u32 provided = le32_to_cpu(dic->cbuf->chksum);
 		u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen);
 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 06b552a0aba2..bf51e6e4eb64 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -93,17 +93,17 @@ static enum count_type __read_io_type(struct page *page)
 /* postprocessing steps for read bios */
 enum bio_post_read_step {
 #ifdef CONFIG_FS_ENCRYPTION
-	STEP_DECRYPT	= 1 << 0,
+	STEP_DECRYPT	= BIT(0),
 #else
 	STEP_DECRYPT	= 0,	/* compile out the decryption-related code */
 #endif
 #ifdef CONFIG_F2FS_FS_COMPRESSION
-	STEP_DECOMPRESS	= 1 << 1,
+	STEP_DECOMPRESS	= BIT(1),
 #else
 	STEP_DECOMPRESS	= 0,	/* compile out the decompression-related code */
 #endif
 #ifdef CONFIG_FS_VERITY
-	STEP_VERITY	= 1 << 2,
+	STEP_VERITY	= BIT(2),
 #else
 	STEP_VERITY	= 0,	/* compile out the verity-related code */
 #endif
@@ -420,7 +420,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
 
 static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
 {
-	unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
+	unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0);
 	unsigned int fua_flag, meta_flag, io_flag;
 	blk_opf_t op_flags = 0;
 
@@ -442,9 +442,9 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
 	 *    5 |    4 |   3 |    2 |    1 |   0 |
 	 * Cold | Warm | Hot | Cold | Warm | Hot |
 	 */
-	if ((1 << fio->temp) & meta_flag)
+	if (BIT(fio->temp) & meta_flag)
 		op_flags |= REQ_META;
-	if ((1 << fio->temp) & fua_flag)
+	if (BIT(fio->temp) & fua_flag)
 		op_flags |= REQ_FUA;
 	return op_flags;
 }
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9ccdbe120425..73c338db5808 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode)
 static unsigned int dir_buckets(unsigned int level, int dir_level)
 {
 	if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
-		return 1 << (level + dir_level);
+		return BIT(level + dir_level);
 	else
 		return MAX_DIR_BUCKETS;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3c95c8b75a23..9c3ddebd28e3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -65,7 +65,7 @@ enum {
 };
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-#define F2FS_ALL_FAULT_TYPE		((1 << FAULT_MAX) - 1)
+#define F2FS_ALL_FAULT_TYPE		(GENMASK(FAULT_MAX - 1, 0))
 
 struct f2fs_fault_info {
 	atomic_t inject_ops;
@@ -74,7 +74,7 @@ struct f2fs_fault_info {
 };
 
 extern const char *f2fs_fault_name[FAULT_MAX];
-#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
 #endif
 
 /*
@@ -1436,7 +1436,7 @@ static inline void set_page_private_##name(struct page *page) \
 static inline void clear_page_private_##name(struct page *page) \
 { \
 	clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
-	if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \
+	if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \
 		set_page_private(page, 0); \
 		if (PagePrivate(page)) { \
 			ClearPagePrivate(page); \
@@ -1482,8 +1482,8 @@ static inline void set_page_private_data(struct page *page, unsigned long data)
 
 static inline void clear_page_private_data(struct page *page)
 {
-	page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1;
-	if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) {
+	page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0);
+	if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) {
 		set_page_private(page, 0);
 		if (PagePrivate(page)) {
 			ClearPagePrivate(page);
@@ -2892,7 +2892,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
 	int mask;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	return mask & *addr;
 }
 
@@ -2901,7 +2901,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr)
 	int mask;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	*addr |= mask;
 }
 
@@ -2910,7 +2910,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr)
 	int mask;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	*addr &= ~mask;
 }
 
@@ -2920,7 +2920,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
 	int ret;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	ret = mask & *addr;
 	*addr |= mask;
 	return ret;
@@ -2932,7 +2932,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr)
 	int ret;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	ret = mask & *addr;
 	*addr &= ~mask;
 	return ret;
@@ -2943,7 +2943,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 	int mask;
 
 	addr += (nr >> 3);
-	mask = 1 << (7 - (nr & 0x07));
+	mask = BIT(7 - (nr & 0x07));
 	*addr ^= mask;
 }
 
@@ -4353,9 +4353,9 @@ static inline int set_compress_context(struct inode *inode)
 			F2FS_OPTION(sbi).compress_log_size;
 	F2FS_I(inode)->i_compress_flag =
 			F2FS_OPTION(sbi).compress_chksum ?
-				1 << COMPRESS_CHKSUM : 0;
+				BIT(COMPRESS_CHKSUM) : 0;
 	F2FS_I(inode)->i_cluster_size =
-			1 << F2FS_I(inode)->i_log_cluster_size;
+			BIT(F2FS_I(inode)->i_log_cluster_size);
 	if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
 		F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
 			F2FS_OPTION(sbi).compress_level)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 15dabeac4690..12d55023f71b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3964,7 +3964,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
 
 	F2FS_I(inode)->i_compress_algorithm = option.algorithm;
 	F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
-	F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size;
+	F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
 	f2fs_mark_inode_dirty_sync(inode, true);
 
 	if (!f2fs_is_compress_backend_ready(inode))
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 7d2e2c0dba65..bb5b365a195d 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -454,8 +454,8 @@ static int do_read_inode(struct inode *inode)
 			fi->i_compress_level = compress_flag >>
 						COMPRESS_LEVEL_OFFSET;
 			fi->i_compress_flag = compress_flag &
-					(BIT(COMPRESS_LEVEL_OFFSET) - 1);
-			fi->i_cluster_size = 1 << fi->i_log_cluster_size;
+					GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0);
+			fi->i_cluster_size = BIT(fi->i_log_cluster_size);
 			set_inode_flag(inode, FI_COMPRESSED_FILE);
 		}
 	}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 99454d46a939..906fb67a99da 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst,
 static inline void set_nat_flag(struct nat_entry *ne,
 				unsigned int type, bool set)
 {
-	unsigned char mask = 0x01 << type;
 	if (set)
-		ne->ni.flag |= mask;
+		ne->ni.flag |= BIT(type);
 	else
-		ne->ni.flag &= ~mask;
+		ne->ni.flag &= ~BIT(type);
 }
 
 static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
 {
-	unsigned char mask = 0x01 << type;
-	return ne->ni.flag & mask;
+	return ne->ni.flag & BIT(type);
 }
 
 static inline void nat_reset_flag(struct nat_entry *ne)
@@ -225,7 +223,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
 	block_addr -= nm_i->nat_blkaddr;
-	block_addr ^= 1 << sbi->log_blocks_per_seg;
+	block_addr ^= BIT(sbi->log_blocks_per_seg);
 	return block_addr + nm_i->nat_blkaddr;
 }
 
@@ -395,7 +393,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
 static inline int is_node(struct page *page, int type)
 {
 	struct f2fs_node *rn = F2FS_NODE(page);
-	return le32_to_cpu(rn->footer.flag) & (1 << type);
+	return le32_to_cpu(rn->footer.flag) & BIT(type);
 }
 
 #define is_cold_node(page)	is_node(page, COLD_BIT_SHIFT)
@@ -408,9 +406,9 @@ static inline void set_cold_node(struct page *page, bool is_dir)
 	unsigned int flag = le32_to_cpu(rn->footer.flag);
 
 	if (is_dir)
-		flag &= ~(0x1 << COLD_BIT_SHIFT);
+		flag &= ~BIT(COLD_BIT_SHIFT);
 	else
-		flag |= (0x1 << COLD_BIT_SHIFT);
+		flag |= BIT(COLD_BIT_SHIFT);
 	rn->footer.flag = cpu_to_le32(flag);
 }
 
@@ -419,9 +417,9 @@ static inline void set_mark(struct page *page, int mark, int type)
 	struct f2fs_node *rn = F2FS_NODE(page);
 	unsigned int flag = le32_to_cpu(rn->footer.flag);
 	if (mark)
-		flag |= (0x1 << type);
+		flag |= BIT(type);
 	else
-		flag &= ~(0x1 << type);
+		flag &= ~BIT(type);
 	rn->footer.flag = cpu_to_le32(flag);
 
 #ifdef CONFIG_F2FS_CHECK_FS
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index fbaaabbcd6de..9c87d91df61b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -880,8 +880,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
 			if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
-				f2fs_warn(sbi, "Not support %d, larger than %d",
-					  1 << arg, BIO_MAX_VECS);
+				f2fs_warn(sbi, "Not support %ld, larger than %d",
+					BIT(arg), BIO_MAX_VECS);
 				return -EINVAL;
 			}
 			F2FS_OPTION(sbi).write_io_size_bits = arg;
@@ -1310,7 +1310,7 @@ default_check:
 #endif
 
 	if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
-		f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
+		f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO",
 			 F2FS_IO_SIZE_KB(sbi));
 		return -EINVAL;
 	}
@@ -3348,7 +3348,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 	total_sections = le32_to_cpu(raw_super->section_count);
 
 	/* blocks_per_seg should be 512, given the above check */
-	blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg);
+	blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg));
 
 	if (segment_count > F2FS_MAX_SEGMENT ||
 				segment_count < F2FS_MIN_SEGMENTS) {
@@ -3617,9 +3617,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	sbi->log_sectors_per_block =
 		le32_to_cpu(raw_super->log_sectors_per_block);
 	sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
-	sbi->blocksize = 1 << sbi->log_blocksize;
+	sbi->blocksize = BIT(sbi->log_blocksize);
 	sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
-	sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+	sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg);
 	sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
 	sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
 	sbi->total_sections = le32_to_cpu(raw_super->section_count);
@@ -3875,7 +3875,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason)
 
 	f2fs_down_write(&sbi->sb_lock);
 
-	if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1))
+	if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0))
 		raw_super->s_stop_reason[reason]++;
 
 	err = f2fs_commit_super(sbi, false);
@@ -4025,7 +4025,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 			  FDEV(i).start_blk, FDEV(i).end_blk);
 	}
 	f2fs_info(sbi,
-		  "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
+		  "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi));
 	return 0;
 }
 
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 5397bd8bfcf7..9ddc6ee19433 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -452,7 +452,7 @@ out:
 	if (ret < 0)
 		return ret;
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-	if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+	if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX))
 		return -EINVAL;
 	if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
 		return -EINVAL;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 1701f25117ea..881eb9321967 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -40,9 +40,8 @@
 
 #define F2FS_ENC_UTF8_12_1	1
 
-#define F2FS_IO_SIZE(sbi)	(1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */
-#define F2FS_IO_SIZE_KB(sbi)	(1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */
-#define F2FS_IO_SIZE_BYTES(sbi)	(1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */
+#define F2FS_IO_SIZE(sbi)	BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */
+#define F2FS_IO_SIZE_KB(sbi)	BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */
 #define F2FS_IO_SIZE_BITS(sbi)	(F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */
 #define F2FS_IO_SIZE_MASK(sbi)	(F2FS_IO_SIZE(sbi) - 1)
 #define F2FS_IO_ALIGNED(sbi)	(F2FS_IO_SIZE(sbi) > 1)
@@ -340,7 +339,7 @@ enum {
 	OFFSET_BIT_SHIFT
 };
 
-#define OFFSET_BIT_MASK		(0x07)	/* (0x01 << OFFSET_BIT_SHIFT) - 1 */
+#define OFFSET_BIT_MASK		GENMASK(OFFSET_BIT_SHIFT - 1, 0)
 
 struct node_footer {
 	__le32 nid;		/* node id */
@@ -545,7 +544,7 @@ typedef __le32	f2fs_hash_t;
 #define MAX_DIR_HASH_DEPTH	63
 
 /* MAX buckets in one level of dir */
-#define MAX_DIR_BUCKETS		(1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
+#define MAX_DIR_BUCKETS		BIT((MAX_DIR_HASH_DEPTH / 2) - 1)
 
 /*
  * space utilization of regular dentry and inline dentry (w/o extra reservation)
-- 
cgit v1.2.3


From 7232282dd47cce6a780c9414bd9baccf232c7686 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 27 Mar 2023 13:53:31 +0200
Subject: kunit: increase KUNIT_LOG_SIZE to 2048 bytes

The s390 specific test_unwind kunit test has 39 parameterized tests. The
results in debugfs are truncated since the full log doesn't fit into 1500
bytes.
Therefore increase KUNIT_LOG_SIZE to 2048 bytes in a similar way like it
was done recently with commit "kunit: fix bug in debugfs logs of
parameterized tests". With that the whole test result is present.

Reported-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Rae Moar <rmoar@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 9721584027d8..57b309c6ca27 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -34,7 +34,7 @@ DECLARE_STATIC_KEY_FALSE(kunit_running);
 struct kunit;
 
 /* Size of log associated with test. */
-#define KUNIT_LOG_SIZE 1500
+#define KUNIT_LOG_SIZE 2048
 
 /* Maximum size of parameter description string. */
 #define KUNIT_PARAM_DESC_SIZE 128
-- 
cgit v1.2.3


From 4f704d9a8352f5c0a8fcdb6213b934630342bd44 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 14 Mar 2023 12:51:10 +0100
Subject: nfs: use vfs setgid helper

We've aligned setgid behavior over multiple kernel releases. The details
can be found in the following two merge messages:
cf619f891971 ("Merge tag 'fs.ovl.setgid.v6.2')
426b4ca2d6a5 ("Merge tag 'fs.setgid.v6.0')
Consistent setgid stripping behavior is now encapsulated in the
setattr_should_drop_sgid() helper which is used by all filesystems that
strip setgid bits outside of vfs proper. Switch nfs to rely on this
helper as well. Without this patch the setgid stripping tests in
xfstests will fail.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Message-Id: <20230313-fs-nfs-setgid-v2-1-9a59f436cfc0@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/attr.c          | 1 +
 fs/internal.h      | 2 --
 fs/nfs/inode.c     | 4 +---
 include/linux/fs.h | 2 ++
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index aca9ff7aed33..d60dc1edb526 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -47,6 +47,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap,
 		return ATTR_KILL_SGID;
 	return 0;
 }
+EXPORT_SYMBOL(setattr_should_drop_sgid);
 
 /**
  * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
diff --git a/fs/internal.h b/fs/internal.h
index dc4eb91a577a..ab36ed8fa41c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -259,8 +259,6 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 /*
  * fs/attr.c
  */
-int setattr_should_drop_sgid(struct mnt_idmap *idmap,
-			     const struct inode *inode);
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 222a28320e1c..97a76706fd54 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -717,9 +717,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
 		if ((attr->ia_valid & ATTR_KILL_SUID) != 0 &&
 		    inode->i_mode & S_ISUID)
 			inode->i_mode &= ~S_ISUID;
-		if ((attr->ia_valid & ATTR_KILL_SGID) != 0 &&
-		    (inode->i_mode & (S_ISGID | S_IXGRP)) ==
-		     (S_ISGID | S_IXGRP))
+		if (setattr_should_drop_sgid(&nop_mnt_idmap, inode))
 			inode->i_mode &= ~S_ISGID;
 		if ((attr->ia_valid & ATTR_MODE) != 0) {
 			int mode = attr->ia_mode & S_IALLUGO;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..af95b64fc810 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2675,6 +2675,8 @@ extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
 extern int file_remove_privs(struct file *);
+int setattr_should_drop_sgid(struct mnt_idmap *idmap,
+			     const struct inode *inode);
 
 /*
  * This must be used for allocating filesystems specific inodes to set
-- 
cgit v1.2.3


From 9a8aac92eba90b3b7c71d0531db535f5588388f5 Mon Sep 17 00:00:00 2001
From: Kieran Frewen <kieran.frewen@morsemicro.com>
Date: Fri, 24 Feb 2023 10:29:17 +1300
Subject: wifi: nl80211: support advertising S1G capabilities

Include S1G capabilities in netlink band info messages.

Signed-off-by: Kieran Frewen <kieran.frewen@morsemicro.com>
Co-developed-by: Gilad Itzkovitch <gilad.itzkovitch@morsemicro.com>
Signed-off-by: Gilad Itzkovitch <gilad.itzkovitch@morsemicro.com>
Link: https://lore.kernel.org/r/20230223212917.4010246-1-gilad.itzkovitch@virscient.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  7 +++++++
 net/wireless/nl80211.c       | 10 ++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index cf4fb981e131..c59fec406da5 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4061,6 +4061,10 @@ enum nl80211_band_iftype_attr {
  * @NL80211_BAND_ATTR_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes
  *	the allowed channel bandwidth configurations.
  *	Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13.
+ * @NL80211_BAND_ATTR_S1G_MCS_NSS_SET: S1G capabilities, supported S1G-MCS and NSS
+ *	set subfield, as in the S1G information IE, 5 bytes
+ * @NL80211_BAND_ATTR_S1G_CAPA: S1G capabilities information subfield as in the
+ *	S1G information IE, 10 bytes
  * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined
  * @__NL80211_BAND_ATTR_AFTER_LAST: internal use
  */
@@ -4081,6 +4085,9 @@ enum nl80211_band_attr {
 	NL80211_BAND_ATTR_EDMG_CHANNELS,
 	NL80211_BAND_ATTR_EDMG_BW_CONFIG,
 
+	NL80211_BAND_ATTR_S1G_MCS_NSS_SET,
+	NL80211_BAND_ATTR_S1G_CAPA,
+
 	/* keep last */
 	__NL80211_BAND_ATTR_AFTER_LAST,
 	NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f1cd3d9130dd..2c9edb015652 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1961,6 +1961,16 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 
 	nla_nest_end(msg, nl_rates);
 
+	/* S1G capabilities */
+	if (sband->band == NL80211_BAND_S1GHZ && sband->s1g_cap.s1g &&
+	    (nla_put(msg, NL80211_BAND_ATTR_S1G_CAPA,
+		     sizeof(sband->s1g_cap.cap),
+		     sband->s1g_cap.cap) ||
+	     nla_put(msg, NL80211_BAND_ATTR_S1G_MCS_NSS_SET,
+		     sizeof(sband->s1g_cap.nss_mcs),
+		     sband->s1g_cap.nss_mcs)))
+		return -ENOBUFS;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c59647c0dc679008886756a888368da1c6d4ccd3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Mar 2023 23:50:19 +0000
Subject: net: add softnet_data.in_net_rx_action

We want to make two optimizations in napi_schedule_rps() and
____napi_schedule() which require to know if these helpers are
called from net_rx_action(), instead of being called from
other contexts.

sd.in_net_rx_action is only read/written by the owning cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 18a5be6ddd0f..c8c634091a65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3188,6 +3188,7 @@ struct softnet_data {
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
+	bool			in_net_rx_action;
 #ifdef CONFIG_NET_FLOW_LIMIT
 	struct sd_flow_limit __rcu *flow_limit;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index f7050b95d125..15331edbacf4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6646,6 +6646,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	LIST_HEAD(list);
 	LIST_HEAD(repoll);
 
+	sd->in_net_rx_action = true;
 	local_irq_disable();
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
@@ -6656,6 +6657,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		skb_defer_free_flush(sd);
 
 		if (list_empty(&list)) {
+			sd->in_net_rx_action = false;
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
 			break;
@@ -6682,6 +6684,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	list_splice(&list, &sd->poll_list);
 	if (!list_empty(&sd->poll_list))
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	else
+		sd->in_net_rx_action = false;
 
 	net_rps_action_and_irq_enable(sd);
 end:;
-- 
cgit v1.2.3


From e3d53ded577328f2b26d361f2e62fc70e85ab6a3 Mon Sep 17 00:00:00 2001
From: Jaewon Kim <jaewon02.kim@samsung.com>
Date: Mon, 6 Mar 2023 10:42:39 +0900
Subject: spi: s3c64xx: add no_cs description

This patch adds missing variable no_cs descriptions.

Signed-off-by: Jaewon Kim <jaewon02.kim@samsung.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Andi Shyti <andi@etezian.org>
Link: https://lore.kernel.org/r/20230306014239.80570-1-jaewon02.kim@samsung.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/spi-s3c64xx.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/platform_data/spi-s3c64xx.h b/include/linux/platform_data/spi-s3c64xx.h
index 5df1ace6d2c9..3101152ce449 100644
--- a/include/linux/platform_data/spi-s3c64xx.h
+++ b/include/linux/platform_data/spi-s3c64xx.h
@@ -29,6 +29,7 @@ struct s3c64xx_spi_csinfo {
  * struct s3c64xx_spi_info - SPI Controller defining structure
  * @src_clk_nr: Clock source index for the CLK_CFG[SPI_CLKSEL] field.
  * @num_cs: Number of CS this controller emulates.
+ * @no_cs: Used when CS line is not connected.
  * @cfg_gpio: Configure pins for this SPI controller.
  */
 struct s3c64xx_spi_info {
-- 
cgit v1.2.3


From f0545078e13973ca032cf131eb7580d86b78d147 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 20 Mar 2023 17:30:05 +0100
Subject: dt-bindings: clock: r8a7779: Add PWM module clock

Add the module clock used by the PWM Timers on the Renesas R-Car H1
(R8A7779) SoC.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/1397b517fccbe716a71cfae770512ed577730a25.1679329211.git.geert+renesas@glider.be
---
 include/dt-bindings/clock/r8a7779-clock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r8a7779-clock.h b/include/dt-bindings/clock/r8a7779-clock.h
index f0549234b7d8..342a60b11934 100644
--- a/include/dt-bindings/clock/r8a7779-clock.h
+++ b/include/dt-bindings/clock/r8a7779-clock.h
@@ -19,6 +19,7 @@
 #define R8A7779_CLK_OUT		7
 
 /* MSTP 0 */
+#define R8A7779_CLK_PWM		5
 #define R8A7779_CLK_HSPI	7
 #define R8A7779_CLK_TMU2	14
 #define R8A7779_CLK_TMU1	15
-- 
cgit v1.2.3


From de4f5fed3f231a8ff4790bf52975f847b95b85ea Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 29 Mar 2023 08:52:15 -0600
Subject: iov_iter: add iter_iovec() helper

This returns a pointer to the current iovec entry in the iterator. Only
useful with ITER_IOVEC right now, but it prepares us to treat ITER_UBUF
and ITER_IOVEC identically for the first segment.

Rename struct iov_iter->iov to iov_iter->__iov to find any potentially
troublesome spots, and also to prevent anyone from adding new code that
accesses iter->iov directly.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c                          |  4 +--
 drivers/infiniband/hw/hfi1/file_ops.c    |  3 +-
 drivers/infiniband/hw/qib/qib_file_ops.c |  2 +-
 drivers/net/tun.c                        |  3 +-
 drivers/vhost/scsi.c                     |  2 +-
 fs/btrfs/file.c                          | 11 +++++--
 fs/fuse/file.c                           |  2 +-
 include/linux/uio.h                      |  9 +++--
 io_uring/net.c                           |  4 +--
 io_uring/rw.c                            |  8 ++---
 lib/iov_iter.c                           | 56 +++++++++++++++++---------------
 sound/core/pcm_native.c                  | 22 ++++++++-----
 12 files changed, 73 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/block/blk-map.c b/block/blk-map.c
index 3bfcad64d67c..04c55f1c492e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -31,8 +31,8 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
 		return NULL;
 	bmd->iter = *data;
 	if (iter_is_iovec(data)) {
-		memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
-		bmd->iter.iov = bmd->iov;
+		memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs);
+		bmd->iter.__iov = bmd->iov;
 	}
 	return bmd;
 }
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index b1d6ca7e9708..3065db9d6bb9 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -287,11 +287,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 	}
 
 	while (dim) {
+		const struct iovec *iov = iter_iov(from);
 		int ret;
 		unsigned long count = 0;
 
 		ret = hfi1_user_sdma_process_request(
-			fd, (struct iovec *)(from->iov + done),
+			fd, (struct iovec *)(iov + done),
 			dim, &count);
 		if (ret) {
 			reqs = ret;
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 80fe92a21f96..4cee39337866 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -2248,7 +2248,7 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (!iter_is_iovec(from) || !from->nr_segs || !pq)
 		return -EINVAL;
 
-	return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs);
+	return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs);
 }
 
 static struct class *qib_class;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ad653b32b2f0..5df1eba7b30a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1486,7 +1486,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
 	skb->truesize += skb->data_len;
 
 	for (i = 1; i < it->nr_segs; i++) {
-		size_t fragsz = it->iov[i].iov_len;
+		const struct iovec *iov = iter_iov(it);
+		size_t fragsz = iov->iov_len;
 		struct page *page;
 		void *frag;
 
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index b244e7c0f514..042caea64007 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -671,7 +671,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
 {
 	int sgl_count = 0;
 
-	if (!iter || !iter->iov) {
+	if (!iter || !iter_iov(iter)) {
 		pr_err("%s: iter->iov is NULL, but expected bytes: %zu"
 		       " present\n", __func__, bytes);
 		return -EINVAL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5cc5a1faaef5..f649647392e0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3730,10 +3730,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
 	if (!iter_is_iovec(iter))
 		return 0;
 
-	for (seg = 0; seg < iter->nr_segs; seg++)
-		for (i = seg + 1; i < iter->nr_segs; i++)
-			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+	for (seg = 0; seg < iter->nr_segs; seg++) {
+		for (i = seg + 1; i < iter->nr_segs; i++) {
+			const struct iovec *iov1 = iter_iov(iter) + seg;
+			const struct iovec *iov2 = iter_iov(iter) + i;
+
+			if (iov1->iov_base == iov2->iov_base)
 				return -EINVAL;
+		}
+	}
 	return 0;
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index de37a3a06a71..89d97f6188e0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1419,7 +1419,7 @@ out:
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
 {
-	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+	return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
 }
 
 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27e3fd942960..4218624b7f78 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -51,7 +51,8 @@ struct iov_iter {
 	};
 	size_t count;
 	union {
-		const struct iovec *iov;
+		/* use iter_iov() to get the current vec */
+		const struct iovec *__iov;
 		const struct kvec *kvec;
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
@@ -68,6 +69,8 @@ struct iov_iter {
 	};
 };
 
+#define iter_iov(iter)	(iter)->__iov
+
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
 	return i->iter_type;
@@ -146,9 +149,9 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
 static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
 {
 	return (struct iovec) {
-		.iov_base = iter->iov->iov_base + iter->iov_offset,
+		.iov_base = iter_iov(iter)->iov_base + iter->iov_offset,
 		.iov_len = min(iter->count,
-			       iter->iov->iov_len - iter->iov_offset),
+			       iter_iov(iter)->iov_len - iter->iov_offset),
 	};
 }
 
diff --git a/io_uring/net.c b/io_uring/net.c
index 4040cf093318..89e839013837 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -184,8 +184,8 @@ static int io_setup_async_msg(struct io_kiocb *req,
 		async_msg->msg.msg_name = &async_msg->addr;
 	/* if were using fast_iov, set it to the new one */
 	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
-		size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
-		async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
+		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
+		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
 	}
 
 	return -EAGAIN;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e200..7573a34ea42a 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -503,10 +503,10 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 	if (!iovec) {
 		unsigned iov_off = 0;
 
-		io->s.iter.iov = io->s.fast_iov;
-		if (iter->iov != fast_iov) {
-			iov_off = iter->iov - fast_iov;
-			io->s.iter.iov += iov_off;
+		io->s.iter.__iov = io->s.fast_iov;
+		if (iter->__iov != fast_iov) {
+			iov_off = iter_iov(iter) - fast_iov;
+			io->s.iter.__iov += iov_off;
 		}
 		if (io->s.fast_iov != fast_iov)
 			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 274014e4eafe..87488c4aad3f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -126,13 +126,13 @@ __out:								\
 			iterate_buf(i, n, base, len, off,	\
 						i->ubuf, (I)) 	\
 		} else if (likely(iter_is_iovec(i))) {		\
-			const struct iovec *iov = i->iov;	\
+			const struct iovec *iov = iter_iov(i);	\
 			void __user *base;			\
 			size_t len;				\
 			iterate_iovec(i, n, base, len, off,	\
 						iov, (I))	\
-			i->nr_segs -= iov - i->iov;		\
-			i->iov = iov;				\
+			i->nr_segs -= iov - iter_iov(i);	\
+			i->__iov = iov;				\
 		} else if (iov_iter_is_bvec(i)) {		\
 			const struct bio_vec *bvec = i->bvec;	\
 			void *base;				\
@@ -355,7 +355,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
 		size_t skip;
 
 		size -= count;
-		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
+		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
 			size_t len = min(count, p->iov_len - skip);
 			size_t ret;
 
@@ -398,7 +398,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
 		size_t skip;
 
 		size -= count;
-		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
+		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
 			size_t len = min(count, p->iov_len - skip);
 			size_t ret;
 
@@ -425,7 +425,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 		.nofault = false,
 		.user_backed = true,
 		.data_source = direction,
-		.iov = iov,
+		.__iov = iov,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
 		.count = count
@@ -876,14 +876,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
 	i->count -= size;
 
 	size += i->iov_offset; // from beginning of current segment
-	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
+	for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
 		if (likely(size < iov->iov_len))
 			break;
 		size -= iov->iov_len;
 	}
 	i->iov_offset = size;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
+	i->nr_segs -= iov - iter_iov(i);
+	i->__iov = iov;
 }
 
 void iov_iter_advance(struct iov_iter *i, size_t size)
@@ -958,12 +958,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 			unroll -= n;
 		}
 	} else { /* same logics for iovec and kvec */
-		const struct iovec *iov = i->iov;
+		const struct iovec *iov = iter_iov(i);
 		while (1) {
 			size_t n = (--iov)->iov_len;
 			i->nr_segs++;
 			if (unroll <= n) {
-				i->iov = iov;
+				i->__iov = iov;
 				i->iov_offset = n - unroll;
 				return;
 			}
@@ -980,7 +980,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
 	if (i->nr_segs > 1) {
 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
-			return min(i->count, i->iov->iov_len - i->iov_offset);
+			return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
 		if (iov_iter_is_bvec(i))
 			return min(i->count, i->bvec->bv_len - i->iov_offset);
 	}
@@ -1095,13 +1095,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 	unsigned k;
 
 	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		size_t len = i->iov[k].iov_len - skip;
+		const struct iovec *iov = iter_iov(i) + k;
+		size_t len = iov->iov_len - skip;
 
 		if (len > size)
 			len = size;
 		if (len & len_mask)
 			return false;
-		if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
+		if ((unsigned long)(iov->iov_base + skip) & addr_mask)
 			return false;
 
 		size -= len;
@@ -1194,9 +1195,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
 	unsigned k;
 
 	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		size_t len = i->iov[k].iov_len - skip;
+		const struct iovec *iov = iter_iov(i) + k;
+		size_t len = iov->iov_len - skip;
 		if (len) {
-			res |= (unsigned long)i->iov[k].iov_base + skip;
+			res |= (unsigned long)iov->iov_base + skip;
 			if (len > size)
 				len = size;
 			res |= len;
@@ -1273,14 +1275,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 		return ~0U;
 
 	for (k = 0; k < i->nr_segs; k++) {
-		if (i->iov[k].iov_len) {
-			unsigned long base = (unsigned long)i->iov[k].iov_base;
+		const struct iovec *iov = iter_iov(i) + k;
+		if (iov->iov_len) {
+			unsigned long base = (unsigned long)iov->iov_base;
 			if (v) // if not the first one
 				res |= base | v; // this start | previous end
-			v = base + i->iov[k].iov_len;
-			if (size <= i->iov[k].iov_len)
+			v = base + iov->iov_len;
+			if (size <= iov->iov_len)
 				break;
-			size -= i->iov[k].iov_len;
+			size -= iov->iov_len;
 		}
 	}
 	return res;
@@ -1396,13 +1399,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
 		return (unsigned long)i->ubuf + i->iov_offset;
 
 	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
-		size_t len = i->iov[k].iov_len - skip;
+		const struct iovec *iov = iter_iov(i) + k;
+		size_t len = iov->iov_len - skip;
 
 		if (unlikely(!len))
 			continue;
 		if (*size > len)
 			*size = len;
-		return (unsigned long)i->iov[k].iov_base + skip;
+		return (unsigned long)iov->iov_base + skip;
 	}
 	BUG(); // if it had been empty, we wouldn't get called
 }
@@ -1614,7 +1618,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages)
 	const struct iovec *p;
 	int npages = 0;
 
-	for (p = i->iov; size; skip = 0, p++) {
+	for (p = iter_iov(i); size; skip = 0, p++) {
 		unsigned offs = offset_in_page(p->iov_base + skip);
 		size_t len = min(p->iov_len - skip, size);
 
@@ -1691,7 +1695,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 				    flags);
 	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
 		/* iovec and kvec have identical layout */
-		return new->iov = kmemdup(new->iov,
+		return new->__iov = kmemdup(new->__iov,
 				   new->nr_segs * sizeof(struct iovec),
 				   flags);
 	return NULL;
@@ -1918,7 +1922,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 	if (iov_iter_is_bvec(i))
 		i->bvec -= state->nr_segs - i->nr_segs;
 	else
-		i->iov -= state->nr_segs - i->nr_segs;
+		i->__iov -= state->nr_segs - i->nr_segs;
 	i->nr_segs = state->nr_segs;
 }
 
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 331380c2438b..8bb97ee6720d 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3521,6 +3521,7 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to)
 	unsigned long i;
 	void __user **bufs;
 	snd_pcm_uframes_t frames;
+	const struct iovec *iov = iter_iov(to);
 
 	pcm_file = iocb->ki_filp->private_data;
 	substream = pcm_file->substream;
@@ -3534,14 +3535,16 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to)
 		return -EINVAL;
 	if (to->nr_segs > 1024 || to->nr_segs != runtime->channels)
 		return -EINVAL;
-	if (!frame_aligned(runtime, to->iov->iov_len))
+	if (!frame_aligned(runtime, iov->iov_len))
 		return -EINVAL;
-	frames = bytes_to_samples(runtime, to->iov->iov_len);
+	frames = bytes_to_samples(runtime, iov->iov_len);
 	bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL);
 	if (bufs == NULL)
 		return -ENOMEM;
-	for (i = 0; i < to->nr_segs; ++i)
-		bufs[i] = to->iov[i].iov_base;
+	for (i = 0; i < to->nr_segs; ++i) {
+		bufs[i] = iov->iov_base;
+		iov++;
+	}
 	result = snd_pcm_lib_readv(substream, bufs, frames);
 	if (result > 0)
 		result = frames_to_bytes(runtime, result);
@@ -3558,6 +3561,7 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from)
 	unsigned long i;
 	void __user **bufs;
 	snd_pcm_uframes_t frames;
+	const struct iovec *iov = iter_iov(from);
 
 	pcm_file = iocb->ki_filp->private_data;
 	substream = pcm_file->substream;
@@ -3570,14 +3574,16 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from)
 	if (!iter_is_iovec(from))
 		return -EINVAL;
 	if (from->nr_segs > 128 || from->nr_segs != runtime->channels ||
-	    !frame_aligned(runtime, from->iov->iov_len))
+	    !frame_aligned(runtime, iov->iov_len))
 		return -EINVAL;
-	frames = bytes_to_samples(runtime, from->iov->iov_len);
+	frames = bytes_to_samples(runtime, iov->iov_len);
 	bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL);
 	if (bufs == NULL)
 		return -ENOMEM;
-	for (i = 0; i < from->nr_segs; ++i)
-		bufs[i] = from->iov[i].iov_base;
+	for (i = 0; i < from->nr_segs; ++i) {
+		bufs[i] = iov->iov_base;
+		iov++;
+	}
 	result = snd_pcm_lib_writev(substream, bufs, frames);
 	if (result > 0)
 		result = frames_to_bytes(runtime, result);
-- 
cgit v1.2.3


From 95e49cf8373a0a4d1ec85f0512080bb4f945df74 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 29 Mar 2023 09:16:45 -0600
Subject: iov_iter: add iter_iov_addr() and iter_iov_len() helpers

These just return the address and length of the current iovec segment
in the iterator. Convert existing iov_iter_iovec() users to use them
instead of getting a copy of the current vec.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/read_write.c     | 11 +++++------
 include/linux/uio.h |  2 ++
 io_uring/rw.c       | 27 +++++++++++++--------------
 mm/madvise.c        |  9 ++++-----
 4 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/read_write.c b/fs/read_write.c
index 7a2ff6157eda..a21ba3be7dbe 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 		return -EOPNOTSUPP;
 
 	while (iov_iter_count(iter)) {
-		struct iovec iovec = iov_iter_iovec(iter);
 		ssize_t nr;
 
 		if (type == READ) {
-			nr = filp->f_op->read(filp, iovec.iov_base,
-					      iovec.iov_len, ppos);
+			nr = filp->f_op->read(filp, iter_iov_addr(iter),
+						iter_iov_len(iter), ppos);
 		} else {
-			nr = filp->f_op->write(filp, iovec.iov_base,
-					       iovec.iov_len, ppos);
+			nr = filp->f_op->write(filp, iter_iov_addr(iter),
+						iter_iov_len(iter), ppos);
 		}
 
 		if (nr < 0) {
@@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 			break;
 		}
 		ret += nr;
-		if (nr != iovec.iov_len)
+		if (nr != iter_iov_len(iter))
 			break;
 		iov_iter_advance(iter, nr);
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 4218624b7f78..b7fce87b720e 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -70,6 +70,8 @@ struct iov_iter {
 };
 
 #define iter_iov(iter)	(iter)->__iov
+#define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
+#define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 7573a34ea42a..f33ba6f28247 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -447,26 +447,25 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 	ppos = io_kiocb_ppos(kiocb);
 
 	while (iov_iter_count(iter)) {
-		struct iovec iovec;
+		void __user *addr;
+		size_t len;
 		ssize_t nr;
 
 		if (iter_is_ubuf(iter)) {
-			iovec.iov_base = iter->ubuf + iter->iov_offset;
-			iovec.iov_len = iov_iter_count(iter);
+			addr = iter->ubuf + iter->iov_offset;
+			len = iov_iter_count(iter);
 		} else if (!iov_iter_is_bvec(iter)) {
-			iovec = iov_iter_iovec(iter);
+			addr = iter_iov_addr(iter);
+			len = iter_iov_len(iter);
 		} else {
-			iovec.iov_base = u64_to_user_ptr(rw->addr);
-			iovec.iov_len = rw->len;
+			addr = u64_to_user_ptr(rw->addr);
+			len = rw->len;
 		}
 
-		if (ddir == READ) {
-			nr = file->f_op->read(file, iovec.iov_base,
-					      iovec.iov_len, ppos);
-		} else {
-			nr = file->f_op->write(file, iovec.iov_base,
-					       iovec.iov_len, ppos);
-		}
+		if (ddir == READ)
+			nr = file->f_op->read(file, addr, len, ppos);
+		else
+			nr = file->f_op->write(file, addr, len, ppos);
 
 		if (nr < 0) {
 			if (!ret)
@@ -482,7 +481,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 			if (!rw->len)
 				break;
 		}
-		if (nr != iovec.iov_len)
+		if (nr != len)
 			break;
 	}
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 340125d08c03..9f389c5304d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1456,7 +1456,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		size_t, vlen, int, behavior, unsigned int, flags)
 {
 	ssize_t ret;
-	struct iovec iovstack[UIO_FASTIOV], iovec;
+	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
 	struct task_struct *task;
@@ -1503,12 +1503,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	total_len = iov_iter_count(&iter);
 
 	while (iov_iter_count(&iter)) {
-		iovec = iov_iter_iovec(&iter);
-		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-					iovec.iov_len, behavior);
+		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
+					iter_iov_len(&iter), behavior);
 		if (ret < 0)
 			break;
-		iov_iter_advance(&iter, iovec.iov_len);
+		iov_iter_advance(&iter, iter_iov_len(&iter));
 	}
 
 	ret = (total_len - iov_iter_count(&iter)) ? : ret;
-- 
cgit v1.2.3


From 6eb203e1a868187cbc23ae3bad443dc929ca6cca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 29 Mar 2023 09:18:26 -0600
Subject: iov_iter: remove iov_iter_iovec()

No more users are left of this function.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index b7fce87b720e..7f585ceedcb2 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -148,15 +148,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
 	return ret;
 }
 
-static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
-{
-	return (struct iovec) {
-		.iov_base = iter_iov(iter)->iov_base + iter->iov_offset,
-		.iov_len = min(iter->count,
-			       iter_iov(iter)->iov_len - iter->iov_offset),
-	};
-}
-
 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
 				  size_t bytes, struct iov_iter *i);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-- 
cgit v1.2.3


From cd0bd57a9de59019fe99e9305a2337a66a4f9d39 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 28 Mar 2023 14:29:03 -0600
Subject: iov_iter: set nr_segs = 1 for ITER_UBUF

To avoid needing to check if a given user backed iov_iter is of type
ITER_IOVEC or ITER_UBUF, set the number of segments for the ITER_UBUF
case to 1 as we're carrying a single segment.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 7f585ceedcb2..5dbd2dcab35c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -355,7 +355,8 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
-		.count = count
+		.count = count,
+		.nr_segs = 1
 	};
 }
 /* Flags for iov_iter_get/extract_pages*() */
-- 
cgit v1.2.3


From 747b1f65d39ae729b7914075899b0c82d7f667db Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 28 Mar 2023 14:21:06 -0600
Subject: iov_iter: overlay struct iovec and ubuf/len

Add an internal struct iovec that we can return as a pointer, with the
fields of the iovec overlapping with the ITER_UBUF ubuf and length
fields.

Then we can have iter_iov() check for the appropriate type, and return
&iter->__ubuf_iovec for ITER_UBUF and iter->__iov for ITER_IOVEC and
things will magically work out for a single segment request regardless
of either type.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5dbd2dcab35c..ed35f4427a0a 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -49,15 +49,35 @@ struct iov_iter {
 		size_t iov_offset;
 		int last_offset;
 	};
-	size_t count;
+	/*
+	 * Hack alert: overlay ubuf_iovec with iovec + count, so
+	 * that the members resolve correctly regardless of the type
+	 * of iterator used. This means that you can use:
+	 *
+	 * &iter->__ubuf_iovec or iter->__iov
+	 *
+	 * interchangably for the user_backed cases, hence simplifying
+	 * some of the cases that need to deal with both.
+	 */
 	union {
-		/* use iter_iov() to get the current vec */
-		const struct iovec *__iov;
-		const struct kvec *kvec;
-		const struct bio_vec *bvec;
-		struct xarray *xarray;
-		struct pipe_inode_info *pipe;
-		void __user *ubuf;
+		/*
+		 * This really should be a const, but we cannot do that without
+		 * also modifying any of the zero-filling iter init functions.
+		 * Leave it non-const for now, but it should be treated as such.
+		 */
+		struct iovec __ubuf_iovec;
+		struct {
+			union {
+				/* use iter_iov() to get the current vec */
+				const struct iovec *__iov;
+				const struct kvec *kvec;
+				const struct bio_vec *bvec;
+				struct xarray *xarray;
+				struct pipe_inode_info *pipe;
+				void __user *ubuf;
+			};
+			size_t count;
+		};
 	};
 	union {
 		unsigned long nr_segs;
@@ -69,7 +89,13 @@ struct iov_iter {
 	};
 };
 
-#define iter_iov(iter)	(iter)->__iov
+static inline const struct iovec *iter_iov(const struct iov_iter *iter)
+{
+	if (iter->iter_type == ITER_UBUF)
+		return (const struct iovec *) &iter->__ubuf_iovec;
+	return iter->__iov;
+}
+
 #define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
 #define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)
 
-- 
cgit v1.2.3


From 665d30119af9ca557fc8811ea1c0e8609c165c8a Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 30 Mar 2023 15:28:47 +0200
Subject: ALSA: ac97: Define dummy functions for snd_ac97_suspend() and
 resume()

For allowing the build without CONFIG_PM, add definitions of dummy
functions for snd_ac97_suspend() and snd_ac97_resume() without
CONFIG_PM, too.

Link: https://lore.kernel.org/r/20230330132847.12882-1-tiwai@suse.de
Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Reviewed-by: Tasos Sahanidis <tasos@tasossah.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/ac97_codec.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/sound/ac97_codec.h b/include/sound/ac97_codec.h
index 49200ec26dc4..c495c6d5fbe0 100644
--- a/include/sound/ac97_codec.h
+++ b/include/sound/ac97_codec.h
@@ -335,6 +335,9 @@ static inline int snd_ac97_update_power(struct snd_ac97 *ac97, int reg,
 #ifdef CONFIG_PM
 void snd_ac97_suspend(struct snd_ac97 *ac97);
 void snd_ac97_resume(struct snd_ac97 *ac97);
+#else
+static inline void snd_ac97_suspend(struct snd_ac97 *ac97) {}
+static inline void snd_ac97_resume(struct snd_ac97 *ac97) {}
 #endif
 int snd_ac97_reset(struct snd_ac97 *ac97, bool try_warm, unsigned int id,
 	unsigned int id_mask);
-- 
cgit v1.2.3


From a0f5276716c80415230b47e321e1c46357d70993 Mon Sep 17 00:00:00 2001
From: Jason Gerecke <jason.gerecke@wacom.com>
Date: Wed, 19 Oct 2022 08:18:32 -0700
Subject: HID: Recognize "Digitizer" as a valid input application

"Digitizer" is a generic usage that may be used by various devices but
which is particularly used by non-display pen tablets. This patch adds the
usage to the list of values matched by the IS_INPUT_APPLICATION() macro
that determines if an input device should be allocated or not.

Signed-off-by: Jason Gerecke <jason.gerecke@wacom.com>
Reviewed-by: Ping Cheng <ping.cheng@wacom.com>
Link: https://lore.kernel.org/r/20221019151832.44522-1-jason.gerecke@wacom.com
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index bff9f459db36..4e4c4fe36911 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -874,7 +874,7 @@ extern bool hid_is_usb(const struct hid_device *hdev);
 /* We ignore a few input applications that are not widely used */
 #define IS_INPUT_APPLICATION(a) \
 		(((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \
-		|| ((a >= HID_DG_PEN) && (a <= HID_DG_WHITEBOARD)) \
+		|| ((a >= HID_DG_DIGITIZER) && (a <= HID_DG_WHITEBOARD)) \
 		|| (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL) \
 		|| (a == HID_GD_WIRELESS_RADIO_CTLS))
 
-- 
cgit v1.2.3


From baad10973fdb442912af676de3348e80bd8fe602 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Thu, 30 Mar 2023 17:35:13 +0200
Subject: Revert "drm/scheduler: track GPU active time per entity"

This reverts commit df622729ddbf as it introduces a use-after-free,
which isn't easy to fix without going back to the design drawing board.

Reported-by: Danilo Krummrich <dakr@redhat.com>
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/scheduler/sched_main.c | 6 ------
 include/drm/gpu_scheduler.h            | 7 -------
 2 files changed, 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 4e6ad6e122bc..0e4378420271 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -906,12 +906,6 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 
 	spin_unlock(&sched->job_list_lock);
 
-	if (job) {
-		job->entity->elapsed_ns += ktime_to_ns(
-			ktime_sub(job->s_fence->finished.timestamp,
-				  job->s_fence->scheduled.timestamp));
-	}
-
 	return job;
 }
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 9db9e5e504ee..9935d1e2ff69 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -228,13 +228,6 @@ struct drm_sched_entity {
 	 */
 	struct rb_node			rb_tree_node;
 
-	/**
-	 * @elapsed_ns:
-	 *
-	 * Records the amount of time where jobs from this entity were active
-	 * on the GPU.
-	 */
-	uint64_t elapsed_ns;
 };
 
 /**
-- 
cgit v1.2.3


From 0d0ae656b71155ccc0be9388beef77a1f7e7558e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:41 +0100
Subject: KVM: arm64: timers: Use a per-vcpu, per-timer accumulator for
 fractional ns

Instead of accumulating the fractional ns value generated every time
we compute a ns delta in a global variable, use a per-vcpu, per-timer
variable. This keeps the fractional ns local to the timer instead of
contributing to any odd, unrelated timer.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-2-maz@kernel.org
---
 arch/arm64/kvm/arch_timer.c  | 2 +-
 include/kvm/arm_arch_timer.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index e1af4301b913..9515c645f03d 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -212,7 +212,7 @@ static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx,
 		ns = cyclecounter_cyc2ns(timecounter->cc,
 					 val - now,
 					 timecounter->mask,
-					 &timecounter->frac);
+					 &timer_ctx->ns_frac);
 		return ns;
 	}
 
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index c52a6e6839da..70d47c4adc6a 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -44,6 +44,7 @@ struct arch_timer_context {
 
 	/* Emulated Timer (may be unused) */
 	struct hrtimer			hrtimer;
+	u64				ns_frac;
 
 	/* Offset for this counter/timer */
 	struct arch_timer_offset	offset;
-- 
cgit v1.2.3


From 2b4825a8694018901e641ccc2eafd0fff58d1415 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:44 +0100
Subject: KVM: arm64: timers: Use CNTPOFF_EL2 to offset the physical timer

With ECV and CNTPOFF_EL2, it is very easy to offer an offset for
the physical timer. So let's do just that.

Nothing can set the offset yet, so this should have no effect
whatsoever (famous last words...).

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-5-maz@kernel.org
---
 arch/arm64/kvm/arch_timer.c          | 18 +++++++++++++++++-
 arch/arm64/kvm/hypercalls.c          |  2 +-
 include/clocksource/arm_arch_timer.h |  1 +
 include/kvm/arm_arch_timer.h         |  2 ++
 4 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 9515c645f03d..3118ea0a1b41 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -52,6 +52,11 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
 			      struct arch_timer_context *timer,
 			      enum kvm_arch_timer_regs treg);
 
+static bool has_cntpoff(void)
+{
+	return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF));
+}
+
 u32 timer_get_ctl(struct arch_timer_context *ctxt)
 {
 	struct kvm_vcpu *vcpu = ctxt->vcpu;
@@ -84,7 +89,7 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
 
 static u64 timer_get_offset(struct arch_timer_context *ctxt)
 {
-	if (ctxt->offset.vm_offset)
+	if (ctxt && ctxt->offset.vm_offset)
 		return *ctxt->offset.vm_offset;
 
 	return 0;
@@ -432,6 +437,12 @@ static void set_cntvoff(u64 cntvoff)
 	kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
 }
 
+static void set_cntpoff(u64 cntpoff)
+{
+	if (has_cntpoff())
+		write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2);
+}
+
 static void timer_save_state(struct arch_timer_context *ctx)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
@@ -480,6 +491,7 @@ static void timer_save_state(struct arch_timer_context *ctx)
 		write_sysreg_el0(0, SYS_CNTP_CTL);
 		isb();
 
+		set_cntpoff(0);
 		break;
 	case NR_KVM_TIMERS:
 		BUG();
@@ -550,6 +562,7 @@ static void timer_restore_state(struct arch_timer_context *ctx)
 		write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
 		break;
 	case TIMER_PTIMER:
+		set_cntpoff(timer_get_offset(ctx));
 		write_sysreg_el0(timer_get_cval(ctx), SYS_CNTP_CVAL);
 		isb();
 		write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL);
@@ -767,6 +780,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	vtimer->vcpu = vcpu;
 	vtimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.voffset;
 	ptimer->vcpu = vcpu;
+	ptimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.poffset;
 
 	/* Synchronize cntvoff across all vtimers of a VM. */
 	timer_set_offset(vtimer, kvm_phys_timer_read());
@@ -1297,6 +1311,8 @@ void kvm_timer_init_vhe(void)
 	val = read_sysreg(cnthctl_el2);
 	val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
 	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
+	if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF))
+		val |= CNTHCTL_ECV;
 	write_sysreg(val, cnthctl_el2);
 }
 
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 5da884e11337..39a4707e081d 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -47,7 +47,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
 		cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset;
 		break;
 	case KVM_PTP_PHYS_COUNTER:
-		cycles = systime_snapshot.cycles;
+		cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset;
 		break;
 	default:
 		return;
diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h
index 057c8964aefb..cbbc9a6dc571 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -21,6 +21,7 @@
 #define CNTHCTL_EVNTEN			(1 << 2)
 #define CNTHCTL_EVNTDIR			(1 << 3)
 #define CNTHCTL_EVNTI			(0xF << 4)
+#define CNTHCTL_ECV			(1 << 12)
 
 enum arch_timer_reg {
 	ARCH_TIMER_REG_CTRL,
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 70d47c4adc6a..2dd0fd2406fb 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -34,6 +34,8 @@ struct arch_timer_offset {
 struct arch_timer_vm_data {
 	/* Offset applied to the virtual timer/counter */
 	u64	voffset;
+	/* Offset applied to the physical timer/counter */
+	u64	poffset;
 };
 
 struct arch_timer_context {
-- 
cgit v1.2.3


From 30ec7997d175cd689fc61bfc4059f4d35b11858c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:47 +0100
Subject: KVM: arm64: timers: Allow userspace to set the global counter offset

And this is the moment you have all been waiting for: setting the
counter offset from userspace.

We expose a brand new capability that reports the ability to set
the offset for both the virtual and physical sides.

In keeping with the architecture, the offset is expressed as
a delta that is substracted from the physical counter value.

Once this new API is used, there is no going back, and the counters
cannot be written to to set the offsets implicitly (the writes
are instead ignored).

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-8-maz@kernel.org
---
 arch/arm64/include/asm/kvm_host.h |  4 +++
 arch/arm64/include/uapi/asm/kvm.h |  9 +++++++
 arch/arm64/kvm/arch_timer.c       | 54 +++++++++++++++++++++++++++++++++++----
 arch/arm64/kvm/arm.c              |  8 ++++++
 include/uapi/linux/kvm.h          |  3 +++
 5 files changed, 73 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 002a10cbade2..116233a390e9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -221,6 +221,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_EL1_32BIT				4
 	/* PSCI SYSTEM_SUSPEND enabled for the guest */
 #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
+	/* VM counter offset */
+#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET			6
 
 	unsigned long flags;
 
@@ -1010,6 +1012,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 
 long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
 				struct kvm_arm_copy_mte_tags *copy_tags);
+int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
+				    struct kvm_arm_counter_offset *offset);
 
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f8129c624b07..12fb0d8a760a 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags {
 	__u64 reserved[2];
 };
 
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offset.
+ * To be used with KVM_ARM_SET_COUNTER_OFFSET.
+ */
+struct kvm_arm_counter_offset {
+	__u64 counter_offset;
+	__u64 reserved;
+};
+
 #define KVM_ARM_TAGS_TO_GUEST		0
 #define KVM_ARM_TAGS_FROM_GUEST		1
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index bb64a71ae193..771504c79711 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -851,9 +851,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	ptimer->vcpu = vcpu;
 	ptimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.poffset;
 
-	/* Synchronize cntvoff across all vtimers of a VM. */
-	timer_set_offset(vtimer, kvm_phys_timer_read());
-	timer_set_offset(ptimer, 0);
+	/* Synchronize offsets across timers of a VM if not already provided */
+	if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) {
+		timer_set_offset(vtimer, kvm_phys_timer_read());
+		timer_set_offset(ptimer, 0);
+	}
 
 	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	timer->bg_timer.function = kvm_bg_timer_expire;
@@ -897,8 +899,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_TIMER_CNT:
-		timer = vcpu_vtimer(vcpu);
-		timer_set_offset(timer, kvm_phys_timer_read() - value);
+		if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
+			      &vcpu->kvm->arch.flags)) {
+			timer = vcpu_vtimer(vcpu);
+			timer_set_offset(timer, kvm_phys_timer_read() - value);
+		}
 		break;
 	case KVM_REG_ARM_TIMER_CVAL:
 		timer = vcpu_vtimer(vcpu);
@@ -908,6 +913,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 		timer = vcpu_ptimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
+	case KVM_REG_ARM_PTIMER_CNT:
+		if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
+			      &vcpu->kvm->arch.flags)) {
+			timer = vcpu_ptimer(vcpu);
+			timer_set_offset(timer, kvm_phys_timer_read() - value);
+		}
+		break;
 	case KVM_REG_ARM_PTIMER_CVAL:
 		timer = vcpu_ptimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
@@ -1443,3 +1455,35 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 	return -ENXIO;
 }
+
+int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
+				    struct kvm_arm_counter_offset *offset)
+{
+	int ret = 0;
+
+	if (offset->reserved)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	if (lock_all_vcpus(kvm)) {
+		set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);
+
+		/*
+		 * If userspace decides to set the offset using this
+		 * API rather than merely restoring the counter
+		 * values, the offset applies to both the virtual and
+		 * physical views.
+		 */
+		kvm->arch.timer_data.voffset = offset->counter_offset;
+		kvm->arch.timer_data.poffset = offset->counter_offset;
+
+		unlock_all_vcpus(kvm);
+	} else {
+		ret = -EBUSY;
+	}
+
+	mutex_unlock(&kvm->lock);
+
+	return ret;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index ae5110cc3bad..1c8a4bbae684 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_PTP_KVM:
 	case KVM_CAP_ARM_SYSTEM_SUSPEND:
+	case KVM_CAP_COUNTER_OFFSET:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -1479,6 +1480,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			return -EFAULT;
 		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
 	}
+	case KVM_ARM_SET_COUNTER_OFFSET: {
+		struct kvm_arm_counter_offset offset;
+
+		if (copy_from_user(&offset, argp, sizeof(offset)))
+			return -EFAULT;
+		return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d77aef872a0a..6a7e1a0ecf04 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1184,6 +1184,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
+#define KVM_CAP_COUNTER_OFFSET 227
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1543,6 +1544,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
 #define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
+/* Available with KVM_CAP_COUNTER_OFFSET */
+#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO,  0xb5, struct kvm_arm_counter_offset)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
-- 
cgit v1.2.3


From 5591805d2c21b70838b723b71b8ff613de51cfff Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:49 +0100
Subject: KVM: arm64: timers: Rationalise per-vcpu timer init

The way we initialise our timer contexts may be satisfactory
for two timers, but will be getting pretty annoying with four.

Cleanup the whole thing by removing the code duplication and
getting rid of unused IRQ configuration elements.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-10-maz@kernel.org
---
 arch/arm64/kvm/arch_timer.c  | 73 +++++++++++++++++++++++---------------------
 include/kvm/arm_arch_timer.h |  1 -
 2 files changed, 39 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 771504c79711..e46f04ed8f86 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -30,14 +30,9 @@ static u32 host_ptimer_irq_flags;
 
 static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
 
-static const struct kvm_irq_level default_ptimer_irq = {
-	.irq	= 30,
-	.level	= 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
-	.irq	= 27,
-	.level	= 1,
+static const u8 default_ppi[] = {
+	[TIMER_PTIMER]  = 30,
+	[TIMER_VTIMER]  = 27,
 };
 
 static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
@@ -820,12 +815,14 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 	 * resets the timer to be disabled and unmasked and is compliant with
 	 * the ARMv7 architecture.
 	 */
-	timer_set_ctl(vcpu_vtimer(vcpu), 0);
-	timer_set_ctl(vcpu_ptimer(vcpu), 0);
+	for (int i = 0; i < NR_KVM_TIMERS; i++)
+		timer_set_ctl(vcpu_get_timer(vcpu, i), 0);
+
 
 	if (timer->enabled) {
-		kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
-		kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+		for (int i = 0; i < NR_KVM_TIMERS; i++)
+			kvm_timer_update_irq(vcpu, false,
+					     vcpu_get_timer(vcpu, i));
 
 		if (irqchip_in_kernel(vcpu->kvm)) {
 			kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
@@ -840,39 +837,47 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
+{
+	struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid);
+	struct kvm *kvm = vcpu->kvm;
+
+	ctxt->vcpu = vcpu;
+
+	if (timerid == TIMER_VTIMER)
+		ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset;
+	else
+		ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset;
+
+	hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+	ctxt->hrtimer.function = kvm_hrtimer_expire;
+	ctxt->irq.irq = default_ppi[timerid];
+
+	switch (timerid) {
+	case TIMER_PTIMER:
+		ctxt->host_timer_irq = host_ptimer_irq;
+		break;
+	case TIMER_VTIMER:
+		ctxt->host_timer_irq = host_vtimer_irq;
+		break;
+	}
+}
+
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
-	vtimer->vcpu = vcpu;
-	vtimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.voffset;
-	ptimer->vcpu = vcpu;
-	ptimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.poffset;
+	for (int i = 0; i < NR_KVM_TIMERS; i++)
+		timer_context_init(vcpu, i);
 
 	/* Synchronize offsets across timers of a VM if not already provided */
 	if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) {
-		timer_set_offset(vtimer, kvm_phys_timer_read());
-		timer_set_offset(ptimer, 0);
+		timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read());
+		timer_set_offset(vcpu_ptimer(vcpu), 0);
 	}
 
 	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	timer->bg_timer.function = kvm_bg_timer_expire;
-
-	hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	vtimer->hrtimer.function = kvm_hrtimer_expire;
-	ptimer->hrtimer.function = kvm_hrtimer_expire;
-
-	vtimer->irq.irq = default_vtimer_irq.irq;
-	ptimer->irq.irq = default_ptimer_irq.irq;
-
-	vtimer->host_timer_irq = host_vtimer_irq;
-	ptimer->host_timer_irq = host_ptimer_irq;
-
-	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
-	ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
 }
 
 void kvm_timer_cpu_up(void)
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 2dd0fd2406fb..c746ef64220b 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -59,7 +59,6 @@ struct arch_timer_context {
 
 	/* Duplicated state from arch_timer.c for convenience */
 	u32				host_timer_irq;
-	u32				host_timer_irq_flags;
 };
 
 struct timer_map {
-- 
cgit v1.2.3


From 33c549460ef9119eb115484e81f54521122341db Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:50 +0100
Subject: KVM: arm64: timers: Abstract per-timer IRQ access

As we are about to move the location of the per-timer IRQ into
the VM structure, abstract the location of the IRQ behind an
accessor. This will make the repainting sligntly less painful.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-11-maz@kernel.org
---
 arch/arm64/kvm/arch_timer.c  | 38 +++++++++++++++++++-------------------
 include/kvm/arm_arch_timer.h |  2 ++
 2 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index e46f04ed8f86..d08d8c2fc30d 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -392,12 +392,12 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	int ret;
 
 	timer_ctx->irq.level = new_level;
-	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
 				   timer_ctx->irq.level);
 
 	if (!userspace_irqchip(vcpu->kvm)) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					  timer_ctx->irq.irq,
+					  timer_irq(timer_ctx),
 					  timer_ctx->irq.level,
 					  timer_ctx);
 		WARN_ON(ret);
@@ -607,7 +607,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
 	kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
 
 	if (irqchip_in_kernel(vcpu->kvm))
-		phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+		phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));
 
 	phys_active |= ctx->irq.level;
 
@@ -825,9 +825,9 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 					     vcpu_get_timer(vcpu, i));
 
 		if (irqchip_in_kernel(vcpu->kvm)) {
-			kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+			kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer));
 			if (map.direct_ptimer)
-				kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+				kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer));
 		}
 	}
 
@@ -851,7 +851,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
 
 	hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	ctxt->hrtimer.function = kvm_hrtimer_expire;
-	ctxt->irq.irq = default_ppi[timerid];
+	timer_irq(ctxt) = default_ppi[timerid];
 
 	switch (timerid) {
 	case TIMER_PTIMER:
@@ -1295,19 +1295,19 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
 	int vtimer_irq, ptimer_irq, ret;
 	unsigned long i;
 
-	vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
+	vtimer_irq = timer_irq(vcpu_vtimer(vcpu));
 	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
 	if (ret)
 		return false;
 
-	ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
+	ptimer_irq = timer_irq(vcpu_ptimer(vcpu));
 	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
 	if (ret)
 		return false;
 
 	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
-		if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
-		    vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
+		if (timer_irq(vcpu_vtimer(vcpu)) != vtimer_irq ||
+		    timer_irq(vcpu_ptimer(vcpu)) != ptimer_irq)
 			return false;
 	}
 
@@ -1322,9 +1322,9 @@ bool kvm_arch_timer_get_input_level(int vintid)
 	if (WARN(!vcpu, "No vcpu context!\n"))
 		return false;
 
-	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+	if (vintid == timer_irq(vcpu_vtimer(vcpu)))
 		timer = vcpu_vtimer(vcpu);
-	else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+	else if (vintid == timer_irq(vcpu_ptimer(vcpu)))
 		timer = vcpu_ptimer(vcpu);
 	else
 		BUG();
@@ -1358,7 +1358,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 
 	ret = kvm_vgic_map_phys_irq(vcpu,
 				    map.direct_vtimer->host_timer_irq,
-				    map.direct_vtimer->irq.irq,
+				    timer_irq(map.direct_vtimer),
 				    &arch_timer_irq_ops);
 	if (ret)
 		return ret;
@@ -1366,7 +1366,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 	if (map.direct_ptimer) {
 		ret = kvm_vgic_map_phys_irq(vcpu,
 					    map.direct_ptimer->host_timer_irq,
-					    map.direct_ptimer->irq.irq,
+					    timer_irq(map.direct_ptimer),
 					    &arch_timer_irq_ops);
 	}
 
@@ -1391,8 +1391,8 @@ static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
 	unsigned long i;
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
-		vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
+		timer_irq(vcpu_vtimer(vcpu)) = vtimer_irq;
+		timer_irq(vcpu_ptimer(vcpu)) = ptimer_irq;
 	}
 }
 
@@ -1417,10 +1417,10 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+		set_timer_irqs(vcpu->kvm, irq, timer_irq(ptimer));
 		break;
 	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+		set_timer_irqs(vcpu->kvm, timer_irq(vtimer), irq);
 		break;
 	default:
 		return -ENXIO;
@@ -1446,7 +1446,7 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 		return -ENXIO;
 	}
 
-	irq = timer->irq.irq;
+	irq = timer_irq(timer);
 	return put_user(irq, uaddr);
 }
 
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index c746ef64220b..27cada09f588 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -109,6 +109,8 @@ bool kvm_arch_timer_get_input_level(int vintid);
 
 #define arch_timer_ctx_index(ctx)	((ctx) - vcpu_timer((ctx)->vcpu)->timers)
 
+#define timer_irq(ctx)			((ctx)->irq.irq)
+
 u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
 			      enum kvm_arch_timers tmr,
 			      enum kvm_arch_timer_regs treg);
-- 
cgit v1.2.3


From 8a5eb2d210807e7dbe9ece7075533014cf4b9c27 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:51 +0100
Subject: KVM: arm64: timers: Move the timer IRQs into arch_timer_vm_data

Having the timer IRQs duplicated into each vcpu isn't great, and
becomes absolutely awful with NV. So let's move these into
the per-VM arch_timer_vm_data structure.

This simplifies a lot of code, but requires us to introduce a
mutex so that we can reason about userspace trying to change
an interrupt number while another vcpu is running, something
that wasn't really well handled so far.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-12-maz@kernel.org
---
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/kvm/arch_timer.c       | 108 ++++++++++++++++++++++----------------
 arch/arm64/kvm/arm.c              |   2 +
 include/kvm/arm_arch_timer.h      |  18 +++++--
 4 files changed, 82 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 116233a390e9..1280154c9ef3 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -223,6 +223,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
 	/* VM counter offset */
 #define KVM_ARCH_FLAG_VM_COUNTER_OFFSET			6
+	/* Timer PPIs made immutable */
+#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE		7
 
 	unsigned long flags;
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index d08d8c2fc30d..1d811735e05f 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -851,7 +851,6 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
 
 	hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	ctxt->hrtimer.function = kvm_hrtimer_expire;
-	timer_irq(ctxt) = default_ppi[timerid];
 
 	switch (timerid) {
 	case TIMER_PTIMER:
@@ -880,6 +879,13 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	timer->bg_timer.function = kvm_bg_timer_expire;
 }
 
+void kvm_timer_init_vm(struct kvm *kvm)
+{
+	mutex_init(&kvm->arch.timer_data.lock);
+	for (int i = 0; i < NR_KVM_TIMERS; i++)
+		kvm->arch.timer_data.ppi[i] = default_ppi[i];
+}
+
 void kvm_timer_cpu_up(void)
 {
 	enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
@@ -1292,44 +1298,56 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
 
 static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
 {
-	int vtimer_irq, ptimer_irq, ret;
-	unsigned long i;
+	u32 ppis = 0;
+	bool valid;
 
-	vtimer_irq = timer_irq(vcpu_vtimer(vcpu));
-	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
-	if (ret)
-		return false;
+	mutex_lock(&vcpu->kvm->arch.timer_data.lock);
 
-	ptimer_irq = timer_irq(vcpu_ptimer(vcpu));
-	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
-	if (ret)
-		return false;
+	for (int i = 0; i < NR_KVM_TIMERS; i++) {
+		struct arch_timer_context *ctx;
+		int irq;
 
-	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
-		if (timer_irq(vcpu_vtimer(vcpu)) != vtimer_irq ||
-		    timer_irq(vcpu_ptimer(vcpu)) != ptimer_irq)
-			return false;
+		ctx = vcpu_get_timer(vcpu, i);
+		irq = timer_irq(ctx);
+		if (kvm_vgic_set_owner(vcpu, irq, ctx))
+			break;
+
+		/*
+		 * We know by construction that we only have PPIs, so
+		 * all values are less than 32.
+		 */
+		ppis |= BIT(irq);
 	}
 
-	return true;
+	valid = hweight32(ppis) == NR_KVM_TIMERS;
+
+	if (valid)
+		set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags);
+
+	mutex_unlock(&vcpu->kvm->arch.timer_data.lock);
+
+	return valid;
 }
 
 bool kvm_arch_timer_get_input_level(int vintid)
 {
 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
-	struct arch_timer_context *timer;
 
 	if (WARN(!vcpu, "No vcpu context!\n"))
 		return false;
 
-	if (vintid == timer_irq(vcpu_vtimer(vcpu)))
-		timer = vcpu_vtimer(vcpu);
-	else if (vintid == timer_irq(vcpu_ptimer(vcpu)))
-		timer = vcpu_ptimer(vcpu);
-	else
-		BUG();
+	for (int i = 0; i < NR_KVM_TIMERS; i++) {
+		struct arch_timer_context *ctx;
+
+		ctx = vcpu_get_timer(vcpu, i);
+		if (timer_irq(ctx) == vintid)
+			return kvm_timer_should_fire(ctx);
+	}
 
-	return kvm_timer_should_fire(timer);
+	/* A timer IRQ has fired, but no matching timer was found? */
+	WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid);
+
+	return false;
 }
 
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
@@ -1385,23 +1403,10 @@ void kvm_timer_init_vhe(void)
 		sysreg_clear_set(cntkctl_el1, 0, CNTHCTL_ECV);
 }
 
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
-	struct kvm_vcpu *vcpu;
-	unsigned long i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		timer_irq(vcpu_vtimer(vcpu)) = vtimer_irq;
-		timer_irq(vcpu_ptimer(vcpu)) = ptimer_irq;
-	}
-}
-
 int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
 	int __user *uaddr = (int __user *)(long)attr->addr;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	int irq;
+	int irq, idx, ret = 0;
 
 	if (!irqchip_in_kernel(vcpu->kvm))
 		return -EINVAL;
@@ -1412,21 +1417,36 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	if (!(irq_is_ppi(irq)))
 		return -EINVAL;
 
-	if (vcpu->arch.timer_cpu.enabled)
-		return -EBUSY;
+	mutex_lock(&vcpu->kvm->arch.timer_data.lock);
+
+	if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE,
+		     &vcpu->kvm->arch.flags)) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		set_timer_irqs(vcpu->kvm, irq, timer_irq(ptimer));
+		idx = TIMER_VTIMER;
 		break;
 	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		set_timer_irqs(vcpu->kvm, timer_irq(vtimer), irq);
+		idx = TIMER_PTIMER;
 		break;
 	default:
-		return -ENXIO;
+		ret = -ENXIO;
+		goto out;
 	}
 
-	return 0;
+	/*
+	 * We cannot validate the IRQ unicity before we run, so take it at
+	 * face value. The verdict will be given on first vcpu run, for each
+	 * vcpu. Yes this is late. Blame it on the stupid API.
+	 */
+	vcpu->kvm->arch.timer_data.ppi[idx] = irq;
+
+out:
+	mutex_unlock(&vcpu->kvm->arch.timer_data.lock);
+	return ret;
 }
 
 int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1c8a4bbae684..4c5e9dfbf83a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -148,6 +148,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	kvm_vgic_early_init(kvm);
 
+	kvm_timer_init_vm(kvm);
+
 	/* The maximum number of VCPUs is limited by the host's GIC model */
 	kvm->max_vcpus = kvm_arm_default_max_vcpus();
 
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 27cada09f588..f093ea9f540d 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -36,14 +36,16 @@ struct arch_timer_vm_data {
 	u64	voffset;
 	/* Offset applied to the physical timer/counter */
 	u64	poffset;
+
+	struct mutex	lock;
+
+	/* The PPI for each timer, global to the VM */
+	u8	ppi[NR_KVM_TIMERS];
 };
 
 struct arch_timer_context {
 	struct kvm_vcpu			*vcpu;
 
-	/* Timer IRQ */
-	struct kvm_irq_level		irq;
-
 	/* Emulated Timer (may be unused) */
 	struct hrtimer			hrtimer;
 	u64				ns_frac;
@@ -57,6 +59,11 @@ struct arch_timer_context {
 	 */
 	bool				loaded;
 
+	/* Output level of the timer IRQ */
+	struct {
+		bool			level;
+	} irq;
+
 	/* Duplicated state from arch_timer.c for convenience */
 	u32				host_timer_irq;
 };
@@ -86,6 +93,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu);
 void kvm_timer_update_run(struct kvm_vcpu *vcpu);
 void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
 
+void kvm_timer_init_vm(struct kvm *kvm);
+
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
@@ -109,7 +118,8 @@ bool kvm_arch_timer_get_input_level(int vintid);
 
 #define arch_timer_ctx_index(ctx)	((ctx) - vcpu_timer((ctx)->vcpu)->timers)
 
-#define timer_irq(ctx)			((ctx)->irq.irq)
+#define timer_vm_data(ctx)		(&(ctx)->vcpu->kvm->arch.timer_data)
+#define timer_irq(ctx)			(timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)])
 
 u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
 			      enum kvm_arch_timers tmr,
-- 
cgit v1.2.3


From 1e0eec09d43a55125ff80e40b2d6e2f369a338b9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:56 +0100
Subject: KVM: arm64: nv: timers: Add a per-timer, per-vcpu offset

Being able to set a global offset isn't enough.

With NV, we also need to a per-vcpu, per-timer offset (for example,
CNTVCT_EL0 being offset by CNTVOFF_EL2).

Use a similar method as the VM-wide offset to have a timer point
to the shadow register that contains the offset value.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-17-maz@kernel.org
---
 arch/arm64/kvm/arch_timer.c             | 13 ++++++++++---
 arch/arm64/kvm/hyp/include/hyp/switch.h |  2 ++
 include/kvm/arm_arch_timer.h            |  5 +++++
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index d3a7902269c1..b87bf182af33 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -89,10 +89,17 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
 
 static u64 timer_get_offset(struct arch_timer_context *ctxt)
 {
-	if (ctxt && ctxt->offset.vm_offset)
-		return *ctxt->offset.vm_offset;
+	u64 offset = 0;
 
-	return 0;
+	if (!ctxt)
+		return 0;
+
+	if (ctxt->offset.vm_offset)
+		offset += *ctxt->offset.vm_offset;
+	if (ctxt->offset.vcpu_offset)
+		offset += *ctxt->offset.vcpu_offset;
+
+	return offset;
 }
 
 static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 9954368f639d..d07cbc313889 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -353,6 +353,8 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
 
 	if (ctxt->offset.vm_offset)
 		val -= *kern_hyp_va(ctxt->offset.vm_offset);
+	if (ctxt->offset.vcpu_offset)
+		val -= *kern_hyp_va(ctxt->offset.vcpu_offset);
 
 	vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
 	__kvm_skip_instr(vcpu);
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index f093ea9f540d..209da0c2ac9f 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -29,6 +29,11 @@ struct arch_timer_offset {
 	 * structure. If NULL, assume a zero offset.
 	 */
 	u64	*vm_offset;
+	/*
+	 * If set, pointer to one of the offsets in the vcpu's sysreg
+	 * array. If NULL, assume a zero offset.
+	 */
+	u64	*vcpu_offset;
 };
 
 struct arch_timer_vm_data {
-- 
cgit v1.2.3


From 81dc9504a7006b484cfcf074796094ee526b0c45 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Mar 2023 18:47:57 +0100
Subject: KVM: arm64: nv: timers: Support hyp timer emulation

Emulating EL2 also means emulating the EL2 timers. To do so, we expand
our timer framework to deal with at most 4 timers. At any given time,
two timers are using the HW timers, and the two others are purely
emulated.

The role of deciding which is which at any given time is left to a
mapping function which is called every time we need to make such a
decision.

Reviewed-by: Colton Lewis <coltonlewis@google.com>
Co-developed-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230330174800.2677007-18-maz@kernel.org
---
 arch/arm64/include/asm/kvm_host.h       |   4 +
 arch/arm64/include/uapi/asm/kvm.h       |   2 +
 arch/arm64/kvm/arch_timer.c             | 180 ++++++++++++++++++++++++++++++--
 arch/arm64/kvm/hyp/include/hyp/switch.h |  15 +++
 arch/arm64/kvm/trace_arm.h              |   6 +-
 arch/arm64/kvm/vgic/vgic.c              |  15 +++
 include/kvm/arm_arch_timer.h            |   9 +-
 include/kvm/arm_vgic.h                  |   1 +
 8 files changed, 220 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1280154c9ef3..633a7c0750bb 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -369,6 +369,10 @@ enum vcpu_sysreg {
 	TPIDR_EL2,	/* EL2 Software Thread ID Register */
 	CNTHCTL_EL2,	/* Counter-timer Hypervisor Control register */
 	SP_EL2,		/* EL2 Stack Pointer */
+	CNTHP_CTL_EL2,
+	CNTHP_CVAL_EL2,
+	CNTHV_CTL_EL2,
+	CNTHV_CVAL_EL2,
 
 	NR_SYS_REGS	/* Nothing after this line! */
 };
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 12fb0d8a760a..0921f366c49f 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -420,6 +420,8 @@ enum {
 #define KVM_ARM_VCPU_TIMER_CTRL		1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_HVTIMER	2
+#define   KVM_ARM_VCPU_TIMER_IRQ_HPTIMER	3
 #define KVM_ARM_VCPU_PVTIME_CTRL	2
 #define   KVM_ARM_VCPU_PVTIME_IPA	0
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index b87bf182af33..c5c8cc3c25ae 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -16,6 +16,7 @@
 #include <asm/arch_timer.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
 
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>
@@ -33,6 +34,8 @@ static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
 static const u8 default_ppi[] = {
 	[TIMER_PTIMER]  = 30,
 	[TIMER_VTIMER]  = 27,
+	[TIMER_HPTIMER] = 26,
+	[TIMER_HVTIMER] = 28,
 };
 
 static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
@@ -46,6 +49,11 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
 static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
 			      struct arch_timer_context *timer,
 			      enum kvm_arch_timer_regs treg);
+static bool kvm_arch_timer_get_input_level(int vintid);
+
+static struct irq_ops arch_timer_irq_ops = {
+	.get_input_level = kvm_arch_timer_get_input_level,
+};
 
 static bool has_cntpoff(void)
 {
@@ -54,6 +62,9 @@ static bool has_cntpoff(void)
 
 static int nr_timers(struct kvm_vcpu *vcpu)
 {
+	if (!vcpu_has_nv(vcpu))
+		return NR_KVM_EL0_TIMERS;
+
 	return NR_KVM_TIMERS;
 }
 
@@ -66,6 +77,10 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt)
 		return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
 	case TIMER_PTIMER:
 		return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
+	case TIMER_HVTIMER:
+		return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2);
+	case TIMER_HPTIMER:
+		return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2);
 	default:
 		WARN_ON(1);
 		return 0;
@@ -81,6 +96,10 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
 		return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
 	case TIMER_PTIMER:
 		return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+	case TIMER_HVTIMER:
+		return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2);
+	case TIMER_HPTIMER:
+		return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
 	default:
 		WARN_ON(1);
 		return 0;
@@ -113,6 +132,12 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
 	case TIMER_PTIMER:
 		__vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl;
 		break;
+	case TIMER_HVTIMER:
+		__vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl;
+		break;
+	case TIMER_HPTIMER:
+		__vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl;
+		break;
 	default:
 		WARN_ON(1);
 	}
@@ -129,6 +154,12 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
 	case TIMER_PTIMER:
 		__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval;
 		break;
+	case TIMER_HVTIMER:
+		__vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval;
+		break;
+	case TIMER_HPTIMER:
+		__vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval;
+		break;
 	default:
 		WARN_ON(1);
 	}
@@ -151,13 +182,27 @@ u64 kvm_phys_timer_read(void)
 
 static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
 {
-	if (has_vhe()) {
+	if (vcpu_has_nv(vcpu)) {
+		if (is_hyp_ctxt(vcpu)) {
+			map->direct_vtimer = vcpu_hvtimer(vcpu);
+			map->direct_ptimer = vcpu_hptimer(vcpu);
+			map->emul_vtimer = vcpu_vtimer(vcpu);
+			map->emul_ptimer = vcpu_ptimer(vcpu);
+		} else {
+			map->direct_vtimer = vcpu_vtimer(vcpu);
+			map->direct_ptimer = vcpu_ptimer(vcpu);
+			map->emul_vtimer = vcpu_hvtimer(vcpu);
+			map->emul_ptimer = vcpu_hptimer(vcpu);
+		}
+	} else if (has_vhe()) {
 		map->direct_vtimer = vcpu_vtimer(vcpu);
 		map->direct_ptimer = vcpu_ptimer(vcpu);
+		map->emul_vtimer = NULL;
 		map->emul_ptimer = NULL;
 	} else {
 		map->direct_vtimer = vcpu_vtimer(vcpu);
 		map->direct_ptimer = NULL;
+		map->emul_vtimer = NULL;
 		map->emul_ptimer = vcpu_ptimer(vcpu);
 	}
 
@@ -252,8 +297,11 @@ static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu)
 
 static u64 wfit_delay_ns(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_context *ctx = vcpu_vtimer(vcpu);
 	u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+	struct arch_timer_context *ctx;
+
+	ctx = (vcpu_has_nv(vcpu) && is_hyp_ctxt(vcpu)) ? vcpu_hvtimer(vcpu)
+						       : vcpu_vtimer(vcpu);
 
 	return kvm_counter_compute_delta(ctx, val);
 }
@@ -350,9 +398,11 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 
 		switch (index) {
 		case TIMER_VTIMER:
+		case TIMER_HVTIMER:
 			cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
 			break;
 		case TIMER_PTIMER:
+		case TIMER_HPTIMER:
 			cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
 			break;
 		case NR_KVM_TIMERS:
@@ -468,6 +518,7 @@ static void timer_save_state(struct arch_timer_context *ctx)
 		u64 cval;
 
 	case TIMER_VTIMER:
+	case TIMER_HVTIMER:
 		timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
 		timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL));
 
@@ -493,6 +544,7 @@ static void timer_save_state(struct arch_timer_context *ctx)
 		set_cntvoff(0);
 		break;
 	case TIMER_PTIMER:
+	case TIMER_HPTIMER:
 		timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL));
 		cval = read_sysreg_el0(SYS_CNTP_CVAL);
 
@@ -536,6 +588,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
 	 */
 	if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
 	    !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+	    !kvm_timer_irq_can_fire(map.emul_vtimer) &&
 	    !kvm_timer_irq_can_fire(map.emul_ptimer) &&
 	    !vcpu_has_wfit_active(vcpu))
 		return;
@@ -572,12 +625,14 @@ static void timer_restore_state(struct arch_timer_context *ctx)
 		u64 cval, offset;
 
 	case TIMER_VTIMER:
+	case TIMER_HVTIMER:
 		set_cntvoff(timer_get_offset(ctx));
 		write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
 		isb();
 		write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
 		break;
 	case TIMER_PTIMER:
+	case TIMER_HPTIMER:
 		cval = timer_get_cval(ctx);
 		offset = timer_get_offset(ctx);
 		set_cntpoff(offset);
@@ -663,6 +718,57 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
 			(_clr) |= (_bit);				\
 	} while (0)
 
+static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
+					      struct timer_map *map)
+{
+	int hw, ret;
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return;
+
+	/*
+	 * We only ever unmap the vtimer irq on a VHE system that runs nested
+	 * virtualization, in which case we have both a valid emul_vtimer,
+	 * emul_ptimer, direct_vtimer, and direct_ptimer.
+	 *
+	 * Since this is called from kvm_timer_vcpu_load(), a change between
+	 * vEL2 and vEL1/0 will have just happened, and the timer_map will
+	 * represent this, and therefore we switch the emul/direct mappings
+	 * below.
+	 */
+	hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer));
+	if (hw < 0) {
+		kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer));
+		kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer));
+
+		ret = kvm_vgic_map_phys_irq(vcpu,
+					    map->direct_vtimer->host_timer_irq,
+					    timer_irq(map->direct_vtimer),
+					    &arch_timer_irq_ops);
+		WARN_ON_ONCE(ret);
+		ret = kvm_vgic_map_phys_irq(vcpu,
+					    map->direct_ptimer->host_timer_irq,
+					    timer_irq(map->direct_ptimer),
+					    &arch_timer_irq_ops);
+		WARN_ON_ONCE(ret);
+
+		/*
+		 * The virtual offset behaviour is "interresting", as it
+		 * always applies when HCR_EL2.E2H==0, but only when
+		 * accessed from EL1 when HCR_EL2.E2H==1. So make sure we
+		 * track E2H when putting the HV timer in "direct" mode.
+		 */
+		if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
+			struct arch_timer_offset *offs = &map->direct_vtimer->offset;
+
+			if (vcpu_el2_e2h_is_set(vcpu))
+				offs->vcpu_offset = NULL;
+			else
+				offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+		}
+	}
+}
+
 static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
 {
 	bool tpt, tpc;
@@ -695,6 +801,22 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
 	if (!has_cntpoff() && timer_get_offset(map->direct_ptimer))
 		tpt = tpc = true;
 
+	/*
+	 * Apply the enable bits that the guest hypervisor has requested for
+	 * its own guest. We can only add traps that wouldn't have been set
+	 * above.
+	 */
+	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+		u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+		/* Use the VHE format for mental sanity */
+		if (!vcpu_el2_e2h_is_set(vcpu))
+			val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10;
+
+		tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
+		tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));
+	}
+
 	/*
 	 * Now that we have collected our requirements, compute the
 	 * trap and enable bits.
@@ -720,6 +842,9 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 	get_timer_map(vcpu, &map);
 
 	if (static_branch_likely(&has_gic_active_state)) {
+		if (vcpu_has_nv(vcpu))
+			kvm_timer_vcpu_load_nested_switch(vcpu, &map);
+
 		kvm_timer_vcpu_load_gic(map.direct_vtimer);
 		if (map.direct_ptimer)
 			kvm_timer_vcpu_load_gic(map.direct_ptimer);
@@ -732,6 +857,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 	timer_restore_state(map.direct_vtimer);
 	if (map.direct_ptimer)
 		timer_restore_state(map.direct_ptimer);
+	if (map.emul_vtimer)
+		timer_emulate(map.emul_vtimer);
 	if (map.emul_ptimer)
 		timer_emulate(map.emul_ptimer);
 
@@ -778,6 +905,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	 * In any case, we re-schedule the hrtimer for the physical timer when
 	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
 	 */
+	if (map.emul_vtimer)
+		soft_timer_cancel(&map.emul_vtimer->hrtimer);
 	if (map.emul_ptimer)
 		soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
@@ -830,6 +959,17 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 	for (int i = 0; i < nr_timers(vcpu); i++)
 		timer_set_ctl(vcpu_get_timer(vcpu, i), 0);
 
+	/*
+	 * A vcpu running at EL2 is in charge of the offset applied to
+	 * the virtual timer, so use the physical VM offset, and point
+	 * the vcpu offset to CNTVOFF_EL2.
+	 */
+	if (vcpu_has_nv(vcpu)) {
+		struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset;
+
+		offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+		offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset;
+	}
 
 	if (timer->enabled) {
 		for (int i = 0; i < nr_timers(vcpu); i++)
@@ -843,6 +983,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	if (map.emul_vtimer)
+		soft_timer_cancel(&map.emul_vtimer->hrtimer);
 	if (map.emul_ptimer)
 		soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
@@ -866,9 +1008,11 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
 
 	switch (timerid) {
 	case TIMER_PTIMER:
+	case TIMER_HPTIMER:
 		ctxt->host_timer_irq = host_ptimer_irq;
 		break;
 	case TIMER_VTIMER:
+	case TIMER_HVTIMER:
 		ctxt->host_timer_irq = host_vtimer_irq;
 		break;
 	}
@@ -1020,6 +1164,10 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
 		val = kvm_phys_timer_read() - timer_get_offset(timer);
 		break;
 
+	case TIMER_REG_VOFF:
+		val = *timer->offset.vcpu_offset;
+		break;
+
 	default:
 		BUG();
 	}
@@ -1038,7 +1186,7 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
 	get_timer_map(vcpu, &map);
 	timer = vcpu_get_timer(vcpu, tmr);
 
-	if (timer == map.emul_ptimer)
+	if (timer == map.emul_vtimer || timer == map.emul_ptimer)
 		return kvm_arm_timer_read(vcpu, timer, treg);
 
 	preempt_disable();
@@ -1070,6 +1218,10 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
 		timer_set_cval(timer, val);
 		break;
 
+	case TIMER_REG_VOFF:
+		*timer->offset.vcpu_offset = val;
+		break;
+
 	default:
 		BUG();
 	}
@@ -1085,7 +1237,7 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
 
 	get_timer_map(vcpu, &map);
 	timer = vcpu_get_timer(vcpu, tmr);
-	if (timer == map.emul_ptimer) {
+	if (timer == map.emul_vtimer || timer == map.emul_ptimer) {
 		soft_timer_cancel(&timer->hrtimer);
 		kvm_arm_timer_write(vcpu, timer, treg, val);
 		timer_emulate(timer);
@@ -1165,10 +1317,6 @@ static const struct irq_domain_ops timer_domain_ops = {
 	.free	= timer_irq_domain_free,
 };
 
-static struct irq_ops arch_timer_irq_ops = {
-	.get_input_level = kvm_arch_timer_get_input_level,
-};
-
 static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
 {
 	*flags = irq_get_trigger_type(virq);
@@ -1341,7 +1489,7 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
 	return valid;
 }
 
-bool kvm_arch_timer_get_input_level(int vintid)
+static bool kvm_arch_timer_get_input_level(int vintid)
 {
 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
@@ -1444,6 +1592,12 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
 		idx = TIMER_PTIMER;
 		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+		idx = TIMER_HVTIMER;
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+		idx = TIMER_HPTIMER;
+		break;
 	default:
 		ret = -ENXIO;
 		goto out;
@@ -1474,6 +1628,12 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
 		timer = vcpu_ptimer(vcpu);
 		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+		timer = vcpu_hvtimer(vcpu);
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+		timer = vcpu_hptimer(vcpu);
+		break;
 	default:
 		return -ENXIO;
 	}
@@ -1487,6 +1647,8 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
 	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+	case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+	case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
 		return 0;
 	}
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index d07cbc313889..c41166f1a1dd 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -343,6 +343,21 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
 	switch (sysreg) {
 	case SYS_CNTPCT_EL0:
 	case SYS_CNTPCTSS_EL0:
+		if (vcpu_has_nv(vcpu)) {
+			if (is_hyp_ctxt(vcpu)) {
+				ctxt = vcpu_hptimer(vcpu);
+				break;
+			}
+
+			/* Check for guest hypervisor trapping */
+			val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+			if (!vcpu_el2_e2h_is_set(vcpu))
+				val = (val & CNTHCTL_EL1PCTEN) << 10;
+
+			if (!(val & (CNTHCTL_EL1PCTEN << 10)))
+				return false;
+		}
+
 		ctxt = vcpu_ptimer(vcpu);
 		break;
 	default:
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index f3e46a976125..6ce5c025218d 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -206,6 +206,7 @@ TRACE_EVENT(kvm_get_timer_map,
 		__field(	unsigned long,		vcpu_id	)
 		__field(	int,			direct_vtimer	)
 		__field(	int,			direct_ptimer	)
+		__field(	int,			emul_vtimer	)
 		__field(	int,			emul_ptimer	)
 	),
 
@@ -214,14 +215,17 @@ TRACE_EVENT(kvm_get_timer_map,
 		__entry->direct_vtimer		= arch_timer_ctx_index(map->direct_vtimer);
 		__entry->direct_ptimer =
 			(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+		__entry->emul_vtimer =
+			(map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1;
 		__entry->emul_ptimer =
 			(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
 	),
 
-	TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+	TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d",
 		  __entry->vcpu_id,
 		  __entry->direct_vtimer,
 		  __entry->direct_ptimer,
+		  __entry->emul_vtimer,
 		  __entry->emul_ptimer)
 );
 
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index d97e6080b421..ae491ef97188 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -573,6 +573,21 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
 	return 0;
 }
 
+int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	unsigned long flags;
+	int ret = -1;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	if (irq->hw)
+		ret = irq->hwintid;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	vgic_put_irq(vcpu->kvm, irq);
+	return ret;
+}
+
 /**
  * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
  *
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 209da0c2ac9f..52008f5cff06 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -13,6 +13,9 @@
 enum kvm_arch_timers {
 	TIMER_PTIMER,
 	TIMER_VTIMER,
+	NR_KVM_EL0_TIMERS,
+	TIMER_HVTIMER = NR_KVM_EL0_TIMERS,
+	TIMER_HPTIMER,
 	NR_KVM_TIMERS
 };
 
@@ -21,6 +24,7 @@ enum kvm_arch_timer_regs {
 	TIMER_REG_CVAL,
 	TIMER_REG_TVAL,
 	TIMER_REG_CTL,
+	TIMER_REG_VOFF,
 };
 
 struct arch_timer_offset {
@@ -76,6 +80,7 @@ struct arch_timer_context {
 struct timer_map {
 	struct arch_timer_context *direct_vtimer;
 	struct arch_timer_context *direct_ptimer;
+	struct arch_timer_context *emul_vtimer;
 	struct arch_timer_context *emul_ptimer;
 };
 
@@ -114,12 +119,12 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu);
 
 void kvm_timer_init_vhe(void);
 
-bool kvm_arch_timer_get_input_level(int vintid);
-
 #define vcpu_timer(v)	(&(v)->arch.timer_cpu)
 #define vcpu_get_timer(v,t)	(&vcpu_timer(v)->timers[(t)])
 #define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
 #define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
+#define vcpu_hvtimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_HVTIMER])
+#define vcpu_hptimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_HPTIMER])
 
 #define arch_timer_ctx_index(ctx)	((ctx) - vcpu_timer((ctx)->vcpu)->timers)
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index d3ad51fde9db..402b545959af 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -380,6 +380,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
 			  u32 vintid, struct irq_ops *ops);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid);
+int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid);
 
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From 96a2ff2a6373259cd3bbc5dcaa79865ce271fa4b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 20 Mar 2023 17:45:02 -0400
Subject: dm bufio: remove unused dm_bufio_release_move interface

Was used by multi-snapshot DM target that never went upstream.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-bufio.c    | 77 ------------------------------------------------
 include/linux/dm-bufio.h |  6 ----
 2 files changed, 83 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cf077f9b30c3..79434b38f368 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1415,83 +1415,6 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
 }
 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
 
-/*
- * We first delete any other buffer that may be at that new location.
- *
- * Then, we write the buffer to the original location if it was dirty.
- *
- * Then, if we are the only one who is holding the buffer, relink the buffer
- * in the buffer tree for the new location.
- *
- * If there was someone else holding the buffer, we write it to the new
- * location but not relink it, because that other user needs to have the buffer
- * at the same place.
- */
-void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
-{
-	struct dm_bufio_client *c = b->c;
-	struct dm_buffer *new;
-
-	BUG_ON(dm_bufio_in_request());
-
-	dm_bufio_lock(c);
-
-retry:
-	new = __find(c, new_block);
-	if (new) {
-		if (new->hold_count) {
-			__wait_for_free_buffer(c);
-			goto retry;
-		}
-
-		/*
-		 * FIXME: Is there any point waiting for a write that's going
-		 * to be overwritten in a bit?
-		 */
-		__make_buffer_clean(new);
-		__unlink_buffer(new);
-		__free_buffer_wake(new);
-	}
-
-	BUG_ON(!b->hold_count);
-	BUG_ON(test_bit(B_READING, &b->state));
-
-	__write_dirty_buffer(b, NULL);
-	if (b->hold_count == 1) {
-		wait_on_bit_io(&b->state, B_WRITING,
-			       TASK_UNINTERRUPTIBLE);
-		set_bit(B_DIRTY, &b->state);
-		b->dirty_start = 0;
-		b->dirty_end = c->block_size;
-		__unlink_buffer(b);
-		__link_buffer(b, new_block, LIST_DIRTY);
-	} else {
-		sector_t old_block;
-
-		wait_on_bit_lock_io(&b->state, B_WRITING,
-				    TASK_UNINTERRUPTIBLE);
-		/*
-		 * Relink buffer to "new_block" so that write_callback
-		 * sees "new_block" as a block number.
-		 * After the write, link the buffer back to old_block.
-		 * All this must be done in bufio lock, so that block number
-		 * change isn't visible to other threads.
-		 */
-		old_block = b->block;
-		__unlink_buffer(b);
-		__link_buffer(b, new_block, b->list_mode);
-		submit_io(b, REQ_OP_WRITE, write_endio);
-		wait_on_bit_io(&b->state, B_WRITING,
-			       TASK_UNINTERRUPTIBLE);
-		__unlink_buffer(b);
-		__link_buffer(b, old_block, b->list_mode);
-	}
-
-	dm_bufio_unlock(c);
-	dm_bufio_release(b);
-}
-EXPORT_SYMBOL_GPL(dm_bufio_release_move);
-
 static void forget_buffer_locked(struct dm_buffer *b)
 {
 	if (likely(!b->hold_count) && likely(!smp_load_acquire(&b->state))) {
diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h
index 2056743aaaaa..681656a1c03d 100644
--- a/include/linux/dm-bufio.h
+++ b/include/linux/dm-bufio.h
@@ -130,12 +130,6 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
  */
 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count);
 
-/*
- * Like dm_bufio_release but also move the buffer to the new
- * block. dm_bufio_write_dirty_buffers is needed to commit the new block.
- */
-void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
-
 /*
  * Free the given buffer.
  * This is just a hint, if the buffer is in use or dirty, this function
-- 
cgit v1.2.3


From 06961c487a33a222fd3d84998dc6398ed0449373 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Wed, 1 Mar 2023 12:48:43 -0500
Subject: dm: split discards further if target sets max_discard_granularity

The block core (bio_split_discard) will already split discards based
on the 'discard_granularity' and 'max_discard_sectors' queue_limits.
But the DM thin target also needs to ensure that it doesn't receive a
discard that spans a 'max_discard_sectors' boundary.

Introduce a dm_target 'max_discard_granularity' flag that if set will
cause DM core to split discard bios relative to 'max_discard_sectors'.
This treats 'discard_granularity' as a "min_discard_granularity" and
'max_discard_sectors' as a "max_discard_granularity".

Requested-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm.c               | 25 +++++++++++++++++++------
 include/linux/device-mapper.h |  6 ++++++
 include/uapi/linux/dm-ioctl.h |  4 ++--
 3 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfde0088147a..20c6b72a0245 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1162,7 +1162,8 @@ static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
 	return ti->len - target_offset;
 }
 
-static sector_t max_io_len(struct dm_target *ti, sector_t sector)
+static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
+			     unsigned int max_granularity)
 {
 	sector_t target_offset = dm_target_offset(ti, sector);
 	sector_t len = max_io_len_target_boundary(ti, target_offset);
@@ -1173,11 +1174,16 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 	 *   explains why stacked chunk_sectors based splitting via
 	 *   bio_split_to_limits() isn't possible here.
 	 */
-	if (!ti->max_io_len)
+	if (!max_granularity)
 		return len;
 	return min_t(sector_t, len,
 		min(queue_max_sectors(ti->table->md->queue),
-		    blk_chunk_sectors_left(target_offset, ti->max_io_len)));
+		    blk_chunk_sectors_left(target_offset, max_granularity)));
+}
+
+static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
+{
+	return __max_io_len(ti, sector, ti->max_io_len);
 }
 
 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
@@ -1565,12 +1571,13 @@ static void __send_empty_flush(struct clone_info *ci)
 }
 
 static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
-					unsigned int num_bios)
+					unsigned int num_bios,
+					unsigned int max_granularity)
 {
 	unsigned int len, bios;
 
 	len = min_t(sector_t, ci->sector_count,
-		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
+		    __max_io_len(ti, ci->sector, max_granularity));
 
 	atomic_add(num_bios, &ci->io->io_count);
 	bios = __send_duplicate_bios(ci, ti, num_bios, &len);
@@ -1606,10 +1613,16 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
 					  struct dm_target *ti)
 {
 	unsigned int num_bios = 0;
+	unsigned int max_granularity = 0;
 
 	switch (bio_op(ci->bio)) {
 	case REQ_OP_DISCARD:
 		num_bios = ti->num_discard_bios;
+		if (ti->max_discard_granularity) {
+			struct queue_limits *limits =
+				dm_get_queue_limits(ti->table->md);
+			max_granularity = limits->max_discard_sectors;
+		}
 		break;
 	case REQ_OP_SECURE_ERASE:
 		num_bios = ti->num_secure_erase_bios;
@@ -1630,7 +1643,7 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
 	if (unlikely(!num_bios))
 		return BLK_STS_NOTSUPP;
 
-	__send_changing_extent_only(ci, ti, num_bios);
+	__send_changing_extent_only(ci, ti, num_bios, max_granularity);
 	return BLK_STS_OK;
 }
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 7975483816e4..8aa6b3ea91fa 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -358,6 +358,12 @@ struct dm_target {
 	 */
 	bool discards_supported:1;
 
+	/*
+	 * Set if this target requires that discards be split on both
+	 * 'discard_granularity' and 'max_discard_sectors' boundaries.
+	 */
+	bool max_discard_granularity:1;
+
 	/*
 	 * Set if we need to limit the number of in-flight bios when swapping.
 	 */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 7edf335778ba..1990b5700f69 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -286,9 +286,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	47
+#define DM_VERSION_MINOR	48
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2022-07-28)"
+#define DM_VERSION_EXTRA	"-ioctl (2023-03-01)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 00168b415a60cec7558608efb4fc50f2a73daae2 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 23 Mar 2023 14:41:16 -0600
Subject: uapi: net: ipv6: Replace fake flex-array with flex-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero-length arrays as fake flexible arrays are deprecated and we are
moving towards adopting C99 flexible-array members instead.

Address the following warning found with GCC-13 and
-fstrict-flex-arrays=3 enabled:
net/ipv6/exthdrs.c: In function ‘fl6_update_dst’:
net/ipv6/exthdrs.c:1393:28: warning: array subscript 0 is outside array bounds of ‘struct in6_addr[0]’ [-Warray-bounds=]
 1393 |                 fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
      |                 ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from ./include/linux/ipv6.h:5,
                 from ./include/linux/icmpv6.h:6,
                 from net/ipv6/exthdrs.c:27:
./include/uapi/linux/ipv6.h:84:33: note: while referencing ‘addr’
   84 |         struct in6_addr         addr[0];
      |                                 ^~~~
net/ipv6/exthdrs.c: In function ‘ipv6_push_rthdr0.isra’:
net/ipv6/exthdrs.c:1125:19: warning: array subscript <unknown> is outside array bounds of ‘struct in6_addr[0]’ [-Warray-bounds=]
 1125 |         phdr->addr[hops - 1] = **addr_p;
      |         ~~~~~~~~~~^~~~~~~~~~
./include/uapi/linux/ipv6.h:84:33: note: while referencing ‘addr’
   84 |         struct in6_addr         addr[0];
      |                                 ^~~~

This helps with the ongoing efforts to tighten the FORTIFY_SOURCE
routines on memcpy() and help us make progress towards globally
enabling -fstrict-flex-arrays=3 [1].

Link: https://github.com/KSPP/linux/issues/21
Link: https://github.com/KSPP/linux/issues/276
Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1]
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 include/uapi/linux/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 53326dfc59ec..ac56605fe9bc 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -81,7 +81,7 @@ struct ipv6_opt_hdr {
 struct rt0_hdr {
 	struct ipv6_rt_hdr	rt_hdr;
 	__u32			reserved;
-	struct in6_addr		addr[0];
+	struct in6_addr		addr[];
 
 #define rt0_type		rt_hdr.type
 };
-- 
cgit v1.2.3


From 28c1b6df436819a7ed8a781835766e45139771a3 Mon Sep 17 00:00:00 2001
From: Eric Sage <eric_sage@apple.com>
Date: Mon, 27 Mar 2023 13:44:49 -0400
Subject: netfilter: nfnetlink_queue: enable classid socket info retrieval

This enables associating a socket with a v1 net_cls cgroup. Useful for
applying a per-cgroup policy when processing packets in userspace.

Signed-off-by: Eric Sage <eric_sage@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nfnetlink_queue.h |  1 +
 net/netfilter/nfnetlink_queue.c                | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index ef7c97f21a15..efcb7c044a74 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -62,6 +62,7 @@ enum nfqnl_attr_type {
 	NFQA_VLAN,			/* nested attribute: packet vlan info */
 	NFQA_L2HDR,			/* full L2 header */
 	NFQA_PRIORITY,			/* skb->priority */
+	NFQA_CGROUP_CLASSID,		/* __u32 cgroup classid */
 
 	__NFQA_MAX
 };
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 87a9009d5234..e311462f6d98 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -29,6 +29,7 @@
 #include <linux/netfilter/nfnetlink_queue.h>
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/list.h>
+#include <linux/cgroup-defs.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <net/netfilter/nf_queue.h>
@@ -301,6 +302,19 @@ nla_put_failure:
 	return -1;
 }
 
+static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+	if (sk && sk_fullsock(sk)) {
+		u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+
+		if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
+			return -1;
+	}
+#endif
+	return 0;
+}
+
 static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
 {
 	u32 seclen = 0;
@@ -406,6 +420,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		+ nla_total_size(sizeof(u_int32_t))	/* priority */
 		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
 		+ nla_total_size(sizeof(u_int32_t))	/* skbinfo */
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+		+ nla_total_size(sizeof(u_int32_t))	/* classid */
+#endif
 		+ nla_total_size(sizeof(u_int32_t));	/* cap_len */
 
 	tstamp = skb_tstamp_cond(entskb, false);
@@ -599,6 +616,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 	    nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
 		goto nla_put_failure;
 
+	if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
+		goto nla_put_failure;
+
 	if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From a25b8b7136ad43760bd876af62b6e59abd30496c Mon Sep 17 00:00:00 2001
From: Matthieu De Beule <matthieu.debeule@proton.ch>
Date: Wed, 29 Mar 2023 12:52:18 +0000
Subject: netfilter: Correct documentation errors in nf_tables.h

NFTA_RANGE_OP incorrectly says nft_cmp_ops instead of nft_range_ops.
NFTA_LOG_GROUP and NFTA_LOG_QTHRESHOLD claim NLA_U32 instead of NLA_U16
NFTA_EXTHDR_SREG isn't documented as a register

Signed-off-by: Matthieu De Beule <matthieu.debeule@proton.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9c6f02c26054..c4d4d8e42dc8 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -685,7 +685,7 @@ enum nft_range_ops {
  * enum nft_range_attributes - nf_tables range expression netlink attributes
  *
  * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers)
- * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_range_ops)
  * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes)
  * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes)
  */
@@ -878,7 +878,7 @@ enum nft_exthdr_op {
  * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
  * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32)
  * @NFTA_EXTHDR_OP: option match type (NLA_U32)
- * @NFTA_EXTHDR_SREG: option match type (NLA_U32)
+ * @NFTA_EXTHDR_SREG: source register (NLA_U32: nft_registers)
  */
 enum nft_exthdr_attributes {
 	NFTA_EXTHDR_UNSPEC,
@@ -1262,10 +1262,10 @@ enum nft_last_attributes {
 /**
  * enum nft_log_attributes - nf_tables log expression netlink attributes
  *
- * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U32)
+ * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U16)
  * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING)
  * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32)
- * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32)
+ * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U16)
  * @NFTA_LOG_LEVEL: log level (NLA_U32)
  * @NFTA_LOG_FLAGS: logging flags (NLA_U32)
  */
-- 
cgit v1.2.3


From 2384127e98db52a6ac2577924ad9cae25f3e7472 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 29 Mar 2023 11:54:52 +0200
Subject: net/sched: act_tunnel_key: add support for "don't fragment"

extend "act_tunnel_key" to allow specifying TUNNEL_DONT_FRAGMENT.

Suggested-by: Ilya Maximets <i.maximets@ovn.org>
Reviewed-by: Pedro Tammela <pctammela@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h | 1 +
 net/sched/act_tunnel_key.c                | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index 49ad4033951b..37c6f612f161 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -34,6 +34,7 @@ enum {
 					 */
 	TCA_TUNNEL_KEY_ENC_TOS,		/* u8 */
 	TCA_TUNNEL_KEY_ENC_TTL,		/* u8 */
+	TCA_TUNNEL_KEY_NO_FRAG,		/* flag */
 	__TCA_TUNNEL_KEY_MAX,
 };
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2d12d2626415..0c8aa7e686ea 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -420,6 +420,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 		    nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
 			flags &= ~TUNNEL_CSUM;
 
+		if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG]))
+			flags |= TUNNEL_DONT_FRAGMENT;
+
 		if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
 			dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
 
@@ -747,6 +750,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 				   key->tp_dst)) ||
 		    nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
 			       !(key->tun_flags & TUNNEL_CSUM)) ||
+		    ((key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+		     nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) ||
 		    tunnel_key_opts_dump(skb, info))
 			goto nla_put_failure;
 
-- 
cgit v1.2.3


From cd3891158a77685aee6129f7374a018d13540b2c Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 22 Mar 2023 13:07:58 -0700
Subject: iommu/sva: Move PASID helpers to sva code

Preparing to remove IOASID infrastructure, PASID management will be
under SVA code. Decouple mm code from IOASID.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20230322200803.869130-3-jacob.jun.pan@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu-sva.c | 10 +++++++++-
 include/linux/ioasid.h    | 11 +----------
 include/linux/iommu.h     | 14 +++++++++++++-
 include/linux/sched/mm.h  | 26 --------------------------
 kernel/fork.c             |  1 +
 5 files changed, 24 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 24bf9b2b58aa..fcfdc80a3939 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -44,7 +44,7 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
 	if (!pasid_valid(pasid))
 		ret = -ENOMEM;
 	else
-		mm_pasid_set(mm, pasid);
+		mm->pasid = pasid;
 out:
 	mutex_unlock(&iommu_sva_lock);
 	return ret;
@@ -238,3 +238,11 @@ out_put_mm:
 
 	return status;
 }
+
+void mm_pasid_drop(struct mm_struct *mm)
+{
+	if (pasid_valid(mm->pasid)) {
+		ioasid_free(mm->pasid);
+		mm->pasid = INVALID_IOASID;
+	}
+}
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index af1c9d62e642..734bf0e036ee 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -4,8 +4,8 @@
 
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/iommu.h>
 
-#define INVALID_IOASID ((ioasid_t)-1)
 typedef unsigned int ioasid_t;
 typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data);
 typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data);
@@ -40,10 +40,6 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_set_data(ioasid_t ioasid, void *data);
-static inline bool pasid_valid(ioasid_t ioasid)
-{
-	return ioasid != INVALID_IOASID;
-}
 
 #else /* !CONFIG_IOASID */
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
@@ -74,10 +70,5 @@ static inline int ioasid_set_data(ioasid_t ioasid, void *data)
 	return -ENOTSUPP;
 }
 
-static inline bool pasid_valid(ioasid_t ioasid)
-{
-	return false;
-}
-
 #endif /* CONFIG_IOASID */
 #endif /* __LINUX_IOASID_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6595454d4f48..d3f81dc6e4dd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -13,7 +13,6 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/of.h>
-#include <linux/ioasid.h>
 #include <uapi/linux/iommu.h>
 
 #define IOMMU_READ	(1 << 0)
@@ -192,6 +191,8 @@ enum iommu_dev_features {
 };
 
 #define IOMMU_PASID_INVALID	(-1U)
+typedef unsigned int ioasid_t;
+#define INVALID_IOASID ((ioasid_t)-1)
 
 #ifdef CONFIG_IOMMU_API
 
@@ -1172,7 +1173,16 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 	return false;
 }
 
+static inline bool pasid_valid(ioasid_t ioasid)
+{
+	return ioasid != INVALID_IOASID;
+}
 #ifdef CONFIG_IOMMU_SVA
+static inline void mm_pasid_init(struct mm_struct *mm)
+{
+	mm->pasid = INVALID_IOASID;
+}
+void mm_pasid_drop(struct mm_struct *mm);
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm);
 void iommu_sva_unbind_device(struct iommu_sva *handle);
@@ -1192,6 +1202,8 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
 {
 	return IOMMU_PASID_INVALID;
 }
+static inline void mm_pasid_init(struct mm_struct *mm) {}
+static inline void mm_pasid_drop(struct mm_struct *mm) {}
 #endif /* CONFIG_IOMMU_SVA */
 
 #endif /* __LINUX_IOMMU_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..da9712a3ba73 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -8,7 +8,6 @@
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
 #include <linux/sync_core.h>
-#include <linux/ioasid.h>
 
 /*
  * Routines for handling mm_structs
@@ -451,29 +450,4 @@ static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
 }
 #endif
 
-#ifdef CONFIG_IOMMU_SVA
-static inline void mm_pasid_init(struct mm_struct *mm)
-{
-	mm->pasid = INVALID_IOASID;
-}
-
-/* Associate a PASID with an mm_struct: */
-static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
-{
-	mm->pasid = pasid;
-}
-
-static inline void mm_pasid_drop(struct mm_struct *mm)
-{
-	if (pasid_valid(mm->pasid)) {
-		ioasid_free(mm->pasid);
-		mm->pasid = INVALID_IOASID;
-	}
-}
-#else
-static inline void mm_pasid_init(struct mm_struct *mm) {}
-static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
-static inline void mm_pasid_drop(struct mm_struct *mm) {}
-#endif
-
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index d8cda4c6de6c..9f14e4084fc0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
 #include <linux/stackprotector.h>
+#include <linux/iommu.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
-- 
cgit v1.2.3


From fffaed1e24b8d114e958d180cb4a8aed3febbb5a Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 22 Mar 2023 13:08:02 -0700
Subject: iommu/ioasid: Rename INVALID_IOASID

INVALID_IOASID and IOMMU_PASID_INVALID are duplicated. Rename
INVALID_IOASID and consolidate since we are moving away from IOASID
infrastructure.

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Link: https://lore.kernel.org/r/20230322200803.869130-7-jacob.jun.pan@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 Documentation/x86/sva.rst   | 2 +-
 arch/x86/kernel/traps.c     | 2 +-
 drivers/dma/idxd/device.c   | 8 ++++----
 drivers/dma/idxd/idxd.h     | 1 +
 drivers/dma/idxd/init.c     | 2 +-
 drivers/dma/idxd/irq.c      | 2 +-
 drivers/iommu/intel/dmar.c  | 4 ++--
 drivers/iommu/intel/iommu.c | 2 +-
 drivers/iommu/intel/svm.c   | 2 +-
 include/linux/ioasid.h      | 1 +
 include/linux/iommu.h       | 6 +++---
 mm/init-mm.c                | 4 ++--
 12 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/Documentation/x86/sva.rst b/Documentation/x86/sva.rst
index 2e9b8b0f9a0f..33cb05005982 100644
--- a/Documentation/x86/sva.rst
+++ b/Documentation/x86/sva.rst
@@ -107,7 +107,7 @@ process share the same page tables, thus the same MSR value.
 PASID Life Cycle Management
 ===========================
 
-PASID is initialized as INVALID_IOASID (-1) when a process is created.
+PASID is initialized as IOMMU_PASID_INVALID (-1) when a process is created.
 
 Only processes that access SVA-capable devices need to have a PASID
 allocated. This allocation happens when a process opens/binds an SVA-capable
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d317dc3d06a3..492a60febb11 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -40,7 +40,7 @@
 #include <linux/io.h>
 #include <linux/hardirq.h>
 #include <linux/atomic.h>
-#include <linux/ioasid.h>
+#include <linux/iommu.h>
 
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 5f321f3b4242..6fca8fa8d3a8 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -1194,7 +1194,7 @@ static void idxd_device_set_perm_entry(struct idxd_device *idxd,
 {
 	union msix_perm mperm;
 
-	if (ie->pasid == INVALID_IOASID)
+	if (ie->pasid == IOMMU_PASID_INVALID)
 		return;
 
 	mperm.bits = 0;
@@ -1224,7 +1224,7 @@ void idxd_wq_free_irq(struct idxd_wq *wq)
 	idxd_device_clear_perm_entry(idxd, ie);
 	ie->vector = -1;
 	ie->int_handle = INVALID_INT_HANDLE;
-	ie->pasid = INVALID_IOASID;
+	ie->pasid = IOMMU_PASID_INVALID;
 }
 
 int idxd_wq_request_irq(struct idxd_wq *wq)
@@ -1240,7 +1240,7 @@ int idxd_wq_request_irq(struct idxd_wq *wq)
 
 	ie = &wq->ie;
 	ie->vector = pci_irq_vector(pdev, ie->id);
-	ie->pasid = device_pasid_enabled(idxd) ? idxd->pasid : INVALID_IOASID;
+	ie->pasid = device_pasid_enabled(idxd) ? idxd->pasid : IOMMU_PASID_INVALID;
 	idxd_device_set_perm_entry(idxd, ie);
 
 	rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie);
@@ -1265,7 +1265,7 @@ err_int_handle:
 	free_irq(ie->vector, ie);
 err_irq:
 	idxd_device_clear_perm_entry(idxd, ie);
-	ie->pasid = INVALID_IOASID;
+	ie->pasid = IOMMU_PASID_INVALID;
 	return rc;
 }
 
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 7ced8d283d98..417e602a46b6 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -13,6 +13,7 @@
 #include <linux/ioasid.h>
 #include <linux/bitmap.h>
 #include <linux/perf_event.h>
+#include <linux/iommu.h>
 #include <uapi/linux/idxd.h>
 #include "registers.h"
 
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 640d3048368e..e6ee267da0ff 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -105,7 +105,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
 		ie = idxd_get_ie(idxd, msix_idx);
 		ie->id = msix_idx;
 		ie->int_handle = INVALID_INT_HANDLE;
-		ie->pasid = INVALID_IOASID;
+		ie->pasid = IOMMU_PASID_INVALID;
 
 		spin_lock_init(&ie->list_lock);
 		init_llist_head(&ie->pending_llist);
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index aa314ebec587..242f1f0b9f09 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -80,7 +80,7 @@ static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie)
 	desc.opcode = DSA_OPCODE_DRAIN;
 	desc.priv = 1;
 
-	if (ie->pasid != INVALID_IOASID)
+	if (ie->pasid != IOMMU_PASID_INVALID)
 		desc.pasid = ie->pasid;
 	desc.int_handle = ie->int_handle;
 	portal = idxd_wq_portal_addr(wq);
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 0f348439ef0e..10bb20ff50fd 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1958,7 +1958,7 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
 		return 0;
 	}
 
-	if (pasid == INVALID_IOASID)
+	if (pasid == IOMMU_PASID_INVALID)
 		pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n",
 		       type ? "DMA Read" : "DMA Write",
 		       source_id >> 8, PCI_SLOT(source_id & 0xFF),
@@ -2039,7 +2039,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
 		if (!ratelimited)
 			/* Using pasid -1 if pasid is not present */
 			dmar_fault_do_one(iommu, type, fault_reason,
-					  pasid_present ? pasid : INVALID_IOASID,
+					  pasid_present ? pasid : IOMMU_PASID_INVALID,
 					  source_id, guest_addr);
 
 		fault_index++;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6c2d1ffb5b0a..350c33605fd3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -876,7 +876,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 		return;
 	}
 	/* For request-without-pasid, get the pasid from context entry */
-	if (intel_iommu_sm && pasid == INVALID_IOASID)
+	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
 		pasid = PASID_RID2PASID;
 
 	dir_index = pasid >> PASID_PDE_SHIFT;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 7367f56c3bad..3848a1b0800c 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -273,7 +273,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
 	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
 		return -EINVAL;
 
-	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
+	if (pasid == IOMMU_PASID_INVALID || pasid >= PASID_MAX)
 		return -EINVAL;
 
 	svm = pasid_private_find(pasid);
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 734bf0e036ee..bbc9c6fd3310 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -7,6 +7,7 @@
 #include <linux/iommu.h>
 
 typedef unsigned int ioasid_t;
+#define INVALID_IOASID ((ioasid_t)-1)
 typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data);
 typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d3f81dc6e4dd..54f535ff9868 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -192,7 +192,6 @@ enum iommu_dev_features {
 
 #define IOMMU_PASID_INVALID	(-1U)
 typedef unsigned int ioasid_t;
-#define INVALID_IOASID ((ioasid_t)-1)
 
 #ifdef CONFIG_IOMMU_API
 
@@ -1175,12 +1174,13 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 
 static inline bool pasid_valid(ioasid_t ioasid)
 {
-	return ioasid != INVALID_IOASID;
+	return ioasid != IOMMU_PASID_INVALID;
 }
+
 #ifdef CONFIG_IOMMU_SVA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
-	mm->pasid = INVALID_IOASID;
+	mm->pasid = IOMMU_PASID_INVALID;
 }
 void mm_pasid_drop(struct mm_struct *mm);
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..a084039f55d8 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -10,7 +10,7 @@
 
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
-#include <linux/ioasid.h>
+#include <linux/iommu.h>
 #include <asm/mmu.h>
 
 #ifndef INIT_MM_CONTEXT
@@ -40,7 +40,7 @@ struct mm_struct init_mm = {
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-	.pasid		= INVALID_IOASID,
+	.pasid		= IOMMU_PASID_INVALID,
 #endif
 	INIT_MM_CONTEXT(init_mm)
 };
-- 
cgit v1.2.3


From 99b5726b44230329f35b4c4d7fe1577d4f4edb31 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 22 Mar 2023 13:08:03 -0700
Subject: iommu: Remove ioasid infrastructure

This has no use anymore, delete it all.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Link: https://lore.kernel.org/r/20230322200803.869130-8-jacob.jun.pan@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/dma/idxd/idxd.h     |   1 -
 drivers/iommu/Kconfig       |   5 -
 drivers/iommu/Makefile      |   1 -
 drivers/iommu/intel/iommu.h |   1 -
 drivers/iommu/intel/svm.c   |   1 -
 drivers/iommu/ioasid.c      | 422 --------------------------------------------
 drivers/iommu/iommu-sva.h   |   1 -
 include/linux/ioasid.h      |  75 --------
 8 files changed, 507 deletions(-)
 delete mode 100644 drivers/iommu/ioasid.c
 delete mode 100644 include/linux/ioasid.h

(limited to 'include')

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 417e602a46b6..dd2a6ed8949b 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -10,7 +10,6 @@
 #include <linux/cdev.h>
 #include <linux/idr.h>
 #include <linux/pci.h>
-#include <linux/ioasid.h>
 #include <linux/bitmap.h>
 #include <linux/perf_event.h>
 #include <linux/iommu.h>
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c4928514e5e2..db98c3f86e8c 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -3,10 +3,6 @@
 config IOMMU_IOVA
 	tristate
 
-# The IOASID library may also be used by non-IOMMU_API users
-config IOASID
-	tristate
-
 # IOMMU_API always gets selected by whoever wants it.
 config IOMMU_API
 	bool
@@ -160,7 +156,6 @@ config IOMMU_DMA
 # Shared Virtual Addressing
 config IOMMU_SVA
 	bool
-	select IOASID
 
 config FSL_PAMU
 	bool "Freescale IOMMU support"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index f461d0651385..769e43d780ce 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -9,7 +9,6 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
-obj-$(CONFIG_IOASID) += ioasid.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index a2010fb120e2..65b15be72878 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -19,7 +19,6 @@
 #include <linux/iommu.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/dmar.h>
-#include <linux/ioasid.h>
 #include <linux/bitfield.h>
 #include <linux/xarray.h>
 #include <linux/perf_event.h>
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 3848a1b0800c..e95b339e9cdc 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -16,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/mm_types.h>
 #include <linux/xarray.h>
-#include <linux/ioasid.h>
 #include <asm/page.h>
 #include <asm/fpu/api.h>
 
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
deleted file mode 100644
index a786c034907c..000000000000
--- a/drivers/iommu/ioasid.c
+++ /dev/null
@@ -1,422 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * I/O Address Space ID allocator. There is one global IOASID space, split into
- * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
- * free IOASIDs with ioasid_alloc() and ioasid_free().
- */
-#include <linux/ioasid.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/xarray.h>
-
-struct ioasid_data {
-	ioasid_t id;
-	struct ioasid_set *set;
-	void *private;
-	struct rcu_head rcu;
-};
-
-/*
- * struct ioasid_allocator_data - Internal data structure to hold information
- * about an allocator. There are two types of allocators:
- *
- * - Default allocator always has its own XArray to track the IOASIDs allocated.
- * - Custom allocators may share allocation helpers with different private data.
- *   Custom allocators that share the same helper functions also share the same
- *   XArray.
- * Rules:
- * 1. Default allocator is always available, not dynamically registered. This is
- *    to prevent race conditions with early boot code that want to register
- *    custom allocators or allocate IOASIDs.
- * 2. Custom allocators take precedence over the default allocator.
- * 3. When all custom allocators sharing the same helper functions are
- *    unregistered (e.g. due to hotplug), all outstanding IOASIDs must be
- *    freed. Otherwise, outstanding IOASIDs will be lost and orphaned.
- * 4. When switching between custom allocators sharing the same helper
- *    functions, outstanding IOASIDs are preserved.
- * 5. When switching between custom allocator and default allocator, all IOASIDs
- *    must be freed to ensure unadulterated space for the new allocator.
- *
- * @ops:	allocator helper functions and its data
- * @list:	registered custom allocators
- * @slist:	allocators share the same ops but different data
- * @flags:	attributes of the allocator
- * @xa:		xarray holds the IOASID space
- * @rcu:	used for kfree_rcu when unregistering allocator
- */
-struct ioasid_allocator_data {
-	struct ioasid_allocator_ops *ops;
-	struct list_head list;
-	struct list_head slist;
-#define IOASID_ALLOCATOR_CUSTOM BIT(0) /* Needs framework to track results */
-	unsigned long flags;
-	struct xarray xa;
-	struct rcu_head rcu;
-};
-
-static DEFINE_SPINLOCK(ioasid_allocator_lock);
-static LIST_HEAD(allocators_list);
-
-static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque);
-static void default_free(ioasid_t ioasid, void *opaque);
-
-static struct ioasid_allocator_ops default_ops = {
-	.alloc = default_alloc,
-	.free = default_free,
-};
-
-static struct ioasid_allocator_data default_allocator = {
-	.ops = &default_ops,
-	.flags = 0,
-	.xa = XARRAY_INIT(ioasid_xa, XA_FLAGS_ALLOC),
-};
-
-static struct ioasid_allocator_data *active_allocator = &default_allocator;
-
-static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque)
-{
-	ioasid_t id;
-
-	if (xa_alloc(&default_allocator.xa, &id, opaque, XA_LIMIT(min, max), GFP_ATOMIC)) {
-		pr_err("Failed to alloc ioasid from %d to %d\n", min, max);
-		return INVALID_IOASID;
-	}
-
-	return id;
-}
-
-static void default_free(ioasid_t ioasid, void *opaque)
-{
-	struct ioasid_data *ioasid_data;
-
-	ioasid_data = xa_erase(&default_allocator.xa, ioasid);
-	kfree_rcu(ioasid_data, rcu);
-}
-
-/* Allocate and initialize a new custom allocator with its helper functions */
-static struct ioasid_allocator_data *ioasid_alloc_allocator(struct ioasid_allocator_ops *ops)
-{
-	struct ioasid_allocator_data *ia_data;
-
-	ia_data = kzalloc(sizeof(*ia_data), GFP_ATOMIC);
-	if (!ia_data)
-		return NULL;
-
-	xa_init_flags(&ia_data->xa, XA_FLAGS_ALLOC);
-	INIT_LIST_HEAD(&ia_data->slist);
-	ia_data->flags |= IOASID_ALLOCATOR_CUSTOM;
-	ia_data->ops = ops;
-
-	/* For tracking custom allocators that share the same ops */
-	list_add_tail(&ops->list, &ia_data->slist);
-
-	return ia_data;
-}
-
-static bool use_same_ops(struct ioasid_allocator_ops *a, struct ioasid_allocator_ops *b)
-{
-	return (a->free == b->free) && (a->alloc == b->alloc);
-}
-
-/**
- * ioasid_register_allocator - register a custom allocator
- * @ops: the custom allocator ops to be registered
- *
- * Custom allocators take precedence over the default xarray based allocator.
- * Private data associated with the IOASID allocated by the custom allocators
- * are managed by IOASID framework similar to data stored in xa by default
- * allocator.
- *
- * There can be multiple allocators registered but only one is active. In case
- * of runtime removal of a custom allocator, the next one is activated based
- * on the registration ordering.
- *
- * Multiple allocators can share the same alloc() function, in this case the
- * IOASID space is shared.
- */
-int ioasid_register_allocator(struct ioasid_allocator_ops *ops)
-{
-	struct ioasid_allocator_data *ia_data;
-	struct ioasid_allocator_data *pallocator;
-	int ret = 0;
-
-	spin_lock(&ioasid_allocator_lock);
-
-	ia_data = ioasid_alloc_allocator(ops);
-	if (!ia_data) {
-		ret = -ENOMEM;
-		goto out_unlock;
-	}
-
-	/*
-	 * No particular preference, we activate the first one and keep
-	 * the later registered allocators in a list in case the first one gets
-	 * removed due to hotplug.
-	 */
-	if (list_empty(&allocators_list)) {
-		WARN_ON(active_allocator != &default_allocator);
-		/* Use this new allocator if default is not active */
-		if (xa_empty(&active_allocator->xa)) {
-			rcu_assign_pointer(active_allocator, ia_data);
-			list_add_tail(&ia_data->list, &allocators_list);
-			goto out_unlock;
-		}
-		pr_warn("Default allocator active with outstanding IOASID\n");
-		ret = -EAGAIN;
-		goto out_free;
-	}
-
-	/* Check if the allocator is already registered */
-	list_for_each_entry(pallocator, &allocators_list, list) {
-		if (pallocator->ops == ops) {
-			pr_err("IOASID allocator already registered\n");
-			ret = -EEXIST;
-			goto out_free;
-		} else if (use_same_ops(pallocator->ops, ops)) {
-			/*
-			 * If the new allocator shares the same ops,
-			 * then they will share the same IOASID space.
-			 * We should put them under the same xarray.
-			 */
-			list_add_tail(&ops->list, &pallocator->slist);
-			goto out_free;
-		}
-	}
-	list_add_tail(&ia_data->list, &allocators_list);
-
-	spin_unlock(&ioasid_allocator_lock);
-	return 0;
-out_free:
-	kfree(ia_data);
-out_unlock:
-	spin_unlock(&ioasid_allocator_lock);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(ioasid_register_allocator);
-
-/**
- * ioasid_unregister_allocator - Remove a custom IOASID allocator ops
- * @ops: the custom allocator to be removed
- *
- * Remove an allocator from the list, activate the next allocator in
- * the order it was registered. Or revert to default allocator if all
- * custom allocators are unregistered without outstanding IOASIDs.
- */
-void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops)
-{
-	struct ioasid_allocator_data *pallocator;
-	struct ioasid_allocator_ops *sops;
-
-	spin_lock(&ioasid_allocator_lock);
-	if (list_empty(&allocators_list)) {
-		pr_warn("No custom IOASID allocators active!\n");
-		goto exit_unlock;
-	}
-
-	list_for_each_entry(pallocator, &allocators_list, list) {
-		if (!use_same_ops(pallocator->ops, ops))
-			continue;
-
-		if (list_is_singular(&pallocator->slist)) {
-			/* No shared helper functions */
-			list_del(&pallocator->list);
-			/*
-			 * All IOASIDs should have been freed before
-			 * the last allocator that shares the same ops
-			 * is unregistered.
-			 */
-			WARN_ON(!xa_empty(&pallocator->xa));
-			if (list_empty(&allocators_list)) {
-				pr_info("No custom IOASID allocators, switch to default.\n");
-				rcu_assign_pointer(active_allocator, &default_allocator);
-			} else if (pallocator == active_allocator) {
-				rcu_assign_pointer(active_allocator,
-						list_first_entry(&allocators_list,
-								struct ioasid_allocator_data, list));
-				pr_info("IOASID allocator changed");
-			}
-			kfree_rcu(pallocator, rcu);
-			break;
-		}
-		/*
-		 * Find the matching shared ops to delete,
-		 * but keep outstanding IOASIDs
-		 */
-		list_for_each_entry(sops, &pallocator->slist, list) {
-			if (sops == ops) {
-				list_del(&ops->list);
-				break;
-			}
-		}
-		break;
-	}
-
-exit_unlock:
-	spin_unlock(&ioasid_allocator_lock);
-}
-EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
-
-/**
- * ioasid_set_data - Set private data for an allocated ioasid
- * @ioasid: the ID to set data
- * @data:   the private data
- *
- * For IOASID that is already allocated, private data can be set
- * via this API. Future lookup can be done via ioasid_find.
- */
-int ioasid_set_data(ioasid_t ioasid, void *data)
-{
-	struct ioasid_data *ioasid_data;
-	int ret = 0;
-
-	spin_lock(&ioasid_allocator_lock);
-	ioasid_data = xa_load(&active_allocator->xa, ioasid);
-	if (ioasid_data)
-		rcu_assign_pointer(ioasid_data->private, data);
-	else
-		ret = -ENOENT;
-	spin_unlock(&ioasid_allocator_lock);
-
-	/*
-	 * Wait for readers to stop accessing the old private data, so the
-	 * caller can free it.
-	 */
-	if (!ret)
-		synchronize_rcu();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(ioasid_set_data);
-
-/**
- * ioasid_alloc - Allocate an IOASID
- * @set: the IOASID set
- * @min: the minimum ID (inclusive)
- * @max: the maximum ID (inclusive)
- * @private: data private to the caller
- *
- * Allocate an ID between @min and @max. The @private pointer is stored
- * internally and can be retrieved with ioasid_find().
- *
- * Return: the allocated ID on success, or %INVALID_IOASID on failure.
- */
-ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
-		      void *private)
-{
-	struct ioasid_data *data;
-	void *adata;
-	ioasid_t id;
-
-	data = kzalloc(sizeof(*data), GFP_ATOMIC);
-	if (!data)
-		return INVALID_IOASID;
-
-	data->set = set;
-	data->private = private;
-
-	/*
-	 * Custom allocator needs allocator data to perform platform specific
-	 * operations.
-	 */
-	spin_lock(&ioasid_allocator_lock);
-	adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ? active_allocator->ops->pdata : data;
-	id = active_allocator->ops->alloc(min, max, adata);
-	if (id == INVALID_IOASID) {
-		pr_err("Failed ASID allocation %lu\n", active_allocator->flags);
-		goto exit_free;
-	}
-
-	if ((active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) &&
-	     xa_alloc(&active_allocator->xa, &id, data, XA_LIMIT(id, id), GFP_ATOMIC)) {
-		/* Custom allocator needs framework to store and track allocation results */
-		pr_err("Failed to alloc ioasid from %d\n", id);
-		active_allocator->ops->free(id, active_allocator->ops->pdata);
-		goto exit_free;
-	}
-	data->id = id;
-
-	spin_unlock(&ioasid_allocator_lock);
-	return id;
-exit_free:
-	spin_unlock(&ioasid_allocator_lock);
-	kfree(data);
-	return INVALID_IOASID;
-}
-EXPORT_SYMBOL_GPL(ioasid_alloc);
-
-/**
- * ioasid_free - Free an ioasid
- * @ioasid: the ID to remove
- */
-void ioasid_free(ioasid_t ioasid)
-{
-	struct ioasid_data *ioasid_data;
-
-	spin_lock(&ioasid_allocator_lock);
-	ioasid_data = xa_load(&active_allocator->xa, ioasid);
-	if (!ioasid_data) {
-		pr_err("Trying to free unknown IOASID %u\n", ioasid);
-		goto exit_unlock;
-	}
-
-	active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
-	/* Custom allocator needs additional steps to free the xa element */
-	if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
-		ioasid_data = xa_erase(&active_allocator->xa, ioasid);
-		kfree_rcu(ioasid_data, rcu);
-	}
-
-exit_unlock:
-	spin_unlock(&ioasid_allocator_lock);
-}
-EXPORT_SYMBOL_GPL(ioasid_free);
-
-/**
- * ioasid_find - Find IOASID data
- * @set: the IOASID set
- * @ioasid: the IOASID to find
- * @getter: function to call on the found object
- *
- * The optional getter function allows to take a reference to the found object
- * under the rcu lock. The function can also check if the object is still valid:
- * if @getter returns false, then the object is invalid and NULL is returned.
- *
- * If the IOASID exists, return the private pointer passed to ioasid_alloc.
- * Private data can be NULL if not set. Return an error if the IOASID is not
- * found, or if @set is not NULL and the IOASID does not belong to the set.
- */
-void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
-		  bool (*getter)(void *))
-{
-	void *priv;
-	struct ioasid_data *ioasid_data;
-	struct ioasid_allocator_data *idata;
-
-	rcu_read_lock();
-	idata = rcu_dereference(active_allocator);
-	ioasid_data = xa_load(&idata->xa, ioasid);
-	if (!ioasid_data) {
-		priv = ERR_PTR(-ENOENT);
-		goto unlock;
-	}
-	if (set && ioasid_data->set != set) {
-		/* data found but does not belong to the set */
-		priv = ERR_PTR(-EACCES);
-		goto unlock;
-	}
-	/* Now IOASID and its set is verified, we can return the private data */
-	priv = rcu_dereference(ioasid_data->private);
-	if (getter && !getter(priv))
-		priv = NULL;
-unlock:
-	rcu_read_unlock();
-
-	return priv;
-}
-EXPORT_SYMBOL_GPL(ioasid_find);
-
-MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@arm.com>");
-MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
-MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator");
-MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h
index c22d0174ad61..54946b5a7caf 100644
--- a/drivers/iommu/iommu-sva.h
+++ b/drivers/iommu/iommu-sva.h
@@ -5,7 +5,6 @@
 #ifndef _IOMMU_SVA_H
 #define _IOMMU_SVA_H
 
-#include <linux/ioasid.h>
 #include <linux/mm_types.h>
 
 /* I/O Page fault */
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
deleted file mode 100644
index bbc9c6fd3310..000000000000
--- a/include/linux/ioasid.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_IOASID_H
-#define __LINUX_IOASID_H
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/iommu.h>
-
-typedef unsigned int ioasid_t;
-#define INVALID_IOASID ((ioasid_t)-1)
-typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data);
-typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data);
-
-struct ioasid_set {
-	int dummy;
-};
-
-/**
- * struct ioasid_allocator_ops - IOASID allocator helper functions and data
- *
- * @alloc:	helper function to allocate IOASID
- * @free:	helper function to free IOASID
- * @list:	for tracking ops that share helper functions but not data
- * @pdata:	data belong to the allocator, provided when calling alloc()
- */
-struct ioasid_allocator_ops {
-	ioasid_alloc_fn_t alloc;
-	ioasid_free_fn_t free;
-	struct list_head list;
-	void *pdata;
-};
-
-#define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 }
-
-#if IS_ENABLED(CONFIG_IOASID)
-ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
-		      void *private);
-void ioasid_free(ioasid_t ioasid);
-void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
-		  bool (*getter)(void *));
-int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
-void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
-int ioasid_set_data(ioasid_t ioasid, void *data);
-
-#else /* !CONFIG_IOASID */
-static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
-				    ioasid_t max, void *private)
-{
-	return INVALID_IOASID;
-}
-
-static inline void ioasid_free(ioasid_t ioasid) { }
-
-static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
-				bool (*getter)(void *))
-{
-	return NULL;
-}
-
-static inline int ioasid_register_allocator(struct ioasid_allocator_ops *allocator)
-{
-	return -ENOTSUPP;
-}
-
-static inline void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator)
-{
-}
-
-static inline int ioasid_set_data(ioasid_t ioasid, void *data)
-{
-	return -ENOTSUPP;
-}
-
-#endif /* CONFIG_IOASID */
-#endif /* __LINUX_IOASID_H */
-- 
cgit v1.2.3


From 16812c96550c30a8d5743167ef4e462d6fbe7472 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 29 Mar 2023 21:47:21 +0800
Subject: iommu/vt-d: Fix an IOMMU perfmon warning when CPU hotplug

A warning can be triggered when hotplug CPU 0.
$ echo 0 > /sys/devices/system/cpu/cpu0/online

 ------------[ cut here ]------------
 Voluntary context switch within RCU read-side critical section!
 WARNING: CPU: 0 PID: 19 at kernel/rcu/tree_plugin.h:318
          rcu_note_context_switch+0x4f4/0x580
 RIP: 0010:rcu_note_context_switch+0x4f4/0x580
 Call Trace:
  <TASK>
  ? perf_event_update_userpage+0x104/0x150
  __schedule+0x8d/0x960
  ? perf_event_set_state.part.82+0x11/0x50
  schedule+0x44/0xb0
  schedule_timeout+0x226/0x310
  ? __perf_event_disable+0x64/0x1a0
  ? _raw_spin_unlock+0x14/0x30
  wait_for_completion+0x94/0x130
  __wait_rcu_gp+0x108/0x130
  synchronize_rcu+0x67/0x70
  ? invoke_rcu_core+0xb0/0xb0
  ? __bpf_trace_rcu_stall_warning+0x10/0x10
  perf_pmu_migrate_context+0x121/0x370
  iommu_pmu_cpu_offline+0x6a/0xa0
  ? iommu_pmu_del+0x1e0/0x1e0
  cpuhp_invoke_callback+0x129/0x510
  cpuhp_thread_fun+0x94/0x150
  smpboot_thread_fn+0x183/0x220
  ? sort_range+0x20/0x20
  kthread+0xe6/0x110
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>
 ---[ end trace 0000000000000000 ]---

The synchronize_rcu() will be invoked in the perf_pmu_migrate_context(),
when migrating a PMU to a new CPU. However, the current for_each_iommu()
is within RCU read-side critical section.

Two methods were considered to fix the issue.
- Use the dmar_global_lock to replace the RCU read lock when going
  through the drhd list. But it triggers a lockdep warning.
- Use the cpuhp_setup_state_multi() to set up a dedicated state for each
  IOMMU PMU. The lock can be avoided.

The latter method is implemented in this patch. Since each IOMMU PMU has
a dedicated state, add cpuhp_node and cpu in struct iommu_pmu to track
the state. The state can be dynamically allocated now. Remove the
CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE.

Fixes: 46284c6ceb5e ("iommu/vt-d: Support cpumask for IOMMU perfmon")
Reported-by: Ammy Yi <ammy.yi@intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lore.kernel.org/r/20230328182028.1366416-1-kan.liang@linux.intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20230329134721.469447-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.h   |  2 ++
 drivers/iommu/intel/perfmon.c | 68 ++++++++++++++++++++++++++++---------------
 include/linux/cpuhotplug.h    |  1 -
 3 files changed, 46 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index d6df3b865812..694ab9b7d3e9 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -641,6 +641,8 @@ struct iommu_pmu {
 	DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
 	struct perf_event	*event_list[IOMMU_PMU_IDX_MAX];
 	unsigned char		irq_name[16];
+	struct hlist_node	cpuhp_node;
+	int			cpu;
 };
 
 #define IOMMU_IRQ_ID_OFFSET_PRQ		(DMAR_UNITS_SUPPORTED)
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index e17d9743a0d8..cf43e798eca4 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -773,19 +773,34 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
 	iommu->perf_irq = 0;
 }
 
-static int iommu_pmu_cpu_online(unsigned int cpu)
+static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
+	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
+
 	if (cpumask_empty(&iommu_pmu_cpu_mask))
 		cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
 
+	if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
+		iommu_pmu->cpu = cpu;
+
 	return 0;
 }
 
-static int iommu_pmu_cpu_offline(unsigned int cpu)
+static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
 {
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-	int target;
+	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
+	int target = cpumask_first(&iommu_pmu_cpu_mask);
+
+	/*
+	 * The iommu_pmu_cpu_mask has been updated when offline the CPU
+	 * for the first iommu_pmu. Migrate the other iommu_pmu to the
+	 * new target.
+	 */
+	if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
+		perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
+		iommu_pmu->cpu = target;
+		return 0;
+	}
 
 	if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
 		return 0;
@@ -795,45 +810,50 @@ static int iommu_pmu_cpu_offline(unsigned int cpu)
 	if (target < nr_cpu_ids)
 		cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
 	else
-		target = -1;
+		return 0;
 
-	rcu_read_lock();
-
-	for_each_iommu(iommu, drhd) {
-		if (!iommu->pmu)
-			continue;
-		perf_pmu_migrate_context(&iommu->pmu->pmu, cpu, target);
-	}
-	rcu_read_unlock();
+	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
+	iommu_pmu->cpu = target;
 
 	return 0;
 }
 
 static int nr_iommu_pmu;
+static enum cpuhp_state iommu_cpuhp_slot;
 
 static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
 {
 	int ret;
 
-	if (nr_iommu_pmu++)
-		return 0;
+	if (!nr_iommu_pmu) {
+		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					      "driver/iommu/intel/perfmon:online",
+					      iommu_pmu_cpu_online,
+					      iommu_pmu_cpu_offline);
+		if (ret < 0)
+			return ret;
+		iommu_cpuhp_slot = ret;
+	}
 
-	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE,
-				"driver/iommu/intel/perfmon:online",
-				iommu_pmu_cpu_online,
-				iommu_pmu_cpu_offline);
-	if (ret)
-		nr_iommu_pmu = 0;
+	ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
+	if (ret) {
+		if (!nr_iommu_pmu)
+			cpuhp_remove_multi_state(iommu_cpuhp_slot);
+		return ret;
+	}
+	nr_iommu_pmu++;
 
-	return ret;
+	return 0;
 }
 
 static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
 {
+	cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
+
 	if (--nr_iommu_pmu)
 		return;
 
-	cpuhp_remove_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE);
+	cpuhp_remove_multi_state(iommu_cpuhp_slot);
 }
 
 void iommu_pmu_register(struct intel_iommu *iommu)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index c6fab004104a..5b2f8147d1ae 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -218,7 +218,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_X86_IDXD_ONLINE,
-	CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
-- 
cgit v1.2.3


From 653a180957a85c3fc30320cc7e84f5dc913a64f8 Mon Sep 17 00:00:00 2001
From: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Date: Thu, 30 Mar 2023 17:14:02 +0800
Subject: net: phylink: add phylink_expects_phy() method

Provide phylink_expects_phy() to allow MAC drivers to check if it
is expecting a PHY to attach to. Since fixed-linked setups do not
need to attach to a PHY.

Provides a boolean value as to if the MAC should expect a PHY.
Returns true if a PHY is expected.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 19 +++++++++++++++++++
 include/linux/phylink.h   |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1a2f074685fa..30c166b33468 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1586,6 +1586,25 @@ void phylink_destroy(struct phylink *pl)
 }
 EXPORT_SYMBOL_GPL(phylink_destroy);
 
+/**
+ * phylink_expects_phy() - Determine if phylink expects a phy to be attached
+ * @pl: a pointer to a &struct phylink returned from phylink_create()
+ *
+ * When using fixed-link mode, or in-band mode with 1000base-X or 2500base-X,
+ * no PHY is needed.
+ *
+ * Returns true if phylink will be expecting a PHY.
+ */
+bool phylink_expects_phy(struct phylink *pl)
+{
+	if (pl->cfg_link_an_mode == MLO_AN_FIXED ||
+	    (pl->cfg_link_an_mode == MLO_AN_INBAND &&
+	     phy_interface_mode_is_8023z(pl->link_config.interface)))
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(phylink_expects_phy);
+
 static void phylink_phy_change(struct phy_device *phydev, bool up)
 {
 	struct phylink *pl = phydev->phylink;
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index c492c26202b5..637698ed5cb6 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -574,6 +574,7 @@ struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *,
 			       phy_interface_t iface,
 			       const struct phylink_mac_ops *mac_ops);
 void phylink_destroy(struct phylink *);
+bool phylink_expects_phy(struct phylink *pl);
 
 int phylink_connect_phy(struct phylink *, struct phy_device *);
 int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags);
-- 
cgit v1.2.3


From c616fb0cbae8af5f3f837f54c625700992dcd78d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 24 Mar 2023 17:59:38 +0800
Subject: crypto: lib/utils - Move utilities into new header

The utilities have historically resided in algapi.h as they were
first used internally before being exported.  Move them into a
new header file so external users don't see internal API details.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/algapi.h | 63 +-----------------------------------------
 include/crypto/utils.h  | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/crypto/utils.c      |  2 +-
 3 files changed, 75 insertions(+), 63 deletions(-)
 create mode 100644 include/crypto/utils.h

(limited to 'include')

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index e28957993b56..bbf8c43c3320 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -7,15 +7,12 @@
 #ifndef _CRYPTO_ALGAPI_H
 #define _CRYPTO_ALGAPI_H
 
+#include <crypto/utils.h>
 #include <linux/align.h>
 #include <linux/cache.h>
 #include <linux/crypto.h>
-#include <linux/kconfig.h>
-#include <linux/list.h>
 #include <linux/types.h>
 
-#include <asm/unaligned.h>
-
 /*
  * Maximum values for blocksize and alignmask, used to allocate
  * static buffers that are big enough for any combination of
@@ -172,47 +169,6 @@ static inline unsigned int crypto_queue_len(struct crypto_queue *queue)
 }
 
 void crypto_inc(u8 *a, unsigned int size);
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int size);
-
-static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
-{
-	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
-	    __builtin_constant_p(size) &&
-	    (size % sizeof(unsigned long)) == 0) {
-		unsigned long *d = (unsigned long *)dst;
-		unsigned long *s = (unsigned long *)src;
-		unsigned long l;
-
-		while (size > 0) {
-			l = get_unaligned(d) ^ get_unaligned(s++);
-			put_unaligned(l, d++);
-			size -= sizeof(unsigned long);
-		}
-	} else {
-		__crypto_xor(dst, dst, src, size);
-	}
-}
-
-static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
-				  unsigned int size)
-{
-	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
-	    __builtin_constant_p(size) &&
-	    (size % sizeof(unsigned long)) == 0) {
-		unsigned long *d = (unsigned long *)dst;
-		unsigned long *s1 = (unsigned long *)src1;
-		unsigned long *s2 = (unsigned long *)src2;
-		unsigned long l;
-
-		while (size > 0) {
-			l = get_unaligned(s1++) ^ get_unaligned(s2++);
-			put_unaligned(l, d++);
-			size -= sizeof(unsigned long);
-		}
-	} else {
-		__crypto_xor(dst, src1, src2, size);
-	}
-}
 
 static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
 {
@@ -291,23 +247,6 @@ static inline u32 crypto_algt_inherited_mask(struct crypto_attr_type *algt)
 	return crypto_requires_off(algt, CRYPTO_ALG_INHERITED_FLAGS);
 }
 
-noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);
-
-/**
- * crypto_memneq - Compare two areas of memory without leaking
- *		   timing information.
- *
- * @a: One area of memory
- * @b: Another area of memory
- * @size: The size of the area.
- *
- * Returns 0 when data is equal, 1 otherwise.
- */
-static inline int crypto_memneq(const void *a, const void *b, size_t size)
-{
-	return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
-}
-
 int crypto_register_notifier(struct notifier_block *nb);
 int crypto_unregister_notifier(struct notifier_block *nb);
 
diff --git a/include/crypto/utils.h b/include/crypto/utils.h
new file mode 100644
index 000000000000..acbb917a00c6
--- /dev/null
+++ b/include/crypto/utils.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic utilities
+ *
+ * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#ifndef _CRYPTO_UTILS_H
+#define _CRYPTO_UTILS_H
+
+#include <asm/unaligned.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int size);
+
+static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
+{
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    __builtin_constant_p(size) &&
+	    (size % sizeof(unsigned long)) == 0) {
+		unsigned long *d = (unsigned long *)dst;
+		unsigned long *s = (unsigned long *)src;
+		unsigned long l;
+
+		while (size > 0) {
+			l = get_unaligned(d) ^ get_unaligned(s++);
+			put_unaligned(l, d++);
+			size -= sizeof(unsigned long);
+		}
+	} else {
+		__crypto_xor(dst, dst, src, size);
+	}
+}
+
+static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
+				  unsigned int size)
+{
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    __builtin_constant_p(size) &&
+	    (size % sizeof(unsigned long)) == 0) {
+		unsigned long *d = (unsigned long *)dst;
+		unsigned long *s1 = (unsigned long *)src1;
+		unsigned long *s2 = (unsigned long *)src2;
+		unsigned long l;
+
+		while (size > 0) {
+			l = get_unaligned(s1++) ^ get_unaligned(s2++);
+			put_unaligned(l, d++);
+			size -= sizeof(unsigned long);
+		}
+	} else {
+		__crypto_xor(dst, src1, src2, size);
+	}
+}
+
+noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);
+
+/**
+ * crypto_memneq - Compare two areas of memory without leaking
+ *		   timing information.
+ *
+ * @a: One area of memory
+ * @b: Another area of memory
+ * @size: The size of the area.
+ *
+ * Returns 0 when data is equal, 1 otherwise.
+ */
+static inline int crypto_memneq(const void *a, const void *b, size_t size)
+{
+	return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
+}
+
+#endif	/* _CRYPTO_UTILS_H */
diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c
index 53230ab1b195..c852c7151b0a 100644
--- a/lib/crypto/utils.c
+++ b/lib/crypto/utils.c
@@ -6,7 +6,7 @@
  */
 
 #include <asm/unaligned.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
 #include <linux/module.h>
 
 /*
-- 
cgit v1.2.3


From 9e410fe3dc9a938bc47f71dff254be7419bd40d2 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 3 Mar 2023 13:34:11 -0800
Subject: dmaengine: idxd: Add descriptor definitions for 16 bytes of pattern
 in memory fill operation

The memory fill operation (0x04) can fill in memory with either 8 bytes
or 16 bytes of pattern. To fill in memory with 16 bytes of pattern, the
first 8 bytes are provided in pattern lower in bytes 16-23 and the next
8 bytes are in pattern upper in bytes 40-47 in the descriptor. Currently
only 8 bytes of pattern is enabled.

Add descriptor definitions for pattern lower and pattern upper so that
user can use 16 bytes of pattern to fill memory.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20230303213413.3357431-2-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 1d553bedbdb5..c43d7df5fc15 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -180,6 +180,7 @@ struct dsa_hw_desc {
 		uint64_t	rdback_addr;
 		uint64_t	pattern;
 		uint64_t	desc_list_addr;
+		uint64_t	pattern_lower;
 	};
 	union {
 		uint64_t	dst_addr;
@@ -244,6 +245,9 @@ struct dsa_hw_desc {
 			uint16_t	dest_app_tag_seed;
 		};
 
+		/* Fill */
+		uint64_t	pattern_upper;
+
 		uint8_t		op_specific[24];
 	};
 } __attribute__((packed));
-- 
cgit v1.2.3


From 12bbc2c2605516e781cd86e3cde9fe1f889b72cc Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 3 Mar 2023 13:34:12 -0800
Subject: dmaengine: idxd: Add descriptor definitions for DIX generate
 operation

The Data Integrity Extension (DIX) generate operation (0x17) computes
the Data Integrity Field (DIF) on the source data and writes only the
computed DIF for each source block to the PI destination address.

Add descriptor definitions for this operation so that user can use
DSA to accelerate DIX generate operation.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20230303213413.3357431-3-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index c43d7df5fc15..4c12e93a6aa6 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -78,6 +78,7 @@ enum dsa_opcode {
 	DSA_OPCODE_DIF_INS,
 	DSA_OPCODE_DIF_STRP,
 	DSA_OPCODE_DIF_UPDT,
+	DSA_OPCODE_DIX_GEN = 0x17,
 	DSA_OPCODE_CFLUSH = 0x20,
 };
 
@@ -248,6 +249,17 @@ struct dsa_hw_desc {
 		/* Fill */
 		uint64_t	pattern_upper;
 
+		/* DIX generate */
+		struct {
+			uint8_t		dix_gen_res;
+			uint8_t		dest_dif_flags;
+			uint8_t		dif_flags;
+			uint8_t		dix_gen_res2[13];
+			uint32_t	ref_tag_seed;
+			uint16_t	app_tag_mask;
+			uint16_t	app_tag_seed;
+		};
+
 		uint8_t		op_specific[24];
 	};
 } __attribute__((packed));
@@ -326,6 +338,14 @@ struct dsa_completion_record {
 			uint16_t	dif_upd_dest_app_tag;
 		};
 
+		/* DIX generate */
+		struct {
+			uint64_t	dix_gen_res;
+			uint32_t	dix_ref_tag;
+			uint16_t	dix_app_tag_mask;
+			uint16_t	dix_app_tag;
+		};
+
 		uint8_t		op_specific[16];
 	};
 } __attribute__((packed));
-- 
cgit v1.2.3


From 6fec8938b7b4fe2b2c503fe87b2783a50bff0415 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 3 Mar 2023 13:34:13 -0800
Subject: dmaengine: idxd: Add descriptor definitions for translation fetch
 operation

The translation fetch operation (0x0A) fetches address translations for the
address range specified in the descriptor by issuing address translation
(ATS) requests to the IOMMU.

Add descriptor definitions for the operation so that user can use DSA
to accelerate translation fetch.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20230303213413.3357431-4-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 4c12e93a6aa6..fc47635b57dc 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -72,6 +72,7 @@ enum dsa_opcode {
 	DSA_OPCODE_CR_DELTA,
 	DSA_OPCODE_AP_DELTA,
 	DSA_OPCODE_DUALCAST,
+	DSA_OPCODE_TRANSL_FETCH,
 	DSA_OPCODE_CRCGEN = 0x10,
 	DSA_OPCODE_COPY_CRC,
 	DSA_OPCODE_DIF_CHECK,
@@ -182,6 +183,7 @@ struct dsa_hw_desc {
 		uint64_t	pattern;
 		uint64_t	desc_list_addr;
 		uint64_t	pattern_lower;
+		uint64_t	transl_fetch_addr;
 	};
 	union {
 		uint64_t	dst_addr;
@@ -192,6 +194,7 @@ struct dsa_hw_desc {
 	union {
 		uint32_t	xfer_size;
 		uint32_t	desc_count;
+		uint32_t	region_size;
 	};
 	uint16_t	int_handle;
 	uint16_t	rsvd1;
@@ -249,6 +252,12 @@ struct dsa_hw_desc {
 		/* Fill */
 		uint64_t	pattern_upper;
 
+		/* Translation fetch */
+		struct {
+			uint64_t	transl_fetch_res;
+			uint32_t	region_stride;
+		};
+
 		/* DIX generate */
 		struct {
 			uint8_t		dix_gen_res;
-- 
cgit v1.2.3


From 27fc5ec673b527dbc2f44787246a39c5ecc01de5 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Wed, 1 Mar 2023 17:32:53 +0100
Subject: clk: Introduce devm_clk_hw_register_gate_parent_data()

Add an API for clock gate that uses parent_data for the parent instead of
a string parent_name.

Reviewed-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Tested-by: Adam Ford <aford173@gmail.com> #imx8mp-beacon-kit
Tested-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Signed-off-by: Marek Vasut <marex@denx.de>
Tested-by: Richard Leitner <richard.leitner@skidata.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
Link: https://lore.kernel.org/r/20230301163257.49005-1-marex@denx.de
---
 include/linux/clk-provider.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 842e72a5348f..92b7c794c627 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -608,6 +608,25 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	__devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
 			       NULL, (flags), (reg), (bit_idx),		      \
 			       (clk_gate_flags), (lock))
+
+/**
+ * devm_clk_hw_register_gate - register a gate clock with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_data: parent clk data
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate_parent_data(dev, name, parent_data, flags,  \
+					      reg, bit_idx, clk_gate_flags,   \
+					      lock)			      \
+	__devm_clk_hw_register_gate((dev), NULL, (name), NULL, NULL,	      \
+				    (parent_data), (flags), (reg), (bit_idx), \
+				    (clk_gate_flags), (lock))
+
 void clk_unregister_gate(struct clk *clk);
 void clk_hw_unregister_gate(struct clk_hw *hw);
 int clk_gate_is_enabled(struct clk_hw *hw);
-- 
cgit v1.2.3


From f806bea3093cb3568e01b4375d5e1d7c8c47c1d4 Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Thu, 23 Mar 2023 17:31:07 +0530
Subject: dmaengine: ti: k3-udma: Workaround errata i2234

Per [1], UDMA TR15 transactions may hang if ICNT0 is less than 64B
Work around is to set EOL flag is to 1 for ICNT0.

Since, there is no performance penalty / side effects of setting EOL
flag event ICNTO > 64B, just set the flag for all UDMAP TR15
descriptors.

[1] https://www.ti.com/lit/er/sprz455a/sprz455a.pdf
Errata doc for J721E DRA829/TDA4VM Processors Silicon Revision 1.1/1.0
(Rev. A)

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
[j-choudhary@ti.com: minor cleanups]
Signed-off-by: Jayesh Choudhary <j-choudhary@ti.com>
Link: https://lore.kernel.org/r/20230323120107.27638-1-j-choudhary@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c     | 20 +++++++++++---------
 include/linux/dma/ti-cppi5.h |  1 +
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index f652a217be76..c07feaff69c0 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -2966,6 +2966,7 @@ udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
 	struct scatterlist *sgent;
 	struct cppi5_tr_type15_t *tr_req = NULL;
 	enum dma_slave_buswidth dev_width;
+	u32 csf = CPPI5_TR_CSF_SUPR_EVT;
 	u16 tr_cnt0, tr_cnt1;
 	dma_addr_t dev_addr;
 	struct udma_desc *d;
@@ -3036,6 +3037,7 @@ udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
 
 	if (uc->ud->match_data->type == DMA_TYPE_UDMA) {
 		asel = 0;
+		csf |= CPPI5_TR_CSF_EOL_ICNT0;
 	} else {
 		asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT;
 		dev_addr |= asel;
@@ -3059,7 +3061,7 @@ udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
 
 		cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15, false,
 			      true, CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
-		cppi5_tr_csf_set(&tr_req[tr_idx].flags, CPPI5_TR_CSF_SUPR_EVT);
+		cppi5_tr_csf_set(&tr_req[tr_idx].flags, csf);
 		cppi5_tr_set_trigger(&tr_req[tr_idx].flags,
 				     uc->config.tr_trigger_type,
 				     CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, 0, 0);
@@ -3105,8 +3107,7 @@ udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
 			cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15,
 				      false, true,
 				      CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
-			cppi5_tr_csf_set(&tr_req[tr_idx].flags,
-					 CPPI5_TR_CSF_SUPR_EVT);
+			cppi5_tr_csf_set(&tr_req[tr_idx].flags, csf);
 			cppi5_tr_set_trigger(&tr_req[tr_idx].flags,
 					     uc->config.tr_trigger_type,
 					     CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC,
@@ -3150,8 +3151,7 @@ udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
 		d->residue += sg_len;
 	}
 
-	cppi5_tr_csf_set(&tr_req[tr_idx - 1].flags,
-			 CPPI5_TR_CSF_SUPR_EVT | CPPI5_TR_CSF_EOP);
+	cppi5_tr_csf_set(&tr_req[tr_idx - 1].flags, csf | CPPI5_TR_CSF_EOP);
 
 	return d;
 }
@@ -3680,6 +3680,7 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	int num_tr;
 	size_t tr_size = sizeof(struct cppi5_tr_type15_t);
 	u16 tr0_cnt0, tr0_cnt1, tr1_cnt0;
+	u32 csf = CPPI5_TR_CSF_SUPR_EVT;
 
 	if (uc->config.dir != DMA_MEM_TO_MEM) {
 		dev_err(chan->device->dev,
@@ -3710,13 +3711,15 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	if (uc->ud->match_data->type != DMA_TYPE_UDMA) {
 		src |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT;
 		dest |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT;
+	} else {
+		csf |= CPPI5_TR_CSF_EOL_ICNT0;
 	}
 
 	tr_req = d->hwdesc[0].tr_req_base;
 
 	cppi5_tr_init(&tr_req[0].flags, CPPI5_TR_TYPE15, false, true,
 		      CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
-	cppi5_tr_csf_set(&tr_req[0].flags, CPPI5_TR_CSF_SUPR_EVT);
+	cppi5_tr_csf_set(&tr_req[0].flags, csf);
 
 	tr_req[0].addr = src;
 	tr_req[0].icnt0 = tr0_cnt0;
@@ -3735,7 +3738,7 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	if (num_tr == 2) {
 		cppi5_tr_init(&tr_req[1].flags, CPPI5_TR_TYPE15, false, true,
 			      CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
-		cppi5_tr_csf_set(&tr_req[1].flags, CPPI5_TR_CSF_SUPR_EVT);
+		cppi5_tr_csf_set(&tr_req[1].flags, csf);
 
 		tr_req[1].addr = src + tr0_cnt1 * tr0_cnt0;
 		tr_req[1].icnt0 = tr1_cnt0;
@@ -3750,8 +3753,7 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 		tr_req[1].dicnt3 = 1;
 	}
 
-	cppi5_tr_csf_set(&tr_req[num_tr - 1].flags,
-			 CPPI5_TR_CSF_SUPR_EVT | CPPI5_TR_CSF_EOP);
+	cppi5_tr_csf_set(&tr_req[num_tr - 1].flags, csf | CPPI5_TR_CSF_EOP);
 
 	if (uc->config.metadata_size)
 		d->vd.tx.metadata_ops = &metadata_ops;
diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h
index efa2f0309f00..c53c0f6e3b1a 100644
--- a/include/linux/dma/ti-cppi5.h
+++ b/include/linux/dma/ti-cppi5.h
@@ -616,6 +616,7 @@ static inline void *cppi5_hdesc_get_swdata(struct cppi5_host_desc_t *desc)
 #define   CPPI5_TR_CSF_SUPR_EVT			BIT(2)
 #define   CPPI5_TR_CSF_EOL_ADV_SHIFT		(4U)
 #define   CPPI5_TR_CSF_EOL_ADV_MASK		GENMASK(6, 4)
+#define   CPPI5_TR_CSF_EOL_ICNT0		BIT(4)
 #define   CPPI5_TR_CSF_EOP			BIT(7)
 
 /**
-- 
cgit v1.2.3


From 6f14c02220c791d5c46b0f965b9340c58f3d503d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 31 Mar 2023 11:33:13 +0200
Subject: driver core: create class_is_registered()

Some classes (i.e. gpio), want to know if they have been registered or
not, and poke around in the class's internal structures to try to figure
this out.  Because this is not really a good idea, provide a function
for classes to call to try to figure this out.

Note, this is racy as the state of the class could change at any moment
in time after the call is made, but as usually a class only wants to
know if it has been registered yet or not, it should be fairly safe to
use, and is just as safe as the previous "poke at the class internals"
check was.

Move the gpiolib code to use this function as proof that it works
properly.

Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-gpio@vger.kernel.org
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230331093318.82288-2-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 25 +++++++++++++++++++++++++
 drivers/gpio/gpiolib-sysfs.c |  4 ++--
 include/linux/device/class.h |  1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 68a6f9b56d19..a8a1bf976290 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -634,6 +634,31 @@ void class_compat_remove_link(struct class_compat *cls, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(class_compat_remove_link);
 
+/**
+ * class_is_registered - determine if at this moment in time, a class is
+ *			 registered in the driver core or not.
+ * @class: the class to check
+ *
+ * Returns a boolean to state if the class is registered in the driver core
+ * or not.  Note that the value could switch right after this call is made,
+ * so only use this in places where you "know" it is safe to do so (usually
+ * to determine if the specific class has been registered yet or not).
+ *
+ * Be careful in using this.
+ */
+bool class_is_registered(const struct class *class)
+{
+	struct subsys_private *sp = class_to_subsys(class);
+	bool is_initialized = false;
+
+	if (sp) {
+		is_initialized = true;
+		subsys_put(sp);
+	}
+	return is_initialized;
+}
+EXPORT_SYMBOL_GPL(class_is_registered);
+
 int __init classes_init(void)
 {
 	class_kset = kset_create_and_add("class", NULL, NULL);
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index a895915affa5..1a9b21731cc9 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -554,7 +554,7 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change)
 	int			offset;
 
 	/* can't export until sysfs is available ... */
-	if (!gpio_class.p) {
+	if (!class_is_registered(&gpio_class)) {
 		pr_debug("%s: called too early!\n", __func__);
 		return -ENOENT;
 	}
@@ -728,7 +728,7 @@ int gpiochip_sysfs_register(struct gpio_device *gdev)
 	 * register later, in gpiolib_sysfs_init() ... here we just
 	 * verify that _some_ field of gpio_class got initialized.
 	 */
-	if (!gpio_class.p)
+	if (!class_is_registered(&gpio_class))
 		return 0;
 
 	/*
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index b53728ca56fb..9cb5db0588c8 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -84,6 +84,7 @@ extern struct kobject *sysfs_dev_block_kobj;
 
 int __must_check class_register(struct class *class);
 void class_unregister(const struct class *class);
+bool class_is_registered(const struct class *class);
 
 struct class_compat;
 struct class_compat *class_compat_register(const char *name);
-- 
cgit v1.2.3


From 2df418cf4b720fe3a0db4b4aab67be43d26af9dd Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 31 Mar 2023 11:33:14 +0200
Subject: driver core: class: remove subsystem private pointer from struct
 class

Now that the last users of the subsystem private pointer in struct class
are gone, the pointer can be removed, as no one is using it.  One step
closer to allowing struct class to be const and moved into read-only
memory.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230331093318.82288-3-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 4 ----
 include/linux/device/class.h | 2 --
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index a8a1bf976290..fcfb295363cc 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -97,8 +97,6 @@ static void class_release(struct kobject *kobj)
 
 	pr_debug("class '%s': release.\n", class->name);
 
-	class->p = NULL;
-
 	if (class->class_release)
 		class->class_release(class);
 	else
@@ -206,7 +204,6 @@ int class_register(struct class *cls)
 	cp->subsys.kobj.kset = class_kset;
 	cp->subsys.kobj.ktype = &class_ktype;
 	cp->class = cls;
-	cls->p = cp;
 
 	error = kset_register(&cp->subsys);
 	if (error)
@@ -222,7 +219,6 @@ int class_register(struct class *cls)
 
 err_out:
 	kfree(cp);
-	cls->p = NULL;
 	return error;
 }
 EXPORT_SYMBOL_GPL(class_register);
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 9cb5db0588c8..f7aad64e256a 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -71,8 +71,6 @@ struct class {
 	void (*get_ownership)(const struct device *dev, kuid_t *uid, kgid_t *gid);
 
 	const struct dev_pm_ops *pm;
-
-	struct subsys_private *p;
 };
 
 struct class_dev_iter {
-- 
cgit v1.2.3


From e78195d52981fc634a4e1e66610a316088bd7458 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 31 Mar 2023 11:33:16 +0200
Subject: driver core: class: remove dev_kobj from struct class

The dev_kobj field in struct class is now only written to, but never
read from, so it can be removed as it is useless.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230331093318.82288-5-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/genhd.c                | 1 -
 drivers/base/class.c         | 4 ----
 include/linux/device/class.h | 2 --
 3 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/block/genhd.c b/block/genhd.c
index e1e1230b1b9f..af7208a37c53 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -899,7 +899,6 @@ static int __init genhd_device_init(void)
 {
 	int error;
 
-	block_class.dev_kobj = sysfs_dev_block_kobj;
 	error = class_register(&block_class);
 	if (unlikely(error))
 		return error;
diff --git a/drivers/base/class.c b/drivers/base/class.c
index fcfb295363cc..06b96d6faa19 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -197,10 +197,6 @@ int class_register(struct class *cls)
 		return error;
 	}
 
-	/* set the default /sys/dev directory for devices of this class */
-	if (!cls->dev_kobj)
-		cls->dev_kobj = sysfs_dev_char_kobj;
-
 	cp->subsys.kobj.kset = class_kset;
 	cp->subsys.kobj.ktype = &class_ktype;
 	cp->class = cls;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index f7aad64e256a..e946642c314e 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -27,7 +27,6 @@ struct fwnode_handle;
  * @name:	Name of the class.
  * @class_groups: Default attributes of this class.
  * @dev_groups:	Default attributes of the devices that belong to the class.
- * @dev_kobj:	The kobject that represents this class and links it into the hierarchy.
  * @dev_uevent:	Called when a device is added, removed from this class, or a
  *		few other things that generate uevents to add the environment
  *		variables.
@@ -55,7 +54,6 @@ struct class {
 
 	const struct attribute_group	**class_groups;
 	const struct attribute_group	**dev_groups;
-	struct kobject			*dev_kobj;
 
 	int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env);
 	char *(*devnode)(const struct device *dev, umode_t *mode);
-- 
cgit v1.2.3


From 575ab414c90a317158db48475a88de42f37e0afa Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 31 Mar 2023 11:33:17 +0200
Subject: driver core: make sysfs_dev_block_kobj static

Nothing outside of drivers/base/core.c uses sysfs_dev_block_kobj, so
make it static and document what it is used for so we remember it the
next time we touch it 15 years from now.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230331093318.82288-6-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c          | 4 +++-
 include/linux/device/class.h | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index dbc2ba6dfffc..cf6f41c2060c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2256,7 +2256,9 @@ int (*platform_notify)(struct device *dev) = NULL;
 int (*platform_notify_remove)(struct device *dev) = NULL;
 static struct kobject *dev_kobj;
 struct kobject *sysfs_dev_char_kobj;
-struct kobject *sysfs_dev_block_kobj;
+
+/* /sys/dev/block */
+static struct kobject *sysfs_dev_block_kobj;
 
 static DEFINE_MUTEX(device_hotplug_lock);
 
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index e946642c314e..7e4a1a6329f4 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -76,8 +76,6 @@ struct class_dev_iter {
 	const struct device_type	*type;
 };
 
-extern struct kobject *sysfs_dev_block_kobj;
-
 int __must_check class_register(struct class *class);
 void class_unregister(const struct class *class);
 bool class_is_registered(const struct class *class);
-- 
cgit v1.2.3


From 54b47585db6658a5eb898d4d45be18d1e581c1bf Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Mon, 27 Mar 2023 02:33:47 -0700
Subject: iommufd: Create access in vfio_iommufd_emulated_bind()

There are needs to created iommufd_access prior to have an IOAS and set
IOAS later. Like the vfio device cdev needs to have an iommufd object
to represent the bond (iommufd_access) and IOAS replacement.

Moves the iommufd_access_create() call into vfio_iommufd_emulated_bind(),
making it symmetric with the __vfio_iommufd_access_destroy() call in the
vfio_iommufd_emulated_unbind(). This means an access is created/destroyed
by the bind()/unbind(), and the vfio_iommufd_emulated_attach_ioas() only
updates the access->ioas pointer.

Since vfio_iommufd_emulated_bind() does not provide ioas_id, drop it from
the argument list of iommufd_access_create(). Instead, add a new access
API iommufd_access_attach() to set the access->ioas pointer. Also, set
vdev->iommufd_attached accordingly, similar to the physical pathway.

Link: https://lore.kernel.org/r/20230327093351.44505-3-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c   | 52 +++++++++++++++++++++++-----------------
 drivers/iommu/iommufd/selftest.c |  5 +++-
 drivers/vfio/iommufd.c           | 24 +++++++++++++------
 include/linux/iommufd.h          |  3 ++-
 4 files changed, 53 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index a0c66f47a65a..ff6fca0ea269 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -412,15 +412,17 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
 	struct iommufd_access *access =
 		container_of(obj, struct iommufd_access, obj);
 
-	iopt_remove_access(&access->ioas->iopt, access);
+	if (access->ioas) {
+		iopt_remove_access(&access->ioas->iopt, access);
+		refcount_dec(&access->ioas->obj.users);
+		access->ioas = NULL;
+	}
 	iommufd_ctx_put(access->ictx);
-	refcount_dec(&access->ioas->obj.users);
 }
 
 /**
  * iommufd_access_create - Create an iommufd_access
  * @ictx: iommufd file descriptor
- * @ioas_id: ID for a IOMMUFD_OBJ_IOAS
  * @ops: Driver's ops to associate with the access
  * @data: Opaque data to pass into ops functions
  *
@@ -431,12 +433,10 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
  * The provided ops are required to use iommufd_access_pin_pages().
  */
 struct iommufd_access *
-iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
+iommufd_access_create(struct iommufd_ctx *ictx,
 		      const struct iommufd_access_ops *ops, void *data)
 {
 	struct iommufd_access *access;
-	struct iommufd_object *obj;
-	int rc;
 
 	/*
 	 * There is no uAPI for the access object, but to keep things symmetric
@@ -449,21 +449,10 @@ iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
 	access->data = data;
 	access->ops = ops;
 
-	obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS);
-	if (IS_ERR(obj)) {
-		rc = PTR_ERR(obj);
-		goto out_abort;
-	}
-	access->ioas = container_of(obj, struct iommufd_ioas, obj);
-	iommufd_ref_to_users(obj);
-
 	if (ops->needs_pin_pages)
 		access->iova_alignment = PAGE_SIZE;
 	else
 		access->iova_alignment = 1;
-	rc = iopt_add_access(&access->ioas->iopt, access);
-	if (rc)
-		goto out_put_ioas;
 
 	/* The calling driver is a user until iommufd_access_destroy() */
 	refcount_inc(&access->obj.users);
@@ -471,11 +460,6 @@ iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
 	iommufd_ctx_get(ictx);
 	iommufd_object_finalize(ictx, &access->obj);
 	return access;
-out_put_ioas:
-	refcount_dec(&access->ioas->obj.users);
-out_abort:
-	iommufd_object_abort(ictx, &access->obj);
-	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
 
@@ -494,6 +478,30 @@ void iommufd_access_destroy(struct iommufd_access *access)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);
 
+int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
+{
+	struct iommufd_ioas *new_ioas;
+	int rc = 0;
+
+	if (access->ioas)
+		return -EINVAL;
+
+	new_ioas = iommufd_get_ioas(access->ictx, ioas_id);
+	if (IS_ERR(new_ioas))
+		return PTR_ERR(new_ioas);
+
+	rc = iopt_add_access(&new_ioas->iopt, access);
+	if (rc) {
+		iommufd_put_object(&new_ioas->obj);
+		return rc;
+	}
+	iommufd_ref_to_users(&new_ioas->obj);
+
+	access->ioas = new_ioas;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD);
+
 /**
  * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
  * @iopt: iopt to work on
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 8667eb222cf1..1e89e1a8c5f0 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -571,7 +571,7 @@ static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
 	}
 
 	access = iommufd_access_create(
-		ucmd->ictx, ioas_id,
+		ucmd->ictx,
 		(flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ?
 			&selftest_access_ops_pin :
 			&selftest_access_ops,
@@ -580,6 +580,9 @@ static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
 		rc = PTR_ERR(access);
 		goto out_put_fdno;
 	}
+	rc = iommufd_access_attach(access, ioas_id);
+	if (rc)
+		goto out_destroy;
 	cmd->create_access.out_access_fd = fdno;
 	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 	if (rc)
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index db4efbd56042..0695a06db30d 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -138,10 +138,18 @@ static const struct iommufd_access_ops vfio_user_ops = {
 int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 			       struct iommufd_ctx *ictx, u32 *out_device_id)
 {
+	struct iommufd_access *user;
+
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	vdev->iommufd_ictx = ictx;
 	iommufd_ctx_get(ictx);
+	user = iommufd_access_create(ictx, &vfio_user_ops, vdev);
+	if (IS_ERR(user)) {
+		iommufd_ctx_put(ictx);
+		return PTR_ERR(user);
+	}
+	vdev->iommufd_access = user;
+	vdev->iommufd_ictx = ictx;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind);
@@ -152,6 +160,7 @@ void vfio_iommufd_emulated_unbind(struct vfio_device *vdev)
 
 	if (vdev->iommufd_access) {
 		iommufd_access_destroy(vdev->iommufd_access);
+		vdev->iommufd_attached = false;
 		vdev->iommufd_access = NULL;
 	}
 	iommufd_ctx_put(vdev->iommufd_ictx);
@@ -161,15 +170,16 @@ EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind);
 
 int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
 {
-	struct iommufd_access *user;
+	int rc;
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops,
-				     vdev);
-	if (IS_ERR(user))
-		return PTR_ERR(user);
-	vdev->iommufd_access = user;
+	if (vdev->iommufd_attached)
+		return -EBUSY;
+	rc = iommufd_access_attach(vdev->iommufd_access, *pt_id);
+	if (rc)
+		return rc;
+	vdev->iommufd_attached = true;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index c0b5b3ac34f1..155d3630aedc 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -40,9 +40,10 @@ enum {
 };
 
 struct iommufd_access *
-iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id,
+iommufd_access_create(struct iommufd_ctx *ictx,
 		      const struct iommufd_access_ops *ops, void *data);
 void iommufd_access_destroy(struct iommufd_access *access);
+int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id);
 
 void iommufd_ctx_get(struct iommufd_ctx *ictx);
 
-- 
cgit v1.2.3


From 4508a533fce4db0004c1e8dd047c1e77046fc9c5 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Mon, 27 Mar 2023 02:33:48 -0700
Subject: vfio-iommufd: No need to record iommufd_ctx in vfio_device

iommufd_ctx is stored in vfio_device for emulated devices per bind_iommufd.
However, as iommufd_access is created in bind, no more need to stored it
since iommufd_access implicitly stores it.

Link: https://lore.kernel.org/r/20230327093351.44505-4-yi.l.liu@intel.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/iommufd.c | 8 +-------
 include/linux/vfio.h   | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 0695a06db30d..78e2486586d7 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -142,14 +142,10 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	iommufd_ctx_get(ictx);
 	user = iommufd_access_create(ictx, &vfio_user_ops, vdev);
-	if (IS_ERR(user)) {
-		iommufd_ctx_put(ictx);
+	if (IS_ERR(user))
 		return PTR_ERR(user);
-	}
 	vdev->iommufd_access = user;
-	vdev->iommufd_ictx = ictx;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind);
@@ -163,8 +159,6 @@ void vfio_iommufd_emulated_unbind(struct vfio_device *vdev)
 		vdev->iommufd_attached = false;
 		vdev->iommufd_access = NULL;
 	}
-	iommufd_ctx_put(vdev->iommufd_ictx);
-	vdev->iommufd_ictx = NULL;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind);
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 93134b023968..3188d8a374bd 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -60,7 +60,6 @@ struct vfio_device {
 	void (*put_kvm)(struct kvm *kvm);
 #if IS_ENABLED(CONFIG_IOMMUFD)
 	struct iommufd_device *iommufd_device;
-	struct iommufd_ctx *iommufd_ictx;
 	bool iommufd_attached;
 #endif
 };
-- 
cgit v1.2.3


From 632fda7f912c845f98ff20e69c600160a189d803 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Mon, 27 Mar 2023 02:33:49 -0700
Subject: vfio-iommufd: Make vfio_iommufd_emulated_bind() return iommufd_access
 ID

vfio device cdev needs to return iommufd_access ID to userspace if
bind_iommufd succeeds.

Link: https://lore.kernel.org/r/20230327093351.44505-5-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c   | 4 +++-
 drivers/iommu/iommufd/selftest.c | 3 ++-
 drivers/vfio/iommufd.c           | 2 +-
 include/linux/iommufd.h          | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index ff6fca0ea269..4b565a5b51da 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -425,6 +425,7 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
  * @ictx: iommufd file descriptor
  * @ops: Driver's ops to associate with the access
  * @data: Opaque data to pass into ops functions
+ * @id: Output ID number to return to userspace for this access
  *
  * An iommufd_access allows a driver to read/write to the IOAS without using
  * DMA. The underlying CPU memory can be accessed using the
@@ -434,7 +435,7 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
  */
 struct iommufd_access *
 iommufd_access_create(struct iommufd_ctx *ictx,
-		      const struct iommufd_access_ops *ops, void *data)
+		      const struct iommufd_access_ops *ops, void *data, u32 *id)
 {
 	struct iommufd_access *access;
 
@@ -459,6 +460,7 @@ iommufd_access_create(struct iommufd_ctx *ictx,
 	access->ictx = ictx;
 	iommufd_ctx_get(ictx);
 	iommufd_object_finalize(ictx, &access->obj);
+	*id = access->obj.id;
 	return access;
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 1e89e1a8c5f0..e6e04dceffe3 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -554,6 +554,7 @@ static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
 	struct iommu_test_cmd *cmd = ucmd->cmd;
 	struct selftest_access *staccess;
 	struct iommufd_access *access;
+	u32 id;
 	int fdno;
 	int rc;
 
@@ -575,7 +576,7 @@ static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
 		(flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ?
 			&selftest_access_ops_pin :
 			&selftest_access_ops,
-		staccess);
+		staccess, &id);
 	if (IS_ERR(access)) {
 		rc = PTR_ERR(access);
 		goto out_put_fdno;
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 78e2486586d7..1ee558c0be25 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -142,7 +142,7 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	user = iommufd_access_create(ictx, &vfio_user_ops, vdev);
+	user = iommufd_access_create(ictx, &vfio_user_ops, vdev, out_device_id);
 	if (IS_ERR(user))
 		return PTR_ERR(user);
 	vdev->iommufd_access = user;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 155d3630aedc..1129a36a74c4 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -41,7 +41,7 @@ enum {
 
 struct iommufd_access *
 iommufd_access_create(struct iommufd_ctx *ictx,
-		      const struct iommufd_access_ops *ops, void *data);
+		      const struct iommufd_access_ops *ops, void *data, u32 *id);
 void iommufd_access_destroy(struct iommufd_access *access);
 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id);
 
-- 
cgit v1.2.3


From 1086a5310f9c9421398cd12c00f605866aad24a5 Mon Sep 17 00:00:00 2001
From: "Garmin.Chang" <Garmin.Chang@mediatek.com>
Date: Fri, 31 Mar 2023 20:36:03 +0800
Subject: dt-bindings: clock: mediatek: Add new MT8188 clock

Add the new binding documentation for system clock
and functional clock on MediaTek MT8188.

Signed-off-by: Garmin.Chang <Garmin.Chang@mediatek.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230331123621.16167-2-Garmin.Chang@mediatek.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/clock/mediatek,mt8188-clock.yaml      |  71 ++
 .../bindings/clock/mediatek,mt8188-sys-clock.yaml  |  55 ++
 include/dt-bindings/clock/mediatek,mt8188-clk.h    | 726 +++++++++++++++++++++
 3 files changed, 852 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml
 create mode 100644 Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml
 create mode 100644 include/dt-bindings/clock/mediatek,mt8188-clk.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml
new file mode 100644
index 000000000000..d7214d97b2ba
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8188-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT8188
+
+maintainers:
+  - Garmin Chang <garmin.chang@mediatek.com>
+
+description: |
+  The clock architecture in MediaTek like below
+  PLLs -->
+          dividers -->
+                      muxes
+                           -->
+                              clock gate
+
+  The devices provide clock gate control in different IP blocks.
+
+properties:
+  compatible:
+    enum:
+      - mediatek,mt8188-adsp-audio26m
+      - mediatek,mt8188-camsys
+      - mediatek,mt8188-camsys-rawa
+      - mediatek,mt8188-camsys-rawb
+      - mediatek,mt8188-camsys-yuva
+      - mediatek,mt8188-camsys-yuvb
+      - mediatek,mt8188-ccusys
+      - mediatek,mt8188-imgsys
+      - mediatek,mt8188-imgsys-wpe1
+      - mediatek,mt8188-imgsys-wpe2
+      - mediatek,mt8188-imgsys-wpe3
+      - mediatek,mt8188-imgsys1-dip-nr
+      - mediatek,mt8188-imgsys1-dip-top
+      - mediatek,mt8188-imp-iic-wrap-c
+      - mediatek,mt8188-imp-iic-wrap-en
+      - mediatek,mt8188-imp-iic-wrap-w
+      - mediatek,mt8188-ipesys
+      - mediatek,mt8188-mfgcfg
+      - mediatek,mt8188-vdecsys
+      - mediatek,mt8188-vdecsys-soc
+      - mediatek,mt8188-vencsys
+      - mediatek,mt8188-vppsys0
+      - mediatek,mt8188-vppsys1
+      - mediatek,mt8188-wpesys
+      - mediatek,mt8188-wpesys-vpp0
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    clock-controller@11283000 {
+        compatible = "mediatek,mt8188-imp-iic-wrap-c";
+        reg = <0x11283000 0x1000>;
+        #clock-cells = <1>;
+    };
+
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml
new file mode 100644
index 000000000000..4cf8d3af9803
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8188-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT8188
+
+maintainers:
+  - Garmin Chang <garmin.chang@mediatek.com>
+
+description: |
+  The clock architecture in MediaTek like below
+  PLLs -->
+          dividers -->
+                      muxes
+                           -->
+                              clock gate
+
+  The apmixedsys provides most of PLLs which generated from SoC 26m.
+  The topckgen provides dividers and muxes which provide the clock source to other IP blocks.
+  The infracfg_ao provides clock gate in peripheral and infrastructure IP blocks.
+  The mcusys provides mux control to select the clock source in AP MCU.
+  The device nodes also provide the system control capacity for configuration.
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - mediatek,mt8188-apmixedsys
+          - mediatek,mt8188-infracfg-ao
+          - mediatek,mt8188-pericfg-ao
+          - mediatek,mt8188-topckgen
+      - const: syscon
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    clock-controller@10000000 {
+        compatible = "mediatek,mt8188-topckgen", "syscon";
+        reg = <0x10000000 0x1000>;
+        #clock-cells = <1>;
+    };
diff --git a/include/dt-bindings/clock/mediatek,mt8188-clk.h b/include/dt-bindings/clock/mediatek,mt8188-clk.h
new file mode 100644
index 000000000000..bd5cd100b796
--- /dev/null
+++ b/include/dt-bindings/clock/mediatek,mt8188-clk.h
@@ -0,0 +1,726 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ * Author: Garmin Chang <garmin.chang@mediatek.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_MT8188_H
+#define _DT_BINDINGS_CLK_MT8188_H
+
+/* TOPCKGEN */
+#define CLK_TOP_AXI				0
+#define CLK_TOP_SPM				1
+#define CLK_TOP_SCP				2
+#define CLK_TOP_BUS_AXIMEM			3
+#define CLK_TOP_VPP				4
+#define CLK_TOP_ETHDR				5
+#define CLK_TOP_IPE				6
+#define CLK_TOP_CAM				7
+#define CLK_TOP_CCU				8
+#define CLK_TOP_CCU_AHB				9
+#define CLK_TOP_IMG				10
+#define CLK_TOP_CAMTM				11
+#define CLK_TOP_DSP				12
+#define CLK_TOP_DSP1				13
+#define CLK_TOP_DSP2				14
+#define CLK_TOP_DSP3				15
+#define CLK_TOP_DSP4				16
+#define CLK_TOP_DSP5				17
+#define CLK_TOP_DSP6				18
+#define CLK_TOP_DSP7				19
+#define CLK_TOP_MFG_CORE_TMP			20
+#define CLK_TOP_CAMTG				21
+#define CLK_TOP_CAMTG2				22
+#define CLK_TOP_CAMTG3				23
+#define CLK_TOP_UART				24
+#define CLK_TOP_SPI				25
+#define CLK_TOP_MSDC50_0_HCLK			26
+#define CLK_TOP_MSDC50_0			27
+#define CLK_TOP_MSDC30_1			28
+#define CLK_TOP_MSDC30_2			29
+#define CLK_TOP_INTDIR				30
+#define CLK_TOP_AUD_INTBUS			31
+#define CLK_TOP_AUDIO_H				32
+#define CLK_TOP_PWRAP_ULPOSC			33
+#define CLK_TOP_ATB				34
+#define CLK_TOP_SSPM				35
+#define CLK_TOP_DP				36
+#define CLK_TOP_EDP				37
+#define CLK_TOP_DPI				38
+#define CLK_TOP_DISP_PWM0			39
+#define CLK_TOP_DISP_PWM1			40
+#define CLK_TOP_USB_TOP				41
+#define CLK_TOP_SSUSB_XHCI			42
+#define CLK_TOP_USB_TOP_2P			43
+#define CLK_TOP_SSUSB_XHCI_2P			44
+#define CLK_TOP_USB_TOP_3P			45
+#define CLK_TOP_SSUSB_XHCI_3P			46
+#define CLK_TOP_I2C				47
+#define CLK_TOP_SENINF				48
+#define CLK_TOP_SENINF1				49
+#define CLK_TOP_GCPU				50
+#define CLK_TOP_VENC				51
+#define CLK_TOP_VDEC				52
+#define CLK_TOP_PWM				53
+#define CLK_TOP_MCUPM				54
+#define CLK_TOP_SPMI_P_MST			55
+#define CLK_TOP_SPMI_M_MST			56
+#define CLK_TOP_DVFSRC				57
+#define CLK_TOP_TL				58
+#define CLK_TOP_AES_MSDCFDE			59
+#define CLK_TOP_DSI_OCC				60
+#define CLK_TOP_WPE_VPP				61
+#define CLK_TOP_HDCP				62
+#define CLK_TOP_HDCP_24M			63
+#define CLK_TOP_HDMI_APB			64
+#define CLK_TOP_SNPS_ETH_250M			65
+#define CLK_TOP_SNPS_ETH_62P4M_PTP		66
+#define CLK_TOP_SNPS_ETH_50M_RMII		67
+#define CLK_TOP_ADSP				68
+#define CLK_TOP_AUDIO_LOCAL_BUS			69
+#define CLK_TOP_ASM_H				70
+#define CLK_TOP_ASM_L				71
+#define CLK_TOP_APLL1				72
+#define CLK_TOP_APLL2				73
+#define CLK_TOP_APLL3				74
+#define CLK_TOP_APLL4				75
+#define CLK_TOP_APLL5				76
+#define CLK_TOP_I2SO1				77
+#define CLK_TOP_I2SO2				78
+#define CLK_TOP_I2SI1				79
+#define CLK_TOP_I2SI2				80
+#define CLK_TOP_DPTX				81
+#define CLK_TOP_AUD_IEC				82
+#define CLK_TOP_A1SYS_HP			83
+#define CLK_TOP_A2SYS				84
+#define CLK_TOP_A3SYS				85
+#define CLK_TOP_A4SYS				86
+#define CLK_TOP_ECC				87
+#define CLK_TOP_SPINOR				88
+#define CLK_TOP_ULPOSC				89
+#define CLK_TOP_SRCK				90
+#define CLK_TOP_MFG_CK_FAST_REF			91
+#define CLK_TOP_MAINPLL_D3			92
+#define CLK_TOP_MAINPLL_D4			93
+#define CLK_TOP_MAINPLL_D4_D2			94
+#define CLK_TOP_MAINPLL_D4_D4			95
+#define CLK_TOP_MAINPLL_D4_D8			96
+#define CLK_TOP_MAINPLL_D5			97
+#define CLK_TOP_MAINPLL_D5_D2			98
+#define CLK_TOP_MAINPLL_D5_D4			99
+#define CLK_TOP_MAINPLL_D5_D8			100
+#define CLK_TOP_MAINPLL_D6			101
+#define CLK_TOP_MAINPLL_D6_D2			102
+#define CLK_TOP_MAINPLL_D6_D4			103
+#define CLK_TOP_MAINPLL_D6_D8			104
+#define CLK_TOP_MAINPLL_D7			105
+#define CLK_TOP_MAINPLL_D7_D2			106
+#define CLK_TOP_MAINPLL_D7_D4			107
+#define CLK_TOP_MAINPLL_D7_D8			108
+#define CLK_TOP_MAINPLL_D9			109
+#define CLK_TOP_UNIVPLL_D2			110
+#define CLK_TOP_UNIVPLL_D3			111
+#define CLK_TOP_UNIVPLL_D4			112
+#define CLK_TOP_UNIVPLL_D4_D2			113
+#define CLK_TOP_UNIVPLL_D4_D4			114
+#define CLK_TOP_UNIVPLL_D4_D8			115
+#define CLK_TOP_UNIVPLL_D5			116
+#define CLK_TOP_UNIVPLL_D5_D2			117
+#define CLK_TOP_UNIVPLL_D5_D4			118
+#define CLK_TOP_UNIVPLL_D5_D8			119
+#define CLK_TOP_UNIVPLL_D6			120
+#define CLK_TOP_UNIVPLL_D6_D2			121
+#define CLK_TOP_UNIVPLL_D6_D4			122
+#define CLK_TOP_UNIVPLL_D6_D8			123
+#define CLK_TOP_UNIVPLL_D7			124
+#define CLK_TOP_UNIVPLL_192M			125
+#define CLK_TOP_UNIVPLL_192M_D4			126
+#define CLK_TOP_UNIVPLL_192M_D8			127
+#define CLK_TOP_UNIVPLL_192M_D10		128
+#define CLK_TOP_UNIVPLL_192M_D16		129
+#define CLK_TOP_UNIVPLL_192M_D32		130
+#define CLK_TOP_APLL1_D3			131
+#define CLK_TOP_APLL1_D4			132
+#define CLK_TOP_APLL2_D3			133
+#define CLK_TOP_APLL2_D4			134
+#define CLK_TOP_APLL3_D4			135
+#define CLK_TOP_APLL4_D4			136
+#define CLK_TOP_APLL5_D4			137
+#define CLK_TOP_MMPLL_D4			138
+#define CLK_TOP_MMPLL_D4_D2			139
+#define CLK_TOP_MMPLL_D5			140
+#define CLK_TOP_MMPLL_D5_D2			141
+#define CLK_TOP_MMPLL_D5_D4			142
+#define CLK_TOP_MMPLL_D6			143
+#define CLK_TOP_MMPLL_D6_D2			144
+#define CLK_TOP_MMPLL_D7			145
+#define CLK_TOP_MMPLL_D9			146
+#define CLK_TOP_TVDPLL1				147
+#define CLK_TOP_TVDPLL1_D2			148
+#define CLK_TOP_TVDPLL1_D4			149
+#define CLK_TOP_TVDPLL1_D8			150
+#define CLK_TOP_TVDPLL1_D16			151
+#define CLK_TOP_TVDPLL2				152
+#define CLK_TOP_TVDPLL2_D2			153
+#define CLK_TOP_TVDPLL2_D4			154
+#define CLK_TOP_TVDPLL2_D8			155
+#define CLK_TOP_TVDPLL2_D16			156
+#define CLK_TOP_MSDCPLL_D2			157
+#define CLK_TOP_MSDCPLL_D16			158
+#define CLK_TOP_ETHPLL				159
+#define CLK_TOP_ETHPLL_D2			160
+#define CLK_TOP_ETHPLL_D4			161
+#define CLK_TOP_ETHPLL_D8			162
+#define CLK_TOP_ETHPLL_D10			163
+#define CLK_TOP_ADSPPLL_D2			164
+#define CLK_TOP_ADSPPLL_D4			165
+#define CLK_TOP_ADSPPLL_D8			166
+#define CLK_TOP_ULPOSC1				167
+#define CLK_TOP_ULPOSC1_D2			168
+#define CLK_TOP_ULPOSC1_D4			169
+#define CLK_TOP_ULPOSC1_D8			170
+#define CLK_TOP_ULPOSC1_D7			171
+#define CLK_TOP_ULPOSC1_D10			172
+#define CLK_TOP_ULPOSC1_D16			173
+#define CLK_TOP_MPHONE_SLAVE_BCK		174
+#define CLK_TOP_PAD_FPC				175
+#define CLK_TOP_466M_FMEM			176
+#define CLK_TOP_PEXTP_PIPE			177
+#define CLK_TOP_DSI_PHY				178
+#define CLK_TOP_APLL12_CK_DIV0			179
+#define CLK_TOP_APLL12_CK_DIV1			180
+#define CLK_TOP_APLL12_CK_DIV2			181
+#define CLK_TOP_APLL12_CK_DIV3			182
+#define CLK_TOP_APLL12_CK_DIV4			183
+#define CLK_TOP_APLL12_CK_DIV9			184
+#define CLK_TOP_CFGREG_CLOCK_EN_VPP0		185
+#define CLK_TOP_CFGREG_CLOCK_EN_VPP1		186
+#define CLK_TOP_CFGREG_CLOCK_EN_VDO0		187
+#define CLK_TOP_CFGREG_CLOCK_EN_VDO1		188
+#define CLK_TOP_CFGREG_CLOCK_ISP_AXI_GALS	189
+#define CLK_TOP_CFGREG_F26M_VPP0		190
+#define CLK_TOP_CFGREG_F26M_VPP1		191
+#define CLK_TOP_CFGREG_F26M_VDO0		192
+#define CLK_TOP_CFGREG_F26M_VDO1		193
+#define CLK_TOP_CFGREG_AUD_F26M_AUD		194
+#define CLK_TOP_CFGREG_UNIPLL_SES		195
+#define CLK_TOP_CFGREG_F_PCIE_PHY_REF		196
+#define CLK_TOP_SSUSB_TOP_REF			197
+#define CLK_TOP_SSUSB_PHY_REF			198
+#define CLK_TOP_SSUSB_TOP_P1_REF		199
+#define CLK_TOP_SSUSB_PHY_P1_REF		200
+#define CLK_TOP_SSUSB_TOP_P2_REF		201
+#define CLK_TOP_SSUSB_PHY_P2_REF		202
+#define CLK_TOP_SSUSB_TOP_P3_REF		203
+#define CLK_TOP_SSUSB_PHY_P3_REF		204
+#define CLK_TOP_NR_CLK				205
+
+/* INFRACFG_AO */
+#define CLK_INFRA_AO_PMIC_TMR			0
+#define CLK_INFRA_AO_PMIC_AP			1
+#define CLK_INFRA_AO_PMIC_MD			2
+#define CLK_INFRA_AO_PMIC_CONN			3
+#define CLK_INFRA_AO_SEJ			4
+#define CLK_INFRA_AO_APXGPT			5
+#define CLK_INFRA_AO_GCE			6
+#define CLK_INFRA_AO_GCE2			7
+#define CLK_INFRA_AO_THERM			8
+#define CLK_INFRA_AO_PWM_HCLK			9
+#define CLK_INFRA_AO_PWM1			10
+#define CLK_INFRA_AO_PWM2			11
+#define CLK_INFRA_AO_PWM3			12
+#define CLK_INFRA_AO_PWM4			13
+#define CLK_INFRA_AO_PWM			14
+#define CLK_INFRA_AO_UART0			15
+#define CLK_INFRA_AO_UART1			16
+#define CLK_INFRA_AO_UART2			17
+#define CLK_INFRA_AO_UART3			18
+#define CLK_INFRA_AO_UART4			19
+#define CLK_INFRA_AO_GCE_26M			20
+#define CLK_INFRA_AO_CQ_DMA_FPC			21
+#define CLK_INFRA_AO_UART5			22
+#define CLK_INFRA_AO_HDMI_26M			23
+#define CLK_INFRA_AO_SPI0			24
+#define CLK_INFRA_AO_MSDC0			25
+#define CLK_INFRA_AO_MSDC1			26
+#define CLK_INFRA_AO_MSDC2			27
+#define CLK_INFRA_AO_MSDC0_SRC			28
+#define CLK_INFRA_AO_DVFSRC			29
+#define CLK_INFRA_AO_TRNG			30
+#define CLK_INFRA_AO_AUXADC			31
+#define CLK_INFRA_AO_CPUM			32
+#define CLK_INFRA_AO_HDMI_32K			33
+#define CLK_INFRA_AO_CEC_66M_HCLK		34
+#define CLK_INFRA_AO_PCIE_TL_26M		35
+#define CLK_INFRA_AO_MSDC1_SRC			36
+#define CLK_INFRA_AO_CEC_66M_BCLK		37
+#define CLK_INFRA_AO_PCIE_TL_96M		38
+#define CLK_INFRA_AO_DEVICE_APC			39
+#define CLK_INFRA_AO_ECC_66M_HCLK		40
+#define CLK_INFRA_AO_DEBUGSYS			41
+#define CLK_INFRA_AO_AUDIO			42
+#define CLK_INFRA_AO_PCIE_TL_32K		43
+#define CLK_INFRA_AO_DBG_TRACE			44
+#define CLK_INFRA_AO_DRAMC_F26M			45
+#define CLK_INFRA_AO_IRTX			46
+#define CLK_INFRA_AO_DISP_PWM			47
+#define CLK_INFRA_AO_CLDMA_BCLK			48
+#define CLK_INFRA_AO_AUDIO_26M_BCLK		49
+#define CLK_INFRA_AO_SPI1			50
+#define CLK_INFRA_AO_SPI2			51
+#define CLK_INFRA_AO_SPI3			52
+#define CLK_INFRA_AO_FSSPM			53
+#define CLK_INFRA_AO_SSPM_BUS_HCLK		54
+#define CLK_INFRA_AO_APDMA_BCLK			55
+#define CLK_INFRA_AO_SPI4			56
+#define CLK_INFRA_AO_SPI5			57
+#define CLK_INFRA_AO_CQ_DMA			58
+#define CLK_INFRA_AO_MSDC0_SELF			59
+#define CLK_INFRA_AO_MSDC1_SELF			60
+#define CLK_INFRA_AO_MSDC2_SELF			61
+#define CLK_INFRA_AO_I2S_DMA			62
+#define CLK_INFRA_AO_AP_MSDC0			63
+#define CLK_INFRA_AO_MD_MSDC0			64
+#define CLK_INFRA_AO_MSDC30_2			65
+#define CLK_INFRA_AO_GCPU			66
+#define CLK_INFRA_AO_PCIE_PERI_26M		67
+#define CLK_INFRA_AO_GCPU_66M_BCLK		68
+#define CLK_INFRA_AO_GCPU_133M_BCLK		69
+#define CLK_INFRA_AO_DISP_PWM1			70
+#define CLK_INFRA_AO_FBIST2FPC			71
+#define CLK_INFRA_AO_DEVICE_APC_SYNC		72
+#define CLK_INFRA_AO_PCIE_P1_PERI_26M		73
+#define CLK_INFRA_AO_133M_MCLK_CK		74
+#define CLK_INFRA_AO_66M_MCLK_CK		75
+#define CLK_INFRA_AO_PCIE_PL_P_250M_P0		76
+#define CLK_INFRA_AO_RG_AES_MSDCFDE_CK_0P	77
+#define CLK_INFRA_AO_NR_CLK			78
+
+/* APMIXEDSYS */
+#define CLK_APMIXED_ETHPLL			0
+#define CLK_APMIXED_MSDCPLL			1
+#define CLK_APMIXED_TVDPLL1			2
+#define CLK_APMIXED_TVDPLL2			3
+#define CLK_APMIXED_MMPLL			4
+#define CLK_APMIXED_MAINPLL			5
+#define CLK_APMIXED_IMGPLL			6
+#define CLK_APMIXED_UNIVPLL			7
+#define CLK_APMIXED_ADSPPLL			8
+#define CLK_APMIXED_APLL1			9
+#define CLK_APMIXED_APLL2			10
+#define CLK_APMIXED_APLL3			11
+#define CLK_APMIXED_APLL4			12
+#define CLK_APMIXED_APLL5			13
+#define CLK_APMIXED_MFGPLL			14
+#define CLK_APMIXED_PLL_SSUSB26M_EN		15
+#define CLK_APMIXED_NR_CLK			16
+
+/* AUDIODSP */
+#define CLK_AUDIODSP_AUDIO26M			0
+#define CLK_AUDIODSP_NR_CLK			1
+
+/* PERICFG_AO */
+#define CLK_PERI_AO_ETHERNET			0
+#define CLK_PERI_AO_ETHERNET_BUS		1
+#define CLK_PERI_AO_FLASHIF_BUS			2
+#define CLK_PERI_AO_FLASHIF_26M			3
+#define CLK_PERI_AO_FLASHIFLASHCK		4
+#define CLK_PERI_AO_SSUSB_2P_BUS		5
+#define CLK_PERI_AO_SSUSB_2P_XHCI		6
+#define CLK_PERI_AO_SSUSB_3P_BUS		7
+#define CLK_PERI_AO_SSUSB_3P_XHCI		8
+#define CLK_PERI_AO_SSUSB_BUS			9
+#define CLK_PERI_AO_SSUSB_XHCI			10
+#define CLK_PERI_AO_ETHERNET_MAC		11
+#define CLK_PERI_AO_PCIE_P0_FMEM		12
+#define CLK_PERI_AO_NR_CLK			13
+
+/* IMP_IIC_WRAP_C */
+#define CLK_IMP_IIC_WRAP_C_AP_CLOCK_I2C0	0
+#define CLK_IMP_IIC_WRAP_C_AP_CLOCK_I2C2	1
+#define CLK_IMP_IIC_WRAP_C_AP_CLOCK_I2C3	2
+#define CLK_IMP_IIC_WRAP_C_NR_CLK		3
+
+/* IMP_IIC_WRAP_W */
+#define CLK_IMP_IIC_WRAP_W_AP_CLOCK_I2C1	0
+#define CLK_IMP_IIC_WRAP_W_AP_CLOCK_I2C4	1
+#define CLK_IMP_IIC_WRAP_W_NR_CLK		2
+
+/* IMP_IIC_WRAP_EN */
+#define CLK_IMP_IIC_WRAP_EN_AP_CLOCK_I2C5	0
+#define CLK_IMP_IIC_WRAP_EN_AP_CLOCK_I2C6	1
+#define CLK_IMP_IIC_WRAP_EN_NR_CLK		2
+
+/* MFGCFG */
+#define CLK_MFGCFG_BG3D				0
+#define CLK_MFGCFG_NR_CLK			1
+
+/* VPPSYS0 */
+#define CLK_VPP0_MDP_FG				0
+#define CLK_VPP0_STITCH				1
+#define CLK_VPP0_PADDING			2
+#define CLK_VPP0_MDP_TCC			3
+#define CLK_VPP0_WARP0_ASYNC_TX			4
+#define CLK_VPP0_WARP1_ASYNC_TX			5
+#define CLK_VPP0_MUTEX				6
+#define CLK_VPP02VPP1_RELAY			7
+#define CLK_VPP0_VPP12VPP0_ASYNC		8
+#define CLK_VPP0_MMSYSRAM_TOP			9
+#define CLK_VPP0_MDP_AAL			10
+#define CLK_VPP0_MDP_RSZ			11
+#define CLK_VPP0_SMI_COMMON_MMSRAM		12
+#define CLK_VPP0_GALS_VDO0_LARB0_MMSRAM		13
+#define CLK_VPP0_GALS_VDO0_LARB1_MMSRAM		14
+#define CLK_VPP0_GALS_VENCSYS_MMSRAM		15
+#define CLK_VPP0_GALS_VENCSYS_CORE1_MMSRAM	16
+#define CLK_VPP0_GALS_INFRA_MMSRAM		17
+#define CLK_VPP0_GALS_CAMSYS_MMSRAM		18
+#define CLK_VPP0_GALS_VPP1_LARB5_MMSRAM		19
+#define CLK_VPP0_GALS_VPP1_LARB6_MMSRAM		20
+#define CLK_VPP0_SMI_REORDER_MMSRAM		21
+#define CLK_VPP0_SMI_IOMMU			22
+#define CLK_VPP0_GALS_IMGSYS_CAMSYS		23
+#define CLK_VPP0_MDP_RDMA			24
+#define CLK_VPP0_MDP_WROT			25
+#define CLK_VPP0_GALS_EMI0_EMI1			26
+#define CLK_VPP0_SMI_SUB_COMMON_REORDER		27
+#define CLK_VPP0_SMI_RSI			28
+#define CLK_VPP0_SMI_COMMON_LARB4		29
+#define CLK_VPP0_GALS_VDEC_VDEC_CORE1		30
+#define CLK_VPP0_GALS_VPP1_WPESYS		31
+#define CLK_VPP0_GALS_VDO0_VDO1_VENCSYS_CORE1	32
+#define CLK_VPP0_FAKE_ENG			33
+#define CLK_VPP0_MDP_HDR			34
+#define CLK_VPP0_MDP_TDSHP			35
+#define CLK_VPP0_MDP_COLOR			36
+#define CLK_VPP0_MDP_OVL			37
+#define CLK_VPP0_DSIP_RDMA			38
+#define CLK_VPP0_DISP_WDMA			39
+#define CLK_VPP0_MDP_HMS			40
+#define CLK_VPP0_WARP0_RELAY			41
+#define CLK_VPP0_WARP0_ASYNC			42
+#define CLK_VPP0_WARP1_RELAY			43
+#define CLK_VPP0_WARP1_ASYNC			44
+#define CLK_VPP0_NR_CLK				45
+
+/* WPESYS */
+#define CLK_WPE_TOP_WPE_VPP0			0
+#define CLK_WPE_TOP_SMI_LARB7			1
+#define CLK_WPE_TOP_WPESYS_EVENT_TX		2
+#define CLK_WPE_TOP_SMI_LARB7_PCLK_EN		3
+#define CLK_WPE_TOP_NR_CLK			4
+
+/* WPESYS_VPP0 */
+#define CLK_WPE_VPP0_VECI			0
+#define CLK_WPE_VPP0_VEC2I			1
+#define CLK_WPE_VPP0_VEC3I			2
+#define CLK_WPE_VPP0_WPEO			3
+#define CLK_WPE_VPP0_MSKO			4
+#define CLK_WPE_VPP0_VGEN			5
+#define CLK_WPE_VPP0_EXT			6
+#define CLK_WPE_VPP0_VFC			7
+#define CLK_WPE_VPP0_CACH0_TOP			8
+#define CLK_WPE_VPP0_CACH0_DMA			9
+#define CLK_WPE_VPP0_CACH1_TOP			10
+#define CLK_WPE_VPP0_CACH1_DMA			11
+#define CLK_WPE_VPP0_CACH2_TOP			12
+#define CLK_WPE_VPP0_CACH2_DMA			13
+#define CLK_WPE_VPP0_CACH3_TOP			14
+#define CLK_WPE_VPP0_CACH3_DMA			15
+#define CLK_WPE_VPP0_PSP			16
+#define CLK_WPE_VPP0_PSP2			17
+#define CLK_WPE_VPP0_SYNC			18
+#define CLK_WPE_VPP0_C24			19
+#define CLK_WPE_VPP0_MDP_CROP			20
+#define CLK_WPE_VPP0_ISP_CROP			21
+#define CLK_WPE_VPP0_TOP			22
+#define CLK_WPE_VPP0_NR_CLK			23
+
+/* VPPSYS1 */
+#define CLK_VPP1_SVPP1_MDP_OVL			0
+#define CLK_VPP1_SVPP1_MDP_TCC			1
+#define CLK_VPP1_SVPP1_MDP_WROT			2
+#define CLK_VPP1_SVPP1_VPP_PAD			3
+#define CLK_VPP1_SVPP2_MDP_WROT			4
+#define CLK_VPP1_SVPP2_VPP_PAD			5
+#define CLK_VPP1_SVPP3_MDP_WROT			6
+#define CLK_VPP1_SVPP3_VPP_PAD			7
+#define CLK_VPP1_SVPP1_MDP_RDMA			8
+#define CLK_VPP1_SVPP1_MDP_FG			9
+#define CLK_VPP1_SVPP2_MDP_RDMA			10
+#define CLK_VPP1_SVPP2_MDP_FG			11
+#define CLK_VPP1_SVPP3_MDP_RDMA			12
+#define CLK_VPP1_SVPP3_MDP_FG			13
+#define CLK_VPP1_VPP_SPLIT			14
+#define CLK_VPP1_SVPP2_VDO0_DL_RELAY		15
+#define CLK_VPP1_SVPP1_MDP_RSZ			16
+#define CLK_VPP1_SVPP1_MDP_TDSHP		17
+#define CLK_VPP1_SVPP1_MDP_COLOR		18
+#define CLK_VPP1_SVPP3_VDO1_DL_RELAY		19
+#define CLK_VPP1_SVPP2_MDP_RSZ			20
+#define CLK_VPP1_SVPP2_VPP_MERGE		21
+#define CLK_VPP1_SVPP2_MDP_TDSHP		22
+#define CLK_VPP1_SVPP2_MDP_COLOR		23
+#define CLK_VPP1_SVPP3_MDP_RSZ			24
+#define CLK_VPP1_SVPP3_VPP_MERGE		25
+#define CLK_VPP1_SVPP3_MDP_TDSHP		26
+#define CLK_VPP1_SVPP3_MDP_COLOR		27
+#define CLK_VPP1_GALS5				28
+#define CLK_VPP1_GALS6				29
+#define CLK_VPP1_LARB5				30
+#define CLK_VPP1_LARB6				31
+#define CLK_VPP1_SVPP1_MDP_HDR			32
+#define CLK_VPP1_SVPP1_MDP_AAL			33
+#define CLK_VPP1_SVPP2_MDP_HDR			34
+#define CLK_VPP1_SVPP2_MDP_AAL			35
+#define CLK_VPP1_SVPP3_MDP_HDR			36
+#define CLK_VPP1_SVPP3_MDP_AAL			37
+#define CLK_VPP1_DISP_MUTEX			38
+#define CLK_VPP1_SVPP2_VDO1_DL_RELAY		39
+#define CLK_VPP1_SVPP3_VDO0_DL_RELAY		40
+#define CLK_VPP1_VPP0_DL_ASYNC			41
+#define CLK_VPP1_VPP0_DL1_RELAY			42
+#define CLK_VPP1_LARB5_FAKE_ENG			43
+#define CLK_VPP1_LARB6_FAKE_ENG			44
+#define CLK_VPP1_HDMI_META			45
+#define CLK_VPP1_VPP_SPLIT_HDMI			46
+#define CLK_VPP1_DGI_IN				47
+#define CLK_VPP1_DGI_OUT			48
+#define CLK_VPP1_VPP_SPLIT_DGI			49
+#define CLK_VPP1_DL_CON_OCC			50
+#define CLK_VPP1_VPP_SPLIT_26M			51
+#define CLK_VPP1_NR_CLK				52
+
+/* IMGSYS */
+#define CLK_IMGSYS_MAIN_LARB9			0
+#define CLK_IMGSYS_MAIN_TRAW0			1
+#define CLK_IMGSYS_MAIN_TRAW1			2
+#define CLK_IMGSYS_MAIN_VCORE_GALS		3
+#define CLK_IMGSYS_MAIN_DIP0			4
+#define CLK_IMGSYS_MAIN_WPE0			5
+#define CLK_IMGSYS_MAIN_IPE			6
+#define CLK_IMGSYS_MAIN_WPE1			7
+#define CLK_IMGSYS_MAIN_WPE2			8
+#define CLK_IMGSYS_MAIN_GALS			9
+#define CLK_IMGSYS_MAIN_NR_CLK			10
+
+/* IMGSYS1_DIP_TOP */
+#define CLK_IMGSYS1_DIP_TOP_LARB10		0
+#define CLK_IMGSYS1_DIP_TOP_DIP_TOP		1
+#define CLK_IMGSYS1_DIP_TOP_NR_CLK		2
+
+/* IMGSYS1_DIP_NR */
+#define CLK_IMGSYS1_DIP_NR_LARB15		0
+#define CLK_IMGSYS1_DIP_NR_DIP_NR		1
+#define CLK_IMGSYS1_DIP_NR_NR_CLK		2
+
+/* IMGSYS_WPE1 */
+#define CLK_IMGSYS_WPE1_LARB11			0
+#define CLK_IMGSYS_WPE1				1
+#define CLK_IMGSYS_WPE1_NR_CLK			2
+
+/* IPESYS */
+#define CLK_IPE_DPE				0
+#define CLK_IPE_FDVT				1
+#define CLK_IPE_ME				2
+#define CLK_IPESYS_TOP				3
+#define CLK_IPE_SMI_LARB12			4
+#define CLK_IPE_NR_CLK				5
+
+/* IMGSYS_WPE2 */
+#define CLK_IMGSYS_WPE2_LARB11			0
+#define CLK_IMGSYS_WPE2				1
+#define CLK_IMGSYS_WPE2_NR_CLK			2
+
+/* IMGSYS_WPE3 */
+#define CLK_IMGSYS_WPE3_LARB11			0
+#define CLK_IMGSYS_WPE3				1
+#define CLK_IMGSYS_WPE3_NR_CLK			2
+
+/* CAMSYS */
+#define CLK_CAM_MAIN_LARB13			0
+#define CLK_CAM_MAIN_LARB14			1
+#define CLK_CAM_MAIN_CAM			2
+#define CLK_CAM_MAIN_CAM_SUBA			3
+#define CLK_CAM_MAIN_CAM_SUBB			4
+#define CLK_CAM_MAIN_CAMTG			5
+#define CLK_CAM_MAIN_SENINF			6
+#define CLK_CAM_MAIN_GCAMSVA			7
+#define CLK_CAM_MAIN_GCAMSVB			8
+#define CLK_CAM_MAIN_GCAMSVC			9
+#define CLK_CAM_MAIN_GCAMSVD			10
+#define CLK_CAM_MAIN_GCAMSVE			11
+#define CLK_CAM_MAIN_GCAMSVF			12
+#define CLK_CAM_MAIN_GCAMSVG			13
+#define CLK_CAM_MAIN_GCAMSVH			14
+#define CLK_CAM_MAIN_GCAMSVI			15
+#define CLK_CAM_MAIN_GCAMSVJ			16
+#define CLK_CAM_MAIN_CAMSV_TOP			17
+#define CLK_CAM_MAIN_CAMSV_CQ_A			18
+#define CLK_CAM_MAIN_CAMSV_CQ_B			19
+#define CLK_CAM_MAIN_CAMSV_CQ_C			20
+#define CLK_CAM_MAIN_FAKE_ENG			21
+#define CLK_CAM_MAIN_CAM2MM0_GALS		22
+#define CLK_CAM_MAIN_CAM2MM1_GALS		23
+#define CLK_CAM_MAIN_CAM2SYS_GALS		24
+#define CLK_CAM_MAIN_NR_CLK			25
+
+/* CAMSYS_RAWA */
+#define CLK_CAM_RAWA_LARBX			0
+#define CLK_CAM_RAWA_CAM			1
+#define CLK_CAM_RAWA_CAMTG			2
+#define CLK_CAM_RAWA_NR_CLK			3
+
+/* CAMSYS_YUVA */
+#define CLK_CAM_YUVA_LARBX			0
+#define CLK_CAM_YUVA_CAM			1
+#define CLK_CAM_YUVA_CAMTG			2
+#define CLK_CAM_YUVA_NR_CLK			3
+
+/* CAMSYS_RAWB */
+#define CLK_CAM_RAWB_LARBX			0
+#define CLK_CAM_RAWB_CAM			1
+#define CLK_CAM_RAWB_CAMTG			2
+#define CLK_CAM_RAWB_NR_CLK			3
+
+/* CAMSYS_YUVB */
+#define CLK_CAM_YUVB_LARBX			0
+#define CLK_CAM_YUVB_CAM			1
+#define CLK_CAM_YUVB_CAMTG			2
+#define CLK_CAM_YUVB_NR_CLK			3
+
+/* CCUSYS */
+#define CLK_CCU_LARB27				0
+#define CLK_CCU_AHB				1
+#define CLK_CCU_CCU0				2
+#define CLK_CCU_NR_CLK				3
+
+/* VDECSYS_SOC */
+#define CLK_VDEC1_SOC_LARB1			0
+#define CLK_VDEC1_SOC_LAT			1
+#define CLK_VDEC1_SOC_LAT_ACTIVE			2
+#define CLK_VDEC1_SOC_LAT_ENG			3
+#define CLK_VDEC1_SOC_VDEC			4
+#define CLK_VDEC1_SOC_VDEC_ACTIVE		5
+#define CLK_VDEC1_SOC_VDEC_ENG			6
+#define CLK_VDEC1_NR_CLK				7
+
+/* VDECSYS */
+#define CLK_VDEC2_LARB1				0
+#define CLK_VDEC2_LAT				1
+#define CLK_VDEC2_VDEC				2
+#define CLK_VDEC2_VDEC_ACTIVE			3
+#define CLK_VDEC2_VDEC_ENG			4
+#define CLK_VDEC2_NR_CLK				5
+
+/* VENCSYS */
+#define CLK_VENC1_LARB			0
+#define CLK_VENC1_VENC			1
+#define CLK_VENC1_JPGENC			2
+#define CLK_VENC1_JPGDEC			3
+#define CLK_VENC1_JPGDEC_C1			4
+#define CLK_VENC1_GALS			5
+#define CLK_VENC1_GALS_SRAM			6
+#define CLK_VENC1_NR_CLK				7
+
+/* VDOSYS0 */
+#define CLK_VDO0_DISP_OVL0			0
+#define CLK_VDO0_FAKE_ENG0			1
+#define CLK_VDO0_DISP_CCORR0			2
+#define CLK_VDO0_DISP_MUTEX0			3
+#define CLK_VDO0_DISP_GAMMA0			4
+#define CLK_VDO0_DISP_DITHER0			5
+#define CLK_VDO0_DISP_WDMA0			6
+#define CLK_VDO0_DISP_RDMA0			7
+#define CLK_VDO0_DSI0				8
+#define CLK_VDO0_DSI1				9
+#define CLK_VDO0_DSC_WRAP0			10
+#define CLK_VDO0_VPP_MERGE0			11
+#define CLK_VDO0_DP_INTF0			12
+#define CLK_VDO0_DISP_AAL0			13
+#define CLK_VDO0_INLINEROT0			14
+#define CLK_VDO0_APB_BUS			15
+#define CLK_VDO0_DISP_COLOR0			16
+#define CLK_VDO0_MDP_WROT0			17
+#define CLK_VDO0_DISP_RSZ0			18
+#define CLK_VDO0_DISP_POSTMASK0			19
+#define CLK_VDO0_FAKE_ENG1			20
+#define CLK_VDO0_DL_ASYNC2			21
+#define CLK_VDO0_DL_RELAY3			22
+#define CLK_VDO0_DL_RELAY4			23
+#define CLK_VDO0_SMI_GALS			24
+#define CLK_VDO0_SMI_COMMON			25
+#define CLK_VDO0_SMI_EMI			26
+#define CLK_VDO0_SMI_IOMMU			27
+#define CLK_VDO0_SMI_LARB			28
+#define CLK_VDO0_SMI_RSI			29
+#define CLK_VDO0_DSI0_DSI			30
+#define CLK_VDO0_DSI1_DSI			31
+#define CLK_VDO0_DP_INTF0_DP_INTF		32
+#define CLK_VDO0_NR_CLK				33
+
+/* VDOSYS1 */
+#define CLK_VDO1_SMI_LARB2			0
+#define CLK_VDO1_SMI_LARB3			1
+#define CLK_VDO1_GALS				2
+#define CLK_VDO1_FAKE_ENG0			3
+#define CLK_VDO1_FAKE_ENG1			4
+#define CLK_VDO1_MDP_RDMA0			5
+#define CLK_VDO1_MDP_RDMA1			6
+#define CLK_VDO1_MDP_RDMA2			7
+#define CLK_VDO1_MDP_RDMA3			8
+#define CLK_VDO1_VPP_MERGE0			9
+#define CLK_VDO1_VPP_MERGE1			10
+#define CLK_VDO1_VPP_MERGE2			11
+#define CLK_VDO1_VPP_MERGE3			12
+#define CLK_VDO1_VPP_MERGE4			13
+#define CLK_VDO1_VPP2_TO_VDO1_DL_ASYNC		14
+#define CLK_VDO1_VPP3_TO_VDO1_DL_ASYNC		15
+#define CLK_VDO1_DISP_MUTEX			16
+#define CLK_VDO1_MDP_RDMA4			17
+#define CLK_VDO1_MDP_RDMA5			18
+#define CLK_VDO1_MDP_RDMA6			19
+#define CLK_VDO1_MDP_RDMA7			20
+#define CLK_VDO1_DP_INTF0_MMCK			21
+#define CLK_VDO1_DPI0_MM			22
+#define CLK_VDO1_DPI1_MM			23
+#define CLK_VDO1_MERGE0_DL_ASYNC		24
+#define CLK_VDO1_MERGE1_DL_ASYNC		25
+#define CLK_VDO1_MERGE2_DL_ASYNC		26
+#define CLK_VDO1_MERGE3_DL_ASYNC		27
+#define CLK_VDO1_MERGE4_DL_ASYNC		28
+#define CLK_VDO1_DSC_VDO1_DL_ASYNC		29
+#define CLK_VDO1_MERGE_VDO1_DL_ASYNC		30
+#define CLK_VDO1_PADDING0			31
+#define CLK_VDO1_PADDING1			32
+#define CLK_VDO1_PADDING2			33
+#define CLK_VDO1_PADDING3			34
+#define CLK_VDO1_PADDING4			35
+#define CLK_VDO1_PADDING5			36
+#define CLK_VDO1_PADDING6			37
+#define CLK_VDO1_PADDING7			38
+#define CLK_VDO1_DISP_RSZ0			39
+#define CLK_VDO1_DISP_RSZ1			40
+#define CLK_VDO1_DISP_RSZ2			41
+#define CLK_VDO1_DISP_RSZ3			42
+#define CLK_VDO1_HDR_VDO_FE0			43
+#define CLK_VDO1_HDR_GFX_FE0			44
+#define CLK_VDO1_HDR_VDO_BE			45
+#define CLK_VDO1_HDR_VDO_FE1			46
+#define CLK_VDO1_HDR_GFX_FE1			47
+#define CLK_VDO1_DISP_MIXER			48
+#define CLK_VDO1_HDR_VDO_FE0_DL_ASYNC		49
+#define CLK_VDO1_HDR_VDO_FE1_DL_ASYNC		50
+#define CLK_VDO1_HDR_GFX_FE0_DL_ASYNC		51
+#define CLK_VDO1_HDR_GFX_FE1_DL_ASYNC		52
+#define CLK_VDO1_HDR_VDO_BE_DL_ASYNC		53
+#define CLK_VDO1_DPI0				54
+#define CLK_VDO1_DISP_MONITOR_DPI0		55
+#define CLK_VDO1_DPI1				56
+#define CLK_VDO1_DISP_MONITOR_DPI1		57
+#define CLK_VDO1_DPINTF				58
+#define CLK_VDO1_DISP_MONITOR_DPINTF		59
+#define CLK_VDO1_26M_SLOW			60
+#define CLK_VDO1_NR_CLK				61
+
+#endif /* _DT_BINDINGS_CLK_MT8188_H */
-- 
cgit v1.2.3


From 32a7a02117de01199ce15ec121ac7af417c340eb Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 7 Mar 2023 14:37:25 +0100
Subject: thermal/core: Relocate the traces definition in thermal directory

The traces are exported but only local to the thermal core code. On
the other side, the traces take the thermal zone device structure as
argument, thus they have to rely on the exported thermal.h header
file. As we want to move the structure to the private thermal core
header, first we have to relocate those traces to the same place as
many drivers do.

Cc: Steven Rostedt <rostedt@goodmis.org>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230307133735.90772-2-daniel.lezcano@linaro.org
---
 drivers/thermal/Makefile                       |   3 +-
 drivers/thermal/cpufreq_cooling.c              |   2 +-
 drivers/thermal/devfreq_cooling.c              |   2 +-
 drivers/thermal/gov_fair_share.c               |   2 +-
 drivers/thermal/gov_power_allocator.c          |   2 +-
 drivers/thermal/gov_step_wise.c                |   2 +-
 drivers/thermal/thermal_core.c                 |   2 +-
 drivers/thermal/thermal_helpers.c              |   3 +-
 drivers/thermal/thermal_trace.h                | 205 +++++++++++++++++++++++++
 drivers/thermal/thermal_trace_ipa.h            |  94 ++++++++++++
 include/trace/events/thermal.h                 | 199 ------------------------
 include/trace/events/thermal_power_allocator.h |  88 -----------
 12 files changed, 308 insertions(+), 296 deletions(-)
 create mode 100644 drivers/thermal/thermal_trace.h
 create mode 100644 drivers/thermal/thermal_trace_ipa.h
 delete mode 100644 include/trace/events/thermal.h
 delete mode 100644 include/trace/events/thermal_power_allocator.h

(limited to 'include')

diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index eed300e83d48..058664bc3ec0 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for sensor chip drivers.
 #
-
+CFLAGS_thermal_core.o		:= -I$(src)
 obj-$(CONFIG_THERMAL)		+= thermal_sys.o
 thermal_sys-y			+= thermal_core.o thermal_sysfs.o
 thermal_sys-y			+= thermal_trip.o thermal_helpers.o
@@ -16,6 +16,7 @@ thermal_sys-$(CONFIG_THERMAL_OF)		+= thermal_of.o
 thermal_sys-$(CONFIG_THERMAL_ACPI)		+= thermal_acpi.o
 
 # governors
+CFLAGS_gov_power_allocator.o			:= -I$(src)
 thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE)	+= gov_fair_share.o
 thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG)	+= gov_bang_bang.o
 thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE)	+= gov_step_wise.o
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 9f8b438fcf8f..65ef08b30334 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -23,7 +23,7 @@
 #include <linux/thermal.h>
 #include <linux/units.h>
 
-#include <trace/events/thermal.h>
+#include "thermal_trace.h"
 
 /*
  * Cooling state <-> CPUFreq frequency
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 24b474925cd6..262e62ab6cf2 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -20,7 +20,7 @@
 #include <linux/thermal.h>
 #include <linux/units.h>
 
-#include <trace/events/thermal.h>
+#include "thermal_trace.h"
 
 #define SCALE_ERROR_MITIGATION	100
 
diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index aad7d5fe3a14..03c2daeb6ee8 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/thermal.h>
-#include <trace/events/thermal.h>
+#include "thermal_trace.h"
 
 #include "thermal_core.h"
 
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index 0eaf1527d3e3..8642f1096b91 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -12,7 +12,7 @@
 #include <linux/thermal.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/thermal_power_allocator.h>
+#include "thermal_trace_ipa.h"
 
 #include "thermal_core.h"
 
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 31235e169c5a..3d3067804df2 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -12,7 +12,7 @@
 
 #include <linux/thermal.h>
 #include <linux/minmax.h>
-#include <trace/events/thermal.h>
+#include "thermal_trace.h"
 
 #include "thermal_core.h"
 
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5ae72f314683..ed628d21e1bc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -22,7 +22,7 @@
 #include <linux/suspend.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/thermal.h>
+#include "thermal_trace.h"
 
 #include "thermal_core.h"
 #include "thermal_hwmon.h"
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 92e7e7fd7357..cfba0965a22d 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -19,9 +19,8 @@
 #include <linux/string.h>
 #include <linux/sysfs.h>
 
-#include <trace/events/thermal.h>
-
 #include "thermal_core.h"
+#include "thermal_trace.h"
 
 int get_tz_trend(struct thermal_zone_device *tz, int trip)
 {
diff --git a/drivers/thermal/thermal_trace.h b/drivers/thermal/thermal_trace.h
new file mode 100644
index 000000000000..459c8ce6cf3b
--- /dev/null
+++ b/drivers/thermal/thermal_trace.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM thermal
+
+#if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_THERMAL_H
+
+#include <linux/devfreq.h>
+#include <linux/thermal.h>
+#include <linux/tracepoint.h>
+
+TRACE_DEFINE_ENUM(THERMAL_TRIP_CRITICAL);
+TRACE_DEFINE_ENUM(THERMAL_TRIP_HOT);
+TRACE_DEFINE_ENUM(THERMAL_TRIP_PASSIVE);
+TRACE_DEFINE_ENUM(THERMAL_TRIP_ACTIVE);
+
+#define show_tzt_type(type)					\
+	__print_symbolic(type,					\
+			 { THERMAL_TRIP_CRITICAL, "CRITICAL"},	\
+			 { THERMAL_TRIP_HOT,      "HOT"},	\
+			 { THERMAL_TRIP_PASSIVE,  "PASSIVE"},	\
+			 { THERMAL_TRIP_ACTIVE,   "ACTIVE"})
+
+TRACE_EVENT(thermal_temperature,
+
+	TP_PROTO(struct thermal_zone_device *tz),
+
+	TP_ARGS(tz),
+
+	TP_STRUCT__entry(
+		__string(thermal_zone, tz->type)
+		__field(int, id)
+		__field(int, temp_prev)
+		__field(int, temp)
+	),
+
+	TP_fast_assign(
+		__assign_str(thermal_zone, tz->type);
+		__entry->id = tz->id;
+		__entry->temp_prev = tz->last_temperature;
+		__entry->temp = tz->temperature;
+	),
+
+	TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d",
+		__get_str(thermal_zone), __entry->id, __entry->temp_prev,
+		__entry->temp)
+);
+
+TRACE_EVENT(cdev_update,
+
+	TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target),
+
+	TP_ARGS(cdev, target),
+
+	TP_STRUCT__entry(
+		__string(type, cdev->type)
+		__field(unsigned long, target)
+	),
+
+	TP_fast_assign(
+		__assign_str(type, cdev->type);
+		__entry->target = target;
+	),
+
+	TP_printk("type=%s target=%lu", __get_str(type), __entry->target)
+);
+
+TRACE_EVENT(thermal_zone_trip,
+
+	TP_PROTO(struct thermal_zone_device *tz, int trip,
+		enum thermal_trip_type trip_type),
+
+	TP_ARGS(tz, trip, trip_type),
+
+	TP_STRUCT__entry(
+		__string(thermal_zone, tz->type)
+		__field(int, id)
+		__field(int, trip)
+		__field(enum thermal_trip_type, trip_type)
+	),
+
+	TP_fast_assign(
+		__assign_str(thermal_zone, tz->type);
+		__entry->id = tz->id;
+		__entry->trip = trip;
+		__entry->trip_type = trip_type;
+	),
+
+	TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%s",
+		__get_str(thermal_zone), __entry->id, __entry->trip,
+		show_tzt_type(__entry->trip_type))
+);
+
+#ifdef CONFIG_CPU_THERMAL
+TRACE_EVENT(thermal_power_cpu_get_power_simple,
+	TP_PROTO(int cpu, u32 power),
+
+	TP_ARGS(cpu, power),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(u32, power)
+	),
+
+	TP_fast_assign(
+		__entry->cpu = cpu;
+		__entry->power = power;
+	),
+
+	TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power)
+);
+
+TRACE_EVENT(thermal_power_cpu_limit,
+	TP_PROTO(const struct cpumask *cpus, unsigned int freq,
+		unsigned long cdev_state, u32 power),
+
+	TP_ARGS(cpus, freq, cdev_state, power),
+
+	TP_STRUCT__entry(
+		__bitmask(cpumask, num_possible_cpus())
+		__field(unsigned int,  freq      )
+		__field(unsigned long, cdev_state)
+		__field(u32,           power     )
+	),
+
+	TP_fast_assign(
+		__assign_bitmask(cpumask, cpumask_bits(cpus),
+				num_possible_cpus());
+		__entry->freq = freq;
+		__entry->cdev_state = cdev_state;
+		__entry->power = power;
+	),
+
+	TP_printk("cpus=%s freq=%u cdev_state=%lu power=%u",
+		__get_bitmask(cpumask), __entry->freq, __entry->cdev_state,
+		__entry->power)
+);
+#endif /* CONFIG_CPU_THERMAL */
+
+#ifdef CONFIG_DEVFREQ_THERMAL
+TRACE_EVENT(thermal_power_devfreq_get_power,
+	TP_PROTO(struct thermal_cooling_device *cdev,
+		 struct devfreq_dev_status *status, unsigned long freq,
+		u32 power),
+
+	TP_ARGS(cdev, status,  freq, power),
+
+	TP_STRUCT__entry(
+		__string(type,         cdev->type    )
+		__field(unsigned long, freq          )
+		__field(u32,           busy_time)
+		__field(u32,           total_time)
+		__field(u32,           power)
+	),
+
+	TP_fast_assign(
+		__assign_str(type, cdev->type);
+		__entry->freq = freq;
+		__entry->busy_time = status->busy_time;
+		__entry->total_time = status->total_time;
+		__entry->power = power;
+	),
+
+	TP_printk("type=%s freq=%lu load=%u power=%u",
+		__get_str(type), __entry->freq,
+		__entry->total_time == 0 ? 0 :
+			(100 * __entry->busy_time) / __entry->total_time,
+		__entry->power)
+);
+
+TRACE_EVENT(thermal_power_devfreq_limit,
+	TP_PROTO(struct thermal_cooling_device *cdev, unsigned long freq,
+		unsigned long cdev_state, u32 power),
+
+	TP_ARGS(cdev, freq, cdev_state, power),
+
+	TP_STRUCT__entry(
+		__string(type,         cdev->type)
+		__field(unsigned int,  freq      )
+		__field(unsigned long, cdev_state)
+		__field(u32,           power     )
+	),
+
+	TP_fast_assign(
+		__assign_str(type, cdev->type);
+		__entry->freq = freq;
+		__entry->cdev_state = cdev_state;
+		__entry->power = power;
+	),
+
+	TP_printk("type=%s freq=%u cdev_state=%lu power=%u",
+		__get_str(type), __entry->freq, __entry->cdev_state,
+		__entry->power)
+);
+#endif /* CONFIG_DEVFREQ_THERMAL */
+#endif /* _TRACE_THERMAL_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE thermal_trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/thermal/thermal_trace_ipa.h b/drivers/thermal/thermal_trace_ipa.h
new file mode 100644
index 000000000000..84568db5421b
--- /dev/null
+++ b/drivers/thermal/thermal_trace_ipa.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM thermal_power_allocator
+
+#if !defined(_TRACE_THERMAL_POWER_ALLOCATOR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_THERMAL_POWER_ALLOCATOR_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(thermal_power_allocator,
+	TP_PROTO(struct thermal_zone_device *tz, u32 *req_power,
+		 u32 total_req_power, u32 *granted_power,
+		 u32 total_granted_power, size_t num_actors,
+		 u32 power_range, u32 max_allocatable_power,
+		 int current_temp, s32 delta_temp),
+	TP_ARGS(tz, req_power, total_req_power, granted_power,
+		total_granted_power, num_actors, power_range,
+		max_allocatable_power, current_temp, delta_temp),
+	TP_STRUCT__entry(
+		__field(int,           tz_id          )
+		__dynamic_array(u32,   req_power, num_actors    )
+		__field(u32,           total_req_power          )
+		__dynamic_array(u32,   granted_power, num_actors)
+		__field(u32,           total_granted_power      )
+		__field(size_t,        num_actors               )
+		__field(u32,           power_range              )
+		__field(u32,           max_allocatable_power    )
+		__field(int,           current_temp             )
+		__field(s32,           delta_temp               )
+	),
+	TP_fast_assign(
+		__entry->tz_id = tz->id;
+		memcpy(__get_dynamic_array(req_power), req_power,
+			num_actors * sizeof(*req_power));
+		__entry->total_req_power = total_req_power;
+		memcpy(__get_dynamic_array(granted_power), granted_power,
+			num_actors * sizeof(*granted_power));
+		__entry->total_granted_power = total_granted_power;
+		__entry->num_actors = num_actors;
+		__entry->power_range = power_range;
+		__entry->max_allocatable_power = max_allocatable_power;
+		__entry->current_temp = current_temp;
+		__entry->delta_temp = delta_temp;
+	),
+
+	TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%d delta_temperature=%d",
+		__entry->tz_id,
+		__print_array(__get_dynamic_array(req_power),
+                              __entry->num_actors, 4),
+		__entry->total_req_power,
+		__print_array(__get_dynamic_array(granted_power),
+                              __entry->num_actors, 4),
+		__entry->total_granted_power, __entry->power_range,
+		__entry->max_allocatable_power, __entry->current_temp,
+		__entry->delta_temp)
+);
+
+TRACE_EVENT(thermal_power_allocator_pid,
+	TP_PROTO(struct thermal_zone_device *tz, s32 err, s32 err_integral,
+		 s64 p, s64 i, s64 d, s32 output),
+	TP_ARGS(tz, err, err_integral, p, i, d, output),
+	TP_STRUCT__entry(
+		__field(int, tz_id       )
+		__field(s32, err         )
+		__field(s32, err_integral)
+		__field(s64, p           )
+		__field(s64, i           )
+		__field(s64, d           )
+		__field(s32, output      )
+	),
+	TP_fast_assign(
+		__entry->tz_id = tz->id;
+		__entry->err = err;
+		__entry->err_integral = err_integral;
+		__entry->p = p;
+		__entry->i = i;
+		__entry->d = d;
+		__entry->output = output;
+	),
+
+	TP_printk("thermal_zone_id=%d err=%d err_integral=%d p=%lld i=%lld d=%lld output=%d",
+		  __entry->tz_id, __entry->err, __entry->err_integral,
+		  __entry->p, __entry->i, __entry->d, __entry->output)
+);
+#endif /* _TRACE_THERMAL_POWER_ALLOCATOR_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE thermal_trace_ipa
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
deleted file mode 100644
index e58bf3072f32..000000000000
--- a/include/trace/events/thermal.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM thermal
-
-#if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_THERMAL_H
-
-#include <linux/devfreq.h>
-#include <linux/thermal.h>
-#include <linux/tracepoint.h>
-
-TRACE_DEFINE_ENUM(THERMAL_TRIP_CRITICAL);
-TRACE_DEFINE_ENUM(THERMAL_TRIP_HOT);
-TRACE_DEFINE_ENUM(THERMAL_TRIP_PASSIVE);
-TRACE_DEFINE_ENUM(THERMAL_TRIP_ACTIVE);
-
-#define show_tzt_type(type)					\
-	__print_symbolic(type,					\
-			 { THERMAL_TRIP_CRITICAL, "CRITICAL"},	\
-			 { THERMAL_TRIP_HOT,      "HOT"},	\
-			 { THERMAL_TRIP_PASSIVE,  "PASSIVE"},	\
-			 { THERMAL_TRIP_ACTIVE,   "ACTIVE"})
-
-TRACE_EVENT(thermal_temperature,
-
-	TP_PROTO(struct thermal_zone_device *tz),
-
-	TP_ARGS(tz),
-
-	TP_STRUCT__entry(
-		__string(thermal_zone, tz->type)
-		__field(int, id)
-		__field(int, temp_prev)
-		__field(int, temp)
-	),
-
-	TP_fast_assign(
-		__assign_str(thermal_zone, tz->type);
-		__entry->id = tz->id;
-		__entry->temp_prev = tz->last_temperature;
-		__entry->temp = tz->temperature;
-	),
-
-	TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d",
-		__get_str(thermal_zone), __entry->id, __entry->temp_prev,
-		__entry->temp)
-);
-
-TRACE_EVENT(cdev_update,
-
-	TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target),
-
-	TP_ARGS(cdev, target),
-
-	TP_STRUCT__entry(
-		__string(type, cdev->type)
-		__field(unsigned long, target)
-	),
-
-	TP_fast_assign(
-		__assign_str(type, cdev->type);
-		__entry->target = target;
-	),
-
-	TP_printk("type=%s target=%lu", __get_str(type), __entry->target)
-);
-
-TRACE_EVENT(thermal_zone_trip,
-
-	TP_PROTO(struct thermal_zone_device *tz, int trip,
-		enum thermal_trip_type trip_type),
-
-	TP_ARGS(tz, trip, trip_type),
-
-	TP_STRUCT__entry(
-		__string(thermal_zone, tz->type)
-		__field(int, id)
-		__field(int, trip)
-		__field(enum thermal_trip_type, trip_type)
-	),
-
-	TP_fast_assign(
-		__assign_str(thermal_zone, tz->type);
-		__entry->id = tz->id;
-		__entry->trip = trip;
-		__entry->trip_type = trip_type;
-	),
-
-	TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%s",
-		__get_str(thermal_zone), __entry->id, __entry->trip,
-		show_tzt_type(__entry->trip_type))
-);
-
-#ifdef CONFIG_CPU_THERMAL
-TRACE_EVENT(thermal_power_cpu_get_power_simple,
-	TP_PROTO(int cpu, u32 power),
-
-	TP_ARGS(cpu, power),
-
-	TP_STRUCT__entry(
-		__field(int, cpu)
-		__field(u32, power)
-	),
-
-	TP_fast_assign(
-		__entry->cpu = cpu;
-		__entry->power = power;
-	),
-
-	TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power)
-);
-
-TRACE_EVENT(thermal_power_cpu_limit,
-	TP_PROTO(const struct cpumask *cpus, unsigned int freq,
-		unsigned long cdev_state, u32 power),
-
-	TP_ARGS(cpus, freq, cdev_state, power),
-
-	TP_STRUCT__entry(
-		__bitmask(cpumask, num_possible_cpus())
-		__field(unsigned int,  freq      )
-		__field(unsigned long, cdev_state)
-		__field(u32,           power     )
-	),
-
-	TP_fast_assign(
-		__assign_bitmask(cpumask, cpumask_bits(cpus),
-				num_possible_cpus());
-		__entry->freq = freq;
-		__entry->cdev_state = cdev_state;
-		__entry->power = power;
-	),
-
-	TP_printk("cpus=%s freq=%u cdev_state=%lu power=%u",
-		__get_bitmask(cpumask), __entry->freq, __entry->cdev_state,
-		__entry->power)
-);
-#endif /* CONFIG_CPU_THERMAL */
-
-#ifdef CONFIG_DEVFREQ_THERMAL
-TRACE_EVENT(thermal_power_devfreq_get_power,
-	TP_PROTO(struct thermal_cooling_device *cdev,
-		 struct devfreq_dev_status *status, unsigned long freq,
-		u32 power),
-
-	TP_ARGS(cdev, status,  freq, power),
-
-	TP_STRUCT__entry(
-		__string(type,         cdev->type    )
-		__field(unsigned long, freq          )
-		__field(u32,           busy_time)
-		__field(u32,           total_time)
-		__field(u32,           power)
-	),
-
-	TP_fast_assign(
-		__assign_str(type, cdev->type);
-		__entry->freq = freq;
-		__entry->busy_time = status->busy_time;
-		__entry->total_time = status->total_time;
-		__entry->power = power;
-	),
-
-	TP_printk("type=%s freq=%lu load=%u power=%u",
-		__get_str(type), __entry->freq,
-		__entry->total_time == 0 ? 0 :
-			(100 * __entry->busy_time) / __entry->total_time,
-		__entry->power)
-);
-
-TRACE_EVENT(thermal_power_devfreq_limit,
-	TP_PROTO(struct thermal_cooling_device *cdev, unsigned long freq,
-		unsigned long cdev_state, u32 power),
-
-	TP_ARGS(cdev, freq, cdev_state, power),
-
-	TP_STRUCT__entry(
-		__string(type,         cdev->type)
-		__field(unsigned int,  freq      )
-		__field(unsigned long, cdev_state)
-		__field(u32,           power     )
-	),
-
-	TP_fast_assign(
-		__assign_str(type, cdev->type);
-		__entry->freq = freq;
-		__entry->cdev_state = cdev_state;
-		__entry->power = power;
-	),
-
-	TP_printk("type=%s freq=%u cdev_state=%lu power=%u",
-		__get_str(type), __entry->freq, __entry->cdev_state,
-		__entry->power)
-);
-#endif /* CONFIG_DEVFREQ_THERMAL */
-#endif /* _TRACE_THERMAL_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/events/thermal_power_allocator.h b/include/trace/events/thermal_power_allocator.h
deleted file mode 100644
index 1c8fb95544f9..000000000000
--- a/include/trace/events/thermal_power_allocator.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM thermal_power_allocator
-
-#if !defined(_TRACE_THERMAL_POWER_ALLOCATOR_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_THERMAL_POWER_ALLOCATOR_H
-
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(thermal_power_allocator,
-	TP_PROTO(struct thermal_zone_device *tz, u32 *req_power,
-		 u32 total_req_power, u32 *granted_power,
-		 u32 total_granted_power, size_t num_actors,
-		 u32 power_range, u32 max_allocatable_power,
-		 int current_temp, s32 delta_temp),
-	TP_ARGS(tz, req_power, total_req_power, granted_power,
-		total_granted_power, num_actors, power_range,
-		max_allocatable_power, current_temp, delta_temp),
-	TP_STRUCT__entry(
-		__field(int,           tz_id          )
-		__dynamic_array(u32,   req_power, num_actors    )
-		__field(u32,           total_req_power          )
-		__dynamic_array(u32,   granted_power, num_actors)
-		__field(u32,           total_granted_power      )
-		__field(size_t,        num_actors               )
-		__field(u32,           power_range              )
-		__field(u32,           max_allocatable_power    )
-		__field(int,           current_temp             )
-		__field(s32,           delta_temp               )
-	),
-	TP_fast_assign(
-		__entry->tz_id = tz->id;
-		memcpy(__get_dynamic_array(req_power), req_power,
-			num_actors * sizeof(*req_power));
-		__entry->total_req_power = total_req_power;
-		memcpy(__get_dynamic_array(granted_power), granted_power,
-			num_actors * sizeof(*granted_power));
-		__entry->total_granted_power = total_granted_power;
-		__entry->num_actors = num_actors;
-		__entry->power_range = power_range;
-		__entry->max_allocatable_power = max_allocatable_power;
-		__entry->current_temp = current_temp;
-		__entry->delta_temp = delta_temp;
-	),
-
-	TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%d delta_temperature=%d",
-		__entry->tz_id,
-		__print_array(__get_dynamic_array(req_power),
-                              __entry->num_actors, 4),
-		__entry->total_req_power,
-		__print_array(__get_dynamic_array(granted_power),
-                              __entry->num_actors, 4),
-		__entry->total_granted_power, __entry->power_range,
-		__entry->max_allocatable_power, __entry->current_temp,
-		__entry->delta_temp)
-);
-
-TRACE_EVENT(thermal_power_allocator_pid,
-	TP_PROTO(struct thermal_zone_device *tz, s32 err, s32 err_integral,
-		 s64 p, s64 i, s64 d, s32 output),
-	TP_ARGS(tz, err, err_integral, p, i, d, output),
-	TP_STRUCT__entry(
-		__field(int, tz_id       )
-		__field(s32, err         )
-		__field(s32, err_integral)
-		__field(s64, p           )
-		__field(s64, i           )
-		__field(s64, d           )
-		__field(s32, output      )
-	),
-	TP_fast_assign(
-		__entry->tz_id = tz->id;
-		__entry->err = err;
-		__entry->err_integral = err_integral;
-		__entry->p = p;
-		__entry->i = i;
-		__entry->d = d;
-		__entry->output = output;
-	),
-
-	TP_printk("thermal_zone_id=%d err=%d err_integral=%d p=%lld i=%lld d=%lld output=%d",
-		  __entry->tz_id, __entry->err, __entry->err_integral,
-		  __entry->p, __entry->i, __entry->d, __entry->output)
-);
-#endif /* _TRACE_THERMAL_POWER_ALLOCATOR_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 15ce79bd9daf956f2840959f81f8c97dce9d87c2 Mon Sep 17 00:00:00 2001
From: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Date: Thu, 30 Mar 2023 12:27:38 +0200
Subject: genetlink: make _genl_cmd_to_str static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Primarily to silence warnings like:
warning: no previous prototype for 'xxx_genl_cmd_to_str' [-Wmissing-prototypes]

Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Link: https://lore.kernel.org/r/20230330102744.2128122-2-christoph.boehmwalder@linbit.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genl_magic_func.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 4a4b387181ad..2984b0cb24b1 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -209,7 +209,7 @@ static int s_name ## _from_attrs_for_change(struct s_name *s,		\
  * Magic: define op number to op name mapping				{{{1
  *									{{{2
  */
-const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
+static const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
 {
 	switch (cmd) {
 #undef GENL_op
-- 
cgit v1.2.3


From dd2d6604407da5b1b260faee409cd601fe914ce9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 30 Mar 2023 21:47:31 -0700
Subject: net: minor reshuffle of napi_struct

napi_id is read by GRO and drivers to mark skbs, and it currently
sits at the end of the structure, in a mostly unused cache line.
Move it up into a hole, and separate the clearly control path
fields from the important ones.

Before:

struct napi_struct {
	struct list_head           poll_list;            /*     0    16 */
	long unsigned int          state;                /*    16     8 */
	int                        weight;               /*    24     4 */
	int                        defer_hard_irqs_count; /*    28     4 */
	long unsigned int          gro_bitmask;          /*    32     8 */
	int                        (*poll)(struct napi_struct *, int); /*    40     8 */
	int                        poll_owner;           /*    48     4 */

	/* XXX 4 bytes hole, try to pack */

	struct net_device *        dev;                  /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct gro_list            gro_hash[8];          /*    64   192 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	struct sk_buff *           skb;                  /*   256     8 */
	struct list_head           rx_list;              /*   264    16 */
	int                        rx_count;             /*   280     4 */

	/* XXX 4 bytes hole, try to pack */

	struct hrtimer             timer;                /*   288    64 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	struct list_head           dev_list;             /*   352    16 */
	struct hlist_node          napi_hash_node;       /*   368    16 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	unsigned int               napi_id;              /*   384     4 */

	/* XXX 4 bytes hole, try to pack */

	struct task_struct *       thread;               /*   392     8 */

	/* size: 400, cachelines: 7, members: 17 */
	/* sum members: 388, holes: 3, sum holes: 12 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 16 bytes */
};

After:

struct napi_struct {
	struct list_head           poll_list;            /*     0    16 */
	long unsigned int          state;                /*    16     8 */
	int                        weight;               /*    24     4 */
	int                        defer_hard_irqs_count; /*    28     4 */
	long unsigned int          gro_bitmask;          /*    32     8 */
	int                        (*poll)(struct napi_struct *, int); /*    40     8 */
	int                        poll_owner;           /*    48     4 */

	/* XXX 4 bytes hole, try to pack */

	struct net_device *        dev;                  /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct gro_list            gro_hash[8];          /*    64   192 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	struct sk_buff *           skb;                  /*   256     8 */
	struct list_head           rx_list;              /*   264    16 */
	int                        rx_count;             /*   280     4 */
	unsigned int               napi_id;              /*   284     4 */
	struct hrtimer             timer;                /*   288    64 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	struct task_struct *       thread;               /*   352     8 */
	struct list_head           dev_list;             /*   360    16 */
	struct hlist_node          napi_hash_node;       /*   376    16 */

	/* size: 392, cachelines: 7, members: 17 */
	/* sum members: 388, holes: 1, sum holes: 4 */
	/* paddings: 1, sum paddings: 4 */
	/* forced alignments: 1 */
	/* last cacheline: 8 bytes */
} __attribute__((__aligned__(8)));

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8c634091a65..62e093a6d6d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -367,11 +367,12 @@ struct napi_struct {
 	struct sk_buff		*skb;
 	struct list_head	rx_list; /* Pending GRO_NORMAL skbs */
 	int			rx_count; /* length of rx_list */
+	unsigned int		napi_id;
 	struct hrtimer		timer;
+	struct task_struct	*thread;
+	/* control-path-only fields follow */
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
-	unsigned int		napi_id;
-	struct task_struct	*thread;
 };
 
 enum {
-- 
cgit v1.2.3


From eb0d8623b9b805cbcab87620a363292552c19359 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Wed, 22 Feb 2023 10:42:52 +0100
Subject: soc: mediatek: cmdq: Add inline functions for !CONFIG_MTK_CMDQ

In preparation for a cleanup of ifdef instances of IS_REACHABLE() for
the CONFIG_MTK_CMDQ configuration option, add inline functions that
will either return a failure or, for void functions, do nothing.

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Chen-Yu Tsai <wenst@chromium.org>
Link: https://lore.kernel.org/r/20230222094253.23678-9-angelogioacchino.delregno@collabora.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 include/linux/soc/mediatek/mtk-cmdq.h | 114 ++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

(limited to 'include')

diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 2b498f4f3946..649955d2cf5c 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -27,6 +27,8 @@ struct cmdq_client {
 	struct mbox_chan *chan;
 };
 
+#if IS_ENABLED(CONFIG_MTK_CMDQ)
+
 /**
  * cmdq_dev_get_client_reg() - parse cmdq client reg from the device
  *			       node of CMDQ client
@@ -277,4 +279,116 @@ int cmdq_pkt_finalize(struct cmdq_pkt *pkt);
  */
 int cmdq_pkt_flush_async(struct cmdq_pkt *pkt);
 
+#else /* IS_ENABLED(CONFIG_MTK_CMDQ) */
+
+static inline int cmdq_dev_get_client_reg(struct device *dev,
+					  struct cmdq_client_reg *client_reg, int idx)
+{
+	return -ENODEV;
+}
+
+static inline struct cmdq_client *cmdq_mbox_create(struct device *dev, int index)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void cmdq_mbox_destroy(struct cmdq_client *client) { }
+
+static inline  struct cmdq_pkt *cmdq_pkt_create(struct cmdq_client *client, size_t size)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void cmdq_pkt_destroy(struct cmdq_pkt *pkt) { }
+
+static inline int cmdq_pkt_write(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_write_mask(struct cmdq_pkt *pkt, u8 subsys,
+				      u16 offset, u32 value, u32 mask)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_read_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx,
+				  u16 addr_low, u16 reg_idx)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_write_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx,
+				   u16 addr_low, u16 src_reg_idx)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_write_s_mask(struct cmdq_pkt *pkt, u16 high_addr_reg_idx,
+					u16 addr_low, u16 src_reg_idx, u32 mask)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_write_s_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx,
+					 u16 addr_low, u32 value)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_write_s_mask_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx,
+					      u16 addr_low, u32 value, u32 mask)
+{
+	return -ENOENT;
+}
+
+static inline int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event, bool clear)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_clear_event(struct cmdq_pkt *pkt, u16 event)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_set_event(struct cmdq_pkt *pkt, u16 event)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_poll(struct cmdq_pkt *pkt, u8 subsys,
+				u16 offset, u32 value)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys,
+				     u16 offset, u32 value, u32 mask)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
+{
+	return -EINVAL;
+}
+
+static inline int cmdq_pkt_flush_async(struct cmdq_pkt *pkt)
+{
+	return -EINVAL;
+}
+
+#endif /* IS_ENABLED(CONFIG_MTK_CMDQ) */
+
 #endif	/* __MTK_CMDQ_H__ */
-- 
cgit v1.2.3


From 86eb94bf8006a85738f0ccf49e3ce894e03922a6 Mon Sep 17 00:00:00 2001
From: Adrien Thierry <athierry@redhat.com>
Date: Wed, 29 Mar 2023 16:54:25 -0400
Subject: scsi: Revert "scsi: ufs: core: Initialize devfreq synchronously"

This reverts commit 7dafc3e007918384c8693ff8d70381b5c1e9c247.

This patch introduced a regression [1] where hba->pwr_info is used before
being initialized, which could create issues in ufshcd_scale_gear(). Revert
it until a better solution is found.

[1] https://lore.kernel.org/all/CAGaU9a_PMZhqv+YJ0r3w-hJMsR922oxW6Kg59vw+oen-NZ6Otw@mail.gmail.com

Signed-off-by: Adrien Thierry <athierry@redhat.com>
Link: https://lore.kernel.org/r/20230329205426.46393-1-athierry@redhat.com
Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 47 ++++++++++++++++-------------------------------
 include/ufs/ufshcd.h      |  1 -
 2 files changed, 16 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 37e178a9ac47..70b112038792 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1409,13 +1409,6 @@ static int ufshcd_devfreq_target(struct device *dev,
 	struct ufs_clk_info *clki;
 	unsigned long irq_flags;
 
-	/*
-	 * Skip devfreq if UFS initialization is not finished.
-	 * Otherwise ufs could be in a inconsistent state.
-	 */
-	if (!smp_load_acquire(&hba->logical_unit_scan_finished))
-		return 0;
-
 	if (!ufshcd_is_clkscaling_supported(hba))
 		return -EINVAL;
 
@@ -8399,6 +8392,22 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
 	if (ret)
 		goto out;
 
+	/* Initialize devfreq after UFS device is detected */
+	if (ufshcd_is_clkscaling_supported(hba)) {
+		memcpy(&hba->clk_scaling.saved_pwr_info.info,
+			&hba->pwr_info,
+			sizeof(struct ufs_pa_layer_attr));
+		hba->clk_scaling.saved_pwr_info.is_valid = true;
+		hba->clk_scaling.is_allowed = true;
+
+		ret = ufshcd_devfreq_init(hba);
+		if (ret)
+			goto out;
+
+		hba->clk_scaling.is_enabled = true;
+		ufshcd_init_clk_scaling_sysfs(hba);
+	}
+
 	ufs_bsg_probe(hba);
 	ufshpb_init(hba);
 	scsi_scan_host(hba->host);
@@ -8670,12 +8679,6 @@ out:
 	if (ret) {
 		pm_runtime_put_sync(hba->dev);
 		ufshcd_hba_exit(hba);
-	} else {
-		/*
-		 * Make sure that when reader code sees UFS initialization has finished,
-		 * all initialization steps have really been executed.
-		 */
-		smp_store_release(&hba->logical_unit_scan_finished, true);
 	}
 }
 
@@ -10316,30 +10319,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	 */
 	ufshcd_set_ufs_dev_active(hba);
 
-	/* Initialize devfreq */
-	if (ufshcd_is_clkscaling_supported(hba)) {
-		memcpy(&hba->clk_scaling.saved_pwr_info.info,
-			&hba->pwr_info,
-			sizeof(struct ufs_pa_layer_attr));
-		hba->clk_scaling.saved_pwr_info.is_valid = true;
-		hba->clk_scaling.is_allowed = true;
-
-		err = ufshcd_devfreq_init(hba);
-		if (err)
-			goto rpm_put_sync;
-
-		hba->clk_scaling.is_enabled = true;
-		ufshcd_init_clk_scaling_sysfs(hba);
-	}
-
 	async_schedule(ufshcd_async_scan, hba);
 	ufs_sysfs_add_nodes(hba->dev);
 
 	device_enable_async_suspend(dev);
 	return 0;
 
-rpm_put_sync:
-	pm_runtime_put_sync(dev);
 free_tmf_queue:
 	blk_mq_destroy_queue(hba->tmf_queue);
 	blk_put_queue(hba->tmf_queue);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 25aab8ec4f86..431c3afb2ce0 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -979,7 +979,6 @@ struct ufs_hba {
 	struct completion *uic_async_done;
 
 	enum ufshcd_state ufshcd_state;
-	bool logical_unit_scan_finished;
 	u32 eh_flags;
 	u32 intr_mask;
 	u16 ee_ctrl_mask;
-- 
cgit v1.2.3


From 543a827b1db3ef0a123dc7a48e8feb6585eae0a6 Mon Sep 17 00:00:00 2001
From: Stanley Chu <stanley.chu@mediatek.com>
Date: Thu, 30 Mar 2023 09:29:18 +0800
Subject: scsi: core: Clean up struct ufs_saved_pwr_info

The "is_valid" field of the struct ufs_saved_pwr_info is no longer used,
and this struct can be replaced by struct ufs_pa_layer_attr without any
changes to the functionality.

Signed-off-by: Stanley Chu <stanley.chu@mediatek.com>
Link: https://lore.kernel.org/r/20230330012918.13748-1-stanley.chu@mediatek.com
Reviewed-by: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 7 +++----
 include/ufs/ufshcd.h      | 7 +------
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index acae4e194ec4..03c47f9a2750 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1269,7 +1269,7 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
 	struct ufs_pa_layer_attr new_pwr_info;
 
 	if (scale_up) {
-		memcpy(&new_pwr_info, &hba->clk_scaling.saved_pwr_info.info,
+		memcpy(&new_pwr_info, &hba->clk_scaling.saved_pwr_info,
 		       sizeof(struct ufs_pa_layer_attr));
 	} else {
 		memcpy(&new_pwr_info, &hba->pwr_info,
@@ -1278,7 +1278,7 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
 		if (hba->pwr_info.gear_tx > hba->clk_scaling.min_gear ||
 		    hba->pwr_info.gear_rx > hba->clk_scaling.min_gear) {
 			/* save the current power mode */
-			memcpy(&hba->clk_scaling.saved_pwr_info.info,
+			memcpy(&hba->clk_scaling.saved_pwr_info,
 				&hba->pwr_info,
 				sizeof(struct ufs_pa_layer_attr));
 
@@ -10349,10 +10349,9 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 
 	/* Initialize devfreq */
 	if (ufshcd_is_clkscaling_supported(hba)) {
-		memcpy(&hba->clk_scaling.saved_pwr_info.info,
+		memcpy(&hba->clk_scaling.saved_pwr_info,
 			&hba->pwr_info,
 			sizeof(struct ufs_pa_layer_attr));
-		hba->clk_scaling.saved_pwr_info.is_valid = true;
 		hba->clk_scaling.is_allowed = true;
 
 		err = ufshcd_devfreq_init(hba);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 25aab8ec4f86..8b1046c21960 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -394,11 +394,6 @@ struct ufs_clk_gating {
 	struct workqueue_struct *clk_gating_workq;
 };
 
-struct ufs_saved_pwr_info {
-	struct ufs_pa_layer_attr info;
-	bool is_valid;
-};
-
 /**
  * struct ufs_clk_scaling - UFS clock scaling related data
  * @active_reqs: number of requests that are pending. If this is zero when
@@ -428,7 +423,7 @@ struct ufs_clk_scaling {
 	ktime_t window_start_t;
 	ktime_t busy_start_t;
 	struct device_attribute enable_attr;
-	struct ufs_saved_pwr_info saved_pwr_info;
+	struct ufs_pa_layer_attr saved_pwr_info;
 	struct workqueue_struct *workq;
 	struct work_struct suspend_work;
 	struct work_struct resume_work;
-- 
cgit v1.2.3


From c4bffeaa8d50b7279c5a76597efa4b06e709df63 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 2 Apr 2023 15:37:53 +0300
Subject: net: add struct kernel_hwtstamp_config and make
 net_hwtstamp_validate() use it

Jakub Kicinski suggested that we may want to add new UAPI for
controlling hardware timestamping through netlink in the future, and in
that case, we will be limited to the struct hwtstamp_config that is
currently passed in fixed binary format through the SIOCGHWTSTAMP and
SIOCSHWTSTAMP ioctls. It would be good if new kernel code already
started operating on an extensible kernel variant of that structure,
similar in concept to struct kernel_ethtool_coalesce vs struct
ethtool_coalesce.

Since struct hwtstamp_config is in include/uapi/linux/net_tstamp.h, here
we introduce include/linux/net_tstamp.h which shadows that other header,
but also includes it, so that existing includers of this header work as
before. In addition to that, we add the definition for the kernel-only
structure, and a helper which translates all fields by manual copying.
I am doing a manual copy in order to not force the alignment (or type)
of the fields of struct kernel_hwtstamp_config to be the same as of
struct hwtstamp_config, even though now, they are the same.

Link: https://lore.kernel.org/netdev/20230330223519.36ce7d23@kernel.org/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net_tstamp.h | 33 +++++++++++++++++++++++++++++++++
 net/core/dev_ioctl.c       |  7 +++++--
 2 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/net_tstamp.h

(limited to 'include')

diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
new file mode 100644
index 000000000000..fd67f3cc0c4b
--- /dev/null
+++ b/include/linux/net_tstamp.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_NET_TIMESTAMPING_H_
+#define _LINUX_NET_TIMESTAMPING_H_
+
+#include <uapi/linux/net_tstamp.h>
+
+/**
+ * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
+ *
+ * @flags: see struct hwtstamp_config
+ * @tx_type: see struct hwtstamp_config
+ * @rx_filter: see struct hwtstamp_config
+ *
+ * Prefer using this structure for in-kernel processing of hardware
+ * timestamping configuration, over the inextensible struct hwtstamp_config
+ * exposed to the %SIOCGHWTSTAMP and %SIOCSHWTSTAMP ioctl UAPI.
+ */
+struct kernel_hwtstamp_config {
+	int flags;
+	int tx_type;
+	int rx_filter;
+};
+
+static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
+					     const struct hwtstamp_config *cfg)
+{
+	kernel_cfg->flags = cfg->flags;
+	kernel_cfg->tx_type = cfg->tx_type;
+	kernel_cfg->rx_filter = cfg->rx_filter;
+}
+
+#endif /* _LINUX_NET_TIMESTAMPING_H_ */
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 34a0da5fbcfc..c532ef4d5dff 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -183,7 +183,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
 	return err;
 }
 
-static int net_hwtstamp_validate(const struct hwtstamp_config *cfg)
+static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
 {
 	enum hwtstamp_tx_types tx_type;
 	enum hwtstamp_rx_filters rx_filter;
@@ -259,13 +259,16 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
+	struct kernel_hwtstamp_config kernel_cfg;
 	struct hwtstamp_config cfg;
 	int err;
 
 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
 		return -EFAULT;
 
-	err = net_hwtstamp_validate(&cfg);
+	hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+
+	err = net_hwtstamp_validate(&kernel_cfg);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 88c0a6b503b7f4fffb68a8d49c3987870c5b1d6b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 2 Apr 2023 15:37:55 +0300
Subject: net: create a netdev notifier for DSA to reject PTP on DSA master

The fact that PTP 2-step TX timestamping is broken on DSA switches if
the master also timestamps the same packets is documented by commit
f685e609a301 ("net: dsa: Deny PTP on master if switch supports it").
We attempt to help the users avoid shooting themselves in the foot by
making DSA reject the timestamping ioctls on an interface that is a DSA
master, and the switch tree beneath it contains switches which are aware
of PTP.

The only problem is that there isn't an established way of intercepting
ndo_eth_ioctl calls, so DSA creates avoidable burden upon the network
stack by creating a struct dsa_netdevice_ops with overlaid function
pointers that are manually checked from the relevant call sites. There
used to be 2 such dsa_netdevice_ops, but now, ndo_eth_ioctl is the only
one left.

There is an ongoing effort to migrate driver-visible hardware timestamping
control from the ndo_eth_ioctl() based API to a new ndo_hwtstamp_set()
model, but DSA actively prevents that migration, since dsa_master_ioctl()
is currently coded to manually call the master's legacy ndo_eth_ioctl(),
and so, whenever a network device driver would be converted to the new
API, DSA's restrictions would be circumvented, because any device could
be used as a DSA master.

The established way for unrelated modules to react on a net device event
is via netdevice notifiers. So we create a new notifier which gets
called whenever there is an attempt to change hardware timestamping
settings on a device.

Finally, there is another reason why a netdev notifier will be a good
idea, besides strictly DSA, and this has to do with PHY timestamping.

With ndo_eth_ioctl(), all MAC drivers must manually call
phy_has_hwtstamp() before deciding whether to act upon SIOCSHWTSTAMP,
otherwise they must pass this ioctl to the PHY driver via
phy_mii_ioctl().

With the new ndo_hwtstamp_set() API, it will be desirable to simply not
make any calls into the MAC device driver when timestamping should be
performed at the PHY level.

But there exist drivers, such as the lan966x switch, which need to
install packet traps for PTP regardless of whether they are the layer
that provides the hardware timestamps, or the PHY is. That would be
impossible to support with the new API.

The proposal there, too, is to introduce a netdev notifier which acts as
a better cue for switching drivers to add or remove PTP packet traps,
than ndo_hwtstamp_set(). The one introduced here "almost" works there as
well, except for the fact that packet traps should only be installed if
the PHY driver succeeded to enable hardware timestamping, whereas here,
we need to deny hardware timestamping on the DSA master before it
actually gets enabled. This is why this notifier is called "PRE_", and
the notifier that would get used for PHY timestamping and packet traps
would be called NETDEV_CHANGE_HWTSTAMP. This isn't a new concept, for
example NETDEV_CHANGEUPPER and NETDEV_PRECHANGEUPPER do the same thing.

In expectation of future netlink UAPI, we also pass a non-NULL extack
pointer to the netdev notifier, and we make DSA populate it with an
informative reason for the rejection. To avoid making it go to waste, we
make the ioctl-based dev_set_hwtstamp() create a fake extack and print
the message to the kernel log.

Link: https://lore.kernel.org/netdev/20230401191215.tvveoi3lkawgg6g4@skbuf/
Link: https://lore.kernel.org/netdev/20230310164451.ls7bbs6pdzs4m6pw@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 ++++++++-
 include/net/dsa.h         | 51 -----------------------------------------------
 net/core/dev.c            |  8 +++-----
 net/core/dev_ioctl.c      | 16 +++++++++++++--
 net/dsa/master.c          | 50 ++++++++++++++--------------------------------
 net/dsa/master.h          |  3 +++
 net/dsa/slave.c           | 11 ++++++++++
 7 files changed, 54 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 62e093a6d6d1..a740be3bb911 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2878,6 +2878,7 @@ enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 	NETDEV_XDP_FEAT_CHANGE,
+	NETDEV_PRE_CHANGE_HWTSTAMP,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
@@ -2928,6 +2929,11 @@ struct netdev_notifier_pre_changeaddr_info {
 	const unsigned char *dev_addr;
 };
 
+struct netdev_notifier_hwtstamp_info {
+	struct netdev_notifier_info info; /* must be first */
+	struct kernel_hwtstamp_config *config;
+};
+
 enum netdev_offload_xstats_type {
 	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
 };
@@ -2984,7 +2990,8 @@ netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
 }
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
-
+int call_netdevice_notifiers_info(unsigned long val,
+				  struct netdev_notifier_info *info);
 
 extern rwlock_t				dev_base_lock;		/* Device list lock */
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index a15f17a38eca..8903053fa5aa 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -109,16 +109,6 @@ struct dsa_device_ops {
 	bool promisc_on_master;
 };
 
-/* This structure defines the control interfaces that are overlayed by the
- * DSA layer on top of the DSA CPU/management net_device instance. This is
- * used by the core net_device layer while calling various net_device_ops
- * function pointers.
- */
-struct dsa_netdevice_ops {
-	int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr,
-			     int cmd);
-};
-
 struct dsa_lag {
 	struct net_device *dev;
 	unsigned int id;
@@ -317,11 +307,6 @@ struct dsa_port {
 	 */
 	const struct ethtool_ops *orig_ethtool_ops;
 
-	/*
-	 * Original copy of the master netdev net_device_ops
-	 */
-	const struct dsa_netdevice_ops *netdev_ops;
-
 	/* List of MAC addresses that must be forwarded on this port.
 	 * These are only valid on CPU ports and DSA links.
 	 */
@@ -1339,42 +1324,6 @@ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb,
 #endif
 }
 
-#if IS_ENABLED(CONFIG_NET_DSA)
-static inline int __dsa_netdevice_ops_check(struct net_device *dev)
-{
-	int err = -EOPNOTSUPP;
-
-	if (!dev->dsa_ptr)
-		return err;
-
-	if (!dev->dsa_ptr->netdev_ops)
-		return err;
-
-	return 0;
-}
-
-static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
-				    int cmd)
-{
-	const struct dsa_netdevice_ops *ops;
-	int err;
-
-	err = __dsa_netdevice_ops_check(dev);
-	if (err)
-		return err;
-
-	ops = dev->dsa_ptr->netdev_ops;
-
-	return ops->ndo_eth_ioctl(dev, ifr, cmd);
-}
-#else
-static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
-				    int cmd)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds);
 void dsa_switch_shutdown(struct dsa_switch *ds);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0c4b21291348..7ce5985be84b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -160,8 +160,6 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 
 static int netif_rx_internal(struct sk_buff *skb);
-static int call_netdevice_notifiers_info(unsigned long val,
-					 struct netdev_notifier_info *info);
 static int call_netdevice_notifiers_extack(unsigned long val,
 					   struct net_device *dev,
 					   struct netlink_ext_ack *extack);
@@ -1614,7 +1612,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
-	N(XDP_FEAT_CHANGE)
+	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
@@ -1919,8 +1917,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev,
  *	are as for raw_notifier_call_chain().
  */
 
-static int call_netdevice_notifiers_info(unsigned long val,
-					 struct netdev_notifier_info *info)
+int call_netdevice_notifiers_info(unsigned long val,
+				  struct netdev_notifier_info *info)
 {
 	struct net *net = dev_net(info->dev);
 	int ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index c532ef4d5dff..6d772837eb3f 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,7 +259,11 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
+	struct netdev_notifier_hwtstamp_info info = {
+		.info.dev = dev,
+	};
 	struct kernel_hwtstamp_config kernel_cfg;
+	struct netlink_ext_ack extack = {};
 	struct hwtstamp_config cfg;
 	int err;
 
@@ -272,9 +276,17 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	err = dsa_ndo_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
-	if (err != -EOPNOTSUPP)
+	info.info.extack = &extack;
+	info.config = &kernel_cfg;
+
+	err = call_netdevice_notifiers_info(NETDEV_PRE_CHANGE_HWTSTAMP,
+					    &info.info);
+	err = notifier_to_errno(err);
+	if (err) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
 		return err;
+	}
 
 	return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
 }
diff --git a/net/dsa/master.c b/net/dsa/master.c
index e397641382ca..c2cabe6248b1 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -195,38 +195,31 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 	}
 }
 
-static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+/* Deny PTP operations on master if there is at least one switch in the tree
+ * that is PTP capable.
+ */
+int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+				   const struct kernel_hwtstamp_config *config,
+				   struct netlink_ext_ack *extack)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
 	struct dsa_switch *ds = cpu_dp->ds;
 	struct dsa_switch_tree *dst;
-	int err = -EOPNOTSUPP;
 	struct dsa_port *dp;
 
 	dst = ds->dst;
 
-	switch (cmd) {
-	case SIOCGHWTSTAMP:
-	case SIOCSHWTSTAMP:
-		/* Deny PTP operations on master if there is at least one
-		 * switch in the tree that is PTP capable.
-		 */
-		list_for_each_entry(dp, &dst->ports, list)
-			if (dsa_port_supports_hwtstamp(dp))
-				return -EBUSY;
-		break;
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (dsa_port_supports_hwtstamp(dp)) {
+			NL_SET_ERR_MSG(extack,
+				       "HW timestamping not allowed on DSA master when switch supports the operation");
+			return -EBUSY;
+		}
 	}
 
-	if (dev->netdev_ops->ndo_eth_ioctl)
-		err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd);
-
-	return err;
+	return 0;
 }
 
-static const struct dsa_netdevice_ops dsa_netdev_ops = {
-	.ndo_eth_ioctl = dsa_master_ioctl,
-};
-
 static int dsa_master_ethtool_setup(struct net_device *dev)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -267,15 +260,6 @@ static void dsa_master_ethtool_teardown(struct net_device *dev)
 	cpu_dp->orig_ethtool_ops = NULL;
 }
 
-static void dsa_netdev_ops_set(struct net_device *dev,
-			       const struct dsa_netdevice_ops *ops)
-{
-	if (netif_is_lag_master(dev))
-		return;
-
-	dev->dsa_ptr->netdev_ops = ops;
-}
-
 /* Keep the master always promiscuous if the tagging protocol requires that
  * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
  * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
@@ -414,16 +398,13 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
 	if (ret)
 		goto out_err_reset_promisc;
 
-	dsa_netdev_ops_set(dev, &dsa_netdev_ops);
-
 	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
 	if (ret)
-		goto out_err_ndo_teardown;
+		goto out_err_ethtool_teardown;
 
 	return ret;
 
-out_err_ndo_teardown:
-	dsa_netdev_ops_set(dev, NULL);
+out_err_ethtool_teardown:
 	dsa_master_ethtool_teardown(dev);
 out_err_reset_promisc:
 	dsa_master_set_promiscuity(dev, -1);
@@ -433,7 +414,6 @@ out_err_reset_promisc:
 void dsa_master_teardown(struct net_device *dev)
 {
 	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
-	dsa_netdev_ops_set(dev, NULL);
 	dsa_master_ethtool_teardown(dev);
 	dsa_master_reset_mtu(dev);
 	dsa_master_set_promiscuity(dev, -1);
diff --git a/net/dsa/master.h b/net/dsa/master.h
index 3fc0e610b5b5..80842f4e27f7 100644
--- a/net/dsa/master.h
+++ b/net/dsa/master.h
@@ -15,5 +15,8 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
 			 struct netlink_ext_ack *extack);
 void dsa_master_lag_teardown(struct net_device *lag_dev,
 			     struct dsa_port *cpu_dp);
+int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+				   const struct kernel_hwtstamp_config *config,
+				   struct netlink_ext_ack *extack);
 
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 165bb2cb8431..8abc1658ac47 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -3289,6 +3289,7 @@ static int dsa_master_changeupper(struct net_device *dev,
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
 {
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
@@ -3418,6 +3419,16 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 
 		return NOTIFY_OK;
 	}
+	case NETDEV_PRE_CHANGE_HWTSTAMP: {
+		struct netdev_notifier_hwtstamp_info *info = ptr;
+		int err;
+
+		if (!netdev_uses_dsa(dev))
+			return NOTIFY_DONE;
+
+		err = dsa_master_pre_change_hwtstamp(dev, info->config, extack);
+		return notifier_from_errno(err);
+	}
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 657de1cf258dbe2489906c81bd91e4af536de255 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 2 Apr 2023 17:16:56 +0200
Subject: net: phy: smsc: add support for edpd tunable

This adds support for the EDPD PHY tunable.
Per default EDPD is disabled in interrupt mode, the tunable can be used
to override this, e.g. if the link partner doesn't use EDPD.
The interval to check for energy can be chosen between 1000ms and
2000ms. Note that this value consists of the 1000ms phylib interval
for state machine runs plus the time to wait for energy being detected.

v2:
- consider that phylib core holds phydev->lock when calling the
  phy tunable hooks

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/smsc.c  | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/smscphy.h |  4 +++
 2 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 659a3ab10d82..0eba69ad5745 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -34,6 +34,8 @@
 #define SPECIAL_CTRL_STS_AMDIX_STATE_	0x2000
 
 #define EDPD_MAX_WAIT_DFLT_MS		640
+/* interval between phylib state machine runs in ms */
+#define PHY_STATE_MACH_MS		1000
 
 struct smsc_hw_stat {
 	const char *string;
@@ -295,6 +297,79 @@ static void smsc_get_stats(struct phy_device *phydev,
 		data[i] = smsc_get_stat(phydev, i);
 }
 
+static int smsc_phy_get_edpd(struct phy_device *phydev, u16 *edpd)
+{
+	struct smsc_phy_priv *priv = phydev->priv;
+
+	if (!priv)
+		return -EOPNOTSUPP;
+
+	if (!priv->edpd_enable)
+		*edpd = ETHTOOL_PHY_EDPD_DISABLE;
+	else if (!priv->edpd_max_wait_ms)
+		*edpd = ETHTOOL_PHY_EDPD_NO_TX;
+	else
+		*edpd = PHY_STATE_MACH_MS + priv->edpd_max_wait_ms;
+
+	return 0;
+}
+
+static int smsc_phy_set_edpd(struct phy_device *phydev, u16 edpd)
+{
+	struct smsc_phy_priv *priv = phydev->priv;
+
+	if (!priv)
+		return -EOPNOTSUPP;
+
+	switch (edpd) {
+	case ETHTOOL_PHY_EDPD_DISABLE:
+		priv->edpd_enable = false;
+		break;
+	case ETHTOOL_PHY_EDPD_NO_TX:
+		priv->edpd_enable = true;
+		priv->edpd_max_wait_ms = 0;
+		break;
+	case ETHTOOL_PHY_EDPD_DFLT_TX_MSECS:
+		edpd = PHY_STATE_MACH_MS + EDPD_MAX_WAIT_DFLT_MS;
+		fallthrough;
+	default:
+		if (phydev->irq != PHY_POLL)
+			return -EOPNOTSUPP;
+		if (edpd < PHY_STATE_MACH_MS || edpd > PHY_STATE_MACH_MS + 1000)
+			return -EINVAL;
+		priv->edpd_enable = true;
+		priv->edpd_max_wait_ms = edpd - PHY_STATE_MACH_MS;
+	}
+
+	priv->edpd_mode_set_by_user = true;
+
+	return smsc_phy_config_edpd(phydev);
+}
+
+int smsc_phy_get_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, void *data)
+{
+	switch (tuna->id) {
+	case ETHTOOL_PHY_EDPD:
+		return smsc_phy_get_edpd(phydev, data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(smsc_phy_get_tunable);
+
+int smsc_phy_set_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, const void *data)
+{
+	switch (tuna->id) {
+	case ETHTOOL_PHY_EDPD:
+		return smsc_phy_set_edpd(phydev, *(u16 *)data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(smsc_phy_set_tunable);
+
 int smsc_phy_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h
index 80f37c1dba58..e1c88627755a 100644
--- a/include/linux/smscphy.h
+++ b/include/linux/smscphy.h
@@ -32,6 +32,10 @@ int smsc_phy_config_intr(struct phy_device *phydev);
 irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev);
 int smsc_phy_config_init(struct phy_device *phydev);
 int lan87xx_read_status(struct phy_device *phydev);
+int smsc_phy_get_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, void *data);
+int smsc_phy_set_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, const void *data);
 int smsc_phy_probe(struct phy_device *phydev);
 
 #endif /* __LINUX_SMSCPHY_H__ */
-- 
cgit v1.2.3


From 6ae930d9dbf2d093157be33428538c91966d8a9f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 27 Mar 2023 20:22:51 +0200
Subject: pid: add pidfd_prepare()

Add a new helper that allows to reserve a pidfd and allocates a new
pidfd file that stashes the provided struct pid. This will allow us to
remove places that either open code this function or that call
pidfd_create() but then have to call close_fd() because there are still
failure points after pidfd_create() has been called.

Reviewed-by: Jan Kara <jack@suse.cz>
Message-Id: <20230327-pidfd-file-api-v1-1-5c0e9a3158e4@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h |  1 +
 kernel/fork.c       | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/pid.c        | 19 +++++-------
 3 files changed, 93 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 343abf22092e..b75de288a8c2 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -80,6 +80,7 @@ extern struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
 int pidfd_create(struct pid *pid, unsigned int flags);
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..a34395bd708a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1951,6 +1951,91 @@ const struct file_operations pidfd_fops = {
 #endif
 };
 
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid:   the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ *         pidfd file is returned in the last argument to the function. On
+ *         error, a negative error code is returned from the function and the
+ *         last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+	int pidfd;
+	struct file *pidfd_file;
+
+	if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+		return -EINVAL;
+
+	pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (pidfd < 0)
+		return pidfd;
+
+	pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+					flags | O_RDWR | O_CLOEXEC);
+	if (IS_ERR(pidfd_file)) {
+		put_unused_fd(pidfd);
+		return PTR_ERR(pidfd_file);
+	}
+	get_pid(pid); /* held by pidfd_file now */
+	*ret = pidfd_file;
+	return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid:   the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ *         pidfd file is returned in the last argument to the function. On
+ *         error, a negative error code is returned from the function and the
+ *         last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+		return -EINVAL;
+
+	return __pidfd_prepare(pid, flags, ret);
+}
+
 static void __delayed_free_task(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..f93954a0384d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
  */
 int pidfd_create(struct pid *pid, unsigned int flags)
 {
-	int fd;
-
-	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
-		return -EINVAL;
+	int pidfd;
+	struct file *pidfd_file;
 
-	if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
-		return -EINVAL;
-
-	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
-			      flags | O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		put_pid(pid);
+	pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+	if (pidfd < 0)
+		return pidfd;
 
-	return fd;
+	fd_install(pidfd, pidfd_file);
+	return pidfd;
 }
 
 /**
-- 
cgit v1.2.3


From f98e0640c5c6b8bb00336dae8d06ede862754c28 Mon Sep 17 00:00:00 2001
From: Bastien Nocera <hadess@hadess.net>
Date: Thu, 2 Mar 2023 11:55:53 +0100
Subject: USB: core: Add wireless_status sysfs attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a wireless_status sysfs attribute to USB devices to keep track of
whether a USB device that's comprised of a receiver dongle and an emitter
device over a, most of the time proprietary, wireless link has its emitter
connected or disconnected.

This will be used by user-space OS components to determine whether the
battery-powered part of the device is wirelessly connected or not,
allowing, for example:
- upower to hide the battery for devices where the device is turned off
  but the receiver plugged in, rather than showing 0%, or other values
  that could be confusing to users
- Pipewire to hide a headset from the list of possible inputs or outputs
  or route audio appropriately if the headset is suddenly turned off, or
  turned on
- libinput to determine whether a keyboard or mouse is present when its
  receiver is plugged in.

This is done at the USB interface level as:
- the interface on which the wireless status is detected is sometimes
  not the same as where it could be consumed (eg. the audio interface
  on a headset dongle will still appear even if the headset is turned
  off), and we cannot have synchronisation of status across subsystems.
- this behaviour is not specific to HID devices, even if the protocols
  used to determine whether or not the remote device is connected can
  be HID.

This is not an attribute that is meant to replace protocol specific
APIs, such as the ones available for WWAN, WLAN/Wi-Fi, or Bluetooth
or any other sort of networking, but solely for wireless devices with
an ad-hoc “lose it and your device is e-waste” receiver dongle.

The USB interface will only be exporting the wireless_status sysfs
attribute if it gets set through the API exported in the next commit.

Signed-off-by: Bastien Nocera <hadess@hadess.net>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20230302105555.51417-4-hadess@hadess.net
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 Documentation/ABI/testing/sysfs-bus-usb | 17 +++++++++++
 drivers/usb/core/sysfs.c                | 50 +++++++++++++++++++++++++++++++++
 drivers/usb/core/usb.h                  |  1 +
 include/linux/usb.h                     |  9 ++++++
 4 files changed, 77 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index 545c2dd97ed0..cb172db41b34 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -166,6 +166,23 @@ Description:
 		The file will be present for all speeds of USB devices, and will
 		always read "no" for USB 1.1 and USB 2.0 devices.
 
+What:		/sys/bus/usb/devices/<INTERFACE>/wireless_status
+Date:		February 2023
+Contact:	Bastien Nocera <hadess@hadess.net>
+Description:
+		Some USB devices use a USB receiver dongle to communicate
+		wirelessly with their device using proprietary protocols. This
+		attribute allows user-space to know whether the device is
+		connected to its receiver dongle, and, for example, consider
+		the device to be absent when choosing whether to show the
+		device's battery, show a headset in a list of outputs, or show
+		an on-screen keyboard if the only wireless keyboard is
+		turned off.
+		This attribute is not to be used to replace protocol specific
+		statuses available in WWAN, WLAN/Wi-Fi, Bluetooth, etc.
+		If the device does not use a receiver dongle with a wireless
+		device, then this attribute will not exist.
+
 What:		/sys/bus/usb/devices/.../<hub_interface>/port<X>
 Date:		August 2012
 Contact:	Lan Tianyu <tianyu.lan@intel.com>
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index b63f78e48c74..323dc02becbe 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -1227,9 +1227,59 @@ static const struct attribute_group intf_assoc_attr_grp = {
 	.is_visible =	intf_assoc_attrs_are_visible,
 };
 
+static ssize_t wireless_status_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct usb_interface *intf;
+
+	intf = to_usb_interface(dev);
+	if (intf->wireless_status == USB_WIRELESS_STATUS_DISCONNECTED)
+		return sysfs_emit(buf, "%s\n", "disconnected");
+	return sysfs_emit(buf, "%s\n", "connected");
+}
+static DEVICE_ATTR_RO(wireless_status);
+
+static struct attribute *intf_wireless_status_attrs[] = {
+	&dev_attr_wireless_status.attr,
+	NULL
+};
+
+static umode_t intf_wireless_status_attr_is_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct usb_interface *intf = to_usb_interface(dev);
+
+	if (a != &dev_attr_wireless_status.attr ||
+	    intf->wireless_status != USB_WIRELESS_STATUS_NA)
+		return a->mode;
+	return 0;
+}
+
+static const struct attribute_group intf_wireless_status_attr_grp = {
+	.attrs =	intf_wireless_status_attrs,
+	.is_visible =	intf_wireless_status_attr_is_visible,
+};
+
+int usb_update_wireless_status_attr(struct usb_interface *intf)
+{
+	struct device *dev = &intf->dev;
+	int ret;
+
+	ret = sysfs_update_group(&dev->kobj, &intf_wireless_status_attr_grp);
+	if (ret < 0)
+		return ret;
+
+	sysfs_notify(&dev->kobj, NULL, "wireless_status");
+	kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+
+	return 0;
+}
+
 const struct attribute_group *usb_interface_groups[] = {
 	&intf_attr_grp,
 	&intf_assoc_attr_grp,
+	&intf_wireless_status_attr_grp,
 	NULL
 };
 
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 0eac7d4285d1..3f14e15f07f6 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -15,6 +15,7 @@ extern int usb_create_sysfs_dev_files(struct usb_device *dev);
 extern void usb_remove_sysfs_dev_files(struct usb_device *dev);
 extern void usb_create_sysfs_intf_files(struct usb_interface *intf);
 extern void usb_remove_sysfs_intf_files(struct usb_interface *intf);
+extern int usb_update_wireless_status_attr(struct usb_interface *intf);
 extern int usb_create_ep_devs(struct device *parent,
 				struct usb_host_endpoint *endpoint,
 				struct usb_device *udev);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 9642ee02d713..eee54cc961b4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -170,6 +170,12 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt,
 	return usb_find_common_endpoints_reverse(alt, NULL, NULL, NULL, int_out);
 }
 
+enum usb_wireless_status {
+	USB_WIRELESS_STATUS_NA = 0,
+	USB_WIRELESS_STATUS_DISCONNECTED,
+	USB_WIRELESS_STATUS_CONNECTED,
+};
+
 /**
  * struct usb_interface - what usb device drivers talk to
  * @altsetting: array of interface structures, one for each alternate
@@ -203,6 +209,8 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt,
  * @reset_ws: Used for scheduling resets from atomic context.
  * @resetting_device: USB core reset the device, so use alt setting 0 as
  *	current; needs bandwidth alloc after reset.
+ * @wireless_status: if the USB device uses a receiver/emitter combo, whether
+ *	the emitter is connected.
  *
  * USB device drivers attach to interfaces on a physical device.  Each
  * interface encapsulates a single high level function, such as feeding
@@ -253,6 +261,7 @@ struct usb_interface {
 	unsigned needs_binding:1;	/* needs delayed unbind/rebind */
 	unsigned resetting_device:1;	/* true: bandwidth alloc after reset */
 	unsigned authorized:1;		/* used for interface authorization */
+	enum usb_wireless_status wireless_status;
 
 	struct device dev;		/* interface specific device info */
 	struct device *usb_dev;
-- 
cgit v1.2.3


From 0a4db185f0788dfc828512d0004c468921bf6c0a Mon Sep 17 00:00:00 2001
From: Bastien Nocera <hadess@hadess.net>
Date: Thu, 2 Mar 2023 11:55:54 +0100
Subject: USB: core: Add API to change the wireless_status

This adds the API that allows device specific drivers to tell user-space
about whether the wireless device is connected to its receiver dongle.

See "USB: core: Add wireless_status sysfs attribute" for a detailed
explanation of what this attribute should be used for.

Signed-off-by: Bastien Nocera <hadess@hadess.net>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20230302105555.51417-5-hadess@hadess.net
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/usb/core/message.c | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/usb.h        |  5 +++++
 2 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index cc404bb7e8f7..33392a6f4972 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1908,6 +1908,45 @@ static void __usb_queue_reset_device(struct work_struct *ws)
 	usb_put_intf(iface);	/* Undo _get_ in usb_queue_reset_device() */
 }
 
+/*
+ * Internal function to set the wireless_status sysfs attribute
+ * See usb_set_wireless_status() for more details
+ */
+static void __usb_wireless_status_intf(struct work_struct *ws)
+{
+	struct usb_interface *iface =
+		container_of(ws, struct usb_interface, wireless_status_work);
+
+	device_lock(iface->dev.parent);
+	if (iface->sysfs_files_created)
+		usb_update_wireless_status_attr(iface);
+	device_unlock(iface->dev.parent);
+	usb_put_intf(iface);	/* Undo _get_ in usb_set_wireless_status() */
+}
+
+/**
+ * usb_set_wireless_status - sets the wireless_status struct member
+ * @dev: the device to modify
+ * @status: the new wireless status
+ *
+ * Set the wireless_status struct member to the new value, and emit
+ * sysfs changes as necessary.
+ *
+ * Returns: 0 on success, -EALREADY if already set.
+ */
+int usb_set_wireless_status(struct usb_interface *iface,
+		enum usb_wireless_status status)
+{
+	if (iface->wireless_status == status)
+		return -EALREADY;
+
+	usb_get_intf(iface);
+	iface->wireless_status = status;
+	schedule_work(&iface->wireless_status_work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usb_set_wireless_status);
 
 /*
  * usb_set_configuration - Makes a particular device setting be current
@@ -2100,6 +2139,7 @@ free_interfaces:
 		intf->dev.type = &usb_if_device_type;
 		intf->dev.groups = usb_interface_groups;
 		INIT_WORK(&intf->reset_ws, __usb_queue_reset_device);
+		INIT_WORK(&intf->wireless_status_work, __usb_wireless_status_intf);
 		intf->minor = -1;
 		device_initialize(&intf->dev);
 		pm_runtime_no_callbacks(&intf->dev);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index eee54cc961b4..6dfd59d3d695 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -262,6 +262,7 @@ struct usb_interface {
 	unsigned resetting_device:1;	/* true: bandwidth alloc after reset */
 	unsigned authorized:1;		/* used for interface authorization */
 	enum usb_wireless_status wireless_status;
+	struct work_struct wireless_status_work;
 
 	struct device dev;		/* interface specific device info */
 	struct device *usb_dev;
@@ -896,6 +897,10 @@ static inline int usb_interface_claimed(struct usb_interface *iface)
 
 extern void usb_driver_release_interface(struct usb_driver *driver,
 			struct usb_interface *iface);
+
+int usb_set_wireless_status(struct usb_interface *iface,
+			enum usb_wireless_status status);
+
 const struct usb_device_id *usb_match_id(struct usb_interface *interface,
 					 const struct usb_device_id *id);
 extern int usb_match_one_id(struct usb_interface *interface,
-- 
cgit v1.2.3


From f033c26de5a5734625d2dd1dc196745fae186f1b Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 30 Mar 2023 01:10:24 +0100
Subject: regmap: Add maple tree based register cache

The current state of the art for sparse register maps is the
rbtree cache.  This works well for most applications but isn't
always ideal for sparser register maps since the rbtree can get
deep, requiring a lot of walking.  Fortunately the kernel has a
data structure intended to address this very problem, the maple
tree.  Provide an initial implementation of a register cache
based on the maple tree to start taking advantage of it.

The entries stored in the maple tree are arrays of register
values, with the maple tree keys holding the register addresses.
We store data in host native format rather than device native
format as we do for rbtree, this will be a benefit for devices
where we don't marshal data within regmap and simplifies the code
but will result in additional CPU overhead when syncing the cache
on devices where we do marshal data in regmap.

This should work well for a lot of devices, though there's some
additional areas that could be looked at such as caching the
last accessed entry like we do for rbtree and trying to minimise
the maple tree level locking. We should also use bulk writes
rather than single register writes when resyncing the cache where
possible, even if we don't store in device native format.

Very small register maps may continue to to better with rbtree
longer term.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230325-regcache-maple-v3-2-23e271f93dc7@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/Makefile         |   2 +-
 drivers/base/regmap/internal.h       |   1 +
 drivers/base/regmap/regcache-maple.c | 278 +++++++++++++++++++++++++++++++++++
 drivers/base/regmap/regcache.c       |   1 +
 drivers/base/regmap/regmap-kunit.c   |   3 +
 include/linux/regmap.h               |   1 +
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/regmap/regcache-maple.c

(limited to 'include')

diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 4cb73468a197..f6c6cb017200 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -3,7 +3,7 @@
 CFLAGS_regmap.o := -I$(src)
 
 obj-$(CONFIG_REGMAP) += regmap.o regcache.o
-obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o
+obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o regcache-maple.o
 obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o
 obj-$(CONFIG_REGMAP_KUNIT) += regmap-kunit.o
 obj-$(CONFIG_REGMAP_AC97) += regmap-ac97.o
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 7b9ef43bcea6..6361df6f553a 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -282,6 +282,7 @@ enum regmap_endian regmap_get_val_endian(struct device *dev,
 					 const struct regmap_config *config);
 
 extern struct regcache_ops regcache_rbtree_ops;
+extern struct regcache_ops regcache_maple_ops;
 extern struct regcache_ops regcache_flat_ops;
 
 static inline const char *regmap_name(const struct regmap *map)
diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c
new file mode 100644
index 000000000000..497cc708d277
--- /dev/null
+++ b/drivers/base/regmap/regcache-maple.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Register cache access API - maple tree based cache
+//
+// Copyright 2023 Arm, Ltd
+//
+// Author: Mark Brown <broonie@kernel.org>
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/maple_tree.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static int regcache_maple_read(struct regmap *map,
+			       unsigned int reg, unsigned int *value)
+{
+	struct maple_tree *mt = map->cache;
+	MA_STATE(mas, mt, reg, reg);
+	unsigned long *entry;
+
+	rcu_read_lock();
+
+	entry = mas_find(&mas, reg);
+	if (!entry) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	*value = entry[reg - mas.index];
+
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int regcache_maple_write(struct regmap *map, unsigned int reg,
+				unsigned int val)
+{
+	struct maple_tree *mt = map->cache;
+	MA_STATE(mas, mt, reg, reg);
+	unsigned long *entry, *upper, *lower;
+	unsigned long index, last;
+	size_t lower_sz, upper_sz;
+	int ret;
+
+	rcu_read_lock();
+
+	entry = mas_find(&mas, reg);
+	if (entry) {
+		entry[reg - mas.index] = val;
+		rcu_read_unlock();
+		return 0;
+	}
+
+	/* Any adjacent entries to extend/merge? */
+	mas_set_range(&mas, reg - 1, reg + 1);
+	index = reg;
+	last = reg;
+
+	lower = mas_find(&mas, reg - 1);
+	if (lower) {
+		index = mas.index;
+		lower_sz = (mas.last - mas.index + 1) * sizeof(unsigned long);
+	}
+
+	upper = mas_find(&mas, reg + 1);
+	if (upper) {
+		last = mas.last;
+		upper_sz = (mas.last - mas.index + 1) * sizeof(unsigned long);
+	}
+
+	rcu_read_unlock();
+
+	entry = kmalloc((last - index + 1) * sizeof(unsigned long),
+			GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	if (lower)
+		memcpy(entry, lower, lower_sz);
+	entry[reg - index] = val;
+	if (upper)
+		memcpy(&entry[reg - index + 1], upper, upper_sz);
+
+	/*
+	 * This is safe because the regmap lock means the Maple lock
+	 * is redundant, but we need to take it due to lockdep asserts
+	 * in the maple tree code.
+	 */
+	mas_lock(&mas);
+
+	mas_set_range(&mas, index, last);
+	ret = mas_store_gfp(&mas, entry, GFP_KERNEL);
+
+	mas_unlock(&mas);
+
+	if (ret == 0) {
+		kfree(lower);
+		kfree(upper);
+	}
+	
+	return ret;
+}
+
+static int regcache_maple_drop(struct regmap *map, unsigned int min,
+			       unsigned int max)
+{
+	struct maple_tree *mt = map->cache;
+	MA_STATE(mas, mt, min, max);
+	unsigned long *entry, *lower, *upper;
+	unsigned long lower_index, lower_last;
+	unsigned long upper_index, upper_last;
+	int ret;
+
+	lower = NULL;
+	upper = NULL;
+
+	mas_lock(&mas);
+
+	mas_for_each(&mas, entry, max) {
+		/*
+		 * This is safe because the regmap lock means the
+		 * Maple lock is redundant, but we need to take it due
+		 * to lockdep asserts in the maple tree code.
+		 */
+		mas_unlock(&mas);
+
+		/* Do we need to save any of this entry? */
+		if (mas.index < min) {
+			lower_index = mas.index;
+			lower_last = min -1;
+
+			lower = kmemdup(entry, ((min - mas.index) *
+						sizeof(unsigned long)),
+					GFP_KERNEL);
+			if (!lower) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		if (mas.last > max) {
+			upper_index = max + 1;
+			upper_last = mas.last;
+
+			upper = kmemdup(&entry[max + 1],
+					((mas.last - max) *
+					 sizeof(unsigned long)),
+					GFP_KERNEL);
+			if (!upper) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		kfree(entry);
+		mas_lock(&mas);
+		mas_erase(&mas);
+
+		/* Insert new nodes with the saved data */
+		if (lower) {
+			mas_set_range(&mas, lower_index, lower_last);
+			ret = mas_store_gfp(&mas, lower, GFP_KERNEL);
+			if (ret != 0)
+				goto out;
+			lower = NULL;
+		}
+
+		if (upper) {
+			mas_set_range(&mas, upper_index, upper_last);
+			ret = mas_store_gfp(&mas, upper, GFP_KERNEL);
+			if (ret != 0)
+				goto out;
+			upper = NULL;
+		}
+	}
+
+out:
+	mas_unlock(&mas);
+	kfree(lower);
+	kfree(upper);
+
+	return ret;
+}
+
+static int regcache_maple_sync(struct regmap *map, unsigned int min,
+			       unsigned int max)
+{
+	struct maple_tree *mt = map->cache;
+	unsigned long *entry;
+	MA_STATE(mas, mt, min, max);
+	unsigned long lmin = min;
+	unsigned long lmax = max;
+	unsigned int r;
+	int ret;
+
+	map->cache_bypass = true;
+
+	rcu_read_lock();
+
+	mas_for_each(&mas, entry, max) {
+		for (r = max(mas.index, lmin); r <= min(mas.last, lmax); r++) {
+			ret = regcache_sync_val(map, r, entry[r - mas.index]);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+
+	map->cache_bypass = false;
+
+	return ret;
+}
+
+static int regcache_maple_exit(struct regmap *map)
+{
+	struct maple_tree *mt = map->cache;
+	MA_STATE(mas, mt, 0, UINT_MAX);
+	unsigned int *entry;;
+
+	/* if we've already been called then just return */
+	if (!mt)
+		return 0;
+
+	mas_lock(&mas);
+	mas_for_each(&mas, entry, UINT_MAX)
+		kfree(entry);
+	__mt_destroy(mt);
+	mas_unlock(&mas);
+
+	kfree(mt);
+	map->cache = NULL;
+
+	return 0;
+}
+
+static int regcache_maple_init(struct regmap *map)
+{
+	struct maple_tree *mt;
+	int i;
+	int ret;
+
+	mt = kmalloc(sizeof(*mt), GFP_KERNEL);
+	if (!mt)
+		return -ENOMEM;
+	map->cache = mt;
+
+	mt_init(mt);
+
+	for (i = 0; i < map->num_reg_defaults; i++) {
+		ret = regcache_maple_write(map,
+					   map->reg_defaults[i].reg,
+					   map->reg_defaults[i].def);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	regcache_maple_exit(map);
+	return ret;
+}
+
+struct regcache_ops regcache_maple_ops = {
+	.type = REGCACHE_MAPLE,
+	.name = "maple",
+	.init = regcache_maple_init,
+	.exit = regcache_maple_exit,
+	.read = regcache_maple_read,
+	.write = regcache_maple_write,
+	.drop = regcache_maple_drop,
+	.sync = regcache_maple_sync,
+};
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index a5f11bcc1215..029564695dbb 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -17,6 +17,7 @@
 
 static const struct regcache_ops *cache_types[] = {
 	&regcache_rbtree_ops,
+	&regcache_maple_ops,
 	&regcache_flat_ops,
 };
 
diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c
index c78f45cf9a8d..f76d41688134 100644
--- a/drivers/base/regmap/regmap-kunit.c
+++ b/drivers/base/regmap/regmap-kunit.c
@@ -29,6 +29,7 @@ static const struct regcache_types regcache_types_list[] = {
 	{ REGCACHE_NONE, "none" },
 	{ REGCACHE_FLAT, "flat" },
 	{ REGCACHE_RBTREE, "rbtree" },
+	{ REGCACHE_MAPLE, "maple" },
 };
 
 KUNIT_ARRAY_PARAM(regcache_types, regcache_types_list, case_to_desc);
@@ -36,12 +37,14 @@ KUNIT_ARRAY_PARAM(regcache_types, regcache_types_list, case_to_desc);
 static const struct regcache_types real_cache_types_list[] = {
 	{ REGCACHE_FLAT, "flat" },
 	{ REGCACHE_RBTREE, "rbtree" },
+	{ REGCACHE_MAPLE, "maple" },
 };
 
 KUNIT_ARRAY_PARAM(real_cache_types, real_cache_types_list, case_to_desc);
 
 static const struct regcache_types sparse_cache_types_list[] = {
 	{ REGCACHE_RBTREE, "rbtree" },
+	{ REGCACHE_MAPLE, "maple" },
 };
 
 KUNIT_ARRAY_PARAM(sparse_cache_types, sparse_cache_types_list, case_to_desc);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index b63204d51b35..4d55ac88ba9e 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -51,6 +51,7 @@ enum regcache_type {
 	REGCACHE_NONE,
 	REGCACHE_RBTREE,
 	REGCACHE_FLAT,
+	REGCACHE_MAPLE,
 };
 
 /**
-- 
cgit v1.2.3


From d8aeb44a9ae324c4b823689fabb30b6621d93c88 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 7 Mar 2023 09:40:28 -0700
Subject: fs: add FMODE_DIO_PARALLEL_WRITE flag

Some filesystems support multiple threads writing to the same file with
O_DIRECT without requiring exclusive access to it. io_uring can use this
hint to avoid serializing dio writes to this inode, instead allowing them
to run in parallel.

XFS and ext4 both fall into this category, so set the flag for both of
them.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/ext4/file.c     | 3 ++-
 fs/xfs/xfs_file.c  | 3 ++-
 include/linux/fs.h | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0b8b4499e5ca..d101b3b0c7da 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
 			return ret;
 	}
 
-	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
+			FMODE_DIO_PARALLEL_WRITE;
 	return dquot_file_open(inode, filp);
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 705250f9f90a..863289aaa441 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1171,7 +1171,8 @@ xfs_file_open(
 {
 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 		return -EIO;
-	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
+			FMODE_DIO_PARALLEL_WRITE;
 	return generic_file_open(inode, file);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..475d88640d3d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 
 #define	FMODE_NOREUSE		((__force fmode_t)0x800000)
 
+/* File supports non-exclusive O_DIRECT writes from multiple threads */
+#define FMODE_DIO_PARALLEL_WRITE	((__force fmode_t)0x1000000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
 
-- 
cgit v1.2.3


From 81cf17cd3ab3e5441e876a8e9e9c38ae9920cecb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 14 Mar 2023 11:01:45 -0600
Subject: io_uring/kbuf: rename struct io_uring_buf_reg 'pad' to'flags'

In preparation for allowing flags to be set for registration, rename
the padding and use it for that.

Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 io_uring/kbuf.c               | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 709de6d4feb2..c3f3ea997f3a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -640,7 +640,7 @@ struct io_uring_buf_reg {
 	__u64	ring_addr;
 	__u32	ring_entries;
 	__u16	bgid;
-	__u16	pad;
+	__u16	flags;
 	__u64	resv[3];
 };
 
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index db5f189267b7..4b2f4a0ee962 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -494,7 +494,9 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 
-	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+		return -EINVAL;
+	if (reg.flags)
 		return -EINVAL;
 	if (!reg.ring_addr)
 		return -EFAULT;
@@ -544,7 +546,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+		return -EINVAL;
+	if (reg.flags)
 		return -EINVAL;
 
 	bl = io_buffer_get_list(ctx, reg.bgid);
-- 
cgit v1.2.3


From c56e022c0a27142b7b59ae6bdf45f86bf4b298a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 14 Mar 2023 11:07:19 -0600
Subject: io_uring: add support for user mapped provided buffer ring

The ring mapped provided buffer rings rely on the application allocating
the memory for the ring, and then the kernel will map it. This generally
works fine, but runs into issues on some architectures where we need
to be able to ensure that the kernel and application virtual address for
the ring play nicely together. This at least impacts architectures that
set SHM_COLOUR, but potentially also anyone setting SHMLBA.

To use this variant of ring provided buffers, the application need not
allocate any memory for the ring. Instead the kernel will do so, and
the allocation must subsequently call mmap(2) on the ring with the
offset set to:

	IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)

to get a virtual address for the buffer ring. Normally the application
would allocate a suitable piece of memory (and correctly aligned) and
simply pass that in via io_uring_buf_reg.ring_addr and the kernel would
map it.

Outside of the setup differences, the kernel allocate + user mapped
provided buffer ring works exactly the same.

Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 17 ++++++++
 io_uring/io_uring.c           | 13 +++++-
 io_uring/kbuf.c               | 99 +++++++++++++++++++++++++++++++++----------
 io_uring/kbuf.h               |  4 ++
 4 files changed, 109 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index c3f3ea997f3a..1d59c816a5b8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -389,6 +389,9 @@ enum {
 #define IORING_OFF_SQ_RING		0ULL
 #define IORING_OFF_CQ_RING		0x8000000ULL
 #define IORING_OFF_SQES			0x10000000ULL
+#define IORING_OFF_PBUF_RING		0x80000000ULL
+#define IORING_OFF_PBUF_SHIFT		16
+#define IORING_OFF_MMAP_MASK		0xf8000000ULL
 
 /*
  * Filled with the offset for mmap(2)
@@ -635,6 +638,20 @@ struct io_uring_buf_ring {
 	};
 };
 
+/*
+ * Flags for IORING_REGISTER_PBUF_RING.
+ *
+ * IOU_PBUF_RING_MMAP:	If set, kernel will allocate the memory for the ring.
+ *			The application must not set a ring_addr in struct
+ *			io_uring_buf_reg, instead it must subsequently call
+ *			mmap(2) with the offset set as:
+ *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
+ *			to get a virtual mapping for the ring.
+ */
+enum {
+	IOU_PBUF_RING_MMAP	= 1,
+};
+
 /* argument for IORING_(UN)REGISTER_PBUF_RING */
 struct io_uring_buf_reg {
 	__u64	ring_addr;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b49b7ee12d60..d72aa92ce2d6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3289,7 +3289,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
 
-	switch (offset) {
+	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
 		ptr = ctx->rings;
@@ -3297,6 +3297,17 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	case IORING_OFF_SQES:
 		ptr = ctx->sq_sqes;
 		break;
+	case IORING_OFF_PBUF_RING: {
+		unsigned int bgid;
+
+		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+		mutex_lock(&ctx->uring_lock);
+		ptr = io_pbuf_get_address(ctx, bgid);
+		mutex_unlock(&ctx->uring_lock);
+		if (!ptr)
+			return ERR_PTR(-EINVAL);
+		break;
+		}
 	default:
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4b2f4a0ee962..cd1d9dddf58e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		return NULL;
 
 	head &= bl->mask;
-	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+	/* mmaped buffers are always contig */
+	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
 		buf = &br->bufs[head];
 	} else {
 		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (!nbufs)
 		return 0;
 
-	if (bl->is_mapped && bl->buf_nr_pages) {
-		int j;
-
+	if (bl->is_mapped) {
 		i = bl->buf_ring->tail - bl->head;
-		for (j = 0; j < bl->buf_nr_pages; j++)
-			unpin_user_page(bl->buf_pages[j]);
-		kvfree(bl->buf_pages);
-		bl->buf_pages = NULL;
-		bl->buf_nr_pages = 0;
+		if (bl->is_mmap) {
+			if (bl->buf_ring) {
+				struct page *page;
+
+				page = virt_to_head_page(bl->buf_ring);
+				if (put_page_testzero(page))
+					free_compound_page(page);
+				bl->buf_ring = NULL;
+			}
+			bl->is_mmap = 0;
+		} else if (bl->buf_nr_pages) {
+			int j;
+
+			for (j = 0; j < bl->buf_nr_pages; j++)
+				unpin_user_page(bl->buf_pages[j]);
+			kvfree(bl->buf_pages);
+			bl->buf_pages = NULL;
+			bl->buf_nr_pages = 0;
+		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
 		bl->is_mapped = 0;
@@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	bl->buf_nr_pages = nr_pages;
 	bl->buf_ring = br;
 	bl->is_mapped = 1;
+	bl->is_mmap = 0;
+	return 0;
+}
+
+static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+			      struct io_buffer_list *bl)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	size_t ring_size;
+	void *ptr;
+
+	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
+	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+	if (!ptr)
+		return -ENOMEM;
+
+	bl->buf_ring = ptr;
+	bl->is_mapped = 1;
+	bl->is_mmap = 1;
 	return 0;
 }
 
@@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 
 	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 		return -EINVAL;
-	if (reg.flags)
-		return -EINVAL;
-	if (!reg.ring_addr)
-		return -EFAULT;
-	if (reg.ring_addr & ~PAGE_MASK)
+	if (reg.flags & ~IOU_PBUF_RING_MMAP)
 		return -EINVAL;
+	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+		if (!reg.ring_addr)
+			return -EFAULT;
+		if (reg.ring_addr & ~PAGE_MASK)
+			return -EINVAL;
+	} else {
+		if (reg.ring_addr)
+			return -EINVAL;
+	}
+
 	if (!is_power_of_2(reg.ring_entries))
 		return -EINVAL;
 
@@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 			return -ENOMEM;
 	}
 
-	ret = io_pin_pbuf_ring(&reg, bl);
-	if (ret) {
-		kfree(free_bl);
-		return ret;
-	}
+	if (!(reg.flags & IOU_PBUF_RING_MMAP))
+		ret = io_pin_pbuf_ring(&reg, bl);
+	else
+		ret = io_alloc_pbuf_ring(&reg, bl);
 
-	bl->nr_entries = reg.ring_entries;
-	bl->mask = reg.ring_entries - 1;
+	if (!ret) {
+		bl->nr_entries = reg.ring_entries;
+		bl->mask = reg.ring_entries - 1;
 
-	io_buffer_add_list(ctx, bl, reg.bgid);
-	return 0;
+		io_buffer_add_list(ctx, bl, reg.bgid);
+		return 0;
+	}
+
+	kfree(free_bl);
+	return ret;
 }
 
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	}
 	return 0;
 }
+
+void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+{
+	struct io_buffer_list *bl;
+
+	bl = io_buffer_get_list(ctx, bgid);
+	if (!bl || !bl->is_mmap)
+		return NULL;
+
+	return bl->buf_ring;
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 61b9c7dade9d..d14345ef61fc 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -26,6 +26,8 @@ struct io_buffer_list {
 
 	/* ring mapped provided buffers */
 	__u8 is_mapped;
+	/* ring mapped provided buffers, but mmap'ed by application */
+	__u8 is_mmap;
 };
 
 struct io_buffer {
@@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
+void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+
 static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
 {
 	/*
-- 
cgit v1.2.3


From efba1a9e653e107577a48157b5424878c46f2285 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 23 Feb 2023 08:43:52 -0800
Subject: io_uring: Move from hlist to io_wq_work_node

Having cache entries linked using the hlist format brings no benefit, and
also requires an unnecessary extra pointer address per cache entry.

Use the internal io_wq_work_node single-linked list for the internal
alloc caches (async_msghdr and async_poll)

This is required to be able to use KASAN on cache entries, since we do
not need to touch unused (and poisoned) cache entries when adding more
entries to the list.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/alloc_cache.h         | 24 +++++++++++++-----------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 00689c12f6ab..757105ebd8af 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -188,7 +188,7 @@ struct io_ev_fd {
 };
 
 struct io_alloc_cache {
-	struct hlist_head	list;
+	struct io_wq_work_node	list;
 	unsigned int		nr_cached;
 };
 
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index c2cde88aeed5..aaa838c31d92 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -7,7 +7,7 @@
 #define IO_ALLOC_CACHE_MAX	512
 
 struct io_cache_entry {
-	struct hlist_node	node;
+	struct io_wq_work_node node;
 };
 
 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
@@ -15,7 +15,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 {
 	if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
 		cache->nr_cached++;
-		hlist_add_head(&entry->node, &cache->list);
+		wq_stack_add_head(&entry->node, &cache->list);
 		return true;
 	}
 	return false;
@@ -23,12 +23,13 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 
 static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
 {
-	if (!hlist_empty(&cache->list)) {
-		struct hlist_node *node = cache->list.first;
+	if (cache->list.next) {
+		struct io_cache_entry *entry;
 
-		hlist_del(node);
+		entry = container_of(cache->list.next, struct io_cache_entry, node);
+		cache->list.next = cache->list.next->next;
 		cache->nr_cached--;
-		return container_of(node, struct io_cache_entry, node);
+		return entry;
 	}
 
 	return NULL;
@@ -36,18 +37,19 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
 
 static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
 {
-	INIT_HLIST_HEAD(&cache->list);
+	cache->list.next = NULL;
 	cache->nr_cached = 0;
 }
 
 static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
 					void (*free)(struct io_cache_entry *))
 {
-	while (!hlist_empty(&cache->list)) {
-		struct hlist_node *node = cache->list.first;
+	while (1) {
+		struct io_cache_entry *entry = io_alloc_cache_get(cache);
 
-		hlist_del(node);
-		free(container_of(node, struct io_cache_entry, node));
+		if (!entry)
+			break;
+		free(entry);
 	}
 	cache->nr_cached = 0;
 }
-- 
cgit v1.2.3


From e1fe7ee885dc0712e982ee465d9f8b96254c30c1 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 23 Feb 2023 08:43:53 -0800
Subject: io_uring: Add KASAN support for alloc_caches

Add support for KASAN in the alloc_caches (apoll and netmsg_cache).
Thus, if something touches the unused caches, it will raise a KASAN
warning/exception.

It poisons the object when the object is put to the cache, and unpoisons
it when the object is gotten or freed.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 1 +
 io_uring/alloc_cache.h         | 6 +++++-
 io_uring/io_uring.c            | 4 ++--
 io_uring/net.h                 | 5 ++++-
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 757105ebd8af..3d152bdcd30a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -190,6 +190,7 @@ struct io_ev_fd {
 struct io_alloc_cache {
 	struct io_wq_work_node	list;
 	unsigned int		nr_cached;
+	size_t			elem_size;
 };
 
 struct io_ring_ctx {
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index aaa838c31d92..2fbecaa3a1ba 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,6 +16,8 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 	if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
 		cache->nr_cached++;
 		wq_stack_add_head(&entry->node, &cache->list);
+		/* KASAN poisons object */
+		kasan_slab_free_mempool(entry);
 		return true;
 	}
 	return false;
@@ -27,6 +29,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
 		struct io_cache_entry *entry;
 
 		entry = container_of(cache->list.next, struct io_cache_entry, node);
+		kasan_unpoison_range(entry, cache->elem_size);
 		cache->list.next = cache->list.next->next;
 		cache->nr_cached--;
 		return entry;
@@ -35,10 +38,11 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
 	return NULL;
 }
 
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
+static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size)
 {
 	cache->list.next = NULL;
 	cache->nr_cached = 0;
+	cache->elem_size = size;
 }
 
 static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d72aa92ce2d6..24be4992821b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,8 +310,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	io_alloc_cache_init(&ctx->apoll_cache);
-	io_alloc_cache_init(&ctx->netmsg_cache);
+	io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
+	io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
diff --git a/io_uring/net.h b/io_uring/net.h
index 5ffa11bf5d2e..191009979bcb 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -5,8 +5,8 @@
 
 #include "alloc_cache.h"
 
-#if defined(CONFIG_NET)
 struct io_async_msghdr {
+#if defined(CONFIG_NET)
 	union {
 		struct iovec		fast_iov[UIO_FASTIOV];
 		struct {
@@ -22,8 +22,11 @@ struct io_async_msghdr {
 	struct sockaddr __user		*uaddr;
 	struct msghdr			msg;
 	struct sockaddr_storage		addr;
+#endif
 };
 
+#if defined(CONFIG_NET)
+
 struct io_async_connect {
 	struct sockaddr_storage		address;
 };
-- 
cgit v1.2.3


From d322818ef4c752d79cd667474418691237aa9ccf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 27 Mar 2023 16:34:48 +0100
Subject: io_uring: kill unused notif declarations

There are two leftover structures from the notification registration
mechanism that has never been released, kill them.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f05f65aebaf8b1b5bf28519a8fdb350e3e7c9ad0.1679924536.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1d59c816a5b8..f8d14d1c58d3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -571,19 +571,6 @@ struct io_uring_rsrc_update2 {
 	__u32 resv2;
 };
 
-struct io_uring_notification_slot {
-	__u64 tag;
-	__u64 resv[3];
-};
-
-struct io_uring_notification_register {
-	__u32 nr_slots;
-	__u32 resv;
-	__u64 resv2;
-	__u64 data;
-	__u64 resv3;
-};
-
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
-- 
cgit v1.2.3


From a282967c848fb1d92c28334430c472da9c334e54 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 27 Mar 2023 16:38:15 +0100
Subject: io_uring: encapsulate task_work state

For task works we're passing around a bool pointer for whether the
current ring is locked or not, let's wrap it in a structure, that
will make it more opaque preventing abuse and will also help us
to pass more info in the future if needed.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  7 ++++-
 io_uring/io_uring.c            | 71 +++++++++++++++++++++---------------------
 io_uring/io_uring.h            | 14 ++++-----
 io_uring/notif.c               |  4 +--
 io_uring/poll.c                | 32 +++++++++----------
 io_uring/rw.c                  |  6 ++--
 io_uring/timeout.c             | 14 ++++-----
 io_uring/uring_cmd.c           |  4 +--
 8 files changed, 79 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3d152bdcd30a..561fa421c453 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -367,6 +367,11 @@ struct io_ring_ctx {
 	unsigned			evfd_last_cq_tail;
 };
 
+struct io_tw_state {
+	/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
+	bool locked;
+};
+
 enum {
 	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
 	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
@@ -473,7 +478,7 @@ enum {
 	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
 };
 
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
 
 struct io_task_work {
 	struct llist_node		node;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2669aca0ba39..536940675c67 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -247,12 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 						fallback_work.work);
 	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 	struct io_kiocb *req, *tmp;
-	bool locked = true;
+	struct io_tw_state ts = { .locked = true, };
 
 	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
-		req->io_task_work.func(req, &locked);
-	if (WARN_ON_ONCE(!locked))
+		req->io_task_work.func(req, &ts);
+	if (WARN_ON_ONCE(!ts.locked))
 		return;
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
@@ -457,7 +457,7 @@ static void io_prep_async_link(struct io_kiocb *req)
 	}
 }
 
-void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
+void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 	struct io_uring_task *tctx = req->task->io_uring;
@@ -1153,22 +1153,23 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return nxt;
 }
 
-static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
+static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 {
 	if (!ctx)
 		return;
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (*locked) {
+	if (ts->locked) {
 		io_submit_flush_completions(ctx);
 		mutex_unlock(&ctx->uring_lock);
-		*locked = false;
+		ts->locked = false;
 	}
 	percpu_ref_put(&ctx->refs);
 }
 
 static unsigned int handle_tw_list(struct llist_node *node,
-				   struct io_ring_ctx **ctx, bool *locked,
+				   struct io_ring_ctx **ctx,
+				   struct io_tw_state *ts,
 				   struct llist_node *last)
 {
 	unsigned int count = 0;
@@ -1181,17 +1182,17 @@ static unsigned int handle_tw_list(struct llist_node *node,
 		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
 
 		if (req->ctx != *ctx) {
-			ctx_flush_and_put(*ctx, locked);
+			ctx_flush_and_put(*ctx, ts);
 			*ctx = req->ctx;
 			/* if not contended, grab and improve batching */
-			*locked = mutex_trylock(&(*ctx)->uring_lock);
+			ts->locked = mutex_trylock(&(*ctx)->uring_lock);
 			percpu_ref_get(&(*ctx)->refs);
 		}
-		req->io_task_work.func(req, locked);
+		req->io_task_work.func(req, ts);
 		node = next;
 		count++;
 		if (unlikely(need_resched())) {
-			ctx_flush_and_put(*ctx, locked);
+			ctx_flush_and_put(*ctx, ts);
 			*ctx = NULL;
 			cond_resched();
 		}
@@ -1232,7 +1233,7 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
 
 void tctx_task_work(struct callback_head *cb)
 {
-	bool uring_locked = false;
+	struct io_tw_state ts = {};
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
@@ -1249,12 +1250,12 @@ void tctx_task_work(struct callback_head *cb)
 	do {
 		loops++;
 		node = io_llist_xchg(&tctx->task_list, &fake);
-		count += handle_tw_list(node, &ctx, &uring_locked, &fake);
+		count += handle_tw_list(node, &ctx, &ts, &fake);
 
 		/* skip expensive cmpxchg if there are items in the list */
 		if (READ_ONCE(tctx->task_list.first) != &fake)
 			continue;
-		if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
+		if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
 			io_submit_flush_completions(ctx);
 			if (READ_ONCE(tctx->task_list.first) != &fake)
 				continue;
@@ -1262,7 +1263,7 @@ void tctx_task_work(struct callback_head *cb)
 		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	} while (node != &fake);
 
-	ctx_flush_and_put(ctx, &uring_locked);
+	ctx_flush_and_put(ctx, &ts);
 
 	/* relaxed read is enough as only the task itself sets ->in_cancel */
 	if (unlikely(atomic_read(&tctx->in_cancel)))
@@ -1351,7 +1352,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	}
 }
 
-static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
@@ -1368,7 +1369,7 @@ again:
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-		req->io_task_work.func(req, locked);
+		req->io_task_work.func(req, ts);
 		ret++;
 		node = next;
 	}
@@ -1376,7 +1377,7 @@ again:
 
 	if (!llist_empty(&ctx->work_llist))
 		goto again;
-	if (*locked) {
+	if (ts->locked) {
 		io_submit_flush_completions(ctx);
 		if (!llist_empty(&ctx->work_llist))
 			goto again;
@@ -1387,46 +1388,46 @@ again:
 
 static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
 {
-	bool locked;
+	struct io_tw_state ts = { .locked = true, };
 	int ret;
 
 	if (llist_empty(&ctx->work_llist))
 		return 0;
 
-	locked = true;
-	ret = __io_run_local_work(ctx, &locked);
+	ret = __io_run_local_work(ctx, &ts);
 	/* shouldn't happen! */
-	if (WARN_ON_ONCE(!locked))
+	if (WARN_ON_ONCE(!ts.locked))
 		mutex_lock(&ctx->uring_lock);
 	return ret;
 }
 
 static int io_run_local_work(struct io_ring_ctx *ctx)
 {
-	bool locked = mutex_trylock(&ctx->uring_lock);
+	struct io_tw_state ts = {};
 	int ret;
 
-	ret = __io_run_local_work(ctx, &locked);
-	if (locked)
+	ts.locked = mutex_trylock(&ctx->uring_lock);
+	ret = __io_run_local_work(ctx, &ts);
+	if (ts.locked)
 		mutex_unlock(&ctx->uring_lock);
 
 	return ret;
 }
 
-static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
+static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	io_tw_lock(req->ctx, locked);
+	io_tw_lock(req->ctx, ts);
 	io_req_defer_failed(req, req->cqe.res);
 }
 
-void io_req_task_submit(struct io_kiocb *req, bool *locked)
+void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	io_tw_lock(req->ctx, locked);
+	io_tw_lock(req->ctx, ts);
 	/* req->task == current here, checking PF_EXITING is safe */
 	if (unlikely(req->task->flags & PF_EXITING))
 		io_req_defer_failed(req, -EFAULT);
 	else if (req->flags & REQ_F_FORCE_ASYNC)
-		io_queue_iowq(req, locked);
+		io_queue_iowq(req, ts);
 	else
 		io_queue_sqe(req);
 }
@@ -1652,9 +1653,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 	return ret;
 }
 
-void io_req_task_complete(struct io_kiocb *req, bool *locked)
+void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	if (*locked)
+	if (ts->locked)
 		io_req_complete_defer(req);
 	else
 		io_req_complete_post(req, IO_URING_F_UNLOCKED);
@@ -1933,9 +1934,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	return 0;
 }
 
-int io_poll_issue(struct io_kiocb *req, bool *locked)
+int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	io_tw_lock(req->ctx, locked);
+	io_tw_lock(req->ctx, ts);
 	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
 				 IO_URING_F_COMPLETE_DEFER);
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 2711865f1e19..c33f719731ac 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -52,16 +52,16 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local);
 bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_queue(struct io_kiocb *req);
-void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
-void io_req_task_complete(struct io_kiocb *req, bool *locked);
+void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
+void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
-void io_req_task_submit(struct io_kiocb *req, bool *locked);
+void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
 void tctx_task_work(struct callback_head *cb);
 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 int io_uring_alloc_task_context(struct task_struct *task,
 				struct io_ring_ctx *ctx);
 
-int io_poll_issue(struct io_kiocb *req, bool *locked);
+int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
@@ -299,11 +299,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
 	return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
 }
 
-static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 {
-	if (!*locked) {
+	if (!ts->locked) {
 		mutex_lock(&ctx->uring_lock);
-		*locked = true;
+		ts->locked = true;
 	}
 }
 
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 09dfd0832d19..172105eb347d 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -9,7 +9,7 @@
 #include "notif.h"
 #include "rsrc.h"
 
-static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked)
+static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts)
 {
 	struct io_notif_data *nd = io_notif_to_data(notif);
 	struct io_ring_ctx *ctx = notif->ctx;
@@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked)
 		__io_unaccount_mem(ctx->user, nd->account_pages);
 		nd->account_pages = 0;
 	}
-	io_req_task_complete(notif, locked);
+	io_req_task_complete(notif, ts);
 }
 
 static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 55306e801081..c90e47dc1e29 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -148,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req)
 	hlist_add_head(&req->hash_node, &table->hbs[index].list);
 }
 
-static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
+static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
@@ -159,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
 		 * already grabbed the mutex for us, but there is a chance it
 		 * failed.
 		 */
-		io_tw_lock(ctx, locked);
+		io_tw_lock(ctx, ts);
 		hash_del(&req->hash_node);
 		req->flags &= ~REQ_F_HASH_LOCKED;
 	} else {
@@ -238,7 +238,7 @@ enum {
  * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot
  * poll and that the result is stored in req->cqe.
  */
-static int io_poll_check_events(struct io_kiocb *req, bool *locked)
+static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	int v;
 
@@ -300,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 			__poll_t mask = mangle_poll(req->cqe.res &
 						    req->apoll_events);
 
-			if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data,
+			if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data,
 					mask, IORING_CQE_F_MORE, false)) {
 				io_req_set_res(req, mask, 0);
 				return IOU_POLL_REMOVE_POLL_USE_RES;
 			}
 		} else {
-			int ret = io_poll_issue(req, locked);
+			int ret = io_poll_issue(req, ts);
 			if (ret == IOU_STOP_MULTISHOT)
 				return IOU_POLL_REMOVE_POLL_USE_RES;
 			if (ret < 0)
@@ -326,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 	return IOU_POLL_NO_ACTION;
 }
 
-static void io_poll_task_func(struct io_kiocb *req, bool *locked)
+static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	int ret;
 
-	ret = io_poll_check_events(req, locked);
+	ret = io_poll_check_events(req, ts);
 	if (ret == IOU_POLL_NO_ACTION)
 		return;
 	io_poll_remove_entries(req);
-	io_poll_tw_hash_eject(req, locked);
+	io_poll_tw_hash_eject(req, ts);
 
 	if (req->opcode == IORING_OP_POLL_ADD) {
 		if (ret == IOU_POLL_DONE) {
@@ -343,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 			poll = io_kiocb_to_cmd(req, struct io_poll);
 			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
 		} else if (ret == IOU_POLL_REISSUE) {
-			io_req_task_submit(req, locked);
+			io_req_task_submit(req, ts);
 			return;
 		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
 			req->cqe.res = ret;
@@ -351,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		}
 
 		io_req_set_res(req, req->cqe.res, 0);
-		io_req_task_complete(req, locked);
+		io_req_task_complete(req, ts);
 	} else {
-		io_tw_lock(req->ctx, locked);
+		io_tw_lock(req->ctx, ts);
 
 		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
-			io_req_task_complete(req, locked);
+			io_req_task_complete(req, ts);
 		else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
-			io_req_task_submit(req, locked);
+			io_req_task_submit(req, ts);
 		else
 			io_req_defer_failed(req, ret);
 	}
@@ -977,7 +977,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_hash_bucket *bucket;
 	struct io_kiocb *preq;
 	int ret2, ret = 0;
-	bool locked;
+	struct io_tw_state ts = {};
 
 	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
 	ret2 = io_poll_disarm(preq);
@@ -1027,8 +1027,8 @@ found:
 
 	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
-	locked = !(issue_flags & IO_URING_F_UNLOCKED);
-	io_req_task_complete(preq, &locked);
+	ts.locked = !(issue_flags & IO_URING_F_UNLOCKED);
+	io_req_task_complete(preq, &ts);
 out:
 	if (ret < 0) {
 		req_set_fail(req);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e200..f14868624f41 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -283,16 +283,16 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
 	return res;
 }
 
-static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
+static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	io_req_io_end(req);
 
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
-		unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
+		unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
 
 		req->cqe.flags |= io_put_kbuf(req, issue_flags);
 	}
-	io_req_task_complete(req, locked);
+	io_req_task_complete(req, ts);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res)
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 826a51bca3e4..5c6c6f720809 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -101,9 +101,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
 	spin_unlock_irq(&ctx->timeout_lock);
 }
 
-static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
+static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts)
 {
-	io_tw_lock(link->ctx, locked);
+	io_tw_lock(link->ctx, ts);
 	while (link) {
 		struct io_kiocb *nxt = link->link;
 		long res = -ECANCELED;
@@ -112,7 +112,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
 			res = link->cqe.res;
 		link->link = NULL;
 		io_req_set_res(link, res, 0);
-		io_req_task_complete(link, locked);
+		io_req_task_complete(link, ts);
 		link = nxt;
 	}
 }
@@ -265,9 +265,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 	return 0;
 }
 
-static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
+static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
+	unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
 	int ret = -ENOENT;
@@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 			ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
 		}
 		io_req_set_res(req, ret ?: -ETIME, 0);
-		io_req_task_complete(req, locked);
+		io_req_task_complete(req, ts);
 		io_put_req(prev);
 	} else {
 		io_req_set_res(req, -ETIME, 0);
-		io_req_task_complete(req, locked);
+		io_req_task_complete(req, ts);
 	}
 }
 
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 9a1dee571872..3d825d939b13 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -12,10 +12,10 @@
 #include "rsrc.h"
 #include "uring_cmd.h"
 
-static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
+static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
+	unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
 
 	ioucmd->task_work_cb(ioucmd, issue_flags);
 }
-- 
cgit v1.2.3


From 2ad57931db641f3de627023afb8147a8ec0b41dc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 30 Mar 2023 10:03:41 -0600
Subject: io_uring: rename trace_io_uring_submit_sqe() tracepoint

It has nothing to do with the SQE at this point, it's a request
submission. While in there, get rid of the 'force_nonblock' argument
which is also dead, as we only pass in true.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 15 ++++++---------
 io_uring/io_uring.c             |  3 +--
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 936fd41bf147..69454f1f98b0 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete,
 );
 
 /**
- * io_uring_submit_sqe - called before submitting one SQE
+ * io_uring_submit_req - called before submitting a request
  *
  * @req:		pointer to a submitted request
- * @force_nonblock:	whether a context blocking or not
  *
  * Allows to track SQE submitting, to understand what was the source of it, SQ
  * thread or io_uring_enter call.
  */
-TRACE_EVENT(io_uring_submit_sqe,
+TRACE_EVENT(io_uring_submit_req,
 
-	TP_PROTO(struct io_kiocb *req, bool force_nonblock),
+	TP_PROTO(struct io_kiocb *req),
 
-	TP_ARGS(req, force_nonblock),
+	TP_ARGS(req),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  u32,			flags		)
-		__field(  bool,			force_nonblock	)
 		__field(  bool,			sq_thread	)
 
 		__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
 		__entry->flags		= req->flags;
-		__entry->force_nonblock	= force_nonblock;
 		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
 
 		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
-		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
+		  "sq_thread %d", __entry->ctx, __entry->req,
 		  __entry->user_data, __get_str(op_str),
-		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
+		  __entry->flags, __entry->sq_thread)
 );
 
 /*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 536940675c67..775b53730c2f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2305,8 +2305,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	if (unlikely(ret))
 		return io_submit_fail_init(sqe, req, ret);
 
-	/* don't need @sqe from now on */
-	trace_io_uring_submit_sqe(req, true);
+	trace_io_uring_submit_req(req);
 
 	/*
 	 * If we already have a head request, queue this one for async
-- 
cgit v1.2.3


From ea65b41807a26495ff2a73dd8b1bab2751940887 Mon Sep 17 00:00:00 2001
From: John Keeping <john@metanate.com>
Date: Mon, 27 Mar 2023 18:36:46 +0100
Subject: ftrace: Mark get_lock_parent_ip() __always_inline

If the compiler decides not to inline this function then preemption
tracing will always show an IP inside the preemption disabling path and
never the function actually calling preempt_{enable,disable}.

Link: https://lore.kernel.org/linux-trace-kernel/20230327173647.1690849-1-john@metanate.com

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: stable@vger.kernel.org
Fixes: f904f58263e1d ("sched/debug: Fix preempt_disable_ip recording for preempt_disable()")
Signed-off-by: John Keeping <john@metanate.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 366c730beaa3..402fc061de75 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -980,7 +980,7 @@ static inline void __ftrace_enabled_restore(int enabled)
 #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
 #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
 
-static inline unsigned long get_lock_parent_ip(void)
+static __always_inline unsigned long get_lock_parent_ip(void)
 {
 	unsigned long addr = CALLER_ADDR0;
 
-- 
cgit v1.2.3


From f82e7ca019dfad3b006fd3b772f7ac569672db55 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 9 Mar 2023 22:13:02 -0500
Subject: tracing: Error if a trace event has an array for a __field()

A __field() in the TRACE_EVENT() macro is used to set up the fields of the
trace event data. It is for single storage units (word, char, int,
pointer, etc) and not for complex structures or arrays. Unfortunately,
there's nothing preventing the build from accepting:

    __field(int, arr[5]);

from building. It will turn into a array value. This use to work fine, as
the offset and size use to be determined by the macro using the field name,
but things have changed and the offset and size are now determined by the
type. So the above would only be size 4, and the next field will be
located 4 bytes from it (instead of 20).

The proper way to declare static arrays is to use the __array() macro.

Instead of __field(int, arr[5]) it should be __array(int, arr, 5).

Add some macro tricks to the building of a trace event from the
TRACE_EVENT() macro such that __field(int, arr[5]) will fail to build. A
comment by the failure will explain why the build failed.

Link: https://lore.kernel.org/lkml/20230306122549.236561-1-douglas.raillard@arm.com/
Link: https://lore.kernel.org/linux-trace-kernel/20230309221302.642e82d9@gandalf.local.home

Reported-by: Douglas RAILLARD <douglas.raillard@arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/trace/stages/stage5_get_offsets.h | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h
index ac5c24d3beeb..e30a13be46ba 100644
--- a/include/trace/stages/stage5_get_offsets.h
+++ b/include/trace/stages/stage5_get_offsets.h
@@ -9,17 +9,30 @@
 #undef __entry
 #define __entry entry
 
+/*
+ * Fields should never declare an array: i.e. __field(int, arr[5])
+ * If they do, it will cause issues in parsing and possibly corrupt the
+ * events. To prevent that from happening, test the sizeof() a fictitious
+ * type called "struct _test_no_array_##item" which will fail if "item"
+ * contains array elements (like "arr[5]").
+ *
+ * If you hit this, use __array(int, arr, 5) instead.
+ */
 #undef __field
-#define __field(type, item)
+#define __field(type, item)					\
+	{ (void)sizeof(struct _test_no_array_##item *); }
 
 #undef __field_ext
-#define __field_ext(type, item, filter_type)
+#define __field_ext(type, item, filter_type)			\
+	{ (void)sizeof(struct _test_no_array_##item *); }
 
 #undef __field_struct
-#define __field_struct(type, item)
+#define __field_struct(type, item)				\
+	{ (void)sizeof(struct _test_no_array_##item *); }
 
 #undef __field_struct_ext
-#define __field_struct_ext(type, item, filter_type)
+#define __field_struct_ext(type, item, filter_type)		\
+	{ (void)sizeof(struct _test_no_array_##item *); }
 
 #undef __array
 #define __array(type, item, len)
-- 
cgit v1.2.3


From d74c36480a679b27ce8a70c2e88fed31b86323d9 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Wed, 29 Mar 2023 14:42:40 +0300
Subject: mtd: spinand: add support for ESMT F50x1G41LB

This patch adds support for ESMT F50L1G41LB and F50D1G41LB.
It seems that ESMT likes to use random JEDEC ID from other vendors.
Their 1G chips uses 0xc8 from GigaDevice and 2G/4G chips uses 0x2c from
Micron. For this reason, the ESMT entry is named esmt_c8 with explicit
JEDEC ID in variable name.

Datasheets:
https://www.esmt.com.tw/upload/pdf/ESMT/datasheets/F50L1G41LB(2M).pdf
https://www.esmt.com.tw/upload/pdf/ESMT/datasheets/F50D1G41LB(2M).pdf

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Martin Kurbanov <mmkurbanov@sberdevices.ru>
Signed-off-by: Dmitry Rokosov <ddrokosov@sberdevices.ru>
Tested-by: Martin Kurbanov <mmkurbanov@sberdevices.ru>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230329114240.378722-1-mmkurbanov@sberdevices.ru
---
 drivers/mtd/nand/spi/Makefile |   3 +-
 drivers/mtd/nand/spi/core.c   |   1 +
 drivers/mtd/nand/spi/esmt.c   | 135 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h   |   1 +
 4 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/esmt.c

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 4ec973b8b6bf..cd8b66bf7740 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o alliancememory.o ato.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
+spinand-objs := core.o alliancememory.o ato.o esmt.o gigadevice.o macronix.o
+spinand-objs += micron.o paragon.o toshiba.o winbond.o xtx.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 638391f77d8c..393ff37f0d23 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -939,6 +939,7 @@ static const struct nand_ops spinand_ops = {
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&alliancememory_spinand_manufacturer,
 	&ato_spinand_manufacturer,
+	&esmt_c8_spinand_manufacturer,
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
diff --git a/drivers/mtd/nand/spi/esmt.c b/drivers/mtd/nand/spi/esmt.c
new file mode 100644
index 000000000000..1a3ffb982335
--- /dev/null
+++ b/drivers/mtd/nand/spi/esmt.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author:
+ *	Chuanhong Guo <gch981213@gmail.com> - the main driver logic
+ *	Martin Kurbanov <mmkurbanov@sberdevices.ru> - OOB layout
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+/* ESMT uses GigaDevice 0xc8 JECDEC ID on some SPI NANDs */
+#define SPINAND_MFR_ESMT_C8			0xc8
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+			   SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+			   SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+			   SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+			   SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+			   SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+/*
+ * OOB spare area map (64 bytes)
+ *
+ * Bad Block Markers
+ * filled by HW and kernel                 Reserved
+ *   |                 +-----------------------+-----------------------+
+ *   |                 |                       |                       |
+ *   |                 |    OOB free data Area |non ECC protected      |
+ *   |   +-------------|-----+-----------------|-----+-----------------|-----+
+ *   |   |             |     |                 |     |                 |     |
+ * +-|---|----------+--|-----|--------------+--|-----|--------------+--|-----|--------------+
+ * | |   | section0 |  |     |    section1  |  |     |    section2  |  |     |    section3  |
+ * +-v-+-v-+---+----+--v--+--v--+-----+-----+--v--+--v--+-----+-----+--v--+--v--+-----+-----+
+ * |   |   |   |    |     |     |     |     |     |     |     |     |     |     |     |     |
+ * |0:1|2:3|4:7|8:15|16:17|18:19|20:23|24:31|32:33|34:35|36:39|40:47|48:49|50:51|52:55|56:63|
+ * |   |   |   |    |     |     |     |     |     |     |     |     |     |     |     |     |
+ * +---+---+-^-+--^-+-----+-----+--^--+--^--+-----+-----+--^--+--^--+-----+-----+--^--+--^--+
+ *           |    |                |     |                 |     |                 |     |
+ *           |    +----------------|-----+-----------------|-----+-----------------|-----+
+ *           |             ECC Area|(Main + Spare) - filled|by ESMT NAND HW        |
+ *           |                     |                       |                       |
+ *           +---------------------+-----------------------+-----------------------+
+ *                         OOB ECC protected Area - not used due to
+ *                         partial programming from some filesystems
+ *                             (like JFFS2 with cleanmarkers)
+ */
+
+#define ESMT_OOB_SECTION_COUNT			4
+#define ESMT_OOB_SECTION_SIZE(nand) \
+	(nanddev_per_page_oobsize(nand) / ESMT_OOB_SECTION_COUNT)
+#define ESMT_OOB_FREE_SIZE(nand) \
+	(ESMT_OOB_SECTION_SIZE(nand) / 2)
+#define ESMT_OOB_ECC_SIZE(nand) \
+	(ESMT_OOB_SECTION_SIZE(nand) - ESMT_OOB_FREE_SIZE(nand))
+#define ESMT_OOB_BBM_SIZE			2
+
+static int f50l1g41lb_ooblayout_ecc(struct mtd_info *mtd, int section,
+				    struct mtd_oob_region *region)
+{
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+
+	if (section >= ESMT_OOB_SECTION_COUNT)
+		return -ERANGE;
+
+	region->offset = section * ESMT_OOB_SECTION_SIZE(nand) +
+			 ESMT_OOB_FREE_SIZE(nand);
+	region->length = ESMT_OOB_ECC_SIZE(nand);
+
+	return 0;
+}
+
+static int f50l1g41lb_ooblayout_free(struct mtd_info *mtd, int section,
+				     struct mtd_oob_region *region)
+{
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+
+	if (section >= ESMT_OOB_SECTION_COUNT)
+		return -ERANGE;
+
+	/*
+	 * Reserve space for bad blocks markers (section0) and
+	 * reserved bytes (sections 1-3)
+	 */
+	region->offset = section * ESMT_OOB_SECTION_SIZE(nand) + 2;
+
+	/* Use only 2 non-protected ECC bytes per each OOB section */
+	region->length = 2;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops f50l1g41lb_ooblayout = {
+	.ecc = f50l1g41lb_ooblayout_ecc,
+	.free = f50l1g41lb_ooblayout_free,
+};
+
+static const struct spinand_info esmt_c8_spinand_table[] = {
+	SPINAND_INFO("F50L1G41LB",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x01),
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&f50l1g41lb_ooblayout, NULL)),
+	SPINAND_INFO("F50D1G41LB",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x11),
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&f50l1g41lb_ooblayout, NULL)),
+};
+
+static const struct spinand_manufacturer_ops esmt_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer esmt_c8_spinand_manufacturer = {
+	.id = SPINAND_MFR_ESMT_C8,
+	.name = "ESMT",
+	.chips = esmt_c8_spinand_table,
+	.nchips = ARRAY_SIZE(esmt_c8_spinand_table),
+	.ops = &esmt_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 01be9f0f008a..3e285c09d16d 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -262,6 +262,7 @@ struct spinand_manufacturer {
 /* SPI NAND manufacturers */
 extern const struct spinand_manufacturer alliancememory_spinand_manufacturer;
 extern const struct spinand_manufacturer ato_spinand_manufacturer;
+extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer;
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
-- 
cgit v1.2.3


From 979207cac517833c828133ffb2633bcdf6edce00 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 2 Apr 2023 19:58:46 +0200
Subject: driver core: class: mark class_release() as taking a const *

The struct class callback, class_release(), is only called in 2 places,
the pcmcia cardservices code, and in the class driver core code.  Both
places it is safe to mark the structure as a const *, to allow us to
in the future mark all struct class usages as constant and move into
read-only memory.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023040248-outrage-obsolete-5a9a@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 2 +-
 drivers/pcmcia/cs.c          | 2 +-
 include/linux/device/class.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 65502bd7d5c5..53fc7052340c 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -235,7 +235,7 @@ void class_unregister(const struct class *cls)
 }
 EXPORT_SYMBOL_GPL(class_unregister);
 
-static void class_create_release(struct class *cls)
+static void class_create_release(const struct class *cls)
 {
 	pr_debug("%s called for %s\n", __func__, cls->name);
 	kfree(cls);
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index e3224e49c43f..5658745c398f 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -824,7 +824,7 @@ static int pcmcia_socket_uevent(const struct device *dev,
 
 static struct completion pcmcia_unload;
 
-static void pcmcia_release_socket_class(struct class *data)
+static void pcmcia_release_socket_class(const struct class *data)
 {
 	complete(&pcmcia_unload);
 }
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 7e4a1a6329f4..f3c418fa129a 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -58,7 +58,7 @@ struct class {
 	int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env);
 	char *(*devnode)(const struct device *dev, umode_t *mode);
 
-	void (*class_release)(struct class *class);
+	void (*class_release)(const struct class *class);
 	void (*dev_release)(struct device *dev);
 
 	int (*shutdown_pre)(struct device *dev);
-- 
cgit v1.2.3


From 43a7206b0963c2153c95d6985624d1dc1b3abd4d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 2 Apr 2023 19:58:47 +0200
Subject: driver core: class: make class_register() take a const *

Now that the class code is cleaned up to not modify the class pointer
registered with it, change class_register() to take a const * to allow
the structure to be placed into read-only memory.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023040248-customary-release-4aec@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h          | 2 +-
 drivers/base/class.c         | 6 +++---
 include/linux/device/class.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index e96f3343fd7c..eb4c0ace9242 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -54,7 +54,7 @@ struct subsys_private {
 	struct device *dev_root;
 
 	struct kset glue_dirs;
-	struct class *class;
+	const struct class *class;
 
 	struct lock_class_key lock_key;
 };
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 53fc7052340c..05bce79d3d19 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -93,7 +93,7 @@ static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr,
 static void class_release(struct kobject *kobj)
 {
 	struct subsys_private *cp = to_subsys_private(kobj);
-	struct class *class = cp->class;
+	const struct class *class = cp->class;
 
 	pr_debug("class '%s': release.\n", class->name);
 
@@ -110,7 +110,7 @@ static void class_release(struct kobject *kobj)
 static const struct kobj_ns_type_operations *class_child_ns_type(const struct kobject *kobj)
 {
 	const struct subsys_private *cp = to_subsys_private(kobj);
-	struct class *class = cp->class;
+	const struct class *class = cp->class;
 
 	return class->ns_type;
 }
@@ -175,7 +175,7 @@ static void klist_class_dev_put(struct klist_node *n)
 	put_device(dev);
 }
 
-int class_register(struct class *cls)
+int class_register(const struct class *cls)
 {
 	struct subsys_private *cp;
 	struct lock_class_key *key;
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index f3c418fa129a..4bf46f9bbb56 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -76,7 +76,7 @@ struct class_dev_iter {
 	const struct device_type	*type;
 };
 
-int __must_check class_register(struct class *class);
+int __must_check class_register(const struct class *class);
 void class_unregister(const struct class *class);
 bool class_is_registered(const struct class *class);
 
-- 
cgit v1.2.3


From 6b0d49be81cf4f1cf30ebd079cbf4643cab0a01d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 2 Apr 2023 19:58:48 +0200
Subject: driver core: class: mark the struct class in struct class_interface
 constant

The struct class pointer in struct class_interface is never modified, so
mark it as const so that no one accidentally tries to modify it in the
future.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023040249-handball-gruffly-5da7@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c         | 2 +-
 include/linux/device/class.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 05bce79d3d19..ad8b9f163fd2 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(class_interface_register);
 void class_interface_unregister(struct class_interface *class_intf)
 {
 	struct subsys_private *sp;
-	struct class *parent = class_intf->class;
+	const struct class *parent = class_intf->class;
 	struct class_dev_iter iter;
 	struct device *dev;
 
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 4bf46f9bbb56..53287aa105b8 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -217,7 +217,7 @@ ssize_t show_class_attr_string(const struct class *class, const struct class_att
 
 struct class_interface {
 	struct list_head	node;
-	struct class		*class;
+	const struct class	*class;
 
 	int (*add_dev)		(struct device *, struct class_interface *);
 	void (*remove_dev)	(struct device *, struct class_interface *);
-- 
cgit v1.2.3


From 2243acd50ac4026e93ac4f024109be6dbd7f9098 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 2 Apr 2023 19:58:49 +0200
Subject: driver core: class: remove struct class_interface * from callbacks

The add_dev and remove_dev callbacks in struct class_interface currently
pass in a pointer back to the class_interface structure that is calling
them, but none of the callback implementations actually use this pointer
as it is pointless (the structure is known, the driver passed it in in
the first place if it is really needed again.)

So clean this up and just remove the pointer from the callbacks and fix
up all callback functions.

Cc: Jean Delvare <jdelvare@suse.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Kurt Schwemmer <kurt.schwemmer@microsemi.com>
Cc: Jon Mason <jdmason@kudzu.us>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Alexandre Bounine <alex.bou9@gmail.com>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Doug Gilbert <dgilbert@interlog.com>
Cc: John Stultz <jstultz@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Wang Weiyang <wangweiyang2@huawei.com>
Cc: Yang Yingliang <yangyingliang@huawei.com>
Cc: Jakob Koschel <jakobkoschel@gmail.com>
Cc: Cai Xinchen <caixinchen1@huawei.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Logan Gunthorpe <logang@deltatee.com>
Link: https://lore.kernel.org/r/2023040250-pushover-platter-509c@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c                     |  4 ++--
 drivers/base/core.c                      | 10 ++++------
 drivers/hwmon/drivetemp.c                |  4 ++--
 drivers/net/rionet.c                     |  3 +--
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c   |  6 ++----
 drivers/pcmcia/ds.c                      |  6 ++----
 drivers/pcmcia/rsrc_nonstatic.c          |  6 ++----
 drivers/rapidio/devices/rio_mport_cdev.c |  7 ++-----
 drivers/rapidio/rio_cm.c                 |  8 ++------
 drivers/scsi/ses.c                       |  6 ++----
 drivers/scsi/sg.c                        |  8 ++++----
 include/linux/device/class.h             |  4 ++--
 kernel/time/alarmtimer.c                 |  3 +--
 13 files changed, 28 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index ad8b9f163fd2..ac1808d1a2e8 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -486,7 +486,7 @@ int class_interface_register(struct class_interface *class_intf)
 	if (class_intf->add_dev) {
 		class_dev_iter_init(&iter, parent, NULL, NULL);
 		while ((dev = class_dev_iter_next(&iter)))
-			class_intf->add_dev(dev, class_intf);
+			class_intf->add_dev(dev);
 		class_dev_iter_exit(&iter);
 	}
 	mutex_unlock(&sp->mutex);
@@ -514,7 +514,7 @@ void class_interface_unregister(struct class_interface *class_intf)
 	if (class_intf->remove_dev) {
 		class_dev_iter_init(&iter, parent, NULL, NULL);
 		while ((dev = class_dev_iter_next(&iter)))
-			class_intf->remove_dev(dev, class_intf);
+			class_intf->remove_dev(dev);
 		class_dev_iter_exit(&iter);
 	}
 	mutex_unlock(&sp->mutex);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 64d188be4df9..7a42d1b6b721 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -541,8 +541,7 @@ static struct class devlink_class = {
 	.dev_release = devlink_dev_release,
 };
 
-static int devlink_add_symlinks(struct device *dev,
-				struct class_interface *class_intf)
+static int devlink_add_symlinks(struct device *dev)
 {
 	int ret;
 	size_t len;
@@ -591,8 +590,7 @@ out:
 	return ret;
 }
 
-static void devlink_remove_symlinks(struct device *dev,
-				   struct class_interface *class_intf)
+static void devlink_remove_symlinks(struct device *dev)
 {
 	struct device_link *link = to_devlink(dev);
 	size_t len;
@@ -3647,7 +3645,7 @@ int device_add(struct device *dev)
 		/* notify any interfaces that the device is here */
 		list_for_each_entry(class_intf, &sp->interfaces, node)
 			if (class_intf->add_dev)
-				class_intf->add_dev(dev, class_intf);
+				class_intf->add_dev(dev);
 		mutex_unlock(&sp->mutex);
 		subsys_put(sp);
 	}
@@ -3805,7 +3803,7 @@ void device_del(struct device *dev)
 		/* notify any interfaces that the device is now gone */
 		list_for_each_entry(class_intf, &sp->interfaces, node)
 			if (class_intf->remove_dev)
-				class_intf->remove_dev(dev, class_intf);
+				class_intf->remove_dev(dev);
 		/* remove the device from the class list */
 		klist_del(&dev->p->knode_class);
 		mutex_unlock(&sp->mutex);
diff --git a/drivers/hwmon/drivetemp.c b/drivers/hwmon/drivetemp.c
index 8e5759b42390..86171031ddc5 100644
--- a/drivers/hwmon/drivetemp.c
+++ b/drivers/hwmon/drivetemp.c
@@ -550,7 +550,7 @@ static const struct hwmon_chip_info drivetemp_chip_info = {
  * The device argument points to sdev->sdev_dev. Its parent is
  * sdev->sdev_gendev, which we can use to get the scsi_device pointer.
  */
-static int drivetemp_add(struct device *dev, struct class_interface *intf)
+static int drivetemp_add(struct device *dev)
 {
 	struct scsi_device *sdev = to_scsi_device(dev->parent);
 	struct drivetemp_data *st;
@@ -585,7 +585,7 @@ abort:
 	return err;
 }
 
-static void drivetemp_remove(struct device *dev, struct class_interface *intf)
+static void drivetemp_remove(struct device *dev)
 {
 	struct drivetemp_data *st, *tmp;
 
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index fbcb9d05da64..4eececc94513 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -662,8 +662,7 @@ static int rionet_shutdown(struct notifier_block *nb, unsigned long code,
 	return NOTIFY_DONE;
 }
 
-static void rionet_remove_mport(struct device *dev,
-				struct class_interface *class_intf)
+static void rionet_remove_mport(struct device *dev)
 {
 	struct rio_mport *mport = to_rio_mport(dev);
 	struct net_device *ndev;
diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 88ae18b0efa8..d6bbcc7b5b90 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -1470,8 +1470,7 @@ static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev)
 	return rc;
 }
 
-static int switchtec_ntb_add(struct device *dev,
-			     struct class_interface *class_intf)
+static int switchtec_ntb_add(struct device *dev)
 {
 	struct switchtec_dev *stdev = to_stdev(dev);
 	struct switchtec_ntb *sndev;
@@ -1541,8 +1540,7 @@ free_and_exit:
 	return rc;
 }
 
-static void switchtec_ntb_remove(struct device *dev,
-				 struct class_interface *class_intf)
+static void switchtec_ntb_remove(struct device *dev)
 {
 	struct switchtec_dev *stdev = to_stdev(dev);
 	struct switchtec_ntb *sndev = stdev->sndev;
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index c8087efa5e4a..d500e5dbbc3f 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -1335,8 +1335,7 @@ static struct pcmcia_callback pcmcia_bus_callback = {
 	.resume = pcmcia_bus_resume,
 };
 
-static int pcmcia_bus_add_socket(struct device *dev,
-					   struct class_interface *class_intf)
+static int pcmcia_bus_add_socket(struct device *dev)
 {
 	struct pcmcia_socket *socket = dev_get_drvdata(dev);
 	int ret;
@@ -1369,8 +1368,7 @@ static int pcmcia_bus_add_socket(struct device *dev,
 	return 0;
 }
 
-static void pcmcia_bus_remove_socket(struct device *dev,
-				     struct class_interface *class_intf)
+static void pcmcia_bus_remove_socket(struct device *dev)
 {
 	struct pcmcia_socket *socket = dev_get_drvdata(dev);
 
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index ad1141fddb4c..471e0c5815f3 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -1200,8 +1200,7 @@ static const struct attribute_group rsrc_attributes = {
 	.attrs = pccard_rsrc_attributes,
 };
 
-static int pccard_sysfs_add_rsrc(struct device *dev,
-					   struct class_interface *class_intf)
+static int pccard_sysfs_add_rsrc(struct device *dev)
 {
 	struct pcmcia_socket *s = dev_get_drvdata(dev);
 
@@ -1210,8 +1209,7 @@ static int pccard_sysfs_add_rsrc(struct device *dev,
 	return sysfs_create_group(&dev->kobj, &rsrc_attributes);
 }
 
-static void pccard_sysfs_remove_rsrc(struct device *dev,
-					       struct class_interface *class_intf)
+static void pccard_sysfs_remove_rsrc(struct device *dev)
 {
 	struct pcmcia_socket *s = dev_get_drvdata(dev);
 
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index deb96c3160a7..a115730ebf14 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -2536,10 +2536,8 @@ static void mport_cdev_remove(struct mport_dev *md)
 /*
  * mport_add_mport() - Add rio_mport from LDM device struct
  * @dev:		Linux device model struct
- * @class_intf:	Linux class_interface
  */
-static int mport_add_mport(struct device *dev,
-		struct class_interface *class_intf)
+static int mport_add_mport(struct device *dev)
 {
 	struct rio_mport *mport = NULL;
 	struct mport_dev *chdev = NULL;
@@ -2559,8 +2557,7 @@ static int mport_add_mport(struct device *dev,
  * mport_remove_mport() - Remove rio_mport from global list
  * TODO remove device from global mport_dev list
  */
-static void mport_remove_mport(struct device *dev,
-		struct class_interface *class_intf)
+static void mport_remove_mport(struct device *dev)
 {
 	struct rio_mport *mport = NULL;
 	struct mport_dev *chdev;
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index acaf9cda014c..49f8d111e546 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -2087,13 +2087,11 @@ static int riocm_cdev_add(dev_t devno)
 /*
  * riocm_add_mport - add new local mport device into channel management core
  * @dev: device object associated with mport
- * @class_intf: class interface
  *
  * When a new mport device is added, CM immediately reserves inbound and
  * outbound RapidIO mailboxes that will be used.
  */
-static int riocm_add_mport(struct device *dev,
-			   struct class_interface *class_intf)
+static int riocm_add_mport(struct device *dev)
 {
 	int rc;
 	int i;
@@ -2166,14 +2164,12 @@ static int riocm_add_mport(struct device *dev,
 /*
  * riocm_remove_mport - remove local mport device from channel management core
  * @dev: device object associated with mport
- * @class_intf: class interface
  *
  * Removes a local mport device from the list of registered devices that provide
  * channel management services. Returns an error if the specified mport is not
  * registered with the CM core.
  */
-static void riocm_remove_mport(struct device *dev,
-			       struct class_interface *class_intf)
+static void riocm_remove_mport(struct device *dev)
 {
 	struct rio_mport *mport = to_rio_mport(dev);
 	struct cm_dev *cm;
diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index b11a9162e73a..57feb0cae896 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -663,8 +663,7 @@ static void ses_match_to_enclosure(struct enclosure_device *edev,
 	}
 }
 
-static int ses_intf_add(struct device *cdev,
-			struct class_interface *intf)
+static int ses_intf_add(struct device *cdev)
 {
 	struct scsi_device *sdev = to_scsi_device(cdev->parent);
 	struct scsi_device *tmp_sdev;
@@ -869,8 +868,7 @@ static void ses_intf_remove_enclosure(struct scsi_device *sdev)
 	enclosure_unregister(edev);
 }
 
-static void ses_intf_remove(struct device *cdev,
-			    struct class_interface *intf)
+static void ses_intf_remove(struct device *cdev)
 {
 	struct scsi_device *sdev = to_scsi_device(cdev->parent);
 
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 4997f880d4a4..037f8c98a6d3 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -96,8 +96,8 @@ static int scatter_elem_sz_prev = SG_SCATTER_SZ;
 
 #define SG_SECTOR_SZ 512
 
-static int sg_add_device(struct device *, struct class_interface *);
-static void sg_remove_device(struct device *, struct class_interface *);
+static int sg_add_device(struct device *);
+static void sg_remove_device(struct device *);
 
 static DEFINE_IDR(sg_index_idr);
 static DEFINE_RWLOCK(sg_index_lock);	/* Also used to lock
@@ -1488,7 +1488,7 @@ out_unlock:
 }
 
 static int
-sg_add_device(struct device *cl_dev, struct class_interface *cl_intf)
+sg_add_device(struct device *cl_dev)
 {
 	struct scsi_device *scsidp = to_scsi_device(cl_dev->parent);
 	Sg_device *sdp = NULL;
@@ -1578,7 +1578,7 @@ sg_device_destroy(struct kref *kref)
 }
 
 static void
-sg_remove_device(struct device *cl_dev, struct class_interface *cl_intf)
+sg_remove_device(struct device *cl_dev)
 {
 	struct scsi_device *scsidp = to_scsi_device(cl_dev->parent);
 	Sg_device *sdp = dev_get_drvdata(cl_dev);
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index 53287aa105b8..9deeaeb457bb 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -219,8 +219,8 @@ struct class_interface {
 	struct list_head	node;
 	const struct class	*class;
 
-	int (*add_dev)		(struct device *, struct class_interface *);
-	void (*remove_dev)	(struct device *, struct class_interface *);
+	int (*add_dev)		(struct device *dev);
+	void (*remove_dev)	(struct device *dev);
 };
 
 int __must_check class_interface_register(struct class_interface *);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7e5dff602585..82b28ab0f328 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -81,8 +81,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
 }
 EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
 
-static int alarmtimer_rtc_add_device(struct device *dev,
-				struct class_interface *class_intf)
+static int alarmtimer_rtc_add_device(struct device *dev)
 {
 	unsigned long flags;
 	struct rtc_device *rtc = to_rtc_device(dev);
-- 
cgit v1.2.3


From 862d8312eed994a8a9af7aa8e9e15456183b10a7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 2 Apr 2023 19:58:50 +0200
Subject: tty: make tty_class a static const structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the driver core allows for struct class to be in read-only
memory, move the tty_class structure to be declared at build time
placing it into read-only memory, instead of having to be dynamically
allocated at boot time.

Cc: "Ilpo Järvinen" <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/2023040250-landowner-unfitted-11f4@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/pty.c    |  2 +-
 drivers/tty/tty_io.c | 24 +++++++++++-------------
 drivers/tty/vt/vt.c  |  2 +-
 include/linux/tty.h  |  2 +-
 4 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 07394fdaf522..2b1c8ab99dba 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -931,7 +931,7 @@ static void __init unix98_pty_init(void)
 	if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) ||
 	    register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0)
 		panic("Couldn't register /dev/ptmx driver");
-	device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
+	device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
 }
 
 #else
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 1382d9050ce8..44e0d53aa0b8 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3070,7 +3070,7 @@ static struct device *tty_get_device(struct tty_struct *tty)
 {
 	dev_t devt = tty_devnum(tty);
 
-	return class_find_device_by_devt(tty_class, devt);
+	return class_find_device_by_devt(&tty_class, devt);
 }
 
 
@@ -3143,8 +3143,6 @@ int tty_put_char(struct tty_struct *tty, unsigned char ch)
 }
 EXPORT_SYMBOL_GPL(tty_put_char);
 
-struct class *tty_class;
-
 static int tty_cdev_add(struct tty_driver *driver, dev_t dev,
 		unsigned int index, unsigned int count)
 {
@@ -3239,7 +3237,7 @@ struct device *tty_register_device_attr(struct tty_driver *driver,
 		return ERR_PTR(-ENOMEM);
 
 	dev->devt = devt;
-	dev->class = tty_class;
+	dev->class = &tty_class;
 	dev->parent = device;
 	dev->release = tty_device_create_release;
 	dev_set_name(dev, "%s", name);
@@ -3294,8 +3292,7 @@ EXPORT_SYMBOL_GPL(tty_register_device_attr);
  */
 void tty_unregister_device(struct tty_driver *driver, unsigned index)
 {
-	device_destroy(tty_class,
-		MKDEV(driver->major, driver->minor_start) + index);
+	device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index);
 	if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) {
 		cdev_del(driver->cdevs[index]);
 		driver->cdevs[index] = NULL;
@@ -3510,13 +3507,14 @@ static char *tty_devnode(const struct device *dev, umode_t *mode)
 	return NULL;
 }
 
+const struct class tty_class = {
+	.name		= "tty",
+	.devnode	= tty_devnode,
+};
+
 static int __init tty_class_init(void)
 {
-	tty_class = class_create("tty");
-	if (IS_ERR(tty_class))
-		return PTR_ERR(tty_class);
-	tty_class->devnode = tty_devnode;
-	return 0;
+	return class_register(&tty_class);
 }
 
 postcore_initcall(tty_class_init);
@@ -3643,13 +3641,13 @@ int __init tty_init(void)
 	if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) ||
 	    register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0)
 		panic("Couldn't register /dev/tty driver\n");
-	device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty");
+	device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty");
 
 	cdev_init(&console_cdev, &console_fops);
 	if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) ||
 	    register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0)
 		panic("Couldn't register /dev/console driver\n");
-	consdev = device_create_with_groups(tty_class, NULL,
+	consdev = device_create_with_groups(&tty_class, NULL,
 					    MKDEV(TTYAUX_MAJOR, 1), NULL,
 					    cons_dev_groups, "console");
 	if (IS_ERR(consdev))
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 81c148064aa6..40e4928eddaf 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -3539,7 +3539,7 @@ int __init vty_init(const struct file_operations *console_fops)
 	if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
 	    register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
 		panic("Couldn't register /dev/tty0 driver\n");
-	tty0dev = device_create_with_groups(tty_class, NULL,
+	tty0dev = device_create_with_groups(&tty_class, NULL,
 					    MKDEV(TTY_MAJOR, 0), NULL,
 					    vt_dev_groups, "tty0");
 	if (IS_ERR(tty0dev))
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 093935e97f42..121eb4b0d325 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -387,7 +387,7 @@ extern struct ktermios tty_std_termios;
 
 int vcs_init(void);
 
-extern struct class *tty_class;
+extern const struct class tty_class;
 
 /**
  *	tty_kref_get		-	get a tty reference
-- 
cgit v1.2.3


From 7fe1b00d92eaceb83f95200b5114cf5df0919892 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Fri, 31 Mar 2023 07:46:03 +0000
Subject: mtd: spi-nor: Stop exporting spi_nor_restore()

Some SPI NOR controllers that used this method were moved to
drivers/spi/. We don't accept new support for the existing SPI NOR
controllers drivers under drivers/mtd/spi-nor/controllers/ and we
encourage their owners to move the drivers under drivers/spi/.
Make spi_nor_restore() private as we're going to use it just in core.c.

Link: https://lore.kernel.org/r/20230331074606.3559258-8-tudor.ambarus@linaro.org
Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
---
 Documentation/driver-api/mtd/spi-nor.rst | 3 ---
 drivers/mtd/spi-nor/core.c               | 3 +--
 include/linux/mtd/spi-nor.h              | 6 ------
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/mtd/spi-nor.rst b/Documentation/driver-api/mtd/spi-nor.rst
index 4a3adca417fd..c22f8c0f7950 100644
--- a/Documentation/driver-api/mtd/spi-nor.rst
+++ b/Documentation/driver-api/mtd/spi-nor.rst
@@ -63,6 +63,3 @@ The main API is spi_nor_scan(). Before you call the hook, a driver should
 initialize the necessary fields for spi_nor{}. Please see
 drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to spi-fsl-qspi.c
 when you want to write a new driver for a SPI NOR controller.
-Another API is spi_nor_restore(), this is used to restore the status of SPI
-flash chip such as addressing mode. Call it whenever detach the driver from
-device or reboot the system.
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index afb56b2cf8fe..75a7839af034 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -3291,7 +3291,7 @@ static void spi_nor_put_device(struct mtd_info *mtd)
 	module_put(dev->driver->owner);
 }
 
-void spi_nor_restore(struct spi_nor *nor)
+static void spi_nor_restore(struct spi_nor *nor)
 {
 	int ret;
 
@@ -3311,7 +3311,6 @@ void spi_nor_restore(struct spi_nor *nor)
 	if (nor->flags & SNOR_F_SOFT_RESET)
 		spi_nor_soft_reset(nor);
 }
-EXPORT_SYMBOL_GPL(spi_nor_restore);
 
 static const struct flash_info *spi_nor_match_name(struct spi_nor *nor,
 						   const char *name)
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 82547b4b3708..cdcfe0fd2e7d 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -450,10 +450,4 @@ static inline struct device_node *spi_nor_get_flash_node(struct spi_nor *nor)
 int spi_nor_scan(struct spi_nor *nor, const char *name,
 		 const struct spi_nor_hwcaps *hwcaps);
 
-/**
- * spi_nor_restore_addr_mode() - restore the status of SPI NOR
- * @nor:	the spi_nor structure
- */
-void spi_nor_restore(struct spi_nor *nor);
-
 #endif
-- 
cgit v1.2.3


From 59611370f92923089ceb284072d01445164a0191 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Tue, 4 Apr 2023 12:21:05 +0300
Subject: ASoC: SOF: Add flag and state which will be used for DSP-less mode

The DSPless mode of the ASoC/SOF driver can be used for hardware
verification and debug on platforms with HDaudio codecs. The DSP mode is
still needed on existing platforms for SSP, DMIC, SoundWire interfaces
managed by the GP-DMA.

This mode is also helpful to compare the legacy HDaudio driver with the
ASoC/SOF driver wrt. codec management and handling. In theory we use the
same code but differences are sometimes seen on jack detection and event
handling.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Link: https://lore.kernel.org/r/20230404092115.27949-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof.h      |  5 +++++
 sound/soc/sof/core.c     |  9 +++++++++
 sound/soc/sof/sof-priv.h | 11 +++++++++++
 3 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/sound/sof.h b/include/sound/sof.h
index 266e66318f9c..d3c41f87ac31 100644
--- a/include/sound/sof.h
+++ b/include/sound/sof.h
@@ -21,6 +21,7 @@ struct snd_sof_dev;
 /**
  * enum sof_fw_state - DSP firmware state definitions
  * @SOF_FW_BOOT_NOT_STARTED:	firmware boot is not yet started
+ * @SOF_DSPLESS_MODE:		DSP is not used
  * @SOF_FW_BOOT_PREPARE:	preparing for boot (firmware loading for exaqmple)
  * @SOF_FW_BOOT_IN_PROGRESS:	firmware boot is in progress
  * @SOF_FW_BOOT_FAILED:		firmware boot failed
@@ -31,6 +32,7 @@ struct snd_sof_dev;
  */
 enum sof_fw_state {
 	SOF_FW_BOOT_NOT_STARTED = 0,
+	SOF_DSPLESS_MODE,
 	SOF_FW_BOOT_PREPARE,
 	SOF_FW_BOOT_IN_PROGRESS,
 	SOF_FW_BOOT_FAILED,
@@ -130,6 +132,9 @@ struct sof_dev_desc {
 	unsigned int ipc_supported_mask;
 	enum sof_ipc_type ipc_default;
 
+	/* The platform supports DSPless mode */
+	bool dspless_mode_supported;
+
 	/* defaults paths for firmware, library and topology files */
 	const char *default_fw_path[SOF_IPC_TYPE_COUNT];
 	const char *default_lib_path[SOF_IPC_TYPE_COUNT];
diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c
index 7de8673a01ce..06bcae631612 100644
--- a/sound/soc/sof/core.c
+++ b/sound/soc/sof/core.c
@@ -365,6 +365,15 @@ int snd_sof_device_probe(struct device *dev, struct snd_sof_pdata *plat_data)
 	if (sof_core_debug)
 		dev_info(dev, "sof_debug value: %#x\n", sof_core_debug);
 
+	if (sof_debug_check_flag(SOF_DBG_DSPLESS_MODE)) {
+		if (plat_data->desc->dspless_mode_supported) {
+			dev_info(dev, "Switching to DSPless mode\n");
+			sdev->dspless_mode_selected = true;
+		} else {
+			dev_info(dev, "DSPless mode is not supported by the platform\n");
+		}
+	}
+
 	/* check IPC support */
 	if (!(BIT(plat_data->ipc_type) & plat_data->desc->ipc_supported_mask)) {
 		dev_err(dev, "ipc_type %d is not supported on this platform, mask is %#x\n",
diff --git a/sound/soc/sof/sof-priv.h b/sound/soc/sof/sof-priv.h
index 5f919162a555..1170989bea57 100644
--- a/sound/soc/sof/sof-priv.h
+++ b/sound/soc/sof/sof-priv.h
@@ -48,6 +48,7 @@ struct snd_sof_pcm_stream;
 #define SOF_DBG_FORCE_NOCODEC			BIT(10) /* ignore all codec-related
 							 * configurations
 							 */
+#define SOF_DBG_DSPLESS_MODE			BIT(15) /* Do not initialize and use the DSP */
 
 /* Flag definitions used for controlling the DSP dump behavior */
 #define SOF_DBG_DUMP_REGS		BIT(0)
@@ -528,6 +529,16 @@ struct snd_sof_dev {
 	spinlock_t ipc_lock;	/* lock for IPC users */
 	spinlock_t hw_lock;	/* lock for HW IO access */
 
+	/*
+	 * When true the DSP is not used.
+	 * It is set under the following condition:
+	 * User sets the SOF_DBG_DSPLESS_MODE flag in sof_debug module parameter
+	 * and
+	 * the platform advertises that it can support such mode
+	 * pdata->desc->dspless_mode_supported is true.
+	 */
+	bool dspless_mode_selected;
+
 	/* Main, Base firmware image */
 	struct sof_firmware basefw;
 
-- 
cgit v1.2.3


From f1ba4e674febf5c0e9f725a75ca43b7722b4e963 Mon Sep 17 00:00:00 2001
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Date: Thu, 30 Mar 2023 17:49:52 -0400
Subject: virtio-blk: fix to match virtio spec

The merged patch series to support zoned block devices in virtio-blk
is not the most up to date version. The merged patch can be found at

https://lore.kernel.org/linux-block/20221016034127.330942-3-dmitry.fomichev@wdc.com/

but the latest and reviewed version is

https://lore.kernel.org/linux-block/20221110053952.3378990-3-dmitry.fomichev@wdc.com/

The reason is apparently that the correct mailing lists and
maintainers were not copied.

The differences between the two are mostly cleanups, but there is one
change that is very important in terms of compatibility with the
approved virtio-zbd specification.

Before it was approved, the OASIS virtio spec had a change in
VIRTIO_BLK_T_ZONE_APPEND request layout that is not reflected in the
current virtio-blk driver code. In the running code, the status is
the first byte of the in-header that is followed by some pad bytes
and the u64 that carries the sector at which the data has been written
to the zone back to the driver, aka the append sector.

This layout turned out to be problematic for implementing in QEMU and
the request status byte has been eventually made the last byte of the
in-header. The current code doesn't expect that and this causes the
append sector value always come as zero to the block layer. This needs
to be fixed ASAP.

Fixes: 95bfec41bd3d ("virtio-blk: add support for zoned block devices")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Message-Id: <20230330214953.1088216-2-dmitry.fomichev@wdc.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c      | 238 ++++++++++++++++++++++++++--------------
 include/uapi/linux/virtio_blk.h |  18 +--
 2 files changed, 166 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2723eede6f21..4f0dbbb3d4a5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -96,16 +96,14 @@ struct virtblk_req {
 
 		/*
 		 * The zone append command has an extended in header.
-		 * The status field in zone_append_in_hdr must have
-		 * the same offset in virtblk_req as the non-zoned
-		 * status field above.
+		 * The status field in zone_append_in_hdr must always
+		 * be the last byte.
 		 */
 		struct {
+			__virtio64 sector;
 			u8 status;
-			u8 reserved[7];
-			__le64 append_sector;
-		} zone_append_in_hdr;
-	};
+		} zone_append;
+	} in_hdr;
 
 	size_t in_hdr_len;
 
@@ -154,7 +152,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
 			sgs[num_out + num_in++] = vbr->sg_table.sgl;
 	}
 
-	sg_init_one(&in_hdr, &vbr->status, vbr->in_hdr_len);
+	sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len);
 	sgs[num_out + num_in++] = &in_hdr;
 
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
@@ -242,11 +240,14 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 				      struct request *req,
 				      struct virtblk_req *vbr)
 {
-	size_t in_hdr_len = sizeof(vbr->status);
+	size_t in_hdr_len = sizeof(vbr->in_hdr.status);
 	bool unmap = false;
 	u32 type;
 	u64 sector = 0;
 
+	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req)))
+		return BLK_STS_NOTSUPP;
+
 	/* Set fields for all request types */
 	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
 
@@ -287,7 +288,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 	case REQ_OP_ZONE_APPEND:
 		type = VIRTIO_BLK_T_ZONE_APPEND;
 		sector = blk_rq_pos(req);
-		in_hdr_len = sizeof(vbr->zone_append_in_hdr);
+		in_hdr_len = sizeof(vbr->in_hdr.zone_append);
 		break;
 	case REQ_OP_ZONE_RESET:
 		type = VIRTIO_BLK_T_ZONE_RESET;
@@ -297,7 +298,10 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 		type = VIRTIO_BLK_T_ZONE_RESET_ALL;
 		break;
 	case REQ_OP_DRV_IN:
-		/* Out header already filled in, nothing to do */
+		/*
+		 * Out header has already been prepared by the caller (virtblk_get_id()
+		 * or virtblk_submit_zone_report()), nothing to do here.
+		 */
 		return 0;
 	default:
 		WARN_ON_ONCE(1);
@@ -318,16 +322,28 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
 	return 0;
 }
 
+/*
+ * The status byte is always the last byte of the virtblk request
+ * in-header. This helper fetches its value for all in-header formats
+ * that are currently defined.
+ */
+static inline u8 virtblk_vbr_status(struct virtblk_req *vbr)
+{
+	return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1);
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
-	blk_status_t status = virtblk_result(vbr->status);
+	blk_status_t status = virtblk_result(virtblk_vbr_status(vbr));
+	struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
 
 	virtblk_unmap_data(req, vbr);
 	virtblk_cleanup_cmd(req);
 
 	if (req_op(req) == REQ_OP_ZONE_APPEND)
-		req->__sector = le64_to_cpu(vbr->zone_append_in_hdr.append_sector);
+		req->__sector = virtio64_to_cpu(vblk->vdev,
+						vbr->in_hdr.zone_append.sector);
 
 	blk_mq_end_request(req, status);
 }
@@ -355,7 +371,7 @@ static int virtblk_handle_req(struct virtio_blk_vq *vq,
 
 		if (likely(!blk_should_fake_timeout(req->q)) &&
 		    !blk_mq_complete_request_remote(req) &&
-		    !blk_mq_add_to_batch(req, iob, vbr->status,
+		    !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
 					 virtblk_complete_batch))
 			virtblk_request_done(req);
 		req_done++;
@@ -550,7 +566,6 @@ static void virtio_queue_rqs(struct request **rqlist)
 #ifdef CONFIG_BLK_DEV_ZONED
 static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
 					  unsigned int nr_zones,
-					  unsigned int zone_sectors,
 					  size_t *buflen)
 {
 	struct request_queue *q = vblk->disk->queue;
@@ -558,7 +573,7 @@ static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
 	void *buf;
 
 	nr_zones = min_t(unsigned int, nr_zones,
-			 get_capacity(vblk->disk) >> ilog2(zone_sectors));
+			 get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors));
 
 	bufsize = sizeof(struct virtio_blk_zone_report) +
 		nr_zones * sizeof(struct virtio_blk_zone_descriptor);
@@ -592,7 +607,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
 		return PTR_ERR(req);
 
 	vbr = blk_mq_rq_to_pdu(req);
-	vbr->in_hdr_len = sizeof(vbr->status);
+	vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
 	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
 	vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
 
@@ -601,7 +616,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
 		goto out;
 
 	blk_execute_rq(req, false);
-	err = blk_status_to_errno(virtblk_result(vbr->status));
+	err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
 out:
 	blk_mq_free_request(req);
 	return err;
@@ -609,29 +624,72 @@ out:
 
 static int virtblk_parse_zone(struct virtio_blk *vblk,
 			       struct virtio_blk_zone_descriptor *entry,
-			       unsigned int idx, unsigned int zone_sectors,
-			       report_zones_cb cb, void *data)
+			       unsigned int idx, report_zones_cb cb, void *data)
 {
 	struct blk_zone zone = { };
 
-	if (entry->z_type != VIRTIO_BLK_ZT_SWR &&
-	    entry->z_type != VIRTIO_BLK_ZT_SWP &&
-	    entry->z_type != VIRTIO_BLK_ZT_CONV) {
-		dev_err(&vblk->vdev->dev, "invalid zone type %#x\n",
-			entry->z_type);
-		return -EINVAL;
+	zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start);
+	if (zone.start + vblk->zone_sectors <= get_capacity(vblk->disk))
+		zone.len = vblk->zone_sectors;
+	else
+		zone.len = get_capacity(vblk->disk) - zone.start;
+	zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap);
+	zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);
+
+	switch (entry->z_type) {
+	case VIRTIO_BLK_ZT_SWR:
+		zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+		break;
+	case VIRTIO_BLK_ZT_SWP:
+		zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF;
+		break;
+	case VIRTIO_BLK_ZT_CONV:
+		zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+		break;
+	default:
+		dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n",
+			zone.start, entry->z_type);
+		return -EIO;
 	}
 
-	zone.type = entry->z_type;
-	zone.cond = entry->z_state;
-	zone.len = zone_sectors;
-	zone.capacity = le64_to_cpu(entry->z_cap);
-	zone.start = le64_to_cpu(entry->z_start);
-	if (zone.cond == BLK_ZONE_COND_FULL)
+	switch (entry->z_state) {
+	case VIRTIO_BLK_ZS_EMPTY:
+		zone.cond = BLK_ZONE_COND_EMPTY;
+		break;
+	case VIRTIO_BLK_ZS_CLOSED:
+		zone.cond = BLK_ZONE_COND_CLOSED;
+		break;
+	case VIRTIO_BLK_ZS_FULL:
+		zone.cond = BLK_ZONE_COND_FULL;
 		zone.wp = zone.start + zone.len;
-	else
-		zone.wp = le64_to_cpu(entry->z_wp);
+		break;
+	case VIRTIO_BLK_ZS_EOPEN:
+		zone.cond = BLK_ZONE_COND_EXP_OPEN;
+		break;
+	case VIRTIO_BLK_ZS_IOPEN:
+		zone.cond = BLK_ZONE_COND_IMP_OPEN;
+		break;
+	case VIRTIO_BLK_ZS_NOT_WP:
+		zone.cond = BLK_ZONE_COND_NOT_WP;
+		break;
+	case VIRTIO_BLK_ZS_RDONLY:
+		zone.cond = BLK_ZONE_COND_READONLY;
+		zone.wp = ULONG_MAX;
+		break;
+	case VIRTIO_BLK_ZS_OFFLINE:
+		zone.cond = BLK_ZONE_COND_OFFLINE;
+		zone.wp = ULONG_MAX;
+		break;
+	default:
+		dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n",
+			zone.start, entry->z_state);
+		return -EIO;
+	}
 
+	/*
+	 * The callback below checks the validity of the reported
+	 * entry data, no need to further validate it here.
+	 */
 	return cb(&zone, idx, data);
 }
 
@@ -641,39 +699,47 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 {
 	struct virtio_blk *vblk = disk->private_data;
 	struct virtio_blk_zone_report *report;
-	unsigned int zone_sectors = vblk->zone_sectors;
-	unsigned int nz, i;
-	int ret, zone_idx = 0;
+	unsigned long long nz, i;
 	size_t buflen;
+	unsigned int zone_idx = 0;
+	int ret;
 
 	if (WARN_ON_ONCE(!vblk->zone_sectors))
 		return -EOPNOTSUPP;
 
-	report = virtblk_alloc_report_buffer(vblk, nr_zones,
-					     zone_sectors, &buflen);
+	report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen);
 	if (!report)
 		return -ENOMEM;
 
+	mutex_lock(&vblk->vdev_mutex);
+
+	if (!vblk->vdev) {
+		ret = -ENXIO;
+		goto fail_report;
+	}
+
 	while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) {
 		memset(report, 0, buflen);
 
 		ret = virtblk_submit_zone_report(vblk, (char *)report,
 						 buflen, sector);
-		if (ret) {
-			if (ret > 0)
-				ret = -EIO;
-			goto out_free;
-		}
-		nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones);
+		if (ret)
+			goto fail_report;
+
+		nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones),
+			   nr_zones);
 		if (!nz)
 			break;
 
 		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
 			ret = virtblk_parse_zone(vblk, &report->zones[i],
-						 zone_idx, zone_sectors, cb, data);
+						 zone_idx, cb, data);
 			if (ret)
-				goto out_free;
-			sector = le64_to_cpu(report->zones[i].z_start) + zone_sectors;
+				goto fail_report;
+
+			sector = virtio64_to_cpu(vblk->vdev,
+						 report->zones[i].z_start) +
+				 vblk->zone_sectors;
 			zone_idx++;
 		}
 	}
@@ -682,7 +748,8 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 		ret = zone_idx;
 	else
 		ret = -EINVAL;
-out_free:
+fail_report:
+	mutex_unlock(&vblk->vdev_mutex);
 	kvfree(report);
 	return ret;
 }
@@ -691,20 +758,28 @@ static void virtblk_revalidate_zones(struct virtio_blk *vblk)
 {
 	u8 model;
 
-	if (!vblk->zone_sectors)
-		return;
-
 	virtio_cread(vblk->vdev, struct virtio_blk_config,
 		     zoned.model, &model);
-	if (!blk_revalidate_disk_zones(vblk->disk, NULL))
-		set_capacity_and_notify(vblk->disk, 0);
+	switch (model) {
+	default:
+		dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model);
+		fallthrough;
+	case VIRTIO_BLK_Z_NONE:
+	case VIRTIO_BLK_Z_HA:
+		disk_set_zoned(vblk->disk, BLK_ZONED_NONE);
+		return;
+	case VIRTIO_BLK_Z_HM:
+		WARN_ON_ONCE(!vblk->zone_sectors);
+		if (!blk_revalidate_disk_zones(vblk->disk, NULL))
+			set_capacity_and_notify(vblk->disk, 0);
+	}
 }
 
 static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 				       struct virtio_blk *vblk,
 				       struct request_queue *q)
 {
-	u32 v;
+	u32 v, wg;
 	u8 model;
 	int ret;
 
@@ -713,16 +788,11 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	switch (model) {
 	case VIRTIO_BLK_Z_NONE:
+	case VIRTIO_BLK_Z_HA:
+		/* Present the host-aware device as non-zoned */
 		return 0;
 	case VIRTIO_BLK_Z_HM:
 		break;
-	case VIRTIO_BLK_Z_HA:
-		/*
-		 * Present the host-aware device as a regular drive.
-		 * TODO It is possible to add an option to make it appear
-		 * in the system as a zoned drive.
-		 */
-		return 0;
 	default:
 		dev_err(&vdev->dev, "unsupported zone model %d\n", model);
 		return -EINVAL;
@@ -735,32 +805,31 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	virtio_cread(vdev, struct virtio_blk_config,
 		     zoned.max_open_zones, &v);
-	disk_set_max_open_zones(vblk->disk, le32_to_cpu(v));
-
-	dev_dbg(&vdev->dev, "max open zones = %u\n", le32_to_cpu(v));
+	disk_set_max_open_zones(vblk->disk, v);
+	dev_dbg(&vdev->dev, "max open zones = %u\n", v);
 
 	virtio_cread(vdev, struct virtio_blk_config,
 		     zoned.max_active_zones, &v);
-	disk_set_max_active_zones(vblk->disk, le32_to_cpu(v));
-	dev_dbg(&vdev->dev, "max active zones = %u\n", le32_to_cpu(v));
+	disk_set_max_active_zones(vblk->disk, v);
+	dev_dbg(&vdev->dev, "max active zones = %u\n", v);
 
 	virtio_cread(vdev, struct virtio_blk_config,
-		     zoned.write_granularity, &v);
-	if (!v) {
+		     zoned.write_granularity, &wg);
+	if (!wg) {
 		dev_warn(&vdev->dev, "zero write granularity reported\n");
 		return -ENODEV;
 	}
-	blk_queue_physical_block_size(q, le32_to_cpu(v));
-	blk_queue_io_min(q, le32_to_cpu(v));
+	blk_queue_physical_block_size(q, wg);
+	blk_queue_io_min(q, wg);
 
-	dev_dbg(&vdev->dev, "write granularity = %u\n", le32_to_cpu(v));
+	dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
 
 	/*
 	 * virtio ZBD specification doesn't require zones to be a power of
 	 * two sectors in size, but the code in this driver expects that.
 	 */
-	virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, &v);
-	vblk->zone_sectors = le32_to_cpu(v);
+	virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors,
+		     &vblk->zone_sectors);
 	if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) {
 		dev_err(&vdev->dev,
 			"zoned device with non power of two zone size %u\n",
@@ -783,8 +852,15 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 			dev_warn(&vdev->dev, "zero max_append_sectors reported\n");
 			return -ENODEV;
 		}
-		blk_queue_max_zone_append_sectors(q, le32_to_cpu(v));
-		dev_dbg(&vdev->dev, "max append sectors = %u\n", le32_to_cpu(v));
+		if ((v << SECTOR_SHIFT) < wg) {
+			dev_err(&vdev->dev,
+				"write granularity %u exceeds max_append_sectors %u limit\n",
+				wg, v);
+			return -ENODEV;
+		}
+
+		blk_queue_max_zone_append_sectors(q, v);
+		dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 	}
 
 	return ret;
@@ -794,6 +870,7 @@ static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
 {
 	return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED);
 }
+
 #else
 
 /*
@@ -801,9 +878,11 @@ static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev)
  * We only need to define a few symbols to avoid compilation errors.
  */
 #define virtblk_report_zones       NULL
+
 static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
 {
 }
+
 static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
 			struct virtio_blk *vblk, struct request_queue *q)
 {
@@ -831,7 +910,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		return PTR_ERR(req);
 
 	vbr = blk_mq_rq_to_pdu(req);
-	vbr->in_hdr_len = sizeof(vbr->status);
+	vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
 	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 	vbr->out_hdr.sector = 0;
 
@@ -840,7 +919,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		goto out;
 
 	blk_execute_rq(req, false);
-	err = blk_status_to_errno(virtblk_result(vbr->status));
+	err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
 out:
 	blk_mq_free_request(req);
 	return err;
@@ -1504,9 +1583,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 			goto out_cleanup_disk;
 	}
 
-	dev_info(&vdev->dev, "blk config size: %zu\n",
-		sizeof(struct virtio_blk_config));
-
 	err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
 	if (err)
 		goto out_cleanup_disk;
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 5af2a0300bb9..3744e4da1b2a 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -140,11 +140,11 @@ struct virtio_blk_config {
 
 	/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
 	struct virtio_blk_zoned_characteristics {
-		__le32 zone_sectors;
-		__le32 max_open_zones;
-		__le32 max_active_zones;
-		__le32 max_append_sectors;
-		__le32 write_granularity;
+		__virtio32 zone_sectors;
+		__virtio32 max_open_zones;
+		__virtio32 max_active_zones;
+		__virtio32 max_append_sectors;
+		__virtio32 write_granularity;
 		__u8 model;
 		__u8 unused2[3];
 	} zoned;
@@ -241,11 +241,11 @@ struct virtio_blk_outhdr {
  */
 struct virtio_blk_zone_descriptor {
 	/* Zone capacity */
-	__le64 z_cap;
+	__virtio64 z_cap;
 	/* The starting sector of the zone */
-	__le64 z_start;
+	__virtio64 z_start;
 	/* Zone write pointer position in sectors */
-	__le64 z_wp;
+	__virtio64 z_wp;
 	/* Zone type */
 	__u8 z_type;
 	/* Zone state */
@@ -254,7 +254,7 @@ struct virtio_blk_zone_descriptor {
 };
 
 struct virtio_blk_zone_report {
-	__le64 nr_zones;
+	__virtio64 nr_zones;
 	__u8 reserved[56];
 	struct virtio_blk_zone_descriptor zones[];
 };
-- 
cgit v1.2.3


From 8e15c0e71b8ae64fb7163532860f8d608165281f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 4 Apr 2023 13:39:46 +0100
Subject: io_uring/rsrc: keep cached refs per node

We cache refs of the current node (i.e. ctx->rsrc_node) in
ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the
cached refs in struct io_rsrc_node for now. It's a prep patch and
shouldn't change anything in practise.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/9edc3669c1d71b06c2dca78b2b2b8bb9292738b9.1680576071.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/rsrc.c                | 15 +++++++++------
 io_uring/rsrc.h                | 16 +++++++++-------
 3 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 561fa421c453..a0a5b5964d3a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -240,7 +240,6 @@ struct io_ring_ctx {
 		 * uring_lock, and updated through io_uring_register(2)
 		 */
 		struct io_rsrc_node	*rsrc_node;
-		int			rsrc_cached_refs;
 		atomic_t		cancel_seq;
 		struct io_file_table	file_table;
 		unsigned		nr_user_files;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f2da9e251e3f..1e7c960737fd 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -36,9 +36,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
-	if (ctx->rsrc_cached_refs) {
-		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
-		ctx->rsrc_cached_refs = 0;
+	struct io_rsrc_node *node = ctx->rsrc_node;
+
+	if (node && node->cached_refs) {
+		io_rsrc_put_node(node, node->cached_refs);
+		node->cached_refs = 0;
 	}
 }
 
@@ -151,11 +153,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	*slot = NULL;
 }
 
-void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 	__must_hold(&ctx->uring_lock)
 {
-	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
-	refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs);
+	node->cached_refs += IO_RSRC_REF_BATCH;
+	refcount_add(IO_RSRC_REF_BATCH, &node->refs);
 }
 
 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
@@ -300,6 +302,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 	if (!ctx->rsrc_node) {
 		ctx->rsrc_node = ctx->rsrc_backup_node;
 		ctx->rsrc_backup_node = NULL;
+		ctx->rsrc_node->cached_refs = 0;
 	}
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 1467b31843bc..950535e2b9f4 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -43,6 +43,7 @@ struct io_rsrc_node {
 	struct io_rsrc_data		*rsrc_data;
 	struct llist_node		llist;
 	bool				done;
+	int				cached_refs;
 };
 
 struct io_mapped_ubuf {
@@ -56,7 +57,7 @@ struct io_mapped_ubuf {
 void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_put_work(struct work_struct *work);
-void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
 void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
@@ -128,17 +129,18 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
 
 	if (node) {
 		if (node == ctx->rsrc_node)
-			ctx->rsrc_cached_refs++;
+			node->cached_refs++;
 		else
 			io_rsrc_put_node(node, 1);
 	}
 }
 
-static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx)
+static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
+				       struct io_rsrc_node *node)
 {
-	ctx->rsrc_cached_refs--;
-	if (unlikely(ctx->rsrc_cached_refs < 0))
-		io_rsrc_refs_refill(ctx);
+	node->cached_refs--;
+	if (unlikely(node->cached_refs < 0))
+		io_rsrc_refs_refill(ctx, node);
 }
 
 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
@@ -151,7 +153,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
 		lockdep_assert_held(&ctx->uring_lock);
 
 		req->rsrc_node = ctx->rsrc_node;
-		io_charge_rsrc_node(ctx);
+		io_charge_rsrc_node(ctx, ctx->rsrc_node);
 		io_ring_submit_unlock(ctx, issue_flags);
 	}
 }
-- 
cgit v1.2.3


From 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 4 Apr 2023 13:39:50 +0100
Subject: io_uring/rsrc: kill rsrc_ref_lock

We use ->rsrc_ref_lock spinlock to protect ->rsrc_ref_list in
io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means
io_rsrc_node_ref_zero() is not executed from the irq context as an RCU
callback anymore, and we also put it under ->uring_lock.
io_rsrc_node_switch(), which queues up nodes into the list, is also
protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6b60af883c263551190b526a55ff2c9d5ae07141.1680576071.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 2 +-
 io_uring/io_uring.c            | 1 -
 io_uring/rsrc.c                | 5 -----
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index a0a5b5964d3a..9492889f00c0 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -333,8 +333,8 @@ struct io_ring_ctx {
 	struct delayed_work		rsrc_put_work;
 	struct callback_head		rsrc_put_tw;
 	struct llist_head		rsrc_put_llist;
+	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
-	spinlock_t			rsrc_ref_lock;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 36a76c7b34f0..764df5694d73 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
-	spin_lock_init(&ctx->rsrc_ref_lock);
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1237fc77c250..e122b6e5f9c5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -209,11 +209,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 	__must_hold(&node->rsrc_data->ctx->uring_lock)
 {
 	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
-	unsigned long flags;
 	bool first_add = false;
 	unsigned long delay = HZ;
 
-	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
 	node->done = true;
 
 	/* if we are mid-quiesce then do not delay */
@@ -229,7 +227,6 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 		list_del(&node->node);
 		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
 	}
-	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
 
 	if (!first_add)
 		return;
@@ -268,9 +265,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
 
 		rsrc_node->rsrc_data = data_to_kill;
-		spin_lock_irq(&ctx->rsrc_ref_lock);
 		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
-		spin_unlock_irq(&ctx->rsrc_ref_lock);
 
 		atomic_inc(&data_to_kill->refs);
 		/* put master ref */
-- 
cgit v1.2.3


From 36b9818a5a84cb7c977fb723babca1c8d74f288f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 4 Apr 2023 13:39:53 +0100
Subject: io_uring/rsrc: don't offload node free

struct delayed_work rsrc_put_work was previously used to offload node
freeing because io_rsrc_node_ref_zero() was previously called by RCU in
the IRQ context. Now, as percpu refcounting is gone, we can do it
eagerly at the spot without pushing it to a worker.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/13fb1aac1e8d068ad8fd4a0c6d0d157ab61b90c0.1680576071.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 ---
 io_uring/io_uring.c            |  6 -----
 io_uring/rsrc.c                | 59 +++---------------------------------------
 3 files changed, 4 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9492889f00c0..47496059e13a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,9 +330,6 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
-	struct delayed_work		rsrc_put_work;
-	struct callback_head		rsrc_put_tw;
-	struct llist_head		rsrc_put_llist;
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 764df5694d73..d6a0025afc31 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -326,9 +326,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
-	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
-	init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
-	init_llist_head(&ctx->rsrc_put_llist);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
@@ -2821,11 +2818,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		io_rsrc_node_destroy(ctx->rsrc_node);
 	if (ctx->rsrc_backup_node)
 		io_rsrc_node_destroy(ctx->rsrc_backup_node);
-	flush_delayed_work(&ctx->rsrc_put_work);
-	flush_delayed_work(&ctx->fallback_work);
 
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock) {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 95e71300bb35..0f4e245dee1b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -145,15 +145,8 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
 {
 	struct io_ring_ctx *ctx = rsrc_data->ctx;
 
-	if (prsrc->tag) {
-		if (ctx->flags & IORING_SETUP_IOPOLL) {
-			mutex_lock(&ctx->uring_lock);
-			io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
-			mutex_unlock(&ctx->uring_lock);
-		} else {
-			io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
-		}
-	}
+	if (prsrc->tag)
+		io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
 	rsrc_data->do_put(ctx, prsrc);
 }
 
@@ -176,32 +169,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 		complete(&rsrc_data->done);
 }
 
-void io_rsrc_put_work(struct work_struct *work)
-{
-	struct io_ring_ctx *ctx;
-	struct llist_node *node;
-
-	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
-	node = llist_del_all(&ctx->rsrc_put_llist);
-
-	while (node) {
-		struct io_rsrc_node *ref_node;
-		struct llist_node *next = node->next;
-
-		ref_node = llist_entry(node, struct io_rsrc_node, llist);
-		__io_rsrc_put_work(ref_node);
-		node = next;
-	}
-}
-
-void io_rsrc_put_tw(struct callback_head *cb)
-{
-	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
-					       rsrc_put_tw);
-
-	io_rsrc_put_work(&ctx->rsrc_put_work.work);
-}
-
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
 	if (data && !atomic_dec_and_test(&data->refs))
@@ -217,34 +184,18 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 	__must_hold(&node->rsrc_data->ctx->uring_lock)
 {
 	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
-	bool first_add = false;
-	unsigned long delay = HZ;
 
 	node->done = true;
-
-	/* if we are mid-quiesce then do not delay */
-	if (node->rsrc_data->quiesce)
-		delay = 0;
-
 	while (!list_empty(&ctx->rsrc_ref_list)) {
 		node = list_first_entry(&ctx->rsrc_ref_list,
 					    struct io_rsrc_node, node);
 		/* recycle ref nodes in order */
 		if (!node->done)
 			break;
-		list_del(&node->node);
-		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
-	}
 
-	if (!first_add)
-		return;
-
-	if (ctx->submitter_task) {
-		if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
-				   ctx->notify_method))
-			return;
+		list_del(&node->node);
+		__io_rsrc_put_work(node);
 	}
-	mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 }
 
 static struct io_rsrc_node *io_rsrc_node_alloc(void)
@@ -320,13 +271,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		if (ret < 0) {
 			atomic_inc(&data->refs);
 			/* wait for all works potentially completing data->done */
-			flush_delayed_work(&ctx->rsrc_put_work);
 			reinit_completion(&data->done);
 			mutex_lock(&ctx->uring_lock);
 			break;
 		}
 
-		flush_delayed_work(&ctx->rsrc_put_work);
 		ret = wait_for_completion_interruptible(&data->done);
 		if (!ret) {
 			mutex_lock(&ctx->uring_lock);
-- 
cgit v1.2.3


From 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 4 Apr 2023 13:39:54 +0100
Subject: io_uring/rsrc: cache struct io_rsrc_node

Add allocation cache for struct io_rsrc_node, it's always allocated and
put under ->uring_lock, so it doesn't need any extra synchronisation
around caches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/252a9d9ef9654e6467af30fdc02f57c0118fb76e.1680576071.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 11 +++++++++--
 io_uring/rsrc.c                | 23 +++++++++++++++--------
 io_uring/rsrc.h                |  9 +++++++--
 4 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 47496059e13a..5d772e36e7fc 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -332,6 +332,7 @@ struct io_ring_ctx {
 
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
+	struct io_alloc_cache		rsrc_node_cache;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d6a0025afc31..419d6f42935f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
 	io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
 	init_completion(&ctx->ref_comp);
@@ -2790,6 +2791,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
+static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
+{
+	kfree(container_of(entry, struct io_rsrc_node, cache));
+}
+
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
@@ -2815,9 +2821,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
 	/* there are no registered resources left, nobody uses it */
 	if (ctx->rsrc_node)
-		io_rsrc_node_destroy(ctx->rsrc_node);
+		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
 	if (ctx->rsrc_backup_node)
-		io_rsrc_node_destroy(ctx->rsrc_backup_node);
+		io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
 
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 
@@ -2829,6 +2835,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 #endif
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
+	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0f4e245dee1b..345631091d80 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 		kfree(prsrc);
 	}
 
-	io_rsrc_node_destroy(ref_node);
+	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
 	if (atomic_dec_and_test(&rsrc_data->refs))
 		complete(&rsrc_data->done);
 }
@@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)
 		wait_for_completion(&data->done);
 }
 
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	kfree(ref_node);
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+		kfree(node);
 }
 
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
@@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 	}
 }
 
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_rsrc_node *ref_node;
+	struct io_cache_entry *entry;
 
-	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-	if (!ref_node)
-		return NULL;
+	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (entry) {
+		ref_node = container_of(entry, struct io_rsrc_node, cache);
+	} else {
+		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+		if (!ref_node)
+			return NULL;
+	}
 
 	ref_node->refs = 1;
 	INIT_LIST_HEAD(&ref_node->node);
@@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 {
 	if (ctx->rsrc_backup_node)
 		return 0;
-	ctx->rsrc_backup_node = io_rsrc_node_alloc();
+	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
 	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 11703082d125..3b9f4c57c47c 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,6 +4,8 @@
 
 #include <net/af_unix.h>
 
+#include "alloc_cache.h"
+
 #define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
 #define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
 #define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
@@ -37,8 +39,11 @@ struct io_rsrc_data {
 };
 
 struct io_rsrc_node {
+	union {
+		struct io_cache_entry		cache;
+		struct io_rsrc_data		*rsrc_data;
+	};
 	struct list_head		node;
-	struct io_rsrc_data		*rsrc_data;
 	struct llist_node		llist;
 	int				refs;
 	bool				done;
@@ -65,7 +70,7 @@ void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_put_work(struct work_struct *work);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 			  struct io_rsrc_node *node, void *rsrc);
-- 
cgit v1.2.3


From 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 4 Apr 2023 13:39:57 +0100
Subject: io_uring/rsrc: add custom limit for node caching

The number of entries in the rsrc node cache is limited to 512, which
still seems unnecessarily large. Add per cache thresholds and set to
to 32 for the rsrc node cache.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d0cd538b944dac0bf878e276fc0199f21e6bccea.1680576071.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 1 +
 io_uring/alloc_cache.h         | 6 ++++--
 io_uring/io_uring.c            | 9 ++++++---
 io_uring/rsrc.h                | 2 ++
 4 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5d772e36e7fc..4a6ce03a4903 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -190,6 +190,7 @@ struct io_ev_fd {
 struct io_alloc_cache {
 	struct io_wq_work_node	list;
 	unsigned int		nr_cached;
+	unsigned int		max_cached;
 	size_t			elem_size;
 };
 
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 2fbecaa3a1ba..851a527afb5e 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -13,7 +13,7 @@ struct io_cache_entry {
 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 				      struct io_cache_entry *entry)
 {
-	if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
+	if (cache->nr_cached < cache->max_cached) {
 		cache->nr_cached++;
 		wq_stack_add_head(&entry->node, &cache->list);
 		/* KASAN poisons object */
@@ -38,10 +38,12 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
 	return NULL;
 }
 
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size)
+static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+				       unsigned max_nr, size_t size)
 {
 	cache->list.next = NULL;
 	cache->nr_cached = 0;
+	cache->max_cached = max_nr;
 	cache->elem_size = size;
 }
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index da36fa1eeac9..ae90d2753e0d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,9 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
-	io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
-	io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
+	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+			    sizeof(struct io_rsrc_node));
+	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct async_poll));
+	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct io_async_msghdr));
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 7ab9b2b2e757..8729f2fee256 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -6,6 +6,8 @@
 
 #include "alloc_cache.h"
 
+#define IO_NODE_ALLOC_CACHE_MAX 32
+
 #define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
 #define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
 #define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
-- 
cgit v1.2.3


From 3e67cb8a3c6251c86e5d058d8ee4e1909bc25af0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 08:06:56 -0700
Subject: srcu: Add whitespace to __SRCU_STRUCT_INIT() & __DEFINE_SRCU()

This is a whitespace-only commit with no change in functionality.
Its purpose is to prepare for later commits that: (1) Cause statically
allocated srcu_struct structures to rely on compile-time initialization
and (2) Move fields from the srcu_struct structure to a new srcu_usage
structure.

Cc: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 558057b517b7..488d0e5d1ba3 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -108,13 +108,13 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
-#define __SRCU_STRUCT_INIT(name, pcpu_name)				\
-{									\
-	.sda = &pcpu_name,						\
-	.lock = __SPIN_LOCK_UNLOCKED(name.lock),			\
-	.srcu_gp_seq_needed = -1UL,					\
-	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),		\
-	__SRCU_DEP_MAP_INIT(name)					\
+#define __SRCU_STRUCT_INIT(name, pcpu_name)							\
+{												\
+	.sda = &pcpu_name,									\
+	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
+	.srcu_gp_seq_needed = -1UL,								\
+	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
+	__SRCU_DEP_MAP_INIT(name)								\
 }
 
 /*
@@ -137,15 +137,15 @@ struct srcu_struct {
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
 #ifdef MODULE
-# define __DEFINE_SRCU(name, is_static)					\
-	is_static struct srcu_struct name;				\
-	extern struct srcu_struct * const __srcu_struct_##name;		\
-	struct srcu_struct * const __srcu_struct_##name			\
+# define __DEFINE_SRCU(name, is_static)								\
+	is_static struct srcu_struct name;							\
+	extern struct srcu_struct * const __srcu_struct_##name;					\
+	struct srcu_struct * const __srcu_struct_##name						\
 		__section("___srcu_struct_ptrs") = &name
 #else
-# define __DEFINE_SRCU(name, is_static)					\
-	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);	\
-	is_static struct srcu_struct name =				\
+# define __DEFINE_SRCU(name, is_static)								\
+	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);				\
+	is_static struct srcu_struct name =							\
 		__SRCU_STRUCT_INIT(name, name##_srcu_data)
 #endif
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
-- 
cgit v1.2.3


From f4d01a259374ef358cd6b00a96b4dfc0fb05a844 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 13:28:04 -0700
Subject: srcu: Use static init for statically allocated in-module srcu_struct

Further shrinking the srcu_struct structure is eased by requiring
that in-module srcu_struct structures rely more heavily on static
initialization.  In particular, this preserves the property that
a module-load-time srcu_struct initialization can fail only due
to memory-allocation failure of the per-CPU srcu_data structures.
It might also slightly improve robustness by keeping the number of memory
allocations that must succeed down percpu_alloc() call.

This is in preparation for splitting an srcu_usage structure out
of the srcu_struct structure.

[ paulmck: Fold in qiang1.zhang@intel.com feedback. ]

Cc: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 19 ++++++++++++++-----
 kernel/rcu/srcutree.c    | 19 +++++++++++++------
 2 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 488d0e5d1ba3..3ce6deee1dbe 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -108,15 +108,24 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
-#define __SRCU_STRUCT_INIT(name, pcpu_name)							\
-{												\
-	.sda = &pcpu_name,									\
+#define __SRCU_STRUCT_INIT_COMMON(name)								\
 	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
 	.srcu_gp_seq_needed = -1UL,								\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
-	__SRCU_DEP_MAP_INIT(name)								\
+	__SRCU_DEP_MAP_INIT(name)
+
+#define __SRCU_STRUCT_INIT_MODULE(name)								\
+{												\
+	__SRCU_STRUCT_INIT_COMMON(name)								\
 }
 
+#define __SRCU_STRUCT_INIT(name, pcpu_name)							\
+{												\
+	.sda = &pcpu_name,									\
+	__SRCU_STRUCT_INIT_COMMON(name)								\
+}
+
+
 /*
  * Define and initialize a srcu struct at build time.
  * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
@@ -138,7 +147,7 @@ struct srcu_struct {
  */
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)								\
-	is_static struct srcu_struct name;							\
+	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name);			\
 	extern struct srcu_struct * const __srcu_struct_##name;					\
 	struct srcu_struct * const __srcu_struct_##name						\
 		__section("___srcu_struct_ptrs") = &name
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ab4ee58af84b..7e6e7dfb1a87 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1873,13 +1873,14 @@ void __init srcu_init(void)
 static int srcu_module_coming(struct module *mod)
 {
 	int i;
+	struct srcu_struct *ssp;
 	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
-	int ret;
 
 	for (i = 0; i < mod->num_srcu_structs; i++) {
-		ret = init_srcu_struct(*(sspp++));
-		if (WARN_ON_ONCE(ret))
-			return ret;
+		ssp = *(sspp++);
+		ssp->sda = alloc_percpu(struct srcu_data);
+		if (WARN_ON_ONCE(!ssp->sda))
+			return -ENOMEM;
 	}
 	return 0;
 }
@@ -1888,10 +1889,16 @@ static int srcu_module_coming(struct module *mod)
 static void srcu_module_going(struct module *mod)
 {
 	int i;
+	struct srcu_struct *ssp;
 	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
 
-	for (i = 0; i < mod->num_srcu_structs; i++)
-		cleanup_srcu_struct(*(sspp++));
+	for (i = 0; i < mod->num_srcu_structs; i++) {
+		ssp = *(sspp++);
+		if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed)) &&
+		    !WARN_ON_ONCE(!ssp->sda_is_static))
+			cleanup_srcu_struct(ssp);
+		free_percpu(ssp->sda);
+	}
 }
 
 /* Handle one module, either coming or going. */
-- 
cgit v1.2.3


From 95433f7263011e0e6e83caef85d98896dd99cab7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 16 Mar 2023 17:58:51 -0700
Subject: srcu: Begin offloading srcu_struct fields to srcu_update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current srcu_struct structure is on the order of 200 bytes in size
(depending on architecture and .config), which is much better than the
old-style 26K bytes, but still all too inconvenient when one is trying
to achieve good cache locality on a fastpath involving SRCU readers.

However, only a few fields in srcu_struct are used by SRCU readers.
The remaining fields could be offloaded to a new srcu_update
structure, thus shrinking the srcu_struct structure down to a few
tens of bytes.  This commit begins this noble quest, a quest that is
complicated by open-coded initialization of the srcu_struct within the
srcu_notifier_head structure.  This complication is addressed by updating
the srcu_notifier_head structure's open coding, given that there does
not appear to be a straightforward way of abstracting that initialization.

This commit moves only the ->node pointer to srcu_update.  Later commits
will move additional fields.

[ paulmck: Fold in qiang1.zhang@intel.com's memory-leak fix. ]

Link: https://lore.kernel.org/all/20230320055751.4120251-1-qiang1.zhang@intel.com/
Suggested-by: Christoph Hellwig <hch@lst.de>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/notifier.h |  5 ++++-
 include/linux/srcutiny.h |  6 +++---
 include/linux/srcutree.h | 27 ++++++++++++++++++---------
 kernel/rcu/rcu.h         |  6 ++++--
 kernel/rcu/srcutree.c    | 28 +++++++++++++++++++---------
 5 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index aef88c2d1173..2aba75145144 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -73,6 +73,9 @@ struct raw_notifier_head {
 
 struct srcu_notifier_head {
 	struct mutex mutex;
+#ifdef CONFIG_TREE_SRCU
+	struct srcu_usage srcuu;
+#endif
 	struct srcu_struct srcu;
 	struct notifier_block __rcu *head;
 };
@@ -107,7 +110,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 	{							\
 		.mutex = __MUTEX_INITIALIZER(name.mutex),	\
 		.head = NULL,					\
-		.srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),	\
+		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
 	}
 
 #define ATOMIC_NOTIFIER_HEAD(name)				\
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 5aa5e0faf6a1..ebd72491af99 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -31,7 +31,7 @@ struct srcu_struct {
 
 void srcu_drive_gp(struct work_struct *wp);
 
-#define __SRCU_STRUCT_INIT(name, __ignored)				\
+#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored)			\
 {									\
 	.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq),	\
 	.srcu_cb_tail = &name.srcu_cb_head,				\
@@ -44,9 +44,9 @@ void srcu_drive_gp(struct work_struct *wp);
  * Tree SRCU, which needs some per-CPU data.
  */
 #define DEFINE_SRCU(name) \
-	struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
+	struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name)
 #define DEFINE_STATIC_SRCU(name) \
-	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
+	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name)
 
 void synchronize_srcu(struct srcu_struct *ssp);
 
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 3ce6deee1dbe..276f325f1296 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -57,11 +57,17 @@ struct srcu_node {
 	int grphi;				/* Biggest CPU for node. */
 };
 
+/*
+ * Per-SRCU-domain structure, update-side data linked from srcu_struct.
+ */
+struct srcu_usage {
+	struct srcu_node *node;			/* Combining tree. */
+};
+
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	struct srcu_node *node;			/* Combining tree. */
 	struct srcu_node *level[RCU_NUM_LVLS + 1];
 						/* First node at each level. */
 	int srcu_size_state;			/* Small-to-big transition state. */
@@ -90,6 +96,7 @@ struct srcu_struct {
 	unsigned long reschedule_count;
 	struct delayed_work work;
 	struct lockdep_map dep_map;
+	struct srcu_usage *srcu_sup;		/* Update-side data. */
 };
 
 /* Values for size state variable (->srcu_size_state). */
@@ -108,24 +115,24 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
-#define __SRCU_STRUCT_INIT_COMMON(name)								\
+#define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
 	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
 	.srcu_gp_seq_needed = -1UL,								\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
+	.srcu_sup = &usage_name,								\
 	__SRCU_DEP_MAP_INIT(name)
 
-#define __SRCU_STRUCT_INIT_MODULE(name)								\
+#define __SRCU_STRUCT_INIT_MODULE(name, usage_name)						\
 {												\
-	__SRCU_STRUCT_INIT_COMMON(name)								\
+	__SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
 }
 
-#define __SRCU_STRUCT_INIT(name, pcpu_name)							\
+#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name)						\
 {												\
 	.sda = &pcpu_name,									\
-	__SRCU_STRUCT_INIT_COMMON(name)								\
+	__SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
 }
 
-
 /*
  * Define and initialize a srcu struct at build time.
  * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
@@ -147,15 +154,17 @@ struct srcu_struct {
  */
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)								\
-	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name);			\
+	static struct srcu_usage name##_srcu_usage;						\
+	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage);	\
 	extern struct srcu_struct * const __srcu_struct_##name;					\
 	struct srcu_struct * const __srcu_struct_##name						\
 		__section("___srcu_struct_ptrs") = &name
 #else
 # define __DEFINE_SRCU(name, is_static)								\
 	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);				\
+	static struct srcu_usage name##_srcu_usage;						\
 	is_static struct srcu_struct name =							\
-		__SRCU_STRUCT_INIT(name, name##_srcu_data)
+		__SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data)
 #endif
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 115616ac3bfa..8d18d4bf0e29 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -341,11 +341,13 @@ extern void rcu_init_geometry(void);
  * specified state structure (for SRCU) or the only rcu_state structure
  * (for RCU).
  */
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
 	for ((rnp) = &(sp)->node[0]; \
 	     (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
 #define rcu_for_each_node_breadth_first(rnp) \
-	srcu_for_each_node_breadth_first(&rcu_state, rnp)
+	_rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+	_rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
 
 /*
  * Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7e6e7dfb1a87..049e20dbec76 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -173,12 +173,12 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 
 	/* Initialize geometry if it has not already been initialized. */
 	rcu_init_geometry();
-	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
-	if (!ssp->node)
+	ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+	if (!ssp->srcu_sup->node)
 		return false;
 
 	/* Work out the overall tree geometry. */
-	ssp->level[0] = &ssp->node[0];
+	ssp->level[0] = &ssp->srcu_sup->node[0];
 	for (i = 1; i < rcu_num_lvls; i++)
 		ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
 	rcu_init_levelspread(levelspread, num_rcu_lvl);
@@ -195,7 +195,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 		snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
 		snp->grplo = -1;
 		snp->grphi = -1;
-		if (snp == &ssp->node[0]) {
+		if (snp == &ssp->srcu_sup->node[0]) {
 			/* Root node, special case. */
 			snp->srcu_parent = NULL;
 			continue;
@@ -236,8 +236,12 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
  */
 static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 {
+	if (!is_static)
+		ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+	if (!ssp->srcu_sup)
+		return -ENOMEM;
 	ssp->srcu_size_state = SRCU_SIZE_SMALL;
-	ssp->node = NULL;
+	ssp->srcu_sup->node = NULL;
 	mutex_init(&ssp->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
@@ -249,8 +253,11 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
-	if (!ssp->sda)
+	if (!ssp->sda) {
+		if (!is_static)
+			kfree(ssp->srcu_sup);
 		return -ENOMEM;
+	}
 	init_srcu_struct_data(ssp);
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
@@ -259,6 +266,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 			if (!ssp->sda_is_static) {
 				free_percpu(ssp->sda);
 				ssp->sda = NULL;
+				kfree(ssp->srcu_sup);
 				return -ENOMEM;
 			}
 		} else {
@@ -656,13 +664,15 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 			rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
+	kfree(ssp->srcu_sup->node);
+	ssp->srcu_sup->node = NULL;
+	ssp->srcu_size_state = SRCU_SIZE_SMALL;
 	if (!ssp->sda_is_static) {
 		free_percpu(ssp->sda);
 		ssp->sda = NULL;
+		kfree(ssp->srcu_sup);
+		ssp->srcu_sup = NULL;
 	}
-	kfree(ssp->node);
-	ssp->node = NULL;
-	ssp->srcu_size_state = SRCU_SIZE_SMALL;
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
-- 
cgit v1.2.3


From 208f41b1312443401353bec0c1939e2bfc28adce Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 14:43:08 -0700
Subject: srcu: Move ->level from srcu_struct to srcu_usage

This commit moves the ->level[] array from the srcu_struct structure to
the srcu_usage structure to reduce the size of the former in order to
improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  4 ++--
 kernel/rcu/srcutree.c    | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 276f325f1296..c7373fe5c14b 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -62,14 +62,14 @@ struct srcu_node {
  */
 struct srcu_usage {
 	struct srcu_node *node;			/* Combining tree. */
+	struct srcu_node *level[RCU_NUM_LVLS + 1];
+						/* First node at each level. */
 };
 
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	struct srcu_node *level[RCU_NUM_LVLS + 1];
-						/* First node at each level. */
 	int srcu_size_state;			/* Small-to-big transition state. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 	spinlock_t __private lock;		/* Protect counters and size state. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 049e20dbec76..acb0862faafa 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -178,9 +178,9 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 		return false;
 
 	/* Work out the overall tree geometry. */
-	ssp->level[0] = &ssp->srcu_sup->node[0];
+	ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
 	for (i = 1; i < rcu_num_lvls; i++)
-		ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+		ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
 	rcu_init_levelspread(levelspread, num_rcu_lvl);
 
 	/* Each pass through this loop initializes one srcu_node structure. */
@@ -202,10 +202,10 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 		}
 
 		/* Non-root node. */
-		if (snp == ssp->level[level + 1])
+		if (snp == ssp->srcu_sup->level[level + 1])
 			level++;
-		snp->srcu_parent = ssp->level[level - 1] +
-				   (snp - ssp->level[level]) /
+		snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+				   (snp - ssp->srcu_sup->level[level]) /
 				   levelspread[level - 1];
 	}
 
@@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 	 * leaves of the srcu_node tree.
 	 */
 	level = rcu_num_lvls - 1;
-	snp_first = ssp->level[level];
+	snp_first = ssp->srcu_sup->level[level];
 	for_each_possible_cpu(cpu) {
 		sdp = per_cpu_ptr(ssp->sda, cpu);
 		sdp->mynode = &snp_first[cpu / levelspread[level]];
@@ -889,7 +889,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 		srcu_for_each_node_breadth_first(ssp, snp) {
 			spin_lock_irq_rcu_node(snp);
 			cbs = false;
-			last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+			last_lvl = snp >= ssp->srcu_sup->level[rcu_num_lvls - 1];
 			if (last_lvl)
 				cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
 			snp->srcu_have_cbs[idx] = gpseq;
-- 
cgit v1.2.3


From a0d8cbd3821369dc9478cabd605417afb9eb24dc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 17:16:30 -0700
Subject: srcu: Move ->srcu_size_state from srcu_struct to srcu_usage

This commit moves the ->srcu_size_state field from the srcu_struct
structure to the srcu_usage structure to reduce the size of the former
in order to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  2 +-
 kernel/rcu/srcutree.c    | 37 +++++++++++++++++++------------------
 2 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index c7373fe5c14b..443d27a214ef 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -64,13 +64,13 @@ struct srcu_usage {
 	struct srcu_node *node;			/* Combining tree. */
 	struct srcu_node *level[RCU_NUM_LVLS + 1];
 						/* First node at each level. */
+	int srcu_size_state;			/* Small-to-big transition state. */
 };
 
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	int srcu_size_state;			/* Small-to-big transition state. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 	spinlock_t __private lock;		/* Protect counters and size state. */
 	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index acb0862faafa..8428a184d506 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 		}
 		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
 	}
-	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
 	return true;
 }
 
@@ -240,7 +240,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
 	if (!ssp->srcu_sup)
 		return -ENOMEM;
-	ssp->srcu_size_state = SRCU_SIZE_SMALL;
+	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
 	ssp->srcu_sup->node = NULL;
 	mutex_init(&ssp->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
@@ -261,7 +261,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	init_srcu_struct_data(ssp);
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
-	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
 			if (!ssp->sda_is_static) {
 				free_percpu(ssp->sda);
@@ -270,7 +270,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 				return -ENOMEM;
 			}
 		} else {
-			WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+			WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 		}
 	}
 	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -315,7 +315,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 static void __srcu_transition_to_big(struct srcu_struct *ssp)
 {
 	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
-	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
 }
 
 /*
@@ -326,10 +326,10 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
 	unsigned long flags;
 
 	/* Double-checked locking on ->srcu_size-state. */
-	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
 		return;
 	spin_lock_irqsave_rcu_node(ssp, flags);
-	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
+	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
 		spin_unlock_irqrestore_rcu_node(ssp, flags);
 		return;
 	}
@@ -345,7 +345,7 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
 {
 	unsigned long j;
 
-	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
 		return;
 	j = jiffies;
 	if (ssp->srcu_size_jiffies != j) {
@@ -666,7 +666,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 	}
 	kfree(ssp->srcu_sup->node);
 	ssp->srcu_sup->node = NULL;
-	ssp->srcu_size_state = SRCU_SIZE_SMALL;
+	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
 	if (!ssp->sda_is_static) {
 		free_percpu(ssp->sda);
 		ssp->sda = NULL;
@@ -770,7 +770,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	struct srcu_data *sdp;
 	int state;
 
-	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
 		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
 	else
 		sdp = this_cpu_ptr(ssp->sda);
@@ -880,7 +880,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	/* A new grace period can start at this point.  But only one. */
 
 	/* Initiate callback invocation as needed. */
-	ss_state = smp_load_acquire(&ssp->srcu_size_state);
+	ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
 	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
 		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
 					cbdelay);
@@ -940,7 +940,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 		if (ss_state == SRCU_SIZE_ALLOC)
 			init_srcu_struct_nodes(ssp, GFP_KERNEL);
 		else
-			smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+			smp_store_release(&ssp->srcu_sup->srcu_size_state, ss_state + 1);
 	}
 }
 
@@ -1002,7 +1002,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 	unsigned long snp_seq;
 
 	/* Ensure that snp node tree is fully initialized before traversing it */
-	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
 		snp_leaf = NULL;
 	else
 		snp_leaf = sdp->mynode;
@@ -1209,7 +1209,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 * sequence number cannot wrap around in the meantime.
 	 */
 	idx = __srcu_read_lock_nmisafe(ssp);
-	ss_state = smp_load_acquire(&ssp->srcu_size_state);
+	ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
 	if (ss_state < SRCU_SIZE_WAIT_CALL)
 		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
 	else
@@ -1546,7 +1546,7 @@ void srcu_barrier(struct srcu_struct *ssp)
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
 
 	idx = __srcu_read_lock_nmisafe(ssp);
-	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
 		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,	get_boot_cpu_id()));
 	else
 		for_each_possible_cpu(cpu)
@@ -1784,7 +1784,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 	int cpu;
 	int idx;
 	unsigned long s0 = 0, s1 = 0;
-	int ss_state = READ_ONCE(ssp->srcu_size_state);
+	int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
 	int ss_state_idx = ss_state;
 
 	idx = ssp->srcu_idx & 0x1;
@@ -1871,8 +1871,9 @@ void __init srcu_init(void)
 		ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
 				      work.work.entry);
 		list_del_init(&ssp->work.work.entry);
-		if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
-			ssp->srcu_size_state = SRCU_SIZE_ALLOC;
+		if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+		    ssp->srcu_sup->srcu_size_state == SRCU_SIZE_SMALL)
+			ssp->srcu_sup->srcu_size_state = SRCU_SIZE_ALLOC;
 		queue_work(rcu_gp_wq, &ssp->work.work);
 	}
 }
-- 
cgit v1.2.3


From 574dc1a7efe490dffe5c1ce0285306feec16a880 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 17:22:27 -0700
Subject: srcu: Move ->srcu_cb_mutex from srcu_struct to srcu_usage

This commit moves the ->srcu_cb_mutex field from the srcu_struct structure
to the srcu_usage structure to reduce the size of the former in order
to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 2 +-
 kernel/rcu/srcutree.c    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 443d27a214ef..231de66ceb15 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -65,13 +65,13 @@ struct srcu_usage {
 	struct srcu_node *level[RCU_NUM_LVLS + 1];
 						/* First node at each level. */
 	int srcu_size_state;			/* Small-to-big transition state. */
+	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 };
 
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 	spinlock_t __private lock;		/* Protect counters and size state. */
 	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
 	unsigned int srcu_idx;			/* Current rdr array element. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 8428a184d506..1814f3bfc219 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -242,7 +242,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		return -ENOMEM;
 	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
 	ssp->srcu_sup->node = NULL;
-	mutex_init(&ssp->srcu_cb_mutex);
+	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
 	ssp->srcu_gp_seq = 0;
@@ -861,7 +861,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	int ss_state;
 
 	/* Prevent more than one additional grace period. */
-	mutex_lock(&ssp->srcu_cb_mutex);
+	mutex_lock(&ssp->srcu_sup->srcu_cb_mutex);
 
 	/* End the current grace period. */
 	spin_lock_irq_rcu_node(ssp);
@@ -921,7 +921,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 		}
 
 	/* Callback initiation done, allow grace periods after next. */
-	mutex_unlock(&ssp->srcu_cb_mutex);
+	mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex);
 
 	/* Start a new grace period if needed. */
 	spin_lock_irq_rcu_node(ssp);
-- 
cgit v1.2.3


From b3fb11f7e9c3c64dd86403409a070c996d8ac081 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 18:29:38 -0700
Subject: srcu: Move ->lock from srcu_struct to srcu_usage

This commit moves the ->lock field from the srcu_struct structure to
the srcu_usage structure to reduce the size of the former in order to
improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 11 ++++++----
 kernel/rcu/srcutree.c    | 56 ++++++++++++++++++++++++------------------------
 2 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 231de66ceb15..694d87b81917 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -66,13 +66,13 @@ struct srcu_usage {
 						/* First node at each level. */
 	int srcu_size_state;			/* Small-to-big transition state. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
+	spinlock_t __private lock;		/* Protect counters and size state. */
 };
 
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	spinlock_t __private lock;		/* Protect counters and size state. */
 	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
@@ -116,7 +116,6 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN2	2
 
 #define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
-	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
 	.srcu_gp_seq_needed = -1UL,								\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 	.srcu_sup = &usage_name,								\
@@ -154,7 +153,9 @@ struct srcu_struct {
  */
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)								\
-	static struct srcu_usage name##_srcu_usage;						\
+	static struct srcu_usage name##_srcu_usage = {						\
+		.lock = __SPIN_LOCK_UNLOCKED(name.lock),					\
+	};											\
 	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage);	\
 	extern struct srcu_struct * const __srcu_struct_##name;					\
 	struct srcu_struct * const __srcu_struct_##name						\
@@ -162,7 +163,9 @@ struct srcu_struct {
 #else
 # define __DEFINE_SRCU(name, is_static)								\
 	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);				\
-	static struct srcu_usage name##_srcu_usage;						\
+	static struct srcu_usage name##_srcu_usage = {						\
+		.lock = __SPIN_LOCK_UNLOCKED(name.lock),					\
+	};											\
 	is_static struct srcu_struct name =							\
 		__SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data)
 #endif
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c2a024a60f1a..c42248cf18f6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -103,7 +103,7 @@ do {										\
 
 #define spin_trylock_irqsave_rcu_node(p, flags)					\
 ({										\
-	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
+	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
 										\
 	if (___locked)								\
 		smp_mb__after_unlock_lock();					\
@@ -241,7 +241,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	if (!ssp->srcu_sup)
 		return -ENOMEM;
 	if (!is_static)
-		spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
+		spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
 	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
 	ssp->srcu_sup->node = NULL;
 	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
@@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  */
 static void __srcu_transition_to_big(struct srcu_struct *ssp)
 {
-	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
 	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
 }
 
@@ -328,13 +328,13 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
 	/* Double-checked locking on ->srcu_size-state. */
 	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
 		return;
-	spin_lock_irqsave_rcu_node(ssp, flags);
+	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
 	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
-		spin_unlock_irqrestore_rcu_node(ssp, flags);
+		spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 		return;
 	}
 	__srcu_transition_to_big(ssp);
-	spin_unlock_irqrestore_rcu_node(ssp, flags);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
 
 /*
@@ -369,9 +369,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
 
 	if (spin_trylock_irqsave_rcu_node(sdp, *flags))
 		return;
-	spin_lock_irqsave_rcu_node(ssp, *flags);
+	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
 	spin_lock_irqsave_check_contention(ssp);
-	spin_unlock_irqrestore_rcu_node(ssp, *flags);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
 	spin_lock_irqsave_rcu_node(sdp, *flags);
 }
 
@@ -383,9 +383,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
  */
 static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
 {
-	if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+	if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
 		return;
-	spin_lock_irqsave_rcu_node(ssp, *flags);
+	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
 	spin_lock_irqsave_check_contention(ssp);
 }
 
@@ -404,13 +404,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
 	/* The smp_load_acquire() pairs with the smp_store_release(). */
 	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
 		return; /* Already initialized. */
-	spin_lock_irqsave_rcu_node(ssp, flags);
+	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
 	if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
-		spin_unlock_irqrestore_rcu_node(ssp, flags);
+		spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 		return;
 	}
 	init_srcu_struct_fields(ssp, true);
-	spin_unlock_irqrestore_rcu_node(ssp, flags);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
 
 /*
@@ -774,7 +774,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
 	else
 		sdp = this_cpu_ptr(ssp->sda);
-	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
 	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
@@ -864,7 +864,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	mutex_lock(&ssp->srcu_sup->srcu_cb_mutex);
 
 	/* End the current grace period. */
-	spin_lock_irq_rcu_node(ssp);
+	spin_lock_irq_rcu_node(ssp->srcu_sup);
 	idx = rcu_seq_state(ssp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
 	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
@@ -875,7 +875,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
 		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
-	spin_unlock_irq_rcu_node(ssp);
+	spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	mutex_unlock(&ssp->srcu_gp_mutex);
 	/* A new grace period can start at this point.  But only one. */
 
@@ -924,15 +924,15 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex);
 
 	/* Start a new grace period if needed. */
-	spin_lock_irq_rcu_node(ssp);
+	spin_lock_irq_rcu_node(ssp->srcu_sup);
 	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
 	if (!rcu_seq_state(gpseq) &&
 	    ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
 		srcu_gp_start(ssp);
-		spin_unlock_irq_rcu_node(ssp);
+		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 		srcu_reschedule(ssp, 0);
 	} else {
-		spin_unlock_irq_rcu_node(ssp);
+		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	}
 
 	/* Transition to big if needed. */
@@ -975,7 +975,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 	spin_lock_irqsave_ssp_contention(ssp, &flags);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
 		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
-	spin_unlock_irqrestore_rcu_node(ssp, flags);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
 
 /*
@@ -1064,7 +1064,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 		else if (list_empty(&ssp->work.work.entry))
 			list_add(&ssp->work.work.entry, &srcu_boot_list);
 	}
-	spin_unlock_irqrestore_rcu_node(ssp, flags);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
 
 /*
@@ -1599,17 +1599,17 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 	 */
 	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
 	if (idx == SRCU_STATE_IDLE) {
-		spin_lock_irq_rcu_node(ssp);
+		spin_lock_irq_rcu_node(ssp->srcu_sup);
 		if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
 			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
-			spin_unlock_irq_rcu_node(ssp);
+			spin_unlock_irq_rcu_node(ssp->srcu_sup);
 			mutex_unlock(&ssp->srcu_gp_mutex);
 			return;
 		}
 		idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
 		if (idx == SRCU_STATE_IDLE)
 			srcu_gp_start(ssp);
-		spin_unlock_irq_rcu_node(ssp);
+		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 		if (idx != SRCU_STATE_IDLE) {
 			mutex_unlock(&ssp->srcu_gp_mutex);
 			return; /* Someone else started the grace period. */
@@ -1623,10 +1623,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 			return; /* readers present, retry later. */
 		}
 		srcu_flip(ssp);
-		spin_lock_irq_rcu_node(ssp);
+		spin_lock_irq_rcu_node(ssp->srcu_sup);
 		rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
 		ssp->srcu_n_exp_nodelay = 0;
-		spin_unlock_irq_rcu_node(ssp);
+		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	}
 
 	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
@@ -1710,7 +1710,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
 {
 	bool pushgp = true;
 
-	spin_lock_irq_rcu_node(ssp);
+	spin_lock_irq_rcu_node(ssp->srcu_sup);
 	if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
 		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
 			/* All requests fulfilled, time to go idle. */
@@ -1720,7 +1720,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
 		/* Outstanding request and no GP.  Start one. */
 		srcu_gp_start(ssp);
 	}
-	spin_unlock_irq_rcu_node(ssp);
+	spin_unlock_irq_rcu_node(ssp->srcu_sup);
 
 	if (pushgp)
 		queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
-- 
cgit v1.2.3


From e3a6ab25cfa0fcdcb31c346b9871a566d440980d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 19:13:16 -0700
Subject: srcu: Move ->srcu_gp_mutex from srcu_struct to srcu_usage

This commit moves the ->srcu_gp_mutex field from the srcu_struct structure
to the srcu_usage structure to reduce the size of the former in order
to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  2 +-
 kernel/rcu/srcutree.c    | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 694d87b81917..d04e3da6181c 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -67,13 +67,13 @@ struct srcu_usage {
 	int srcu_size_state;			/* Small-to-big transition state. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 	spinlock_t __private lock;		/* Protect counters and size state. */
+	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
 };
 
 /*
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
 	unsigned long srcu_gp_seq_needed;	/* Latest gp_seq needed. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c42248cf18f6..a36066798de7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -245,7 +245,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
 	ssp->srcu_sup->node = NULL;
 	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
-	mutex_init(&ssp->srcu_gp_mutex);
+	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
 	ssp->srcu_gp_seq = 0;
 	ssp->srcu_barrier_seq = 0;
@@ -876,7 +876,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
 		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
 	spin_unlock_irq_rcu_node(ssp->srcu_sup);
-	mutex_unlock(&ssp->srcu_gp_mutex);
+	mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 	/* A new grace period can start at this point.  But only one. */
 
 	/* Initiate callback invocation as needed. */
@@ -1585,7 +1585,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 {
 	int idx;
 
-	mutex_lock(&ssp->srcu_gp_mutex);
+	mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
 
 	/*
 	 * Because readers might be delayed for an extended period after
@@ -1603,7 +1603,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
 			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
 			spin_unlock_irq_rcu_node(ssp->srcu_sup);
-			mutex_unlock(&ssp->srcu_gp_mutex);
+			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return;
 		}
 		idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
@@ -1611,7 +1611,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 			srcu_gp_start(ssp);
 		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 		if (idx != SRCU_STATE_IDLE) {
-			mutex_unlock(&ssp->srcu_gp_mutex);
+			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return; /* Someone else started the grace period. */
 		}
 	}
@@ -1619,7 +1619,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
 		idx = 1 ^ (ssp->srcu_idx & 1);
 		if (!try_check_zero(ssp, idx, 1)) {
-			mutex_unlock(&ssp->srcu_gp_mutex);
+			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
 		srcu_flip(ssp);
@@ -1637,7 +1637,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		 */
 		idx = 1 ^ (ssp->srcu_idx & 1);
 		if (!try_check_zero(ssp, idx, 2)) {
-			mutex_unlock(&ssp->srcu_gp_mutex);
+			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
 		ssp->srcu_n_exp_nodelay = 0;
-- 
cgit v1.2.3


From 03200b5ca3b4d4edf634dc052bf3b8eb8dc8bbbc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 19:30:50 -0700
Subject: srcu: Move grace-period fields from srcu_struct to srcu_usage

This commit moves the ->srcu_gp_seq, ->srcu_gp_seq_needed,
->srcu_gp_seq_needed_exp, ->srcu_gp_start, and ->srcu_last_gp_end fields
from the srcu_struct structure to the srcu_usage structure to reduce
the size of the former in order to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  25 ++++-----
 kernel/rcu/srcutree.c    | 128 +++++++++++++++++++++++------------------------
 2 files changed, 77 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index d04e3da6181c..372e35b0e8b6 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -68,6 +68,11 @@ struct srcu_usage {
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
 	spinlock_t __private lock;		/* Protect counters and size state. */
 	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
+	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
+	unsigned long srcu_gp_seq_needed;	/* Latest gp_seq needed. */
+	unsigned long srcu_gp_seq_needed_exp;	/* Furthest future exp GP. */
+	unsigned long srcu_gp_start;		/* Last GP start timestamp (jiffies) */
+	unsigned long srcu_last_gp_end;		/* Last GP end timestamp (ns) */
 };
 
 /*
@@ -75,11 +80,6 @@ struct srcu_usage {
  */
 struct srcu_struct {
 	unsigned int srcu_idx;			/* Current rdr array element. */
-	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
-	unsigned long srcu_gp_seq_needed;	/* Latest gp_seq needed. */
-	unsigned long srcu_gp_seq_needed_exp;	/* Furthest future exp GP. */
-	unsigned long srcu_gp_start;		/* Last GP start timestamp (jiffies) */
-	unsigned long srcu_last_gp_end;		/* Last GP end timestamp (ns) */
 	unsigned long srcu_size_jiffies;	/* Current contention-measurement interval. */
 	unsigned long srcu_n_lock_retries;	/* Contention events in current interval. */
 	unsigned long srcu_n_exp_nodelay;	/* # expedited no-delays in current GP phase. */
@@ -115,8 +115,13 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
-#define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
+#define __SRCU_USAGE_INIT(name)									\
+{												\
+	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
 	.srcu_gp_seq_needed = -1UL,								\
+}
+
+#define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 	.srcu_sup = &usage_name,								\
 	__SRCU_DEP_MAP_INIT(name)
@@ -153,9 +158,7 @@ struct srcu_struct {
  */
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)								\
-	static struct srcu_usage name##_srcu_usage = {						\
-		.lock = __SPIN_LOCK_UNLOCKED(name.lock),					\
-	};											\
+	static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);	\
 	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage);	\
 	extern struct srcu_struct * const __srcu_struct_##name;					\
 	struct srcu_struct * const __srcu_struct_##name						\
@@ -163,9 +166,7 @@ struct srcu_struct {
 #else
 # define __DEFINE_SRCU(name, is_static)								\
 	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);				\
-	static struct srcu_usage name##_srcu_usage = {						\
-		.lock = __SPIN_LOCK_UNLOCKED(name.lock),					\
-	};											\
+	static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);	\
 	is_static struct srcu_struct name =							\
 		__SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data)
 #endif
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a36066798de7..340eb685cf64 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
 		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
 		rcu_segcblist_init(&sdp->srcu_cblist);
 		sdp->srcu_cblist_invoking = false;
-		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
-		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
 		sdp->mynode = NULL;
 		sdp->cpu = cpu;
 		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -247,7 +247,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
-	ssp->srcu_gp_seq = 0;
+	ssp->srcu_sup->srcu_gp_seq = 0;
 	ssp->srcu_barrier_seq = 0;
 	mutex_init(&ssp->srcu_barrier_mutex);
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
@@ -261,8 +261,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		return -ENOMEM;
 	}
 	init_srcu_struct_data(ssp);
-	ssp->srcu_gp_seq_needed_exp = 0;
-	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+	ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
 			if (!ssp->sda_is_static) {
@@ -275,7 +275,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 			WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 		}
 	}
-	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
 	return 0;
 }
 
@@ -402,10 +402,10 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
 	unsigned long flags;
 
 	/* The smp_load_acquire() pairs with the smp_store_release(). */
-	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
 		return; /* Already initialized. */
 	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
-	if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
+	if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
 		spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 		return;
 	}
@@ -616,11 +616,11 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 	unsigned long j;
 	unsigned long jbase = SRCU_INTERVAL;
 
-	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp)))
 		jbase = 0;
-	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq))) {
 		j = jiffies - 1;
-		gpstart = READ_ONCE(ssp->srcu_gp_start);
+		gpstart = READ_ONCE(ssp->srcu_sup->srcu_gp_start);
 		if (time_after(j, gpstart))
 			jbase += j - gpstart;
 		if (!jbase) {
@@ -656,12 +656,12 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 		if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
 			return; /* Forgot srcu_barrier(), so just leak it! */
 	}
-	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
-	    WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
+	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+	    WARN_ON(rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq) != ssp->srcu_sup->srcu_gp_seq_needed) ||
 	    WARN_ON(srcu_readers_active(ssp))) {
 		pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
-			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
-			rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
+			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)),
+			rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ssp->srcu_sup->srcu_gp_seq_needed);
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
 	kfree(ssp->srcu_sup->node);
@@ -775,18 +775,18 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	else
 		sdp = this_cpu_ptr(ssp->sda);
 	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
-	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_gp_seq));
+			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_gp_seq));
+				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
-	WRITE_ONCE(ssp->srcu_gp_start, jiffies);
+	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
 	WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
-	rcu_seq_start(&ssp->srcu_gp_seq);
-	state = rcu_seq_state(ssp->srcu_gp_seq);
+	rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+	state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
 	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
 }
 
@@ -865,16 +865,16 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 
 	/* End the current grace period. */
 	spin_lock_irq_rcu_node(ssp->srcu_sup);
-	idx = rcu_seq_state(ssp->srcu_gp_seq);
+	idx = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
-	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp)))
 		cbdelay = 0;
 
-	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
-	rcu_seq_end(&ssp->srcu_gp_seq);
-	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
-	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
-		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
+	WRITE_ONCE(ssp->srcu_sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+	rcu_seq_end(&ssp->srcu_sup->srcu_gp_seq);
+	gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
+	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq))
+		WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq);
 	spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 	/* A new grace period can start at this point.  But only one. */
@@ -925,9 +925,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 
 	/* Start a new grace period if needed. */
 	spin_lock_irq_rcu_node(ssp->srcu_sup);
-	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+	gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
 	if (!rcu_seq_state(gpseq) &&
-	    ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+	    ULONG_CMP_LT(gpseq, ssp->srcu_sup->srcu_gp_seq_needed)) {
 		srcu_gp_start(ssp);
 		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 		srcu_reschedule(ssp, 0);
@@ -960,7 +960,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 	if (snp)
 		for (; snp != NULL; snp = snp->srcu_parent) {
 			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
-			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
+			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
 			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
 				return;
 			spin_lock_irqsave_rcu_node(snp, flags);
@@ -973,8 +973,8 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 			spin_unlock_irqrestore_rcu_node(snp, flags);
 		}
 	spin_lock_irqsave_ssp_contention(ssp, &flags);
-	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
-		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+		WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
 	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
 
@@ -1010,7 +1010,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 	if (snp_leaf)
 		/* Each pass through the loop does one level of the srcu_node tree. */
 		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
-			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
+			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) && snp != snp_leaf)
 				return; /* GP already done and CBs recorded. */
 			spin_lock_irqsave_rcu_node(snp, flags);
 			snp_seq = snp->srcu_have_cbs[idx];
@@ -1037,20 +1037,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 
 	/* Top of tree, must ensure the grace period will be started. */
 	spin_lock_irqsave_ssp_contention(ssp, &flags);
-	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed, s)) {
 		/*
 		 * Record need for grace period s.  Pair with load
 		 * acquire setting up for initialization.
 		 */
-		smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+		smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, s); /*^^^*/
 	}
-	if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
-		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+	if (!do_norm && ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+		WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
 
 	/* If grace period not already in progress, start it. */
-	if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
-	    rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
-		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+	if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) &&
+	    rcu_seq_state(ssp->srcu_sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
 		srcu_gp_start(ssp);
 
 		// And how can that list_add() in the "else" clause
@@ -1164,18 +1164,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
 
 	/* First, see if enough time has passed since the last GP. */
 	t = ktime_get_mono_fast_ns();
-	tlast = READ_ONCE(ssp->srcu_last_gp_end);
+	tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
 	if (exp_holdoff == 0 ||
 	    time_in_range_open(t, tlast, tlast + exp_holdoff))
 		return false; /* Too soon after last GP. */
 
 	/* Next, check for probable idleness. */
-	curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+	curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
 	smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
-	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
 		return false; /* Grace period in progress, so not idle. */
 	smp_mb(); /* Order ->srcu_gp_seq with prior access. */
-	if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+	if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
 		return false; /* GP # changed, so not idle. */
 	return true; /* With reasonable probability, idle! */
 }
@@ -1218,8 +1218,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_gp_seq));
-	s = rcu_seq_snap(&ssp->srcu_gp_seq);
+			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
@@ -1430,7 +1430,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
 	// Any prior manipulation of SRCU-protected data must happen
 	// before the load from ->srcu_gp_seq.
 	smp_mb();
-	return rcu_seq_snap(&ssp->srcu_gp_seq);
+	return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
 
@@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
  */
 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
 {
-	if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+	if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
 		return false;
 	// Ensure that the end of the SRCU grace period happens before
 	// any subsequent code that the caller might execute.
@@ -1597,16 +1597,16 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 	 * The load-acquire ensures that we see the accesses performed
 	 * by the prior grace period.
 	 */
-	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
 	if (idx == SRCU_STATE_IDLE) {
 		spin_lock_irq_rcu_node(ssp->srcu_sup);
-		if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
-			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
+		if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
 			spin_unlock_irq_rcu_node(ssp->srcu_sup);
 			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return;
 		}
-		idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+		idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
 		if (idx == SRCU_STATE_IDLE)
 			srcu_gp_start(ssp);
 		spin_unlock_irq_rcu_node(ssp->srcu_sup);
@@ -1616,7 +1616,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		}
 	}
 
-	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
 		idx = 1 ^ (ssp->srcu_idx & 1);
 		if (!try_check_zero(ssp, idx, 1)) {
 			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
@@ -1624,12 +1624,12 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		}
 		srcu_flip(ssp);
 		spin_lock_irq_rcu_node(ssp->srcu_sup);
-		rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+		rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
 		ssp->srcu_n_exp_nodelay = 0;
 		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	}
 
-	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
 
 		/*
 		 * SRCU read-side critical sections are normally short,
@@ -1666,7 +1666,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	rcu_cblist_init(&ready_cbs);
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_gp_seq));
+			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	if (sdp->srcu_cblist_invoking ||
 	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
 		spin_unlock_irq_rcu_node(sdp);
@@ -1694,7 +1694,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_gp_seq));
+				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
 	spin_unlock_irq_rcu_node(sdp);
@@ -1711,12 +1711,12 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
 	bool pushgp = true;
 
 	spin_lock_irq_rcu_node(ssp->srcu_sup);
-	if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
-		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+	if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
 			/* All requests fulfilled, time to go idle. */
 			pushgp = false;
 		}
-	} else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+	} else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
 		/* Outstanding request and no GP.  Start one. */
 		srcu_gp_start(ssp);
 	}
@@ -1762,7 +1762,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 	if (test_type != SRCU_FLAVOR)
 		return;
 	*flags = 0;
-	*gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+	*gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
 }
 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
 
@@ -1791,7 +1791,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 	if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
 		ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
 	pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
-		 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+		 tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
 		 srcu_size_state_name[ss_state_idx]);
 	if (!ssp->sda) {
 		// Called after cleanup_srcu_struct(), perhaps.
@@ -1905,7 +1905,7 @@ static void srcu_module_going(struct module *mod)
 
 	for (i = 0; i < mod->num_srcu_structs; i++) {
 		ssp = *(sspp++);
-		if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed)) &&
+		if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
 		    !WARN_ON_ONCE(!ssp->sda_is_static))
 			cleanup_srcu_struct(ssp);
 		free_percpu(ssp->sda);
-- 
cgit v1.2.3


From 3b46679c623c2766f4c56fd3f9ce8edbb38c5d20 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 20:01:02 -0700
Subject: srcu: Move heuristics fields from srcu_struct to srcu_usage

This commit moves the ->srcu_size_jiffies, ->srcu_n_lock_retries,
and ->srcu_n_exp_nodelay fields from the srcu_struct structure to the
srcu_usage structure to reduce the size of the former in order to improve
cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  6 +++---
 kernel/rcu/srcutree.c    | 18 +++++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 372e35b0e8b6..3023492d8d89 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -73,6 +73,9 @@ struct srcu_usage {
 	unsigned long srcu_gp_seq_needed_exp;	/* Furthest future exp GP. */
 	unsigned long srcu_gp_start;		/* Last GP start timestamp (jiffies) */
 	unsigned long srcu_last_gp_end;		/* Last GP end timestamp (ns) */
+	unsigned long srcu_size_jiffies;	/* Current contention-measurement interval. */
+	unsigned long srcu_n_lock_retries;	/* Contention events in current interval. */
+	unsigned long srcu_n_exp_nodelay;	/* # expedited no-delays in current GP phase. */
 };
 
 /*
@@ -80,9 +83,6 @@ struct srcu_usage {
  */
 struct srcu_struct {
 	unsigned int srcu_idx;			/* Current rdr array element. */
-	unsigned long srcu_size_jiffies;	/* Current contention-measurement interval. */
-	unsigned long srcu_n_lock_retries;	/* Contention events in current interval. */
-	unsigned long srcu_n_exp_nodelay;	/* # expedited no-delays in current GP phase. */
 	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
 	bool sda_is_static;			/* May ->sda be passed to free_percpu()? */
 	unsigned long srcu_barrier_seq;		/* srcu_barrier seq #. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 340eb685cf64..291fb520bce0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -348,11 +348,11 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
 	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
 		return;
 	j = jiffies;
-	if (ssp->srcu_size_jiffies != j) {
-		ssp->srcu_size_jiffies = j;
-		ssp->srcu_n_lock_retries = 0;
+	if (ssp->srcu_sup->srcu_size_jiffies != j) {
+		ssp->srcu_sup->srcu_size_jiffies = j;
+		ssp->srcu_sup->srcu_n_lock_retries = 0;
 	}
-	if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+	if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
 		return;
 	__srcu_transition_to_big(ssp);
 }
@@ -624,8 +624,8 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 		if (time_after(j, gpstart))
 			jbase += j - gpstart;
 		if (!jbase) {
-			WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
-			if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+			WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) + 1);
+			if (READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
 				jbase = 1;
 		}
 	}
@@ -783,7 +783,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
-	WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
+	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
 	rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
 	state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
@@ -1625,7 +1625,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		srcu_flip(ssp);
 		spin_lock_irq_rcu_node(ssp->srcu_sup);
 		rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
-		ssp->srcu_n_exp_nodelay = 0;
+		ssp->srcu_sup->srcu_n_exp_nodelay = 0;
 		spin_unlock_irq_rcu_node(ssp->srcu_sup);
 	}
 
@@ -1640,7 +1640,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
-		ssp->srcu_n_exp_nodelay = 0;
+		ssp->srcu_sup->srcu_n_exp_nodelay = 0;
 		srcu_gp_end(ssp);  /* Releases ->srcu_gp_mutex. */
 	}
 }
-- 
cgit v1.2.3


From 660349ac79cb22bb64c44b026d879069783e97d5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 20:22:58 -0700
Subject: srcu: Move ->sda_is_static from srcu_struct to srcu_usage

This commit moves the ->sda_is_static field from the srcu_struct structure
to the srcu_usage structure to reduce the size of the former in order
to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 2 +-
 kernel/rcu/srcutree.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 3023492d8d89..d3534ecb806e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -76,6 +76,7 @@ struct srcu_usage {
 	unsigned long srcu_size_jiffies;	/* Current contention-measurement interval. */
 	unsigned long srcu_n_lock_retries;	/* Contention events in current interval. */
 	unsigned long srcu_n_exp_nodelay;	/* # expedited no-delays in current GP phase. */
+	bool sda_is_static;			/* May ->sda be passed to free_percpu()? */
 };
 
 /*
@@ -84,7 +85,6 @@ struct srcu_usage {
 struct srcu_struct {
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
-	bool sda_is_static;			/* May ->sda be passed to free_percpu()? */
 	unsigned long srcu_barrier_seq;		/* srcu_barrier seq #. */
 	struct mutex srcu_barrier_mutex;	/* Serialize barrier ops. */
 	struct completion srcu_barrier_completion;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 291fb520bce0..20f2373f7e25 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -252,7 +252,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	mutex_init(&ssp->srcu_barrier_mutex);
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
 	INIT_DELAYED_WORK(&ssp->work, process_srcu);
-	ssp->sda_is_static = is_static;
+	ssp->srcu_sup->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
 	if (!ssp->sda) {
@@ -265,7 +265,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
-			if (!ssp->sda_is_static) {
+			if (!ssp->srcu_sup->sda_is_static) {
 				free_percpu(ssp->sda);
 				ssp->sda = NULL;
 				kfree(ssp->srcu_sup);
@@ -667,7 +667,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 	kfree(ssp->srcu_sup->node);
 	ssp->srcu_sup->node = NULL;
 	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
-	if (!ssp->sda_is_static) {
+	if (!ssp->srcu_sup->sda_is_static) {
 		free_percpu(ssp->sda);
 		ssp->sda = NULL;
 		kfree(ssp->srcu_sup);
@@ -1906,7 +1906,7 @@ static void srcu_module_going(struct module *mod)
 	for (i = 0; i < mod->num_srcu_structs; i++) {
 		ssp = *(sspp++);
 		if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
-		    !WARN_ON_ONCE(!ssp->sda_is_static))
+		    !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
 			cleanup_srcu_struct(ssp);
 		free_percpu(ssp->sda);
 	}
-- 
cgit v1.2.3


From d20162e0bfc222183a7c94cd00e74b6bbf1a605b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 21:08:18 -0700
Subject: srcu: Move srcu_barrier() fields from srcu_struct to srcu_usage

This commit moves the ->srcu_barrier_seq, ->srcu_barrier_mutex,
->srcu_barrier_completion, and ->srcu_barrier_cpu_cnt fields from the
srcu_struct structure to the srcu_usage structure to reduce the size of
the former in order to improve cache locality.

Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 14 +++++++-------
 kernel/rcu/srcutree.c    | 38 +++++++++++++++++++-------------------
 2 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index d3534ecb806e..d544ec1c0c8e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -77,6 +77,13 @@ struct srcu_usage {
 	unsigned long srcu_n_lock_retries;	/* Contention events in current interval. */
 	unsigned long srcu_n_exp_nodelay;	/* # expedited no-delays in current GP phase. */
 	bool sda_is_static;			/* May ->sda be passed to free_percpu()? */
+	unsigned long srcu_barrier_seq;		/* srcu_barrier seq #. */
+	struct mutex srcu_barrier_mutex;	/* Serialize barrier ops. */
+	struct completion srcu_barrier_completion;
+						/* Awaken barrier rq at end. */
+	atomic_t srcu_barrier_cpu_cnt;		/* # CPUs not yet posting a */
+						/*  callback for the barrier */
+						/*  operation. */
 };
 
 /*
@@ -85,13 +92,6 @@ struct srcu_usage {
 struct srcu_struct {
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
-	unsigned long srcu_barrier_seq;		/* srcu_barrier seq #. */
-	struct mutex srcu_barrier_mutex;	/* Serialize barrier ops. */
-	struct completion srcu_barrier_completion;
-						/* Awaken barrier rq at end. */
-	atomic_t srcu_barrier_cpu_cnt;		/* # CPUs not yet posting a */
-						/*  callback for the barrier */
-						/*  operation. */
 	unsigned long reschedule_jiffies;
 	unsigned long reschedule_count;
 	struct delayed_work work;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 20f2373f7e25..97d1fe9a160c 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -248,9 +248,9 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
 	ssp->srcu_sup->srcu_gp_seq = 0;
-	ssp->srcu_barrier_seq = 0;
-	mutex_init(&ssp->srcu_barrier_mutex);
-	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
+	ssp->srcu_sup->srcu_barrier_seq = 0;
+	mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
 	INIT_DELAYED_WORK(&ssp->work, process_srcu);
 	ssp->srcu_sup->sda_is_static = is_static;
 	if (!is_static)
@@ -1496,8 +1496,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 
 	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
 	ssp = sdp->ssp;
-	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
-		complete(&ssp->srcu_barrier_completion);
+	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+		complete(&ssp->srcu_sup->srcu_barrier_completion);
 }
 
 /*
@@ -1511,13 +1511,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
 {
 	spin_lock_irq_rcu_node(sdp);
-	atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+	atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
 	sdp->srcu_barrier_head.func = srcu_barrier_cb;
 	debug_rcu_head_queue(&sdp->srcu_barrier_head);
 	if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
 				   &sdp->srcu_barrier_head)) {
 		debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
-		atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+		atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
 	}
 	spin_unlock_irq_rcu_node(sdp);
 }
@@ -1530,20 +1530,20 @@ void srcu_barrier(struct srcu_struct *ssp)
 {
 	int cpu;
 	int idx;
-	unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+	unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
 
 	check_init_srcu_struct(ssp);
-	mutex_lock(&ssp->srcu_barrier_mutex);
-	if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+	mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+	if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
 		smp_mb(); /* Force ordering following return. */
-		mutex_unlock(&ssp->srcu_barrier_mutex);
+		mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
 		return; /* Someone else did our work for us. */
 	}
-	rcu_seq_start(&ssp->srcu_barrier_seq);
-	init_completion(&ssp->srcu_barrier_completion);
+	rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+	init_completion(&ssp->srcu_sup->srcu_barrier_completion);
 
 	/* Initial count prevents reaching zero until all CBs are posted. */
-	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
 
 	idx = __srcu_read_lock_nmisafe(ssp);
 	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
@@ -1554,12 +1554,12 @@ void srcu_barrier(struct srcu_struct *ssp)
 	__srcu_read_unlock_nmisafe(ssp, idx);
 
 	/* Remove the initial count, at which point reaching zero can happen. */
-	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
-		complete(&ssp->srcu_barrier_completion);
-	wait_for_completion(&ssp->srcu_barrier_completion);
+	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+		complete(&ssp->srcu_sup->srcu_barrier_completion);
+	wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
 
-	rcu_seq_end(&ssp->srcu_barrier_seq);
-	mutex_unlock(&ssp->srcu_barrier_mutex);
+	rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+	mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
 }
 EXPORT_SYMBOL_GPL(srcu_barrier);
 
-- 
cgit v1.2.3


From fd1b3f8e097b7fbbab8ac4a802b24fc23c703dcf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Mar 2023 21:30:32 -0700
Subject: srcu: Move work-scheduling fields from srcu_struct to srcu_usage

This commit moves the ->reschedule_jiffies, ->reschedule_count, and
->work fields from the srcu_struct structure to the srcu_usage structure
to reduce the size of the former in order to improve cache locality.

However, this means that the container_of() calls cannot get a pointer
to the srcu_struct because they are no longer in the srcu_struct.
This issue is addressed by adding a ->srcu_ssp field in the srcu_usage
structure that references the corresponding srcu_struct structure.
And given the presence of the sup pointer to the srcu_usage structure,
replace some ssp->srcu_usage-> instances with sup->.

[ paulmck Apply feedback from kernel test robot. ]

Link: https://lore.kernel.org/oe-kbuild-all/202303191400.iO5BOqka-lkp@intel.com/
Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h |  9 +++++----
 kernel/rcu/srcutree.c    | 41 ++++++++++++++++++++++-------------------
 2 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index d544ec1c0c8e..cd0cdd8142c5 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -84,6 +84,10 @@ struct srcu_usage {
 	atomic_t srcu_barrier_cpu_cnt;		/* # CPUs not yet posting a */
 						/*  callback for the barrier */
 						/*  operation. */
+	unsigned long reschedule_jiffies;
+	unsigned long reschedule_count;
+	struct delayed_work work;
+	struct srcu_struct *srcu_ssp;
 };
 
 /*
@@ -92,9 +96,6 @@ struct srcu_usage {
 struct srcu_struct {
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
-	unsigned long reschedule_jiffies;
-	unsigned long reschedule_count;
-	struct delayed_work work;
 	struct lockdep_map dep_map;
 	struct srcu_usage *srcu_sup;		/* Update-side data. */
 };
@@ -119,10 +120,10 @@ struct srcu_struct {
 {												\
 	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
 	.srcu_gp_seq_needed = -1UL,								\
+	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 }
 
 #define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
-	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 	.srcu_sup = &usage_name,								\
 	__SRCU_DEP_MAP_INIT(name)
 
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 97d1fe9a160c..169a6513b739 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -251,7 +251,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->srcu_barrier_seq = 0;
 	mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
 	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
-	INIT_DELAYED_WORK(&ssp->work, process_srcu);
+	INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
 	ssp->srcu_sup->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
@@ -275,6 +275,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 			WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 		}
 	}
+	ssp->srcu_sup->srcu_ssp = ssp;
 	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
 	return 0;
 }
@@ -647,7 +648,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 		return; /* Just leak it! */
 	if (WARN_ON(srcu_readers_active(ssp)))
 		return; /* Just leak it! */
-	flush_delayed_work(&ssp->work);
+	flush_delayed_work(&ssp->srcu_sup->work);
 	for_each_possible_cpu(cpu) {
 		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
 
@@ -1059,10 +1060,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 		// can only be executed during early boot when there is only
 		// the one boot CPU running with interrupts still disabled.
 		if (likely(srcu_init_done))
-			queue_delayed_work(rcu_gp_wq, &ssp->work,
+			queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work,
 					   !!srcu_get_delay(ssp));
-		else if (list_empty(&ssp->work.work.entry))
-			list_add(&ssp->work.work.entry, &srcu_boot_list);
+		else if (list_empty(&ssp->srcu_sup->work.work.entry))
+			list_add(&ssp->srcu_sup->work.work.entry, &srcu_boot_list);
 	}
 	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }
@@ -1723,7 +1724,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
 	spin_unlock_irq_rcu_node(ssp->srcu_sup);
 
 	if (pushgp)
-		queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+		queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
 }
 
 /*
@@ -1734,22 +1735,24 @@ static void process_srcu(struct work_struct *work)
 	unsigned long curdelay;
 	unsigned long j;
 	struct srcu_struct *ssp;
+	struct srcu_usage *sup;
 
-	ssp = container_of(work, struct srcu_struct, work.work);
+	sup = container_of(work, struct srcu_usage, work.work);
+	ssp = sup->srcu_ssp;
 
 	srcu_advance_state(ssp);
 	curdelay = srcu_get_delay(ssp);
 	if (curdelay) {
-		WRITE_ONCE(ssp->reschedule_count, 0);
+		WRITE_ONCE(sup->reschedule_count, 0);
 	} else {
 		j = jiffies;
-		if (READ_ONCE(ssp->reschedule_jiffies) == j) {
-			WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
-			if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
+		if (READ_ONCE(sup->reschedule_jiffies) == j) {
+			WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+			if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
 				curdelay = 1;
 		} else {
-			WRITE_ONCE(ssp->reschedule_count, 1);
-			WRITE_ONCE(ssp->reschedule_jiffies, j);
+			WRITE_ONCE(sup->reschedule_count, 1);
+			WRITE_ONCE(sup->reschedule_jiffies, j);
 		}
 	}
 	srcu_reschedule(ssp, curdelay);
@@ -1848,7 +1851,7 @@ early_initcall(srcu_bootup_announce);
 
 void __init srcu_init(void)
 {
-	struct srcu_struct *ssp;
+	struct srcu_usage *sup;
 
 	/* Decide on srcu_struct-size strategy. */
 	if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
@@ -1868,13 +1871,13 @@ void __init srcu_init(void)
 	 */
 	srcu_init_done = true;
 	while (!list_empty(&srcu_boot_list)) {
-		ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+		sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
 				      work.work.entry);
-		list_del_init(&ssp->work.work.entry);
+		list_del_init(&sup->work.work.entry);
 		if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
-		    ssp->srcu_sup->srcu_size_state == SRCU_SIZE_SMALL)
-			ssp->srcu_sup->srcu_size_state = SRCU_SIZE_ALLOC;
-		queue_work(rcu_gp_wq, &ssp->work.work);
+		    sup->srcu_size_state == SRCU_SIZE_SMALL)
+			sup->srcu_size_state = SRCU_SIZE_ALLOC;
+		queue_work(rcu_gp_wq, &sup->work.work);
 	}
 }
 
-- 
cgit v1.2.3


From 144d204df78e40e6250201e71ef7d0e42d2a13fc Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 30 Mar 2023 19:24:29 +0300
Subject: PCI: Introduce pci_resource_n()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce pci_resource_n() and replace open-coded implementations of it
in pci.h.

Link: https://lore.kernel.org/r/20230330162434.35055-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 include/linux/pci.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index fafd8020c6d7..0239f12d0ba9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1994,14 +1994,13 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma);
  * These helpers provide future and backwards compatibility
  * for accessing popular PCI BAR info
  */
-#define pci_resource_start(dev, bar)	((dev)->resource[(bar)].start)
-#define pci_resource_end(dev, bar)	((dev)->resource[(bar)].end)
-#define pci_resource_flags(dev, bar)	((dev)->resource[(bar)].flags)
-#define pci_resource_len(dev,bar) \
-	((pci_resource_end((dev), (bar)) == 0) ? 0 :	\
-							\
-	 (pci_resource_end((dev), (bar)) -		\
-	  pci_resource_start((dev), (bar)) + 1))
+#define pci_resource_n(dev, bar)	(&(dev)->resource[(bar)])
+#define pci_resource_start(dev, bar)	(pci_resource_n(dev, bar)->start)
+#define pci_resource_end(dev, bar)	(pci_resource_n(dev, bar)->end)
+#define pci_resource_flags(dev, bar)	(pci_resource_n(dev, bar)->flags)
+#define pci_resource_len(dev,bar)					\
+	(pci_resource_end((dev), (bar)) ? 				\
+	 resource_size(pci_resource_n((dev), (bar))) : 0)
 
 /*
  * Similar to the helpers above, these manipulate per-pci_dev
-- 
cgit v1.2.3


From 09cc900632400079619e9154604fd299c2cc9a5a Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 30 Mar 2023 19:24:30 +0300
Subject: PCI: Introduce pci_dev_for_each_resource()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of open-coding it everywhere introduce a tiny helper that can be
used to iterate over each resource of a PCI device, and convert the most
obvious users into it.

While at it drop doubled empty line before pdev_sort_resources().

No functional changes intended.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230330162434.35055-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 .clang-format                             |  1 +
 arch/alpha/kernel/pci.c                   |  5 ++---
 arch/arm/kernel/bios32.c                  | 16 +++++++---------
 arch/arm/mach-dove/pcie.c                 | 10 +++++-----
 arch/arm/mach-mv78xx0/pcie.c              | 10 +++++-----
 arch/arm/mach-orion5x/pci.c               | 10 +++++-----
 arch/mips/pci/ops-bcm63xx.c               |  8 ++++----
 arch/mips/pci/pci-legacy.c                |  3 +--
 arch/powerpc/kernel/pci-common.c          | 21 +++++++++++----------
 arch/powerpc/platforms/4xx/pci.c          |  8 ++++----
 arch/powerpc/platforms/52xx/mpc52xx_pci.c |  5 ++---
 arch/powerpc/platforms/pseries/pci.c      | 16 ++++++++--------
 arch/sh/drivers/pci/pcie-sh7786.c         | 10 +++++-----
 arch/sparc/kernel/leon_pci.c              |  5 ++---
 arch/sparc/kernel/pci.c                   | 10 ++++------
 arch/sparc/kernel/pcic.c                  |  5 ++---
 drivers/pci/remove.c                      |  5 ++---
 drivers/pci/setup-bus.c                   | 27 ++++++++++-----------------
 drivers/pci/setup-res.c                   |  4 +---
 drivers/pci/vgaarb.c                      | 17 +++++------------
 drivers/pci/xen-pcifront.c                |  4 +---
 drivers/pnp/quirks.c                      | 29 ++++++++++-------------------
 include/linux/pci.h                       | 14 ++++++++++++++
 23 files changed, 111 insertions(+), 132 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index d988e9fa9b26..2048b0296d76 100644
--- a/.clang-format
+++ b/.clang-format
@@ -520,6 +520,7 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
+  - 'pci_dev_for_each_resource'
   - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 64fbfb0763b2..4458eb7f44f0 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -288,11 +288,10 @@ pcibios_claim_one_bus(struct pci_bus *b)
 	struct pci_bus *child_bus;
 
 	list_for_each_entry(dev, &b->devices, bus_list) {
+		struct resource *r;
 		int i;
 
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			struct resource *r = &dev->resource[i];
-
+		pci_dev_for_each_resource(dev, r, i) {
 			if (r->parent || !r->start || !r->flags)
 				continue;
 			if (pci_has_flag(PCI_PROBE_ONLY) ||
diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c
index e7ef2b5bea9c..d334c7fb672b 100644
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -142,15 +142,15 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND2, PCI_DEVICE_ID_WINBOND2_89C940F,
  */
 static void pci_fixup_dec21285(struct pci_dev *dev)
 {
-	int i;
-
 	if (dev->devfn == 0) {
+		struct resource *r;
+
 		dev->class &= 0xff;
 		dev->class |= PCI_CLASS_BRIDGE_HOST << 8;
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			dev->resource[i].start = 0;
-			dev->resource[i].end   = 0;
-			dev->resource[i].flags = 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start = 0;
+			r->end = 0;
+			r->flags = 0;
 		}
 	}
 }
@@ -162,13 +162,11 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, pci_fixup_d
 static void pci_fixup_ide_bases(struct pci_dev *dev)
 {
 	struct resource *r;
-	int i;
 
 	if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)
 		return;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		r = dev->resource + i;
+	pci_dev_for_each_resource(dev, r) {
 		if ((r->start & ~0x80) == 0x374) {
 			r->start |= 2;
 			r->end = r->start;
diff --git a/arch/arm/mach-dove/pcie.c b/arch/arm/mach-dove/pcie.c
index 754ca381f600..3044b7e03890 100644
--- a/arch/arm/mach-dove/pcie.c
+++ b/arch/arm/mach-dove/pcie.c
@@ -142,14 +142,14 @@ static struct pci_ops pcie_ops = {
 static void rc_pci_fixup(struct pci_dev *dev)
 {
 	if (dev->bus->parent == NULL && dev->devfn == 0) {
-		int i;
+		struct resource *r;
 
 		dev->class &= 0xff;
 		dev->class |= PCI_CLASS_BRIDGE_HOST << 8;
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			dev->resource[i].start = 0;
-			dev->resource[i].end   = 0;
-			dev->resource[i].flags = 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start = 0;
+			r->end   = 0;
+			r->flags = 0;
 		}
 	}
 }
diff --git a/arch/arm/mach-mv78xx0/pcie.c b/arch/arm/mach-mv78xx0/pcie.c
index 6190f538a124..0ebc909ea273 100644
--- a/arch/arm/mach-mv78xx0/pcie.c
+++ b/arch/arm/mach-mv78xx0/pcie.c
@@ -186,14 +186,14 @@ static struct pci_ops pcie_ops = {
 static void rc_pci_fixup(struct pci_dev *dev)
 {
 	if (dev->bus->parent == NULL && dev->devfn == 0) {
-		int i;
+		struct resource *r;
 
 		dev->class &= 0xff;
 		dev->class |= PCI_CLASS_BRIDGE_HOST << 8;
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			dev->resource[i].start = 0;
-			dev->resource[i].end   = 0;
-			dev->resource[i].flags = 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start = 0;
+			r->end   = 0;
+			r->flags = 0;
 		}
 	}
 }
diff --git a/arch/arm/mach-orion5x/pci.c b/arch/arm/mach-orion5x/pci.c
index 888fdc9099c5..3313bc5a63ea 100644
--- a/arch/arm/mach-orion5x/pci.c
+++ b/arch/arm/mach-orion5x/pci.c
@@ -522,14 +522,14 @@ static int __init pci_setup(struct pci_sys_data *sys)
 static void rc_pci_fixup(struct pci_dev *dev)
 {
 	if (dev->bus->parent == NULL && dev->devfn == 0) {
-		int i;
+		struct resource *r;
 
 		dev->class &= 0xff;
 		dev->class |= PCI_CLASS_BRIDGE_HOST << 8;
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			dev->resource[i].start = 0;
-			dev->resource[i].end   = 0;
-			dev->resource[i].flags = 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start	= 0;
+			r->end		= 0;
+			r->flags	= 0;
 		}
 	}
 }
diff --git a/arch/mips/pci/ops-bcm63xx.c b/arch/mips/pci/ops-bcm63xx.c
index dc6dc2741272..b0ea023c47c0 100644
--- a/arch/mips/pci/ops-bcm63xx.c
+++ b/arch/mips/pci/ops-bcm63xx.c
@@ -413,18 +413,18 @@ struct pci_ops bcm63xx_cb_ops = {
 static void bcm63xx_fixup(struct pci_dev *dev)
 {
 	static int io_window = -1;
-	int i, found, new_io_window;
+	int found, new_io_window;
+	struct resource *r;
 	u32 val;
 
 	/* look for any io resource */
 	found = 0;
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		if (pci_resource_flags(dev, i) & IORESOURCE_IO) {
+	pci_dev_for_each_resource(dev, r) {
+		if (resource_type(r) == IORESOURCE_IO) {
 			found = 1;
 			break;
 		}
 	}
-
 	if (!found)
 		return;
 
diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index 468722c8a5c6..ec2567f8efd8 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -249,12 +249,11 @@ static int pcibios_enable_resources(struct pci_dev *dev, int mask)
 
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	old_cmd = cmd;
-	for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
+	pci_dev_for_each_resource(dev, r, idx) {
 		/* Only set up the requested stuff */
 		if (!(mask & (1<<idx)))
 			continue;
 
-		r = &dev->resource[idx];
 		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
 			continue;
 		if ((idx == PCI_ROM_RESOURCE) &&
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index d67cf79bf5d0..e88d7c9feeec 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -880,6 +880,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 static void pcibios_fixup_resources(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct resource *res;
 	int i;
 
 	if (!hose) {
@@ -891,9 +892,9 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
 	if (dev->is_virtfn)
 		return;
 
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		struct resource *res = dev->resource + i;
+	pci_dev_for_each_resource(dev, res, i) {
 		struct pci_bus_region reg;
+
 		if (!res->flags)
 			continue;
 
@@ -1452,11 +1453,10 @@ void pcibios_claim_one_bus(struct pci_bus *bus)
 	struct pci_bus *child_bus;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
+		struct resource *r;
 		int i;
 
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			struct resource *r = &dev->resource[i];
-
+		pci_dev_for_each_resource(dev, r, i) {
 			if (r->parent || !r->start || !r->flags)
 				continue;
 
@@ -1705,19 +1705,20 @@ EXPORT_SYMBOL_GPL(pcibios_scan_phb);
 
 static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
 {
-	int i, class = dev->class >> 8;
+	int class = dev->class >> 8;
 	/* When configured as agent, programming interface = 1 */
 	int prog_if = dev->class & 0xf;
+	struct resource *r;
 
 	if ((class == PCI_CLASS_PROCESSOR_POWERPC ||
 	     class == PCI_CLASS_BRIDGE_OTHER) &&
 		(dev->hdr_type == PCI_HEADER_TYPE_NORMAL) &&
 		(prog_if == 0) &&
 		(dev->bus->parent == NULL)) {
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			dev->resource[i].start = 0;
-			dev->resource[i].end = 0;
-			dev->resource[i].flags = 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start = 0;
+			r->end = 0;
+			r->flags = 0;
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/4xx/pci.c b/arch/powerpc/platforms/4xx/pci.c
index ca5dd7a5842a..07dcc2b8007f 100644
--- a/arch/powerpc/platforms/4xx/pci.c
+++ b/arch/powerpc/platforms/4xx/pci.c
@@ -57,7 +57,7 @@ static inline int ppc440spe_revA(void)
 static void fixup_ppc4xx_pci_bridge(struct pci_dev *dev)
 {
 	struct pci_controller *hose;
-	int i;
+	struct resource *r;
 
 	if (dev->devfn != 0 || dev->bus->self != NULL)
 		return;
@@ -79,9 +79,9 @@ static void fixup_ppc4xx_pci_bridge(struct pci_dev *dev)
 	/* Hide the PCI host BARs from the kernel as their content doesn't
 	 * fit well in the resource management
 	 */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		dev->resource[i].start = dev->resource[i].end = 0;
-		dev->resource[i].flags = 0;
+	pci_dev_for_each_resource(dev, r) {
+		r->start = r->end = 0;
+		r->flags = 0;
 	}
 
 	printk(KERN_INFO "PCI: Hiding 4xx host bridge resources %s\n",
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
index 859e2818c43d..0ca4401ba781 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
@@ -327,14 +327,13 @@ mpc52xx_pci_setup(struct pci_controller *hose,
 static void
 mpc52xx_pci_fixup_resources(struct pci_dev *dev)
 {
-	int i;
+	struct resource *res;
 
 	pr_debug("%s() %.4x:%.4x\n", __func__, dev->vendor, dev->device);
 
 	/* We don't rely on boot loader for PCI and resets all
 	   devices */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		struct resource *res = &dev->resource[i];
+	pci_dev_for_each_resource(dev, res) {
 		if (res->end > res->start) {	/* Only valid resources */
 			res->end -= res->start;
 			res->start = 0;
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 60e0a58928ef..1772ae3d193d 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -240,7 +240,7 @@ void __init pSeries_final_fixup(void)
  */
 static void fixup_winbond_82c105(struct pci_dev* dev)
 {
-	int i;
+	struct resource *r;
 	unsigned int reg;
 
 	if (!machine_is(pseries))
@@ -251,14 +251,14 @@ static void fixup_winbond_82c105(struct pci_dev* dev)
 	/* Enable LEGIRQ to use INTC instead of ISA interrupts */
 	pci_write_config_dword(dev, 0x40, reg | (1<<11));
 
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+	pci_dev_for_each_resource(dev, r) {
 		/* zap the 2nd function of the winbond chip */
-		if (dev->resource[i].flags & IORESOURCE_IO
-		    && dev->bus->number == 0 && dev->devfn == 0x81)
-			dev->resource[i].flags &= ~IORESOURCE_IO;
-		if (dev->resource[i].start == 0 && dev->resource[i].end) {
-			dev->resource[i].flags = 0;
-			dev->resource[i].end = 0;
+		if (dev->bus->number == 0 && dev->devfn == 0x81 &&
+		    r->flags & IORESOURCE_IO)
+			r->flags &= ~IORESOURCE_IO;
+		if (r->start == 0 && r->end) {
+			r->flags = 0;
+			r->end = 0;
 		}
 	}
 }
diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c
index b0c2a5238d04..4f5e49f10805 100644
--- a/arch/sh/drivers/pci/pcie-sh7786.c
+++ b/arch/sh/drivers/pci/pcie-sh7786.c
@@ -140,12 +140,12 @@ static void sh7786_pci_fixup(struct pci_dev *dev)
 	 * Prevent enumeration of root complex resources.
 	 */
 	if (pci_is_root_bus(dev->bus) && dev->devfn == 0) {
-		int i;
+		struct resource *r;
 
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			dev->resource[i].start	= 0;
-			dev->resource[i].end	= 0;
-			dev->resource[i].flags	= 0;
+		pci_dev_for_each_resource(dev, r) {
+			r->start	= 0;
+			r->end		= 0;
+			r->flags	= 0;
 		}
 	}
 }
diff --git a/arch/sparc/kernel/leon_pci.c b/arch/sparc/kernel/leon_pci.c
index e5e5ff6b9a5c..b6663a3fbae9 100644
--- a/arch/sparc/kernel/leon_pci.c
+++ b/arch/sparc/kernel/leon_pci.c
@@ -62,15 +62,14 @@ void leon_pci_init(struct platform_device *ofdev, struct leon_pci_info *info)
 
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
+	struct resource *res;
 	u16 cmd, oldcmd;
 	int i;
 
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	oldcmd = cmd;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *res = &dev->resource[i];
-
+	pci_dev_for_each_resource(dev, res, i) {
 		/* Only set up the requested stuff */
 		if (!(mask & (1<<i)))
 			continue;
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index cb1ef25116e9..a948a49817c7 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -663,11 +663,10 @@ static void pci_claim_bus_resources(struct pci_bus *bus)
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
+		struct resource *r;
 		int i;
 
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			struct resource *r = &dev->resource[i];
-
+		pci_dev_for_each_resource(dev, r, i) {
 			if (r->parent || !r->start || !r->flags)
 				continue;
 
@@ -724,15 +723,14 @@ struct pci_bus *pci_scan_one_pbm(struct pci_pbm_info *pbm,
 
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
+	struct resource *res;
 	u16 cmd, oldcmd;
 	int i;
 
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	oldcmd = cmd;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *res = &dev->resource[i];
-
+	pci_dev_for_each_resource(dev, res, i) {
 		/* Only set up the requested stuff */
 		if (!(mask & (1<<i)))
 			continue;
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index ee4c9a9a171c..25fe0a061732 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -643,15 +643,14 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
+	struct resource *res;
 	u16 cmd, oldcmd;
 	int i;
 
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	oldcmd = cmd;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *res = &dev->resource[i];
-
+	pci_dev_for_each_resource(dev, res, i) {
 		/* Only set up the requested stuff */
 		if (!(mask & (1<<i)))
 			continue;
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 0145aef1b930..c049eddc1e2f 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -5,10 +5,9 @@
 
 static void pci_free_resources(struct pci_dev *dev)
 {
-	int i;
+	struct resource *res;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *res = dev->resource + i;
+	pci_dev_for_each_resource(dev, res) {
 		if (res->parent)
 			release_resource(res);
 	}
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index c690572b10ce..027b985dd1ee 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -124,20 +124,17 @@ static resource_size_t get_res_add_align(struct list_head *head,
 	return dev_res ? dev_res->min_align : 0;
 }
 
-
 /* Sort resources by alignment */
 static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
 {
+	struct resource *r;
 	int i;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *r;
+	pci_dev_for_each_resource(dev, r, i) {
 		struct pci_dev_resource *dev_res, *tmp;
 		resource_size_t r_align;
 		struct list_head *n;
 
-		r = &dev->resource[i];
-
 		if (r->flags & IORESOURCE_PCI_FIXED)
 			continue;
 
@@ -895,10 +892,9 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 
 	min_align = window_alignment(bus, IORESOURCE_IO);
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		int i;
+		struct resource *r;
 
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			struct resource *r = &dev->resource[i];
+		pci_dev_for_each_resource(dev, r) {
 			unsigned long r_size;
 
 			if (r->parent || !(r->flags & IORESOURCE_IO))
@@ -1014,10 +1010,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
+		struct resource *r;
 		int i;
 
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			struct resource *r = &dev->resource[i];
+		pci_dev_for_each_resource(dev, r, i) {
 			resource_size_t r_size;
 
 			if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) ||
@@ -1358,11 +1354,10 @@ static void assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
  */
 static void pdev_assign_fixed_resources(struct pci_dev *dev)
 {
-	int i;
+	struct resource *r;
 
-	for (i = 0; i <  PCI_NUM_RESOURCES; i++) {
+	pci_dev_for_each_resource(dev, r) {
 		struct pci_bus *b;
-		struct resource *r = &dev->resource[i];
 
 		if (r->parent || !(r->flags & IORESOURCE_PCI_FIXED) ||
 		    !(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
@@ -1795,11 +1790,9 @@ static void remove_dev_resources(struct pci_dev *dev, struct resource *io,
 				 struct resource *mmio,
 				 struct resource *mmio_pref)
 {
-	int i;
-
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		struct resource *res = &dev->resource[i];
+	struct resource *res;
 
+	pci_dev_for_each_resource(dev, res) {
 		if (resource_type(res) == IORESOURCE_IO) {
 			remove_dev_resource(io, dev, res);
 		} else if (resource_type(res) == IORESOURCE_MEM) {
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index b492e67c3d87..967f9a758923 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -484,12 +484,10 @@ int pci_enable_resources(struct pci_dev *dev, int mask)
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	old_cmd = cmd;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+	pci_dev_for_each_resource(dev, r, i) {
 		if (!(mask & (1 << i)))
 			continue;
 
-		r = &dev->resource[i];
-
 		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
 			continue;
 		if ((i == PCI_ROM_RESOURCE) &&
diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index f80b6ec88dc3..5a696078b382 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -548,10 +548,8 @@ static bool vga_is_firmware_default(struct pci_dev *pdev)
 #if defined(CONFIG_X86) || defined(CONFIG_IA64)
 	u64 base = screen_info.lfb_base;
 	u64 size = screen_info.lfb_size;
+	struct resource *r;
 	u64 limit;
-	resource_size_t start, end;
-	unsigned long flags;
-	int i;
 
 	/* Select the device owning the boot framebuffer if there is one */
 
@@ -561,19 +559,14 @@ static bool vga_is_firmware_default(struct pci_dev *pdev)
 	limit = base + size;
 
 	/* Does firmware framebuffer belong to us? */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		flags = pci_resource_flags(pdev, i);
-
-		if ((flags & IORESOURCE_MEM) == 0)
+	pci_dev_for_each_resource(pdev, r) {
+		if (resource_type(r) != IORESOURCE_MEM)
 			continue;
 
-		start = pci_resource_start(pdev, i);
-		end  = pci_resource_end(pdev, i);
-
-		if (!start || !end)
+		if (!r->start || !r->end)
 			continue;
 
-		if (base < start || limit >= end)
+		if (base < r->start || limit >= r->end)
 			continue;
 
 		return true;
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index fcd029ca2eb1..83c0ab50676d 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -390,9 +390,7 @@ static int pcifront_claim_resource(struct pci_dev *dev, void *data)
 	int i;
 	struct resource *r;
 
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		r = &dev->resource[i];
-
+	pci_dev_for_each_resource(dev, r, i) {
 		if (!r->parent && r->start && r->flags) {
 			dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n",
 				pci_name(dev), i);
diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c
index ac98b9919029..6085a1471de2 100644
--- a/drivers/pnp/quirks.c
+++ b/drivers/pnp/quirks.c
@@ -229,8 +229,7 @@ static void quirk_ad1815_mpu_resources(struct pnp_dev *dev)
 static void quirk_system_pci_resources(struct pnp_dev *dev)
 {
 	struct pci_dev *pdev = NULL;
-	struct resource *res;
-	resource_size_t pnp_start, pnp_end, pci_start, pci_end;
+	struct resource *res, *r;
 	int i, j;
 
 	/*
@@ -243,32 +242,26 @@ static void quirk_system_pci_resources(struct pnp_dev *dev)
 	 * so they won't be claimed by the PNP system driver.
 	 */
 	for_each_pci_dev(pdev) {
-		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-			unsigned long flags, type;
+		pci_dev_for_each_resource(pdev, r, i) {
+			unsigned long type = resource_type(r);
 
-			flags = pci_resource_flags(pdev, i);
-			type = flags & (IORESOURCE_IO | IORESOURCE_MEM);
-			if (!type || pci_resource_len(pdev, i) == 0)
+			if (!(type == IORESOURCE_IO || type == IORESOURCE_MEM) ||
+			    resource_size(r) == 0)
 				continue;
 
-			if (flags & IORESOURCE_UNSET)
+			if (r->flags & IORESOURCE_UNSET)
 				continue;
 
-			pci_start = pci_resource_start(pdev, i);
-			pci_end = pci_resource_end(pdev, i);
 			for (j = 0;
 			     (res = pnp_get_resource(dev, type, j)); j++) {
 				if (res->start == 0 && res->end == 0)
 					continue;
 
-				pnp_start = res->start;
-				pnp_end = res->end;
-
 				/*
 				 * If the PNP region doesn't overlap the PCI
 				 * region at all, there's no problem.
 				 */
-				if (pnp_end < pci_start || pnp_start > pci_end)
+				if (!resource_overlaps(res, r))
 					continue;
 
 				/*
@@ -278,8 +271,7 @@ static void quirk_system_pci_resources(struct pnp_dev *dev)
 				 * PNP device describes a bridge with PCI
 				 * behind it.
 				 */
-				if (pnp_start <= pci_start &&
-				    pnp_end >= pci_end)
+				if (res->start <= r->start && res->end >= r->end)
 					continue;
 
 				/*
@@ -288,9 +280,8 @@ static void quirk_system_pci_resources(struct pnp_dev *dev)
 				 * driver from requesting its resources.
 				 */
 				dev_warn(&dev->dev,
-					 "disabling %pR because it overlaps "
-					 "%s BAR %d %pR\n", res,
-					 pci_name(pdev), i, &pdev->resource[i]);
+					 "disabling %pR because it overlaps %s BAR %d %pR\n",
+					 res, pci_name(pdev), i, r);
 				res->flags |= IORESOURCE_DISABLED;
 			}
 		}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0239f12d0ba9..2234ef961bfe 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2002,6 +2002,20 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma);
 	(pci_resource_end((dev), (bar)) ? 				\
 	 resource_size(pci_resource_n((dev), (bar))) : 0)
 
+#define __pci_dev_for_each_res0(dev, res, ...)				\
+	for (unsigned int __b = 0;					\
+	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+	     __b++)
+
+#define __pci_dev_for_each_res1(dev, res, __b)				\
+	for (__b = 0;							\
+	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+	     __b++)
+
+#define pci_dev_for_each_resource(dev, res, ...)			\
+	CONCATENATE(__pci_dev_for_each_res, COUNT_ARGS(__VA_ARGS__)) 	\
+		    (dev, res, __VA_ARGS__)
+
 /*
  * Similar to the helpers above, these manipulate per-pci_dev
  * driver-specific data.  They are really just a wrapper around
-- 
cgit v1.2.3


From ceb928be5cab32043f230bcb1adeb0a9a9b8f0f2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 30 Mar 2023 19:24:31 +0300
Subject: PCI: Document pci_bus_for_each_resource()

There might be confusion about why pci_bus_for_each_resource() uses
Logical OR. Document the entire macro and explain how it works and why the
conditional needs to be like that.

Link: https://lore.kernel.org/r/20230330162434.35055-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2234ef961bfe..db3155562320 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1444,6 +1444,26 @@ int devm_request_pci_bus_resources(struct device *dev,
 /* Temporary until new and working PCI SBR API in place */
 int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
 
+/**
+ * pci_bus_for_each_resource - iterate over PCI bus resources
+ * @bus: the PCI bus
+ * @res: pointer to the current resource
+ * @i: index of the current resource
+ *
+ * Iterate over PCI bus resources. The first part is to go over PCI bus
+ * resource array, which has at most the %PCI_BRIDGE_RESOURCE_NUM entries.
+ * After that continue with the separate list of the additional resources,
+ * if not empty. That's why the Logical OR is being used.
+ *
+ * Possible usage:
+ *
+ *	struct pci_bus *bus = ...;
+ *	struct resource *res;
+ *	unsigned int i;
+ *
+ * 	pci_bus_for_each_resource(bus, res, i)
+ * 		pr_info("PCI bus resource[%u]: %pR\n", i, res);
+ */
 #define pci_bus_for_each_resource(bus, res, i)				\
 	for (i = 0;							\
 	    (res = pci_bus_resource_n(bus, i)) || i < PCI_BRIDGE_RESOURCE_NUM; \
-- 
cgit v1.2.3


From d564fa1ff19e893e2971d66e5c8f49dc1cdc8ffc Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 9 Jan 2023 15:11:52 +0200
Subject: asm-generic/io.h: suppress endianness warnings for readq() and
 writeq()

Commit c1d55d50139b ("asm-generic/io.h: Fix sparse warnings on
big-endian architectures") missed fixing the 64-bit accessors.

Arnd explains in the attached link why the casts are necessary, even if
__raw_readq() and __raw_writeq() do not take endian-specific types.

Link: https://lore.kernel.org/lkml/9105d6fc-880b-4734-857d-e3d30b87ccf6@app.fastmail.com/
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/io.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 4c44a29b5e8e..d78c3056c98f 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -236,7 +236,7 @@ static inline u64 readq(const volatile void __iomem *addr)
 
 	log_read_mmio(64, addr, _THIS_IP_, _RET_IP_);
 	__io_br();
-	val = __le64_to_cpu(__raw_readq(addr));
+	val = __le64_to_cpu((__le64 __force)__raw_readq(addr));
 	__io_ar(val);
 	log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_);
 	return val;
@@ -287,7 +287,7 @@ static inline void writeq(u64 value, volatile void __iomem *addr)
 {
 	log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_);
 	__io_bw();
-	__raw_writeq(__cpu_to_le64(value), addr);
+	__raw_writeq((u64 __force)__cpu_to_le64(value), addr);
 	__io_aw();
 	log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_);
 }
-- 
cgit v1.2.3


From 05d3855b4d21ef3c2df26be1cbba9d2c68915fcb Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 9 Jan 2023 15:11:53 +0200
Subject: asm-generic/io.h: suppress endianness warnings for relaxed accessors

Copy the forced type casts from the normal MMIO accessors to suppress
the sparse warnings that point out __raw_readl() returns a native endian
word (just like readl()).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/io.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index d78c3056c98f..587e7e9b9a37 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -319,7 +319,7 @@ static inline u16 readw_relaxed(const volatile void __iomem *addr)
 	u16 val;
 
 	log_read_mmio(16, addr, _THIS_IP_, _RET_IP_);
-	val = __le16_to_cpu(__raw_readw(addr));
+	val = __le16_to_cpu((__le16 __force)__raw_readw(addr));
 	log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_);
 	return val;
 }
@@ -332,7 +332,7 @@ static inline u32 readl_relaxed(const volatile void __iomem *addr)
 	u32 val;
 
 	log_read_mmio(32, addr, _THIS_IP_, _RET_IP_);
-	val = __le32_to_cpu(__raw_readl(addr));
+	val = __le32_to_cpu((__le32 __force)__raw_readl(addr));
 	log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_);
 	return val;
 }
@@ -345,7 +345,7 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr)
 	u64 val;
 
 	log_read_mmio(64, addr, _THIS_IP_, _RET_IP_);
-	val = __le64_to_cpu(__raw_readq(addr));
+	val = __le64_to_cpu((__le64 __force)__raw_readq(addr));
 	log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_);
 	return val;
 }
@@ -366,7 +366,7 @@ static inline void writeb_relaxed(u8 value, volatile void __iomem *addr)
 static inline void writew_relaxed(u16 value, volatile void __iomem *addr)
 {
 	log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_);
-	__raw_writew(cpu_to_le16(value), addr);
+	__raw_writew((u16 __force)cpu_to_le16(value), addr);
 	log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_);
 }
 #endif
@@ -376,7 +376,7 @@ static inline void writew_relaxed(u16 value, volatile void __iomem *addr)
 static inline void writel_relaxed(u32 value, volatile void __iomem *addr)
 {
 	log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_);
-	__raw_writel(__cpu_to_le32(value), addr);
+	__raw_writel((u32 __force)__cpu_to_le32(value), addr);
 	log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_);
 }
 #endif
@@ -386,7 +386,7 @@ static inline void writel_relaxed(u32 value, volatile void __iomem *addr)
 static inline void writeq_relaxed(u64 value, volatile void __iomem *addr)
 {
 	log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_);
-	__raw_writeq(__cpu_to_le64(value), addr);
+	__raw_writeq((u64 __force)__cpu_to_le64(value), addr);
 	log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_);
 }
 #endif
-- 
cgit v1.2.3


From 656e9007ef5862746cdf7ac16267c8e06e7b0989 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Mar 2023 09:53:31 +0100
Subject: asm-generic: avoid __generic_cmpxchg_local warnings

Code that passes a 32-bit constant into cmpxchg() produces a harmless
sparse warning because of the truncation in the branch that is not taken:

fs/erofs/zdata.c: note: in included file (through /home/arnd/arm-soc/arch/arm/include/asm/cmpxchg.h, /home/arnd/arm-soc/arch/arm/include/asm/atomic.h, /home/arnd/arm-soc/include/linux/atomic.h, ...):
include/asm-generic/cmpxchg-local.h:29:33: warning: cast truncates bits from constant value (5f0ecafe becomes fe)
include/asm-generic/cmpxchg-local.h:33:34: warning: cast truncates bits from constant value (5f0ecafe becomes cafe)
include/asm-generic/cmpxchg-local.h:29:33: warning: cast truncates bits from constant value (5f0ecafe becomes fe)
include/asm-generic/cmpxchg-local.h:30:42: warning: cast truncates bits from constant value (5f0edead becomes ad)
include/asm-generic/cmpxchg-local.h:33:34: warning: cast truncates bits from constant value (5f0ecafe becomes cafe)
include/asm-generic/cmpxchg-local.h:34:44: warning: cast truncates bits from constant value (5f0edead becomes dead)

This was reported as a regression to Matt's recent __generic_cmpxchg_local
patch, though this patch only added more warnings on top of the ones
that were already there.

Rewording the truncation to use an explicit bitmask instead of a cast
to a smaller type avoids the warning but otherwise leaves the code
unchanged.

I had another look at why the cast is even needed for atomic_cmpxchg(),
and as Matt describes the problem here is that atomic_t contains a
signed 'int', but cmpxchg() takes an 'unsigned long' argument, and
converting between the two leads to a 64-bit sign-extension of
negative 32-bit atomics.

I checked the other implementations of arch_cmpxchg() and did not find
any others that run into the same problem as __generic_cmpxchg_local(),
but it's easy to be on the safe side here and always convert the
signed int into an unsigned int when calling arch_cmpxchg(), as this
will work even when any of the arch_cmpxchg() implementations run
into the same problem.

Fixes: 624654152284 ("locking/atomic: cmpxchg: Make __generic_cmpxchg_local compare against zero-extended 'old' value")
Reviewed-by: Matt Evans <mev@rivosinc.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/atomic.h        |  4 ++--
 include/asm-generic/cmpxchg-local.h | 12 ++++++------
 include/asm-generic/cmpxchg.h       |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 04b8be9f1a77..e271d6708c87 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -130,7 +130,7 @@ ATOMIC_OP(xor, ^)
 #define arch_atomic_read(v)			READ_ONCE((v)->counter)
 #define arch_atomic_set(v, i)			WRITE_ONCE(((v)->counter), (i))
 
-#define arch_atomic_xchg(ptr, v)		(arch_xchg(&(ptr)->counter, (v)))
-#define arch_atomic_cmpxchg(v, old, new)	(arch_cmpxchg(&((v)->counter), (old), (new)))
+#define arch_atomic_xchg(ptr, v)		(arch_xchg(&(ptr)->counter, (u32)(v)))
+#define arch_atomic_cmpxchg(v, old, new)	(arch_cmpxchg(&((v)->counter), (u32)(old), (u32)(new)))
 
 #endif /* __ASM_GENERIC_ATOMIC_H */
diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h
index c3e7315b7c1d..3df9f59a544e 100644
--- a/include/asm-generic/cmpxchg-local.h
+++ b/include/asm-generic/cmpxchg-local.h
@@ -26,16 +26,16 @@ static inline unsigned long __generic_cmpxchg_local(volatile void *ptr,
 	raw_local_irq_save(flags);
 	switch (size) {
 	case 1: prev = *(u8 *)ptr;
-		if (prev == (u8)old)
-			*(u8 *)ptr = (u8)new;
+		if (prev == (old & 0xffu))
+			*(u8 *)ptr = (new & 0xffu);
 		break;
 	case 2: prev = *(u16 *)ptr;
-		if (prev == (u16)old)
-			*(u16 *)ptr = (u16)new;
+		if (prev == (old & 0xffffu))
+			*(u16 *)ptr = (new & 0xffffu);
 		break;
 	case 4: prev = *(u32 *)ptr;
-		if (prev == (u32)old)
-			*(u32 *)ptr = (u32)new;
+		if (prev == (old & 0xffffffffffu))
+			*(u32 *)ptr = (new & 0xffffffffu);
 		break;
 	case 8: prev = *(u64 *)ptr;
 		if (prev == old)
diff --git a/include/asm-generic/cmpxchg.h b/include/asm-generic/cmpxchg.h
index dca4419922a9..848de25fc4bf 100644
--- a/include/asm-generic/cmpxchg.h
+++ b/include/asm-generic/cmpxchg.h
@@ -32,7 +32,7 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size)
 #else
 		local_irq_save(flags);
 		ret = *(volatile u8 *)ptr;
-		*(volatile u8 *)ptr = x;
+		*(volatile u8 *)ptr = (x & 0xffu);
 		local_irq_restore(flags);
 		return ret;
 #endif /* __xchg_u8 */
@@ -43,7 +43,7 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size)
 #else
 		local_irq_save(flags);
 		ret = *(volatile u16 *)ptr;
-		*(volatile u16 *)ptr = x;
+		*(volatile u16 *)ptr = (x & 0xffffu);
 		local_irq_restore(flags);
 		return ret;
 #endif /* __xchg_u16 */
@@ -54,7 +54,7 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size)
 #else
 		local_irq_save(flags);
 		ret = *(volatile u32 *)ptr;
-		*(volatile u32 *)ptr = x;
+		*(volatile u32 *)ptr = (x & 0xffffffffu);
 		local_irq_restore(flags);
 		return ret;
 #endif /* __xchg_u32 */
-- 
cgit v1.2.3


From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:23 -0700
Subject: bpf: Remove unused arguments from btf_struct_access().

Remove unused arguments from btf_struct_access() callback.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h              |  3 +--
 include/linux/filter.h           |  3 +--
 kernel/bpf/verifier.c            |  4 ++--
 net/bpf/bpf_dummy_struct_ops.c   | 12 +++++-------
 net/core/filter.c                | 13 +++++--------
 net/ipv4/bpf_tcp_ca.c            |  3 +--
 net/netfilter/nf_conntrack_bpf.c |  3 +--
 7 files changed, 16 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2d8f3f639e68..4f689dda748f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -893,8 +893,7 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 	int (*btf_struct_access)(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag);
+				 int off, int size);
 };
 
 struct bpf_prog_offload_ops {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 23c08c31bea9..5364b0c52c1d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 extern struct mutex nf_conn_btf_access_lock;
 extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
-				     int off, int size, enum bpf_access_type atype,
-				     u32 *next_btf_id, enum bpf_type_flag *flag);
+				     int off, int size);
 
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83984568ccb4..5ca520e5eddf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5459,7 +5459,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
 	enum bpf_type_flag flag = 0;
-	u32 btf_id;
+	u32 btf_id = 0;
 	int ret;
 
 	if (!env->allow_ptr_leaks) {
@@ -5509,7 +5509,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
 			return -EFAULT;
 		}
-		ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = env->ops->btf_struct_access(&env->log, reg, off, size);
 	} else {
 		/* Writes are permitted with default btf_struct_access for
 		 * program allocated objects (which always have ref_obj_id > 0),
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 9535c8506cda..5918d1b32e19 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -173,14 +173,11 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
 
 static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id,
-					   enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *state;
 	const struct btf_type *t;
 	s32 type_id;
-	int err;
 
 	type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
 					BTF_KIND_STRUCT);
@@ -194,9 +191,10 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 		return -EACCES;
 	}
 
-	err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-	if (err < 0)
-		return err;
+	if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+		bpf_log(log, "write access at off %d with size %d\n", off, size);
+		return -EACCES;
+	}
 
 	return NOT_INIT;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 8b9f409a2ec3..1f2abf0f60e6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8742,20 +8742,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
 
 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 			      const struct bpf_reg_state *reg,
-			      int off, int size, enum bpf_access_type atype,
-			      u32 *next_btf_id, enum bpf_type_flag *flag);
+			      int off, int size);
 EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
 
 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
@@ -8822,14 +8820,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
 static int xdp_btf_struct_access(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag)
+				 int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index d6465876bbf6..4406d796cc2f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -72,8 +72,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
 
 static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	const struct btf_type *t;
 	size_t end;
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 002e9d24a1e9..3f821b7ba646 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -192,8 +192,7 @@ BTF_ID(struct, nf_conn___init)
 /* Check writes into `struct nf_conn` */
 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id, enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *ncit, *nct, *t;
 	size_t end;
-- 
cgit v1.2.3


From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:24 -0700
Subject: bpf: Refactor btf_nested_type_is_trusted().

btf_nested_type_is_trusted() tries to find a struct member at corresponding offset.
It works for flat structures and falls apart in more complex structs with nested structs.
The offset->member search is already performed by btf_struct_walk() including nested structs.
Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted()
instead of offset to make BTF_TYPE_SAFE*() logic more robust.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h   |  7 ++++---
 kernel/bpf/btf.c      | 44 +++++++++++++++++---------------------------
 kernel/bpf/verifier.c | 23 ++++++++++++-----------
 3 files changed, 33 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f689dda748f..002a811b6b90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size,
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype,
-		      u32 *next_btf_id, enum bpf_type_flag *flag);
+		      u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name);
 bool btf_struct_ids_match(struct bpf_verifier_log *log,
 			  const struct btf *btf, u32 id, int off,
 			  const struct btf *need_btf, u32 need_type_id,
@@ -2302,7 +2302,7 @@ struct bpf_core_ctx {
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix);
+				const char *field_name, u32 btf_id, const char *suffix);
 
 bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 			       const struct btf *reg_btf, u32 reg_id,
@@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
 static inline int btf_struct_access(struct bpf_verifier_log *log,
 				    const struct bpf_reg_state *reg,
 				    int off, int size, enum bpf_access_type atype,
-				    u32 *next_btf_id, enum bpf_type_flag *flag)
+				    u32 *next_btf_id, enum bpf_type_flag *flag,
+				    const char **field_name)
 {
 	return -EACCES;
 }
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b7e5a5510b91..593c45a294d0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6166,7 +6166,8 @@ enum bpf_struct_walk_result {
 
 static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 			   const struct btf_type *t, int off, int size,
-			   u32 *next_btf_id, enum bpf_type_flag *flag)
+			   u32 *next_btf_id, enum bpf_type_flag *flag,
+			   const char **field_name)
 {
 	u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
 	const struct btf_type *mtype, *elem_type = NULL;
@@ -6395,6 +6396,8 @@ error:
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
 				*flag |= tmp_flag;
+				if (field_name)
+					*field_name = mname;
 				return WALK_PTR;
 			}
 		}
@@ -6421,7 +6424,8 @@ error:
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype __maybe_unused,
-		      u32 *next_btf_id, enum bpf_type_flag *flag)
+		      u32 *next_btf_id, enum bpf_type_flag *flag,
+		      const char **field_name)
 {
 	const struct btf *btf = reg->btf;
 	enum bpf_type_flag tmp_flag = 0;
@@ -6453,7 +6457,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 
 	t = btf_type_by_id(btf, id);
 	do {
-		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
 
 		switch (err) {
 		case WALK_PTR:
@@ -6528,7 +6532,7 @@ again:
 	type = btf_type_by_id(btf, id);
 	if (!type)
 		return false;
-	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
 	if (err != WALK_STRUCT)
 		return false;
 
@@ -8488,16 +8492,15 @@ out:
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix)
+				const char *field_name, u32 btf_id, const char *suffix)
 {
 	struct btf *btf = reg->btf;
 	const struct btf_type *walk_type, *safe_type;
 	const char *tname;
 	char safe_tname[64];
 	long ret, safe_id;
-	const struct btf_member *member, *m_walk = NULL;
+	const struct btf_member *member;
 	u32 i;
-	const char *walk_name;
 
 	walk_type = btf_type_by_id(btf, reg->btf_id);
 	if (!walk_type)
@@ -8517,30 +8520,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 	if (!safe_type)
 		return false;
 
-	for_each_member(i, walk_type, member) {
-		u32 moff;
-
-		/* We're looking for the PTR_TO_BTF_ID member in the struct
-		 * type we're walking which matches the specified offset.
-		 * Below, we'll iterate over the fields in the safe variant of
-		 * the struct and see if any of them has a matching type /
-		 * name.
-		 */
-		moff = __btf_member_bit_offset(walk_type, member) / 8;
-		if (off == moff) {
-			m_walk = member;
-			break;
-		}
-	}
-	if (m_walk == NULL)
-		return false;
-
-	walk_name = __btf_name_by_offset(btf, m_walk->name_off);
 	for_each_member(i, safe_type, member) {
 		const char *m_name = __btf_name_by_offset(btf, member->name_off);
+		const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+		u32 id;
+
+		if (!btf_type_is_ptr(mtype))
+			continue;
 
+		btf_type_skip_modifiers(btf, mtype->type, &id);
 		/* If we match on both type and name, the field is considered trusted. */
-		if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+		if (btf_id == id && !strcmp(field_name, m_name))
 			return true;
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5ca520e5eddf..2cd2e0b725cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5400,12 +5400,12 @@ BTF_TYPE_SAFE_RCU(struct css_set) {
 
 /* full trusted: these fields are trusted even outside of RCU CS and never NULL */
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
-	__bpf_md_ptr(struct seq_file *, seq);
+	struct seq_file *seq;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
-	__bpf_md_ptr(struct bpf_iter_meta *, meta);
-	__bpf_md_ptr(struct task_struct *, task);
+	struct bpf_iter_meta *meta;
+	struct task_struct *task;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
@@ -5427,17 +5427,17 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) {
 
 static bool type_is_rcu(struct bpf_verifier_env *env,
 			struct bpf_reg_state *reg,
-			int off)
+			const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
 }
 
 static bool type_is_trusted(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *reg,
-			    int off)
+			    const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
@@ -5446,7 +5446,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5458,6 +5458,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = regs + regno;
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+	const char *field_name = NULL;
 	enum bpf_type_flag flag = 0;
 	u32 btf_id = 0;
 	int ret;
@@ -5526,7 +5527,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			return -EFAULT;
 		}
 
-		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
 	}
 
 	if (ret < 0)
@@ -5554,10 +5555,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * A regular RCU-protected pointer with __rcu tag can also be deemed
 		 * trusted if we are in an RCU CS. Such pointer can be NULL.
 		 */
-		if (type_is_trusted(env, reg, off)) {
+		if (type_is_trusted(env, reg, field_name, btf_id)) {
 			flag |= PTR_TRUSTED;
 		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
-			if (type_is_rcu(env, reg, off)) {
+			if (type_is_rcu(env, reg, field_name, btf_id)) {
 				/* ignore __rcu tag and mark it MEM_RCU */
 				flag |= MEM_RCU;
 			} else if (flag & MEM_RCU) {
@@ -5640,7 +5641,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 	/* Simulate access to a PTR_TO_BTF_ID */
 	memset(&map_reg, 0, sizeof(map_reg));
 	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
-	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 0a78cf7264d29abeca098eae0b188a10aabc8a32 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 3 Apr 2023 12:49:58 -0700
Subject: raw: Fix NULL deref in raw_get_next().

Dae R. Jeong reported a NULL deref in raw_get_next() [0].

It seems that the repro was running these sequences in parallel so
that one thread was iterating on a socket that was being freed in
another netns.

  unshare(0x40060200)
  r0 = syz_open_procfs(0x0, &(0x7f0000002080)='net/raw\x00')
  socket$inet_icmp_raw(0x2, 0x3, 0x1)
  pread64(r0, &(0x7f0000000000)=""/10, 0xa, 0x10000000007f)

After commit 0daf07e52709 ("raw: convert raw sockets to RCU"), we
use RCU and hlist_nulls_for_each_entry() to iterate over SOCK_RAW
sockets.  However, we should use spinlock for slow paths to avoid
the NULL deref.

Also, SOCK_RAW does not use SLAB_TYPESAFE_BY_RCU, and the slab object
is not reused during iteration in the grace period.  In fact, the
lockless readers do not check the nulls marker with get_nulls_value().
So, SOCK_RAW should use hlist instead of hlist_nulls.

Instead of adding an unnecessary barrier by sk_nulls_for_each_rcu(),
let's convert hlist_nulls to hlist and use sk_for_each_rcu() for
fast paths and sk_for_each() and spinlock for /proc/net/raw.

[0]:
general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f]
CPU: 2 PID: 20952 Comm: syz-executor.0 Not tainted 6.2.0-g048ec869bafd-dirty #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline]
RIP: 0010:sock_net include/net/sock.h:649 [inline]
RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline]
RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline]
RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995
Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef
RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206
RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000
RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338
RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9
R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78
R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030
FS:  00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055bb9614b35f CR3: 000000003c672000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 seq_read_iter+0x4c6/0x10f0 fs/seq_file.c:225
 seq_read+0x224/0x320 fs/seq_file.c:162
 pde_read fs/proc/inode.c:316 [inline]
 proc_reg_read+0x23f/0x330 fs/proc/inode.c:328
 vfs_read+0x31e/0xd30 fs/read_write.c:468
 ksys_pread64 fs/read_write.c:665 [inline]
 __do_sys_pread64 fs/read_write.c:675 [inline]
 __se_sys_pread64 fs/read_write.c:672 [inline]
 __x64_sys_pread64+0x1e9/0x280 fs/read_write.c:672
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x4e/0xa0 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x478d29
Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f843ae8dbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000011
RAX: ffffffffffffffda RBX: 0000000000791408 RCX: 0000000000478d29
RDX: 000000000000000a RSI: 0000000020000000 RDI: 0000000000000003
RBP: 00000000f477909a R08: 0000000000000000 R09: 0000000000000000
R10: 000010000000007f R11: 0000000000000246 R12: 0000000000791740
R13: 0000000000791414 R14: 0000000000791408 R15: 00007ffc2eb48a50
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline]
RIP: 0010:sock_net include/net/sock.h:649 [inline]
RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline]
RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline]
RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995
Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef
RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206
RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000
RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338
RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9
R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78
R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030
FS:  00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f92ff166000 CR3: 000000003c672000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU")
Reported-by: syzbot <syzkaller@googlegroups.com>
Reported-by: Dae R. Jeong <threeearcat@gmail.com>
Link: https://lore.kernel.org/netdev/ZCA2mGV_cmq7lIfV@dragonet/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/raw.h   |  4 ++--
 net/ipv4/raw.c      | 36 +++++++++++++++++++-----------------
 net/ipv4/raw_diag.c | 10 ++++------
 net/ipv6/raw.c      | 10 ++++------
 4 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 2c004c20ed99..3af5289fdead 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -37,7 +37,7 @@ int raw_rcv(struct sock *, struct sk_buff *);
 struct raw_hashinfo {
 	spinlock_t lock;
 
-	struct hlist_nulls_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned;
+	struct hlist_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned;
 };
 
 static inline u32 raw_hashfunc(const struct net *net, u32 proto)
@@ -51,7 +51,7 @@ static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo)
 
 	spin_lock_init(&hashinfo->lock);
 	for (i = 0; i < RAW_HTABLE_SIZE; i++)
-		INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i);
+		INIT_HLIST_HEAD(&hashinfo->ht[i]);
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 94df935ee0c5..8088a5011e7d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -91,12 +91,12 @@ EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
 int raw_hash_sk(struct sock *sk)
 {
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
-	struct hlist_nulls_head *hlist;
+	struct hlist_head *hlist;
 
 	hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];
 
 	spin_lock(&h->lock);
-	__sk_nulls_add_node_rcu(sk, hlist);
+	sk_add_node_rcu(sk, hlist);
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	spin_unlock(&h->lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -110,7 +110,7 @@ void raw_unhash_sk(struct sock *sk)
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 
 	spin_lock(&h->lock);
-	if (__sk_nulls_del_node_init_rcu(sk))
+	if (sk_del_node_init_rcu(sk))
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	spin_unlock(&h->lock);
 }
@@ -163,16 +163,15 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 static int raw_v4_input(struct net *net, struct sk_buff *skb,
 			const struct iphdr *iph, int hash)
 {
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
 	int sdif = inet_sdif(skb);
+	struct hlist_head *hlist;
 	int dif = inet_iif(skb);
 	int delivered = 0;
 	struct sock *sk;
 
 	hlist = &raw_v4_hashinfo.ht[hash];
 	rcu_read_lock();
-	sk_nulls_for_each(sk, hnode, hlist) {
+	sk_for_each_rcu(sk, hlist) {
 		if (!raw_v4_match(net, sk, iph->protocol,
 				  iph->saddr, iph->daddr, dif, sdif))
 			continue;
@@ -264,10 +263,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
 	int dif = skb->dev->ifindex;
 	int sdif = inet_sdif(skb);
+	struct hlist_head *hlist;
 	const struct iphdr *iph;
 	struct sock *sk;
 	int hash;
@@ -276,7 +274,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
 	hlist = &raw_v4_hashinfo.ht[hash];
 
 	rcu_read_lock();
-	sk_nulls_for_each(sk, hnode, hlist) {
+	sk_for_each_rcu(sk, hlist) {
 		iph = (const struct iphdr *)skb->data;
 		if (!raw_v4_match(net, sk, iph->protocol,
 				  iph->daddr, iph->saddr, dif, sdif))
@@ -950,14 +948,13 @@ static struct sock *raw_get_first(struct seq_file *seq, int bucket)
 {
 	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
 	struct raw_iter_state *state = raw_seq_private(seq);
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
+	struct hlist_head *hlist;
 	struct sock *sk;
 
 	for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
 			++state->bucket) {
 		hlist = &h->ht[state->bucket];
-		sk_nulls_for_each(sk, hnode, hlist) {
+		sk_for_each(sk, hlist) {
 			if (sock_net(sk) == seq_file_net(seq))
 				return sk;
 		}
@@ -970,7 +967,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 	struct raw_iter_state *state = raw_seq_private(seq);
 
 	do {
-		sk = sk_nulls_next(sk);
+		sk = sk_next(sk);
 	} while (sk && sock_net(sk) != seq_file_net(seq));
 
 	if (!sk)
@@ -989,9 +986,12 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 void *raw_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
+	__acquires(&h->lock)
 {
-	rcu_read_lock();
+	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+	spin_lock(&h->lock);
+
 	return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1010,9 +1010,11 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 EXPORT_SYMBOL_GPL(raw_seq_next);
 
 void raw_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
+	__releases(&h->lock)
 {
-	rcu_read_unlock();
+	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+	spin_unlock(&h->lock);
 }
 EXPORT_SYMBOL_GPL(raw_seq_stop);
 
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index 999321834b94..da3591a66a16 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -57,8 +57,7 @@ static bool raw_lookup(struct net *net, struct sock *sk,
 static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
 {
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
+	struct hlist_head *hlist;
 	struct sock *sk;
 	int slot;
 
@@ -68,7 +67,7 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2
 	rcu_read_lock();
 	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
 		hlist = &hashinfo->ht[slot];
-		sk_nulls_for_each(sk, hnode, hlist) {
+		sk_for_each_rcu(sk, hlist) {
 			if (raw_lookup(net, sk, r)) {
 				/*
 				 * Grab it and keep until we fill
@@ -142,9 +141,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
 	struct net *net = sock_net(skb->sk);
 	struct inet_diag_dump_data *cb_data;
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
 	int num, s_num, slot, s_slot;
+	struct hlist_head *hlist;
 	struct sock *sk = NULL;
 	struct nlattr *bc;
 
@@ -161,7 +159,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		num = 0;
 
 		hlist = &hashinfo->ht[slot];
-		sk_nulls_for_each(sk, hnode, hlist) {
+		sk_for_each_rcu(sk, hlist) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index bac9ba747bde..a327aa481df4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -141,10 +141,9 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
 static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct net *net = dev_net(skb->dev);
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
 	const struct in6_addr *saddr;
 	const struct in6_addr *daddr;
+	struct hlist_head *hlist;
 	struct sock *sk;
 	bool delivered = false;
 	__u8 hash;
@@ -155,7 +154,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	hash = raw_hashfunc(net, nexthdr);
 	hlist = &raw_v6_hashinfo.ht[hash];
 	rcu_read_lock();
-	sk_nulls_for_each(sk, hnode, hlist) {
+	sk_for_each_rcu(sk, hlist) {
 		int filtered;
 
 		if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
@@ -333,15 +332,14 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
-	struct hlist_nulls_head *hlist;
-	struct hlist_nulls_node *hnode;
+	struct hlist_head *hlist;
 	struct sock *sk;
 	int hash;
 
 	hash = raw_hashfunc(net, nexthdr);
 	hlist = &raw_v6_hashinfo.ht[hash];
 	rcu_read_lock();
-	sk_nulls_for_each(sk, hnode, hlist) {
+	sk_for_each_rcu(sk, hlist) {
 		/* Note: ipv6_hdr(skb) != skb->data */
 		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
 
-- 
cgit v1.2.3


From 054fbf7ff8143d35ca7d3bb5414bb44ee1574194 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 3 Apr 2023 17:43:16 +0200
Subject: net: qrtr: correct types of trace event parameters

The arguments passed to the trace events are of type unsigned int,
however the signature of the events used __le32 parameters.

I may be missing the point here, but sparse flagged this and it
does seem incorrect to me.

  net/qrtr/ns.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/qrtr.h):
  ./include/trace/events/qrtr.h:11:1: warning: cast to restricted __le32
  ./include/trace/events/qrtr.h:11:1: warning: restricted __le32 degrades to integer
  ./include/trace/events/qrtr.h:11:1: warning: restricted __le32 degrades to integer
  ... (a lot more similar warnings)
  net/qrtr/ns.c:115:47:    expected restricted __le32 [usertype] service
  net/qrtr/ns.c:115:47:    got unsigned int service
  net/qrtr/ns.c:115:61: warning: incorrect type in argument 2 (different base types)
  ... (a lot more similar warnings)

Fixes: dfddb54043f0 ("net: qrtr: Add tracepoint support")
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230402-qrtr-trace-types-v1-1-92ad55008dd3@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/qrtr.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h
index b1de14c3bb93..441132c67133 100644
--- a/include/trace/events/qrtr.h
+++ b/include/trace/events/qrtr.h
@@ -10,15 +10,16 @@
 
 TRACE_EVENT(qrtr_ns_service_announce_new,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
@@ -36,15 +37,16 @@ TRACE_EVENT(qrtr_ns_service_announce_new,
 
 TRACE_EVENT(qrtr_ns_service_announce_del,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
@@ -62,15 +64,16 @@ TRACE_EVENT(qrtr_ns_service_announce_del,
 
 TRACE_EVENT(qrtr_ns_server_add,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
-- 
cgit v1.2.3


From 123ee7550e52700475eff4d673723e03339e629f Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Thu, 16 Mar 2023 12:48:04 +0100
Subject: dt-bindings: clock: dispcc-qcm2290: Add MDSS_CORE reset

Add the MDSS_CORE reset which can be asserted to reset the state of
the entire MDSS.

Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230316-topic-qcm_dispcc_reset-v1-1-dd3708853014@linaro.org
---
 include/dt-bindings/clock/qcom,dispcc-qcm2290.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,dispcc-qcm2290.h b/include/dt-bindings/clock/qcom,dispcc-qcm2290.h
index 1db513d6b3ee..cb687949be41 100644
--- a/include/dt-bindings/clock/qcom,dispcc-qcm2290.h
+++ b/include/dt-bindings/clock/qcom,dispcc-qcm2290.h
@@ -29,6 +29,10 @@
 #define DISP_CC_XO_CLK				19
 #define DISP_CC_XO_CLK_SRC			20
 
+/* GDSCs */
 #define MDSS_GDSC				0
 
+/* Resets */
+#define DISP_CC_MDSS_CORE_BCR			0
+
 #endif
-- 
cgit v1.2.3


From 65457b74aa9437418e552e8d52d7112d4f9901a6 Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Mar 2023 12:54:16 +0200
Subject: sched/psi: Rename existing poll members in preparation

Renaming in PSI implementation to make a clear distinction between
privileged and unprivileged triggers code to be implemented in the
next patch.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20230330105418.77061-3-cerasuolodomenico@gmail.com
---
 include/linux/psi_types.h |  36 +++++-----
 kernel/sched/psi.c        | 163 +++++++++++++++++++++++-----------------------
 2 files changed, 100 insertions(+), 99 deletions(-)

(limited to 'include')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1e0a0d7ace3a..1819afa8b198 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -175,26 +175,26 @@ struct psi_group {
 	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
 
-	/* Monitor work control */
-	struct task_struct __rcu *poll_task;
-	struct timer_list poll_timer;
-	wait_queue_head_t poll_wait;
-	atomic_t poll_wakeup;
-	atomic_t poll_scheduled;
+	/* Monitor RT polling work control */
+	struct task_struct __rcu *rtpoll_task;
+	struct timer_list rtpoll_timer;
+	wait_queue_head_t rtpoll_wait;
+	atomic_t rtpoll_wakeup;
+	atomic_t rtpoll_scheduled;
 
 	/* Protects data used by the monitor */
-	struct mutex trigger_lock;
-
-	/* Configured polling triggers */
-	struct list_head triggers;
-	u32 nr_triggers[NR_PSI_STATES - 1];
-	u32 poll_states;
-	u64 poll_min_period;
-
-	/* Total stall times at the start of monitor activation */
-	u64 polling_total[NR_PSI_STATES - 1];
-	u64 polling_next_update;
-	u64 polling_until;
+	struct mutex rtpoll_trigger_lock;
+
+	/* Configured RT polling triggers */
+	struct list_head rtpoll_triggers;
+	u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
+	u32 rtpoll_states;
+	u64 rtpoll_min_period;
+
+	/* Total stall times at the start of RT polling monitor activation */
+	u64 rtpoll_total[NR_PSI_STATES - 1];
+	u64 rtpoll_next_update;
+	u64 rtpoll_until;
 };
 
 #else /* CONFIG_PSI */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index fe9269f1d2a4..a3d0b5cf797a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -189,14 +189,14 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
-	mutex_init(&group->trigger_lock);
-	INIT_LIST_HEAD(&group->triggers);
-	group->poll_min_period = U32_MAX;
-	group->polling_next_update = ULLONG_MAX;
-	init_waitqueue_head(&group->poll_wait);
-	timer_setup(&group->poll_timer, poll_timer_fn, 0);
-	rcu_assign_pointer(group->poll_task, NULL);
+	atomic_set(&group->rtpoll_scheduled, 0);
+	mutex_init(&group->rtpoll_trigger_lock);
+	INIT_LIST_HEAD(&group->rtpoll_triggers);
+	group->rtpoll_min_period = U32_MAX;
+	group->rtpoll_next_update = ULLONG_MAX;
+	init_waitqueue_head(&group->rtpoll_wait);
+	timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+	rcu_assign_pointer(group->rtpoll_task, NULL);
 }
 
 void __init psi_init(void)
@@ -440,11 +440,11 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	 * On subsequent updates, calculate growth deltas and let
 	 * watchers know when their specified thresholds are exceeded.
 	 */
-	list_for_each_entry(t, &group->triggers, node) {
+	list_for_each_entry(t, &group->rtpoll_triggers, node) {
 		u64 growth;
 		bool new_stall;
 
-		new_stall = group->polling_total[t->state] != total[t->state];
+		new_stall = group->rtpoll_total[t->state] != total[t->state];
 
 		/* Check for stall activity or a previous threshold breach */
 		if (!new_stall && !t->pending_event)
@@ -486,10 +486,10 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	}
 
 	if (update_total)
-		memcpy(group->polling_total, total,
-				sizeof(group->polling_total));
+		memcpy(group->rtpoll_total, total,
+				sizeof(group->rtpoll_total));
 
-	return now + group->poll_min_period;
+	return now + group->rtpoll_min_period;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
@@ -582,53 +582,53 @@ static void init_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
 
-	list_for_each_entry(t, &group->triggers, node)
+	list_for_each_entry(t, &group->rtpoll_triggers, node)
 		window_reset(&t->win, now,
 				group->total[PSI_POLL][t->state], 0);
-	memcpy(group->polling_total, group->total[PSI_POLL],
-		   sizeof(group->polling_total));
-	group->polling_next_update = now + group->poll_min_period;
+	memcpy(group->rtpoll_total, group->total[PSI_POLL],
+		   sizeof(group->rtpoll_total));
+	group->rtpoll_next_update = now + group->rtpoll_min_period;
 }
 
 /* Schedule polling if it's not already scheduled or forced. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
 				   bool force)
 {
 	struct task_struct *task;
 
 	/*
 	 * atomic_xchg should be called even when !force to provide a
-	 * full memory barrier (see the comment inside psi_poll_work).
+	 * full memory barrier (see the comment inside psi_rtpoll_work).
 	 */
-	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
+	if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
 		return;
 
 	rcu_read_lock();
 
-	task = rcu_dereference(group->poll_task);
+	task = rcu_dereference(group->rtpoll_task);
 	/*
 	 * kworker might be NULL in case psi_trigger_destroy races with
 	 * psi_task_change (hotpath) which can't use locks
 	 */
 	if (likely(task))
-		mod_timer(&group->poll_timer, jiffies + delay);
+		mod_timer(&group->rtpoll_timer, jiffies + delay);
 	else
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 
 	rcu_read_unlock();
 }
 
-static void psi_poll_work(struct psi_group *group)
+static void psi_rtpoll_work(struct psi_group *group)
 {
 	bool force_reschedule = false;
 	u32 changed_states;
 	u64 now;
 
-	mutex_lock(&group->trigger_lock);
+	mutex_lock(&group->rtpoll_trigger_lock);
 
 	now = sched_clock();
 
-	if (now > group->polling_until) {
+	if (now > group->rtpoll_until) {
 		/*
 		 * We are either about to start or might stop polling if no
 		 * state change was recorded. Resetting poll_scheduled leaves
@@ -638,7 +638,7 @@ static void psi_poll_work(struct psi_group *group)
 		 * should be negligible and polling_next_update still keeps
 		 * updates correctly on schedule.
 		 */
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 		/*
 		 * A task change can race with the poll worker that is supposed to
 		 * report on it. To avoid missing events, ensure ordering between
@@ -667,9 +667,9 @@ static void psi_poll_work(struct psi_group *group)
 
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
-	if (changed_states & group->poll_states) {
+	if (changed_states & group->rtpoll_states) {
 		/* Initialize trigger windows when entering polling mode */
-		if (now > group->polling_until)
+		if (now > group->rtpoll_until)
 			init_triggers(group, now);
 
 		/*
@@ -677,50 +677,50 @@ static void psi_poll_work(struct psi_group *group)
 		 * minimum tracking window as long as monitor states are
 		 * changing.
 		 */
-		group->polling_until = now +
-			group->poll_min_period * UPDATES_PER_WINDOW;
+		group->rtpoll_until = now +
+			group->rtpoll_min_period * UPDATES_PER_WINDOW;
 	}
 
-	if (now > group->polling_until) {
-		group->polling_next_update = ULLONG_MAX;
+	if (now > group->rtpoll_until) {
+		group->rtpoll_next_update = ULLONG_MAX;
 		goto out;
 	}
 
-	if (now >= group->polling_next_update)
-		group->polling_next_update = update_triggers(group, now);
+	if (now >= group->rtpoll_next_update)
+		group->rtpoll_next_update = update_triggers(group, now);
 
-	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1,
+	psi_schedule_rtpoll_work(group,
+		nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
 		force_reschedule);
 
 out:
-	mutex_unlock(&group->trigger_lock);
+	mutex_unlock(&group->rtpoll_trigger_lock);
 }
 
-static int psi_poll_worker(void *data)
+static int psi_rtpoll_worker(void *data)
 {
 	struct psi_group *group = (struct psi_group *)data;
 
 	sched_set_fifo_low(current);
 
 	while (true) {
-		wait_event_interruptible(group->poll_wait,
-				atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+		wait_event_interruptible(group->rtpoll_wait,
+				atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
 				kthread_should_stop());
 		if (kthread_should_stop())
 			break;
 
-		psi_poll_work(group);
+		psi_rtpoll_work(group);
 	}
 	return 0;
 }
 
 static void poll_timer_fn(struct timer_list *t)
 {
-	struct psi_group *group = from_timer(group, t, poll_timer);
+	struct psi_group *group = from_timer(group, t, rtpoll_timer);
 
-	atomic_set(&group->poll_wakeup, 1);
-	wake_up_interruptible(&group->poll_wait);
+	atomic_set(&group->rtpoll_wakeup, 1);
+	wake_up_interruptible(&group->rtpoll_wait);
 }
 
 static void record_times(struct psi_group_cpu *groupc, u64 now)
@@ -851,8 +851,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 	write_seqcount_end(&groupc->seq);
 
-	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1, false);
+	if (state_mask & group->rtpoll_states)
+		psi_schedule_rtpoll_work(group, 1, false);
 
 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -1005,8 +1005,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 
 		write_seqcount_end(&groupc->seq);
 
-		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1, false);
+		if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_rtpoll_work(group, 1, false);
 	} while ((group = group->parent));
 }
 #endif
@@ -1101,7 +1101,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
 	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
 	free_percpu(cgroup->psi->pcpu);
 	/* All triggers must be removed by now */
-	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+	WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
 	kfree(cgroup->psi);
 }
 
@@ -1302,29 +1302,29 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	init_waitqueue_head(&t->event_wait);
 	t->pending_event = false;
 
-	mutex_lock(&group->trigger_lock);
+	mutex_lock(&group->rtpoll_trigger_lock);
 
-	if (!rcu_access_pointer(group->poll_task)) {
+	if (!rcu_access_pointer(group->rtpoll_task)) {
 		struct task_struct *task;
 
-		task = kthread_create(psi_poll_worker, group, "psimon");
+		task = kthread_create(psi_rtpoll_worker, group, "psimon");
 		if (IS_ERR(task)) {
 			kfree(t);
-			mutex_unlock(&group->trigger_lock);
+			mutex_unlock(&group->rtpoll_trigger_lock);
 			return ERR_CAST(task);
 		}
-		atomic_set(&group->poll_wakeup, 0);
+		atomic_set(&group->rtpoll_wakeup, 0);
 		wake_up_process(task);
-		rcu_assign_pointer(group->poll_task, task);
+		rcu_assign_pointer(group->rtpoll_task, task);
 	}
 
-	list_add(&t->node, &group->triggers);
-	group->poll_min_period = min(group->poll_min_period,
+	list_add(&t->node, &group->rtpoll_triggers);
+	group->rtpoll_min_period = min(group->rtpoll_min_period,
 		div_u64(t->win.size, UPDATES_PER_WINDOW));
-	group->nr_triggers[t->state]++;
-	group->poll_states |= (1 << t->state);
+	group->rtpoll_nr_triggers[t->state]++;
+	group->rtpoll_states |= (1 << t->state);
 
-	mutex_unlock(&group->trigger_lock);
+	mutex_unlock(&group->rtpoll_trigger_lock);
 
 	return t;
 }
@@ -1349,51 +1349,52 @@ void psi_trigger_destroy(struct psi_trigger *t)
 	 */
 	wake_up_pollfree(&t->event_wait);
 
-	mutex_lock(&group->trigger_lock);
+	mutex_lock(&group->rtpoll_trigger_lock);
 
 	if (!list_empty(&t->node)) {
 		struct psi_trigger *tmp;
 		u64 period = ULLONG_MAX;
 
 		list_del(&t->node);
-		group->nr_triggers[t->state]--;
-		if (!group->nr_triggers[t->state])
-			group->poll_states &= ~(1 << t->state);
+		group->rtpoll_nr_triggers[t->state]--;
+		if (!group->rtpoll_nr_triggers[t->state])
+			group->rtpoll_states &= ~(1 << t->state);
 		/* reset min update period for the remaining triggers */
-		list_for_each_entry(tmp, &group->triggers, node)
+		list_for_each_entry(tmp, &group->rtpoll_triggers, node)
 			period = min(period, div_u64(tmp->win.size,
 					UPDATES_PER_WINDOW));
-		group->poll_min_period = period;
-		/* Destroy poll_task when the last trigger is destroyed */
-		if (group->poll_states == 0) {
-			group->polling_until = 0;
+		group->rtpoll_min_period = period;
+		/* Destroy rtpoll_task when the last trigger is destroyed */
+		if (group->rtpoll_states == 0) {
+			group->rtpoll_until = 0;
 			task_to_destroy = rcu_dereference_protected(
-					group->poll_task,
-					lockdep_is_held(&group->trigger_lock));
-			rcu_assign_pointer(group->poll_task, NULL);
-			del_timer(&group->poll_timer);
+					group->rtpoll_task,
+					lockdep_is_held(&group->rtpoll_trigger_lock));
+			rcu_assign_pointer(group->rtpoll_task, NULL);
+			del_timer(&group->rtpoll_timer);
 		}
 	}
 
-	mutex_unlock(&group->trigger_lock);
+	mutex_unlock(&group->rtpoll_trigger_lock);
 
 	/*
-	 * Wait for psi_schedule_poll_work RCU to complete its read-side
+	 * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
 	 * critical section before destroying the trigger and optionally the
-	 * poll_task.
+	 * rtpoll_task.
 	 */
 	synchronize_rcu();
 	/*
-	 * Stop kthread 'psimon' after releasing trigger_lock to prevent a
-	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
+	 * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+	 * a deadlock while waiting for psi_rtpoll_work to acquire
+	 * rtpoll_trigger_lock
 	 */
 	if (task_to_destroy) {
 		/*
 		 * After the RCU grace period has expired, the worker
-		 * can no longer be found through group->poll_task.
+		 * can no longer be found through group->rtpoll_task.
 		 */
 		kthread_stop(task_to_destroy);
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 	}
 	kfree(t);
 }
-- 
cgit v1.2.3


From d82caa273565b45fcf103148950549af76c314b0 Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Mar 2023 12:54:18 +0200
Subject: sched/psi: Allow unprivileged polling of N*2s period

PSI offers 2 mechanisms to get information about a specific resource
pressure. One is reading from /proc/pressure/<resource>, which gives
average pressures aggregated every 2s. The other is creating a pollable
fd for a specific resource and cgroup.

The trigger creation requires CAP_SYS_RESOURCE, and gives the
possibility to pick specific time window and threshold, spawing an RT
thread to aggregate the data.

Systemd would like to provide containers the option to monitor pressure
on their own cgroup and sub-cgroups. For example, if systemd launches a
container that itself then launches services, the container should have
the ability to poll() for pressure in individual services. But neither
the container nor the services are privileged.

This patch implements a mechanism to allow unprivileged users to create
pressure triggers. The difference with privileged triggers creation is
that unprivileged ones must have a time window that's a multiple of 2s.
This is so that we can avoid unrestricted spawning of rt threads, and
use instead the same aggregation mechanism done for the averages, which
runs independently of any triggers.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20230330105418.77061-5-cerasuolodomenico@gmail.com
---
 Documentation/accounting/psi.rst |   4 +
 include/linux/psi.h              |   2 +-
 include/linux/psi_types.h        |   7 ++
 kernel/cgroup/cgroup.c           |   2 +-
 kernel/sched/psi.c               | 175 ++++++++++++++++++++++++---------------
 5 files changed, 121 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index 5e40b3f437f9..df6062eb3abb 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
 after which monitors are most likely not needed and psi averages can be used
 instead.
 
+Unprivileged users can also create monitors, with the only limitation that the
+window size must be a multiple of 2s, in order to prevent excessive resource
+usage.
+
 When activated, psi monitor stays active for at least the duration of one
 tracking window to avoid repeated activations/deactivations when system is
 bouncing in and out of the stall state.
diff --git a/include/linux/psi.h b/include/linux/psi.h
index b029a847def1..ab26200c2803 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res);
+			char *buf, enum psi_res res, struct file *file);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1819afa8b198..040c089581c6 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -151,6 +151,9 @@ struct psi_trigger {
 
 	/* Deferred event(s) from previous ratelimit window */
 	bool pending_event;
+
+	/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
+	enum psi_aggregators aggregator;
 };
 
 struct psi_group {
@@ -171,6 +174,10 @@ struct psi_group {
 	/* Aggregator work control */
 	struct delayed_work avgs_work;
 
+	/* Unprivileged triggers against N*PSI_FREQ windows */
+	struct list_head avg_triggers;
+	u32 avg_nr_triggers[NR_PSI_STATES - 1];
+
 	/* Total stall times and sampled pressure averages */
 	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 935e8121b21e..dead36969bba 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3761,7 +3761,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	psi = cgroup_psi(cgrp);
-	new = psi_trigger_create(psi, buf, res);
+	new = psi_trigger_create(psi, buf, res, of->file);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
 		return PTR_ERR(new);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index f3df6a8ff493..e072f6b31bf3 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -186,9 +186,14 @@ static void group_init(struct psi_group *group)
 		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
-	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
-	/* Init trigger-related members */
+
+	/* Init avg trigger-related members */
+	INIT_LIST_HEAD(&group->avg_triggers);
+	memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+	/* Init rtpoll trigger-related members */
 	atomic_set(&group->rtpoll_scheduled, 0);
 	mutex_init(&group->rtpoll_trigger_lock);
 	INIT_LIST_HEAD(&group->rtpoll_triggers);
@@ -430,21 +435,32 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total)
+static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+						   enum psi_aggregators aggregator)
 {
 	struct psi_trigger *t;
-	u64 *total = group->total[PSI_POLL];
+	u64 *total = group->total[aggregator];
+	struct list_head *triggers;
+	u64 *aggregator_total;
 	*update_total = false;
 
+	if (aggregator == PSI_AVGS) {
+		triggers = &group->avg_triggers;
+		aggregator_total = group->avg_total;
+	} else {
+		triggers = &group->rtpoll_triggers;
+		aggregator_total = group->rtpoll_total;
+	}
+
 	/*
 	 * On subsequent updates, calculate growth deltas and let
 	 * watchers know when their specified thresholds are exceeded.
 	 */
-	list_for_each_entry(t, &group->rtpoll_triggers, node) {
+	list_for_each_entry(t, triggers, node) {
 		u64 growth;
 		bool new_stall;
 
-		new_stall = group->rtpoll_total[t->state] != total[t->state];
+		new_stall = aggregator_total[t->state] != total[t->state];
 
 		/* Check for stall activity or a previous threshold breach */
 		if (!new_stall && !t->pending_event)
@@ -546,6 +562,7 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
+	bool update_total;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -563,8 +580,10 @@ static void psi_avgs_work(struct work_struct *work)
 	 * Once restarted, we'll catch up the running averages in one
 	 * go - see calc_avgs() and missed_periods.
 	 */
-	if (now >= group->avg_next_update)
+	if (now >= group->avg_next_update) {
+		update_triggers(group, now, &update_total, PSI_AVGS);
 		group->avg_next_update = update_averages(group, now);
+	}
 
 	if (changed_states & PSI_STATE_RESCHEDULE) {
 		schedule_delayed_work(dwork, nsecs_to_jiffies(
@@ -574,7 +593,7 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
-static void init_triggers(struct psi_group *group, u64 now)
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
 
@@ -667,7 +686,7 @@ static void psi_rtpoll_work(struct psi_group *group)
 	if (changed_states & group->rtpoll_states) {
 		/* Initialize trigger windows when entering polling mode */
 		if (now > group->rtpoll_until)
-			init_triggers(group, now);
+			init_rtpoll_triggers(group, now);
 
 		/*
 		 * Keep the monitor active for at least the duration of the
@@ -684,7 +703,7 @@ static void psi_rtpoll_work(struct psi_group *group)
 	}
 
 	if (now >= group->rtpoll_next_update) {
-		group->rtpoll_next_update = update_triggers(group, now, &update_total);
+		group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
 		if (update_total)
 			memcpy(group->rtpoll_total, group->total[PSI_POLL],
 				   sizeof(group->rtpoll_total));
@@ -1254,16 +1273,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res)
+			char *buf, enum psi_res res, struct file *file)
 {
 	struct psi_trigger *t;
 	enum psi_states state;
 	u32 threshold_us;
+	bool privileged;
 	u32 window_us;
 
 	if (static_branch_likely(&psi_disabled))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	/*
+	 * Checking the privilege here on file->f_cred implies that a privileged user
+	 * could open the file and delegate the write to an unprivileged one.
+	 */
+	privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
 	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
 		state = PSI_IO_SOME + res * 2;
 	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1283,6 +1309,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 		window_us > WINDOW_MAX_US)
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * Unprivileged users can only use 2s windows so that averages aggregation
+	 * work is used, and no RT threads need to be spawned.
+	 */
+	if (!privileged && window_us % 2000000)
+		return ERR_PTR(-EINVAL);
+
 	/* Check threshold */
 	if (threshold_us == 0 || threshold_us > window_us)
 		return ERR_PTR(-EINVAL);
@@ -1302,31 +1335,40 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
 	t->pending_event = false;
+	t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
 
-	mutex_lock(&group->rtpoll_trigger_lock);
+	if (privileged) {
+		mutex_lock(&group->rtpoll_trigger_lock);
 
-	if (!rcu_access_pointer(group->rtpoll_task)) {
-		struct task_struct *task;
+		if (!rcu_access_pointer(group->rtpoll_task)) {
+			struct task_struct *task;
 
-		task = kthread_create(psi_rtpoll_worker, group, "psimon");
-		if (IS_ERR(task)) {
-			kfree(t);
-			mutex_unlock(&group->rtpoll_trigger_lock);
-			return ERR_CAST(task);
+			task = kthread_create(psi_rtpoll_worker, group, "psimon");
+			if (IS_ERR(task)) {
+				kfree(t);
+				mutex_unlock(&group->rtpoll_trigger_lock);
+				return ERR_CAST(task);
+			}
+			atomic_set(&group->rtpoll_wakeup, 0);
+			wake_up_process(task);
+			rcu_assign_pointer(group->rtpoll_task, task);
 		}
-		atomic_set(&group->rtpoll_wakeup, 0);
-		wake_up_process(task);
-		rcu_assign_pointer(group->rtpoll_task, task);
-	}
 
-	list_add(&t->node, &group->rtpoll_triggers);
-	group->rtpoll_min_period = min(group->rtpoll_min_period,
-		div_u64(t->win.size, UPDATES_PER_WINDOW));
-	group->rtpoll_nr_triggers[t->state]++;
-	group->rtpoll_states |= (1 << t->state);
+		list_add(&t->node, &group->rtpoll_triggers);
+		group->rtpoll_min_period = min(group->rtpoll_min_period,
+			div_u64(t->win.size, UPDATES_PER_WINDOW));
+		group->rtpoll_nr_triggers[t->state]++;
+		group->rtpoll_states |= (1 << t->state);
 
-	mutex_unlock(&group->rtpoll_trigger_lock);
+		mutex_unlock(&group->rtpoll_trigger_lock);
+	} else {
+		mutex_lock(&group->avgs_lock);
+
+		list_add(&t->node, &group->avg_triggers);
+		group->avg_nr_triggers[t->state]++;
 
+		mutex_unlock(&group->avgs_lock);
+	}
 	return t;
 }
 
@@ -1350,34 +1392,41 @@ void psi_trigger_destroy(struct psi_trigger *t)
 	 */
 	wake_up_pollfree(&t->event_wait);
 
-	mutex_lock(&group->rtpoll_trigger_lock);
-
-	if (!list_empty(&t->node)) {
-		struct psi_trigger *tmp;
-		u64 period = ULLONG_MAX;
-
-		list_del(&t->node);
-		group->rtpoll_nr_triggers[t->state]--;
-		if (!group->rtpoll_nr_triggers[t->state])
-			group->rtpoll_states &= ~(1 << t->state);
-		/* reset min update period for the remaining triggers */
-		list_for_each_entry(tmp, &group->rtpoll_triggers, node)
-			period = min(period, div_u64(tmp->win.size,
-					UPDATES_PER_WINDOW));
-		group->rtpoll_min_period = period;
-		/* Destroy rtpoll_task when the last trigger is destroyed */
-		if (group->rtpoll_states == 0) {
-			group->rtpoll_until = 0;
-			task_to_destroy = rcu_dereference_protected(
-					group->rtpoll_task,
-					lockdep_is_held(&group->rtpoll_trigger_lock));
-			rcu_assign_pointer(group->rtpoll_task, NULL);
-			del_timer(&group->rtpoll_timer);
+	if (t->aggregator == PSI_AVGS) {
+		mutex_lock(&group->avgs_lock);
+		if (!list_empty(&t->node)) {
+			list_del(&t->node);
+			group->avg_nr_triggers[t->state]--;
 		}
+		mutex_unlock(&group->avgs_lock);
+	} else {
+		mutex_lock(&group->rtpoll_trigger_lock);
+		if (!list_empty(&t->node)) {
+			struct psi_trigger *tmp;
+			u64 period = ULLONG_MAX;
+
+			list_del(&t->node);
+			group->rtpoll_nr_triggers[t->state]--;
+			if (!group->rtpoll_nr_triggers[t->state])
+				group->rtpoll_states &= ~(1 << t->state);
+			/* reset min update period for the remaining triggers */
+			list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+				period = min(period, div_u64(tmp->win.size,
+						UPDATES_PER_WINDOW));
+			group->rtpoll_min_period = period;
+			/* Destroy rtpoll_task when the last trigger is destroyed */
+			if (group->rtpoll_states == 0) {
+				group->rtpoll_until = 0;
+				task_to_destroy = rcu_dereference_protected(
+						group->rtpoll_task,
+						lockdep_is_held(&group->rtpoll_trigger_lock));
+				rcu_assign_pointer(group->rtpoll_task, NULL);
+				del_timer(&group->rtpoll_timer);
+			}
+		}
+		mutex_unlock(&group->rtpoll_trigger_lock);
 	}
 
-	mutex_unlock(&group->rtpoll_trigger_lock);
-
 	/*
 	 * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
 	 * critical section before destroying the trigger and optionally the
@@ -1437,27 +1486,19 @@ static int psi_cpu_show(struct seq_file *m, void *v)
 	return psi_show(m, &psi_system, PSI_CPU);
 }
 
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
-	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-		return -EPERM;
-
-	return single_open(file, psi_show, NULL);
-}
-
 static int psi_io_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_io_show);
+	return single_open(file, psi_io_show, NULL);
 }
 
 static int psi_memory_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_memory_show);
+	return single_open(file, psi_memory_show, NULL);
 }
 
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_cpu_show);
+	return single_open(file, psi_cpu_show, NULL);
 }
 
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
@@ -1491,7 +1532,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, res);
+	new = psi_trigger_create(&psi_system, buf, res, file);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);
@@ -1571,7 +1612,7 @@ static int psi_irq_show(struct seq_file *m, void *v)
 
 static int psi_irq_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_irq_show);
+	return single_open(file, psi_irq_show, NULL);
 }
 
 static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
-- 
cgit v1.2.3


From b2d110cd5d28ded813f060b735cd68bfe2e5010f Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Mon, 6 Mar 2023 08:56:51 +0100
Subject: interconnect: drop unused icc_link_destroy() interface

Now that the link array is deallocated when destroying nodes and the
explicit link removal has been dropped from the exynos driver there are
no further users of and no need for the icc_link_destroy() interface.

Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20230306075651.2449-24-johan+linaro@kernel.org
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 drivers/interconnect/core.c           | 46 -----------------------------------
 include/linux/interconnect-provider.h |  6 -----
 2 files changed, 52 deletions(-)

(limited to 'include')

diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
index da456329c524..0edf85ba055e 100644
--- a/drivers/interconnect/core.c
+++ b/drivers/interconnect/core.c
@@ -910,52 +910,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(icc_link_create);
 
-/**
- * icc_link_destroy() - destroy a link between two nodes
- * @src: pointer to source node
- * @dst: pointer to destination node
- *
- * Return: 0 on success, or an error code otherwise
- */
-int icc_link_destroy(struct icc_node *src, struct icc_node *dst)
-{
-	struct icc_node **new;
-	size_t slot;
-	int ret = 0;
-
-	if (IS_ERR_OR_NULL(src))
-		return -EINVAL;
-
-	if (IS_ERR_OR_NULL(dst))
-		return -EINVAL;
-
-	mutex_lock(&icc_lock);
-
-	for (slot = 0; slot < src->num_links; slot++)
-		if (src->links[slot] == dst)
-			break;
-
-	if (WARN_ON(slot == src->num_links)) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	src->links[slot] = src->links[--src->num_links];
-
-	new = krealloc(src->links, src->num_links * sizeof(*src->links),
-		       GFP_KERNEL);
-	if (new)
-		src->links = new;
-	else
-		ret = -ENOMEM;
-
-out:
-	mutex_unlock(&icc_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(icc_link_destroy);
-
 /**
  * icc_node_add() - add interconnect node to interconnect provider
  * @node: pointer to the interconnect node
diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h
index b9af9016a95e..e6d8aca6886d 100644
--- a/include/linux/interconnect-provider.h
+++ b/include/linux/interconnect-provider.h
@@ -118,7 +118,6 @@ int icc_std_aggregate(struct icc_node *node, u32 tag, u32 avg_bw,
 struct icc_node *icc_node_create(int id);
 void icc_node_destroy(int id);
 int icc_link_create(struct icc_node *node, const int dst_id);
-int icc_link_destroy(struct icc_node *src, struct icc_node *dst);
 void icc_node_add(struct icc_node *node, struct icc_provider *provider);
 void icc_node_del(struct icc_node *node);
 int icc_nodes_remove(struct icc_provider *provider);
@@ -150,11 +149,6 @@ static inline int icc_link_create(struct icc_node *node, const int dst_id)
 	return -ENOTSUPP;
 }
 
-static inline int icc_link_destroy(struct icc_node *src, struct icc_node *dst)
-{
-	return -ENOTSUPP;
-}
-
 static inline void icc_node_add(struct icc_node *node, struct icc_provider *provider)
 {
 }
-- 
cgit v1.2.3


From e65733b5c59a1ea20324a03494364958bef3fc68 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Tue, 4 Apr 2023 15:40:38 +0000
Subject: KVM: x86: Redefine 'longmode' as a flag for KVM_EXIT_HYPERCALL

The 'longmode' field is a bit annoying as it blows an entire __u32 to
represent a boolean value. Since other architectures are looking to add
support for KVM_EXIT_HYPERCALL, now is probably a good time to clean it
up.

Redefine the field (and the remaining padding) as a set of flags.
Preserve the existing ABI by using bit 0 to indicate if the guest was in
long mode and requiring that the remaining 31 bits must be zero.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230404154050.2270077-2-oliver.upton@linux.dev
---
 Documentation/virt/kvm/api.rst  | 3 +--
 arch/x86/include/asm/kvm_host.h | 7 +++++++
 arch/x86/include/uapi/asm/kvm.h | 3 +++
 arch/x86/kvm/x86.c              | 6 +++++-
 include/uapi/linux/kvm.h        | 9 +++++++--
 5 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 62de0768d6aa..9b01e3d0e757 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6218,8 +6218,7 @@ to the byte array.
 			__u64 nr;
 			__u64 args[6];
 			__u64 ret;
-			__u32 longmode;
-			__u32 pad;
+			__u64 flags;
 		} hypercall;
 
 Unused.  This was once used for 'hypercall to userspace'.  To implement
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 808c292ad3f4..15bda40517ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2204,4 +2204,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 
+/*
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
+ */
+#define KVM_EXIT_HYPERCALL_MBZ		GENMASK_ULL(31, 1)
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 7f467fe05d42..1a6a1f987949 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter {
 #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
 #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
 
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE	BIT(0)
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7713420abab0..27a1d5c1a018 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9803,7 +9803,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		vcpu->run->hypercall.args[0]  = gpa;
 		vcpu->run->hypercall.args[1]  = npages;
 		vcpu->run->hypercall.args[2]  = attrs;
-		vcpu->run->hypercall.longmode = op_64_bit;
+		vcpu->run->hypercall.flags    = 0;
+		if (op_64_bit)
+			vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+		WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
 		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 		return 0;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d77aef872a0a..dd42d7dfb86c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -341,8 +341,13 @@ struct kvm_run {
 			__u64 nr;
 			__u64 args[6];
 			__u64 ret;
-			__u32 longmode;
-			__u32 pad;
+
+			union {
+#ifndef __KERNEL__
+				__u32 longmode;
+#endif
+				__u64 flags;
+			};
 		} hypercall;
 		/* KVM_EXIT_TPR_ACCESS */
 		struct {
-- 
cgit v1.2.3


From aac94968126beb9846c12a940f1302ece7849b4f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Tue, 4 Apr 2023 15:40:41 +0000
Subject: KVM: arm64: Rename SMC/HVC call handler to reflect reality

KVM handles SMCCC calls from virtual EL2 that use the SMC instruction
since commit bd36b1a9eb5a ("KVM: arm64: nv: Handle SMCs taken from
virtual EL2"). Thus, the function name of the handler no longer reflects
reality.

Normalize the name on SMCCC, since that's the only hypercall interface
KVM supports in the first place. No fuctional change intended.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230404154050.2270077-5-oliver.upton@linux.dev
---
 arch/arm64/kvm/handle_exit.c | 4 ++--
 arch/arm64/kvm/hypercalls.c  | 2 +-
 include/kvm/arm_hypercalls.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index a798c0b4d717..5e4f9737cbd5 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -52,7 +52,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	ret = kvm_hvc_call_handler(vcpu);
+	ret = kvm_smccc_call_handler(vcpu);
 	if (ret < 0) {
 		vcpu_set_reg(vcpu, 0, ~0UL);
 		return 1;
@@ -89,7 +89,7 @@ static int handle_smc(struct kvm_vcpu *vcpu)
 	 * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
 	 * being treated as UNDEFINED.
 	 */
-	ret = kvm_hvc_call_handler(vcpu);
+	ret = kvm_smccc_call_handler(vcpu);
 	if (ret < 0)
 		vcpu_set_reg(vcpu, 0, ~0UL);
 
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index a09a526a7d7c..5ead6c6afff0 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -121,7 +121,7 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
 	}
 }
 
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
 {
 	struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
 	u32 func_id = smccc_get_function(vcpu);
diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h
index 1188f116cf4e..8f4e33bc43e8 100644
--- a/include/kvm/arm_hypercalls.h
+++ b/include/kvm/arm_hypercalls.h
@@ -6,7 +6,7 @@
 
 #include <asm/kvm_emulate.h>
 
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
+int kvm_smccc_call_handler(struct kvm_vcpu *vcpu);
 
 static inline u32 smccc_get_function(struct kvm_vcpu *vcpu)
 {
-- 
cgit v1.2.3


From fb88707dd39bd1d5ec4a058776de9ee99bcc7b72 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Tue, 4 Apr 2023 15:40:44 +0000
Subject: KVM: arm64: Use a maple tree to represent the SMCCC filter

Maple tree is an efficient B-tree implementation that is intended for
storing non-overlapping intervals. Such a data structure is a good fit
for the SMCCC filter as it is desirable to sparsely allocate the 32 bit
function ID space.

To that end, add a maple tree to kvm_arch and correctly init/teardown
along with the VM. Wire in a test against the hypercall filter for HVCs
which does nothing until the controls are exposed to userspace.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230404154050.2270077-8-oliver.upton@linux.dev
---
 arch/arm64/include/asm/kvm_host.h |  5 +++-
 arch/arm64/kvm/arm.c              |  2 ++
 arch/arm64/kvm/hypercalls.c       | 57 +++++++++++++++++++++++++++++++++++++++
 include/kvm/arm_hypercalls.h      |  1 +
 4 files changed, 64 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d091d1c9890b..2682b3fd0881 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/jump_label.h>
 #include <linux/kvm_types.h>
+#include <linux/maple_tree.h>
 #include <linux/percpu.h>
 #include <linux/psci.h>
 #include <asm/arch_gicv3.h>
@@ -221,7 +222,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_EL1_32BIT				4
 	/* PSCI SYSTEM_SUSPEND enabled for the guest */
 #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
-
+	/* SMCCC filter initialized for the VM */
+#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED		6
 	unsigned long flags;
 
 	/*
@@ -242,6 +244,7 @@ struct kvm_arch {
 
 	/* Hypercall features firmware registers' descriptor */
 	struct kvm_smccc_features smccc_feat;
+	struct maple_tree smccc_filter;
 
 	/*
 	 * For an untrusted host VM, 'pkvm.handle' is used to lookup
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b6e26c0e65e5..1202ac03bee0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -192,6 +192,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_destroy_vcpus(kvm);
 
 	kvm_unshare_hyp(kvm, kvm + 1);
+
+	kvm_arm_teardown_hypercalls(kvm);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 0be974e2f1fc..ba7cd84c6668 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -121,8 +121,58 @@ static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id)
 	}
 }
 
+#define SMCCC_ARCH_RANGE_BEGIN	ARM_SMCCC_VERSION_FUNC_ID
+#define SMCCC_ARCH_RANGE_END				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,		\
+			   ARM_SMCCC_SMC_32,		\
+			   0, ARM_SMCCC_FUNC_MASK)
+
+static void init_smccc_filter(struct kvm *kvm)
+{
+	int r;
+
+	mt_init(&kvm->arch.smccc_filter);
+
+	/*
+	 * Prevent userspace from handling any SMCCC calls in the architecture
+	 * range, avoiding the risk of misrepresenting Spectre mitigation status
+	 * to the guest.
+	 */
+	r = mtree_insert_range(&kvm->arch.smccc_filter,
+			       SMCCC_ARCH_RANGE_BEGIN, SMCCC_ARCH_RANGE_END,
+			       xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
+			       GFP_KERNEL_ACCOUNT);
+	WARN_ON_ONCE(r);
+}
+
+static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id)
+{
+	unsigned long idx = func_id;
+	void *val;
+
+	if (!test_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags))
+		return KVM_SMCCC_FILTER_HANDLE;
+
+	/*
+	 * But where's the error handling, you say?
+	 *
+	 * mt_find() returns NULL if no entry was found, which just so happens
+	 * to match KVM_SMCCC_FILTER_HANDLE.
+	 */
+	val = mt_find(&kvm->arch.smccc_filter, &idx, idx);
+	return xa_to_value(val);
+}
+
 static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id)
 {
+	/*
+	 * Intervening actions in the SMCCC filter take precedence over the
+	 * pseudo-firmware register bitmaps.
+	 */
+	u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id);
+	if (action != KVM_SMCCC_FILTER_HANDLE)
+		return action;
+
 	if (kvm_smccc_test_fw_bmap(vcpu, func_id) ||
 	    kvm_smccc_default_allowed(func_id))
 		return KVM_SMCCC_FILTER_HANDLE;
@@ -263,6 +313,13 @@ void kvm_arm_init_hypercalls(struct kvm *kvm)
 	smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
 	smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
 	smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
+
+	init_smccc_filter(kvm);
+}
+
+void kvm_arm_teardown_hypercalls(struct kvm *kvm)
+{
+	mtree_destroy(&kvm->arch.smccc_filter);
 }
 
 int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h
index 8f4e33bc43e8..fe6c31575b05 100644
--- a/include/kvm/arm_hypercalls.h
+++ b/include/kvm/arm_hypercalls.h
@@ -43,6 +43,7 @@ static inline void smccc_set_retval(struct kvm_vcpu *vcpu,
 struct kvm_one_reg;
 
 void kvm_arm_init_hypercalls(struct kvm *kvm);
+void kvm_arm_teardown_hypercalls(struct kvm *kvm);
 int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
 int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
-- 
cgit v1.2.3


From 821d935c87bc95253f82deec3cbb457ccf3de003 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Tue, 4 Apr 2023 15:40:46 +0000
Subject: KVM: arm64: Introduce support for userspace SMCCC filtering

As the SMCCC (and related specifications) march towards an 'everything
and the kitchen sink' interface for interacting with a system it becomes
less likely that KVM will support every related feature. We could do
better by letting userspace have a crack at it instead.

Allow userspace to define an 'SMCCC filter' that applies to both HVCs
and SMCs initiated by the guest. Supporting both conduits with this
interface is important for a couple of reasons. Guest SMC usage is table
stakes for a nested guest, as HVCs are always taken to the virtual EL2.
Additionally, guests may want to interact with a service on the secure
side which can now be proxied by userspace.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230404154050.2270077-10-oliver.upton@linux.dev
---
 Documentation/virt/kvm/api.rst        |  4 ++
 Documentation/virt/kvm/devices/vm.rst | 79 +++++++++++++++++++++++++++++++++++
 arch/arm64/include/uapi/asm/kvm.h     | 11 +++++
 arch/arm64/kvm/arm.c                  |  4 ++
 arch/arm64/kvm/hypercalls.c           | 60 ++++++++++++++++++++++++++
 include/kvm/arm_hypercalls.h          |  3 ++
 6 files changed, 161 insertions(+)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 9497792c4ee5..c8ab2f730945 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6231,6 +6231,10 @@ requires a guest to interact with host userpace.
 For arm64:
 ----------
 
+SMCCC exits can be enabled depending on the configuration of the SMCCC
+filter. See the Documentation/virt/kvm/devices/vm.rst
+``KVM_ARM_SMCCC_FILTER`` for more details.
+
 ``nr`` contains the function ID of the guest's SMCCC call. Userspace is
 expected to use the ``KVM_GET_ONE_REG`` ioctl to retrieve the call
 parameters from the vCPU's GPRs.
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 147efec626e5..9d726e60ec47 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -321,3 +321,82 @@ Allows userspace to query the status of migration mode.
 	     if it is enabled
 :Returns:   -EFAULT if the given address is not accessible from kernel space;
 	    0 in case of success.
+
+6. GROUP: KVM_ARM_VM_SMCCC_CTRL
+===============================
+
+:Architectures: arm64
+
+6.1. ATTRIBUTE: KVM_ARM_VM_SMCCC_FILTER (w/o)
+---------------------------------------------
+
+:Parameters: Pointer to a ``struct kvm_smccc_filter``
+
+:Returns:
+
+        ======  ===========================================
+        EEXIST  Range intersects with a previously inserted
+                or reserved range
+        EBUSY   A vCPU in the VM has already run
+        EINVAL  Invalid filter configuration
+        ENOMEM  Failed to allocate memory for the in-kernel
+                representation of the SMCCC filter
+        ======  ===========================================
+
+Requests the installation of an SMCCC call filter described as follows::
+
+    enum kvm_smccc_filter_action {
+            KVM_SMCCC_FILTER_HANDLE = 0,
+            KVM_SMCCC_FILTER_DENY,
+            KVM_SMCCC_FILTER_FWD_TO_USER,
+    };
+
+    struct kvm_smccc_filter {
+            __u32 base;
+            __u32 nr_functions;
+            __u8 action;
+            __u8 pad[15];
+    };
+
+The filter is defined as a set of non-overlapping ranges. Each
+range defines an action to be applied to SMCCC calls within the range.
+Userspace can insert multiple ranges into the filter by using
+successive calls to this attribute.
+
+The default configuration of KVM is such that all implemented SMCCC
+calls are allowed. Thus, the SMCCC filter can be defined sparsely
+by userspace, only describing ranges that modify the default behavior.
+
+The range expressed by ``struct kvm_smccc_filter`` is
+[``base``, ``base + nr_functions``). The range is not allowed to wrap,
+i.e. userspace cannot rely on ``base + nr_functions`` overflowing.
+
+The SMCCC filter applies to both SMC and HVC calls initiated by the
+guest. The SMCCC filter gates the in-kernel emulation of SMCCC calls
+and as such takes effect before other interfaces that interact with
+SMCCC calls (e.g. hypercall bitmap registers).
+
+Actions:
+
+ - ``KVM_SMCCC_FILTER_HANDLE``: Allows the guest SMCCC call to be
+   handled in-kernel. It is strongly recommended that userspace *not*
+   explicitly describe the allowed SMCCC call ranges.
+
+ - ``KVM_SMCCC_FILTER_DENY``: Rejects the guest SMCCC call in-kernel
+   and returns to the guest.
+
+ - ``KVM_SMCCC_FILTER_FWD_TO_USER``: The guest SMCCC call is forwarded
+   to userspace with an exit reason of ``KVM_EXIT_HYPERCALL``.
+
+The ``pad`` field is reserved for future use and must be zero. KVM may
+return ``-EINVAL`` if the field is nonzero.
+
+KVM reserves the 'Arm Architecture Calls' range of function IDs and
+will reject attempts to define a filter for any portion of these ranges:
+
+        =========== ===============
+        Start       End (inclusive)
+        =========== ===============
+        0x8000_0000 0x8000_FFFF
+        0xC000_0000 0xC000_FFFF
+        =========== ===============
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f86446c5a7e3..3dcfa4bfdf83 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -372,6 +372,10 @@ enum {
 #endif
 };
 
+/* Device Control API on vm fd */
+#define KVM_ARM_VM_SMCCC_CTRL		0
+#define   KVM_ARM_VM_SMCCC_FILTER	0
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
@@ -479,6 +483,13 @@ enum kvm_smccc_filter_action {
 #endif
 };
 
+struct kvm_smccc_filter {
+	__u32 base;
+	__u32 nr_functions;
+	__u8 action;
+	__u8 pad[15];
+};
+
 /* arm64-specific KVM_EXIT_HYPERCALL flags */
 #define KVM_HYPERCALL_EXIT_SMC	(1U << 0)
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1202ac03bee0..efee032c9560 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1444,6 +1444,8 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
+	case KVM_ARM_VM_SMCCC_CTRL:
+		return kvm_vm_smccc_has_attr(kvm, attr);
 	default:
 		return -ENXIO;
 	}
@@ -1452,6 +1454,8 @@ static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
+	case KVM_ARM_VM_SMCCC_CTRL:
+		return kvm_vm_smccc_set_attr(kvm, attr);
 	default:
 		return -ENXIO;
 	}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 2db53709bec1..9a35d6d18193 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -145,6 +145,44 @@ static void init_smccc_filter(struct kvm *kvm)
 	WARN_ON_ONCE(r);
 }
 
+static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr)
+{
+	const void *zero_page = page_to_virt(ZERO_PAGE(0));
+	struct kvm_smccc_filter filter;
+	u32 start, end;
+	int r;
+
+	if (copy_from_user(&filter, uaddr, sizeof(filter)))
+		return -EFAULT;
+
+	if (memcmp(filter.pad, zero_page, sizeof(filter.pad)))
+		return -EINVAL;
+
+	start = filter.base;
+	end = start + filter.nr_functions - 1;
+
+	if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm_vm_has_ran_once(kvm)) {
+		r = -EBUSY;
+		goto out_unlock;
+	}
+
+	r = mtree_insert_range(&kvm->arch.smccc_filter, start, end,
+			       xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT);
+	if (r)
+		goto out_unlock;
+
+	set_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags);
+
+out_unlock:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id)
 {
 	unsigned long idx = func_id;
@@ -569,3 +607,25 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
 	return -EINVAL;
 }
+
+int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VM_SMCCC_FILTER:
+		return 0;
+	default:
+		return -ENXIO;
+	}
+}
+
+int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	void __user *uaddr = (void __user *)attr->addr;
+
+	switch (attr->attr) {
+	case KVM_ARM_VM_SMCCC_FILTER:
+		return kvm_smccc_set_filter(kvm, uaddr);
+	default:
+		return -ENXIO;
+	}
+}
diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h
index fe6c31575b05..2df152207ccd 100644
--- a/include/kvm/arm_hypercalls.h
+++ b/include/kvm/arm_hypercalls.h
@@ -49,4 +49,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
 int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
 int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
 
+int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr);
+int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr);
+
 #endif
-- 
cgit v1.2.3


From 7ddc7f91beb285246e926e3adf0b292b071aea33 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Sun, 2 Apr 2023 22:59:35 +0000
Subject: ASoC: soc.h: clarify Codec2Codec params

snd_soc_dai_link has params/num_params, but it is unclear that
params for what. This patch clarify it is params for Codec2Codec.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87o7o5c2lk.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h  | 12 +++++++++---
 sound/soc/soc-core.c | 11 +++++++++++
 sound/soc/soc-dapm.c | 44 ++++++++++++++++++++++----------------------
 sound/soc/soc-pcm.c  | 10 +++++-----
 4 files changed, 47 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 57c5786a625b..276afdb1f445 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -684,8 +684,14 @@ struct snd_soc_dai_link {
 
 	int id;	/* optional ID for machine driver link identification */
 
-	const struct snd_soc_pcm_stream *params;
-	unsigned int num_params;
+	/*
+	 * for Codec2Codec
+	 */
+	const struct snd_soc_pcm_stream *c2c_params;
+	unsigned int num_c2c_params;
+
+	const struct snd_soc_pcm_stream *params;	/* REMOVE ME */
+	unsigned int num_params;			/* REMOVE ME */
 
 	unsigned int dai_fmt;           /* format to set on init */
 
@@ -1065,7 +1071,7 @@ struct snd_soc_pcm_runtime {
 	struct snd_soc_dai_link *dai_link;
 	struct snd_pcm_ops ops;
 
-	unsigned int params_select; /* currently selected param for dai link */
+	unsigned int c2c_params_select; /* currently selected c2c_param for dai link */
 
 	/* Dynamic PCM BE runtime data */
 	struct snd_soc_dpcm_runtime dpcm[SNDRV_PCM_STREAM_LAST + 1];
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 9bbcff492c1e..04f1bc8a3128 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2294,6 +2294,9 @@ EXPORT_SYMBOL_GPL(snd_soc_add_dai_controls);
  */
 int snd_soc_register_card(struct snd_soc_card *card)
 {
+	struct snd_soc_dai_link *dai_link;
+	int i;
+
 	if (!card->name || !card->dev)
 		return -EINVAL;
 
@@ -2314,6 +2317,14 @@ int snd_soc_register_card(struct snd_soc_card *card)
 	mutex_init(&card->dapm_mutex);
 	mutex_init(&card->pcm_mutex);
 
+	/* REMOVE ME */
+	for_each_card_prelinks(card, i, dai_link) {
+		if (!dai_link->c2c_params) {
+			dai_link->c2c_params	 = dai_link->params;
+			dai_link->num_c2c_params = dai_link->num_params;
+		}
+	}
+
 	return snd_soc_bind_card(card);
 }
 EXPORT_SYMBOL_GPL(snd_soc_register_card);
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 34fdcb7ee079..e7a0c28e0cb1 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1075,7 +1075,7 @@ static int dapm_new_dai_link(struct snd_soc_dapm_widget *w)
 	struct snd_soc_pcm_runtime *rtd = w->priv;
 
 	/* create control for links with > 1 config */
-	if (rtd->dai_link->num_params <= 1)
+	if (rtd->dai_link->num_c2c_params <= 1)
 		return 0;
 
 	/* add kcontrol */
@@ -3864,7 +3864,7 @@ snd_soc_dai_link_event_pre_pmu(struct snd_soc_dapm_widget *w,
 	 * either party on the link to alter the configuration if
 	 * necessary
 	 */
-	config = rtd->dai_link->params + rtd->params_select;
+	config = rtd->dai_link->c2c_params + rtd->c2c_params_select;
 	if (!config) {
 		dev_err(w->dapm->dev, "ASoC: link config missing\n");
 		ret = -EINVAL;
@@ -4010,7 +4010,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol,
 	struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol);
 	struct snd_soc_pcm_runtime *rtd = w->priv;
 
-	ucontrol->value.enumerated.item[0] = rtd->params_select;
+	ucontrol->value.enumerated.item[0] = rtd->c2c_params_select;
 
 	return 0;
 }
@@ -4025,13 +4025,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
 	if (w->power)
 		return -EBUSY;
 
-	if (ucontrol->value.enumerated.item[0] == rtd->params_select)
+	if (ucontrol->value.enumerated.item[0] == rtd->c2c_params_select)
 		return 0;
 
-	if (ucontrol->value.enumerated.item[0] >= rtd->dai_link->num_params)
+	if (ucontrol->value.enumerated.item[0] >= rtd->dai_link->num_c2c_params)
 		return -EINVAL;
 
-	rtd->params_select = ucontrol->value.enumerated.item[0];
+	rtd->c2c_params_select = ucontrol->value.enumerated.item[0];
 
 	return 1;
 }
@@ -4039,7 +4039,7 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
 static void
 snd_soc_dapm_free_kcontrol(struct snd_soc_card *card,
 			unsigned long *private_value,
-			int num_params,
+			int num_c2c_params,
 			const char **w_param_text)
 {
 	int count;
@@ -4049,7 +4049,7 @@ snd_soc_dapm_free_kcontrol(struct snd_soc_card *card,
 	if (!w_param_text)
 		return;
 
-	for (count = 0 ; count < num_params; count++)
+	for (count = 0 ; count < num_c2c_params; count++)
 		devm_kfree(card->dev, (void *)w_param_text[count]);
 	devm_kfree(card->dev, w_param_text);
 }
@@ -4057,8 +4057,8 @@ snd_soc_dapm_free_kcontrol(struct snd_soc_card *card,
 static struct snd_kcontrol_new *
 snd_soc_dapm_alloc_kcontrol(struct snd_soc_card *card,
 			char *link_name,
-			const struct snd_soc_pcm_stream *params,
-			int num_params, const char **w_param_text,
+			const struct snd_soc_pcm_stream *c2c_params,
+			int num_c2c_params, const char **w_param_text,
 			unsigned long *private_value)
 {
 	struct soc_enum w_param_enum[] = {
@@ -4070,10 +4070,10 @@ snd_soc_dapm_alloc_kcontrol(struct snd_soc_card *card,
 			     snd_soc_dapm_dai_link_put),
 	};
 	struct snd_kcontrol_new *kcontrol_news;
-	const struct snd_soc_pcm_stream *config = params;
+	const struct snd_soc_pcm_stream *config = c2c_params;
 	int count;
 
-	for (count = 0 ; count < num_params; count++) {
+	for (count = 0 ; count < num_c2c_params; count++) {
 		if (!config->stream_name) {
 			dev_warn(card->dapm.dev,
 				"ASoC: anonymous config %d for dai link %s\n",
@@ -4093,7 +4093,7 @@ snd_soc_dapm_alloc_kcontrol(struct snd_soc_card *card,
 		config++;
 	}
 
-	w_param_enum[0].items = num_params;
+	w_param_enum[0].items = num_c2c_params;
 	w_param_enum[0].texts = w_param_text;
 
 	*private_value =
@@ -4118,7 +4118,7 @@ snd_soc_dapm_alloc_kcontrol(struct snd_soc_card *card,
 	return kcontrol_news;
 
 outfree_w_param:
-	snd_soc_dapm_free_kcontrol(card, private_value, num_params, w_param_text);
+	snd_soc_dapm_free_kcontrol(card, private_value, num_c2c_params, w_param_text);
 	return NULL;
 }
 
@@ -4146,17 +4146,17 @@ snd_soc_dapm_new_dai(struct snd_soc_card *card,
 	w_param_text	= NULL;
 	kcontrol_news	= NULL;
 	num_kcontrols	= 0;
-	if (rtd->dai_link->num_params > 1) {
+	if (rtd->dai_link->num_c2c_params > 1) {
 		w_param_text = devm_kcalloc(card->dev,
-					    rtd->dai_link->num_params,
+					    rtd->dai_link->num_c2c_params,
 					    sizeof(char *), GFP_KERNEL);
 		if (!w_param_text)
 			goto param_fail;
 
 		num_kcontrols = 1;
 		kcontrol_news = snd_soc_dapm_alloc_kcontrol(card, link_name,
-							    rtd->dai_link->params,
-							    rtd->dai_link->num_params,
+							    rtd->dai_link->c2c_params,
+							    rtd->dai_link->num_c2c_params,
 							    w_param_text, &private_value);
 		if (!kcontrol_news)
 			goto param_fail;
@@ -4187,7 +4187,7 @@ snd_soc_dapm_new_dai(struct snd_soc_card *card,
 outfree_kcontrol_news:
 	devm_kfree(card->dev, (void *)template.kcontrol_news);
 	snd_soc_dapm_free_kcontrol(card, &private_value,
-				   rtd->dai_link->num_params, w_param_text);
+				   rtd->dai_link->num_c2c_params, w_param_text);
 param_fail:
 	devm_kfree(card->dev, link_name);
 name_fail:
@@ -4336,7 +4336,7 @@ static void dapm_connect_dai_pair(struct snd_soc_card *card,
 	struct snd_pcm_str *streams = rtd->pcm->streams;
 	int stream;
 
-	if (dai_link->params) {
+	if (dai_link->c2c_params) {
 		playback_cpu	= snd_soc_dai_get_widget_capture(cpu_dai);
 		capture_cpu	= snd_soc_dai_get_widget_playback(cpu_dai);
 	} else {
@@ -4349,7 +4349,7 @@ static void dapm_connect_dai_pair(struct snd_soc_card *card,
 	codec = snd_soc_dai_get_widget(codec_dai, stream);
 
 	if (playback_cpu && codec) {
-		if (dai_link->params && !rtd->c2c_widget[stream]) {
+		if (dai_link->c2c_params && !rtd->c2c_widget[stream]) {
 			substream = streams[stream].substream;
 			dai = snd_soc_dapm_new_dai(card, substream, "playback");
 			if (IS_ERR(dai))
@@ -4368,7 +4368,7 @@ capture:
 	codec = snd_soc_dai_get_widget(codec_dai, stream);
 
 	if (codec && capture_cpu) {
-		if (dai_link->params && !rtd->c2c_widget[stream]) {
+		if (dai_link->c2c_params && !rtd->c2c_widget[stream]) {
 			substream = streams[stream].substream;
 			dai = snd_soc_dapm_new_dai(card, substream, "capture");
 			if (IS_ERR(dai))
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index b830a53ceacb..913a7d98e742 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -2793,9 +2793,9 @@ static int soc_get_playback_capture(struct snd_soc_pcm_runtime *rtd,
 		struct snd_soc_dai *codec_dai;
 
 		/* Adapt stream for codec2codec links */
-		int cpu_capture = rtd->dai_link->params ?
+		int cpu_capture = rtd->dai_link->c2c_params ?
 			SNDRV_PCM_STREAM_PLAYBACK : SNDRV_PCM_STREAM_CAPTURE;
-		int cpu_playback = rtd->dai_link->params ?
+		int cpu_playback = rtd->dai_link->c2c_params ?
 			SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK;
 
 		for_each_rtd_codec_dais(rtd, i, codec_dai) {
@@ -2839,7 +2839,7 @@ static int soc_create_pcm(struct snd_pcm **pcm,
 	int ret;
 
 	/* create the PCM */
-	if (rtd->dai_link->params) {
+	if (rtd->dai_link->c2c_params) {
 		snprintf(new_name, sizeof(new_name), "codec2codec(%s)",
 			 rtd->dai_link->stream_name);
 
@@ -2896,7 +2896,7 @@ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num)
 	 * don't interface with the outside world or application layer
 	 * we don't have to do any special handling on close.
 	 */
-	if (!rtd->dai_link->params)
+	if (!rtd->dai_link->c2c_params)
 		rtd->close_delayed_work_func = snd_soc_close_delayed_work;
 
 	rtd->pcm = pcm;
@@ -2904,7 +2904,7 @@ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num)
 	pcm->private_data = rtd;
 	pcm->no_device_suspend = true;
 
-	if (rtd->dai_link->no_pcm || rtd->dai_link->params) {
+	if (rtd->dai_link->no_pcm || rtd->dai_link->c2c_params) {
 		if (playback)
 			pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream->private_data = rtd;
 		if (capture)
-- 
cgit v1.2.3


From 1ea63f29c27712d6b9c45af67cd71299d849c5e3 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Sun, 2 Apr 2023 23:00:17 +0000
Subject: ASoC: soc.h: remove unused params/num_params

No drivers are using params/num_params any more.
Let's remove these.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87iledc2ke.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h  |  3 ---
 sound/soc/soc-core.c | 11 -----------
 2 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 276afdb1f445..3833184c187f 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -690,9 +690,6 @@ struct snd_soc_dai_link {
 	const struct snd_soc_pcm_stream *c2c_params;
 	unsigned int num_c2c_params;
 
-	const struct snd_soc_pcm_stream *params;	/* REMOVE ME */
-	unsigned int num_params;			/* REMOVE ME */
-
 	unsigned int dai_fmt;           /* format to set on init */
 
 	enum snd_soc_dpcm_trigger trigger[2]; /* trigger type for DPCM */
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 04f1bc8a3128..9bbcff492c1e 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2294,9 +2294,6 @@ EXPORT_SYMBOL_GPL(snd_soc_add_dai_controls);
  */
 int snd_soc_register_card(struct snd_soc_card *card)
 {
-	struct snd_soc_dai_link *dai_link;
-	int i;
-
 	if (!card->name || !card->dev)
 		return -EINVAL;
 
@@ -2317,14 +2314,6 @@ int snd_soc_register_card(struct snd_soc_card *card)
 	mutex_init(&card->dapm_mutex);
 	mutex_init(&card->pcm_mutex);
 
-	/* REMOVE ME */
-	for_each_card_prelinks(card, i, dai_link) {
-		if (!dai_link->c2c_params) {
-			dai_link->c2c_params	 = dai_link->params;
-			dai_link->num_c2c_params = dai_link->num_params;
-		}
-	}
-
 	return snd_soc_bind_card(card);
 }
 EXPORT_SYMBOL_GPL(snd_soc_register_card);
-- 
cgit v1.2.3


From 4c4dd04e75e8177311d17387326253674cb0558b Mon Sep 17 00:00:00 2001
From: Ondrej Kozina <okozina@redhat.com>
Date: Wed, 5 Apr 2023 13:12:23 +0200
Subject: sed-opal: Add command to read locking range parameters.

It returns following attributes:

locking range start
locking range length
read lock enabled
write lock enabled
lock state (RW, RO or LK)

It can be retrieved by user authority provided the authority
was added to locking range via prior IOC_OPAL_ADD_USR_TO_LR
ioctl command. The command was extended to add user in ACE that
allows to read attributes listed above.

Signed-off-by: Ondrej Kozina <okozina@redhat.com>
Tested-by: Luca Boccassi <bluca@debian.org>
Tested-by: Milan Broz <gmazyland@gmail.com>
Link: https://lore.kernel.org/r/20230405111223.272816-6-okozina@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 153 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/sed-opal.h      |   1 +
 include/uapi/linux/sed-opal.h |  11 +++
 3 files changed, 165 insertions(+)

(limited to 'include')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index b95560d9c5eb..3fc4e65db111 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1445,6 +1445,129 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
+static int response_get_column(const struct parsed_resp *resp,
+			       int *iter,
+			       u8 column,
+			       u64 *value)
+{
+	const struct opal_resp_tok *tok;
+	int n = *iter;
+	u64 val;
+
+	tok = response_get_token(resp, n);
+	if (IS_ERR(tok))
+		return PTR_ERR(tok);
+
+	if (!response_token_matches(tok, OPAL_STARTNAME)) {
+		pr_debug("Unexpected response token type %d.\n", n);
+		return OPAL_INVAL_PARAM;
+	}
+	n++;
+
+	if (response_get_u64(resp, n) != column) {
+		pr_debug("Token %d does not match expected column %u.\n",
+			 n, column);
+		return OPAL_INVAL_PARAM;
+	}
+	n++;
+
+	val = response_get_u64(resp, n);
+	n++;
+
+	tok = response_get_token(resp, n);
+	if (IS_ERR(tok))
+		return PTR_ERR(tok);
+
+	if (!response_token_matches(tok, OPAL_ENDNAME)) {
+		pr_debug("Unexpected response token type %d.\n", n);
+		return OPAL_INVAL_PARAM;
+	}
+	n++;
+
+	*value = val;
+	*iter = n;
+
+	return 0;
+}
+
+static int locking_range_status(struct opal_dev *dev, void *data)
+{
+	u8 lr_buffer[OPAL_UID_LENGTH];
+	u64 resp;
+	bool rlocked, wlocked;
+	int err, tok_n = 2;
+	struct opal_lr_status *lrst = data;
+
+	err = build_locking_range(lr_buffer, sizeof(lr_buffer),
+				  lrst->session.opal_key.lr);
+	if (err)
+		return err;
+
+	err = generic_get_columns(dev, lr_buffer, OPAL_RANGESTART,
+				  OPAL_WRITELOCKED);
+	if (err) {
+		pr_debug("Couldn't get lr %u table columns %d to %d.\n",
+			 lrst->session.opal_key.lr, OPAL_RANGESTART,
+			 OPAL_WRITELOCKED);
+		return err;
+	}
+
+	/* range start */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGESTART,
+				  &lrst->range_start);
+	if (err)
+		return err;
+
+	/* range length */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGELENGTH,
+				  &lrst->range_length);
+	if (err)
+		return err;
+
+	/* RLE */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKENABLED,
+				  &resp);
+	if (err)
+		return err;
+
+	lrst->RLE = !!resp;
+
+	/* WLE */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKENABLED,
+				  &resp);
+	if (err)
+		return err;
+
+	lrst->WLE = !!resp;
+
+	/* read locked */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKED, &resp);
+	if (err)
+		return err;
+
+	rlocked = !!resp;
+
+	/* write locked */
+	err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKED, &resp);
+	if (err)
+		return err;
+
+	wlocked = !!resp;
+
+	/* opal_lock_state can not map 'read locked' only state. */
+	lrst->l_state = OPAL_RW;
+	if (rlocked && wlocked)
+		lrst->l_state = OPAL_LK;
+	else if (wlocked)
+		lrst->l_state = OPAL_RO;
+	else if (rlocked) {
+		pr_debug("Can not report read locked only state.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int start_generic_opal_session(struct opal_dev *dev,
 				      enum opal_uid auth,
 				      enum opal_uid sp_type,
@@ -2642,6 +2765,33 @@ static int opal_setup_locking_range(struct opal_dev *dev,
 	return ret;
 }
 
+static int opal_locking_range_status(struct opal_dev *dev,
+			  struct opal_lr_status *opal_lrst,
+			  void __user *data)
+{
+	const struct opal_step lr_steps[] = {
+		{ start_auth_opal_session, &opal_lrst->session },
+		{ locking_range_status, opal_lrst },
+		{ end_opal_session, }
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev);
+	ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
+	mutex_unlock(&dev->dev_lock);
+
+	/* skip session info when copying back to uspace */
+	if (!ret && copy_to_user(data + offsetof(struct opal_lr_status, range_start),
+				(void *)opal_lrst + offsetof(struct opal_lr_status, range_start),
+				sizeof(*opal_lrst) - offsetof(struct opal_lr_status, range_start))) {
+		pr_debug("Error copying status to userspace\n");
+		return -EFAULT;
+	}
+
+	return ret;
+}
+
 static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 {
 	const struct opal_step pw_steps[] = {
@@ -2876,6 +3026,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_GET_STATUS:
 		ret = opal_get_status(dev, arg);
 		break;
+	case IOC_OPAL_GET_LR_STATUS:
+		ret = opal_locking_range_status(dev, p, arg);
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 31ac562a17d7..042c1e2cb0ce 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -45,6 +45,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_WRITE_SHADOW_MBR:
 	case IOC_OPAL_GENERIC_TABLE_RW:
 	case IOC_OPAL_GET_STATUS:
+	case IOC_OPAL_GET_LR_STATUS:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index d7a1524023db..3905c8ffedbf 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -78,6 +78,16 @@ struct opal_user_lr_setup {
 	struct opal_session_info session;
 };
 
+struct opal_lr_status {
+	struct opal_session_info session;
+	__u64 range_start;
+	__u64 range_length;
+	__u32 RLE; /* Read Lock enabled */
+	__u32 WLE; /* Write Lock Enabled */
+	__u32 l_state;
+	__u8  align[4];
+};
+
 struct opal_lock_unlock {
 	struct opal_session_info session;
 	__u32 l_state;
@@ -168,5 +178,6 @@ struct opal_status {
 #define IOC_OPAL_WRITE_SHADOW_MBR   _IOW('p', 234, struct opal_shadow_mbr)
 #define IOC_OPAL_GENERIC_TABLE_RW   _IOW('p', 235, struct opal_read_write_table)
 #define IOC_OPAL_GET_STATUS         _IOR('p', 236, struct opal_status)
+#define IOC_OPAL_GET_LR_STATUS      _IOW('p', 237, struct opal_lr_status)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From e15a19306004b3d7b6a5fe269e4e7cb7934aa3fe Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Wed, 4 Jan 2023 12:29:01 -0800
Subject: srcu: Add comments for srcu_size_state

The SRCU_SIZE_* names are not self-explanatory, so this commit therefore
adds comments to the definitions.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Zhang, Qiang1" <qiang1.zhang@intel.com>
To: rcu@vger.kernel.org
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/srcutree.h | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 558057b517b7..a6910805f9c5 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -92,16 +92,29 @@ struct srcu_struct {
 	struct lockdep_map dep_map;
 };
 
-/* Values for size state variable (->srcu_size_state). */
-#define SRCU_SIZE_SMALL		0
-#define SRCU_SIZE_ALLOC		1
-#define SRCU_SIZE_WAIT_BARRIER	2
-#define SRCU_SIZE_WAIT_CALL	3
-#define SRCU_SIZE_WAIT_CBS1	4
-#define SRCU_SIZE_WAIT_CBS2	5
-#define SRCU_SIZE_WAIT_CBS3	6
-#define SRCU_SIZE_WAIT_CBS4	7
-#define SRCU_SIZE_BIG		8
+// Values for size state variable (->srcu_size_state).  Once the state
+// has been set to SRCU_SIZE_ALLOC, the grace-period code advances through
+// this state machine one step per grace period until the SRCU_SIZE_BIG state
+// is reached.  Otherwise, the state machine remains in the SRCU_SIZE_SMALL
+// state indefinitely.
+#define SRCU_SIZE_SMALL		0	// No srcu_node combining tree, ->node == NULL
+#define SRCU_SIZE_ALLOC		1	// An srcu_node tree is being allocated, initialized,
+					//  and then referenced by ->node.  It will not be used.
+#define SRCU_SIZE_WAIT_BARRIER	2	// The srcu_node tree starts being used by everything
+					//  except call_srcu(), especially by srcu_barrier().
+					//  By the end of this state, all CPUs and threads
+					//  are aware of this tree's existence.
+#define SRCU_SIZE_WAIT_CALL	3	// The srcu_node tree starts being used by call_srcu().
+					//  By the end of this state, all of the call_srcu()
+					//  invocations that were running on a non-boot CPU
+					//  and using the boot CPU's callback queue will have
+					//  completed.
+#define SRCU_SIZE_WAIT_CBS1	4	// Don't trust the ->srcu_have_cbs[] grace-period
+#define SRCU_SIZE_WAIT_CBS2	5	//  sequence elements or the ->srcu_data_have_cbs[]
+#define SRCU_SIZE_WAIT_CBS3	6	//  CPU-bitmask elements until all four elements of
+#define SRCU_SIZE_WAIT_CBS4	7	//  each array have been initialized.
+#define SRCU_SIZE_BIG		8	// The srcu_node combining tree is fully initialized
+					//  and all aspects of it are being put to use.
 
 /* Values for state variable (bottom bits of ->srcu_gp_seq). */
 #define SRCU_STATE_IDLE		0
-- 
cgit v1.2.3


From 58d7668242647e661a20efe065519abd6454287e Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 24 Jan 2023 17:31:26 +0000
Subject: tick/nohz: Fix cpu_is_hotpluggable() by checking with nohz subsystem

For CONFIG_NO_HZ_FULL systems, the tick_do_timer_cpu cannot be offlined.
However, cpu_is_hotpluggable() still returns true for those CPUs. This causes
torture tests that do offlining to end up trying to offline this CPU causing
test failures. Such failure happens on all architectures.

Fix the repeated error messages thrown by this (even if the hotplug errors are
harmless) by asking the opinion of the nohz subsystem on whether the CPU can be
hotplugged.

[ Apply Frederic Weisbecker feedback on refactoring tick_nohz_cpu_down(). ]

For drivers/base/ portion:
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Zhouyi Zhou <zhouzhouyi@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: rcu <rcu@vger.kernel.org>
Cc: stable@vger.kernel.org
Fixes: 2987557f52b9 ("driver-core/cpu: Expose hotpluggability to the rest of the kernel")
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 drivers/base/cpu.c       |  3 ++-
 include/linux/tick.h     |  2 ++
 kernel/time/tick-sched.c | 11 ++++++++---
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 182c6122f815..c1815b9dae68 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -487,7 +487,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = {
 bool cpu_is_hotpluggable(unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
-	return dev && container_of(dev, struct cpu, dev)->hotpluggable;
+	return dev && container_of(dev, struct cpu, dev)->hotpluggable
+		&& tick_nohz_cpu_hotpluggable(cpu);
 }
 EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index bfd571f18cfd..9459fef5b857 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -216,6 +216,7 @@ extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
 				     enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
 				       enum tick_dep_bits bit);
+extern bool tick_nohz_cpu_hotpluggable(unsigned int cpu);
 
 /*
  * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases
@@ -280,6 +281,7 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
 
 static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }
 static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { return true; }
 
 static inline void tick_dep_set(enum tick_dep_bits bit) { }
 static inline void tick_dep_clear(enum tick_dep_bits bit) { }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..68d81a4283c8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -527,7 +527,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 	tick_nohz_full_running = true;
 }
 
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
 {
 	/*
 	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
@@ -535,8 +535,13 @@ static int tick_nohz_cpu_down(unsigned int cpu)
 	 * CPUs. It must remain online when nohz full is enabled.
 	 */
 	if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
-		return -EBUSY;
-	return 0;
+		return false;
+	return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+	return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
 }
 
 void __init tick_nohz_init(void)
-- 
cgit v1.2.3


From 16d78e8cda8b4447812f375480c8cb780f396c1e Mon Sep 17 00:00:00 2001
From: Xu Panda <xu.panda@zte.com.cn>
Date: Thu, 5 Jan 2023 20:17:57 +0800
Subject: rcu/trace: use strscpy() to instead of strncpy()

This commit saves a line of code by switching from strncpy() to strscpy()
by permitting the later NUL assignment to be removed.  While in the area,
save another line by taking advantage of 100 characters.

Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Xu Panda <xu.panda@zte.com.cn>
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/trace/events/rcu.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 90b2fb0292cb..c19ac1fa8a60 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -776,9 +776,7 @@ TRACE_EVENT_RCU(rcu_torture_read,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->rcutorturename, rcutorturename,
-			RCUTORTURENAME_LEN);
-		__entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0;
+		strscpy(__entry->rcutorturename, rcutorturename, RCUTORTURENAME_LEN);
 		__entry->rhp = rhp;
 		__entry->secs = secs;
 		__entry->c_old = c_old;
-- 
cgit v1.2.3


From db7b464df9d820186e98a65aa6a10f0d51fbf8ce Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Tue, 20 Dec 2022 22:16:25 +0800
Subject: rcu: Fix missing TICK_DEP_MASK_RCU_EXP dependency check

This commit adds checks for the TICK_DEP_MASK_RCU_EXP bit, thus enabling
RCU expedited grace periods to actually force-enable scheduling-clock
interrupts on holdout CPUs.

Fixes: df1e849ae455 ("rcu: Enable tick for nohz_full CPUs slow to provide expedited QS")
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Anna-Maria Behnsen <anna-maria@linutronix.de>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/trace/events/timer.h | 3 ++-
 kernel/time/tick-sched.c     | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 2e713a7d9aa3..3e8619c72f77 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -371,7 +371,8 @@ TRACE_EVENT(itimer_expire,
 		tick_dep_name(PERF_EVENTS)		\
 		tick_dep_name(SCHED)			\
 		tick_dep_name(CLOCK_UNSTABLE)		\
-		tick_dep_name_end(RCU)
+		tick_dep_name(RCU)			\
+		tick_dep_name_end(RCU_EXP)
 
 #undef tick_dep_name
 #undef tick_dep_mask_name
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 68d81a4283c8..a46506f7ec6d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep)
 		return true;
 	}
 
+	if (val & TICK_DEP_MASK_RCU_EXP) {
+		trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+		return true;
+	}
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 7fce1e39f01900a294cd2c456c77f3e2512e0634 Mon Sep 17 00:00:00 2001
From: Emil Renner Berthing <kernel@esmil.dk>
Date: Sat, 1 Apr 2023 19:19:13 +0800
Subject: dt-bindings: clock: Add StarFive JH7110 system clock and reset
 generator

Add bindings for the system clock and reset generator (SYSCRG) on the
JH7110 RISC-V SoC by StarFive Ltd.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
Signed-off-by: Hal Feng <hal.feng@starfivetech.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 .../bindings/clock/starfive,jh7110-syscrg.yaml     | 104 +++++++++++
 include/dt-bindings/clock/starfive,jh7110-crg.h    | 203 +++++++++++++++++++++
 include/dt-bindings/reset/starfive,jh7110-crg.h    | 142 ++++++++++++++
 3 files changed, 449 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml
 create mode 100644 include/dt-bindings/clock/starfive,jh7110-crg.h
 create mode 100644 include/dt-bindings/reset/starfive,jh7110-crg.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml
new file mode 100644
index 000000000000..84373ae31644
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7110-syscrg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7110 System Clock and Reset Generator
+
+maintainers:
+  - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+  compatible:
+    const: starfive,jh7110-syscrg
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    oneOf:
+      - items:
+          - description: Main Oscillator (24 MHz)
+          - description: GMAC1 RMII reference or GMAC1 RGMII RX
+          - description: External I2S TX bit clock
+          - description: External I2S TX left/right channel clock
+          - description: External I2S RX bit clock
+          - description: External I2S RX left/right channel clock
+          - description: External TDM clock
+          - description: External audio master clock
+
+      - items:
+          - description: Main Oscillator (24 MHz)
+          - description: GMAC1 RMII reference
+          - description: GMAC1 RGMII RX
+          - description: External I2S TX bit clock
+          - description: External I2S TX left/right channel clock
+          - description: External I2S RX bit clock
+          - description: External I2S RX left/right channel clock
+          - description: External TDM clock
+          - description: External audio master clock
+
+  clock-names:
+    oneOf:
+      - items:
+          - const: osc
+          - enum:
+              - gmac1_rmii_refin
+              - gmac1_rgmii_rxin
+          - const: i2stx_bclk_ext
+          - const: i2stx_lrck_ext
+          - const: i2srx_bclk_ext
+          - const: i2srx_lrck_ext
+          - const: tdm_ext
+          - const: mclk_ext
+
+      - items:
+          - const: osc
+          - const: gmac1_rmii_refin
+          - const: gmac1_rgmii_rxin
+          - const: i2stx_bclk_ext
+          - const: i2stx_lrck_ext
+          - const: i2srx_bclk_ext
+          - const: i2srx_lrck_ext
+          - const: tdm_ext
+          - const: mclk_ext
+
+  '#clock-cells':
+    const: 1
+    description:
+      See <dt-bindings/clock/starfive,jh7110-crg.h> for valid indices.
+
+  '#reset-cells':
+    const: 1
+    description:
+      See <dt-bindings/reset/starfive,jh7110-crg.h> for valid indices.
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - '#clock-cells'
+  - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    clock-controller@13020000 {
+        compatible = "starfive,jh7110-syscrg";
+        reg = <0x13020000 0x10000>;
+        clocks = <&osc>, <&gmac1_rmii_refin>,
+                 <&gmac1_rgmii_rxin>,
+                 <&i2stx_bclk_ext>, <&i2stx_lrck_ext>,
+                 <&i2srx_bclk_ext>, <&i2srx_lrck_ext>,
+                 <&tdm_ext>, <&mclk_ext>;
+        clock-names = "osc", "gmac1_rmii_refin",
+                      "gmac1_rgmii_rxin",
+                      "i2stx_bclk_ext", "i2stx_lrck_ext",
+                      "i2srx_bclk_ext", "i2srx_lrck_ext",
+                      "tdm_ext", "mclk_ext";
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/clock/starfive,jh7110-crg.h b/include/dt-bindings/clock/starfive,jh7110-crg.h
new file mode 100644
index 000000000000..fdd1852e34cc
--- /dev/null
+++ b/include/dt-bindings/clock/starfive,jh7110-crg.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2022 Emil Renner Berthing <kernel@esmil.dk>
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_STARFIVE_JH7110_CRG_H__
+#define __DT_BINDINGS_CLOCK_STARFIVE_JH7110_CRG_H__
+
+/* SYSCRG clocks */
+#define JH7110_SYSCLK_CPU_ROOT			0
+#define JH7110_SYSCLK_CPU_CORE			1
+#define JH7110_SYSCLK_CPU_BUS			2
+#define JH7110_SYSCLK_GPU_ROOT			3
+#define JH7110_SYSCLK_PERH_ROOT			4
+#define JH7110_SYSCLK_BUS_ROOT			5
+#define JH7110_SYSCLK_NOCSTG_BUS		6
+#define JH7110_SYSCLK_AXI_CFG0			7
+#define JH7110_SYSCLK_STG_AXIAHB		8
+#define JH7110_SYSCLK_AHB0			9
+#define JH7110_SYSCLK_AHB1			10
+#define JH7110_SYSCLK_APB_BUS			11
+#define JH7110_SYSCLK_APB0			12
+#define JH7110_SYSCLK_PLL0_DIV2			13
+#define JH7110_SYSCLK_PLL1_DIV2			14
+#define JH7110_SYSCLK_PLL2_DIV2			15
+#define JH7110_SYSCLK_AUDIO_ROOT		16
+#define JH7110_SYSCLK_MCLK_INNER		17
+#define JH7110_SYSCLK_MCLK			18
+#define JH7110_SYSCLK_MCLK_OUT			19
+#define JH7110_SYSCLK_ISP_2X			20
+#define JH7110_SYSCLK_ISP_AXI			21
+#define JH7110_SYSCLK_GCLK0			22
+#define JH7110_SYSCLK_GCLK1			23
+#define JH7110_SYSCLK_GCLK2			24
+#define JH7110_SYSCLK_CORE			25
+#define JH7110_SYSCLK_CORE1			26
+#define JH7110_SYSCLK_CORE2			27
+#define JH7110_SYSCLK_CORE3			28
+#define JH7110_SYSCLK_CORE4			29
+#define JH7110_SYSCLK_DEBUG			30
+#define JH7110_SYSCLK_RTC_TOGGLE		31
+#define JH7110_SYSCLK_TRACE0			32
+#define JH7110_SYSCLK_TRACE1			33
+#define JH7110_SYSCLK_TRACE2			34
+#define JH7110_SYSCLK_TRACE3			35
+#define JH7110_SYSCLK_TRACE4			36
+#define JH7110_SYSCLK_TRACE_COM			37
+#define JH7110_SYSCLK_NOC_BUS_CPU_AXI		38
+#define JH7110_SYSCLK_NOC_BUS_AXICFG0_AXI	39
+#define JH7110_SYSCLK_OSC_DIV2			40
+#define JH7110_SYSCLK_PLL1_DIV4			41
+#define JH7110_SYSCLK_PLL1_DIV8			42
+#define JH7110_SYSCLK_DDR_BUS			43
+#define JH7110_SYSCLK_DDR_AXI			44
+#define JH7110_SYSCLK_GPU_CORE			45
+#define JH7110_SYSCLK_GPU_CORE_CLK		46
+#define JH7110_SYSCLK_GPU_SYS_CLK		47
+#define JH7110_SYSCLK_GPU_APB			48
+#define JH7110_SYSCLK_GPU_RTC_TOGGLE		49
+#define JH7110_SYSCLK_NOC_BUS_GPU_AXI		50
+#define JH7110_SYSCLK_ISP_TOP_CORE		51
+#define JH7110_SYSCLK_ISP_TOP_AXI		52
+#define JH7110_SYSCLK_NOC_BUS_ISP_AXI		53
+#define JH7110_SYSCLK_HIFI4_CORE		54
+#define JH7110_SYSCLK_HIFI4_AXI			55
+#define JH7110_SYSCLK_AXI_CFG1_MAIN		56
+#define JH7110_SYSCLK_AXI_CFG1_AHB		57
+#define JH7110_SYSCLK_VOUT_SRC			58
+#define JH7110_SYSCLK_VOUT_AXI			59
+#define JH7110_SYSCLK_NOC_BUS_DISP_AXI		60
+#define JH7110_SYSCLK_VOUT_TOP_AHB		61
+#define JH7110_SYSCLK_VOUT_TOP_AXI		62
+#define JH7110_SYSCLK_VOUT_TOP_HDMITX0_MCLK	63
+#define JH7110_SYSCLK_VOUT_TOP_MIPIPHY_REF	64
+#define JH7110_SYSCLK_JPEGC_AXI			65
+#define JH7110_SYSCLK_CODAJ12_AXI		66
+#define JH7110_SYSCLK_CODAJ12_CORE		67
+#define JH7110_SYSCLK_CODAJ12_APB		68
+#define JH7110_SYSCLK_VDEC_AXI			69
+#define JH7110_SYSCLK_WAVE511_AXI		70
+#define JH7110_SYSCLK_WAVE511_BPU		71
+#define JH7110_SYSCLK_WAVE511_VCE		72
+#define JH7110_SYSCLK_WAVE511_APB		73
+#define JH7110_SYSCLK_VDEC_JPG			74
+#define JH7110_SYSCLK_VDEC_MAIN			75
+#define JH7110_SYSCLK_NOC_BUS_VDEC_AXI		76
+#define JH7110_SYSCLK_VENC_AXI			77
+#define JH7110_SYSCLK_WAVE420L_AXI		78
+#define JH7110_SYSCLK_WAVE420L_BPU		79
+#define JH7110_SYSCLK_WAVE420L_VCE		80
+#define JH7110_SYSCLK_WAVE420L_APB		81
+#define JH7110_SYSCLK_NOC_BUS_VENC_AXI		82
+#define JH7110_SYSCLK_AXI_CFG0_MAIN_DIV		83
+#define JH7110_SYSCLK_AXI_CFG0_MAIN		84
+#define JH7110_SYSCLK_AXI_CFG0_HIFI4		85
+#define JH7110_SYSCLK_AXIMEM2_AXI		86
+#define JH7110_SYSCLK_QSPI_AHB			87
+#define JH7110_SYSCLK_QSPI_APB			88
+#define JH7110_SYSCLK_QSPI_REF_SRC		89
+#define JH7110_SYSCLK_QSPI_REF			90
+#define JH7110_SYSCLK_SDIO0_AHB			91
+#define JH7110_SYSCLK_SDIO1_AHB			92
+#define JH7110_SYSCLK_SDIO0_SDCARD		93
+#define JH7110_SYSCLK_SDIO1_SDCARD		94
+#define JH7110_SYSCLK_USB_125M			95
+#define JH7110_SYSCLK_NOC_BUS_STG_AXI		96
+#define JH7110_SYSCLK_GMAC1_AHB			97
+#define JH7110_SYSCLK_GMAC1_AXI			98
+#define JH7110_SYSCLK_GMAC_SRC			99
+#define JH7110_SYSCLK_GMAC1_GTXCLK		100
+#define JH7110_SYSCLK_GMAC1_RMII_RTX		101
+#define JH7110_SYSCLK_GMAC1_PTP			102
+#define JH7110_SYSCLK_GMAC1_RX			103
+#define JH7110_SYSCLK_GMAC1_RX_INV		104
+#define JH7110_SYSCLK_GMAC1_TX			105
+#define JH7110_SYSCLK_GMAC1_TX_INV		106
+#define JH7110_SYSCLK_GMAC1_GTXC		107
+#define JH7110_SYSCLK_GMAC0_GTXCLK		108
+#define JH7110_SYSCLK_GMAC0_PTP			109
+#define JH7110_SYSCLK_GMAC_PHY			110
+#define JH7110_SYSCLK_GMAC0_GTXC		111
+#define JH7110_SYSCLK_IOMUX_APB			112
+#define JH7110_SYSCLK_MAILBOX_APB		113
+#define JH7110_SYSCLK_INT_CTRL_APB		114
+#define JH7110_SYSCLK_CAN0_APB			115
+#define JH7110_SYSCLK_CAN0_TIMER		116
+#define JH7110_SYSCLK_CAN0_CAN			117
+#define JH7110_SYSCLK_CAN1_APB			118
+#define JH7110_SYSCLK_CAN1_TIMER		119
+#define JH7110_SYSCLK_CAN1_CAN			120
+#define JH7110_SYSCLK_PWM_APB			121
+#define JH7110_SYSCLK_WDT_APB			122
+#define JH7110_SYSCLK_WDT_CORE			123
+#define JH7110_SYSCLK_TIMER_APB			124
+#define JH7110_SYSCLK_TIMER0			125
+#define JH7110_SYSCLK_TIMER1			126
+#define JH7110_SYSCLK_TIMER2			127
+#define JH7110_SYSCLK_TIMER3			128
+#define JH7110_SYSCLK_TEMP_APB			129
+#define JH7110_SYSCLK_TEMP_CORE			130
+#define JH7110_SYSCLK_SPI0_APB			131
+#define JH7110_SYSCLK_SPI1_APB			132
+#define JH7110_SYSCLK_SPI2_APB			133
+#define JH7110_SYSCLK_SPI3_APB			134
+#define JH7110_SYSCLK_SPI4_APB			135
+#define JH7110_SYSCLK_SPI5_APB			136
+#define JH7110_SYSCLK_SPI6_APB			137
+#define JH7110_SYSCLK_I2C0_APB			138
+#define JH7110_SYSCLK_I2C1_APB			139
+#define JH7110_SYSCLK_I2C2_APB			140
+#define JH7110_SYSCLK_I2C3_APB			141
+#define JH7110_SYSCLK_I2C4_APB			142
+#define JH7110_SYSCLK_I2C5_APB			143
+#define JH7110_SYSCLK_I2C6_APB			144
+#define JH7110_SYSCLK_UART0_APB			145
+#define JH7110_SYSCLK_UART0_CORE		146
+#define JH7110_SYSCLK_UART1_APB			147
+#define JH7110_SYSCLK_UART1_CORE		148
+#define JH7110_SYSCLK_UART2_APB			149
+#define JH7110_SYSCLK_UART2_CORE		150
+#define JH7110_SYSCLK_UART3_APB			151
+#define JH7110_SYSCLK_UART3_CORE		152
+#define JH7110_SYSCLK_UART4_APB			153
+#define JH7110_SYSCLK_UART4_CORE		154
+#define JH7110_SYSCLK_UART5_APB			155
+#define JH7110_SYSCLK_UART5_CORE		156
+#define JH7110_SYSCLK_PWMDAC_APB		157
+#define JH7110_SYSCLK_PWMDAC_CORE		158
+#define JH7110_SYSCLK_SPDIF_APB			159
+#define JH7110_SYSCLK_SPDIF_CORE		160
+#define JH7110_SYSCLK_I2STX0_APB		161
+#define JH7110_SYSCLK_I2STX0_BCLK_MST		162
+#define JH7110_SYSCLK_I2STX0_BCLK_MST_INV	163
+#define JH7110_SYSCLK_I2STX0_LRCK_MST		164
+#define JH7110_SYSCLK_I2STX0_BCLK		165
+#define JH7110_SYSCLK_I2STX0_BCLK_INV		166
+#define JH7110_SYSCLK_I2STX0_LRCK		167
+#define JH7110_SYSCLK_I2STX1_APB		168
+#define JH7110_SYSCLK_I2STX1_BCLK_MST		169
+#define JH7110_SYSCLK_I2STX1_BCLK_MST_INV	170
+#define JH7110_SYSCLK_I2STX1_LRCK_MST		171
+#define JH7110_SYSCLK_I2STX1_BCLK		172
+#define JH7110_SYSCLK_I2STX1_BCLK_INV		173
+#define JH7110_SYSCLK_I2STX1_LRCK		174
+#define JH7110_SYSCLK_I2SRX_APB			175
+#define JH7110_SYSCLK_I2SRX_BCLK_MST		176
+#define JH7110_SYSCLK_I2SRX_BCLK_MST_INV	177
+#define JH7110_SYSCLK_I2SRX_LRCK_MST		178
+#define JH7110_SYSCLK_I2SRX_BCLK		179
+#define JH7110_SYSCLK_I2SRX_BCLK_INV		180
+#define JH7110_SYSCLK_I2SRX_LRCK		181
+#define JH7110_SYSCLK_PDM_DMIC			182
+#define JH7110_SYSCLK_PDM_APB			183
+#define JH7110_SYSCLK_TDM_AHB			184
+#define JH7110_SYSCLK_TDM_APB			185
+#define JH7110_SYSCLK_TDM_INTERNAL		186
+#define JH7110_SYSCLK_TDM_TDM			187
+#define JH7110_SYSCLK_TDM_TDM_INV		188
+#define JH7110_SYSCLK_JTAG_CERTIFICATION_TRNG	189
+
+#define JH7110_SYSCLK_END			190
+
+#endif /* __DT_BINDINGS_CLOCK_STARFIVE_JH7110_CRG_H__ */
diff --git a/include/dt-bindings/reset/starfive,jh7110-crg.h b/include/dt-bindings/reset/starfive,jh7110-crg.h
new file mode 100644
index 000000000000..b88216a4fe40
--- /dev/null
+++ b/include/dt-bindings/reset/starfive,jh7110-crg.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2022 Emil Renner Berthing <kernel@esmil.dk>
+ */
+
+#ifndef __DT_BINDINGS_RESET_STARFIVE_JH7110_CRG_H__
+#define __DT_BINDINGS_RESET_STARFIVE_JH7110_CRG_H__
+
+/* SYSCRG resets */
+#define JH7110_SYSRST_JTAG_APB			0
+#define JH7110_SYSRST_SYSCON_APB		1
+#define JH7110_SYSRST_IOMUX_APB			2
+#define JH7110_SYSRST_BUS			3
+#define JH7110_SYSRST_DEBUG			4
+#define JH7110_SYSRST_CORE0			5
+#define JH7110_SYSRST_CORE1			6
+#define JH7110_SYSRST_CORE2			7
+#define JH7110_SYSRST_CORE3			8
+#define JH7110_SYSRST_CORE4			9
+#define JH7110_SYSRST_CORE0_ST			10
+#define JH7110_SYSRST_CORE1_ST			11
+#define JH7110_SYSRST_CORE2_ST			12
+#define JH7110_SYSRST_CORE3_ST			13
+#define JH7110_SYSRST_CORE4_ST			14
+#define JH7110_SYSRST_TRACE0			15
+#define JH7110_SYSRST_TRACE1			16
+#define JH7110_SYSRST_TRACE2			17
+#define JH7110_SYSRST_TRACE3			18
+#define JH7110_SYSRST_TRACE4			19
+#define JH7110_SYSRST_TRACE_COM			20
+#define JH7110_SYSRST_GPU_APB			21
+#define JH7110_SYSRST_GPU_DOMA			22
+#define JH7110_SYSRST_NOC_BUS_APB		23
+#define JH7110_SYSRST_NOC_BUS_AXICFG0_AXI	24
+#define JH7110_SYSRST_NOC_BUS_CPU_AXI		25
+#define JH7110_SYSRST_NOC_BUS_DISP_AXI		26
+#define JH7110_SYSRST_NOC_BUS_GPU_AXI		27
+#define JH7110_SYSRST_NOC_BUS_ISP_AXI		28
+#define JH7110_SYSRST_NOC_BUS_DDRC		29
+#define JH7110_SYSRST_NOC_BUS_STG_AXI		30
+#define JH7110_SYSRST_NOC_BUS_VDEC_AXI		31
+
+#define JH7110_SYSRST_NOC_BUS_VENC_AXI		32
+#define JH7110_SYSRST_AXI_CFG1_AHB		33
+#define JH7110_SYSRST_AXI_CFG1_MAIN		34
+#define JH7110_SYSRST_AXI_CFG0_MAIN		35
+#define JH7110_SYSRST_AXI_CFG0_MAIN_DIV		36
+#define JH7110_SYSRST_AXI_CFG0_HIFI4		37
+#define JH7110_SYSRST_DDR_AXI			38
+#define JH7110_SYSRST_DDR_OSC			39
+#define JH7110_SYSRST_DDR_APB			40
+#define JH7110_SYSRST_ISP_TOP			41
+#define JH7110_SYSRST_ISP_TOP_AXI		42
+#define JH7110_SYSRST_VOUT_TOP_SRC		43
+#define JH7110_SYSRST_CODAJ12_AXI		44
+#define JH7110_SYSRST_CODAJ12_CORE		45
+#define JH7110_SYSRST_CODAJ12_APB		46
+#define JH7110_SYSRST_WAVE511_AXI		47
+#define JH7110_SYSRST_WAVE511_BPU		48
+#define JH7110_SYSRST_WAVE511_VCE		49
+#define JH7110_SYSRST_WAVE511_APB		50
+#define JH7110_SYSRST_VDEC_JPG			51
+#define JH7110_SYSRST_VDEC_MAIN			52
+#define JH7110_SYSRST_AXIMEM0_AXI		53
+#define JH7110_SYSRST_WAVE420L_AXI		54
+#define JH7110_SYSRST_WAVE420L_BPU		55
+#define JH7110_SYSRST_WAVE420L_VCE		56
+#define JH7110_SYSRST_WAVE420L_APB		57
+#define JH7110_SYSRST_AXIMEM1_AXI		58
+#define JH7110_SYSRST_AXIMEM2_AXI		59
+#define JH7110_SYSRST_INTMEM			60
+#define JH7110_SYSRST_QSPI_AHB			61
+#define JH7110_SYSRST_QSPI_APB			62
+#define JH7110_SYSRST_QSPI_REF			63
+
+#define JH7110_SYSRST_SDIO0_AHB			64
+#define JH7110_SYSRST_SDIO1_AHB			65
+#define JH7110_SYSRST_GMAC1_AXI			66
+#define JH7110_SYSRST_GMAC1_AHB			67
+#define JH7110_SYSRST_MAILBOX_APB		68
+#define JH7110_SYSRST_SPI0_APB			69
+#define JH7110_SYSRST_SPI1_APB			70
+#define JH7110_SYSRST_SPI2_APB			71
+#define JH7110_SYSRST_SPI3_APB			72
+#define JH7110_SYSRST_SPI4_APB			73
+#define JH7110_SYSRST_SPI5_APB			74
+#define JH7110_SYSRST_SPI6_APB			75
+#define JH7110_SYSRST_I2C0_APB			76
+#define JH7110_SYSRST_I2C1_APB			77
+#define JH7110_SYSRST_I2C2_APB			78
+#define JH7110_SYSRST_I2C3_APB			79
+#define JH7110_SYSRST_I2C4_APB			80
+#define JH7110_SYSRST_I2C5_APB			81
+#define JH7110_SYSRST_I2C6_APB			82
+#define JH7110_SYSRST_UART0_APB			83
+#define JH7110_SYSRST_UART0_CORE		84
+#define JH7110_SYSRST_UART1_APB			85
+#define JH7110_SYSRST_UART1_CORE		86
+#define JH7110_SYSRST_UART2_APB			87
+#define JH7110_SYSRST_UART2_CORE		88
+#define JH7110_SYSRST_UART3_APB			89
+#define JH7110_SYSRST_UART3_CORE		90
+#define JH7110_SYSRST_UART4_APB			91
+#define JH7110_SYSRST_UART4_CORE		92
+#define JH7110_SYSRST_UART5_APB			93
+#define JH7110_SYSRST_UART5_CORE		94
+#define JH7110_SYSRST_SPDIF_APB			95
+
+#define JH7110_SYSRST_PWMDAC_APB		96
+#define JH7110_SYSRST_PDM_DMIC			97
+#define JH7110_SYSRST_PDM_APB			98
+#define JH7110_SYSRST_I2SRX_APB			99
+#define JH7110_SYSRST_I2SRX_BCLK		100
+#define JH7110_SYSRST_I2STX0_APB		101
+#define JH7110_SYSRST_I2STX0_BCLK		102
+#define JH7110_SYSRST_I2STX1_APB		103
+#define JH7110_SYSRST_I2STX1_BCLK		104
+#define JH7110_SYSRST_TDM_AHB			105
+#define JH7110_SYSRST_TDM_CORE			106
+#define JH7110_SYSRST_TDM_APB			107
+#define JH7110_SYSRST_PWM_APB			108
+#define JH7110_SYSRST_WDT_APB			109
+#define JH7110_SYSRST_WDT_CORE			110
+#define JH7110_SYSRST_CAN0_APB			111
+#define JH7110_SYSRST_CAN0_CORE			112
+#define JH7110_SYSRST_CAN0_TIMER		113
+#define JH7110_SYSRST_CAN1_APB			114
+#define JH7110_SYSRST_CAN1_CORE			115
+#define JH7110_SYSRST_CAN1_TIMER		116
+#define JH7110_SYSRST_TIMER_APB			117
+#define JH7110_SYSRST_TIMER0			118
+#define JH7110_SYSRST_TIMER1			119
+#define JH7110_SYSRST_TIMER2			120
+#define JH7110_SYSRST_TIMER3			121
+#define JH7110_SYSRST_INT_CTRL_APB		122
+#define JH7110_SYSRST_TEMP_APB			123
+#define JH7110_SYSRST_TEMP_CORE			124
+#define JH7110_SYSRST_JTAG_CERTIFICATION	125
+
+#define JH7110_SYSRST_END			126
+
+#endif /* __DT_BINDINGS_RESET_STARFIVE_JH7110_CRG_H__ */
-- 
cgit v1.2.3


From 3de0c91032580d4923624fe6ee773eddd1b574bc Mon Sep 17 00:00:00 2001
From: Emil Renner Berthing <kernel@esmil.dk>
Date: Sat, 1 Apr 2023 19:19:14 +0800
Subject: dt-bindings: clock: Add StarFive JH7110 always-on clock and reset
 generator

Add bindings for the always-on clock and reset generator (AONCRG) on the
JH7110 RISC-V SoC by StarFive Ltd.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
Signed-off-by: Hal Feng <hal.feng@starfivetech.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 .../bindings/clock/starfive,jh7110-aoncrg.yaml     | 107 +++++++++++++++++++++
 include/dt-bindings/clock/starfive,jh7110-crg.h    |  18 ++++
 include/dt-bindings/reset/starfive,jh7110-crg.h    |  12 +++
 3 files changed, 137 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml
new file mode 100644
index 000000000000..923680a44aef
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7110-aoncrg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7110 Always-On Clock and Reset Generator
+
+maintainers:
+  - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+  compatible:
+    const: starfive,jh7110-aoncrg
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    oneOf:
+      - items:
+          - description: Main Oscillator (24 MHz)
+          - description: GMAC0 RMII reference or GMAC0 RGMII RX
+          - description: STG AXI/AHB
+          - description: APB Bus
+          - description: GMAC0 GTX
+
+      - items:
+          - description: Main Oscillator (24 MHz)
+          - description: GMAC0 RMII reference or GMAC0 RGMII RX
+          - description: STG AXI/AHB or GMAC0 RGMII RX
+          - description: APB Bus or STG AXI/AHB
+          - description: GMAC0 GTX or APB Bus
+          - description: RTC Oscillator (32.768 kHz) or GMAC0 GTX
+
+      - items:
+          - description: Main Oscillator (24 MHz)
+          - description: GMAC0 RMII reference
+          - description: GMAC0 RGMII RX
+          - description: STG AXI/AHB
+          - description: APB Bus
+          - description: GMAC0 GTX
+          - description: RTC Oscillator (32.768 kHz)
+
+  clock-names:
+    oneOf:
+      - minItems: 5
+        items:
+          - const: osc
+          - enum:
+              - gmac0_rmii_refin
+              - gmac0_rgmii_rxin
+          - const: stg_axiahb
+          - const: apb_bus
+          - const: gmac0_gtxclk
+          - const: rtc_osc
+
+      - minItems: 6
+        items:
+          - const: osc
+          - const: gmac0_rmii_refin
+          - const: gmac0_rgmii_rxin
+          - const: stg_axiahb
+          - const: apb_bus
+          - const: gmac0_gtxclk
+          - const: rtc_osc
+
+  '#clock-cells':
+    const: 1
+    description:
+      See <dt-bindings/clock/starfive,jh7110-crg.h> for valid indices.
+
+  '#reset-cells':
+    const: 1
+    description:
+      See <dt-bindings/reset/starfive,jh7110-crg.h> for valid indices.
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - '#clock-cells'
+  - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/starfive,jh7110-crg.h>
+
+    clock-controller@17000000 {
+        compatible = "starfive,jh7110-aoncrg";
+        reg = <0x17000000 0x10000>;
+        clocks = <&osc>, <&gmac0_rmii_refin>,
+                 <&gmac0_rgmii_rxin>,
+                 <&syscrg JH7110_SYSCLK_STG_AXIAHB>,
+                 <&syscrg JH7110_SYSCLK_APB_BUS>,
+                 <&syscrg JH7110_SYSCLK_GMAC0_GTXCLK>,
+                 <&rtc_osc>;
+        clock-names = "osc", "gmac0_rmii_refin",
+                      "gmac0_rgmii_rxin", "stg_axiahb",
+                      "apb_bus", "gmac0_gtxclk",
+                      "rtc_osc";
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/clock/starfive,jh7110-crg.h b/include/dt-bindings/clock/starfive,jh7110-crg.h
index fdd1852e34cc..06257bfd9ac1 100644
--- a/include/dt-bindings/clock/starfive,jh7110-crg.h
+++ b/include/dt-bindings/clock/starfive,jh7110-crg.h
@@ -200,4 +200,22 @@
 
 #define JH7110_SYSCLK_END			190
 
+/* AONCRG clocks */
+#define JH7110_AONCLK_OSC_DIV4			0
+#define JH7110_AONCLK_APB_FUNC			1
+#define JH7110_AONCLK_GMAC0_AHB			2
+#define JH7110_AONCLK_GMAC0_AXI			3
+#define JH7110_AONCLK_GMAC0_RMII_RTX		4
+#define JH7110_AONCLK_GMAC0_TX			5
+#define JH7110_AONCLK_GMAC0_TX_INV		6
+#define JH7110_AONCLK_GMAC0_RX			7
+#define JH7110_AONCLK_GMAC0_RX_INV		8
+#define JH7110_AONCLK_OTPC_APB			9
+#define JH7110_AONCLK_RTC_APB			10
+#define JH7110_AONCLK_RTC_INTERNAL		11
+#define JH7110_AONCLK_RTC_32K			12
+#define JH7110_AONCLK_RTC_CAL			13
+
+#define JH7110_AONCLK_END			14
+
 #endif /* __DT_BINDINGS_CLOCK_STARFIVE_JH7110_CRG_H__ */
diff --git a/include/dt-bindings/reset/starfive,jh7110-crg.h b/include/dt-bindings/reset/starfive,jh7110-crg.h
index b88216a4fe40..d78e38690ceb 100644
--- a/include/dt-bindings/reset/starfive,jh7110-crg.h
+++ b/include/dt-bindings/reset/starfive,jh7110-crg.h
@@ -139,4 +139,16 @@
 
 #define JH7110_SYSRST_END			126
 
+/* AONCRG resets */
+#define JH7110_AONRST_GMAC0_AXI			0
+#define JH7110_AONRST_GMAC0_AHB			1
+#define JH7110_AONRST_IOMUX			2
+#define JH7110_AONRST_PMU_APB			3
+#define JH7110_AONRST_PMU_WKUP			4
+#define JH7110_AONRST_RTC_APB			5
+#define JH7110_AONRST_RTC_CAL			6
+#define JH7110_AONRST_RTC_32K			7
+
+#define JH7110_AONRST_END			8
+
 #endif /* __DT_BINDINGS_RESET_STARFIVE_JH7110_CRG_H__ */
-- 
cgit v1.2.3


From 7697c64b9e4908196f0ae68aa6d423dd40607973 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <william.gray@linaro.org>
Date: Wed, 5 Apr 2023 11:45:42 -0400
Subject: regmap: Pass irq_drv_data as a parameter for set_type_config()

Allow the struct regmap_irq_chip set_type_config() callback to access
irq_drv_data by passing it as a parameter.

Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20e15cd3afae80922b7e0577c7741df86b3390c5.1680708357.git.william.gray@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 8 +++++---
 include/linux/regmap.h           | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 8c903b8c9714..e489b235b36b 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -329,8 +329,8 @@ static int regmap_irq_set_type(struct irq_data *data, unsigned int type)
 	}
 
 	if (d->chip->set_type_config) {
-		ret = d->chip->set_type_config(d->config_buf, type,
-					       irq_data, reg);
+		ret = d->chip->set_type_config(d->config_buf, type, irq_data,
+					       reg, d->chip->irq_drv_data);
 		if (ret)
 			return ret;
 	}
@@ -651,13 +651,15 @@ EXPORT_SYMBOL_GPL(regmap_irq_get_irq_reg_linear);
  * @type: The requested IRQ type.
  * @irq_data: The IRQ being configured.
  * @idx: Index of the irq's config registers within each array `buf[i]`
+ * @irq_drv_data: Driver specific IRQ data
  *
  * This is a &struct regmap_irq_chip->set_type_config callback suitable for
  * chips with one config register. Register values are updated according to
  * the &struct regmap_irq_type data associated with an IRQ.
  */
 int regmap_irq_set_type_config_simple(unsigned int **buf, unsigned int type,
-				      const struct regmap_irq *irq_data, int idx)
+				      const struct regmap_irq *irq_data,
+				      int idx, void *irq_drv_data)
 {
 	const struct regmap_irq_type *t = &irq_data->type;
 
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4d10790adeb0..fb92d0ac13af 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1650,7 +1650,8 @@ struct regmap_irq_chip {
 	int (*set_type_virt)(unsigned int **buf, unsigned int type,
 			     unsigned long hwirq, int reg);
 	int (*set_type_config)(unsigned int **buf, unsigned int type,
-			       const struct regmap_irq *irq_data, int idx);
+			       const struct regmap_irq *irq_data, int idx,
+			       void *irq_drv_data);
 	unsigned int (*get_irq_reg)(struct regmap_irq_chip_data *data,
 				    unsigned int base, int index);
 	void *irq_drv_data;
@@ -1659,7 +1660,8 @@ struct regmap_irq_chip {
 unsigned int regmap_irq_get_irq_reg_linear(struct regmap_irq_chip_data *data,
 					   unsigned int base, int index);
 int regmap_irq_set_type_config_simple(unsigned int **buf, unsigned int type,
-				      const struct regmap_irq *irq_data, int idx);
+				      const struct regmap_irq *irq_data,
+				      int idx, void *irq_drv_data);
 
 int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags,
 			int irq_base, const struct regmap_irq_chip *chip,
-- 
cgit v1.2.3


From 673aa1ed1c9b6710bf24e3f0957d85e2f46c77db Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 4 Apr 2023 18:21:16 +0100
Subject: of: Rename of_modalias_node()

This helper does not produce a real modalias, but tries to get the
"product" compatible part of the "vendor,product" compatibles only. It
is far from creating a purely useful modalias string and does not seem
to be used like that directly anyway, so let's try to give this helper a
more meaningful name before moving there a real modalias helper (already
existing under of/device.c).

Also update the various documentations to refer to the strings as
"aliases" rather than "modaliases" which has a real meaning in the Linux
kernel.

There is no functional change.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Wolfram Sang <wsa@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Acked-by: Sebastian Reichel <sre@kernel.org>
Link: https://lore.kernel.org/r/20230404172148.82422-9-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/acpi/bus.c                |  7 ++++---
 drivers/gpu/drm/drm_mipi_dsi.c    |  2 +-
 drivers/hsi/hsi_core.c            |  2 +-
 drivers/i2c/busses/i2c-powermac.c |  2 +-
 drivers/i2c/i2c-core-of.c         |  2 +-
 drivers/of/base.c                 | 18 +++++++++++-------
 drivers/spi/spi.c                 |  4 ++--
 include/linux/of.h                |  3 ++-
 8 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 9531dd0fef50..fc74c786a867 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -817,9 +817,10 @@ static bool acpi_of_modalias(struct acpi_device *adev,
  * @modalias:   Pointer to buffer that modalias value will be copied into
  * @len:	Length of modalias buffer
  *
- * This is a counterpart of of_modalias_node() for struct acpi_device objects.
- * If there is a compatible string for @adev, it will be copied to @modalias
- * with the vendor prefix stripped; otherwise, @default_id will be used.
+ * This is a counterpart of of_alias_from_compatible() for struct acpi_device
+ * objects. If there is a compatible string for @adev, it will be copied to
+ * @modalias with the vendor prefix stripped; otherwise, @default_id will be
+ * used.
  */
 void acpi_set_modalias(struct acpi_device *adev, const char *default_id,
 		       char *modalias, size_t len)
diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c
index b41aaf2bb9f1..b62f5e4425f4 100644
--- a/drivers/gpu/drm/drm_mipi_dsi.c
+++ b/drivers/gpu/drm/drm_mipi_dsi.c
@@ -160,7 +160,7 @@ of_mipi_dsi_device_add(struct mipi_dsi_host *host, struct device_node *node)
 	int ret;
 	u32 reg;
 
-	if (of_modalias_node(node, info.type, sizeof(info.type)) < 0) {
+	if (of_alias_from_compatible(node, info.type, sizeof(info.type)) < 0) {
 		drm_err(host, "modalias failure on %pOF\n", node);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/drivers/hsi/hsi_core.c b/drivers/hsi/hsi_core.c
index 8fda8f1d064d..acbf82f755a8 100644
--- a/drivers/hsi/hsi_core.c
+++ b/drivers/hsi/hsi_core.c
@@ -207,7 +207,7 @@ static void hsi_add_client_from_dt(struct hsi_port *port,
 	if (!cl)
 		return;
 
-	err = of_modalias_node(client, name, sizeof(name));
+	err = of_alias_from_compatible(client, name, sizeof(name));
 	if (err)
 		goto err;
 
diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c
index 2e74747eec9c..ec706a3aba26 100644
--- a/drivers/i2c/busses/i2c-powermac.c
+++ b/drivers/i2c/busses/i2c-powermac.c
@@ -284,7 +284,7 @@ static bool i2c_powermac_get_type(struct i2c_adapter *adap,
 	 */
 
 	/* First try proper modalias */
-	if (of_modalias_node(node, tmp, sizeof(tmp)) >= 0) {
+	if (of_alias_from_compatible(node, tmp, sizeof(tmp)) >= 0) {
 		snprintf(type, type_size, "MAC,%s", tmp);
 		return true;
 	}
diff --git a/drivers/i2c/i2c-core-of.c b/drivers/i2c/i2c-core-of.c
index bce6b796e04c..8941a30574e3 100644
--- a/drivers/i2c/i2c-core-of.c
+++ b/drivers/i2c/i2c-core-of.c
@@ -27,7 +27,7 @@ int of_i2c_get_board_info(struct device *dev, struct device_node *node,
 
 	memset(info, 0, sizeof(*info));
 
-	if (of_modalias_node(node, info->type, sizeof(info->type)) < 0) {
+	if (of_alias_from_compatible(node, info->type, sizeof(info->type)) < 0) {
 		dev_err(dev, "of_i2c: modalias failure on %pOF\n", node);
 		return -EINVAL;
 	}
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ac6fde53342f..161fe3192c46 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1208,19 +1208,23 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from,
 EXPORT_SYMBOL(of_find_matching_node_and_match);
 
 /**
- * of_modalias_node - Lookup appropriate modalias for a device node
+ * of_alias_from_compatible - Lookup appropriate alias for a device node
+ *			      depending on compatible
  * @node:	pointer to a device tree node
- * @modalias:	Pointer to buffer that modalias value will be copied into
- * @len:	Length of modalias value
+ * @alias:	Pointer to buffer that alias value will be copied into
+ * @len:	Length of alias value
  *
  * Based on the value of the compatible property, this routine will attempt
- * to choose an appropriate modalias value for a particular device tree node.
+ * to choose an appropriate alias value for a particular device tree node.
  * It does this by stripping the manufacturer prefix (as delimited by a ',')
  * from the first entry in the compatible list property.
  *
+ * Note: The matching on just the "product" side of the compatible is a relic
+ * from I2C and SPI. Please do not add any new user.
+ *
  * Return: This routine returns 0 on success, <0 on failure.
  */
-int of_modalias_node(struct device_node *node, char *modalias, int len)
+int of_alias_from_compatible(const struct device_node *node, char *alias, int len)
 {
 	const char *compatible, *p;
 	int cplen;
@@ -1229,10 +1233,10 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
 	if (!compatible || strlen(compatible) > cplen)
 		return -ENODEV;
 	p = strchr(compatible, ',');
-	strscpy(modalias, p ? p + 1 : compatible, len);
+	strscpy(alias, p ? p + 1 : compatible, len);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(of_modalias_node);
+EXPORT_SYMBOL_GPL(of_alias_from_compatible);
 
 /**
  * of_find_node_by_phandle - Find a node given a phandle
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 44b85a8d47f1..3bbdc5fe3b99 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2354,8 +2354,8 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
 	}
 
 	/* Select device driver */
-	rc = of_modalias_node(nc, spi->modalias,
-				sizeof(spi->modalias));
+	rc = of_alias_from_compatible(nc, spi->modalias,
+				      sizeof(spi->modalias));
 	if (rc < 0) {
 		dev_err(&ctlr->dev, "cannot find modalias for %pOF\n", nc);
 		goto err_out;
diff --git a/include/linux/of.h b/include/linux/of.h
index 0af611307db2..b1eea8569043 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -373,7 +373,8 @@ extern int of_n_addr_cells(struct device_node *np);
 extern int of_n_size_cells(struct device_node *np);
 extern const struct of_device_id *of_match_node(
 	const struct of_device_id *matches, const struct device_node *node);
-extern int of_modalias_node(struct device_node *node, char *modalias, int len);
+extern int of_alias_from_compatible(const struct device_node *node, char *alias,
+				    int len);
 extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args);
 extern int __of_parse_phandle_with_args(const struct device_node *np,
 	const char *list_name, const char *cells_name, int cell_count,
-- 
cgit v1.2.3


From bd7a7ed774afd1a4174df34227626c95573be517 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 4 Apr 2023 18:21:17 +0100
Subject: of: Move of_modalias() to module.c

Create a specific .c file for OF related module handling.
Move of_modalias() inside as a first step.

The helper is exposed through of.h even though it is only used by core
files because the users from device.c will soon be split into an OF-only
helper in module.c as well as a device-oriented inline helper in
of_device.h. Putting this helper in of_private.h would require to
include of_private.h from of_device.h, which is not acceptable.

Suggested-by: Rob Herring <robh+dt@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-10-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/of/Makefile |  2 +-
 drivers/of/device.c | 37 -------------------------------------
 drivers/of/module.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h  |  9 +++++++++
 4 files changed, 54 insertions(+), 38 deletions(-)
 create mode 100644 drivers/of/module.c

(limited to 'include')

diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index e0360a44306e..ae9923fd2940 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y = base.o device.o platform.o property.o
+obj-y = base.o device.o module.o platform.o property.o
 obj-$(CONFIG_OF_KOBJ) += kobj.o
 obj-$(CONFIG_OF_DYNAMIC) += dynamic.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 351c505ecb50..7183cfd754db 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -248,42 +247,6 @@ const void *of_device_get_match_data(const struct device *dev)
 }
 EXPORT_SYMBOL(of_device_get_match_data);
 
-static ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len)
-{
-	const char *compat;
-	char *c;
-	struct property *p;
-	ssize_t csize;
-	ssize_t tsize;
-
-	/* Name & Type */
-	/* %p eats all alphanum characters, so %c must be used here */
-	csize = snprintf(str, len, "of:N%pOFn%c%s", np, 'T',
-			 of_node_get_device_type(np));
-	tsize = csize;
-	len -= csize;
-	if (str)
-		str += csize;
-
-	of_property_for_each_string(np, "compatible", p, compat) {
-		csize = strlen(compat) + 1;
-		tsize += csize;
-		if (csize > len)
-			continue;
-
-		csize = snprintf(str, len, "C%s", compat);
-		for (c = str; c; ) {
-			c = strchr(c, ' ');
-			if (c)
-				*c++ = '_';
-		}
-		len -= csize;
-		str += csize;
-	}
-
-	return tsize;
-}
-
 int of_device_request_module(struct device *dev)
 {
 	char *str;
diff --git a/drivers/of/module.c b/drivers/of/module.c
new file mode 100644
index 000000000000..4c59752bc8d6
--- /dev/null
+++ b/drivers/of/module.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Linux kernel module helpers.
+ */
+
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len)
+{
+	const char *compat;
+	char *c;
+	struct property *p;
+	ssize_t csize;
+	ssize_t tsize;
+
+	/* Name & Type */
+	/* %p eats all alphanum characters, so %c must be used here */
+	csize = snprintf(str, len, "of:N%pOFn%c%s", np, 'T',
+			 of_node_get_device_type(np));
+	tsize = csize;
+	len -= csize;
+	if (str)
+		str += csize;
+
+	of_property_for_each_string(np, "compatible", p, compat) {
+		csize = strlen(compat) + 1;
+		tsize += csize;
+		if (csize > len)
+			continue;
+
+		csize = snprintf(str, len, "C%s", compat);
+		for (c = str; c; ) {
+			c = strchr(c, ' ');
+			if (c)
+				*c++ = '_';
+		}
+		len -= csize;
+		str += csize;
+	}
+
+	return tsize;
+}
diff --git a/include/linux/of.h b/include/linux/of.h
index b1eea8569043..be26c7e8ef9e 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -385,6 +385,9 @@ extern int of_parse_phandle_with_args_map(const struct device_node *np,
 extern int of_count_phandle_with_args(const struct device_node *np,
 	const char *list_name, const char *cells_name);
 
+/* module functions */
+extern ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len);
+
 /* phandle iterator functions */
 extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
 				    const struct device_node *np,
@@ -742,6 +745,12 @@ static inline int of_count_phandle_with_args(const struct device_node *np,
 	return -ENOSYS;
 }
 
+static inline ssize_t of_modalias(const struct device_node *np, char *str,
+				  ssize_t len)
+{
+	return -ENODEV;
+}
+
 static inline int of_phandle_iterator_init(struct of_phandle_iterator *it,
 					   const struct device_node *np,
 					   const char *list_name,
-- 
cgit v1.2.3


From e6506f06d5e82765666902ccf9e9162f3e31d518 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 4 Apr 2023 18:21:18 +0100
Subject: of: Move the request module helper logic to module.c

Depending on device.c for pure OF handling is considered
backwards. Let's extract the content of of_device_request_module() to
have the real logic under module.c.

The next step will be to convert users of of_device_request_module() to
use the new helper.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-11-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/of/device.c | 25 ++-----------------------
 drivers/of/module.c | 30 ++++++++++++++++++++++++++++++
 include/linux/of.h  |  6 ++++++
 3 files changed, 38 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index 7183cfd754db..874a2e1f6308 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -8,7 +8,6 @@
 #include <linux/dma-direct.h> /* for bus_dma_region */
 #include <linux/dma-map-ops.h>
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
@@ -249,30 +248,10 @@ EXPORT_SYMBOL(of_device_get_match_data);
 
 int of_device_request_module(struct device *dev)
 {
-	char *str;
-	ssize_t size;
-	int ret;
-
-	if (!dev || !dev->of_node)
+	if (!dev)
 		return -ENODEV;
 
-	size = of_modalias(dev->of_node, NULL, 0);
-	if (size < 0)
-		return size;
-
-	/* Reserve an additional byte for the trailing '\0' */
-	size++;
-
-	str = kmalloc(size, GFP_KERNEL);
-	if (!str)
-		return -ENOMEM;
-
-	of_modalias(dev->of_node, str, size);
-	str[size - 1] = '\0';
-	ret = request_module(str);
-	kfree(str);
-
-	return ret;
+	return of_request_module(dev->of_node);
 }
 EXPORT_SYMBOL_GPL(of_device_request_module);
 
diff --git a/drivers/of/module.c b/drivers/of/module.c
index 4c59752bc8d6..0e8aa974f0f2 100644
--- a/drivers/of/module.c
+++ b/drivers/of/module.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/of.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 
@@ -42,3 +43,32 @@ ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len)
 
 	return tsize;
 }
+
+int of_request_module(const struct device_node *np)
+{
+	char *str;
+	ssize_t size;
+	int ret;
+
+	if (!np)
+		return -ENODEV;
+
+	size = of_modalias(np, NULL, 0);
+	if (size < 0)
+		return size;
+
+	/* Reserve an additional byte for the trailing '\0' */
+	size++;
+
+	str = kmalloc(size, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	of_modalias(np, str, size);
+	str[size - 1] = '\0';
+	ret = request_module(str);
+	kfree(str);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_request_module);
diff --git a/include/linux/of.h b/include/linux/of.h
index be26c7e8ef9e..9b7a99499ef3 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -387,6 +387,7 @@ extern int of_count_phandle_with_args(const struct device_node *np,
 
 /* module functions */
 extern ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len);
+extern int of_request_module(const struct device_node *np);
 
 /* phandle iterator functions */
 extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
@@ -751,6 +752,11 @@ static inline ssize_t of_modalias(const struct device_node *np, char *str,
 	return -ENODEV;
 }
 
+static inline int of_request_module(const struct device_node *np)
+{
+	return -ENODEV;
+}
+
 static inline int of_phandle_iterator_init(struct of_phandle_iterator *it,
 					   const struct device_node *np,
 					   const char *list_name,
-- 
cgit v1.2.3


From 2f555f58f5ce33b201235e104823662d5573c4a5 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 4 Apr 2023 18:21:20 +0100
Subject: of: device: Kill of_device_request_module()

A new helper has been introduced, of_request_module(). Users have been
converted, this helper can now be deleted.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-13-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/of/device.c       | 9 ---------
 include/linux/of_device.h | 6 ------
 2 files changed, 15 deletions(-)

(limited to 'include')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index 874a2e1f6308..0f00f1b80708 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -246,15 +246,6 @@ const void *of_device_get_match_data(const struct device *dev)
 }
 EXPORT_SYMBOL(of_device_get_match_data);
 
-int of_device_request_module(struct device *dev)
-{
-	if (!dev)
-		return -ENODEV;
-
-	return of_request_module(dev->of_node);
-}
-EXPORT_SYMBOL_GPL(of_device_request_module);
-
 /**
  * of_device_modalias - Fill buffer with newline terminated modalias string
  * @dev:	Calling device
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index f4b57614979d..ce20d8b00b3e 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -33,7 +33,6 @@ extern void of_device_unregister(struct platform_device *ofdev);
 extern const void *of_device_get_match_data(const struct device *dev);
 
 extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len);
-extern int of_device_request_module(struct device *dev);
 
 extern void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env);
 extern int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env);
@@ -78,11 +77,6 @@ static inline int of_device_modalias(struct device *dev,
 	return -ENODEV;
 }
 
-static inline int of_device_request_module(struct device *dev)
-{
-	return -ENODEV;
-}
-
 static inline int of_device_uevent_modalias(const struct device *dev,
 				   struct kobj_uevent_env *env)
 {
-- 
cgit v1.2.3


From 266570f496b90dea8fda893c2cf7c28d63ae2bd9 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 4 Apr 2023 18:21:21 +0100
Subject: nvmem: core: introduce NVMEM layouts

NVMEM layouts are used to generate NVMEM cells during runtime. Think of
an EEPROM with a well-defined conent. For now, the content can be
described by a device tree or a board file. But this only works if the
offsets and lengths are static and don't change. One could also argue
that putting the layout of the EEPROM in the device tree is the wrong
place. Instead, the device tree should just have a specific compatible
string.

Right now there are two use cases:
 (1) The NVMEM cell needs special processing. E.g. if it only specifies
     a base MAC address offset and you need to add an offset, or it
     needs to parse a MAC from ASCII format or some proprietary format.
     (Post processing of cells is added in a later commit).
 (2) u-boot environment parsing. The cells don't have a particular
     offset but it needs parsing the content to determine the offsets
     and length.

Co-developed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-14-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/nvmem.rst |  15 +++++
 drivers/nvmem/Kconfig              |   4 ++
 drivers/nvmem/Makefile             |   1 +
 drivers/nvmem/core.c               | 120 +++++++++++++++++++++++++++++++++++++
 drivers/nvmem/layouts/Kconfig      |   5 ++
 drivers/nvmem/layouts/Makefile     |   4 ++
 include/linux/nvmem-consumer.h     |   7 +++
 include/linux/nvmem-provider.h     |  51 ++++++++++++++++
 8 files changed, 207 insertions(+)
 create mode 100644 drivers/nvmem/layouts/Kconfig
 create mode 100644 drivers/nvmem/layouts/Makefile

(limited to 'include')

diff --git a/Documentation/driver-api/nvmem.rst b/Documentation/driver-api/nvmem.rst
index e3366322d46c..de221e91c8e3 100644
--- a/Documentation/driver-api/nvmem.rst
+++ b/Documentation/driver-api/nvmem.rst
@@ -185,3 +185,18 @@ ex::
 =====================
 
 See Documentation/devicetree/bindings/nvmem/nvmem.txt
+
+8. NVMEM layouts
+================
+
+NVMEM layouts are yet another mechanism to create cells. With the device
+tree binding it is possible to specify simple cells by using an offset
+and a length. Sometimes, the cells doesn't have a static offset, but
+the content is still well defined, e.g. tag-length-values. In this case,
+the NVMEM device content has to be first parsed and the cells need to
+be added accordingly. Layouts let you read the content of the NVMEM device
+and let you add cells dynamically.
+
+Another use case for layouts is the post processing of cells. With layouts,
+it is possible to associate a custom post processing hook to a cell. It
+even possible to add this hook to cells not created by the layout itself.
diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig
index 3b3832f4dfad..a2afba11c890 100644
--- a/drivers/nvmem/Kconfig
+++ b/drivers/nvmem/Kconfig
@@ -21,6 +21,10 @@ config NVMEM_SYSFS
 	 This interface is mostly used by userspace applications to
 	 read/write directly into nvmem.
 
+# Layouts
+
+source "drivers/nvmem/layouts/Kconfig"
+
 # Devices
 
 config NVMEM_APPLE_EFUSES
diff --git a/drivers/nvmem/Makefile b/drivers/nvmem/Makefile
index 6a1efffa88f0..f82431ec8aef 100644
--- a/drivers/nvmem/Makefile
+++ b/drivers/nvmem/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_NVMEM)		+= nvmem_core.o
 nvmem_core-y			:= core.o
+obj-y				+= layouts/
 
 # Devices
 obj-$(CONFIG_NVMEM_APPLE_EFUSES)	+= nvmem-apple-efuses.o
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 22024b830788..b9be1faeb7be 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -40,6 +40,7 @@ struct nvmem_device {
 	nvmem_reg_write_t	reg_write;
 	nvmem_cell_post_process_t cell_post_process;
 	struct gpio_desc	*wp_gpio;
+	struct nvmem_layout	*layout;
 	void *priv;
 };
 
@@ -74,6 +75,9 @@ static LIST_HEAD(nvmem_lookup_list);
 
 static BLOCKING_NOTIFIER_HEAD(nvmem_notifier);
 
+static DEFINE_SPINLOCK(nvmem_layout_lock);
+static LIST_HEAD(nvmem_layouts);
+
 static int __nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset,
 			    void *val, size_t bytes)
 {
@@ -728,6 +732,101 @@ static int nvmem_add_cells_from_of(struct nvmem_device *nvmem)
 	return 0;
 }
 
+int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner)
+{
+	layout->owner = owner;
+
+	spin_lock(&nvmem_layout_lock);
+	list_add(&layout->node, &nvmem_layouts);
+	spin_unlock(&nvmem_layout_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__nvmem_layout_register);
+
+void nvmem_layout_unregister(struct nvmem_layout *layout)
+{
+	spin_lock(&nvmem_layout_lock);
+	list_del(&layout->node);
+	spin_unlock(&nvmem_layout_lock);
+}
+EXPORT_SYMBOL_GPL(nvmem_layout_unregister);
+
+static struct nvmem_layout *nvmem_layout_get(struct nvmem_device *nvmem)
+{
+	struct device_node *layout_np, *np = nvmem->dev.of_node;
+	struct nvmem_layout *l, *layout = NULL;
+
+	layout_np = of_get_child_by_name(np, "nvmem-layout");
+	if (!layout_np)
+		return NULL;
+
+	spin_lock(&nvmem_layout_lock);
+
+	list_for_each_entry(l, &nvmem_layouts, node) {
+		if (of_match_node(l->of_match_table, layout_np)) {
+			if (try_module_get(l->owner))
+				layout = l;
+
+			break;
+		}
+	}
+
+	spin_unlock(&nvmem_layout_lock);
+	of_node_put(layout_np);
+
+	return layout;
+}
+
+static void nvmem_layout_put(struct nvmem_layout *layout)
+{
+	if (layout)
+		module_put(layout->owner);
+}
+
+static int nvmem_add_cells_from_layout(struct nvmem_device *nvmem)
+{
+	struct nvmem_layout *layout = nvmem->layout;
+	int ret;
+
+	if (layout && layout->add_cells) {
+		ret = layout->add_cells(&nvmem->dev, nvmem, layout);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_OF)
+/**
+ * of_nvmem_layout_get_container() - Get OF node to layout container.
+ *
+ * @nvmem: nvmem device.
+ *
+ * Return: a node pointer with refcount incremented or NULL if no
+ * container exists. Use of_node_put() on it when done.
+ */
+struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
+{
+	return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
+}
+EXPORT_SYMBOL_GPL(of_nvmem_layout_get_container);
+#endif
+
+const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
+					struct nvmem_layout *layout)
+{
+	struct device_node __maybe_unused *layout_np;
+	const struct of_device_id *match;
+
+	layout_np = of_nvmem_layout_get_container(nvmem);
+	match = of_match_node(layout->of_match_table, layout_np);
+
+	return match ? match->data : NULL;
+}
+EXPORT_SYMBOL_GPL(nvmem_layout_get_match_data);
+
 /**
  * nvmem_register() - Register a nvmem device for given nvmem_config.
  * Also creates a binary entry in /sys/bus/nvmem/devices/dev-name/nvmem
@@ -834,6 +933,12 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 			goto err_put_device;
 	}
 
+	/*
+	 * If the driver supplied a layout by config->layout, the module
+	 * pointer will be NULL and nvmem_layout_put() will be a noop.
+	 */
+	nvmem->layout = config->layout ?: nvmem_layout_get(nvmem);
+
 	if (config->cells) {
 		rval = nvmem_add_cells(nvmem, config->cells, config->ncells);
 		if (rval)
@@ -854,12 +959,17 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	if (rval)
 		goto err_remove_cells;
 
+	rval = nvmem_add_cells_from_layout(nvmem);
+	if (rval)
+		goto err_remove_cells;
+
 	blocking_notifier_call_chain(&nvmem_notifier, NVMEM_ADD, nvmem);
 
 	return nvmem;
 
 err_remove_cells:
 	nvmem_device_remove_all_cells(nvmem);
+	nvmem_layout_put(nvmem->layout);
 	if (config->compat)
 		nvmem_sysfs_remove_compat(nvmem, config);
 err_put_device:
@@ -881,6 +991,7 @@ static void nvmem_device_release(struct kref *kref)
 		device_remove_bin_file(nvmem->base_dev, &nvmem->eeprom);
 
 	nvmem_device_remove_all_cells(nvmem);
+	nvmem_layout_put(nvmem->layout);
 	device_unregister(&nvmem->dev);
 }
 
@@ -1246,6 +1357,15 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, const char *id)
 		return ERR_PTR(-EINVAL);
 	}
 
+	/* nvmem layouts produce cells within the nvmem-layout container */
+	if (of_node_name_eq(nvmem_np, "nvmem-layout")) {
+		nvmem_np = of_get_next_parent(nvmem_np);
+		if (!nvmem_np) {
+			of_node_put(cell_np);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
 	nvmem = __nvmem_device_get(nvmem_np, device_match_of_node);
 	of_node_put(nvmem_np);
 	if (IS_ERR(nvmem)) {
diff --git a/drivers/nvmem/layouts/Kconfig b/drivers/nvmem/layouts/Kconfig
new file mode 100644
index 000000000000..9ad3911d1605
--- /dev/null
+++ b/drivers/nvmem/layouts/Kconfig
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Layout Types"
+
+endmenu
diff --git a/drivers/nvmem/layouts/Makefile b/drivers/nvmem/layouts/Makefile
new file mode 100644
index 000000000000..6fdb3c60a4fa
--- /dev/null
+++ b/drivers/nvmem/layouts/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for nvmem layouts.
+#
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 1f62f7ba71ca..fa030d93b768 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -239,6 +239,7 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 				     const char *id);
 struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 					 const char *name);
+struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem);
 #else
 static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 						   const char *id)
@@ -251,6 +252,12 @@ static inline struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct device_node *
+of_nvmem_layout_get_container(struct nvmem_device *nvmem)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
 #endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 0262b86194eb..535c5f9f3309 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -88,6 +88,7 @@ struct nvmem_cell_info {
  * @stride:	Minimum read/write access stride.
  * @priv:	User context passed to read/write callbacks.
  * @ignore_wp:  Write Protect pin is managed by the provider.
+ * @layout:	Fixed layout associated with this nvmem device.
  *
  * Note: A default "nvmem<id>" name will be assigned to the device if
  * no name is specified in its configuration. In such case "<id>" is
@@ -109,6 +110,7 @@ struct nvmem_config {
 	bool			read_only;
 	bool			root_only;
 	bool			ignore_wp;
+	struct nvmem_layout	*layout;
 	struct device_node	*of_node;
 	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
@@ -142,6 +144,33 @@ struct nvmem_cell_table {
 	struct list_head	node;
 };
 
+/**
+ * struct nvmem_layout - NVMEM layout definitions
+ *
+ * @name:		Layout name.
+ * @of_match_table:	Open firmware match table.
+ * @add_cells:		Will be called if a nvmem device is found which
+ *			has this layout. The function will add layout
+ *			specific cells with nvmem_add_one_cell().
+ * @owner:		Pointer to struct module.
+ * @node:		List node.
+ *
+ * A nvmem device can hold a well defined structure which can just be
+ * evaluated during runtime. For example a TLV list, or a list of "name=val"
+ * pairs. A nvmem layout can parse the nvmem device and add appropriate
+ * cells.
+ */
+struct nvmem_layout {
+	const char *name;
+	const struct of_device_id *of_match_table;
+	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem,
+			 struct nvmem_layout *layout);
+
+	/* private */
+	struct module *owner;
+	struct list_head node;
+};
+
 #if IS_ENABLED(CONFIG_NVMEM)
 
 struct nvmem_device *nvmem_register(const struct nvmem_config *cfg);
@@ -156,6 +185,14 @@ void nvmem_del_cell_table(struct nvmem_cell_table *table);
 int nvmem_add_one_cell(struct nvmem_device *nvmem,
 		       const struct nvmem_cell_info *info);
 
+int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner);
+#define nvmem_layout_register(layout) \
+	__nvmem_layout_register(layout, THIS_MODULE)
+void nvmem_layout_unregister(struct nvmem_layout *layout);
+
+const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
+					struct nvmem_layout *layout);
+
 #else
 
 static inline struct nvmem_device *nvmem_register(const struct nvmem_config *c)
@@ -179,5 +216,19 @@ static inline int nvmem_add_one_cell(struct nvmem_device *nvmem,
 	return -EOPNOTSUPP;
 }
 
+static inline int nvmem_layout_register(struct nvmem_layout *layout)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void nvmem_layout_unregister(struct nvmem_layout *layout) {}
+
+static inline const void *
+nvmem_layout_get_match_data(struct nvmem_device *nvmem,
+			    struct nvmem_layout *layout)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_NVMEM */
 #endif  /* ifndef _LINUX_NVMEM_PROVIDER_H */
-- 
cgit v1.2.3


From 345ec382cd4b736c36e01f155d08c913b225b736 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 4 Apr 2023 18:21:24 +0100
Subject: nvmem: core: add per-cell post processing

Instead of relying on the name the consumer is using for the cell, like
it is done for the nvmem .cell_post_process configuration parameter,
provide a per-cell post processing hook. This can then be populated by
the NVMEM provider (or the NVMEM layout) when adding the cell.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-17-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 17 +++++++++++++++++
 include/linux/nvmem-provider.h |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 49b4bbaf59e8..0708f9f27898 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -54,6 +54,7 @@ struct nvmem_cell_entry {
 	int			bytes;
 	int			bit_offset;
 	int			nbits;
+	nvmem_cell_post_process_t read_post_process;
 	struct device_node	*np;
 	struct nvmem_device	*nvmem;
 	struct list_head	node;
@@ -470,6 +471,7 @@ static int nvmem_cell_info_to_nvmem_cell_entry_nodup(struct nvmem_device *nvmem,
 	cell->offset = info->offset;
 	cell->bytes = info->bytes;
 	cell->name = info->name;
+	cell->read_post_process = info->read_post_process;
 
 	cell->bit_offset = info->bit_offset;
 	cell->nbits = info->nbits;
@@ -1563,6 +1565,13 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 	if (cell->bit_offset || cell->nbits)
 		nvmem_shift_read_buffer_in_place(cell, buf);
 
+	if (cell->read_post_process) {
+		rc = cell->read_post_process(nvmem->priv, id, index,
+					     cell->offset, buf, cell->bytes);
+		if (rc)
+			return rc;
+	}
+
 	if (nvmem->cell_post_process) {
 		rc = nvmem->cell_post_process(nvmem->priv, id, index,
 					      cell->offset, buf, cell->bytes);
@@ -1671,6 +1680,14 @@ static int __nvmem_cell_entry_write(struct nvmem_cell_entry *cell, void *buf, si
 	    (cell->bit_offset == 0 && len != cell->bytes))
 		return -EINVAL;
 
+	/*
+	 * Any cells which have a read_post_process hook are read-only because
+	 * we cannot reverse the operation and it might affect other cells,
+	 * too.
+	 */
+	if (cell->read_post_process)
+		return -EINVAL;
+
 	if (cell->bit_offset || cell->nbits) {
 		buf = nvmem_cell_prepare_write_buffer(cell, buf, len);
 		if (IS_ERR(buf))
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 535c5f9f3309..3bfc23553a9e 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -54,6 +54,8 @@ struct nvmem_keepout {
  * @bit_offset:	Bit offset if cell is smaller than a byte.
  * @nbits:	Number of bits.
  * @np:		Optional device_node pointer.
+ * @read_post_process:	Callback for optional post processing of cell data
+ *			on reads.
  */
 struct nvmem_cell_info {
 	const char		*name;
@@ -62,6 +64,7 @@ struct nvmem_cell_info {
 	unsigned int		bit_offset;
 	unsigned int		nbits;
 	struct device_node	*np;
+	nvmem_cell_post_process_t read_post_process;
 };
 
 /**
-- 
cgit v1.2.3


From de12c9691501ccba41a154c223869f82be4c12fd Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 4 Apr 2023 18:21:25 +0100
Subject: nvmem: core: allow to modify a cell before adding it

Provide a way to modify a cell before it will get added. This is useful
to attach a custom post processing hook via a layout.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-18-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 4 ++++
 include/linux/nvmem-provider.h | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 0708f9f27898..f43025ad315b 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -695,6 +695,7 @@ static int nvmem_validate_keepouts(struct nvmem_device *nvmem)
 
 static int nvmem_add_cells_from_of(struct nvmem_device *nvmem)
 {
+	struct nvmem_layout *layout = nvmem->layout;
 	struct device *dev = &nvmem->dev;
 	struct device_node *child;
 	const __be32 *addr;
@@ -724,6 +725,9 @@ static int nvmem_add_cells_from_of(struct nvmem_device *nvmem)
 
 		info.np = of_node_get(child);
 
+		if (layout && layout->fixup_cell_info)
+			layout->fixup_cell_info(nvmem, layout, &info);
+
 		ret = nvmem_add_one_cell(nvmem, &info);
 		kfree(info.name);
 		if (ret) {
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 3bfc23553a9e..be81cc88eabc 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -155,6 +155,8 @@ struct nvmem_cell_table {
  * @add_cells:		Will be called if a nvmem device is found which
  *			has this layout. The function will add layout
  *			specific cells with nvmem_add_one_cell().
+ * @fixup_cell_info:	Will be called before a cell is added. Can be
+ *			used to modify the nvmem_cell_info.
  * @owner:		Pointer to struct module.
  * @node:		List node.
  *
@@ -168,6 +170,9 @@ struct nvmem_layout {
 	const struct of_device_id *of_match_table;
 	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem,
 			 struct nvmem_layout *layout);
+	void (*fixup_cell_info)(struct nvmem_device *nvmem,
+				struct nvmem_layout *layout,
+				struct nvmem_cell_info *cell);
 
 	/* private */
 	struct module *owner;
-- 
cgit v1.2.3


From 011e40a166fdaa65fb9946b7cd91efec85b70dbb Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 4 Apr 2023 18:21:27 +0100
Subject: nvmem: cell: drop global cell_post_process

There are no users anymore for the global cell_post_process callback
anymore. New users should use proper nvmem layouts.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-20-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 9 ---------
 include/linux/nvmem-provider.h | 2 --
 2 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index f43025ad315b..fccb2728193a 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -39,7 +39,6 @@ struct nvmem_device {
 	unsigned int		nkeepout;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
-	nvmem_cell_post_process_t cell_post_process;
 	struct gpio_desc	*wp_gpio;
 	struct nvmem_layout	*layout;
 	void *priv;
@@ -903,7 +902,6 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	nvmem->type = config->type;
 	nvmem->reg_read = config->reg_read;
 	nvmem->reg_write = config->reg_write;
-	nvmem->cell_post_process = config->cell_post_process;
 	nvmem->keepout = config->keepout;
 	nvmem->nkeepout = config->nkeepout;
 	if (config->of_node)
@@ -1576,13 +1574,6 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 			return rc;
 	}
 
-	if (nvmem->cell_post_process) {
-		rc = nvmem->cell_post_process(nvmem->priv, id, index,
-					      cell->offset, buf, cell->bytes);
-		if (rc)
-			return rc;
-	}
-
 	if (len)
 		*len = cell->bytes;
 
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index be81cc88eabc..d3d7af86a283 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -85,7 +85,6 @@ struct nvmem_cell_info {
  * @no_of_node:	Device should not use the parent's of_node even if it's !NULL.
  * @reg_read:	Callback to read data.
  * @reg_write:	Callback to write data.
- * @cell_post_process:	Callback for vendor specific post processing of cell data
  * @size:	Device size.
  * @word_size:	Minimum read/write access granularity.
  * @stride:	Minimum read/write access stride.
@@ -118,7 +117,6 @@ struct nvmem_config {
 	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
-	nvmem_cell_post_process_t cell_post_process;
 	int	size;
 	int	word_size;
 	int	stride;
-- 
cgit v1.2.3


From 8a134fd9f9323f4c39ec27055b3d3723cfb5c1e9 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 4 Apr 2023 18:21:28 +0100
Subject: nvmem: core: provide own priv pointer in post process callback

It doesn't make any more sense to have a opaque pointer set up by the
nvmem device. Usually, the layout isn't associated with a particular
nvmem device. Instead, let the caller who set the post process callback
provide the priv pointer.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-21-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 4 +++-
 include/linux/nvmem-provider.h | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index fccb2728193a..212c5ba5789f 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -54,6 +54,7 @@ struct nvmem_cell_entry {
 	int			bit_offset;
 	int			nbits;
 	nvmem_cell_post_process_t read_post_process;
+	void			*priv;
 	struct device_node	*np;
 	struct nvmem_device	*nvmem;
 	struct list_head	node;
@@ -471,6 +472,7 @@ static int nvmem_cell_info_to_nvmem_cell_entry_nodup(struct nvmem_device *nvmem,
 	cell->bytes = info->bytes;
 	cell->name = info->name;
 	cell->read_post_process = info->read_post_process;
+	cell->priv = info->priv;
 
 	cell->bit_offset = info->bit_offset;
 	cell->nbits = info->nbits;
@@ -1568,7 +1570,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 		nvmem_shift_read_buffer_in_place(cell, buf);
 
 	if (cell->read_post_process) {
-		rc = cell->read_post_process(nvmem->priv, id, index,
+		rc = cell->read_post_process(cell->priv, id, index,
 					     cell->offset, buf, cell->bytes);
 		if (rc)
 			return rc;
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index d3d7af86a283..0cf9f9490514 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -20,7 +20,8 @@ typedef int (*nvmem_reg_write_t)(void *priv, unsigned int offset,
 				 void *val, size_t bytes);
 /* used for vendor specific post processing of cell data */
 typedef int (*nvmem_cell_post_process_t)(void *priv, const char *id, int index,
-					 unsigned int offset, void *buf, size_t bytes);
+					 unsigned int offset, void *buf,
+					 size_t bytes);
 
 enum nvmem_type {
 	NVMEM_TYPE_UNKNOWN = 0,
@@ -56,6 +57,7 @@ struct nvmem_keepout {
  * @np:		Optional device_node pointer.
  * @read_post_process:	Callback for optional post processing of cell data
  *			on reads.
+ * @priv:	Opaque data passed to the read_post_process hook.
  */
 struct nvmem_cell_info {
 	const char		*name;
@@ -65,6 +67,7 @@ struct nvmem_cell_info {
 	unsigned int		nbits;
 	struct device_node	*np;
 	nvmem_cell_post_process_t read_post_process;
+	void			*priv;
 };
 
 /**
-- 
cgit v1.2.3


From 55d4980ce55b6bb4be66877de4dbec513911b988 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Tue, 4 Apr 2023 18:21:42 +0100
Subject: nvmem: core: support specifying both: cell raw data & post read
 lengths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Callback .read_post_process() is designed to modify raw cell content
before providing it to the consumer. So far we were dealing with
modifications that didn't affect cell size (length). In some cases
however cell content needs to be reformatted and resized.

It's required e.g. to provide properly formatted MAC address in case
it's stored in a non-binary format (e.g. using ASCII).

There were few discussions how to optimally handle that. Following
possible solutions were considered:
1. Allow .read_post_process() to realloc (resize) content buffer
2. Allow .read_post_process() to adjust (decrease) just buffer length
3. Register NVMEM cells using post-read sizes

The preferred solution was the last one. The problem is that simply
adjusting "bytes" in NVMEM providers would result in core code NOT
passing whole raw data to .read_post_process() callbacks. It means
callback functions couldn't do their job without somehow manually
reading original cell content on their own.

This patch deals with that by registering NVMEM cells with both lengths:
raw content one and post read one. It allows:
1. Core code to read whole raw cell content
2. Callbacks to return content they want

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-35-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 11 +++++++----
 include/linux/nvmem-provider.h |  2 ++
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 212c5ba5789f..a62973d010ff 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -50,6 +50,7 @@ struct nvmem_device {
 struct nvmem_cell_entry {
 	const char		*name;
 	int			offset;
+	size_t			raw_len;
 	int			bytes;
 	int			bit_offset;
 	int			nbits;
@@ -469,6 +470,7 @@ static int nvmem_cell_info_to_nvmem_cell_entry_nodup(struct nvmem_device *nvmem,
 {
 	cell->nvmem = nvmem;
 	cell->offset = info->offset;
+	cell->raw_len = info->raw_len ?: info->bytes;
 	cell->bytes = info->bytes;
 	cell->name = info->name;
 	cell->read_post_process = info->read_post_process;
@@ -1560,7 +1562,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 {
 	int rc;
 
-	rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes);
+	rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->raw_len);
 
 	if (rc)
 		return rc;
@@ -1571,7 +1573,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 
 	if (cell->read_post_process) {
 		rc = cell->read_post_process(cell->priv, id, index,
-					     cell->offset, buf, cell->bytes);
+					     cell->offset, buf, cell->raw_len);
 		if (rc)
 			return rc;
 	}
@@ -1594,14 +1596,15 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
  */
 void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
 {
-	struct nvmem_device *nvmem = cell->entry->nvmem;
+	struct nvmem_cell_entry *entry = cell->entry;
+	struct nvmem_device *nvmem = entry->nvmem;
 	u8 *buf;
 	int rc;
 
 	if (!nvmem)
 		return ERR_PTR(-EINVAL);
 
-	buf = kzalloc(cell->entry->bytes, GFP_KERNEL);
+	buf = kzalloc(max_t(size_t, entry->raw_len, entry->bytes), GFP_KERNEL);
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 0cf9f9490514..8ffb42ba0f62 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -51,6 +51,7 @@ struct nvmem_keepout {
  * struct nvmem_cell_info - NVMEM cell description
  * @name:	Name.
  * @offset:	Offset within the NVMEM device.
+ * @raw_len:	Length of raw data (without post processing).
  * @bytes:	Length of the cell.
  * @bit_offset:	Bit offset if cell is smaller than a byte.
  * @nbits:	Number of bits.
@@ -62,6 +63,7 @@ struct nvmem_keepout {
 struct nvmem_cell_info {
 	const char		*name;
 	unsigned int		offset;
+	size_t			raw_len;
 	unsigned int		bytes;
 	unsigned int		bit_offset;
 	unsigned int		nbits;
-- 
cgit v1.2.3


From 814c978f02db17f16e6aa2efa2a929372f06da09 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 4 Apr 2023 18:21:44 +0100
Subject: nvmem: Add macro to register nvmem layout drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide a module_nvmem_layout_driver() macro at the end of the
nvmem-provider.h header to reduce the boilerplate when registering nvmem
layout drivers.

Suggested-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230404172148.82422-37-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 8ffb42ba0f62..dae26295e6be 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -9,6 +9,7 @@
 #ifndef _LINUX_NVMEM_PROVIDER_H
 #define _LINUX_NVMEM_PROVIDER_H
 
+#include <linux/device/driver.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/gpio/consumer.h>
@@ -242,4 +243,9 @@ nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 }
 
 #endif /* CONFIG_NVMEM */
+
+#define module_nvmem_layout_driver(__layout_driver)		\
+	module_driver(__layout_driver, nvmem_layout_register,	\
+		      nvmem_layout_unregister)
+
 #endif  /* ifndef _LINUX_NVMEM_PROVIDER_H */
-- 
cgit v1.2.3


From 78dfc9d1d1abb9e400386fa9c5724a8f7d75e3b9 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 4 Apr 2023 13:02:46 +0200
Subject: ACPI: video: Add auto_detect arg to __acpi_video_get_backlight_type()

Allow callers of __acpi_video_get_backlight_type() to pass a pointer
to a bool which will get set to false if the backlight-type comes from
the cmdline or a DMI quirk and set to true if auto-detection was used.

And make __acpi_video_get_backlight_type() non static so that it can
be called directly outside of video_detect.c .

While at it turn the acpi_video_get_backlight_type() and
acpi_video_backlight_use_native() wrappers into static inline functions
in include/acpi/video.h, so that we need to export one less symbol.

Fixes: 5aa9d943e9b6 ("ACPI: video: Don't enable fallback path for creating ACPI backlight by default")
Cc: All applicable <stable@vger.kernel.org>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/video_detect.c | 21 ++++++++-------------
 include/acpi/video.h        | 15 +++++++++++++--
 2 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index fd7cbce8076e..f7c218dd8742 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -782,7 +782,7 @@ static bool prefer_native_over_acpi_video(void)
  * Determine which type of backlight interface to use on this system,
  * First check cmdline, then dmi quirks, then do autodetect.
  */
-static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native)
+enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto_detect)
 {
 	static DEFINE_MUTEX(init_mutex);
 	static bool nvidia_wmi_ec_present;
@@ -807,6 +807,9 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native)
 		native_available = true;
 	mutex_unlock(&init_mutex);
 
+	if (auto_detect)
+		*auto_detect = false;
+
 	/*
 	 * The below heuristics / detection steps are in order of descending
 	 * presedence. The commandline takes presedence over anything else.
@@ -818,6 +821,9 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native)
 	if (acpi_backlight_dmi != acpi_backlight_undef)
 		return acpi_backlight_dmi;
 
+	if (auto_detect)
+		*auto_detect = true;
+
 	/* Special cases such as nvidia_wmi_ec and apple gmux. */
 	if (nvidia_wmi_ec_present)
 		return acpi_backlight_nvidia_wmi_ec;
@@ -837,15 +843,4 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native)
 	/* No ACPI video/native (old hw), use vendor specific fw methods. */
 	return acpi_backlight_vendor;
 }
-
-enum acpi_backlight_type acpi_video_get_backlight_type(void)
-{
-	return __acpi_video_get_backlight_type(false);
-}
-EXPORT_SYMBOL(acpi_video_get_backlight_type);
-
-bool acpi_video_backlight_use_native(void)
-{
-	return __acpi_video_get_backlight_type(true) == acpi_backlight_native;
-}
-EXPORT_SYMBOL(acpi_video_backlight_use_native);
+EXPORT_SYMBOL(__acpi_video_get_backlight_type);
diff --git a/include/acpi/video.h b/include/acpi/video.h
index 8ed9bec03e53..ff5a8da5d883 100644
--- a/include/acpi/video.h
+++ b/include/acpi/video.h
@@ -59,8 +59,6 @@ extern void acpi_video_unregister(void);
 extern void acpi_video_register_backlight(void);
 extern int acpi_video_get_edid(struct acpi_device *device, int type,
 			       int device_id, void **edid);
-extern enum acpi_backlight_type acpi_video_get_backlight_type(void);
-extern bool acpi_video_backlight_use_native(void);
 /*
  * Note: The value returned by acpi_video_handles_brightness_key_presses()
  * may change over time and should not be cached.
@@ -69,6 +67,19 @@ extern bool acpi_video_handles_brightness_key_presses(void);
 extern int acpi_video_get_levels(struct acpi_device *device,
 				 struct acpi_video_device_brightness **dev_br,
 				 int *pmax_level);
+
+extern enum acpi_backlight_type __acpi_video_get_backlight_type(bool native,
+								bool *auto_detect);
+
+static inline enum acpi_backlight_type acpi_video_get_backlight_type(void)
+{
+	return __acpi_video_get_backlight_type(false, NULL);
+}
+
+static inline bool acpi_video_backlight_use_native(void)
+{
+	return __acpi_video_get_backlight_type(true, NULL) == acpi_backlight_native;
+}
 #else
 static inline void acpi_video_report_nolcd(void) { return; };
 static inline int acpi_video_register(void) { return -ENODEV; }
-- 
cgit v1.2.3


From 14e985482111a2c6bc75a0348701a4acdc5558d8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 2 Apr 2023 11:42:07 +0200
Subject: clk: Remove mmask and nmask fields in struct clk_fractional_divider

All users of these fields have been removed.
They are now computed when needed with [mn]shift and [mn]width.

This shrinks the size of struct clk_fractional_divider from 72 to 56 bytes.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/680357e5acb338433bfc94114b65b4a4ce2c99e2.1680423909.git.christophe.jaillet@wanadoo.fr
Reviewed-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 842e72a5348f..3271a2bc352f 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1135,10 +1135,8 @@ struct clk_fractional_divider {
 	void __iomem	*reg;
 	u8		mshift;
 	u8		mwidth;
-	u32		mmask;
 	u8		nshift;
 	u8		nwidth;
-	u32		nmask;
 	u8		flags;
 	void		(*approximation)(struct clk_hw *hw,
 				unsigned long rate, unsigned long *parent_rate,
-- 
cgit v1.2.3


From 02992064bdffc97403b6058491094cd41d1a11ef Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 4 Apr 2023 10:45:25 -0500
Subject: PCI: Make pci_bus_for_each_resource() index optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor pci_bus_for_each_resource() in the same way as
pci_dev_for_each_resource(). This allows the index to be hidden inside the
implementation so the caller can omit it when it's not used otherwise.

No functional changes intended.

Link: https://lore.kernel.org/r/20230330162434.35055-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 drivers/pci/bus.c                  |  7 +++----
 drivers/pci/hotplug/shpchp_sysfs.c |  8 ++++----
 drivers/pci/pci.c                  |  3 +--
 drivers/pci/probe.c                |  2 +-
 drivers/pci/setup-bus.c            | 10 ++++------
 include/linux/pci.h                | 24 +++++++++++++++++++-----
 6 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 83ae838ceb5f..f37f23574ad8 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -161,13 +161,13 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, struct resource *res,
 		void *alignf_data,
 		struct pci_bus_region *region)
 {
-	int i, ret;
 	struct resource *r, avail;
 	resource_size_t max;
+	int ret;
 
 	type_mask |= IORESOURCE_TYPE_BITS;
 
-	pci_bus_for_each_resource(bus, r, i) {
+	pci_bus_for_each_resource(bus, r) {
 		resource_size_t min_used = min;
 
 		if (!r)
@@ -268,9 +268,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx)
 	struct resource *res = &dev->resource[idx];
 	struct resource orig_res = *res;
 	struct resource *r;
-	int i;
 
-	pci_bus_for_each_resource(bus, r, i) {
+	pci_bus_for_each_resource(bus, r) {
 		resource_size_t start, end;
 
 		if (!r)
diff --git a/drivers/pci/hotplug/shpchp_sysfs.c b/drivers/pci/hotplug/shpchp_sysfs.c
index 64beed7a26be..01d47a42da04 100644
--- a/drivers/pci/hotplug/shpchp_sysfs.c
+++ b/drivers/pci/hotplug/shpchp_sysfs.c
@@ -24,16 +24,16 @@
 static ssize_t show_ctrl(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct pci_dev *pdev;
-	int index, busnr;
 	struct resource *res;
 	struct pci_bus *bus;
 	size_t len = 0;
+	int busnr;
 
 	pdev = to_pci_dev(dev);
 	bus = pdev->subordinate;
 
 	len += sysfs_emit_at(buf, len, "Free resources: memory\n");
-	pci_bus_for_each_resource(bus, res, index) {
+	pci_bus_for_each_resource(bus, res) {
 		if (res && (res->flags & IORESOURCE_MEM) &&
 				!(res->flags & IORESOURCE_PREFETCH)) {
 			len += sysfs_emit_at(buf, len,
@@ -43,7 +43,7 @@ static ssize_t show_ctrl(struct device *dev, struct device_attribute *attr, char
 		}
 	}
 	len += sysfs_emit_at(buf, len, "Free resources: prefetchable memory\n");
-	pci_bus_for_each_resource(bus, res, index) {
+	pci_bus_for_each_resource(bus, res) {
 		if (res && (res->flags & IORESOURCE_MEM) &&
 			       (res->flags & IORESOURCE_PREFETCH)) {
 			len += sysfs_emit_at(buf, len,
@@ -53,7 +53,7 @@ static ssize_t show_ctrl(struct device *dev, struct device_attribute *attr, char
 		}
 	}
 	len += sysfs_emit_at(buf, len, "Free resources: IO\n");
-	pci_bus_for_each_resource(bus, res, index) {
+	pci_bus_for_each_resource(bus, res) {
 		if (res && (res->flags & IORESOURCE_IO)) {
 			len += sysfs_emit_at(buf, len,
 					     "start = %8.8llx, length = %8.8llx\n",
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7a67611dc5f4..99299f1299c4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -779,9 +779,8 @@ struct resource *pci_find_parent_resource(const struct pci_dev *dev,
 {
 	const struct pci_bus *bus = dev->bus;
 	struct resource *r;
-	int i;
 
-	pci_bus_for_each_resource(bus, r, i) {
+	pci_bus_for_each_resource(bus, r) {
 		if (!r)
 			continue;
 		if (resource_contains(r, res)) {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a3f68b6ba6ac..f8191750f6b7 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -533,7 +533,7 @@ void pci_read_bridge_bases(struct pci_bus *child)
 	pci_read_bridge_mmio_pref(child);
 
 	if (dev->transparent) {
-		pci_bus_for_each_resource(child->parent, res, i) {
+		pci_bus_for_each_resource(child->parent, res) {
 			if (res && res->flags) {
 				pci_bus_add_resource(child, res,
 						     PCI_SUBTRACTIVE_DECODE);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 027b985dd1ee..fdeb121e9175 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -770,9 +770,8 @@ static struct resource *find_bus_resource_of_type(struct pci_bus *bus,
 						  unsigned long type)
 {
 	struct resource *r, *r_assigned = NULL;
-	int i;
 
-	pci_bus_for_each_resource(bus, r, i) {
+	pci_bus_for_each_resource(bus, r) {
 		if (r == &ioport_resource || r == &iomem_resource)
 			continue;
 		if (r && (r->flags & type_mask) == type && !r->parent)
@@ -1204,7 +1203,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head)
 			additional_mmio_pref_size = 0;
 	struct resource *pref;
 	struct pci_host_bridge *host;
-	int hdr_type, i, ret;
+	int hdr_type, ret;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -1228,7 +1227,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head)
 		host = to_pci_host_bridge(bus->bridge);
 		if (!host->size_windows)
 			return;
-		pci_bus_for_each_resource(bus, pref, i)
+		pci_bus_for_each_resource(bus, pref)
 			if (pref && (pref->flags & IORESOURCE_PREFETCH))
 				break;
 		hdr_type = -1;	/* Intentionally invalid - not a PCI device. */
@@ -1333,12 +1332,11 @@ EXPORT_SYMBOL(pci_bus_size_bridges);
 
 static void assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
 {
-	int i;
 	struct resource *parent_r;
 	unsigned long mask = IORESOURCE_IO | IORESOURCE_MEM |
 			     IORESOURCE_PREFETCH;
 
-	pci_bus_for_each_resource(b, parent_r, i) {
+	pci_bus_for_each_resource(b, parent_r) {
 		if (!parent_r)
 			continue;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index db3155562320..c048e70d42b9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1444,11 +1444,21 @@ int devm_request_pci_bus_resources(struct device *dev,
 /* Temporary until new and working PCI SBR API in place */
 int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
 
+#define __pci_bus_for_each_res0(bus, res, ...)				\
+	for (unsigned int __b = 0;					\
+	     (res = pci_bus_resource_n(bus, __b)) || __b < PCI_BRIDGE_RESOURCE_NUM; \
+	     __b++)
+
+#define __pci_bus_for_each_res1(bus, res, __b)				\
+	for (__b = 0;							\
+	     (res = pci_bus_resource_n(bus, __b)) || __b < PCI_BRIDGE_RESOURCE_NUM; \
+	     __b++)
+
 /**
  * pci_bus_for_each_resource - iterate over PCI bus resources
  * @bus: the PCI bus
  * @res: pointer to the current resource
- * @i: index of the current resource
+ * @...: optional index of the current resource
  *
  * Iterate over PCI bus resources. The first part is to go over PCI bus
  * resource array, which has at most the %PCI_BRIDGE_RESOURCE_NUM entries.
@@ -1461,13 +1471,17 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
  *	struct resource *res;
  *	unsigned int i;
  *
+ * 	// With optional index
  * 	pci_bus_for_each_resource(bus, res, i)
  * 		pr_info("PCI bus resource[%u]: %pR\n", i, res);
+ *
+ * 	// Without index
+ * 	pci_bus_for_each_resource(bus, res)
+ * 		_do_something_(res);
  */
-#define pci_bus_for_each_resource(bus, res, i)				\
-	for (i = 0;							\
-	    (res = pci_bus_resource_n(bus, i)) || i < PCI_BRIDGE_RESOURCE_NUM; \
-	     i++)
+#define pci_bus_for_each_resource(bus, res, ...)			\
+	CONCATENATE(__pci_bus_for_each_res, COUNT_ARGS(__VA_ARGS__))	\
+		    (bus, res, __VA_ARGS__)
 
 int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
 			struct resource *res, resource_size_t size,
-- 
cgit v1.2.3


From 3dd4432549415f3c65dd52d5c687629efbf4ece1 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Mon, 27 Feb 2023 09:36:07 -0800
Subject: mm: enable maple tree RCU mode by default

Use the maple tree in RCU mode for VMA tracking.

The maple tree tracks the stack and is able to update the pivot
(lower/upper boundary) in-place to allow the page fault handler to write
to the tree while holding just the mmap read lock.  This is safe as the
writes to the stack have a guard VMA which ensures there will always be a
NULL in the direction of the growth and thus will only update a pivot.

It is possible, but not recommended, to have VMAs that grow up/down
without guard VMAs.  syzbot has constructed a testcase which sets up a VMA
to grow and consume the empty space.  Overwriting the entire NULL entry
causes the tree to be altered in a way that is not safe for concurrent
readers; the readers may see a node being rewritten or one that does not
match the maple state they are using.

Enabling RCU mode allows the concurrent readers to see a stable node and
will return the expected result.

[Liam.Howlett@Oracle.com: we don't need to free the nodes with RCU[
Link: https://lore.kernel.org/linux-mm/000000000000b0a65805f663ace6@google.com/
Link: https://lkml.kernel.org/r/20230227173632.3292573-9-surenb@google.com
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 3 ++-
 kernel/fork.c            | 3 +++
 mm/mmap.c                | 3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0722859c3647..a57e6ae78e65 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -774,7 +774,8 @@ struct mm_struct {
 	unsigned long cpu_bitmap[];
 };
 
-#define MM_MT_FLAGS	(MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
+#define MM_MT_FLAGS	(MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
+			 MT_FLAGS_USE_RCU)
 extern struct mm_struct init_mm;
 
 /* Pointer magic because the dynamic array size confuses some compilers. */
diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..0c92f224c68c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	if (retval)
 		goto out;
 
+	mt_clear_in_rcu(vmi.mas.tree);
 	for_each_vma(old_vmi, mpnt) {
 		struct file *file;
 
@@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
+	if (!retval)
+		mt_set_in_rcu(vmi.mas.tree);
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
diff --git a/mm/mmap.c b/mm/mmap.c
index ad499f7b767f..ff68a67a2a7c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	int count = 0;
 	int error = -ENOMEM;
 	MA_STATE(mas_detach, &mt_detach, 0, 0);
-	mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
 	mt_set_external_lock(&mt_detach, &mm->mmap_lock);
 
 	/*
@@ -3037,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmap_write_lock(mm);
+	mt_clear_in_rcu(&mm->mm_mt);
 	free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
 		      USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb);
-- 
cgit v1.2.3


From 10739ea3132861b1344264b110ea48d951a0e3b0 Mon Sep 17 00:00:00 2001
From: Shenwei Wang <shenwei.wang@nxp.com>
Date: Mon, 3 Apr 2023 17:23:01 -0500
Subject: net: stmmac: add support for platform specific reset

This patch adds support for platform-specific reset logic in the
stmmac driver. Some SoCs require a different reset mechanism than
the standard dwmac IP reset. To support these platforms, a new function
pointer 'fix_soc_reset' is added to the plat_stmmacenet_data structure.
The stmmac_reset in hwif.h is modified to call the 'fix_soc_reset'
function if it exists. This enables the driver to use the platform-specific
reset logic when necessary.

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20230403222302.328262-1-shenwei.wang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.c | 13 +++++++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h |  3 +--
 include/linux/stmmac.h                     |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index bb7114f970f8..b8ba8f2d8041 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -87,6 +87,19 @@ static int stmmac_dwxlgmac_quirks(struct stmmac_priv *priv)
 	return 0;
 }
 
+int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr)
+{
+	struct plat_stmmacenet_data *plat = priv ? priv->plat : NULL;
+
+	if (!priv)
+		return -EINVAL;
+
+	if (plat && plat->fix_soc_reset)
+		return plat->fix_soc_reset(plat, ioaddr);
+
+	return stmmac_do_callback(priv, dma, reset, ioaddr);
+}
+
 static const struct stmmac_hwif_entry {
 	bool gmac;
 	bool gmac4;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 16a7421715cb..1cc286b000b6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -214,8 +214,6 @@ struct stmmac_dma_ops {
 	int (*enable_tbs)(void __iomem *ioaddr, bool en, u32 chan);
 };
 
-#define stmmac_reset(__priv, __args...) \
-	stmmac_do_callback(__priv, dma, reset, __args)
 #define stmmac_dma_init(__priv, __args...) \
 	stmmac_do_void_callback(__priv, dma, init, __args)
 #define stmmac_init_chan(__priv, __args...) \
@@ -640,6 +638,7 @@ extern const struct stmmac_mmc_ops dwxgmac_mmc_ops;
 #define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC4_VERSION		0x00000110	/* GMAC4+ CORE Version */
 
+int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr);
 int stmmac_hwif_init(struct stmmac_priv *priv);
 
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a2414c187483..dafa001e9e7a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -223,6 +223,7 @@ struct plat_stmmacenet_data {
 	struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES];
 	struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES];
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
+	int (*fix_soc_reset)(void *priv, void __iomem *ioaddr);
 	int (*serdes_powerup)(struct net_device *ndev, void *priv);
 	void (*serdes_powerdown)(struct net_device *ndev, void *priv);
 	void (*speed_mode_2500)(struct net_device *ndev, void *priv);
-- 
cgit v1.2.3


From 263e721e3ba1f3b42ad61aed3d504f3c8c9c0eb6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 15:34:05 +0100
Subject: mm: make mapping_get_entry available outside of filemap.c

mapping_get_entry is useful for page cache API users that need to know
about xa_value internals.  Rename it and make it available in pagemap.h.

Link: https://lkml.kernel.org/r/20230307143410.28031-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 1 +
 mm/filemap.c            | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 853184a46411..bcfd798ce0b9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -507,6 +507,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_ENTRY		0x00000080
 #define FGP_STABLE		0x00000100
 
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		int fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
diff --git a/mm/filemap.c b/mm/filemap.c
index 2723104cc06a..a674108a4d52 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1836,7 +1836,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
  */
 
 /*
- * mapping_get_entry - Get a page cache entry.
+ * filemap_get_entry - Get a page cache entry.
  * @mapping: the address_space to search
  * @index: The page cache index.
  *
@@ -1847,7 +1847,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
  *
  * Return: The folio, swap or shadow entry, %NULL if nothing is found.
  */
-static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	struct folio *folio;
@@ -1917,7 +1917,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 	struct folio *folio;
 
 repeat:
-	folio = mapping_get_entry(mapping, index);
+	folio = filemap_get_entry(mapping, index);
 	if (xa_is_value(folio)) {
 		if (fgp_flags & FGP_ENTRY)
 			return folio;
-- 
cgit v1.2.3


From 48c9d11375fc66f1e59d0e9b27d121e015a50904 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 15:34:09 +0100
Subject: mm: remove FGP_ENTRY

FGP_ENTRY is unused now, so remove it.

Link: https://lkml.kernel.org/r/20230307143410.28031-7-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 3 +--
 mm/filemap.c            | 7 +------
 mm/folio-compat.c       | 4 ++--
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcfd798ce0b9..306a0f63cea8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,8 +504,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOFS		0x00000010
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
-#define FGP_ENTRY		0x00000080
-#define FGP_STABLE		0x00000100
+#define FGP_STABLE		0x00000080
 
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
diff --git a/mm/filemap.c b/mm/filemap.c
index a674108a4d52..ac161b50f5bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1891,8 +1891,6 @@ out:
  *
  * * %FGP_ACCESSED - The folio will be marked accessed.
  * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- *   instead of allocating a new folio to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1918,11 +1916,8 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 
 repeat:
 	folio = filemap_get_entry(mapping, index);
-	if (xa_is_value(folio)) {
-		if (fgp_flags & FGP_ENTRY)
-			return folio;
+	if (xa_is_value(folio))
 		folio = NULL;
-	}
 	if (!folio)
 		goto no_page;
 
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index cabcd1de9ecb..1754daa85d35 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -97,8 +97,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 	struct folio *folio;
 
 	folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
-	if (!folio || xa_is_value(folio))
-		return &folio->page;
+	if (!folio)
+		return NULL;
 	return folio_file_page(folio, index);
 }
 EXPORT_SYMBOL(pagecache_get_page);
-- 
cgit v1.2.3


From 66dabbb65d673aef40dd17bf62c042be8f6d4a4b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 15:34:10 +0100
Subject: mm: return an ERR_PTR from __filemap_get_folio

Instead of returning NULL for all errors, distinguish between:

 - no entry found and not asked to allocated (-ENOENT)
 - failed to allocate memory (-ENOMEM)
 - would block (-EAGAIN)

so that callers don't have to guess the error based on the passed in
flags.

Also pass through the error through the direct callers: filemap_get_folio,
filemap_lock_folio filemap_grab_folio and filemap_get_incore_folio.

[hch@lst.de: fix null-pointer deref]
  Link: https://lkml.kernel.org/r/20230310070023.GA13563@lst.de
  Link: https://lkml.kernel.org/r/20230310043137.GA1624890@u2004
Link: https://lkml.kernel.org/r/20230307143410.28031-8-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> [nilfs2]
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/afs/dir.c             | 10 +++++-----
 fs/afs/dir_edit.c        |  2 +-
 fs/afs/write.c           |  4 ++--
 fs/ext4/inode.c          |  2 +-
 fs/ext4/move_extent.c    |  8 ++++----
 fs/hugetlbfs/inode.c     |  2 +-
 fs/iomap/buffered-io.c   | 11 ++---------
 fs/netfs/buffered_read.c |  4 ++--
 fs/nfs/file.c            |  4 ++--
 fs/nilfs2/page.c         |  6 +++---
 include/linux/pagemap.h  | 11 ++++++-----
 mm/filemap.c             | 14 ++++++++------
 mm/folio-compat.c        |  2 +-
 mm/huge_memory.c         |  2 +-
 mm/hugetlb.c             |  6 ++++--
 mm/memcontrol.c          |  2 +-
 mm/mincore.c             |  2 +-
 mm/shmem.c               |  4 ++--
 mm/swap_state.c          | 17 ++++++++++-------
 mm/swapfile.c            |  4 ++--
 mm/truncate.c            | 15 ++++++++-------
 21 files changed, 67 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 82690d1dd49a..f92b9e62d567 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -319,16 +319,16 @@ expand:
 		struct folio *folio;
 
 		folio = filemap_get_folio(mapping, i);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
 				afs_stat_v(dvnode, n_inval);
-
-			ret = -ENOMEM;
 			folio = __filemap_get_folio(mapping,
 						    i, FGP_LOCK | FGP_CREAT,
 						    mapping->gfp_mask);
-			if (!folio)
+			if (IS_ERR(folio)) {
+				ret = PTR_ERR(folio);
 				goto error;
+			}
 			folio_attach_private(folio, (void *)1);
 			folio_unlock(folio);
 		}
@@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
 		 */
 		folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
 					    FGP_ACCESSED, 0);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
 			break;
 		}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 0ab7752d1b75..f0eddccbdd95 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
 	folio = __filemap_get_folio(mapping, index,
 				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 				    mapping->gfp_mask);
-	if (!folio)
+	if (IS_ERR(folio))
 		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	else if (folio && !folio_test_private(folio))
 		folio_attach_private(folio, (void *)1);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 571f3b9a417e..c822d6006033 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping,
 		_debug("kill %lx (to %lx)", index, last);
 
 		folio = filemap_get_folio(mapping, index);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			next = index + 1;
 			continue;
 		}
@@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
 		_debug("redirty %llx @%llx", len, start);
 
 		folio = filemap_get_folio(mapping, index);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			next = index + 1;
 			continue;
 		}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf0b7dea4900..d7973743417b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5395,7 +5395,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 	while (1) {
 		struct folio *folio = filemap_lock_folio(inode->i_mapping,
 				      inode->i_size >> PAGE_SHIFT);
-		if (!folio)
+		if (IS_ERR(folio))
 			return;
 		ret = __ext4_journalled_invalidate_folio(folio, offset,
 						folio_size(folio) - offset);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2de9829aed63..7bf6d069199c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,18 +141,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
 	flags = memalloc_nofs_save();
 	folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
 			mapping_gfp_mask(mapping[0]));
-	if (!folio[0]) {
+	if (IS_ERR(folio[0])) {
 		memalloc_nofs_restore(flags);
-		return -ENOMEM;
+		return PTR_ERR(folio[0]);
 	}
 
 	folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
 			mapping_gfp_mask(mapping[1]));
 	memalloc_nofs_restore(flags);
-	if (!folio[1]) {
+	if (IS_ERR(folio[1])) {
 		folio_unlock(folio[0]);
 		folio_put(folio[0]);
-		return -ENOMEM;
+		return PTR_ERR(folio[1]);
 	}
 	/*
 	 * __filemap_get_folio() may not wait on folio's writeback if
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..702d79639c0d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
 	struct folio *folio;
 
 	folio = filemap_lock_folio(mapping, idx);
-	if (!folio)
+	if (IS_ERR(folio))
 		return;
 
 	start = start & ~huge_page_mask(h);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6f4c97a6d7e9..96bb56c203f4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
 {
 	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
-	struct folio *folio;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
 
-	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
-	if (folio)
-		return folio;
-
-	if (iter->flags & IOMAP_NOWAIT)
-		return ERR_PTR(-EAGAIN);
-	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
 
@@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
 		/* grab locked page */
 		folio = filemap_lock_folio(inode->i_mapping,
 				start_byte >> PAGE_SHIFT);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
 					PAGE_SIZE;
 			continue;
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 7679a68e8193..209726a9cfdb 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -350,8 +350,8 @@ int netfs_write_begin(struct netfs_inode *ctx,
 retry:
 	folio = __filemap_get_folio(mapping, index, fgp_flags,
 				    mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	if (ctx->ops->check_write_begin) {
 		/* Allow the netfs (eg. ceph) to flush conflicts. */
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 893625eacab9..1d03406e6c03 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -336,8 +336,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 
 start:
 	folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT);
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 	*pagep = &folio->page;
 
 	ret = nfs_flush_incompatible(file, folio);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 41ccd43cd979..5cf30827f244 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -259,10 +259,10 @@ repeat:
 			NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
 
 		dfolio = filemap_grab_folio(dmap, folio->index);
-		if (unlikely(!dfolio)) {
+		if (unlikely(IS_ERR(dfolio))) {
 			/* No empty page is added to the page cache */
-			err = -ENOMEM;
 			folio_unlock(folio);
+			err = PTR_ERR(dfolio);
 			break;
 		}
 		if (unlikely(!folio_buffers(folio)))
@@ -311,7 +311,7 @@ repeat:
 
 		folio_lock(folio);
 		dfolio = filemap_lock_folio(dmap, index);
-		if (dfolio) {
+		if (!IS_ERR(dfolio)) {
 			/* overwrite existing folio in the destination cache */
 			WARN_ON(folio_test_dirty(dfolio));
 			nilfs_copy_page(&dfolio->page, &folio->page, 0);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 306a0f63cea8..fdcd595d2294 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -520,7 +520,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
  * Looks up the page cache entry at @mapping & @index.  If a folio is
  * present, it is returned with an increased refcount.
  *
- * Otherwise, %NULL is returned.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index.  Will not return a shadow, swap or DAX entry.
  */
 static inline struct folio *filemap_get_folio(struct address_space *mapping,
 					pgoff_t index)
@@ -537,8 +538,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping,
  * present, it is returned locked with an increased refcount.
  *
  * Context: May sleep.
- * Return: A folio or %NULL if there is no folio in the cache for this
- * index.  Will not return a shadow, swap or DAX entry.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index.  Will not return a shadow, swap or DAX entry.
  */
 static inline struct folio *filemap_lock_folio(struct address_space *mapping,
 					pgoff_t index)
@@ -555,8 +556,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping,
  * a new folio is created. The folio is locked, marked as accessed, and
  * returned.
  *
- * Return: A found or created folio. NULL if no folio is found and failed to
- * create a folio.
+ * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
+ * and failed to create a folio.
  */
 static inline struct folio *filemap_grab_folio(struct address_space *mapping,
 					pgoff_t index)
diff --git a/mm/filemap.c b/mm/filemap.c
index ac161b50f5bc..a34abfe8c654 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1907,7 +1907,7 @@ out:
  *
  * If there is a page cache page, it is returned with an increased refcount.
  *
- * Return: The found folio or %NULL otherwise.
+ * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		int fgp_flags, gfp_t gfp)
@@ -1925,7 +1925,7 @@ repeat:
 		if (fgp_flags & FGP_NOWAIT) {
 			if (!folio_trylock(folio)) {
 				folio_put(folio);
-				return NULL;
+				return ERR_PTR(-EAGAIN);
 			}
 		} else {
 			folio_lock(folio);
@@ -1964,7 +1964,7 @@ no_page:
 
 		folio = filemap_alloc_folio(gfp, 0);
 		if (!folio)
-			return NULL;
+			return ERR_PTR(-ENOMEM);
 
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
@@ -1989,6 +1989,8 @@ no_page:
 			folio_unlock(folio);
 	}
 
+	if (!folio)
+		return ERR_PTR(-ENOENT);
 	return folio;
 }
 EXPORT_SYMBOL(__filemap_get_folio);
@@ -3258,7 +3260,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	folio = filemap_get_folio(mapping, index);
-	if (likely(folio)) {
+	if (likely(!IS_ERR(folio))) {
 		/*
 		 * We found the page, so try async readahead before waiting for
 		 * the lock.
@@ -3287,7 +3289,7 @@ retry_find:
 		folio = __filemap_get_folio(mapping, index,
 					  FGP_CREAT|FGP_FOR_MMAP,
 					  vmf->gfp_mask);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			if (fpin)
 				goto out_retry;
 			filemap_invalidate_unlock_shared(mapping);
@@ -3638,7 +3640,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
 		filler = mapping->a_ops->read_folio;
 repeat:
 	folio = filemap_get_folio(mapping, index);
-	if (!folio) {
+	if (IS_ERR(folio)) {
 		folio = filemap_alloc_folio(gfp, 0);
 		if (!folio)
 			return ERR_PTR(-ENOMEM);
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 1754daa85d35..2511c055a35f 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -97,7 +97,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 	struct folio *folio;
 
 	folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
-	if (!folio)
+	if (IS_ERR(folio))
 		return NULL;
 	return folio_file_page(folio, index);
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 70008dd7f215..2d860e70fe88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3092,7 +3092,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
 		struct folio *folio = filemap_get_folio(mapping, index);
 
 		nr_pages = 1;
-		if (!folio)
+		if (IS_ERR(folio))
 			continue;
 
 		if (!folio_test_large(folio))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..712e32b38295 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5780,7 +5780,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	 */
 	new_folio = false;
 	folio = filemap_lock_folio(mapping, idx);
-	if (!folio) {
+	if (IS_ERR(folio)) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
 			goto out;
@@ -6071,6 +6071,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		vma_end_reservation(h, vma, haddr);
 
 		pagecache_folio = filemap_lock_folio(mapping, idx);
+		if (IS_ERR(pagecache_folio))
+			pagecache_folio = NULL;
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
@@ -6182,7 +6184,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (is_continue) {
 		ret = -EFAULT;
 		folio = filemap_lock_folio(mapping, idx);
-		if (!folio)
+		if (IS_ERR(folio))
 			goto out;
 		folio_in_pagecache = true;
 	} else if (!*pagep) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13ec89c45389..0524add35cae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5705,7 +5705,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 	/* shmem/tmpfs may report page out on swap: account for that too. */
 	index = linear_page_index(vma, addr);
 	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
-	if (!folio)
+	if (IS_ERR(folio))
 		return NULL;
 	return folio_file_page(folio, index);
 }
diff --git a/mm/mincore.c b/mm/mincore.c
index d359650b0f75..2d5be013a25a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -61,7 +61,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
 	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
 	 */
 	folio = filemap_get_incore_folio(mapping, index);
-	if (folio) {
+	if (!IS_ERR(folio)) {
 		present = folio_test_uptodate(folio);
 		folio_put(folio);
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 93cb39852a16..fa6e38f2f55f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -605,7 +605,7 @@ next:
 
 		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
 		folio = filemap_get_folio(inode->i_mapping, index);
-		if (!folio)
+		if (IS_ERR(folio))
 			goto drop;
 
 		/* No huge page at the end of the file: nothing to split */
@@ -3214,7 +3214,7 @@ static const char *shmem_get_link(struct dentry *dentry,
 
 	if (!dentry) {
 		folio = filemap_get_folio(inode->i_mapping, 0);
-		if (!folio)
+		if (IS_ERR(folio))
 			return ERR_PTR(-ECHILD);
 		if (PageHWPoison(folio_page(folio, 0)) ||
 		    !folio_test_uptodate(folio)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 92234f4b51d2..b76a65ac28b3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -336,7 +336,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
 	struct folio *folio;
 
 	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
-	if (folio) {
+	if (!IS_ERR(folio)) {
 		bool vma_ra = swap_use_vma_readahead();
 		bool readahead;
 
@@ -366,6 +366,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
 			if (!vma || !vma_ra)
 				atomic_inc(&swapin_readahead_hits);
 		}
+	} else {
+		folio = NULL;
 	}
 
 	return folio;
@@ -388,23 +390,24 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 	struct swap_info_struct *si;
 	struct folio *folio = filemap_get_entry(mapping, index);
 
+	if (!folio)
+		return ERR_PTR(-ENOENT);
 	if (!xa_is_value(folio))
-		goto out;
+		return folio;
 	if (!shmem_mapping(mapping))
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	swp = radix_to_swp_entry(folio);
 	/* There might be swapin error entries in shmem mapping. */
 	if (non_swap_entry(swp))
-		return NULL;
+		return ERR_PTR(-ENOENT);
 	/* Prevent swapoff from happening to us */
 	si = get_swap_device(swp);
 	if (!si)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 	index = swp_offset(swp);
 	folio = filemap_get_folio(swap_address_space(swp), index);
 	put_swap_device(si);
-out:
 	return folio;
 }
 
@@ -431,7 +434,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		folio = filemap_get_folio(swap_address_space(entry),
 						swp_offset(entry));
 		put_swap_device(si);
-		if (folio)
+		if (!IS_ERR(folio))
 			return folio_file_page(folio, swp_offset(entry));
 
 		/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c1b97436f811..00b3e46becad 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -136,7 +136,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	int ret = 0;
 
 	folio = filemap_get_folio(swap_address_space(entry), offset);
-	if (!folio)
+	if (IS_ERR(folio))
 		return 0;
 	/*
 	 * When this function is called from scan_swap_map_slots() and it's
@@ -2095,7 +2095,7 @@ retry:
 
 		entry = swp_entry(type, i);
 		folio = filemap_get_folio(swap_address_space(entry), i);
-		if (!folio)
+		if (IS_ERR(folio))
 			continue;
 
 		/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 7b4ea4c4a46b..86de31ed4d32 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -375,7 +375,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
 	folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
-	if (folio) {
+	if (!IS_ERR(folio)) {
 		same_folio = lend < folio_pos(folio) + folio_size(folio);
 		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
 			start = folio->index + folio_nr_pages(folio);
@@ -387,14 +387,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		folio = NULL;
 	}
 
-	if (!same_folio)
+	if (!same_folio) {
 		folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
 						FGP_LOCK, 0);
-	if (folio) {
-		if (!truncate_inode_partial_folio(folio, lstart, lend))
-			end = folio->index;
-		folio_unlock(folio);
-		folio_put(folio);
+		if (!IS_ERR(folio)) {
+			if (!truncate_inode_partial_folio(folio, lstart, lend))
+				end = folio->index;
+			folio_unlock(folio);
+			folio_put(folio);
+		}
 	}
 
 	index = start;
-- 
cgit v1.2.3


From 2bad466cc9d9b4c3b4b16eb9c03c919b59561316 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 9 Mar 2023 17:37:10 -0500
Subject: mm/uffd: UFFD_FEATURE_WP_UNPOPULATED

Patch series "mm/uffd: Add feature bit UFFD_FEATURE_WP_UNPOPULATED", v4.

The new feature bit makes anonymous memory acts the same as file memory on
userfaultfd-wp in that it'll also wr-protect none ptes.

It can be useful in two cases:

(1) Uffd-wp app that needs to wr-protect none ptes like QEMU snapshot,
    so pre-fault can be replaced by enabling this flag and speed up
    protections

(2) It helps to implement async uffd-wp mode that Muhammad is working on [1]

It's debatable whether this is the most ideal solution because with the
new feature bit set, wr-protect none pte needs to pre-populate the
pgtables to the last level (PAGE_SIZE).  But it seems fine so far to
service either purpose above, so we can leave optimizations for later.

The series brings pte markers to anonymous memory too.  There's some
change in the common mm code path in the 1st patch, great to have some eye
looking at it, but hopefully they're still relatively straightforward.


This patch (of 2):

This is a new feature that controls how uffd-wp handles none ptes.  When
it's set, the kernel will handle anonymous memory the same way as file
memory, by allowing the user to wr-protect unpopulated ptes.

File memories handles none ptes consistently by allowing wr-protecting of
none ptes because of the unawareness of page cache being exist or not.
For anonymous it was not as persistent because we used to assume that we
don't need protections on none ptes or known zero pages.

One use case of such a feature bit was VM live snapshot, where if without
wr-protecting empty ptes the snapshot can contain random rubbish in the
holes of the anonymous memory, which can cause misbehave of the guest when
the guest OS assumes the pages should be all zeros.

QEMU worked it around by pre-populate the section with reads to fill in
zero page entries before starting the whole snapshot process [1].

Recently there's another need raised on using userfaultfd wr-protect for
detecting dirty pages (to replace soft-dirty in some cases) [2].  In that
case if without being able to wr-protect none ptes by default, the dirty
info can get lost, since we cannot treat every none pte to be dirty (the
current design is identify a page dirty based on uffd-wp bit being
cleared).

In general, we want to be able to wr-protect empty ptes too even for
anonymous.

This patch implements UFFD_FEATURE_WP_UNPOPULATED so that it'll make
uffd-wp handling on none ptes being consistent no matter what the memory
type is underneath.  It doesn't have any impact on file memories so far
because we already have pte markers taking care of that.  So it only
affects anonymous.

The feature bit is by default off, so the old behavior will be maintained.
Sometimes it may be wanted because the wr-protect of none ptes will
contain overheads not only during UFFDIO_WRITEPROTECT (by applying pte
markers to anonymous), but also on creating the pgtables to store the pte
markers.  So there's potentially less chance of using thp on the first
fault for a none pmd or larger than a pmd.

The major implementation part is teaching the whole kernel to understand
pte markers even for anonymously mapped ranges, meanwhile allowing the
UFFDIO_WRITEPROTECT ioctl to apply pte markers for anonymous too when the
new feature bit is set.

Note that even if the patch subject starts with mm/uffd, there're a few
small refactors to major mm path of handling anonymous page faults.  But
they should be straightforward.

With WP_UNPOPUATED, application like QEMU can avoid pre-read faults all
the memory before wr-protect during taking a live snapshot.  Quotting from
Muhammad's test result here [3] based on a simple program [4]:

  (1) With huge page disabled
  echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
  ./uffd_wp_perf
  Test DEFAULT: 4
  Test PRE-READ: 1111453 (pre-fault 1101011)
  Test MADVISE: 278276 (pre-fault 266378)
  Test WP-UNPOPULATE: 11712

  (2) With Huge page enabled
  echo always > /sys/kernel/mm/transparent_hugepage/enabled
  ./uffd_wp_perf
  Test DEFAULT: 4
  Test PRE-READ: 22521 (pre-fault 22348)
  Test MADVISE: 4909 (pre-fault 4743)
  Test WP-UNPOPULATE: 14448

There'll be a great perf boost for no-thp case, while for thp enabled with
extreme case of all-thp-zero WP_UNPOPULATED can be slower than MADVISE,
but that's low possibility in reality, also the overhead was not reduced
but postponed until a follow up write on any huge zero thp, so potentially
it is faster by making the follow up writes slower.

[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@virtuozzo.com/
[2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/
[3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@collabora.com/
[4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c

[peterx@redhat.com: comment changes, oneliner fix to khugepaged]
  Link: https://lkml.kernel.org/r/ZB2/8jPhD3fpx5U8@x1n
Link: https://lkml.kernel.org/r/20230309223711.823547-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20230309223711.823547-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/userfaultfd.rst | 17 +++++++++
 fs/userfaultfd.c                             | 16 ++++++++
 include/linux/mm_inline.h                    |  6 +++
 include/linux/userfaultfd_k.h                | 23 ++++++++++++
 include/uapi/linux/userfaultfd.h             | 10 ++++-
 mm/khugepaged.c                              |  2 +-
 mm/memory.c                                  | 56 +++++++++++++++++++++-------
 mm/mprotect.c                                | 51 ++++++++++++++++++++-----
 8 files changed, 155 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 7dc823b56ca4..bd2226299583 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter
 you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was
 used.
 
+Userfaultfd write-protect mode currently behave differently on none ptes
+(when e.g. page is missing) over different types of memories.
+
+For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes
+(e.g. when pages are missing and not populated).  For file-backed memories
+like shmem and hugetlbfs, none ptes will be write protected just like a
+present pte.  In other words, there will be a userfaultfd write fault
+message generated when writing to a missing page on file typed memories,
+as long as the page range was write-protected before.  Such a message will
+not be generated on anonymous memories by default.
+
+If the application wants to be able to write protect none ptes on anonymous
+memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ.  On
+newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
+and set the feature bit in advance to make sure none ptes will also be
+write protected even upon anonymous memory.
+
 QEMU/KVM
 ========
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 44d1ee429eb0..881e9c82b9d1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
 	return ctx->features & UFFD_FEATURE_INITIALIZED;
 }
 
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	if (!ctx)
+		return false;
+
+	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 				     vm_flags_t flags)
 {
@@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 #endif
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
 #endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index de1e622dd366..0e1d239a882c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
 	/* The current status of the pte should be "cleared" before calling */
 	WARN_ON_ONCE(!pte_none(*pte));
 
+	/*
+	 * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
+	 * thing, because when zapping either it means it's dropping the
+	 * page, or in TTU where the present pte will be quickly replaced
+	 * with a swap pte.  There's no way of leaking the bit.
+	 */
 	if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
 		return;
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3767f18114ef..0cf8880219da 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, struct list_head *uf);
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
+extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
 
 #else /* CONFIG_USERFAULTFD */
 
@@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+	/* Only wr-protect mode uses pte markers */
+	if (!userfaultfd_wp(vma))
+		return false;
+
+	/* File-based uffd-wp always need markers */
+	if (!vma_is_anonymous(vma))
+		return true;
+
+	/*
+	 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED
+	 * enabled (to apply markers on zero pages).
+	 */
+	return userfaultfd_wp_unpopulated(vma);
+}
+
 static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
 {
 #ifdef CONFIG_PTE_MARKER_UFFD_WP
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 005e5e306266..90c958952bfc 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -38,7 +38,8 @@
 			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
 			   UFFD_FEATURE_MINOR_SHMEM |		\
 			   UFFD_FEATURE_EXACT_ADDRESS |		\
-			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
+			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
+			   UFFD_FEATURE_WP_UNPOPULATED)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -203,6 +204,12 @@ struct uffdio_api {
 	 *
 	 * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
 	 * write-protection mode is supported on both shmem and hugetlbfs.
+	 *
+	 * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd
+	 * write-protection mode will always apply to unpopulated pages
+	 * (i.e. empty ptes).  This will be the default behavior for shmem
+	 * & hugetlbfs, so this flag only affects anonymous memory behavior
+	 * when userfault write-protection mode is registered.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -217,6 +224,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
 #define UFFD_FEATURE_EXACT_ADDRESS		(1<<11)
 #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM		(1<<12)
+#define UFFD_FEATURE_WP_UNPOPULATED		(1<<13)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 074ea534f786..c7317678cb10 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1177,7 +1177,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				 * enabled swap entries.  Please see
 				 * comment below for pte_uffd_wp().
 				 */
-				if (pte_swp_uffd_wp(pteval)) {
+				if (pte_swp_uffd_wp_any(pteval)) {
 					result = SCAN_PTE_UFFD_WP;
 					goto out_unmap;
 				}
diff --git a/mm/memory.c b/mm/memory.c
index 6285cad1f4fb..a890b2951b53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map);
 #endif
 
 static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if the original pte was a uffd-wp pte marker (so the pte was
+ * wr-protected).
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+		return false;
+
+	return pte_marker_uffd_wp(vmf->orig_pte);
+}
 
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
@@ -1346,6 +1360,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *pte,
 			      struct zap_details *details, pte_t pteval)
 {
+	/* Zap on anonymous always means dropping everything */
+	if (vma_is_anonymous(vma))
+		return;
+
 	if (zap_drop_file_uffd_wp(details))
 		return;
 
@@ -1452,8 +1470,12 @@ again:
 				continue;
 			rss[mm_counter(page)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
-			/* Only drop the uffd-wp marker if explicitly requested */
-			if (!zap_drop_file_uffd_wp(details))
+			/*
+			 * For anon: always drop the marker; for file: only
+			 * drop the marker if explicitly requested.
+			 */
+			if (!vma_is_anonymous(vma) &&
+			    !zap_drop_file_uffd_wp(details))
 				continue;
 		} else if (is_hwpoison_entry(entry) ||
 			   is_swapin_error_entry(entry)) {
@@ -3620,6 +3642,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 	return 0;
 }
 
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+	if (vma_is_anonymous(vmf->vma))
+		return do_anonymous_page(vmf);
+	else
+		return do_fault(vmf);
+}
+
 /*
  * This is actually a page-missing access, but with uffd-wp special pte
  * installed.  It means this pte was wr-protected before being unmapped.
@@ -3630,11 +3660,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
 	 * Just in case there're leftover special ptes even after the region
 	 * got unregistered - we can simply clear them.
 	 */
-	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+	if (unlikely(!userfaultfd_wp(vmf->vma)))
 		return pte_marker_clear(vmf);
 
-	/* do_fault() can handle pte markers too like none pte */
-	return do_fault(vmf);
+	return do_pte_missing(vmf);
 }
 
 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -3999,6 +4028,7 @@ out_release:
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
+	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio;
 	vm_fault_t ret = 0;
@@ -4032,7 +4062,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 						vma->vm_page_prot));
 		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				vmf->address, &vmf->ptl);
-		if (!pte_none(*vmf->pte)) {
+		if (vmf_pte_changed(vmf)) {
 			update_mmu_tlb(vma, vmf->address, vmf->pte);
 			goto unlock;
 		}
@@ -4072,7 +4102,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);
-	if (!pte_none(*vmf->pte)) {
+	if (vmf_pte_changed(vmf)) {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
 		goto release;
 	}
@@ -4092,6 +4122,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	folio_add_new_anon_rmap(folio, vma, vmf->address);
 	folio_add_lru_vma(folio, vma);
 setpte:
+	if (uffd_wp)
+		entry = pte_mkuffd_wp(entry);
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
 	/* No need to invalidate - it was non-present before */
@@ -4259,7 +4291,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool prefault = vmf->address != addr;
 	pte_t entry;
@@ -4903,12 +4935,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		}
 	}
 
-	if (!vmf->pte) {
-		if (vma_is_anonymous(vmf->vma))
-			return do_anonymous_page(vmf);
-		else
-			return do_fault(vmf);
-	}
+	if (!vmf->pte)
+		return do_pte_missing(vmf);
 
 	if (!pte_present(vmf->orig_pte))
 		return do_swap_page(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 13e84d8c0797..b9da9a5f87fe 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb,
 		} else {
 			/* It must be an none page, or what else?.. */
 			WARN_ON_ONCE(!pte_none(oldpte));
-			if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+
+			/*
+			 * Nobody plays with any none ptes besides
+			 * userfaultfd when applying the protections.
+			 */
+			if (likely(!uffd_wp))
+				continue;
+
+			if (userfaultfd_wp_use_markers(vma)) {
 				/*
 				 * For file-backed mem, we need to be able to
 				 * wr-protect a none pte, because even if the
@@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
 	return 0;
 }
 
-/* Return true if we're uffd wr-protecting file-backed memory, or false */
+/*
+ * Return true if we want to split THPs into PTE mappings in change
+ * protection procedure, false otherwise.
+ */
 static inline bool
-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
 {
+	/*
+	 * pte markers only resides in pte level, if we need pte markers,
+	 * we need to split.  We cannot wr-protect shmem thp because file
+	 * thp is handled differently when split by erasing the pmd so far.
+	 */
 	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
 }
 
 /*
- * If wr-protecting the range for file-backed, populate pgtable for the case
- * when pgtable is empty but page cache exists.  When {pte|pmd|...}_alloc()
- * failed we treat it the same way as pgtable allocation failures during
- * page faults by kicking OOM and returning error.
+ * Return true if we want to populate pgtables in change protection
+ * procedure, false otherwise
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+	/* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+	if (!(cp_flags & MM_CP_UFFD_WP))
+		return false;
+
+	/* Populate if the userfaultfd mode requires pte markers */
+	return userfaultfd_wp_use_markers(vma);
+}
+
+/*
+ * Populate the pgtable underneath for whatever reason if requested.
+ * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable
+ * allocation failures during page faults by kicking OOM and returning
+ * error.
  */
 #define  change_pmd_prepare(vma, pmd, cp_flags)				\
 	({								\
 		long err = 0;						\
-		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
+		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
 			if (pte_alloc(vma->vm_mm, pmd))			\
 				err = -ENOMEM;				\
 		}							\
@@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
 #define  change_prepare(vma, high, low, addr, cp_flags)			\
 	  ({								\
 		long err = 0;						\
-		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
+		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
 			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
 			if (p == NULL)					\
 				err = -ENOMEM;				\
@@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
 			if ((next - addr != HPAGE_PMD_SIZE) ||
-			    uffd_wp_protect_file(vma, cp_flags)) {
+			    pgtable_split_needed(vma, cp_flags)) {
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 				/*
 				 * For file-backed, the pmd could have been
-- 
cgit v1.2.3


From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Mar 2023 14:31:33 +0300
Subject: mm, treewide: redefine MAX_ORDER sanely

MAX_ORDER currently defined as number of orders page allocator supports:
user can ask buddy allocator for page order between 0 and MAX_ORDER-1.

This definition is counter-intuitive and lead to number of bugs all over
the kernel.

Change the definition of MAX_ORDER to be inclusive: the range of orders
user can ask from buddy allocator is 0..MAX_ORDER now.

[kirill@shutemov.name: fix min() warning]
  Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box
[akpm@linux-foundation.org: fix another min_t warning]
[kirill@shutemov.name: fixups per Zi Yan]
  Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name
[akpm@linux-foundation.org: fix underlining in docs]
  Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/
Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>	[powerpc]
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst  |  6 ++--
 Documentation/admin-guide/kernel-parameters.txt |  2 +-
 arch/arc/Kconfig                                |  4 +--
 arch/arm/Kconfig                                |  9 ++----
 arch/arm/configs/imx_v6_v7_defconfig            |  2 +-
 arch/arm/configs/milbeaut_m10v_defconfig        |  2 +-
 arch/arm/configs/oxnas_v6_defconfig             |  2 +-
 arch/arm/configs/pxa_defconfig                  |  2 +-
 arch/arm/configs/sama7_defconfig                |  2 +-
 arch/arm/configs/sp7021_defconfig               |  2 +-
 arch/arm64/Kconfig                              | 27 ++++++++----------
 arch/arm64/include/asm/sparsemem.h              |  2 +-
 arch/arm64/kvm/hyp/include/nvhe/gfp.h           |  2 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c            | 10 +++----
 arch/csky/Kconfig                               |  2 +-
 arch/ia64/Kconfig                               |  8 +++---
 arch/ia64/include/asm/sparsemem.h               |  4 +--
 arch/ia64/mm/hugetlbpage.c                      |  2 +-
 arch/loongarch/Kconfig                          | 15 ++++------
 arch/m68k/Kconfig.cpu                           |  5 +---
 arch/mips/Kconfig                               | 19 ++++++-------
 arch/nios2/Kconfig                              |  7 ++---
 arch/powerpc/Kconfig                            | 27 ++++++++----------
 arch/powerpc/configs/85xx/ge_imp3a_defconfig    |  2 +-
 arch/powerpc/configs/fsl-emb-nonhw.config       |  2 +-
 arch/powerpc/mm/book3s64/iommu_api.c            |  2 +-
 arch/powerpc/mm/hugetlbpage.c                   |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c       |  2 +-
 arch/sh/configs/ecovec24_defconfig              |  2 +-
 arch/sh/mm/Kconfig                              | 17 +++++------
 arch/sparc/Kconfig                              |  5 +---
 arch/sparc/kernel/pci_sun4v.c                   |  2 +-
 arch/sparc/kernel/traps_64.c                    |  2 +-
 arch/sparc/mm/tsb.c                             |  4 +--
 arch/um/kernel/um_arch.c                        |  4 +--
 arch/xtensa/Kconfig                             |  5 +---
 drivers/base/regmap/regmap-debugfs.c            |  8 +++---
 drivers/block/floppy.c                          |  2 +-
 drivers/crypto/ccp/sev-dev.c                    |  2 +-
 drivers/crypto/hisilicon/sgl.c                  |  6 ++--
 drivers/gpu/drm/i915/gem/i915_gem_internal.c    |  2 +-
 drivers/gpu/drm/i915/gem/selftests/huge_pages.c |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c                  | 22 +++++++-------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     |  2 +-
 drivers/iommu/dma-iommu.c                       |  2 +-
 drivers/irqchip/irq-gic-v3-its.c                |  4 +--
 drivers/md/dm-bufio.c                           |  2 +-
 drivers/misc/genwqe/card_dev.c                  |  2 +-
 drivers/misc/genwqe/card_utils.c                |  4 +--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c |  2 +-
 drivers/net/ethernet/ibm/ibmvnic.h              |  2 +-
 drivers/video/fbdev/hyperv_fb.c                 |  4 +--
 drivers/video/fbdev/vermilion/vermilion.c       |  2 +-
 drivers/virtio/virtio_balloon.c                 |  2 +-
 drivers/virtio/virtio_mem.c                     | 12 ++++----
 fs/ramfs/file-nommu.c                           |  2 +-
 include/drm/ttm/ttm_pool.h                      |  2 +-
 include/linux/hugetlb.h                         |  2 +-
 include/linux/mmzone.h                          | 10 +++----
 include/linux/pageblock-flags.h                 |  4 +--
 include/linux/slab.h                            |  6 ++--
 kernel/crash_core.c                             |  2 +-
 kernel/dma/pool.c                               |  6 ++--
 kernel/events/ring_buffer.c                     |  6 ++--
 mm/Kconfig                                      | 10 +++----
 mm/compaction.c                                 |  8 +++---
 mm/debug_vm_pgtable.c                           |  4 +--
 mm/huge_memory.c                                |  2 +-
 mm/hugetlb.c                                    |  4 +--
 mm/kmsan/init.c                                 |  6 ++--
 mm/memblock.c                                   |  2 +-
 mm/memory_hotplug.c                             |  4 +--
 mm/page_alloc.c                                 | 38 ++++++++++++-------------
 mm/page_isolation.c                             | 12 ++++----
 mm/page_owner.c                                 |  6 ++--
 mm/page_reporting.c                             |  6 ++--
 mm/shuffle.h                                    |  2 +-
 mm/slab.c                                       |  2 +-
 mm/slub.c                                       |  6 ++--
 mm/vmscan.c                                     |  2 +-
 mm/vmstat.c                                     | 14 ++++-----
 net/smc/smc_ib.c                                |  2 +-
 security/integrity/ima/ima_crypto.c             |  2 +-
 tools/testing/memblock/linux/mmzone.h           |  6 ++--
 84 files changed, 223 insertions(+), 253 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 86fd88492870..c18d94fa6470 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@ variables.
 Offset of the free_list's member. This value is used to compute the number
 of free pages.
 
-Each zone has a free_area structure array called free_area[MAX_ORDER].
+Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
 The free_list represents a linked list of free page blocks.
 
 (list_head, next|prev)
@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
 information. Makedumpfile gets the start address of the vmalloc region
 from this.
 
-(zone.free_area, MAX_ORDER)
----------------------------
+(zone.free_area, MAX_ORDER + 1)
+-------------------------------
 
 Free areas descriptor. User-space tools use this value to iterate the
 free_area ranges. MAX_ORDER is used by the zone buddy allocator.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6221a1d057dd..50da4f26fad5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3969,7 +3969,7 @@
 			[KNL] Minimal page reporting order
 			Format: <integer>
 			Adjust the minimal page reporting order. The page
-			reporting is disabled when it exceeds (MAX_ORDER-1).
+			reporting is disabled when it exceeds MAX_ORDER.
 
 	panic=		[KNL] Kernel behaviour on panic: delay <timeout>
 			timeout > 0: seconds before rebooting
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d9a13ccf89a3..ab6d701365bb 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -556,7 +556,7 @@ endmenu	 # "ARC Architecture Configuration"
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	default "12" if ARC_HUGEPAGE_16M
-	default "11"
+	default "11" if ARC_HUGEPAGE_16M
+	default "10"
 
 source "kernel/power/Kconfig"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e24a9820e12f..929e646e84b9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1355,9 +1355,9 @@ config ARM_MODULE_PLTS
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	default "12" if SOC_AM33XX
-	default "9" if SA1111
-	default "11"
+	default "11" if SOC_AM33XX
+	default "8" if SA1111
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -1366,9 +1366,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 config ALIGNMENT_TRAP
 	def_bool CPU_CP15_MMU
 	select HAVE_PROC_CPU if PROC_FS
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 6dc6fed12af8..345a67e67dbd 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y
 CONFIG_SMP=y
 CONFIG_ARM_PSCI=y
 CONFIG_HIGHMEM=y
-CONFIG_ARCH_FORCE_MAX_ORDER=14
+CONFIG_ARCH_FORCE_MAX_ORDER=13
 CONFIG_CMDLINE="noinitrd console=ttymxc0,115200"
 CONFIG_KEXEC=y
 CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index bd29e5012cb0..385ad0f391a8 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y
 # CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set
 # CONFIG_ARM_PATCH_IDIV is not set
 CONFIG_HIGHMEM=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
 CONFIG_SECCOMP=y
 CONFIG_KEXEC=y
 CONFIG_EFI=y
diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig
index 70a67b3fc91b..90779812c6dd 100644
--- a/arch/arm/configs/oxnas_v6_defconfig
+++ b/arch/arm/configs/oxnas_v6_defconfig
@@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y
 CONFIG_MACH_OX820=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=16
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
 CONFIG_SECCOMP=y
 CONFIG_ARM_APPENDED_DTB=y
 CONFIG_ARM_ATAG_DTB_COMPAT=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index e656d3af2266..b46e39369dbb 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -20,7 +20,7 @@ CONFIG_PXA_SHARPSL=y
 CONFIG_MACH_AKITA=y
 CONFIG_MACH_BORZOI=y
 CONFIG_AEABI=y
-CONFIG_ARCH_FORCE_MAX_ORDER=9
+CONFIG_ARCH_FORCE_MAX_ORDER=8
 CONFIG_CMDLINE="root=/dev/ram0 ro"
 CONFIG_KEXEC=y
 CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/sama7_defconfig b/arch/arm/configs/sama7_defconfig
index 0d964c613d71..954112041403 100644
--- a/arch/arm/configs/sama7_defconfig
+++ b/arch/arm/configs/sama7_defconfig
@@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y
 # CONFIG_CACHE_L2X0 is not set
 # CONFIG_ARM_PATCH_IDIV is not set
 # CONFIG_CPU_SW_DOMAIN_PAN is not set
-CONFIG_ARCH_FORCE_MAX_ORDER=15
+CONFIG_ARCH_FORCE_MAX_ORDER=14
 CONFIG_UACCESS_WITH_MEMCPY=y
 # CONFIG_ATAGS is not set
 CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel"
diff --git a/arch/arm/configs/sp7021_defconfig b/arch/arm/configs/sp7021_defconfig
index 5bca2eb59b86..c6448ac860b6 100644
--- a/arch/arm/configs/sp7021_defconfig
+++ b/arch/arm/configs/sp7021_defconfig
@@ -17,7 +17,7 @@ CONFIG_ARCH_SUNPLUS=y
 # CONFIG_VDSO is not set
 CONFIG_SMP=y
 CONFIG_THUMB2_KERNEL=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
 CONFIG_VFP=y
 CONFIG_NEON=y
 CONFIG_MODULES=y
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1023e896d46b..cb5c6aa3254e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1476,22 +1476,22 @@ config XEN
 
 # include/linux/mmzone.h requires the following to be true:
 #
-#   MAX_ORDER - 1 + PAGE_SHIFT <= SECTION_SIZE_BITS
+#   MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
 #
-# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS + 1 - PAGE_SHIFT:
+# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
 #
 #     | SECTION_SIZE_BITS |  PAGE_SHIFT  |  max MAX_ORDER  |  default MAX_ORDER |
 # ----+-------------------+--------------+-----------------+--------------------+
-# 4K  |       27          |      12      |       16        |         11         |
-# 16K |       27          |      14      |       14        |         12         |
-# 64K |       29          |      16      |       14        |         14         |
+# 4K  |       27          |      12      |       15        |         10         |
+# 16K |       27          |      14      |       13        |         11         |
+# 64K |       29          |      16      |       13        |         13         |
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order" if ARM64_4K_PAGES || ARM64_16K_PAGES
-	default "14" if ARM64_64K_PAGES
-	range 12 14 if ARM64_16K_PAGES
-	default "12" if ARM64_16K_PAGES
-	range 11 16 if ARM64_4K_PAGES
-	default "11"
+	default "13" if ARM64_64K_PAGES
+	range 11 13 if ARM64_16K_PAGES
+	default "11" if ARM64_16K_PAGES
+	range 10 15 if ARM64_4K_PAGES
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -1500,14 +1500,11 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 	  We make sure that we can allocate up to a HugePage size for each configuration.
 	  Hence we have :
-		MAX_ORDER = (PMD_SHIFT - PAGE_SHIFT) + 1 => PAGE_SHIFT - 2
+		MAX_ORDER = PMD_SHIFT - PAGE_SHIFT  => PAGE_SHIFT - 3
 
-	  However for 4K, we choose a higher default value, 11 as opposed to 10, giving us
+	  However for 4K, we choose a higher default value, 10 as opposed to 9, giving us
 	  4M allocations matching the default size used by generic code.
 
 config UNMAP_KERNEL_AT_EL0
diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h
index 4b73463423c3..5f5437621029 100644
--- a/arch/arm64/include/asm/sparsemem.h
+++ b/arch/arm64/include/asm/sparsemem.h
@@ -10,7 +10,7 @@
 /*
  * Section size must be at least 512MB for 64K base
  * page size config. Otherwise it will be less than
- * (MAX_ORDER - 1) and the build process will fail.
+ * MAX_ORDER and the build process will fail.
  */
 #ifdef CONFIG_ARM64_64K_PAGES
 #define SECTION_SIZE_BITS 29
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 0a048dc06a7d..fe5472a184a3 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -16,7 +16,7 @@ struct hyp_pool {
 	 * API at EL2.
 	 */
 	hyp_spinlock_t lock;
-	struct list_head free_area[MAX_ORDER];
+	struct list_head free_area[MAX_ORDER + 1];
 	phys_addr_t range_start;
 	phys_addr_t range_end;
 	unsigned short max_order;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 803ba3222e75..b1e392186a0f 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
 	 * after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
 	 */
 	p->order = HYP_NO_ORDER;
-	for (; (order + 1) < pool->max_order; order++) {
+	for (; (order + 1) <= pool->max_order; order++) {
 		buddy = __find_buddy_avail(pool, p, order);
 		if (!buddy)
 			break;
@@ -203,9 +203,9 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
 	hyp_spin_lock(&pool->lock);
 
 	/* Look for a high-enough-order page */
-	while (i < pool->max_order && list_empty(&pool->free_area[i]))
+	while (i <= pool->max_order && list_empty(&pool->free_area[i]))
 		i++;
-	if (i >= pool->max_order) {
+	if (i > pool->max_order) {
 		hyp_spin_unlock(&pool->lock);
 		return NULL;
 	}
@@ -228,8 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 	int i;
 
 	hyp_spin_lock_init(&pool->lock);
-	pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT));
-	for (i = 0; i < pool->max_order; i++)
+	pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+	for (i = 0; i <= pool->max_order; i++)
 		INIT_LIST_HEAD(&pool->free_area[i]);
 	pool->range_start = phys;
 	pool->range_end = phys + (nr_pages << PAGE_SHIFT);
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index dba02da6fa34..c694fac43bed 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -334,7 +334,7 @@ config HIGHMEM
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	default "11"
+	default "10"
 
 config DRAM_BASE
 	hex "DRAM start addr (the same with memory-section in dts)"
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index d7e4a24e8644..0d2f41fa56ee 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -202,10 +202,10 @@ config IA64_CYCLONE
 	  If you're unsure, answer N.
 
 config ARCH_FORCE_MAX_ORDER
-	int "MAX_ORDER (11 - 17)"  if !HUGETLB_PAGE
-	range 11 17  if !HUGETLB_PAGE
-	default "17" if HUGETLB_PAGE
-	default "11"
+	int "MAX_ORDER (10 - 16)"  if !HUGETLB_PAGE
+	range 10 16  if !HUGETLB_PAGE
+	default "16" if HUGETLB_PAGE
+	default "10"
 
 config SMP
 	bool "Symmetric multi-processing support"
diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h
index 84e8ce387b69..a58f8b466d96 100644
--- a/arch/ia64/include/asm/sparsemem.h
+++ b/arch/ia64/include/asm/sparsemem.h
@@ -12,9 +12,9 @@
 #define SECTION_SIZE_BITS	(30)
 #define MAX_PHYSMEM_BITS	(50)
 #ifdef CONFIG_ARCH_FORCE_MAX_ORDER
-#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
+#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS)
 #undef SECTION_SIZE_BITS
-#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT)
+#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT)
 #endif
 #endif
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 380d2f3966c9..e8dd4323fb86 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -170,7 +170,7 @@ static int __init hugetlb_setup_sz(char *str)
 	size = memparse(str, &str);
 	if (*str || !is_power_of_2(size) || !(tr_pages & size) ||
 		size <= PAGE_SIZE ||
-		size >= (1UL << PAGE_SHIFT << MAX_ORDER)) {
+		size > (1UL << PAGE_SHIFT << MAX_ORDER)) {
 		printk(KERN_WARNING "Invalid huge page size specified\n");
 		return 1;
 	}
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 7fd51257e0ed..272a3a12c98d 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -420,12 +420,12 @@ config NODES_SHIFT
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	range 14 64 if PAGE_SIZE_64KB
-	default "14" if PAGE_SIZE_64KB
-	range 12 64 if PAGE_SIZE_16KB
-	default "12" if PAGE_SIZE_16KB
-	range 11 64
-	default "11"
+	range 13 63 if PAGE_SIZE_64KB
+	default "13" if PAGE_SIZE_64KB
+	range 11 63 if PAGE_SIZE_16KB
+	default "11" if PAGE_SIZE_16KB
+	range 10 63
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -434,9 +434,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 	  The page size is not necessarily 4KB.  Keep this in mind
 	  when choosing a value for this option.
 
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 9380f6e3bb66..c9df6572133f 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -400,7 +400,7 @@ config SINGLE_MEMORY_CHUNK
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order" if ADVANCED
 	depends on !SINGLE_MEMORY_CHUNK
-	default "11"
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -413,9 +413,6 @@ config ARCH_FORCE_MAX_ORDER
 	  value also defines the minimal size of the hole that allows
 	  freeing unused memory map.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 config 060_WRITETHROUGH
 	bool "Use write-through caching for 68060 supervisor accesses"
 	depends on ADVANCED && M68060
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index e2f3ca73f40d..3e8b765b8c7b 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2137,14 +2137,14 @@ endchoice
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
-	default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
-	range 13 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
-	default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
-	range 12 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
-	default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
-	range 0 64
-	default "11"
+	range 13 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
+	default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
+	range 12 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
+	default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
+	range 11 63 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
+	default "11" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
+	range 0 63
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -2153,9 +2153,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 	  The page size is not necessarily 4KB.  Keep this in mind
 	  when choosing a value for this option.
 
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index a582f72104f3..89708b95978c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -46,8 +46,8 @@ source "kernel/Kconfig.hz"
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	range 9 20
-	default "11"
+	range 8 19
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -56,9 +56,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 endmenu
 
 source "arch/nios2/platform/Kconfig.platform"
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 49c6d36b2b3e..24d56536b269 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -897,18 +897,18 @@ config DATA_SHIFT
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	range 8 9 if PPC64 && PPC_64K_PAGES
-	default "9" if PPC64 && PPC_64K_PAGES
-	range 13 13 if PPC64 && !PPC_64K_PAGES
-	default "13" if PPC64 && !PPC_64K_PAGES
-	range 9 64 if PPC32 && PPC_16K_PAGES
-	default "9" if PPC32 && PPC_16K_PAGES
-	range 7 64 if PPC32 && PPC_64K_PAGES
-	default "7" if PPC32 && PPC_64K_PAGES
-	range 5 64 if PPC32 && PPC_256K_PAGES
-	default "5" if PPC32 && PPC_256K_PAGES
-	range 11 64
-	default "11"
+	range 7 8 if PPC64 && PPC_64K_PAGES
+	default "8" if PPC64 && PPC_64K_PAGES
+	range 12 12 if PPC64 && !PPC_64K_PAGES
+	default "12" if PPC64 && !PPC_64K_PAGES
+	range 8 63 if PPC32 && PPC_16K_PAGES
+	default "8" if PPC32 && PPC_16K_PAGES
+	range 6 63 if PPC32 && PPC_64K_PAGES
+	default "6" if PPC32 && PPC_64K_PAGES
+	range 4 63 if PPC32 && PPC_256K_PAGES
+	default "4" if PPC32 && PPC_256K_PAGES
+	range 10 63
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -917,9 +917,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 	  The page size is not necessarily 4KB.  For example, on 64-bit
 	  systems, 64KB pages can be enabled via CONFIG_PPC_64K_PAGES.  Keep
 	  this in mind when choosing a value for this option.
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
index ea719898b581..6cb7e90d52c1 100644
--- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig
+++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
@@ -30,7 +30,7 @@ CONFIG_PREEMPT=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=m
 CONFIG_MATH_EMULATION=y
-CONFIG_ARCH_FORCE_MAX_ORDER=17
+CONFIG_ARCH_FORCE_MAX_ORDER=16
 CONFIG_PCI=y
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_MSI=y
diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config
index ab8a8c4530d9..3009b0efaf34 100644
--- a/arch/powerpc/configs/fsl-emb-nonhw.config
+++ b/arch/powerpc/configs/fsl-emb-nonhw.config
@@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y
 CONFIG_FONT_8x16=y
 CONFIG_FONT_8x8=y
 CONFIG_FONTS=y
-CONFIG_ARCH_FORCE_MAX_ORDER=13
+CONFIG_ARCH_FORCE_MAX_ORDER=12
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAME_WARN=1024
 CONFIG_FTL=y
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 7fcfba162e0d..81d7185e2ae8 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	}
 
 	mmap_read_lock(mm);
-	chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+	chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) /
 			sizeof(struct vm_area_struct *);
 	chunk = min(chunk, entries);
 	for (entry = 0; entry < entries; entry += chunk) {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f1ba8d1e8c1a..b900933507da 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void)
 		order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
 
 	if (order) {
-		VM_WARN_ON(order < MAX_ORDER);
+		VM_WARN_ON(order <= MAX_ORDER);
 		hugetlb_cma_reserve(order);
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 4f6e20a35aa1..5a81f106068e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1740,7 +1740,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER);
 
 	/*
 	 * We create the default window as big as we can. The constraint is
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index b52e14ccb450..4d655e8d4d74 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -8,7 +8,7 @@ CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CPU_SUBTYPE_SH7724=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
 CONFIG_MEMORY_SIZE=0x10000000
 CONFIG_FLATMEM_MANUAL=y
 CONFIG_SH_ECOVEC=y
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 411fdc0901f7..40271090bd7d 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -20,13 +20,13 @@ config PAGE_OFFSET
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	range 9 64 if PAGE_SIZE_16KB
-	default "9" if PAGE_SIZE_16KB
-	range 7 64 if PAGE_SIZE_64KB
-	default "7" if PAGE_SIZE_64KB
-	range 11 64
-	default "14" if !MMU
-	default "11"
+	range 8 63 if PAGE_SIZE_16KB
+	default "8" if PAGE_SIZE_16KB
+	range 6 63 if PAGE_SIZE_64KB
+	default "6" if PAGE_SIZE_64KB
+	range 10 63
+	default "13" if !MMU
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -35,9 +35,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 	  The page size is not necessarily 4KB. Keep this in mind when
 	  choosing a value for this option.
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 84437a4c6545..e3242bf5a8df 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -271,7 +271,7 @@ config ARCH_SPARSEMEM_DEFAULT
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	default "13"
+	default "12"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -280,9 +280,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 13 means that the largest free memory block is 2^12 pages.
-
 if SPARC64 || COMPILE_TEST
 source "kernel/power/Kconfig"
 endif
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 384480971805..7d91ca6aa675 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -193,7 +193,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 
 	size = IO_PAGE_ALIGN(size);
 	order = get_order(size);
-	if (unlikely(order >= MAX_ORDER))
+	if (unlikely(order > MAX_ORDER))
 		return NULL;
 
 	npages = size >> IO_PAGE_SHIFT;
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 5b4de4a89dec..08ffd17d5ec3 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
 
 	/* Now allocate error trap reporting scoreboard. */
 	sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info));
-	for (order = 0; order < MAX_ORDER; order++) {
+	for (order = 0; order <= MAX_ORDER; order++) {
 		if ((PAGE_SIZE << order) >= sz)
 			break;
 	}
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index dba8dffe2113..5e2931a18409 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
 	unsigned long new_rss_limit;
 	gfp_t gfp_flags;
 
-	if (max_tsb_size > (PAGE_SIZE << (MAX_ORDER - 1)))
-		max_tsb_size = (PAGE_SIZE << (MAX_ORDER - 1));
+	if (max_tsb_size > PAGE_SIZE << MAX_ORDER)
+		max_tsb_size = PAGE_SIZE << MAX_ORDER;
 
 	new_cache_index = 0;
 	for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 5e5a9c8e0e5d..8dcda617b8bf 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -368,10 +368,10 @@ int __init linux_main(int argc, char **argv)
 	max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC;
 
 	/*
-	 * Zones have to begin on a 1 << MAX_ORDER-1 page boundary,
+	 * Zones have to begin on a 1 << MAX_ORDER page boundary,
 	 * so this makes sure that's true for highmem
 	 */
-	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER - 1)) - 1);
+	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1);
 	if (physmem_size + iomem_size > max_physmem) {
 		highmem = physmem_size + iomem_size - max_physmem;
 		physmem_size -= highmem;
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index bcb0c5d2abc2..3eee334ba873 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -773,7 +773,7 @@ config HIGHMEM
 
 config ARCH_FORCE_MAX_ORDER
 	int "Maximum zone order"
-	default "11"
+	default "10"
 	help
 	  The kernel memory allocator divides physically contiguous memory
 	  blocks into "zones", where each zone is a power of two number of
@@ -782,9 +782,6 @@ config ARCH_FORCE_MAX_ORDER
 	  blocks of physically contiguous memory, then you may need to
 	  increase this value.
 
-	  This config option is actually maximum order plus one. For example,
-	  a value of 11 means that the largest free memory block is 2^10 pages.
-
 endmenu
 
 menu "Power management options"
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 817eda2075aa..c491fabe3617 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << (MAX_ORDER - 1)))
-		count = PAGE_SIZE << (MAX_ORDER - 1);
+	if (count > (PAGE_SIZE << MAX_ORDER))
+		count = PAGE_SIZE << MAX_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
@@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << (MAX_ORDER - 1)))
-		count = PAGE_SIZE << (MAX_ORDER - 1);
+	if (count > (PAGE_SIZE << MAX_ORDER))
+		count = PAGE_SIZE << MAX_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 90d2dfb6448e..cec2c20f5e59 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
 	}
 }
 
-#define MAX_LEN (1UL << (MAX_ORDER - 1) << PAGE_SHIFT)
+#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT)
 
 static int raw_cmd_copyin(int cmd, void __user *param,
 				 struct floppy_raw_cmd **rcmd)
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e2f25926eb51..bf095baca244 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -886,7 +886,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 		/*
 		 * The length of the ID shouldn't be assumed by software since
 		 * it may change in the future.  The allocation size is limited
-		 * to 1 << (PAGE_SHIFT + MAX_ORDER - 1) by the page allocator.
+		 * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator.
 		 * If the allocation fails, simply return ENOMEM rather than
 		 * warning in the kernel log.
 		 */
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index 09586a837b1e..3df7a256e919 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
 			 HISI_ACC_SGL_ALIGN_SIZE);
 
 	/*
-	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1),
+	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER,
 	 * block size may exceed 2^31 on ia64, so the max of block size is 2^31
 	 */
-	block_size = 1 << (PAGE_SHIFT + MAX_ORDER <= 32 ?
-			   PAGE_SHIFT + MAX_ORDER - 1 : 31);
+	block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ?
+			   PAGE_SHIFT + MAX_ORDER : 31);
 	sgl_num_per_block = block_size / sgl_size;
 	block_num = count / sgl_num_per_block;
 	remain_sgl = count % sgl_num_per_block;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index eae9e9f6d3bf..6bc26b4b06b8 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 	struct sg_table *st;
 	struct scatterlist *sg;
 	unsigned int npages; /* restricted by sg_alloc_table */
-	int max_order = MAX_ORDER - 1;
+	int max_order = MAX_ORDER;
 	unsigned int max_segment;
 	gfp_t gfp;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index defece0bcb81..99f39a5feca1 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj)
 		do {
 			struct page *page;
 
-			GEM_BUG_ON(order >= MAX_ORDER);
+			GEM_BUG_ON(order > MAX_ORDER);
 			page = alloc_pages(GFP | __GFP_ZERO, order);
 			if (!page)
 				goto err;
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index aa116a7bbae3..6c8585abe08d 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
-static struct ttm_pool_type global_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_uncached[MAX_ORDER];
+static struct ttm_pool_type global_write_combined[MAX_ORDER + 1];
+static struct ttm_pool_type global_uncached[MAX_ORDER + 1];
 
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
+static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1];
+static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
 
 static spinlock_t shrinker_lock;
 static struct list_head shrinker_list;
@@ -405,7 +405,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 	else
 		gfp_flags |= GFP_HIGHUSER;
 
-	for (order = min_t(unsigned int, MAX_ORDER - 1, __fls(num_pages));
+	for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
 	     num_pages;
 	     order = min_t(unsigned int, order, __fls(num_pages))) {
 		struct ttm_pool_type *pt;
@@ -542,7 +542,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 
 	if (use_dma_alloc) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j < MAX_ORDER; ++j)
+			for (j = 0; j <= MAX_ORDER; ++j)
 				ttm_pool_type_init(&pool->caching[i].orders[j],
 						   pool, i, j);
 	}
@@ -562,7 +562,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
 
 	if (pool->use_dma_alloc) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j < MAX_ORDER; ++j)
+			for (j = 0; j <= MAX_ORDER; ++j)
 				ttm_pool_type_fini(&pool->caching[i].orders[j]);
 	}
 
@@ -616,7 +616,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
 	unsigned int i;
 
 	seq_puts(m, "\t ");
-	for (i = 0; i < MAX_ORDER; ++i)
+	for (i = 0; i <= MAX_ORDER; ++i)
 		seq_printf(m, " ---%2u---", i);
 	seq_puts(m, "\n");
 }
@@ -627,7 +627,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_ORDER; ++i)
+	for (i = 0; i <= MAX_ORDER; ++i)
 		seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
 	seq_puts(m, "\n");
 }
@@ -736,7 +736,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	spin_lock_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
-	for (i = 0; i < MAX_ORDER; ++i) {
+	for (i = 0; i <= MAX_ORDER; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
 		ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
@@ -769,7 +769,7 @@ void ttm_pool_mgr_fini(void)
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_ORDER; ++i) {
+	for (i = 0; i <= MAX_ORDER; ++i) {
 		ttm_pool_type_fini(&global_write_combined[i]);
 		ttm_pool_type_fini(&global_uncached[i]);
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 8d772ea8a583..b574c58a3487 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -182,7 +182,7 @@
 #ifdef CONFIG_CMA_ALIGNMENT
 #define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
 #else
-#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_ORDER - 1)
+#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_ORDER)
 #endif
 
 /*
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ac996fd6bd9c..7a9f0b0bddbd 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -736,7 +736,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
 	struct page **pages;
 	unsigned int i = 0, nid = dev_to_node(dev);
 
-	order_mask &= GENMASK(MAX_ORDER - 1, 0);
+	order_mask &= GENMASK(MAX_ORDER, 0);
 	if (!order_mask)
 		return NULL;
 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 586271b8aa39..85790b870877 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2440,8 +2440,8 @@ static bool its_parse_indirect_baser(struct its_node *its,
 	 * feature is not supported by hardware.
 	 */
 	new_order = max_t(u32, get_order(esz << ids), new_order);
-	if (new_order >= MAX_ORDER) {
-		new_order = MAX_ORDER - 1;
+	if (new_order > MAX_ORDER) {
+		new_order = MAX_ORDER;
 		ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
 		pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n",
 			&its->phys_base, its_base_type_string[type],
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cf077f9b30c3..733053c2eaa0 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -408,7 +408,7 @@ static void __cache_size_refresh(void)
  * If the allocation may fail we use __get_free_pages. Memory fragmentation
  * won't have a fatal effect here, but it just causes flushes of some other
  * buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order >= MAX_ORDER).
+ * always fails (i.e. order > MAX_ORDER).
  *
  * If the allocation shouldn't fail we use __vmalloc. This is only for the
  * initial reserve allocation, so there's no risk of wasting all vmalloc
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index d0e27438a73c..55fc5b80e649 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma)
 	if (vsize == 0)
 		return -EINVAL;
 
-	if (get_order(vsize) >= MAX_ORDER)
+	if (get_order(vsize) > MAX_ORDER)
 		return -ENOMEM;
 
 	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL);
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index ac29698d085a..1c798d6b2dfb 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init)
 void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size,
 			       dma_addr_t *dma_handle)
 {
-	if (get_order(size) >= MAX_ORDER)
+	if (get_order(size) > MAX_ORDER)
 		return NULL;
 
 	return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle,
@@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl,
 	sgl->write = write;
 	sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages);
 
-	if (get_order(sgl->sgl_size) >= MAX_ORDER) {
+	if (get_order(sgl->sgl_size) > MAX_ORDER) {
 		dev_err(&pci_dev->dev,
 			"[%s] err: too much memory requested!\n", __func__);
 		return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 25be7f8ac7cd..3973ca6adf4c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 		return;
 
 	order = get_order(alloc_size);
-	if (order >= MAX_ORDER) {
+	if (order > MAX_ORDER) {
 		if (net_ratelimit())
 			dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n");
 		return;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index b35c9b6f913b..4e18b4cefa97 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -75,7 +75,7 @@
  * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160
  * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC.
  */
-#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << (MAX_ORDER - 1)) * PAGE_SIZE))
+#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << MAX_ORDER) * PAGE_SIZE))
 #define IBMVNIC_ONE_LTB_SIZE	min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX)
 #define IBMVNIC_LTB_SET_SIZE	(38 << 20)
 
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index ec3f6cf05f8c..34781dec3856 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -946,7 +946,7 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev,
 	if (request_size == 0)
 		return -1;
 
-	if (order < MAX_ORDER) {
+	if (order <= MAX_ORDER) {
 		/* Call alloc_pages if the size is less than 2^MAX_ORDER */
 		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 		if (!page)
@@ -977,7 +977,7 @@ static void hvfb_release_phymem(struct hv_device *hdev,
 {
 	unsigned int order = get_order(size);
 
-	if (order < MAX_ORDER)
+	if (order <= MAX_ORDER)
 		__free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order);
 	else
 		dma_free_coherent(&hdev->device,
diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c
index 0374ee6b6d03..32e74e02a02f 100644
--- a/drivers/video/fbdev/vermilion/vermilion.c
+++ b/drivers/video/fbdev/vermilion/vermilion.c
@@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo,
 		va = &vinfo->vram[i];
 		order = 0;
 
-		while (requested > (PAGE_SIZE << order) && order < MAX_ORDER)
+		while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER)
 			order++;
 
 		err = vmlfb_alloc_vram_area(va, order, 0);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 3f78a3a1eb75..5b15936a5214 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -33,7 +33,7 @@
 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
 					     __GFP_NOMEMALLOC)
 /* The order of free page blocks to report to host */
-#define VIRTIO_BALLOON_HINT_BLOCK_ORDER (MAX_ORDER - 1)
+#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER
 /* The size of a free page block in bytes */
 #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \
 	(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 0c2892ec6817..835f6cc2fb66 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1120,13 +1120,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
  */
 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
 {
-	unsigned long order = MAX_ORDER - 1;
+	unsigned long order = MAX_ORDER;
 	unsigned long i;
 
 	/*
 	 * We might get called for ranges that don't cover properly aligned
-	 * MAX_ORDER - 1 pages; however, we can only online properly aligned
-	 * pages with an order of MAX_ORDER - 1 at maximum.
+	 * MAX_ORDER pages; however, we can only online properly aligned
+	 * pages with an order of MAX_ORDER at maximum.
 	 */
 	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
 		order--;
@@ -1237,9 +1237,9 @@ static void virtio_mem_online_page(struct virtio_mem *vm,
 	bool do_online;
 
 	/*
-	 * We can get called with any order up to MAX_ORDER - 1. If our
-	 * subblock size is smaller than that and we have a mixture of plugged
-	 * and unplugged subblocks within such a page, we have to process in
+	 * We can get called with any order up to MAX_ORDER. If our subblock
+	 * size is smaller than that and we have a mixture of plugged and
+	 * unplugged subblocks within such a page, we have to process in
 	 * smaller granularity. In that case we'll adjust the order exactly once
 	 * within the loop.
 	 */
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2f67516bb9bf..9fbb9b5256f7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	/* make various checks */
 	order = get_order(newsize);
-	if (unlikely(order >= MAX_ORDER))
+	if (unlikely(order > MAX_ORDER))
 		return -EFBIG;
 
 	ret = inode_newsize_ok(inode, newsize);
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index ef09b23d29e3..8ce14f9d202a 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -72,7 +72,7 @@ struct ttm_pool {
 	bool use_dma32;
 
 	struct {
-		struct ttm_pool_type orders[MAX_ORDER];
+		struct ttm_pool_type orders[MAX_ORDER + 1];
 	} caching[TTM_NUM_CACHING_TYPES];
 };
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7c977d234aba..8fb7d91cd0b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -818,7 +818,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
 
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
-	return huge_page_order(h) >= MAX_ORDER;
+	return huge_page_order(h) > MAX_ORDER;
 }
 
 static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bf8786d45b31..35b11cc210d9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -26,11 +26,11 @@
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 11
+#define MAX_ORDER 10
 #else
 #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
 #endif
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
 
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -93,7 +93,7 @@ static inline bool migratetype_is_mergeable(int mt)
 }
 
 #define for_each_migratetype_order(order, type) \
-	for (order = 0; order < MAX_ORDER; order++) \
+	for (order = 0; order <= MAX_ORDER; order++) \
 		for (type = 0; type < MIGRATE_TYPES; type++)
 
 extern int page_group_by_mobility_disabled;
@@ -922,7 +922,7 @@ struct zone {
 	CACHELINE_PADDING(_pad1_);
 
 	/* free areas of different sizes */
-	struct free_area	free_area[MAX_ORDER];
+	struct free_area	free_area[MAX_ORDER + 1];
 
 	/* zone flags, see below */
 	unsigned long		flags;
@@ -1745,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 #define SECTION_BLOCKFLAGS_BITS \
 	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
 
-#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
 #error Allocator MAX_ORDER exceeds SECTION_SIZE
 #endif
 
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5f1ae07d724b..e83c4c095041 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1)
+#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		(MAX_ORDER-1)
+#define pageblock_order		MAX_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 45af70315a94..aa4575ef2965 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -284,7 +284,7 @@ static inline unsigned int arch_slab_minalign(void)
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
@@ -292,7 +292,7 @@ static inline unsigned int arch_slab_minalign(void)
 
 #ifdef CONFIG_SLUB
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
@@ -305,7 +305,7 @@ static inline unsigned int arch_slab_minalign(void)
  * be allocated from the same page.
  */
 #define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 755f5f08ab38..90ce1dfd591c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(list_head, prev);
 	VMCOREINFO_OFFSET(vmap_area, va_start);
 	VMCOREINFO_OFFSET(vmap_area, list);
-	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
 	log_buf_vmcoreinfo_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..1acec2e22827 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 	void *addr;
 	int ret = -ENOMEM;
 
-	/* Cannot allocate larger than MAX_ORDER-1 */
-	order = min(get_order(pool_size), MAX_ORDER-1);
+	/* Cannot allocate larger than MAX_ORDER */
+	order = min(get_order(pool_size), MAX_ORDER);
 
 	do {
 		pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
 
 	/*
 	 * If coherent_pool was not used on the command line, default the pool
-	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
 	 */
 	if (!atomic_pool_size) {
 		unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index d6bbdb7830b2..a0433f37b024 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -609,8 +609,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
 {
 	struct page *page;
 
-	if (order >= MAX_ORDER)
-		order = MAX_ORDER - 1;
+	if (order > MAX_ORDER)
+		order = MAX_ORDER;
 
 	do {
 		page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	size = sizeof(struct perf_buffer);
 	size += nr_pages * sizeof(void *);
 
-	if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+	if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
 		goto fail;
 
 	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/mm/Kconfig b/mm/Kconfig
index ca98b2072df5..969286ab14a1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -346,9 +346,9 @@ config SHUFFLE_PAGE_ALLOCATOR
 	  the presence of a memory-side-cache. There are also incidental
 	  security benefits as it reduces the predictability of page
 	  allocations to compliment SLAB_FREELIST_RANDOM, but the
-	  default granularity of shuffling on the "MAX_ORDER - 1" i.e,
-	  10th order of pages is selected based on cache utilization
-	  benefits on x86.
+	  default granularity of shuffling on the MAX_ORDER i.e, 10th
+	  order of pages is selected based on cache utilization benefits
+	  on x86.
 
 	  While the randomization improves cache utilization it may
 	  negatively impact workloads on platforms without a cache. For
@@ -666,8 +666,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 	  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
 	  on a platform.
 
-	  Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
-	  clamped down to MAX_ORDER - 1.
+	  Note that the pageblock_order cannot exceed MAX_ORDER and will be
+	  clamped down to MAX_ORDER.
 
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
diff --git a/mm/compaction.c b/mm/compaction.c
index 5a9501e0ae01..709136556b9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -583,7 +583,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		if (PageCompound(page)) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order < MAX_ORDER)) {
+			if (likely(order <= MAX_ORDER)) {
 				blockpfn += (1UL << order) - 1;
 				cursor += (1UL << order) - 1;
 			}
@@ -938,7 +938,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			 * a valid page order. Consider only values in the
 			 * valid order range to prevent low_pfn overflow.
 			 */
-			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+			if (freepage_order > 0 && freepage_order <= MAX_ORDER)
 				low_pfn += (1UL << freepage_order) - 1;
 			continue;
 		}
@@ -954,7 +954,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (PageCompound(page) && !cc->alloc_contig) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order < MAX_ORDER))
+			if (likely(order <= MAX_ORDER))
 				low_pfn += (1UL << order) - 1;
 			goto isolate_fail;
 		}
@@ -2124,7 +2124,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 
 	/* Direct compactor: Is a suitable page free? */
 	ret = COMPACT_NO_SUITABLE_PAGE;
-	for (order = cc->order; order < MAX_ORDER; order++) {
+	for (order = cc->order; order <= MAX_ORDER; order++) {
 		struct free_area *area = &cc->zone->free_area[order];
 		bool can_steal;
 
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 4362021b1ce7..c54177aabebd 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1086,7 +1086,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	struct page *page = NULL;
 
 #ifdef CONFIG_CONTIG_ALLOC
-	if (order >= MAX_ORDER) {
+	if (order > MAX_ORDER) {
 		page = alloc_contig_pages((1 << order), GFP_KERNEL,
 					  first_online_node, NULL);
 		if (page) {
@@ -1096,7 +1096,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	}
 #endif
 
-	if (order < MAX_ORDER)
+	if (order <= MAX_ORDER)
 		page = alloc_pages(GFP_KERNEL, order);
 
 	return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d860e70fe88..1df386d13d24 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -467,7 +467,7 @@ static int __init hugepage_init(void)
 	/*
 	 * hugepages can't be allocated by the buddy allocator
 	 */
-	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
 	/*
 	 * we use page->mapping and page->index in second tail page
 	 * as list_head: assuming THP order >= 2
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 712e32b38295..9122e50ae02a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2090,7 +2090,7 @@ pgoff_t hugetlb_basepage_index(struct page *page)
 	pgoff_t index = page_index(page_head);
 	unsigned long compound_idx;
 
-	if (compound_order(page_head) >= MAX_ORDER)
+	if (compound_order(page_head) > MAX_ORDER)
 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
 	else
 		compound_idx = page - page_head;
@@ -4497,7 +4497,7 @@ static int __init default_hugepagesz_setup(char *s)
 	 * The number of default huge pages (for this size) could have been
 	 * specified as the first hugetlb parameter: hugepages=X.  If so,
 	 * then default_hstate_max_huge_pages is set.  If the default huge
-	 * page size is gigantic (>= MAX_ORDER), then the pages must be
+	 * page size is gigantic (> MAX_ORDER), then the pages must be
 	 * allocated here from bootmem allocator.
 	 */
 	if (default_hstate_max_huge_pages) {
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 7fb794242fad..ffedf4dbc49d 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void)
 struct metadata_page_pair {
 	struct page *shadow, *origin;
 };
-static struct metadata_page_pair held_back[MAX_ORDER] __initdata;
+static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
 
 /*
  * Eager metadata allocation. When the memblock allocator is freeing pages to
@@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void)
 	 *    order=N-1,
 	 *  - repeat.
 	 */
-	collect.order = MAX_ORDER - 1;
-	for (int i = MAX_ORDER - 1; i >= 0; i--) {
+	collect.order = MAX_ORDER;
+	for (int i = MAX_ORDER; i >= 0; i--) {
 		if (held_back[i].shadow)
 			smallstack_push(&collect, held_back[i].shadow);
 		if (held_back[i].origin)
diff --git a/mm/memblock.c b/mm/memblock.c
index 25fd0626a9e7..7911224b1ed3 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2043,7 +2043,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 	int order;
 
 	while (start < end) {
-		order = min(MAX_ORDER - 1UL, __ffs(start));
+		order = min_t(int, MAX_ORDER, __ffs(start));
 
 		while (start + (1UL << order) > end)
 			order--;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index db3b270254f1..c8f0a8c2d049 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -596,7 +596,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	unsigned long pfn;
 
 	/*
-	 * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+	 * Online the pages in MAX_ORDER aligned chunks. The callback might
 	 * decide to not expose all pages to the buddy (e.g., expose them
 	 * later). We account all pages as being online and belonging to this
 	 * zone ("present").
@@ -605,7 +605,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	 * this and the first chunk to online will be pageblock_nr_pages.
 	 */
 	for (pfn = start_pfn; pfn < end_pfn;) {
-		int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+		int order = min_t(int, MAX_ORDER, __ffs(pfn));
 
 		(*online_page_callback)(pfn_to_page(pfn), order);
 		pfn += (1UL << order);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0936bde1d486..c3e49d028a7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1063,7 +1063,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
 	unsigned long higher_page_pfn;
 	struct page *higher_page;
 
-	if (order >= MAX_ORDER - 2)
+	if (order >= MAX_ORDER - 1)
 		return false;
 
 	higher_page_pfn = buddy_pfn & pfn;
@@ -1118,7 +1118,7 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
-	while (order < MAX_ORDER - 1) {
+	while (order < MAX_ORDER) {
 		if (compaction_capture(capc, page, order, migratetype)) {
 			__mod_zone_freepage_state(zone, -(1 << order),
 								migratetype);
@@ -2499,7 +2499,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	struct page *page;
 
 	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+	for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
 		area = &(zone->free_area[current_order]);
 		page = get_page_from_free_area(area, migratetype);
 		if (!page)
@@ -2871,7 +2871,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++) {
+		for (order = 0; order <= MAX_ORDER; order++) {
 			struct free_area *area = &(zone->free_area[order]);
 
 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -2955,7 +2955,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	 * approximates finding the pageblock with the most free pages, which
 	 * would be too costly to do exactly.
 	 */
-	for (current_order = MAX_ORDER - 1; current_order >= min_order;
+	for (current_order = MAX_ORDER; current_order >= min_order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2981,7 +2981,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	return false;
 
 find_smallest:
-	for (current_order = order; current_order < MAX_ORDER;
+	for (current_order = order; current_order <= MAX_ORDER;
 							current_order++) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2994,7 +2994,7 @@ find_smallest:
 	 * This should not happen - we already found a suitable fallback
 	 * when looking for the largest page.
 	 */
-	VM_BUG_ON(current_order == MAX_ORDER);
+	VM_BUG_ON(current_order > MAX_ORDER);
 
 do_steal:
 	page = get_page_from_free_area(area, fallback_mt);
@@ -3955,7 +3955,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		return true;
 
 	/* For a high-order request, check at least one suitable page is free */
-	for (o = order; o < MAX_ORDER; o++) {
+	for (o = order; o <= MAX_ORDER; o++) {
 		struct free_area *area = &z->free_area[o];
 		int mt;
 
@@ -5475,7 +5475,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	 * There are several places where we assume that the order value is sane
 	 * so bail out early if the request is out of bound.
 	 */
-	if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
+	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
 		return NULL;
 
 	gfp &= gfp_allowed_mask;
@@ -6205,8 +6205,8 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
 
 	for_each_populated_zone(zone) {
 		unsigned int order;
-		unsigned long nr[MAX_ORDER], flags, total = 0;
-		unsigned char types[MAX_ORDER];
+		unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+		unsigned char types[MAX_ORDER + 1];
 
 		if (zone_idx(zone) > max_zone_idx)
 			continue;
@@ -6216,7 +6216,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
 		printk(KERN_CONT "%s: ", zone->name);
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++) {
+		for (order = 0; order <= MAX_ORDER; order++) {
 			struct free_area *area = &zone->free_area[order];
 			int type;
 
@@ -6230,7 +6230,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
 			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++) {
+		for (order = 0; order <= MAX_ORDER; order++) {
 			printk(KERN_CONT "%lu*%lukB ",
 			       nr[order], K(1UL) << order);
 			if (nr[order])
@@ -7581,7 +7581,7 @@ static inline void setup_usemap(struct zone *zone) {}
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
 void __init set_pageblock_order(void)
 {
-	unsigned int order = MAX_ORDER - 1;
+	unsigned int order = MAX_ORDER;
 
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
@@ -9076,7 +9076,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			else
 				table = memblock_alloc_raw(size,
 							   SMP_CACHE_BYTES);
-		} else if (get_order(size) >= MAX_ORDER || hashdist) {
+		} else if (get_order(size) > MAX_ORDER || hashdist) {
 			table = vmalloc_huge(size, gfp_flags);
 			virt = true;
 			if (table)
@@ -9290,7 +9290,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	order = 0;
 	outer_start = start;
 	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order >= MAX_ORDER) {
+		if (++order > MAX_ORDER) {
 			outer_start = start;
 			break;
 		}
@@ -9540,7 +9540,7 @@ bool is_free_buddy_page(struct page *page)
 	unsigned long pfn = page_to_pfn(page);
 	unsigned int order;
 
-	for (order = 0; order < MAX_ORDER; order++) {
+	for (order = 0; order <= MAX_ORDER; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 
 		if (PageBuddy(page_head) &&
@@ -9548,7 +9548,7 @@ bool is_free_buddy_page(struct page *page)
 			break;
 	}
 
-	return order < MAX_ORDER;
+	return order <= MAX_ORDER;
 }
 EXPORT_SYMBOL(is_free_buddy_page);
 
@@ -9599,7 +9599,7 @@ bool take_page_off_buddy(struct page *page)
 	bool ret = false;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (order = 0; order < MAX_ORDER; order++) {
+	for (order = 0; order <= MAX_ORDER; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 		int page_order = buddy_order(page_head);
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 47fbc1696466..c6f3605e37ab 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 	 */
 	if (PageBuddy(page)) {
 		order = buddy_order(page);
-		if (order >= pageblock_order && order < MAX_ORDER - 1) {
+		if (order >= pageblock_order && order < MAX_ORDER) {
 			buddy = find_buddy_page_pfn(page, page_to_pfn(page),
 						    order, NULL);
 			if (buddy && !is_migrate_isolate_page(buddy)) {
@@ -290,11 +290,11 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *			isolate_single_pageblock()
  * @migratetype:	migrate type to set in error recovery.
  *
- * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * Free and in-use pages can be as big as MAX_ORDER and contain more than one
  * pageblock. When not all pageblocks within a page are isolated at the same
  * time, free page accounting can go wrong. For example, in the case of
- * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
- * [         MAX_ORDER-1         ]
+ * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks.
+ * [         MAX_ORDER           ]
  * [  pageblock0  |  pageblock1  ]
  * When either pageblock is isolated, if it is a free page, the page is not
  * split into separate migratetype lists, which is supposed to; if it is an
@@ -451,7 +451,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				 * the free page to the right migratetype list.
 				 *
 				 * head_pfn is not used here as a hugetlb page order
-				 * can be bigger than MAX_ORDER-1, but after it is
+				 * can be bigger than MAX_ORDER, but after it is
 				 * freed, the free page order is not. Use pfn within
 				 * the range to find the head of the free page.
 				 */
@@ -459,7 +459,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				outer_pfn = pfn;
 				while (!PageBuddy(pfn_to_page(outer_pfn))) {
 					/* stop if we cannot find the free page */
-					if (++order >= MAX_ORDER)
+					if (++order > MAX_ORDER)
 						goto failed;
 					outer_pfn &= ~0UL << order;
 				}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 220cdeddc295..31169b3e7f06 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -315,7 +315,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 				unsigned long freepage_order;
 
 				freepage_order = buddy_order_unsafe(page);
-				if (freepage_order < MAX_ORDER)
+				if (freepage_order <= MAX_ORDER)
 					pfn += (1UL << freepage_order) - 1;
 				continue;
 			}
@@ -549,7 +549,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 		if (PageBuddy(page)) {
 			unsigned long freepage_order = buddy_order_unsafe(page);
 
-			if (freepage_order < MAX_ORDER)
+			if (freepage_order <= MAX_ORDER)
 				pfn += (1UL << freepage_order) - 1;
 			continue;
 		}
@@ -657,7 +657,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 			if (PageBuddy(page)) {
 				unsigned long order = buddy_order_unsafe(page);
 
-				if (order > 0 && order < MAX_ORDER)
+				if (order > 0 && order <= MAX_ORDER)
 					pfn += (1UL << order) - 1;
 				continue;
 			}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 275b466de37b..b021f482a4cb 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param *
 	 * If param is set beyond this limit, order is set to default
 	 * pageblock_order value
 	 */
-	return  param_set_uint_minmax(val, kp, 0, MAX_ORDER-1);
+	return  param_set_uint_minmax(val, kp, 0, MAX_ORDER);
 }
 
 static const struct kernel_param_ops page_reporting_param_ops = {
@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 		return err;
 
 	/* Process each free list starting from lowest order/mt */
-	for (order = page_reporting_order; order < MAX_ORDER; order++) {
+	for (order = page_reporting_order; order <= MAX_ORDER; order++) {
 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 			/* We do not pull pages from the isolate free list */
 			if (is_migrate_isolate(mt))
@@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
 	 */
 
 	if (page_reporting_order == -1) {
-		if (prdev->order > 0 && prdev->order < MAX_ORDER)
+		if (prdev->order > 0 && prdev->order <= MAX_ORDER)
 			page_reporting_order = prdev->order;
 		else
 			page_reporting_order = pageblock_order;
diff --git a/mm/shuffle.h b/mm/shuffle.h
index cec62984f7d3..a6bdf54f96f1 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,7 +4,7 @@
 #define _MM_SHUFFLE_H
 #include <linux/jump_label.h>
 
-#define SHUFFLE_ORDER (MAX_ORDER-1)
+#define SHUFFLE_ORDER MAX_ORDER
 
 #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
 DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
diff --git a/mm/slab.c b/mm/slab.c
index edbe722fb906..6b7c172158e5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str)
 {
 	get_option(&str, &slab_max_order);
 	slab_max_order = slab_max_order < 0 ? 0 :
-				min(slab_max_order, MAX_ORDER - 1);
+				min(slab_max_order, MAX_ORDER);
 	slab_max_order_set = true;
 
 	return 1;
diff --git a/mm/slub.c b/mm/slub.c
index 32eb6b50fe18..f49d669ff604 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4171,8 +4171,8 @@ static inline int calculate_order(unsigned int size)
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = calc_slab_order(size, 1, MAX_ORDER - 1, 1);
-	if (order < MAX_ORDER)
+	order = calc_slab_order(size, 1, MAX_ORDER, 1);
+	if (order <= MAX_ORDER)
 		return order;
 	return -ENOSYS;
 }
@@ -4697,7 +4697,7 @@ __setup("slub_min_order=", setup_slub_min_order);
 static int __init setup_slub_max_order(char *str)
 {
 	get_option(&str, (int *)&slub_max_order);
-	slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
+	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
 
 	return 1;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8faac4310cb5..98719e72b5e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7002,7 +7002,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
 	 * Confirm they are large enough for max values.
 	 */
-	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+	BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
 	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
 	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..b7307627772d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone,
 	info->free_blocks_total = 0;
 	info->free_blocks_suitable = 0;
 
-	for (order = 0; order < MAX_ORDER; order++) {
+	for (order = 0; order <= MAX_ORDER; order++) {
 		unsigned long blocks;
 
 		/*
@@ -1088,7 +1088,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 {
 	unsigned long requested = 1UL << order;
 
-	if (WARN_ON_ONCE(order >= MAX_ORDER))
+	if (WARN_ON_ONCE(order > MAX_ORDER))
 		return 0;
 
 	if (!info->free_blocks_total)
@@ -1462,7 +1462,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 	int order;
 
 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-	for (order = 0; order < MAX_ORDER; ++order)
+	for (order = 0; order <= MAX_ORDER; ++order)
 		/*
 		 * Access to nr_free is lockless as nr_free is used only for
 		 * printing purposes. Use data_race to avoid KCSAN warning.
@@ -1491,7 +1491,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 					pgdat->node_id,
 					zone->name,
 					migratetype_names[mtype]);
-		for (order = 0; order < MAX_ORDER; ++order) {
+		for (order = 0; order <= MAX_ORDER; ++order) {
 			unsigned long freecount = 0;
 			struct free_area *area;
 			struct list_head *curr;
@@ -1531,7 +1531,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
 
 	/* Print header */
 	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
-	for (order = 0; order < MAX_ORDER; ++order)
+	for (order = 0; order <= MAX_ORDER; ++order)
 		seq_printf(m, "%6d ", order);
 	seq_putc(m, '\n');
 
@@ -2153,7 +2153,7 @@ static void unusable_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order < MAX_ORDER; ++order) {
+	for (order = 0; order <= MAX_ORDER; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = unusable_free_index(order, &info);
 		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
@@ -2205,7 +2205,7 @@ static void extfrag_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order < MAX_ORDER; ++order) {
+	for (order = 0; order <= MAX_ORDER; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = __fragmentation_index(order, &info);
 		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 854772dd52fd..9b66d6aeeb1a 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -843,7 +843,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 		goto out;
 	/* the calculated number of cq entries fits to mlx5 cq allocation */
 	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
-	smc_order = MAX_ORDER - cqe_size_order - 1;
+	smc_order = MAX_ORDER - cqe_size_order;
 	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
 		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
 	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 64499056648a..51ad29940f05 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp)
 
 	size = memparse(val, NULL);
 	order = get_order(size);
-	if (order >= MAX_ORDER)
+	if (order > MAX_ORDER)
 		return -EINVAL;
 	ima_maxorder = order;
 	ima_bufsize = PAGE_SIZE << order;
diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h
index e65f89b12f1c..134f8eab0768 100644
--- a/tools/testing/memblock/linux/mmzone.h
+++ b/tools/testing/memblock/linux/mmzone.h
@@ -17,10 +17,10 @@ enum zone_type {
 };
 
 #define MAX_NR_ZONES __MAX_NR_ZONES
-#define MAX_ORDER 11
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER 10
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
 
-#define pageblock_order		(MAX_ORDER - 1)
+#define pageblock_order		MAX_ORDER
 #define pageblock_nr_pages	BIT(pageblock_order)
 #define pageblock_align(pfn)	ALIGN((pfn), pageblock_nr_pages)
 #define pageblock_start_pfn(pfn)	ALIGN_DOWN((pfn), pageblock_nr_pages)
-- 
cgit v1.2.3


From a734991ccaec1985fff42fb26bb6d789d35defb4 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 14 Mar 2023 15:12:47 -0700
Subject: mm: userfaultfd: rename functions for clarity + consistency

Patch series "mm: userfaultfd: refactor and add UFFDIO_CONTINUE_MODE_WP",
v5.

- Commits 1-3 refactor userfaultfd ioctl code without behavior changes, with the
  main goal of improving consistency and reducing the number of function args.

- Commit 4 adds UFFDIO_CONTINUE_MODE_WP.


This patch (of 4):

The basic problem is, over time we've added new userfaultfd ioctls, and
we've refactored the code so functions which used to handle only one case
are now re-used to deal with several cases.  While this happened, we
didn't bother to rename the functions.

Similarly, as we added new functions, we cargo-culted pieces of the
now-inconsistent naming scheme, so those functions too ended up with names
that don't make a lot of sense.

A key point here is, "copy" in most userfaultfd code refers specifically
to UFFDIO_COPY, where we allocate a new page and copy its contents from
userspace.  There are many functions with "copy" in the name that don't
actually do this (at least in some cases).

So, rename things into a consistent scheme.  The high level idea is that
the call stack for userfaultfd ioctls becomes:

userfaultfd_ioctl
  -> userfaultfd_(particular ioctl)
    -> mfill_atomic_(particular kind of fill operation)
      -> mfill_atomic    /* loops over pages in range */
        -> mfill_atomic_pte    /* deals with single pages */
          -> mfill_atomic_pte_(particular kind of fill operation)
            -> mfill_atomic_install_pte

There are of course some special cases (shmem, hugetlb), but this is the
general structure which all function names now adhere to.

Link: https://lkml.kernel.org/r/20230314221250.682452-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20230314221250.682452-2-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              | 18 ++++----
 include/linux/hugetlb.h       | 30 ++++++-------
 include/linux/userfaultfd_k.h | 18 ++++----
 mm/hugetlb.c                  | 20 ++++-----
 mm/userfaultfd.c              | 98 +++++++++++++++++++++----------------------
 5 files changed, 92 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 881e9c82b9d1..4aedfd98e3f5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1756,9 +1756,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len, &ctx->mmap_changing,
-				   uffdio_copy.mode);
+		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+					uffdio_copy.len, &ctx->mmap_changing,
+					uffdio_copy.mode);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1808,9 +1808,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-				     uffdio_zeropage.range.len,
-				     &ctx->mmap_changing);
+		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+					   uffdio_zeropage.range.len,
+					   &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1918,9 +1918,9 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
-				     uffdio_continue.range.len,
-				     &ctx->mmap_changing);
+		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+					    uffdio_continue.range.len,
+					    &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8fb7d91cd0b1..152434396c48 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,13 +158,13 @@ unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
 #ifdef CONFIG_USERFAULTFD
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
-				struct vm_area_struct *dst_vma,
-				unsigned long dst_addr,
-				unsigned long src_addr,
-				enum mcopy_atomic_mode mode,
-				struct page **pagep,
-				bool wp_copy);
+int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+			     struct vm_area_struct *dst_vma,
+			     unsigned long dst_addr,
+			     unsigned long src_addr,
+			     enum mcopy_atomic_mode mode,
+			     struct page **pagep,
+			     bool wp_copy);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -393,14 +393,14 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 }
 
 #ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
-						pte_t *dst_pte,
-						struct vm_area_struct *dst_vma,
-						unsigned long dst_addr,
-						unsigned long src_addr,
-						enum mcopy_atomic_mode mode,
-						struct page **pagep,
-						bool wp_copy)
+static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
+					   pte_t *dst_pte,
+					   struct vm_area_struct *dst_vma,
+					   unsigned long dst_addr,
+					   unsigned long src_addr,
+					   enum mcopy_atomic_mode mode,
+					   struct page **pagep,
+					   bool wp_copy)
 {
 	BUG();
 	return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 0cf8880219da..ac178d810dc7 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,15 +61,15 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				    unsigned long dst_addr, struct page *page,
 				    bool newly_allocated, bool wp_copy);
 
-extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-			    unsigned long src_start, unsigned long len,
-			    atomic_t *mmap_changing, __u64 mode);
-extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
-			      unsigned long dst_start,
-			      unsigned long len,
-			      atomic_t *mmap_changing);
-extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-			      unsigned long len, atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+				 unsigned long src_start, unsigned long len,
+				 atomic_t *mmap_changing, __u64 mode);
+extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+				     unsigned long dst_start,
+				     unsigned long len,
+				     atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+				     unsigned long len, atomic_t *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9122e50ae02a..b1e474aa2fc5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6156,17 +6156,17 @@ out_mutex:
 
 #ifdef CONFIG_USERFAULTFD
 /*
- * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
- * modifications for huge pages.
+ * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
+ * with modifications for hugetlb pages.
  */
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
-			    pte_t *dst_pte,
-			    struct vm_area_struct *dst_vma,
-			    unsigned long dst_addr,
-			    unsigned long src_addr,
-			    enum mcopy_atomic_mode mode,
-			    struct page **pagep,
-			    bool wp_copy)
+int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
+			     pte_t *dst_pte,
+			     struct vm_area_struct *dst_vma,
+			     unsigned long dst_addr,
+			     unsigned long src_addr,
+			     enum mcopy_atomic_mode mode,
+			     struct page **pagep,
+			     bool wp_copy)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct hstate *h = hstate_vma(dst_vma);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 23cabd02ac52..874379ce271f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -127,13 +127,13 @@ out_unlock:
 	return ret;
 }
 
-static int mcopy_atomic_pte(struct mm_struct *dst_mm,
-			    pmd_t *dst_pmd,
-			    struct vm_area_struct *dst_vma,
-			    unsigned long dst_addr,
-			    unsigned long src_addr,
-			    struct page **pagep,
-			    bool wp_copy)
+static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
+				 pmd_t *dst_pmd,
+				 struct vm_area_struct *dst_vma,
+				 unsigned long dst_addr,
+				 unsigned long src_addr,
+				 struct page **pagep,
+				 bool wp_copy)
 {
 	void *page_kaddr;
 	int ret;
@@ -204,10 +204,10 @@ out_release:
 	goto out;
 }
 
-static int mfill_zeropage_pte(struct mm_struct *dst_mm,
-			      pmd_t *dst_pmd,
-			      struct vm_area_struct *dst_vma,
-			      unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
+				     pmd_t *dst_pmd,
+				     struct vm_area_struct *dst_vma,
+				     unsigned long dst_addr)
 {
 	pte_t _dst_pte, *dst_pte;
 	spinlock_t *ptl;
@@ -240,11 +240,11 @@ out_unlock:
 }
 
 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
-				pmd_t *dst_pmd,
-				struct vm_area_struct *dst_vma,
-				unsigned long dst_addr,
-				bool wp_copy)
+static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
+				     pmd_t *dst_pmd,
+				     struct vm_area_struct *dst_vma,
+				     unsigned long dst_addr,
+				     bool wp_copy)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -307,10 +307,10 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 
 #ifdef CONFIG_HUGETLB_PAGE
 /*
- * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
+ * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
  * called with mmap_lock held, it will release mmap_lock before returning.
  */
-static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
 					      struct vm_area_struct *dst_vma,
 					      unsigned long dst_start,
 					      unsigned long src_start,
@@ -411,7 +411,7 @@ retry:
 			goto out_unlock;
 		}
 
-		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+		err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma,
 					       dst_addr, src_addr, mode, &page,
 					       wp_copy);
 
@@ -463,7 +463,7 @@ out:
 }
 #else /* !CONFIG_HUGETLB_PAGE */
 /* fail at build time if gcc attempts to use this */
-extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
 				      struct vm_area_struct *dst_vma,
 				      unsigned long dst_start,
 				      unsigned long src_start,
@@ -484,8 +484,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	ssize_t err;
 
 	if (mode == MCOPY_ATOMIC_CONTINUE) {
-		return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-					    wp_copy);
+		return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma,
+						 dst_addr, wp_copy);
 	}
 
 	/*
@@ -500,11 +500,11 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
 		if (mode == MCOPY_ATOMIC_NORMAL)
-			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-					       dst_addr, src_addr, page,
-					       wp_copy);
+			err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma,
+						    dst_addr, src_addr, page,
+						    wp_copy);
 		else
-			err = mfill_zeropage_pte(dst_mm, dst_pmd,
+			err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
 		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
@@ -516,13 +516,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	return err;
 }
 
-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
-					      unsigned long dst_start,
-					      unsigned long src_start,
-					      unsigned long len,
-					      enum mcopy_atomic_mode mcopy_mode,
-					      atomic_t *mmap_changing,
-					      __u64 mode)
+static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+					    unsigned long dst_start,
+					    unsigned long src_start,
+					    unsigned long len,
+					    enum mcopy_atomic_mode mcopy_mode,
+					    atomic_t *mmap_changing,
+					    __u64 mode)
 {
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
@@ -588,9 +588,9 @@ retry:
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-					       src_start, len, mcopy_mode,
-					       wp_copy);
+		return  mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+					     src_start, len, mcopy_mode,
+					     wp_copy);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -688,26 +688,26 @@ out:
 	return copied ? copied : err;
 }
 
-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-		     unsigned long src_start, unsigned long len,
-		     atomic_t *mmap_changing, __u64 mode)
+ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+			  unsigned long src_start, unsigned long len,
+			  atomic_t *mmap_changing, __u64 mode)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
-			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+	return mfill_atomic(dst_mm, dst_start, src_start, len,
+			    MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
-		       unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
+			      unsigned long len, atomic_t *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
-			      mmap_changing, 0);
+	return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+			    mmap_changing, 0);
 }
 
-ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
-		       unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
+			      unsigned long len, atomic_t *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
-			      mmap_changing, 0);
+	return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+			    mmap_changing, 0);
 }
 
 long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
-- 
cgit v1.2.3


From 61c5004022f56c443b86800e8985d8803f3a22aa Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 14 Mar 2023 15:12:48 -0700
Subject: mm: userfaultfd: don't pass around both mm and vma

Quite a few userfaultfd functions took both mm and vma pointers as
arguments.  Since the mm is trivially accessible via vma->vm_mm, there's
no reason to pass both; it just needlessly extends the already long
argument list.

Get rid of the mm pointer, where possible, to shorten the argument list.

Link: https://lkml.kernel.org/r/20230314221250.682452-3-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              |  2 +-
 include/linux/hugetlb.h       |  5 ++--
 include/linux/shmem_fs.h      |  4 +--
 include/linux/userfaultfd_k.h |  4 +--
 mm/hugetlb.c                  |  4 +--
 mm/shmem.c                    |  7 +++--
 mm/userfaultfd.c              | 61 ++++++++++++++++++++-----------------------
 7 files changed, 41 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 4aedfd98e3f5..d8d432ca81e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1644,7 +1644,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
 		/* Reset ptes for the whole vma range if wr-protected */
 		if (userfaultfd_wp(vma))
-			uffd_wp_range(mm, vma, start, vma_end - start, false);
+			uffd_wp_range(vma, start, vma_end - start, false);
 
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 152434396c48..3cb7cd853fa8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,7 +158,7 @@ unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
 #ifdef CONFIG_USERFAULTFD
-int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
@@ -393,8 +393,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 }
 
 #ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
-					   pte_t *dst_pte,
+static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 					   struct vm_area_struct *dst_vma,
 					   unsigned long dst_addr,
 					   unsigned long src_addr,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 50bf82b36995..922a2b45fe6f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -152,14 +152,14 @@ extern void shmem_uncharge(struct inode *inode, long pages);
 
 #ifdef CONFIG_USERFAULTFD
 #ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 				  struct vm_area_struct *dst_vma,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
 				  bool zeropage, bool wp_copy,
 				  struct page **pagep);
 #else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
 			       src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
 #endif /* CONFIG_SHMEM */
 #endif /* CONFIG_USERFAULTFD */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac178d810dc7..9458cd94a508 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -56,7 +56,7 @@ enum mcopy_atomic_mode {
 	MCOPY_ATOMIC_CONTINUE,
 };
 
-extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
 				    unsigned long dst_addr, struct page *page,
 				    bool newly_allocated, bool wp_copy);
@@ -73,7 +73,7 @@ extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
-extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1e474aa2fc5..6dc32cccbd9b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6159,8 +6159,7 @@ out_mutex:
  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.
  */
-int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
-			     pte_t *dst_pte,
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
@@ -6168,6 +6167,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
 			     struct page **pagep,
 			     bool wp_copy)
 {
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct hstate *h = hstate_vma(dst_vma);
 	struct address_space *mapping = dst_vma->vm_file->f_mapping;
diff --git a/mm/shmem.c b/mm/shmem.c
index fa6e38f2f55f..9d13b9a64144 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2429,8 +2429,7 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
 }
 
 #ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
-			   pmd_t *dst_pmd,
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   struct vm_area_struct *dst_vma,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
@@ -2520,11 +2519,11 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release;
 
 	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
-				      gfp & GFP_RECLAIM_MASK, dst_mm);
+				      gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
 	if (ret)
 		goto out_release;
 
-	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
 				       &folio->page, true, wp_copy);
 	if (ret)
 		goto out_delete_from_cache;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 874379ce271f..c3cc6cb04548 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -55,12 +55,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
  * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
  * and anon, and for both shared and private VMAs.
  */
-int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+int mfill_atomic_install_pte(pmd_t *dst_pmd,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr, struct page *page,
 			     bool newly_allocated, bool wp_copy)
 {
 	int ret;
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	pte_t _dst_pte, *dst_pte;
 	bool writable = dst_vma->vm_flags & VM_WRITE;
 	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -127,8 +128,7 @@ out_unlock:
 	return ret;
 }
 
-static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
-				 pmd_t *dst_pmd,
+static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 				 struct vm_area_struct *dst_vma,
 				 unsigned long dst_addr,
 				 unsigned long src_addr,
@@ -190,10 +190,10 @@ static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 
 	ret = -ENOMEM;
-	if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
+	if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL))
 		goto out_release;
 
-	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
 				       page, true, wp_copy);
 	if (ret)
 		goto out_release;
@@ -204,8 +204,7 @@ out_release:
 	goto out;
 }
 
-static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
-				     pmd_t *dst_pmd,
+static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr)
 {
@@ -217,7 +216,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
 
 	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
 					 dst_vma->vm_page_prot));
-	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
 	if (dst_vma->vm_file) {
 		/* the shmem MAP_PRIVATE case requires checking the i_size */
 		inode = dst_vma->vm_file->f_inode;
@@ -230,7 +229,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
 	ret = -EEXIST;
 	if (!pte_none(*dst_pte))
 		goto out_unlock;
-	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 	ret = 0;
@@ -240,8 +239,7 @@ out_unlock:
 }
 
 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
-				     pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr,
 				     bool wp_copy)
@@ -269,7 +267,7 @@ static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
 		goto out_release;
 	}
 
-	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
 				       page, false, wp_copy);
 	if (ret)
 		goto out_release;
@@ -310,7 +308,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
  * called with mmap_lock held, it will release mmap_lock before returning.
  */
-static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(
 					      struct vm_area_struct *dst_vma,
 					      unsigned long dst_start,
 					      unsigned long src_start,
@@ -318,6 +316,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
 					      enum mcopy_atomic_mode mode,
 					      bool wp_copy)
 {
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
@@ -411,7 +410,7 @@ retry:
 			goto out_unlock;
 		}
 
-		err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma,
+		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma,
 					       dst_addr, src_addr, mode, &page,
 					       wp_copy);
 
@@ -463,17 +462,15 @@ out:
 }
 #else /* !CONFIG_HUGETLB_PAGE */
 /* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
-				      struct vm_area_struct *dst_vma,
-				      unsigned long dst_start,
-				      unsigned long src_start,
-				      unsigned long len,
-				      enum mcopy_atomic_mode mode,
-				      bool wp_copy);
+extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+				    unsigned long dst_start,
+				    unsigned long src_start,
+				    unsigned long len,
+				    enum mcopy_atomic_mode mode,
+				    bool wp_copy);
 #endif /* CONFIG_HUGETLB_PAGE */
 
-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
-						pmd_t *dst_pmd,
+static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 						struct vm_area_struct *dst_vma,
 						unsigned long dst_addr,
 						unsigned long src_addr,
@@ -484,7 +481,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	ssize_t err;
 
 	if (mode == MCOPY_ATOMIC_CONTINUE) {
-		return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma,
+		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
 						 dst_addr, wp_copy);
 	}
 
@@ -500,14 +497,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
 		if (mode == MCOPY_ATOMIC_NORMAL)
-			err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma,
+			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
 						    dst_addr, src_addr, page,
 						    wp_copy);
 		else
-			err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd,
+			err = mfill_atomic_pte_zeropage(dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
-		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
 					     dst_addr, src_addr,
 					     mode != MCOPY_ATOMIC_NORMAL,
 					     wp_copy, page);
@@ -588,7 +585,7 @@ retry:
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return  mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+		return  mfill_atomic_hugetlb(dst_vma, dst_start,
 					     src_start, len, mcopy_mode,
 					     wp_copy);
 
@@ -641,7 +638,7 @@ retry:
 		BUG_ON(pmd_none(*dst_pmd));
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
-		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
 				       src_addr, &page, mcopy_mode, wp_copy);
 		cond_resched();
 
@@ -710,7 +707,7 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
 			    mmap_changing, 0);
 }
 
-long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct vm_area_struct *dst_vma,
 		   unsigned long start, unsigned long len, bool enable_wp)
 {
 	unsigned int mm_cp_flags;
@@ -732,7 +729,7 @@ long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
 	 */
 	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
 		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
-	tlb_gather_mmu(&tlb, dst_mm);
+	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
 	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
 	tlb_finish_mmu(&tlb);
 
@@ -788,7 +785,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 		_start = max(dst_vma->vm_start, start);
 		_end = min(dst_vma->vm_end, end);
 
-		err = uffd_wp_range(dst_mm, dst_vma, _start, _end - _start, enable_wp);
+		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
 
 		/* Return 0 on success, <0 on failures */
 		if (err < 0)
-- 
cgit v1.2.3


From d9712937037e0ce887920f321429826e9dbfd960 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 14 Mar 2023 15:12:49 -0700
Subject: mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy'
argument.  In future commits we plan to plumb the flags through to more
places, so we'd be proliferating the very long argument list even further.

Let's take the time to simplify the argument list.  Combine the two
arguments into one - and generalize, so when we add more flags in the
future, it doesn't imply more function arguments.

Since the modes (copy, zeropage, continue) are mutually exclusive, store
them as an integer value (0, 1, 2) in the low bits.  Place combine-able
flag bits in the high bits.

This is quite similar to an earlier patch proposed by Nadav Amit
("userfaultfd: introduce uffd_flags" [1]).  The main difference is that
patch only handled flags, whereas this patch *also* combines the "mode"
argument into the same type to shorten the argument list.

[1]: https://lore.kernel.org/all/20220619233449.181323-2-namit@vmware.com/

Link: https://lkml.kernel.org/r/20230314221250.682452-4-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: James Houghton <jthoughton@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              |  5 ++-
 include/linux/hugetlb.h       | 10 +++---
 include/linux/shmem_fs.h      |  5 +--
 include/linux/userfaultfd_k.h | 46 +++++++++++++++++---------
 mm/hugetlb.c                  | 12 +++----
 mm/shmem.c                    |  7 ++--
 mm/userfaultfd.c              | 76 +++++++++++++++++++------------------------
 7 files changed, 84 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d8d432ca81e6..8971c3613cc6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1729,6 +1729,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	struct uffdio_copy uffdio_copy;
 	struct uffdio_copy __user *user_uffdio_copy;
 	struct userfaultfd_wake_range range;
+	uffd_flags_t flags = 0;
 
 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
 
@@ -1755,10 +1756,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
 		goto out;
+	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+		flags |= MFILL_ATOMIC_WP;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
 					uffdio_copy.len, &ctx->mmap_changing,
-					uffdio_copy.mode);
+					flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3cb7cd853fa8..2a758bcd6719 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -162,9 +162,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
-			     enum mcopy_atomic_mode mode,
-			     struct page **pagep,
-			     bool wp_copy);
+			     uffd_flags_t flags,
+			     struct page **pagep);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -397,9 +396,8 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 					   struct vm_area_struct *dst_vma,
 					   unsigned long dst_addr,
 					   unsigned long src_addr,
-					   enum mcopy_atomic_mode mode,
-					   struct page **pagep,
-					   bool wp_copy)
+					   uffd_flags_t flags,
+					   struct page **pagep)
 {
 	BUG();
 	return 0;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 922a2b45fe6f..3bb8d21edbb3 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/xattr.h>
 #include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>
 
 /* inode in-kernel data */
 
@@ -156,11 +157,11 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 				  struct vm_area_struct *dst_vma,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
-				  bool zeropage, bool wp_copy,
+				  uffd_flags_t flags,
 				  struct page **pagep);
 #else /* !CONFIG_SHMEM */
 #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
-			       src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
+			       src_addr, flags, pagep) ({ BUG(); 0; })
 #endif /* CONFIG_SHMEM */
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 9458cd94a508..4c477dece540 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -40,30 +40,44 @@ extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
-/*
- * The mode of operation for __mcopy_atomic and its helpers.
- *
- * This is almost an implementation detail (mcopy_atomic below doesn't take this
- * as a parameter), but it's exposed here because memory-kind-specific
- * implementations (e.g. hugetlbfs) need to know the mode of operation.
- */
-enum mcopy_atomic_mode {
-	/* A normal copy_from_user into the destination range. */
-	MCOPY_ATOMIC_NORMAL,
-	/* Don't copy; map the destination range to the zero page. */
-	MCOPY_ATOMIC_ZEROPAGE,
-	/* Just install pte(s) with the existing page(s) in the page cache. */
-	MCOPY_ATOMIC_CONTINUE,
+/* A combined operation mode + behavior flags. */
+typedef unsigned int __bitwise uffd_flags_t;
+
+/* Mutually exclusive modes of operation. */
+enum mfill_atomic_mode {
+	MFILL_ATOMIC_COPY,
+	MFILL_ATOMIC_ZEROPAGE,
+	MFILL_ATOMIC_CONTINUE,
+	NR_MFILL_ATOMIC_MODES,
 };
 
+#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
+#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
+#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
+#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
+
+static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
+{
+	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+}
+
+static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
+{
+	flags &= ~MFILL_ATOMIC_MODE_MASK;
+	return flags | ((__force uffd_flags_t) mode);
+}
+
+/* Flags controlling behavior. These behavior changes are mode-independent. */
+#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
+
 extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
 				    unsigned long dst_addr, struct page *page,
-				    bool newly_allocated, bool wp_copy);
+				    bool newly_allocated, uffd_flags_t flags);
 
 extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
 				 unsigned long src_start, unsigned long len,
-				 atomic_t *mmap_changing, __u64 mode);
+				 atomic_t *mmap_changing, uffd_flags_t flags);
 extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
 				     unsigned long dst_start,
 				     unsigned long len,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6dc32cccbd9b..8bfd07f4c143 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6163,12 +6163,12 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
-			     enum mcopy_atomic_mode mode,
-			     struct page **pagep,
-			     bool wp_copy)
+			     uffd_flags_t flags,
+			     struct page **pagep)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
-	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
+	bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
+	bool wp_enabled = (flags & MFILL_ATOMIC_WP);
 	struct hstate *h = hstate_vma(dst_vma);
 	struct address_space *mapping = dst_vma->vm_file->f_mapping;
 	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -6303,7 +6303,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 	 * with wp flag set, don't set pte write bit.
 	 */
-	if (wp_copy || (is_continue && !vm_shared))
+	if (wp_enabled || (is_continue && !vm_shared))
 		writable = 0;
 	else
 		writable = dst_vma->vm_flags & VM_WRITE;
@@ -6318,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	_dst_pte = huge_pte_mkdirty(_dst_pte);
 	_dst_pte = pte_mkyoung(_dst_pte);
 
-	if (wp_copy)
+	if (wp_enabled)
 		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 
 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d13b9a64144..b185c1db3009 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -76,7 +76,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/rmap.h>
 #include <linux/uuid.h>
 
@@ -2433,7 +2432,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   struct vm_area_struct *dst_vma,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
-			   bool zeropage, bool wp_copy,
+			   uffd_flags_t flags,
 			   struct page **pagep)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
@@ -2465,7 +2464,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 		if (!folio)
 			goto out_unacct_blocks;
 
-		if (!zeropage) {	/* COPY */
+		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
 			page_kaddr = kmap_local_folio(folio, 0);
 			/*
 			 * The read mmap_lock is held here.  Despite the
@@ -2524,7 +2523,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 		goto out_release;
 
 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
-				       &folio->page, true, wp_copy);
+				       &folio->page, true, flags);
 	if (ret)
 		goto out_delete_from_cache;
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c3cc6cb04548..a9b19b39413d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -58,7 +58,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 int mfill_atomic_install_pte(pmd_t *dst_pmd,
 			     struct vm_area_struct *dst_vma,
 			     unsigned long dst_addr, struct page *page,
-			     bool newly_allocated, bool wp_copy)
+			     bool newly_allocated, uffd_flags_t flags)
 {
 	int ret;
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -77,7 +77,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 		writable = false;
 	if (writable)
 		_dst_pte = pte_mkwrite(_dst_pte);
-	if (wp_copy)
+	if (flags & MFILL_ATOMIC_WP)
 		_dst_pte = pte_mkuffd_wp(_dst_pte);
 
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -132,8 +132,8 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 				 struct vm_area_struct *dst_vma,
 				 unsigned long dst_addr,
 				 unsigned long src_addr,
-				 struct page **pagep,
-				 bool wp_copy)
+				 uffd_flags_t flags,
+				 struct page **pagep)
 {
 	void *page_kaddr;
 	int ret;
@@ -194,7 +194,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 		goto out_release;
 
 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
-				       page, true, wp_copy);
+				       page, true, flags);
 	if (ret)
 		goto out_release;
 out:
@@ -242,7 +242,7 @@ out_unlock:
 static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr,
-				     bool wp_copy)
+				     uffd_flags_t flags)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -268,7 +268,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 	}
 
 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
-				       page, false, wp_copy);
+				       page, false, flags);
 	if (ret)
 		goto out_release;
 
@@ -313,8 +313,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      enum mcopy_atomic_mode mode,
-					      bool wp_copy)
+					      uffd_flags_t flags)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -334,7 +333,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	 * by THP.  Since we can not reliably insert a zero page, this
 	 * feature is not supported.
 	 */
-	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
+	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -402,7 +401,7 @@ retry:
 			goto out_unlock;
 		}
 
-		if (mode != MCOPY_ATOMIC_CONTINUE &&
+		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
 		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
 			err = -EEXIST;
 			hugetlb_vma_unlock_read(dst_vma);
@@ -410,9 +409,8 @@ retry:
 			goto out_unlock;
 		}
 
-		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma,
-					       dst_addr, src_addr, mode, &page,
-					       wp_copy);
+		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
+					       src_addr, flags, &page);
 
 		hugetlb_vma_unlock_read(dst_vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -466,23 +464,21 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
 				    unsigned long dst_start,
 				    unsigned long src_start,
 				    unsigned long len,
-				    enum mcopy_atomic_mode mode,
-				    bool wp_copy);
+				    uffd_flags_t flags);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 						struct vm_area_struct *dst_vma,
 						unsigned long dst_addr,
 						unsigned long src_addr,
-						struct page **page,
-						enum mcopy_atomic_mode mode,
-						bool wp_copy)
+						uffd_flags_t flags,
+						struct page **pagep)
 {
 	ssize_t err;
 
-	if (mode == MCOPY_ATOMIC_CONTINUE) {
+	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
 		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
-						 dst_addr, wp_copy);
+						 dst_addr, flags);
 	}
 
 	/*
@@ -496,18 +492,17 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	 * and not in the radix tree.
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
-		if (mode == MCOPY_ATOMIC_NORMAL)
+		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
 			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
-						    dst_addr, src_addr, page,
-						    wp_copy);
+						    dst_addr, src_addr,
+						    flags, pagep);
 		else
 			err = mfill_atomic_pte_zeropage(dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
 		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
 					     dst_addr, src_addr,
-					     mode != MCOPY_ATOMIC_NORMAL,
-					     wp_copy, page);
+					     flags, pagep);
 	}
 
 	return err;
@@ -517,9 +512,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 					    unsigned long dst_start,
 					    unsigned long src_start,
 					    unsigned long len,
-					    enum mcopy_atomic_mode mcopy_mode,
 					    atomic_t *mmap_changing,
-					    __u64 mode)
+					    uffd_flags_t flags)
 {
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
@@ -527,7 +521,6 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	unsigned long src_addr, dst_addr;
 	long copied;
 	struct page *page;
-	bool wp_copy;
 
 	/*
 	 * Sanitize the command parameters:
@@ -577,8 +570,7 @@ retry:
 	 * validate 'mode' now that we know the dst_vma: don't allow
 	 * a wrprotect copy if the userfaultfd didn't register as WP.
 	 */
-	wp_copy = mode & UFFDIO_COPY_MODE_WP;
-	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
 		goto out_unlock;
 
 	/*
@@ -586,12 +578,12 @@ retry:
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  mfill_atomic_hugetlb(dst_vma, dst_start,
-					     src_start, len, mcopy_mode,
-					     wp_copy);
+					     src_start, len, flags);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+	if (!vma_is_shmem(dst_vma) &&
+	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
 	/*
@@ -639,7 +631,7 @@ retry:
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, mcopy_mode, wp_copy);
+				       src_addr, flags, &page);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
@@ -687,24 +679,24 @@ out:
 
 ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
 			  unsigned long src_start, unsigned long len,
-			  atomic_t *mmap_changing, __u64 mode)
+			  atomic_t *mmap_changing, uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, dst_start, src_start, len,
-			    MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
 }
 
 ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
 			      unsigned long len, atomic_t *mmap_changing)
 {
-	return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
-			    mmap_changing, 0);
+	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
 }
 
 ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
 			      unsigned long len, atomic_t *mmap_changing)
 {
-	return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
-			    mmap_changing, 0);
+	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+			    uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE));
 }
 
 long uffd_wp_range(struct vm_area_struct *dst_vma,
-- 
cgit v1.2.3


From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 14 Mar 2023 15:12:50 -0700
Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs

UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE
to resolve a missing fault, one can install a write-protected one.  This
is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination.

This was motivated by testing HugeTLB HGM [1], and in particular its
interaction with userfaultfd features.  Existing userfaultfd code supports
using WP and MINOR modes together (i.e.  you can register an area with
both enabled), but without this CONTINUE flag the combination is in
practice unusable.

So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as
UFFDIO_COPY_MODE_WP, but for *minor* faults.

Update the selftest to do some very basic exercising of the new flag.

Update Documentation/ to describe how these flags are used (neither the
COPY nor the new CONTINUE versions of this mode flag were described there
before).

[1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/

Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/userfaultfd.rst | 8 ++++++++
 fs/userfaultfd.c                             | 8 ++++++--
 include/linux/userfaultfd_k.h                | 3 ++-
 include/uapi/linux/userfaultfd.h             | 7 +++++++
 mm/userfaultfd.c                             | 5 +++--
 tools/testing/selftests/mm/userfaultfd.c     | 4 ++++
 6 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index bd2226299583..7c304e432205 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
 and set the feature bit in advance to make sure none ptes will also be
 write protected even upon anonymous memory.
 
+When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either
+``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when
+resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE``
+respectively, it may be desirable for the new page / mapping to be
+write-protected (so future writes will also result in a WP fault). These ioctls
+support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
+respectively) to configure the mapping this way.
+
 QEMU/KVM
 ========
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8971c3613cc6..8395605790f6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	struct uffdio_continue uffdio_continue;
 	struct uffdio_continue __user *user_uffdio_continue;
 	struct userfaultfd_wake_range range;
+	uffd_flags_t flags = 0;
 
 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
 
@@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	    uffdio_continue.range.start) {
 		goto out;
 	}
-	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+				     UFFDIO_CONTINUE_MODE_WP))
 		goto out;
+	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+		flags |= MFILL_ATOMIC_WP;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
 					    uffdio_continue.range.len,
-					    &ctx->mmap_changing);
+					    &ctx->mmap_changing, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 4c477dece540..a2c53e98dfd6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
 				     unsigned long len,
 				     atomic_t *mmap_changing);
 extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-				     unsigned long len, atomic_t *mmap_changing);
+				     unsigned long len, atomic_t *mmap_changing,
+				     uffd_flags_t flags);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 90c958952bfc..66dd4cd277bd 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -305,6 +305,13 @@ struct uffdio_writeprotect {
 struct uffdio_continue {
 	struct uffdio_range range;
 #define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
+	/*
+	 * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
+	 * the fly.  UFFDIO_CONTINUE_MODE_WP is available only if the
+	 * write protected ioctl is implemented for the range
+	 * according to the uffdio_register.ioctls.
+	 */
+#define UFFDIO_CONTINUE_MODE_WP			((__u64)1<<1)
 	__u64 mode;
 
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a9b19b39413d..7f1b5f8b712c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
 }
 
 ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
-			      unsigned long len, atomic_t *mmap_changing)
+			      unsigned long len, atomic_t *mmap_changing,
+			      uffd_flags_t flags)
 {
 	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
-			    uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE));
+			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
 }
 
 long uffd_wp_range(struct vm_area_struct *dst_vma,
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
index e030d63c031a..a96d126cb40e 100644
--- a/tools/testing/selftests/mm/userfaultfd.c
+++ b/tools/testing/selftests/mm/userfaultfd.c
@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len)
 	req.range.start = start;
 	req.range.len = len;
 	req.mode = 0;
+	if (test_uffdio_wp)
+		req.mode |= UFFDIO_CONTINUE_MODE_WP;
 
 	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
 		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void)
 	uffdio_register.range.start = (unsigned long)area_dst_alias;
 	uffdio_register.range.len = nr_pages * page_size;
 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
 		err("register failure");
 
-- 
cgit v1.2.3


From 5d671eb4ef7e1423a78fc0c12dd419c1dc1f6967 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 19 Mar 2023 13:42:14 +0200
Subject: mm: move get_page_from_free_area() to mm/page_alloc.c

The get_page_from_free_area() helper is only used in mm/page_alloc.c so
move it there to reduce noise in include/linux/mmzone.h

Link: https://lkml.kernel.org/r/20230319114214.2133332-1-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 7 -------
 mm/page_alloc.c        | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 35b11cc210d9..2d3d78d01283 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -108,13 +108,6 @@ struct free_area {
 	unsigned long		nr_free;
 };
 
-static inline struct page *get_page_from_free_area(struct free_area *area,
-					    int migratetype)
-{
-	return list_first_entry_or_null(&area->free_list[migratetype],
-					struct page, lru);
-}
-
 static inline bool free_area_empty(struct free_area *area, int migratetype)
 {
 	return list_empty(&area->free_list[migratetype]);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3e49d028a7a..4b09711b6f0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,6 +1048,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 	zone->free_area[order].nr_free--;
 }
 
+static inline struct page *get_page_from_free_area(struct free_area *area,
+					    int migratetype)
+{
+	return list_first_entry_or_null(&area->free_list[migratetype],
+					struct page, lru);
+}
+
 /*
  * If this is not the largest possible page, check if the buddy
  * of the next-highest order is free. If it is, it's possible
-- 
cgit v1.2.3


From 9420f89db2dd611c5b436a13e13f74d65ecc3a6a Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:02 +0200
Subject: mm: move most of core MM initialization to mm/mm_init.c

The bulk of memory management initialization code is spread all over
mm/page_alloc.c and makes navigating through page allocator functionality
difficult.

Move most of the functions marked __init and __meminit to mm/mm_init.c to
make it better localized and allow some more spare room before
mm/page_alloc.c reaches 10k lines.

No functional changes.

Link: https://lkml.kernel.org/r/20230321170513.2401534-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h |    5 -
 mm/cma.c            |    1 +
 mm/internal.h       |   38 +-
 mm/mm_init.c        | 2304 +++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c     | 2866 +++++----------------------------------------------
 5 files changed, 2614 insertions(+), 2600 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 65a78773dcca..7c554e4bd49f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -361,9 +361,4 @@ extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
 #endif
 void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
-#ifdef CONFIG_CMA
-/* CMA stuff */
-extern void init_cma_reserved_pageblock(struct page *page);
-#endif
-
 #endif /* __LINUX_GFP_H */
diff --git a/mm/cma.c b/mm/cma.c
index a7263aa02c92..6268d6620254 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
 #include <linux/kmemleak.h>
 #include <trace/events/cma.h>
 
+#include "internal.h"
 #include "cma.h"
 
 struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/internal.h b/mm/internal.h
index 2a7ffd9962c4..22f1410a0ee3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -202,6 +202,8 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  * in mm/page_alloc.c
  */
 
+extern char * const zone_names[MAX_NR_ZONES];
+
 /*
  * Structure for holding the mostly immutable allocation parameters passed
  * between functions involved in allocations, including the alloc_pages*
@@ -366,7 +368,29 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
 					unsigned int order);
 extern void __free_pages_core(struct page *page, unsigned int order);
+
+static inline void prep_compound_head(struct page *page, unsigned int order)
+{
+	struct folio *folio = (struct folio *)page;
+
+	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+	set_compound_order(page, order);
+	atomic_set(&folio->_entire_mapcount, -1);
+	atomic_set(&folio->_nr_pages_mapped, 0);
+	atomic_set(&folio->_pincount, 0);
+}
+
+static inline void prep_compound_tail(struct page *head, int tail_idx)
+{
+	struct page *p = head + tail_idx;
+
+	p->mapping = TAIL_MAPPING;
+	set_compound_head(p, head);
+	set_page_private(p, 0);
+}
+
 extern void prep_compound_page(struct page *page, unsigned int order);
+
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
@@ -377,6 +401,7 @@ extern void free_unref_page_list(struct list_head *list);
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
+extern void zone_pcp_init(struct zone *zone);
 
 extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 			  phys_addr_t min_addr,
@@ -474,7 +499,12 @@ isolate_migratepages_range(struct compact_control *cc,
 
 int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end);
-#endif
+
+/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+void init_cma_reserved_pageblock(struct page *page);
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+
 int find_suitable_fallback(struct free_area *area, unsigned int order,
 			int migratetype, bool only_stealable, bool *can_steal);
 
@@ -658,6 +688,12 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 #endif /* !CONFIG_MMU */
 
 /* Memory initialisation debug and verification */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+DECLARE_STATIC_KEY_TRUE(deferred_pages);
+
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
 enum mminit_level {
 	MMINIT_WARNING,
 	MMINIT_VERIFY,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c1883362e71d..68d0187c7886 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,7 +14,14 @@
 #include <linux/notifier.h>
 #include <linux/sched.h>
 #include <linux/mman.h>
+#include <linux/memblock.h>
+#include <linux/page-isolation.h>
+#include <linux/padata.h>
+#include <linux/nmi.h>
+#include <linux/buffer_head.h>
+#include <linux/kmemleak.h>
 #include "internal.h"
+#include "shuffle.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
 int __meminitdata mminit_loglevel;
@@ -198,3 +205,2300 @@ static int __init mm_sysfs_init(void)
 	return 0;
 }
 postcore_initcall(mm_sysfs_init);
+
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
+
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
+
+bool deferred_struct_pages __meminitdata;
+
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+				     unsigned long *percent)
+{
+	unsigned long long coremem;
+	char *endptr;
+
+	if (!p)
+		return -EINVAL;
+
+	/* Value may be a percentage of total memory, otherwise bytes */
+	coremem = simple_strtoull(p, &endptr, 0);
+	if (*endptr == '%') {
+		/* Paranoid check for percent values greater than 100 */
+		WARN_ON(coremem > 100);
+
+		*percent = coremem;
+	} else {
+		coremem = memparse(p, &p);
+		/* Paranoid check that UL is enough for the coremem value */
+		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+		*core = coremem >> PAGE_SHIFT;
+		*percent = 0UL;
+	}
+	return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+	/* parse kernelcore=mirror */
+	if (parse_option_str(p, "mirror")) {
+		mirrored_kernelcore = true;
+		return 0;
+	}
+
+	return cmdline_parse_core(p, &required_kernelcore,
+				  &required_kernelcore_percent);
+}
+early_param("kernelcore", cmdline_parse_kernelcore);
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+	return cmdline_parse_core(p, &required_movablecore,
+				  &required_movablecore_percent);
+}
+early_param("movablecore", cmdline_parse_movablecore);
+
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_MEMORY for calculating usable_nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+	unsigned long totalpages = 0;
+	unsigned long start_pfn, end_pfn;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		unsigned long pages = end_pfn - start_pfn;
+
+		totalpages += pages;
+		if (pages)
+			node_set_state(nid, N_MEMORY);
+	}
+	return totalpages;
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+	int zone_index;
+	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+		if (zone_index == ZONE_MOVABLE)
+			continue;
+
+		if (arch_zone_highest_possible_pfn[zone_index] >
+				arch_zone_lowest_possible_pfn[zone_index])
+			break;
+	}
+
+	VM_BUG_ON(zone_index == -1);
+	movable_zone = zone_index;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+static void __init find_zone_movable_pfns_for_nodes(void)
+{
+	int i, nid;
+	unsigned long usable_startpfn;
+	unsigned long kernelcore_node, kernelcore_remaining;
+	/* save the state before borrow the nodemask */
+	nodemask_t saved_node_state = node_states[N_MEMORY];
+	unsigned long totalpages = early_calculate_totalpages();
+	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+	struct memblock_region *r;
+
+	/* Need to find movable_zone earlier when movable_node is specified. */
+	find_usable_zone_for_movable();
+
+	/*
+	 * If movable_node is specified, ignore kernelcore and movablecore
+	 * options.
+	 */
+	if (movable_node_is_enabled()) {
+		for_each_mem_region(r) {
+			if (!memblock_is_hotpluggable(r))
+				continue;
+
+			nid = memblock_get_region_node(r);
+
+			usable_startpfn = PFN_DOWN(r->base);
+			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+				min(usable_startpfn, zone_movable_pfn[nid]) :
+				usable_startpfn;
+		}
+
+		goto out2;
+	}
+
+	/*
+	 * If kernelcore=mirror is specified, ignore movablecore option
+	 */
+	if (mirrored_kernelcore) {
+		bool mem_below_4gb_not_mirrored = false;
+
+		for_each_mem_region(r) {
+			if (memblock_is_mirror(r))
+				continue;
+
+			nid = memblock_get_region_node(r);
+
+			usable_startpfn = memblock_region_memory_base_pfn(r);
+
+			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
+				mem_below_4gb_not_mirrored = true;
+				continue;
+			}
+
+			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+				min(usable_startpfn, zone_movable_pfn[nid]) :
+				usable_startpfn;
+		}
+
+		if (mem_below_4gb_not_mirrored)
+			pr_warn("This configuration results in unmirrored kernel memory.\n");
+
+		goto out2;
+	}
+
+	/*
+	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+	 * amount of necessary memory.
+	 */
+	if (required_kernelcore_percent)
+		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+				       10000UL;
+	if (required_movablecore_percent)
+		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+					10000UL;
+
+	/*
+	 * If movablecore= was specified, calculate what size of
+	 * kernelcore that corresponds so that memory usable for
+	 * any allocation type is evenly spread. If both kernelcore
+	 * and movablecore are specified, then the value of kernelcore
+	 * will be used for required_kernelcore if it's greater than
+	 * what movablecore would have allowed.
+	 */
+	if (required_movablecore) {
+		unsigned long corepages;
+
+		/*
+		 * Round-up so that ZONE_MOVABLE is at least as large as what
+		 * was requested by the user
+		 */
+		required_movablecore =
+			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+		required_movablecore = min(totalpages, required_movablecore);
+		corepages = totalpages - required_movablecore;
+
+		required_kernelcore = max(required_kernelcore, corepages);
+	}
+
+	/*
+	 * If kernelcore was not specified or kernelcore size is larger
+	 * than totalpages, there is no ZONE_MOVABLE.
+	 */
+	if (!required_kernelcore || required_kernelcore >= totalpages)
+		goto out;
+
+	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+	/* Spread kernelcore memory as evenly as possible throughout nodes */
+	kernelcore_node = required_kernelcore / usable_nodes;
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		/*
+		 * Recalculate kernelcore_node if the division per node
+		 * now exceeds what is necessary to satisfy the requested
+		 * amount of memory for the kernel
+		 */
+		if (required_kernelcore < kernelcore_node)
+			kernelcore_node = required_kernelcore / usable_nodes;
+
+		/*
+		 * As the map is walked, we track how much memory is usable
+		 * by the kernel using kernelcore_remaining. When it is
+		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 */
+		kernelcore_remaining = kernelcore_node;
+
+		/* Go through each range of PFNs within this node */
+		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+			unsigned long size_pages;
+
+			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
+			if (start_pfn >= end_pfn)
+				continue;
+
+			/* Account for what is only usable for kernelcore */
+			if (start_pfn < usable_startpfn) {
+				unsigned long kernel_pages;
+				kernel_pages = min(end_pfn, usable_startpfn)
+								- start_pfn;
+
+				kernelcore_remaining -= min(kernel_pages,
+							kernelcore_remaining);
+				required_kernelcore -= min(kernel_pages,
+							required_kernelcore);
+
+				/* Continue if range is now fully accounted */
+				if (end_pfn <= usable_startpfn) {
+
+					/*
+					 * Push zone_movable_pfn to the end so
+					 * that if we have to rebalance
+					 * kernelcore across nodes, we will
+					 * not double account here
+					 */
+					zone_movable_pfn[nid] = end_pfn;
+					continue;
+				}
+				start_pfn = usable_startpfn;
+			}
+
+			/*
+			 * The usable PFN range for ZONE_MOVABLE is from
+			 * start_pfn->end_pfn. Calculate size_pages as the
+			 * number of pages used as kernelcore
+			 */
+			size_pages = end_pfn - start_pfn;
+			if (size_pages > kernelcore_remaining)
+				size_pages = kernelcore_remaining;
+			zone_movable_pfn[nid] = start_pfn + size_pages;
+
+			/*
+			 * Some kernelcore has been met, update counts and
+			 * break if the kernelcore for this node has been
+			 * satisfied
+			 */
+			required_kernelcore -= min(required_kernelcore,
+								size_pages);
+			kernelcore_remaining -= size_pages;
+			if (!kernelcore_remaining)
+				break;
+		}
+	}
+
+	/*
+	 * If there is still required_kernelcore, we do another pass with one
+	 * less node in the count. This will push zone_movable_pfn[nid] further
+	 * along on the nodes that still have memory until kernelcore is
+	 * satisfied
+	 */
+	usable_nodes--;
+	if (usable_nodes && required_kernelcore > usable_nodes)
+		goto restart;
+
+out2:
+	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		unsigned long start_pfn, end_pfn;
+
+		zone_movable_pfn[nid] =
+			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+		if (zone_movable_pfn[nid] >= end_pfn)
+			zone_movable_pfn[nid] = 0;
+	}
+
+out:
+	/* restore the node_state */
+	node_states[N_MEMORY] = saved_node_state;
+}
+
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+				unsigned long zone, int nid)
+{
+	mm_zero_struct_page(page);
+	set_page_links(page, zone, nid, pfn);
+	init_page_count(page);
+	page_mapcount_reset(page);
+	page_cpupid_reset_last(page);
+	page_kasan_tag_reset(page);
+
+	INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+	if (!is_highmem_idx(zone))
+		set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * treats start/end as pfns.
+ */
+struct mminit_pfnnid_cache {
+	unsigned long last_start;
+	unsigned long last_end;
+	int last_nid;
+};
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+static int __meminit __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state)
+{
+	unsigned long start_pfn, end_pfn;
+	int nid;
+
+	if (state->last_start <= pfn && pfn < state->last_end)
+		return state->last_nid;
+
+	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+	if (nid != NUMA_NO_NODE) {
+		state->last_start = start_pfn;
+		state->last_end = end_pfn;
+		state->last_nid = nid;
+	}
+
+	return nid;
+}
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	static DEFINE_SPINLOCK(early_pfn_lock);
+	int nid;
+
+	spin_lock(&early_pfn_lock);
+	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+	if (nid < 0)
+		nid = first_online_node;
+	spin_unlock(&early_pfn_lock);
+
+	return nid;
+}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+	pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is initialised */
+static inline bool __meminit early_page_initialised(unsigned long pfn)
+{
+	int nid = early_pfn_to_nid(pfn);
+
+	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
+		return false;
+
+	return true;
+}
+
+/*
+ * Returns true when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+	static unsigned long prev_end_pfn, nr_initialised;
+
+	if (early_page_ext_enabled())
+		return false;
+	/*
+	 * prev_end_pfn static that contains the end of previous zone
+	 * No need to protect because called very early in boot before smp_init.
+	 */
+	if (prev_end_pfn != end_pfn) {
+		prev_end_pfn = end_pfn;
+		nr_initialised = 0;
+	}
+
+	/* Always populate low zones for address-constrained allocations */
+	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+		return false;
+
+	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
+		return true;
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
+	nr_initialised++;
+	if ((nr_initialised > PAGES_PER_SECTION) &&
+	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+		NODE_DATA(nid)->first_deferred_pfn = pfn;
+		return true;
+	}
+	return false;
+}
+
+static void __meminit init_reserved_page(unsigned long pfn)
+{
+	pg_data_t *pgdat;
+	int nid, zid;
+
+	if (early_page_initialised(pfn))
+		return;
+
+	nid = early_pfn_to_nid(pfn);
+	pgdat = NODE_DATA(nid);
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &pgdat->node_zones[zid];
+
+		if (zone_spans_pfn(zone, pfn))
+			break;
+	}
+	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+
+static inline bool early_page_initialised(unsigned long pfn)
+{
+	return true;
+}
+
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+	return false;
+}
+
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+{
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = PFN_UP(end);
+
+	for (; start_pfn < end_pfn; start_pfn++) {
+		if (pfn_valid(start_pfn)) {
+			struct page *page = pfn_to_page(start_pfn);
+
+			init_reserved_page(start_pfn);
+
+			/* Avoid false-positive PageTail() */
+			INIT_LIST_HEAD(&page->lru);
+
+			/*
+			 * no need for atomic set_bit because the struct
+			 * page is not visible yet so nobody should
+			 * access it yet.
+			 */
+			__SetPageReserved(page);
+		}
+	}
+}
+
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+	static struct memblock_region *r;
+
+	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+			for_each_mem_region(r) {
+				if (*pfn < memblock_region_memory_end_pfn(r))
+					break;
+			}
+		}
+		if (*pfn >= memblock_region_memory_base_pfn(r) &&
+		    memblock_is_mirror(r)) {
+			*pfn = memblock_region_memory_end_pfn(r);
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone_range().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily the exact multiple of the
+ *   arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ *   nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ *   hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ *   the zone boundary; the pages in such holes will be prepended to the
+ *   zone/node above the hole except for the trailing pages in the last
+ *   section that will be appended to the zone/node below.
+ */
+static void __init init_unavailable_range(unsigned long spfn,
+					  unsigned long epfn,
+					  int zone, int node)
+{
+	unsigned long pfn;
+	u64 pgcnt = 0;
+
+	for (pfn = spfn; pfn < epfn; pfn++) {
+		if (!pfn_valid(pageblock_start_pfn(pfn))) {
+			pfn = pageblock_end_pfn(pfn) - 1;
+			continue;
+		}
+		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
+		__SetPageReserved(pfn_to_page(pfn));
+		pgcnt++;
+	}
+
+	if (pgcnt)
+		pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+			node, zone_names[zone], pgcnt);
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by memblock_free_all() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn, unsigned long zone_end_pfn,
+		enum meminit_context context,
+		struct vmem_altmap *altmap, int migratetype)
+{
+	unsigned long pfn, end_pfn = start_pfn + size;
+	struct page *page;
+
+	if (highest_memmap_pfn < end_pfn - 1)
+		highest_memmap_pfn = end_pfn - 1;
+
+#ifdef CONFIG_ZONE_DEVICE
+	/*
+	 * Honor reservation requested by the driver for this ZONE_DEVICE
+	 * memory. We limit the total number of pages to initialize to just
+	 * those that might contain the memory mapping. We will defer the
+	 * ZONE_DEVICE page initialization until after we have released
+	 * the hotplug lock.
+	 */
+	if (zone == ZONE_DEVICE) {
+		if (!altmap)
+			return;
+
+		if (start_pfn == altmap->base_pfn)
+			start_pfn += altmap->reserve;
+		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+	}
+#endif
+
+	for (pfn = start_pfn; pfn < end_pfn; ) {
+		/*
+		 * There can be holes in boot-time mem_map[]s handed to this
+		 * function.  They do not exist on hotplugged memory.
+		 */
+		if (context == MEMINIT_EARLY) {
+			if (overlap_memmap_init(zone, &pfn))
+				continue;
+			if (defer_init(nid, pfn, zone_end_pfn)) {
+				deferred_struct_pages = true;
+				break;
+			}
+		}
+
+		page = pfn_to_page(pfn);
+		__init_single_page(page, pfn, zone, nid);
+		if (context == MEMINIT_HOTPLUG)
+			__SetPageReserved(page);
+
+		/*
+		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+		 * such that unmovable allocations won't be scattered all
+		 * over the place during system boot.
+		 */
+		if (pageblock_aligned(pfn)) {
+			set_pageblock_migratetype(page, migratetype);
+			cond_resched();
+		}
+		pfn++;
+	}
+}
+
+static void __init memmap_init_zone_range(struct zone *zone,
+					  unsigned long start_pfn,
+					  unsigned long end_pfn,
+					  unsigned long *hole_pfn)
+{
+	unsigned long zone_start_pfn = zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+	if (start_pfn >= end_pfn)
+		return;
+
+	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+	if (*hole_pfn < start_pfn)
+		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+	*hole_pfn = end_pfn;
+}
+
+static void __init memmap_init(void)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long hole_pfn = 0;
+	int i, j, zone_id = 0, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		struct pglist_data *node = NODE_DATA(nid);
+
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *zone = node->node_zones + j;
+
+			if (!populated_zone(zone))
+				continue;
+
+			memmap_init_zone_range(zone, start_pfn, end_pfn,
+					       &hole_pfn);
+			zone_id = j;
+		}
+	}
+
+#ifdef CONFIG_SPARSEMEM
+	/*
+	 * Initialize the memory map for hole in the range [memory_end,
+	 * section_end].
+	 * Append the pages in this hole to the highest zone in the last
+	 * node.
+	 * The call to init_unavailable_range() is outside the ifdef to
+	 * silence the compiler warining about zone_id set but not used;
+	 * for FLATMEM it is a nop anyway
+	 */
+	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
+	if (hole_pfn < end_pfn)
+#endif
+		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
+					  unsigned long zone_idx, int nid,
+					  struct dev_pagemap *pgmap)
+{
+
+	__init_single_page(page, pfn, zone_idx, nid);
+
+	/*
+	 * Mark page reserved as it will need to wait for onlining
+	 * phase for it to be fully associated with a zone.
+	 *
+	 * We can use the non-atomic __set_bit operation for setting
+	 * the flag as we are still initializing the pages.
+	 */
+	__SetPageReserved(page);
+
+	/*
+	 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+	 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
+	 * ever freed or placed on a driver-private list.
+	 */
+	page->pgmap = pgmap;
+	page->zone_device_data = NULL;
+
+	/*
+	 * Mark the block movable so that blocks are reserved for
+	 * movable at startup. This will force kernel allocations
+	 * to reserve their blocks rather than leaking throughout
+	 * the address space during boot when many long-lived
+	 * kernel allocations are made.
+	 *
+	 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+	 * because this is done early in section_activate()
+	 */
+	if (pageblock_aligned(pfn)) {
+		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		cond_resched();
+	}
+
+	/*
+	 * ZONE_DEVICE pages are released directly to the driver page allocator
+	 * which will set the page count to 1 when allocating the page.
+	 */
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+	    pgmap->type == MEMORY_DEVICE_COHERENT)
+		set_page_count(page, 0);
+}
+
+/*
+ * With compound page geometry and when struct pages are stored in ram most
+ * tail pages are reused. Consequently, the amount of unique struct pages to
+ * initialize is a lot smaller that the total amount of struct pages being
+ * mapped. This is a paired / mild layering violation with explicit knowledge
+ * of how the sparse_vmemmap internals handle compound pages in the lack
+ * of an altmap. See vmemmap_populate_compound_pages().
+ */
+static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+					      unsigned long nr_pages)
+{
+	return is_power_of_2(sizeof(struct page)) &&
+		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
+}
+
+static void __ref memmap_init_compound(struct page *head,
+				       unsigned long head_pfn,
+				       unsigned long zone_idx, int nid,
+				       struct dev_pagemap *pgmap,
+				       unsigned long nr_pages)
+{
+	unsigned long pfn, end_pfn = head_pfn + nr_pages;
+	unsigned int order = pgmap->vmemmap_shift;
+
+	__SetPageHead(head);
+	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+		prep_compound_tail(head, pfn - head_pfn);
+		set_page_count(page, 0);
+
+		/*
+		 * The first tail page stores important compound page info.
+		 * Call prep_compound_head() after the first tail page has
+		 * been initialized, to not have the data overwritten.
+		 */
+		if (pfn == head_pfn + 1)
+			prep_compound_head(head, order);
+	}
+}
+
+void __ref memmap_init_zone_device(struct zone *zone,
+				   unsigned long start_pfn,
+				   unsigned long nr_pages,
+				   struct dev_pagemap *pgmap)
+{
+	unsigned long pfn, end_pfn = start_pfn + nr_pages;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
+	unsigned long zone_idx = zone_idx(zone);
+	unsigned long start = jiffies;
+	int nid = pgdat->node_id;
+
+	if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
+		return;
+
+	/*
+	 * The call to memmap_init should have already taken care
+	 * of the pages reserved for the memmap, so we can just jump to
+	 * the end of that region and start processing the device pages.
+	 */
+	if (altmap) {
+		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+		nr_pages = end_pfn - start_pfn;
+	}
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+		if (pfns_per_compound == 1)
+			continue;
+
+		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+				     compound_nr_pages(altmap, pfns_per_compound));
+	}
+
+	pr_info("%s initialised %lu pages in %ums\n", __func__,
+		nr_pages, jiffies_to_msecs(jiffies - start));
+}
+#endif
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonic increases memory addresses
+ */
+static void __init adjust_zone_range_for_zone_movable(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	/* Only adjust if ZONE_MOVABLE is on this node */
+	if (zone_movable_pfn[nid]) {
+		/* Size ZONE_MOVABLE */
+		if (zone_type == ZONE_MOVABLE) {
+			*zone_start_pfn = zone_movable_pfn[nid];
+			*zone_end_pfn = min(node_end_pfn,
+				arch_zone_highest_possible_pfn[movable_zone]);
+
+		/* Adjust for ZONE_MOVABLE starting within this range */
+		} else if (!mirrored_kernelcore &&
+			*zone_start_pfn < zone_movable_pfn[nid] &&
+			*zone_end_pfn > zone_movable_pfn[nid]) {
+			*zone_end_pfn = zone_movable_pfn[nid];
+
+		/* Check if this whole range is within ZONE_MOVABLE */
+		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
+			*zone_start_pfn = *zone_end_pfn;
+	}
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for.
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	unsigned long nr_absent = range_end_pfn - range_start_pfn;
+	unsigned long start_pfn, end_pfn;
+	int i;
+
+	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+		nr_absent -= end_pfn - start_pfn;
+	}
+	return nr_absent;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * Return: the number of pages frames in memory holes within a range.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+static unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn)
+{
+	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+	unsigned long zone_start_pfn, zone_end_pfn;
+	unsigned long nr_absent;
+
+	/* When hotadd a new node from cpu_up(), the node should be empty */
+	if (!node_start_pfn && !node_end_pfn)
+		return 0;
+
+	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+			node_start_pfn, node_end_pfn,
+			&zone_start_pfn, &zone_end_pfn);
+	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+	/*
+	 * ZONE_MOVABLE handling.
+	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+	 * and vice versa.
+	 */
+	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+		unsigned long start_pfn, end_pfn;
+		struct memblock_region *r;
+
+		for_each_mem_region(r) {
+			start_pfn = clamp(memblock_region_memory_base_pfn(r),
+					  zone_start_pfn, zone_end_pfn);
+			end_pfn = clamp(memblock_region_memory_end_pfn(r),
+					zone_start_pfn, zone_end_pfn);
+
+			if (zone_type == ZONE_MOVABLE &&
+			    memblock_is_mirror(r))
+				nr_absent += end_pfn - start_pfn;
+
+			if (zone_type == ZONE_NORMAL &&
+			    !memblock_is_mirror(r))
+				nr_absent += end_pfn - start_pfn;
+		}
+	}
+
+	return nr_absent;
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+static unsigned long __init zone_spanned_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+	/* When hotadd a new node from cpu_up(), the node should be empty */
+	if (!node_start_pfn && !node_end_pfn)
+		return 0;
+
+	/* Get the start and end of the zone */
+	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+				node_start_pfn, node_end_pfn,
+				zone_start_pfn, zone_end_pfn);
+
+	/* Check that this node has pages within the zone's required range */
+	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
+
+	/* Return the spanned pages */
+	return *zone_end_pfn - *zone_start_pfn;
+}
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+						unsigned long node_start_pfn,
+						unsigned long node_end_pfn)
+{
+	unsigned long realtotalpages = 0, totalpages = 0;
+	enum zone_type i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+		unsigned long zone_start_pfn, zone_end_pfn;
+		unsigned long spanned, absent;
+		unsigned long size, real_size;
+
+		spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+						     node_start_pfn,
+						     node_end_pfn,
+						     &zone_start_pfn,
+						     &zone_end_pfn);
+		absent = zone_absent_pages_in_node(pgdat->node_id, i,
+						   node_start_pfn,
+						   node_end_pfn);
+
+		size = spanned;
+		real_size = size - absent;
+
+		if (size)
+			zone->zone_start_pfn = zone_start_pfn;
+		else
+			zone->zone_start_pfn = 0;
+		zone->spanned_pages = size;
+		zone->present_pages = real_size;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+		zone->present_early_pages = real_size;
+#endif
+
+		totalpages += size;
+		realtotalpages += real_size;
+	}
+
+	pgdat->node_spanned_pages = totalpages;
+	pgdat->node_present_pages = realtotalpages;
+	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
+{
+	unsigned long pages = spanned_pages;
+
+	/*
+	 * Provide a more accurate estimation if there are holes within
+	 * the zone and SPARSEMEM is in use. If there are holes within the
+	 * zone, each populated memory region may cost us one or two extra
+	 * memmap pages due to alignment because memmap pages for each
+	 * populated regions may not be naturally aligned on page boundary.
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
+	 */
+	if (spanned_pages > present_pages + (present_pages >> 4) &&
+	    IS_ENABLED(CONFIG_SPARSEMEM))
+		pages = present_pages;
+
+	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+	spin_lock_init(&ds_queue->split_queue_lock);
+	INIT_LIST_HEAD(&ds_queue->split_queue);
+	ds_queue->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+	init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+	int i;
+
+	pgdat_resize_init(pgdat);
+	pgdat_kswapd_lock_init(pgdat);
+
+	pgdat_init_split_queue(pgdat);
+	pgdat_init_kcompactd(pgdat);
+
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
+	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+		init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
+	pgdat_page_ext_init(pgdat);
+	lruvec_init(&pgdat->__lruvec);
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+							unsigned long remaining_pages)
+{
+	atomic_long_set(&zone->managed_pages, remaining_pages);
+	zone_set_nid(zone, nid);
+	zone->name = zone_names[idx];
+	zone->zone_pgdat = NODE_DATA(nid);
+	spin_lock_init(&zone->lock);
+	zone_seqlock_init(zone);
+	zone_pcp_init(zone);
+}
+
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+	unsigned int order, t;
+	for_each_migratetype_order(order, t) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+void __meminit init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int zone_idx = zone_idx(zone) + 1;
+
+	if (zone_idx > pgdat->nr_zones)
+		pgdat->nr_zones = zone_idx;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	mminit_dprintk(MMINIT_TRACE, "memmap_init",
+			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
+			pgdat->node_id,
+			(unsigned long)zone_idx(zone),
+			zone_start_pfn, (zone_start_pfn + size));
+
+	zone_init_free_lists(zone);
+	zone->initialized = 1;
+}
+
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Start by making sure zonesize is a multiple of pageblock_order by rounding
+ * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
+ * round what is now in bits to nearest long in bits, then return it in
+ * bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+{
+	unsigned long usemapsize;
+
+	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
+	usemapsize = roundup(zonesize, pageblock_nr_pages);
+	usemapsize = usemapsize >> pageblock_order;
+	usemapsize *= NR_PAGEBLOCK_BITS;
+	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+	return usemapsize / 8;
+}
+
+static void __ref setup_usemap(struct zone *zone)
+{
+	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
+					       zone->spanned_pages);
+	zone->pageblock_flags = NULL;
+	if (usemapsize) {
+		zone->pageblock_flags =
+			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+					    zone_to_nid(zone));
+		if (!zone->pageblock_flags)
+			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+			      usemapsize, zone->name, zone_to_nid(zone));
+	}
+}
+#else
+static inline void setup_usemap(struct zone *zone) {}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
+void __init set_pageblock_order(void)
+{
+	unsigned int order = MAX_ORDER;
+
+	/* Check that pageblock_nr_pages has not already been setup */
+	if (pageblock_order)
+		return;
+
+	/* Don't let pageblocks exceed the maximum allocation granularity. */
+	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
+		order = HUGETLB_PAGE_ORDER;
+
+	/*
+	 * Assume the largest contiguous order of interest is a huge page.
+	 * This value may be variable depending on boot parameters on IA64 and
+	 * powerpc.
+	 */
+	pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
+ */
+void __init set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
+{
+	int nid = pgdat->node_id;
+	enum zone_type z;
+	int cpu;
+
+	pgdat_init_internals(pgdat);
+
+	if (pgdat->per_cpu_nodestats == &boot_nodestats)
+		pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+	/*
+	 * Reset the nr_zones, order and highest_zoneidx before reuse.
+	 * Note that kswapd will init kswapd_highest_zoneidx properly
+	 * when it starts in the near future.
+	 */
+	pgdat->nr_zones = 0;
+	pgdat->kswapd_order = 0;
+	pgdat->kswapd_highest_zoneidx = 0;
+	pgdat->node_start_pfn = 0;
+	for_each_online_cpu(cpu) {
+		struct per_cpu_nodestat *p;
+
+		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+		memset(p, 0, sizeof(*p));
+	}
+
+	for (z = 0; z < MAX_NR_ZONES; z++)
+		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+	enum zone_type j;
+	int nid = pgdat->node_id;
+
+	pgdat_init_internals(pgdat);
+	pgdat->per_cpu_nodestats = &boot_nodestats;
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, freesize, memmap_pages;
+
+		size = zone->spanned_pages;
+		freesize = zone->present_pages;
+
+		/*
+		 * Adjust freesize so that it accounts for how much memory
+		 * is used by this zone for memmap. This affects the watermark
+		 * and per-cpu initialisations
+		 */
+		memmap_pages = calc_memmap_size(size, freesize);
+		if (!is_highmem_idx(j)) {
+			if (freesize >= memmap_pages) {
+				freesize -= memmap_pages;
+				if (memmap_pages)
+					pr_debug("  %s zone: %lu pages used for memmap\n",
+						 zone_names[j], memmap_pages);
+			} else
+				pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
+					zone_names[j], memmap_pages, freesize);
+		}
+
+		/* Account for reserved pages */
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
+			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
+		}
+
+		if (!is_highmem_idx(j))
+			nr_kernel_pages += freesize;
+		/* Charge for highmem memmap if there are enough kernel pages */
+		else if (nr_kernel_pages > memmap_pages * 2)
+			nr_kernel_pages -= memmap_pages;
+		nr_all_pages += freesize;
+
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone_init_internals(zone, j, nid, freesize);
+
+		if (!size)
+			continue;
+
+		set_pageblock_order();
+		setup_usemap(zone);
+		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+	}
+}
+
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+			  phys_addr_t min_addr, int nid, bool exact_nid)
+{
+	void *ptr;
+
+	if (exact_nid)
+		ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
+						   MEMBLOCK_ALLOC_ACCESSIBLE,
+						   nid);
+	else
+		ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
+						 MEMBLOCK_ALLOC_ACCESSIBLE,
+						 nid);
+
+	if (ptr && size > 0)
+		page_init_poison(ptr, size);
+
+	return ptr;
+}
+
+#ifdef CONFIG_FLATMEM
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	unsigned long __maybe_unused start = 0;
+	unsigned long __maybe_unused offset = 0;
+
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+	offset = pgdat->node_start_pfn - start;
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size, end;
+		struct page *map;
+
+		/*
+		 * The zone's endpoints aren't required to be MAX_ORDER
+		 * aligned but the node_mem_map endpoints must be in order
+		 * for the buddy allocator to function correctly.
+		 */
+		end = pgdat_end_pfn(pgdat);
+		end = ALIGN(end, MAX_ORDER_NR_PAGES);
+		size =  (end - start) * sizeof(struct page);
+		map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+				   pgdat->node_id, false);
+		if (!map)
+			panic("Failed to allocate %ld bytes for node %d memory map\n",
+			      size, pgdat->node_id);
+		pgdat->node_mem_map = map + offset;
+	}
+	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+				__func__, pgdat->node_id, (unsigned long)pgdat,
+				(unsigned long)pgdat->node_mem_map);
+#ifndef CONFIG_NUMA
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0)) {
+		mem_map = NODE_DATA(0)->node_mem_map;
+		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+			mem_map -= offset;
+	}
+#endif
+}
+#else
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLATMEM */
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by memblock_set_node(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned long this_start_pfn, this_end_pfn;
+	int i;
+
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+		*start_pfn = min(*start_pfn, this_start_pfn);
+		*end_pfn = max(*end_pfn, this_end_pfn);
+	}
+
+	if (*start_pfn == -1UL)
+		*start_pfn = 0;
+}
+
+static void __init free_area_init_node(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = 0;
+	unsigned long end_pfn = 0;
+
+	/* pg_data_t should be reset to zero when it's allocated */
+	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = start_pfn;
+	pgdat->per_cpu_nodestats = NULL;
+
+	if (start_pfn != end_pfn) {
+		pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+			(u64)start_pfn << PAGE_SHIFT,
+			end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+	} else {
+		pr_info("Initmem setup node %d as memoryless\n", nid);
+	}
+
+	calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+
+	alloc_node_mem_map(pgdat);
+	pgdat_set_deferred_range(pgdat);
+
+	free_area_init_core(pgdat);
+	lru_gen_init_pgdat(pgdat);
+}
+
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
+{
+	enum zone_type zone_type;
+
+	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
+		struct zone *zone = &pgdat->node_zones[zone_type];
+		if (populated_zone(zone)) {
+			if (IS_ENABLED(CONFIG_HIGHMEM))
+				node_set_state(nid, N_HIGH_MEMORY);
+			if (zone_type <= ZONE_NORMAL)
+				node_set_state(nid, N_NORMAL_MEMORY);
+			break;
+		}
+	}
+}
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids.
+ */
+void __init setup_nr_node_ids(void)
+{
+	unsigned int highest;
+
+	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
+	nr_node_ids = highest + 1;
+}
+#endif
+
+static void __init free_area_init_memoryless_node(int nid)
+{
+	free_area_init_node(nid);
+}
+
+/*
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+	return false;
+}
+
+/**
+ * free_area_init - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by memblock_set_node(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init(unsigned long *max_zone_pfn)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, nid, zone;
+	bool descending;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+
+	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
+	descending = arch_has_descending_max_zone_pfns();
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (descending)
+			zone = MAX_NR_ZONES - i - 1;
+		else
+			zone = i;
+
+		if (zone == ZONE_MOVABLE)
+			continue;
+
+		end_pfn = max(max_zone_pfn[zone], start_pfn);
+		arch_zone_lowest_possible_pfn[zone] = start_pfn;
+		arch_zone_highest_possible_pfn[zone] = end_pfn;
+
+		start_pfn = end_pfn;
+	}
+
+	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
+	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+	find_zone_movable_pfns_for_nodes();
+
+	/* Print out the zone ranges */
+	pr_info("Zone ranges:\n");
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
+		pr_info("  %-8s ", zone_names[i]);
+		if (arch_zone_lowest_possible_pfn[i] ==
+				arch_zone_highest_possible_pfn[i])
+			pr_cont("empty\n");
+		else
+			pr_cont("[mem %#018Lx-%#018Lx]\n",
+				(u64)arch_zone_lowest_possible_pfn[i]
+					<< PAGE_SHIFT,
+				((u64)arch_zone_highest_possible_pfn[i]
+					<< PAGE_SHIFT) - 1);
+	}
+
+	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
+	pr_info("Movable zone start for each node\n");
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (zone_movable_pfn[i])
+			pr_info("  Node %d: %#018Lx\n", i,
+			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+	}
+
+	/*
+	 * Print out the early node map, and initialize the
+	 * subsection-map relative to active online memory ranges to
+	 * enable future "sub-section" extensions of the memory map.
+	 */
+	pr_info("Early memory node ranges\n");
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+			(u64)start_pfn << PAGE_SHIFT,
+			((u64)end_pfn << PAGE_SHIFT) - 1);
+		subsection_map_init(start_pfn, end_pfn - start_pfn);
+	}
+
+	/* Initialise every node */
+	mminit_verify_pageflags_layout();
+	setup_nr_node_ids();
+	for_each_node(nid) {
+		pg_data_t *pgdat;
+
+		if (!node_online(nid)) {
+			pr_info("Initializing node %d as memoryless\n", nid);
+
+			/* Allocator not initialized yet */
+			pgdat = arch_alloc_nodedata(nid);
+			if (!pgdat)
+				panic("Cannot allocate %zuB for node %d.\n",
+				       sizeof(*pgdat), nid);
+			arch_refresh_nodedata(nid, pgdat);
+			free_area_init_memoryless_node(nid);
+
+			/*
+			 * We do not want to confuse userspace by sysfs
+			 * files/directories for node without any memory
+			 * attached to it, so this node is not marked as
+			 * N_MEMORY and not marked online so that no sysfs
+			 * hierarchy will be created via register_one_node for
+			 * it. The pgdat will get fully initialized by
+			 * hotadd_init_pgdat() when memory is hotplugged into
+			 * this node.
+			 */
+			continue;
+		}
+
+		pgdat = NODE_DATA(nid);
+		free_area_init_node(nid);
+
+		/* Any memory on that node */
+		if (pgdat->node_present_pages)
+			node_set_state(nid, N_MEMORY);
+		check_for_memory(pgdat, nid);
+	}
+
+	memmap_init();
+}
+
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Return: the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	unsigned long start, end, mask;
+	int last_nid = NUMA_NO_NODE;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 */
+		mask = ~((1 << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(unsigned long pfn,
+				       unsigned long nr_pages)
+{
+	struct page *page;
+	unsigned long i;
+
+	if (!nr_pages)
+		return;
+
+	page = pfn_to_page(pfn);
+
+	/* Free a large naturally-aligned chunk if possible */
+	if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
+		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		__free_pages_core(page, pageblock_order);
+		return;
+	}
+
+	for (i = 0; i < nr_pages; i++, page++, pfn++) {
+		if (pageblock_aligned(pfn))
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		__free_pages_core(page, 0);
+	}
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+	if (atomic_dec_and_test(&pgdat_init_n_undone))
+		complete(&pgdat_init_all_done_comp);
+}
+
+/*
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * We check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ */
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
+{
+	if (pageblock_aligned(pfn) && !pfn_valid(pfn))
+		return false;
+	return true;
+}
+
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(unsigned long pfn,
+				       unsigned long end_pfn)
+{
+	unsigned long nr_free = 0;
+
+	for (; pfn < end_pfn; pfn++) {
+		if (!deferred_pfn_valid(pfn)) {
+			deferred_free_range(pfn - nr_free, nr_free);
+			nr_free = 0;
+		} else if (pageblock_aligned(pfn)) {
+			deferred_free_range(pfn - nr_free, nr_free);
+			nr_free = 1;
+		} else {
+			nr_free++;
+		}
+	}
+	/* Free the last block of pages to allocator */
+	deferred_free_range(pfn - nr_free, nr_free);
+}
+
+/*
+ * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
+ * by performing it only once every pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long  __init deferred_init_pages(struct zone *zone,
+						 unsigned long pfn,
+						 unsigned long end_pfn)
+{
+	int nid = zone_to_nid(zone);
+	unsigned long nr_pages = 0;
+	int zid = zone_idx(zone);
+	struct page *page = NULL;
+
+	for (; pfn < end_pfn; pfn++) {
+		if (!deferred_pfn_valid(pfn)) {
+			page = NULL;
+			continue;
+		} else if (!page || pageblock_aligned(pfn)) {
+			page = pfn_to_page(pfn);
+		} else {
+			page++;
+		}
+		__init_single_page(page, pfn, zid, nid);
+		nr_pages++;
+	}
+	return (nr_pages);
+}
+
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+				    unsigned long *spfn, unsigned long *epfn,
+				    unsigned long first_init_pfn)
+{
+	u64 j;
+
+	/*
+	 * Start out by walking through the ranges in this zone that have
+	 * already been initialized. We don't need to do anything with them
+	 * so we just need to flush them out of the system.
+	 */
+	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+		if (*epfn <= first_init_pfn)
+			continue;
+		if (*spfn < first_init_pfn)
+			*spfn = first_init_pfn;
+		*i = j;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+		       unsigned long *end_pfn)
+{
+	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+	unsigned long spfn = *start_pfn, epfn = *end_pfn;
+	unsigned long nr_pages = 0;
+	u64 j = *i;
+
+	/* First we loop through and initialize the page values */
+	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+		unsigned long t;
+
+		if (mo_pfn <= *start_pfn)
+			break;
+
+		t = min(mo_pfn, *end_pfn);
+		nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+		if (mo_pfn < *end_pfn) {
+			*start_pfn = mo_pfn;
+			break;
+		}
+	}
+
+	/* Reset values and now loop through freeing pages as needed */
+	swap(j, *i);
+
+	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+		unsigned long t;
+
+		if (mo_pfn <= spfn)
+			break;
+
+		t = min(mo_pfn, epfn);
+		deferred_free_pages(spfn, t);
+
+		if (mo_pfn <= epfn)
+			break;
+	}
+
+	return nr_pages;
+}
+
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+			   void *arg)
+{
+	unsigned long spfn, epfn;
+	struct zone *zone = arg;
+	u64 i;
+
+	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+	/*
+	 * Initialize and free pages in MAX_ORDER sized increments so that we
+	 * can avoid introducing any issues with the buddy allocator.
+	 */
+	while (spfn < end_pfn) {
+		deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		cond_resched();
+	}
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+	return 1;
+}
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+	pg_data_t *pgdat = data;
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+	unsigned long spfn = 0, epfn = 0;
+	unsigned long first_init_pfn, flags;
+	unsigned long start = jiffies;
+	struct zone *zone;
+	int zid, max_threads;
+	u64 i;
+
+	/* Bind memory initialisation thread to a local node if possible */
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
+
+	pgdat_resize_lock(pgdat, &flags);
+	first_init_pfn = pgdat->first_deferred_pfn;
+	if (first_init_pfn == ULONG_MAX) {
+		pgdat_resize_unlock(pgdat, &flags);
+		pgdat_init_report_one_done();
+		return 0;
+	}
+
+	/* Sanity check boundaries */
+	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+	pgdat->first_deferred_pfn = ULONG_MAX;
+
+	/*
+	 * Once we unlock here, the zone cannot be grown anymore, thus if an
+	 * interrupt thread must allocate this early in boot, zone must be
+	 * pre-grown prior to start of deferred page initialization.
+	 */
+	pgdat_resize_unlock(pgdat, &flags);
+
+	/* Only the highest zone is deferred so find it */
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		zone = pgdat->node_zones + zid;
+		if (first_init_pfn < zone_end_pfn(zone))
+			break;
+	}
+
+	/* If the zone is empty somebody else may have cleared out the zone */
+	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						 first_init_pfn))
+		goto zone_empty;
+
+	max_threads = deferred_page_init_max_threads(cpumask);
+
+	while (spfn < epfn) {
+		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+		struct padata_mt_job job = {
+			.thread_fn   = deferred_init_memmap_chunk,
+			.fn_arg      = zone,
+			.start       = spfn,
+			.size        = epfn_align - spfn,
+			.align       = PAGES_PER_SECTION,
+			.min_chunk   = PAGES_PER_SECTION,
+			.max_threads = max_threads,
+		};
+
+		padata_do_multithreaded(&job);
+		deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						    epfn_align);
+	}
+zone_empty:
+	/* Sanity check that the next zone really is unpopulated */
+	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+	pr_info("node %d deferred pages initialised in %ums\n",
+		pgdat->node_id, jiffies_to_msecs(jiffies - start));
+
+	pgdat_init_report_one_done();
+	return 0;
+}
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+	pg_data_t *pgdat = zone->zone_pgdat;
+	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+	unsigned long spfn, epfn, flags;
+	unsigned long nr_pages = 0;
+	u64 i;
+
+	/* Only the last zone may have deferred pages */
+	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+		return false;
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	/*
+	 * If someone grew this zone while we were waiting for spinlock, return
+	 * true, as there might be enough pages already.
+	 */
+	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	/* If the zone is empty somebody else may have cleared out the zone */
+	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						 first_deferred_pfn)) {
+		pgdat->first_deferred_pfn = ULONG_MAX;
+		pgdat_resize_unlock(pgdat, &flags);
+		/* Retry only once. */
+		return first_deferred_pfn != ULONG_MAX;
+	}
+
+	/*
+	 * Initialize and free pages in MAX_ORDER sized increments so
+	 * that we can avoid introducing any issues with the buddy
+	 * allocator.
+	 */
+	while (spfn < epfn) {
+		/* update our first deferred PFN for this section */
+		first_deferred_pfn = spfn;
+
+		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		touch_nmi_watchdog();
+
+		/* We should only stop along section boundaries */
+		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+			continue;
+
+		/* If our quota has been met we can stop here */
+		if (nr_pages >= nr_pages_needed)
+			break;
+	}
+
+	pgdat->first_deferred_pfn = spfn;
+	pgdat_resize_unlock(pgdat, &flags);
+
+	return nr_pages > 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+#ifdef CONFIG_CMA
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+	unsigned i = pageblock_nr_pages;
+	struct page *p = page;
+
+	do {
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
+	} while (++p, --i);
+
+	set_pageblock_migratetype(page, MIGRATE_CMA);
+	set_page_refcounted(page);
+	__free_pages(page, pageblock_order);
+
+	adjust_managed_page_count(page, pageblock_nr_pages);
+	page_zone(page)->cma_pages += pageblock_nr_pages;
+}
+#endif
+
+void __init page_alloc_init_late(void)
+{
+	struct zone *zone;
+	int nid;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+	/* There will be num_node_state(N_MEMORY) threads */
+	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+	for_each_node_state(nid, N_MEMORY) {
+		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+	}
+
+	/* Block until all are initialised */
+	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/*
+	 * We initialized the rest of the deferred pages.  Permanently disable
+	 * on-demand struct page initialization.
+	 */
+	static_branch_disable(&deferred_pages);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
+#endif
+
+	buffer_init();
+
+	/* Discard memblock private memory */
+	memblock_discard();
+
+	for_each_node_state(nid, N_MEMORY)
+		shuffle_free_memory(NODE_DATA(nid));
+
+	for_each_populated_zone(zone)
+		set_zone_contiguous(zone);
+}
+
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE	(64ul << 30)
+#define ADAPT_SCALE_SHIFT	2
+#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long low_limit,
+				     unsigned long high_limit)
+{
+	unsigned long long max = high_limit;
+	unsigned long log2qty, size;
+	void *table;
+	gfp_t gfp_flags;
+	bool virt;
+	bool huge;
+
+	/* allow the kernel cmdline to have a say */
+	if (!numentries) {
+		/* round applicable memory size up to nearest megabyte */
+		numentries = nr_kernel_pages;
+		numentries -= arch_reserved_kernel_pages();
+
+		/* It isn't necessary when PAGE_SIZE >= 1MB */
+		if (PAGE_SIZE < SZ_1M)
+			numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
+
+#if __BITS_PER_LONG > 32
+		if (!high_limit) {
+			unsigned long adapt;
+
+			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+			     adapt <<= ADAPT_SCALE_SHIFT)
+				scale++;
+		}
+#endif
+
+		/* limit to 1 bucket per 2^scale bytes of low memory */
+		if (scale > PAGE_SHIFT)
+			numentries >>= (scale - PAGE_SHIFT);
+		else
+			numentries <<= (PAGE_SHIFT - scale);
+
+		/* Make sure we've got at least a 0-order allocation.. */
+		if (unlikely(flags & HASH_SMALL)) {
+			/* Makes no sense without HASH_EARLY */
+			WARN_ON(!(flags & HASH_EARLY));
+			if (!(numentries >> *_hash_shift)) {
+				numentries = 1UL << *_hash_shift;
+				BUG_ON(!numentries);
+			}
+		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+			numentries = PAGE_SIZE / bucketsize;
+	}
+	numentries = roundup_pow_of_two(numentries);
+
+	/* limit allocation size to 1/16 total memory by default */
+	if (max == 0) {
+		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+		do_div(max, bucketsize);
+	}
+	max = min(max, 0x80000000ULL);
+
+	if (numentries < low_limit)
+		numentries = low_limit;
+	if (numentries > max)
+		numentries = max;
+
+	log2qty = ilog2(numentries);
+
+	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
+	do {
+		virt = false;
+		size = bucketsize << log2qty;
+		if (flags & HASH_EARLY) {
+			if (flags & HASH_ZERO)
+				table = memblock_alloc(size, SMP_CACHE_BYTES);
+			else
+				table = memblock_alloc_raw(size,
+							   SMP_CACHE_BYTES);
+		} else if (get_order(size) > MAX_ORDER || hashdist) {
+			table = vmalloc_huge(size, gfp_flags);
+			virt = true;
+			if (table)
+				huge = is_vm_area_hugepages(table);
+		} else {
+			/*
+			 * If bucketsize is not a power-of-two, we may free
+			 * some pages at the end of hash table which
+			 * alloc_pages_exact() automatically does
+			 */
+			table = alloc_pages_exact(size, gfp_flags);
+			kmemleak_alloc(table, size, 1, gfp_flags);
+		}
+	} while (!table && size > PAGE_SIZE && --log2qty);
+
+	if (!table)
+		panic("Failed to allocate %s hash table\n", tablename);
+
+	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
+
+	if (_hash_shift)
+		*_hash_shift = log2qty;
+	if (_hash_mask)
+		*_hash_mask = (1 << log2qty) - 1;
+
+	return table;
+}
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+	dma_reserve = new_dma_reserve;
+}
+
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
+							unsigned int order)
+{
+	if (!early_page_initialised(pfn))
+		return;
+	if (!kmsan_memblock_free_pages(page, order)) {
+		/* KMSAN will take care of these pages. */
+		return;
+	}
+	__free_pages_core(page, order);
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 33925488040f..8adc70b6034d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,9 +72,7 @@
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
-#include <linux/padata.h>
 #include <linux/khugepaged.h>
-#include <linux/buffer_head.h>
 #include <linux/delayacct.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -355,7 +353,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
 	[ZONE_MOVABLE] = 0,
 };
 
-static char * const zone_names[MAX_NR_ZONES] = {
+char * const zone_names[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
 	 "DMA",
 #endif
@@ -401,17 +399,6 @@ int user_min_free_kbytes = -1;
 int watermark_boost_factor __read_mostly = 15000;
 int watermark_scale_factor = 10;
 
-static unsigned long nr_kernel_pages __initdata;
-static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
-
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long required_kernelcore __initdata;
-static unsigned long required_kernelcore_percent __initdata;
-static unsigned long required_movablecore __initdata;
-static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
 bool mirrored_kernelcore __initdata_memblock;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -427,86 +414,36 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-bool deferred_struct_pages __meminitdata;
-
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 /*
  * During boot we initialize deferred pages on-demand, as needed, but once
  * page_alloc_init_late() has finished, the deferred pages are all initialized,
  * and we can permanently disable that path.
  */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+DEFINE_STATIC_KEY_TRUE(deferred_pages);
 
 static inline bool deferred_pages_enabled(void)
 {
 	return static_branch_unlikely(&deferred_pages);
 }
 
-/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
-{
-	int nid = early_pfn_to_nid(pfn);
-
-	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
-		return false;
-
-	return true;
-}
-
 /*
- * Returns true when the remaining initialisation should be deferred until
- * later in the boot cycle when it can be parallelised.
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
  */
-static bool __meminit
-defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
 {
-	static unsigned long prev_end_pfn, nr_initialised;
-
-	if (early_page_ext_enabled())
-		return false;
-	/*
-	 * prev_end_pfn static that contains the end of previous zone
-	 * No need to protect because called very early in boot before smp_init.
-	 */
-	if (prev_end_pfn != end_pfn) {
-		prev_end_pfn = end_pfn;
-		nr_initialised = 0;
-	}
-
-	/* Always populate low zones for address-constrained allocations */
-	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
-		return false;
-
-	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
-		return true;
-	/*
-	 * We start only with one section of pages, more pages are added as
-	 * needed until the rest of deferred pages are initialized.
-	 */
-	nr_initialised++;
-	if ((nr_initialised > PAGES_PER_SECTION) &&
-	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
-		NODE_DATA(nid)->first_deferred_pfn = pfn;
-		return true;
-	}
-	return false;
+       return deferred_grow_zone(zone, order);
 }
 #else
 static inline bool deferred_pages_enabled(void)
 {
 	return false;
 }
-
-static inline bool early_page_initialised(unsigned long pfn)
-{
-	return true;
-}
-
-static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
-{
-	return false;
-}
-#endif
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /* Return a pointer to the bitmap storing bits affecting a block of pages */
 static inline unsigned long *get_pageblock_bitmap(const struct page *page,
@@ -772,26 +709,6 @@ void free_compound_page(struct page *page)
 	free_the_page(page, compound_order(page));
 }
 
-static void prep_compound_head(struct page *page, unsigned int order)
-{
-	struct folio *folio = (struct folio *)page;
-
-	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
-	set_compound_order(page, order);
-	atomic_set(&folio->_entire_mapcount, -1);
-	atomic_set(&folio->_nr_pages_mapped, 0);
-	atomic_set(&folio->_pincount, 0);
-}
-
-static void prep_compound_tail(struct page *head, int tail_idx)
-{
-	struct page *p = head + tail_idx;
-
-	p->mapping = TAIL_MAPPING;
-	set_compound_head(p, head);
-	set_page_private(p, 0);
-}
-
 void prep_compound_page(struct page *page, unsigned int order)
 {
 	int i;
@@ -1608,80 +1525,6 @@ static void free_one_page(struct zone *zone,
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-				unsigned long zone, int nid)
-{
-	mm_zero_struct_page(page);
-	set_page_links(page, zone, nid, pfn);
-	init_page_count(page);
-	page_mapcount_reset(page);
-	page_cpupid_reset_last(page);
-	page_kasan_tag_reset(page);
-
-	INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
-	if (!is_highmem_idx(zone))
-		set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-}
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __meminit init_reserved_page(unsigned long pfn)
-{
-	pg_data_t *pgdat;
-	int nid, zid;
-
-	if (early_page_initialised(pfn))
-		return;
-
-	nid = early_pfn_to_nid(pfn);
-	pgdat = NODE_DATA(nid);
-
-	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-		struct zone *zone = &pgdat->node_zones[zid];
-
-		if (zone_spans_pfn(zone, pfn))
-			break;
-	}
-	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-}
-#else
-static inline void init_reserved_page(unsigned long pfn)
-{
-}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-/*
- * Initialised pages do not have PageReserved set. This function is
- * called for each range allocated by the bootmem allocator and
- * marks the pages PageReserved. The remaining valid pages are later
- * sent to the buddy page allocator.
- */
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
-{
-	unsigned long start_pfn = PFN_DOWN(start);
-	unsigned long end_pfn = PFN_UP(end);
-
-	for (; start_pfn < end_pfn; start_pfn++) {
-		if (pfn_valid(start_pfn)) {
-			struct page *page = pfn_to_page(start_pfn);
-
-			init_reserved_page(start_pfn);
-
-			/* Avoid false-positive PageTail() */
-			INIT_LIST_HEAD(&page->lru);
-
-			/*
-			 * no need for atomic set_bit because the struct
-			 * page is not visible yet so nobody should
-			 * access it yet.
-			 */
-			__SetPageReserved(page);
-		}
-	}
-}
-
 static void __free_pages_ok(struct page *page, unsigned int order,
 			    fpi_t fpi_flags)
 {
@@ -1740,70 +1583,6 @@ void __free_pages_core(struct page *page, unsigned int order)
 	__free_pages_ok(page, order, FPI_TO_TAIL);
 }
 
-#ifdef CONFIG_NUMA
-
-/*
- * During memory init memblocks map pfns to nids. The search is expensive and
- * this caches recent lookups. The implementation of __early_pfn_to_nid
- * treats start/end as pfns.
- */
-struct mminit_pfnnid_cache {
-	unsigned long last_start;
-	unsigned long last_end;
-	int last_nid;
-};
-
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-static int __meminit __early_pfn_to_nid(unsigned long pfn,
-					struct mminit_pfnnid_cache *state)
-{
-	unsigned long start_pfn, end_pfn;
-	int nid;
-
-	if (state->last_start <= pfn && pfn < state->last_end)
-		return state->last_nid;
-
-	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
-	if (nid != NUMA_NO_NODE) {
-		state->last_start = start_pfn;
-		state->last_end = end_pfn;
-		state->last_nid = nid;
-	}
-
-	return nid;
-}
-
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
-	static DEFINE_SPINLOCK(early_pfn_lock);
-	int nid;
-
-	spin_lock(&early_pfn_lock);
-	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid < 0)
-		nid = first_online_node;
-	spin_unlock(&early_pfn_lock);
-
-	return nid;
-}
-#endif /* CONFIG_NUMA */
-
-void __init memblock_free_pages(struct page *page, unsigned long pfn,
-							unsigned int order)
-{
-	if (!early_page_initialised(pfn))
-		return;
-	if (!kmsan_memblock_free_pages(page, order)) {
-		/* KMSAN will take care of these pages. */
-		return;
-	}
-	__free_pages_core(page, order);
-}
-
 /*
  * Check that the whole (or subset of) a pageblock given by the interval of
  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
@@ -1874,570 +1653,131 @@ void clear_zone_contiguous(struct zone *zone)
 	zone->contiguous = false;
 }
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
-				       unsigned long nr_pages)
-{
-	struct page *page;
-	unsigned long i;
-
-	if (!nr_pages)
-		return;
-
-	page = pfn_to_page(pfn);
-
-	/* Free a large naturally-aligned chunk if possible */
-	if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
-		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_core(page, pageblock_order);
-		return;
-	}
-
-	for (i = 0; i < nr_pages; i++, page++, pfn++) {
-		if (pageblock_aligned(pfn))
-			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_core(page, 0);
-	}
-}
-
-/* Completion tracking for deferred_init_memmap() threads */
-static atomic_t pgdat_init_n_undone __initdata;
-static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
-
-static inline void __init pgdat_init_report_one_done(void)
-{
-	if (atomic_dec_and_test(&pgdat_init_n_undone))
-		complete(&pgdat_init_all_done_comp);
-}
-
 /*
- * Returns true if page needs to be initialized or freed to buddy allocator.
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing. Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
  *
- * We check if a current large page is valid by only checking the validity
- * of the head pfn.
+ * -- nyc
  */
-static inline bool __init deferred_pfn_valid(unsigned long pfn)
+static inline void expand(struct zone *zone, struct page *page,
+	int low, int high, int migratetype)
 {
-	if (pageblock_aligned(pfn) && !pfn_valid(pfn))
-		return false;
-	return true;
-}
-
-/*
- * Free pages to buddy allocator. Try to free aligned pages in
- * pageblock_nr_pages sizes.
- */
-static void __init deferred_free_pages(unsigned long pfn,
-				       unsigned long end_pfn)
-{
-	unsigned long nr_free = 0;
-
-	for (; pfn < end_pfn; pfn++) {
-		if (!deferred_pfn_valid(pfn)) {
-			deferred_free_range(pfn - nr_free, nr_free);
-			nr_free = 0;
-		} else if (pageblock_aligned(pfn)) {
-			deferred_free_range(pfn - nr_free, nr_free);
-			nr_free = 1;
-		} else {
-			nr_free++;
-		}
-	}
-	/* Free the last block of pages to allocator */
-	deferred_free_range(pfn - nr_free, nr_free);
-}
+	unsigned long size = 1 << high;
 
-/*
- * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
- * by performing it only once every pageblock_nr_pages.
- * Return number of pages initialized.
- */
-static unsigned long  __init deferred_init_pages(struct zone *zone,
-						 unsigned long pfn,
-						 unsigned long end_pfn)
-{
-	int nid = zone_to_nid(zone);
-	unsigned long nr_pages = 0;
-	int zid = zone_idx(zone);
-	struct page *page = NULL;
+	while (high > low) {
+		high--;
+		size >>= 1;
+		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
 
-	for (; pfn < end_pfn; pfn++) {
-		if (!deferred_pfn_valid(pfn)) {
-			page = NULL;
+		/*
+		 * Mark as guard pages (or page), that will allow to
+		 * merge back to allocator when buddy will be freed.
+		 * Corresponding page table entries will not be touched,
+		 * pages will stay not present in virtual address space
+		 */
+		if (set_page_guard(zone, &page[size], high, migratetype))
 			continue;
-		} else if (!page || pageblock_aligned(pfn)) {
-			page = pfn_to_page(pfn);
-		} else {
-			page++;
-		}
-		__init_single_page(page, pfn, zid, nid);
-		nr_pages++;
+
+		add_to_free_list(&page[size], zone, high, migratetype);
+		set_buddy_order(&page[size], high);
 	}
-	return (nr_pages);
 }
 
-/*
- * This function is meant to pre-load the iterator for the zone init.
- * Specifically it walks through the ranges until we are caught up to the
- * first_init_pfn value and exits there. If we never encounter the value we
- * return false indicating there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
-				    unsigned long *spfn, unsigned long *epfn,
-				    unsigned long first_init_pfn)
+static void check_new_page_bad(struct page *page)
 {
-	u64 j;
-
-	/*
-	 * Start out by walking through the ranges in this zone that have
-	 * already been initialized. We don't need to do anything with them
-	 * so we just need to flush them out of the system.
-	 */
-	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
-		if (*epfn <= first_init_pfn)
-			continue;
-		if (*spfn < first_init_pfn)
-			*spfn = first_init_pfn;
-		*i = j;
-		return true;
+	if (unlikely(page->flags & __PG_HWPOISON)) {
+		/* Don't complain about hwpoisoned pages */
+		page_mapcount_reset(page); /* remove PageBuddy */
+		return;
 	}
 
-	return false;
+	bad_page(page,
+		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
 }
 
 /*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
- *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * This page is about to be returned from the page allocator
  */
-static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
-		       unsigned long *end_pfn)
+static int check_new_page(struct page *page)
 {
-	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
-	unsigned long spfn = *start_pfn, epfn = *end_pfn;
-	unsigned long nr_pages = 0;
-	u64 j = *i;
-
-	/* First we loop through and initialize the page values */
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
-		unsigned long t;
+	if (likely(page_expected_state(page,
+				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+		return 0;
 
-		if (mo_pfn <= *start_pfn)
-			break;
+	check_new_page_bad(page);
+	return 1;
+}
 
-		t = min(mo_pfn, *end_pfn);
-		nr_pages += deferred_init_pages(zone, *start_pfn, t);
+static inline bool check_new_pages(struct page *page, unsigned int order)
+{
+	if (is_check_pages_enabled()) {
+		for (int i = 0; i < (1 << order); i++) {
+			struct page *p = page + i;
 
-		if (mo_pfn < *end_pfn) {
-			*start_pfn = mo_pfn;
-			break;
+			if (unlikely(check_new_page(p)))
+				return true;
 		}
 	}
 
-	/* Reset values and now loop through freeing pages as needed */
-	swap(j, *i);
-
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
-		unsigned long t;
-
-		if (mo_pfn <= spfn)
-			break;
-
-		t = min(mo_pfn, epfn);
-		deferred_free_pages(spfn, t);
-
-		if (mo_pfn <= epfn)
-			break;
-	}
-
-	return nr_pages;
+	return false;
 }
 
-static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
-			   void *arg)
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
 {
-	unsigned long spfn, epfn;
-	struct zone *zone = arg;
-	u64 i;
+	/* Don't skip if a software KASAN mode is enabled. */
+	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+		return false;
 
-	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+	/* Skip, if hardware tag-based KASAN is not enabled. */
+	if (!kasan_hw_tags_enabled())
+		return true;
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so that we
-	 * can avoid introducing any issues with the buddy allocator.
+	 * With hardware tag-based KASAN enabled, skip if this has been
+	 * requested via __GFP_SKIP_KASAN.
 	 */
-	while (spfn < end_pfn) {
-		deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		cond_resched();
-	}
+	return flags & __GFP_SKIP_KASAN;
 }
 
-/* An arch may override for more concurrency. */
-__weak int __init
-deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+static inline bool should_skip_init(gfp_t flags)
 {
-	return 1;
+	/* Don't skip, if hardware tag-based KASAN is not enabled. */
+	if (!kasan_hw_tags_enabled())
+		return false;
+
+	/* For hardware tag-based KASAN, skip if requested. */
+	return (flags & __GFP_SKIP_ZERO);
 }
 
-/* Initialise remaining memory on a node */
-static int __init deferred_init_memmap(void *data)
+inline void post_alloc_hook(struct page *page, unsigned int order,
+				gfp_t gfp_flags)
 {
-	pg_data_t *pgdat = data;
-	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-	unsigned long spfn = 0, epfn = 0;
-	unsigned long first_init_pfn, flags;
-	unsigned long start = jiffies;
-	struct zone *zone;
-	int zid, max_threads;
-	u64 i;
-
-	/* Bind memory initialisation thread to a local node if possible */
-	if (!cpumask_empty(cpumask))
-		set_cpus_allowed_ptr(current, cpumask);
-
-	pgdat_resize_lock(pgdat, &flags);
-	first_init_pfn = pgdat->first_deferred_pfn;
-	if (first_init_pfn == ULONG_MAX) {
-		pgdat_resize_unlock(pgdat, &flags);
-		pgdat_init_report_one_done();
-		return 0;
-	}
+	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+			!should_skip_init(gfp_flags);
+	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+	int i;
+
+	set_page_private(page, 0);
+	set_page_refcounted(page);
 
-	/* Sanity check boundaries */
-	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
-	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
-	pgdat->first_deferred_pfn = ULONG_MAX;
+	arch_alloc_page(page, order);
+	debug_pagealloc_map_pages(page, 1 << order);
 
 	/*
-	 * Once we unlock here, the zone cannot be grown anymore, thus if an
-	 * interrupt thread must allocate this early in boot, zone must be
-	 * pre-grown prior to start of deferred page initialization.
+	 * Page unpoisoning must happen before memory initialization.
+	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+	 * allocations and the page unpoisoning code will complain.
 	 */
-	pgdat_resize_unlock(pgdat, &flags);
-
-	/* Only the highest zone is deferred so find it */
-	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-		zone = pgdat->node_zones + zid;
-		if (first_init_pfn < zone_end_pfn(zone))
-			break;
-	}
-
-	/* If the zone is empty somebody else may have cleared out the zone */
-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
-						 first_init_pfn))
-		goto zone_empty;
-
-	max_threads = deferred_page_init_max_threads(cpumask);
-
-	while (spfn < epfn) {
-		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
-		struct padata_mt_job job = {
-			.thread_fn   = deferred_init_memmap_chunk,
-			.fn_arg      = zone,
-			.start       = spfn,
-			.size        = epfn_align - spfn,
-			.align       = PAGES_PER_SECTION,
-			.min_chunk   = PAGES_PER_SECTION,
-			.max_threads = max_threads,
-		};
-
-		padata_do_multithreaded(&job);
-		deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
-						    epfn_align);
-	}
-zone_empty:
-	/* Sanity check that the next zone really is unpopulated */
-	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
-
-	pr_info("node %d deferred pages initialised in %ums\n",
-		pgdat->node_id, jiffies_to_msecs(jiffies - start));
-
-	pgdat_init_report_one_done();
-	return 0;
-}
-
-/*
- * If this zone has deferred pages, try to grow it by initializing enough
- * deferred pages to satisfy the allocation specified by order, rounded up to
- * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
- * of SECTION_SIZE bytes by initializing struct pages in increments of
- * PAGES_PER_SECTION * sizeof(struct page) bytes.
- *
- * Return true when zone was grown, otherwise return false. We return true even
- * when we grow less than requested, to let the caller decide if there are
- * enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
- */
-static noinline bool __init
-deferred_grow_zone(struct zone *zone, unsigned int order)
-{
-	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
-	pg_data_t *pgdat = zone->zone_pgdat;
-	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
-	unsigned long spfn, epfn, flags;
-	unsigned long nr_pages = 0;
-	u64 i;
-
-	/* Only the last zone may have deferred pages */
-	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
-		return false;
-
-	pgdat_resize_lock(pgdat, &flags);
-
-	/*
-	 * If someone grew this zone while we were waiting for spinlock, return
-	 * true, as there might be enough pages already.
-	 */
-	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
-		pgdat_resize_unlock(pgdat, &flags);
-		return true;
-	}
-
-	/* If the zone is empty somebody else may have cleared out the zone */
-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
-						 first_deferred_pfn)) {
-		pgdat->first_deferred_pfn = ULONG_MAX;
-		pgdat_resize_unlock(pgdat, &flags);
-		/* Retry only once. */
-		return first_deferred_pfn != ULONG_MAX;
-	}
-
-	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so
-	 * that we can avoid introducing any issues with the buddy
-	 * allocator.
-	 */
-	while (spfn < epfn) {
-		/* update our first deferred PFN for this section */
-		first_deferred_pfn = spfn;
-
-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		touch_nmi_watchdog();
-
-		/* We should only stop along section boundaries */
-		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
-			continue;
-
-		/* If our quota has been met we can stop here */
-		if (nr_pages >= nr_pages_needed)
-			break;
-	}
-
-	pgdat->first_deferred_pfn = spfn;
-	pgdat_resize_unlock(pgdat, &flags);
-
-	return nr_pages > 0;
-}
-
-/*
- * deferred_grow_zone() is __init, but it is called from
- * get_page_from_freelist() during early boot until deferred_pages permanently
- * disables this call. This is why we have refdata wrapper to avoid warning,
- * and to ensure that the function body gets unloaded.
- */
-static bool __ref
-_deferred_grow_zone(struct zone *zone, unsigned int order)
-{
-	return deferred_grow_zone(zone, order);
-}
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-void __init page_alloc_init_late(void)
-{
-	struct zone *zone;
-	int nid;
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-	/* There will be num_node_state(N_MEMORY) threads */
-	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
-	for_each_node_state(nid, N_MEMORY) {
-		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
-	}
-
-	/* Block until all are initialised */
-	wait_for_completion(&pgdat_init_all_done_comp);
-
-	/*
-	 * We initialized the rest of the deferred pages.  Permanently disable
-	 * on-demand struct page initialization.
-	 */
-	static_branch_disable(&deferred_pages);
-
-	/* Reinit limits that are based on free pages after the kernel is up */
-	files_maxfiles_init();
-#endif
-
-	buffer_init();
-
-	/* Discard memblock private memory */
-	memblock_discard();
-
-	for_each_node_state(nid, N_MEMORY)
-		shuffle_free_memory(NODE_DATA(nid));
-
-	for_each_populated_zone(zone)
-		set_zone_contiguous(zone);
-}
-
-#ifdef CONFIG_CMA
-/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
-void __init init_cma_reserved_pageblock(struct page *page)
-{
-	unsigned i = pageblock_nr_pages;
-	struct page *p = page;
-
-	do {
-		__ClearPageReserved(p);
-		set_page_count(p, 0);
-	} while (++p, --i);
-
-	set_pageblock_migratetype(page, MIGRATE_CMA);
-	set_page_refcounted(page);
-	__free_pages(page, pageblock_order);
-
-	adjust_managed_page_count(page, pageblock_nr_pages);
-	page_zone(page)->cma_pages += pageblock_nr_pages;
-}
-#endif
-
-/*
- * The order of subdivision here is critical for the IO subsystem.
- * Please do not alter this order without good reasons and regression
- * testing. Specifically, as large blocks of memory are subdivided,
- * the order in which smaller blocks are delivered depends on the order
- * they're subdivided in this function. This is the primary factor
- * influencing the order in which pages are delivered to the IO
- * subsystem according to empirical testing, and this is also justified
- * by considering the behavior of a buddy system containing a single
- * large block of memory acted on by a series of small allocations.
- * This behavior is a critical factor in sglist merging's success.
- *
- * -- nyc
- */
-static inline void expand(struct zone *zone, struct page *page,
-	int low, int high, int migratetype)
-{
-	unsigned long size = 1 << high;
-
-	while (high > low) {
-		high--;
-		size >>= 1;
-		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
-
-		/*
-		 * Mark as guard pages (or page), that will allow to
-		 * merge back to allocator when buddy will be freed.
-		 * Corresponding page table entries will not be touched,
-		 * pages will stay not present in virtual address space
-		 */
-		if (set_page_guard(zone, &page[size], high, migratetype))
-			continue;
-
-		add_to_free_list(&page[size], zone, high, migratetype);
-		set_buddy_order(&page[size], high);
-	}
-}
-
-static void check_new_page_bad(struct page *page)
-{
-	if (unlikely(page->flags & __PG_HWPOISON)) {
-		/* Don't complain about hwpoisoned pages */
-		page_mapcount_reset(page); /* remove PageBuddy */
-		return;
-	}
-
-	bad_page(page,
-		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
-}
-
-/*
- * This page is about to be returned from the page allocator
- */
-static int check_new_page(struct page *page)
-{
-	if (likely(page_expected_state(page,
-				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
-		return 0;
-
-	check_new_page_bad(page);
-	return 1;
-}
-
-static inline bool check_new_pages(struct page *page, unsigned int order)
-{
-	if (is_check_pages_enabled()) {
-		for (int i = 0; i < (1 << order); i++) {
-			struct page *p = page + i;
-
-			if (unlikely(check_new_page(p)))
-				return true;
-		}
-	}
-
-	return false;
-}
-
-static inline bool should_skip_kasan_unpoison(gfp_t flags)
-{
-	/* Don't skip if a software KASAN mode is enabled. */
-	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
-	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		return false;
-
-	/* Skip, if hardware tag-based KASAN is not enabled. */
-	if (!kasan_hw_tags_enabled())
-		return true;
-
-	/*
-	 * With hardware tag-based KASAN enabled, skip if this has been
-	 * requested via __GFP_SKIP_KASAN.
-	 */
-	return flags & __GFP_SKIP_KASAN;
-}
-
-static inline bool should_skip_init(gfp_t flags)
-{
-	/* Don't skip, if hardware tag-based KASAN is not enabled. */
-	if (!kasan_hw_tags_enabled())
-		return false;
-
-	/* For hardware tag-based KASAN, skip if requested. */
-	return (flags & __GFP_SKIP_ZERO);
-}
-
-inline void post_alloc_hook(struct page *page, unsigned int order,
-				gfp_t gfp_flags)
-{
-	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
-			!should_skip_init(gfp_flags);
-	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
-	int i;
-
-	set_page_private(page, 0);
-	set_page_refcounted(page);
-
-	arch_alloc_page(page, order);
-	debug_pagealloc_map_pages(page, 1 << order);
-
-	/*
-	 * Page unpoisoning must happen before memory initialization.
-	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
-	 * allocations and the page unpoisoning code will complain.
-	 */
-	kernel_unpoison_pages(page, 1 << order);
+	kernel_unpoison_pages(page, 1 << order);
 
 	/*
 	 * As memory initialization might be integrated into KASAN,
@@ -6547,7 +5887,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 #define BOOT_PAGESET_BATCH	1
 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
 static void __build_all_zonelists(void *data)
 {
@@ -6661,395 +6000,35 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
 #endif
 }
 
-/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
-static bool __meminit
-overlap_memmap_init(unsigned long zone, unsigned long *pfn)
-{
-	static struct memblock_region *r;
-
-	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
-		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
-			for_each_mem_region(r) {
-				if (*pfn < memblock_region_memory_end_pfn(r))
-					break;
-			}
-		}
-		if (*pfn >= memblock_region_memory_base_pfn(r) &&
-		    memblock_is_mirror(r)) {
-			*pfn = memblock_region_memory_end_pfn(r);
-			return true;
-		}
-	}
-	return false;
-}
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by memblock_free_all() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- *
- * All aligned pageblocks are initialized to the specified migratetype
- * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
- * zone stats (e.g., nr_isolate_pageblock) are touched.
- */
-void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn, unsigned long zone_end_pfn,
-		enum meminit_context context,
-		struct vmem_altmap *altmap, int migratetype)
+static int zone_batchsize(struct zone *zone)
 {
-	unsigned long pfn, end_pfn = start_pfn + size;
-	struct page *page;
+#ifdef CONFIG_MMU
+	int batch;
 
-	if (highest_memmap_pfn < end_pfn - 1)
-		highest_memmap_pfn = end_pfn - 1;
+	/*
+	 * The number of pages to batch allocate is either ~0.1%
+	 * of the zone or 1MB, whichever is smaller. The batch
+	 * size is striking a balance between allocation latency
+	 * and zone lock contention.
+	 */
+	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
 
-#ifdef CONFIG_ZONE_DEVICE
 	/*
-	 * Honor reservation requested by the driver for this ZONE_DEVICE
-	 * memory. We limit the total number of pages to initialize to just
-	 * those that might contain the memory mapping. We will defer the
-	 * ZONE_DEVICE page initialization until after we have released
-	 * the hotplug lock.
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
 	 */
-	if (zone == ZONE_DEVICE) {
-		if (!altmap)
-			return;
+	batch = rounddown_pow_of_two(batch + batch/2) - 1;
 
-		if (start_pfn == altmap->base_pfn)
-			start_pfn += altmap->reserve;
-		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
-	}
-#endif
-
-	for (pfn = start_pfn; pfn < end_pfn; ) {
-		/*
-		 * There can be holes in boot-time mem_map[]s handed to this
-		 * function.  They do not exist on hotplugged memory.
-		 */
-		if (context == MEMINIT_EARLY) {
-			if (overlap_memmap_init(zone, &pfn))
-				continue;
-			if (defer_init(nid, pfn, zone_end_pfn)) {
-				deferred_struct_pages = true;
-				break;
-			}
-		}
-
-		page = pfn_to_page(pfn);
-		__init_single_page(page, pfn, zone, nid);
-		if (context == MEMINIT_HOTPLUG)
-			__SetPageReserved(page);
-
-		/*
-		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
-		 * such that unmovable allocations won't be scattered all
-		 * over the place during system boot.
-		 */
-		if (pageblock_aligned(pfn)) {
-			set_pageblock_migratetype(page, migratetype);
-			cond_resched();
-		}
-		pfn++;
-	}
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
-					  unsigned long zone_idx, int nid,
-					  struct dev_pagemap *pgmap)
-{
-
-	__init_single_page(page, pfn, zone_idx, nid);
-
-	/*
-	 * Mark page reserved as it will need to wait for onlining
-	 * phase for it to be fully associated with a zone.
-	 *
-	 * We can use the non-atomic __set_bit operation for setting
-	 * the flag as we are still initializing the pages.
-	 */
-	__SetPageReserved(page);
-
-	/*
-	 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
-	 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
-	 * ever freed or placed on a driver-private list.
-	 */
-	page->pgmap = pgmap;
-	page->zone_device_data = NULL;
-
-	/*
-	 * Mark the block movable so that blocks are reserved for
-	 * movable at startup. This will force kernel allocations
-	 * to reserve their blocks rather than leaking throughout
-	 * the address space during boot when many long-lived
-	 * kernel allocations are made.
-	 *
-	 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
-	 * because this is done early in section_activate()
-	 */
-	if (pageblock_aligned(pfn)) {
-		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		cond_resched();
-	}
-
-	/*
-	 * ZONE_DEVICE pages are released directly to the driver page allocator
-	 * which will set the page count to 1 when allocating the page.
-	 */
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
-	    pgmap->type == MEMORY_DEVICE_COHERENT)
-		set_page_count(page, 0);
-}
-
-/*
- * With compound page geometry and when struct pages are stored in ram most
- * tail pages are reused. Consequently, the amount of unique struct pages to
- * initialize is a lot smaller that the total amount of struct pages being
- * mapped. This is a paired / mild layering violation with explicit knowledge
- * of how the sparse_vmemmap internals handle compound pages in the lack
- * of an altmap. See vmemmap_populate_compound_pages().
- */
-static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
-					      unsigned long nr_pages)
-{
-	return is_power_of_2(sizeof(struct page)) &&
-		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
-}
-
-static void __ref memmap_init_compound(struct page *head,
-				       unsigned long head_pfn,
-				       unsigned long zone_idx, int nid,
-				       struct dev_pagemap *pgmap,
-				       unsigned long nr_pages)
-{
-	unsigned long pfn, end_pfn = head_pfn + nr_pages;
-	unsigned int order = pgmap->vmemmap_shift;
-
-	__SetPageHead(head);
-	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
-		struct page *page = pfn_to_page(pfn);
-
-		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
-		prep_compound_tail(head, pfn - head_pfn);
-		set_page_count(page, 0);
-
-		/*
-		 * The first tail page stores important compound page info.
-		 * Call prep_compound_head() after the first tail page has
-		 * been initialized, to not have the data overwritten.
-		 */
-		if (pfn == head_pfn + 1)
-			prep_compound_head(head, order);
-	}
-}
-
-void __ref memmap_init_zone_device(struct zone *zone,
-				   unsigned long start_pfn,
-				   unsigned long nr_pages,
-				   struct dev_pagemap *pgmap)
-{
-	unsigned long pfn, end_pfn = start_pfn + nr_pages;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
-	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
-	unsigned long zone_idx = zone_idx(zone);
-	unsigned long start = jiffies;
-	int nid = pgdat->node_id;
-
-	if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
-		return;
-
-	/*
-	 * The call to memmap_init should have already taken care
-	 * of the pages reserved for the memmap, so we can just jump to
-	 * the end of that region and start processing the device pages.
-	 */
-	if (altmap) {
-		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
-		nr_pages = end_pfn - start_pfn;
-	}
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
-		struct page *page = pfn_to_page(pfn);
-
-		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
-
-		if (pfns_per_compound == 1)
-			continue;
-
-		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
-				     compound_nr_pages(altmap, pfns_per_compound));
-	}
-
-	pr_info("%s initialised %lu pages in %ums\n", __func__,
-		nr_pages, jiffies_to_msecs(jiffies - start));
-}
-
-#endif
-static void __meminit zone_init_free_lists(struct zone *zone)
-{
-	unsigned int order, t;
-	for_each_migratetype_order(order, t) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
-		zone->free_area[order].nr_free = 0;
-	}
-}
-
-/*
- * Only struct pages that correspond to ranges defined by memblock.memory
- * are zeroed and initialized by going through __init_single_page() during
- * memmap_init_zone_range().
- *
- * But, there could be struct pages that correspond to holes in
- * memblock.memory. This can happen because of the following reasons:
- * - physical memory bank size is not necessarily the exact multiple of the
- *   arbitrary section size
- * - early reserved memory may not be listed in memblock.memory
- * - memory layouts defined with memmap= kernel parameter may not align
- *   nicely with memmap sections
- *
- * Explicitly initialize those struct pages so that:
- * - PG_Reserved is set
- * - zone and node links point to zone and node that span the page if the
- *   hole is in the middle of a zone
- * - zone and node links point to adjacent zone/node if the hole falls on
- *   the zone boundary; the pages in such holes will be prepended to the
- *   zone/node above the hole except for the trailing pages in the last
- *   section that will be appended to the zone/node below.
- */
-static void __init init_unavailable_range(unsigned long spfn,
-					  unsigned long epfn,
-					  int zone, int node)
-{
-	unsigned long pfn;
-	u64 pgcnt = 0;
-
-	for (pfn = spfn; pfn < epfn; pfn++) {
-		if (!pfn_valid(pageblock_start_pfn(pfn))) {
-			pfn = pageblock_end_pfn(pfn) - 1;
-			continue;
-		}
-		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
-		__SetPageReserved(pfn_to_page(pfn));
-		pgcnt++;
-	}
-
-	if (pgcnt)
-		pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
-			node, zone_names[zone], pgcnt);
-}
-
-static void __init memmap_init_zone_range(struct zone *zone,
-					  unsigned long start_pfn,
-					  unsigned long end_pfn,
-					  unsigned long *hole_pfn)
-{
-	unsigned long zone_start_pfn = zone->zone_start_pfn;
-	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
-
-	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
-	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
-
-	if (start_pfn >= end_pfn)
-		return;
-
-	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
-			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
-
-	if (*hole_pfn < start_pfn)
-		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
-
-	*hole_pfn = end_pfn;
-}
-
-static void __init memmap_init(void)
-{
-	unsigned long start_pfn, end_pfn;
-	unsigned long hole_pfn = 0;
-	int i, j, zone_id = 0, nid;
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
-		struct pglist_data *node = NODE_DATA(nid);
-
-		for (j = 0; j < MAX_NR_ZONES; j++) {
-			struct zone *zone = node->node_zones + j;
-
-			if (!populated_zone(zone))
-				continue;
-
-			memmap_init_zone_range(zone, start_pfn, end_pfn,
-					       &hole_pfn);
-			zone_id = j;
-		}
-	}
-
-#ifdef CONFIG_SPARSEMEM
-	/*
-	 * Initialize the memory map for hole in the range [memory_end,
-	 * section_end].
-	 * Append the pages in this hole to the highest zone in the last
-	 * node.
-	 * The call to init_unavailable_range() is outside the ifdef to
-	 * silence the compiler warining about zone_id set but not used;
-	 * for FLATMEM it is a nop anyway
-	 */
-	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
-	if (hole_pfn < end_pfn)
-#endif
-		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
-}
-
-void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
-			  phys_addr_t min_addr, int nid, bool exact_nid)
-{
-	void *ptr;
-
-	if (exact_nid)
-		ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
-						   MEMBLOCK_ALLOC_ACCESSIBLE,
-						   nid);
-	else
-		ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
-						 MEMBLOCK_ALLOC_ACCESSIBLE,
-						 nid);
-
-	if (ptr && size > 0)
-		page_init_poison(ptr, size);
-
-	return ptr;
-}
-
-static int zone_batchsize(struct zone *zone)
-{
-#ifdef CONFIG_MMU
-	int batch;
-
-	/*
-	 * The number of pages to batch allocate is either ~0.1%
-	 * of the zone or 1MB, whichever is smaller. The batch
-	 * size is striking a balance between allocation latency
-	 * and zone lock contention.
-	 */
-	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
-	batch /= 4;		/* We effectively *= 4 below */
-	if (batch < 1)
-		batch = 1;
-
-	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
-	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
-	 */
-	batch = rounddown_pow_of_two(batch + batch/2) - 1;
-
-	return batch;
+	return batch;
 
 #else
 	/* The deferral and batching of frees should be suppressed under NOMMU
@@ -7071,1352 +6050,210 @@ static int zone_batchsize(struct zone *zone)
 
 static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 {
-#ifdef CONFIG_MMU
-	int high;
-	int nr_split_cpus;
-	unsigned long total_pages;
-
-	if (!percpu_pagelist_high_fraction) {
-		/*
-		 * By default, the high value of the pcp is based on the zone
-		 * low watermark so that if they are full then background
-		 * reclaim will not be started prematurely.
-		 */
-		total_pages = low_wmark_pages(zone);
-	} else {
-		/*
-		 * If percpu_pagelist_high_fraction is configured, the high
-		 * value is based on a fraction of the managed pages in the
-		 * zone.
-		 */
-		total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
-	}
-
-	/*
-	 * Split the high value across all online CPUs local to the zone. Note
-	 * that early in boot that CPUs may not be online yet and that during
-	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
-	 * onlined. For memory nodes that have no CPUs, split pcp->high across
-	 * all online CPUs to mitigate the risk that reclaim is triggered
-	 * prematurely due to pages stored on pcp lists.
-	 */
-	nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
-	if (!nr_split_cpus)
-		nr_split_cpus = num_online_cpus();
-	high = total_pages / nr_split_cpus;
-
-	/*
-	 * Ensure high is at least batch*4. The multiple is based on the
-	 * historical relationship between high and batch.
-	 */
-	high = max(high, batch << 2);
-
-	return high;
-#else
-	return 0;
-#endif
-}
-
-/*
- * pcp->high and pcp->batch values are related and generally batch is lower
- * than high. They are also related to pcp->count such that count is lower
- * than high, and as soon as it reaches high, the pcplist is flushed.
- *
- * However, guaranteeing these relations at all times would require e.g. write
- * barriers here but also careful usage of read barriers at the read side, and
- * thus be prone to error and bad for performance. Thus the update only prevents
- * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
- *
- * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
- * outside of boot time (or some other assurance that no concurrent updaters
- * exist).
- */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
-		unsigned long batch)
-{
-	WRITE_ONCE(pcp->batch, batch);
-	WRITE_ONCE(pcp->high, high);
-}
-
-static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
-{
-	int pindex;
-
-	memset(pcp, 0, sizeof(*pcp));
-	memset(pzstats, 0, sizeof(*pzstats));
-
-	spin_lock_init(&pcp->lock);
-	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
-		INIT_LIST_HEAD(&pcp->lists[pindex]);
-
-	/*
-	 * Set batch and high values safe for a boot pageset. A true percpu
-	 * pageset's initialization will update them subsequently. Here we don't
-	 * need to be as careful as pageset_update() as nobody can access the
-	 * pageset yet.
-	 */
-	pcp->high = BOOT_PAGESET_HIGH;
-	pcp->batch = BOOT_PAGESET_BATCH;
-	pcp->free_factor = 0;
-}
-
-static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
-		unsigned long batch)
-{
-	struct per_cpu_pages *pcp;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		pageset_update(pcp, high, batch);
-	}
-}
-
-/*
- * Calculate and set new high and batch values for all per-cpu pagesets of a
- * zone based on the zone's size.
- */
-static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
-{
-	int new_high, new_batch;
-
-	new_batch = max(1, zone_batchsize(zone));
-	new_high = zone_highsize(zone, new_batch, cpu_online);
-
-	if (zone->pageset_high == new_high &&
-	    zone->pageset_batch == new_batch)
-		return;
-
-	zone->pageset_high = new_high;
-	zone->pageset_batch = new_batch;
-
-	__zone_set_pageset_high_and_batch(zone, new_high, new_batch);
-}
-
-void __meminit setup_zone_pageset(struct zone *zone)
-{
-	int cpu;
-
-	/* Size may be 0 on !SMP && !NUMA */
-	if (sizeof(struct per_cpu_zonestat) > 0)
-		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
-
-	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pages *pcp;
-		struct per_cpu_zonestat *pzstats;
-
-		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
-		per_cpu_pages_init(pcp, pzstats);
-	}
-
-	zone_set_pageset_high_and_batch(zone, 0);
-}
-
-/*
- * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalculated.
- */
-static void zone_pcp_update(struct zone *zone, int cpu_online)
-{
-	mutex_lock(&pcp_batch_high_lock);
-	zone_set_pageset_high_and_batch(zone, cpu_online);
-	mutex_unlock(&pcp_batch_high_lock);
-}
-
-/*
- * Allocate per cpu pagesets and initialize them.
- * Before this call only boot pagesets were available.
- */
-void __init setup_per_cpu_pageset(void)
-{
-	struct pglist_data *pgdat;
-	struct zone *zone;
-	int __maybe_unused cpu;
-
-	for_each_populated_zone(zone)
-		setup_zone_pageset(zone);
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Unpopulated zones continue using the boot pagesets.
-	 * The numa stats for these pagesets need to be reset.
-	 * Otherwise, they will end up skewing the stats of
-	 * the nodes these zones are associated with.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
-		memset(pzstats->vm_numa_event, 0,
-		       sizeof(pzstats->vm_numa_event));
-	}
-#endif
-
-	for_each_online_pgdat(pgdat)
-		pgdat->per_cpu_nodestats =
-			alloc_percpu(struct per_cpu_nodestat);
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
-	/*
-	 * per cpu subsystem is not up at this point. The following code
-	 * relies on the ability of the linker to provide the
-	 * offset of a (static) per cpu variable into the per cpu area.
-	 */
-	zone->per_cpu_pageset = &boot_pageset;
-	zone->per_cpu_zonestats = &boot_zonestats;
-	zone->pageset_high = BOOT_PAGESET_HIGH;
-	zone->pageset_batch = BOOT_PAGESET_BATCH;
-
-	if (populated_zone(zone))
-		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
-			 zone->present_pages, zone_batchsize(zone));
-}
-
-void __meminit init_currently_empty_zone(struct zone *zone,
-					unsigned long zone_start_pfn,
-					unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	int zone_idx = zone_idx(zone) + 1;
-
-	if (zone_idx > pgdat->nr_zones)
-		pgdat->nr_zones = zone_idx;
-
-	zone->zone_start_pfn = zone_start_pfn;
-
-	mminit_dprintk(MMINIT_TRACE, "memmap_init",
-			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
-			pgdat->node_id,
-			(unsigned long)zone_idx(zone),
-			zone_start_pfn, (zone_start_pfn + size));
-
-	zone_init_free_lists(zone);
-	zone->initialized = 1;
-}
-
-/**
- * get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
- *
- * It returns the start and end page frame of a node based on information
- * provided by memblock_set_node(). If called for a node
- * with no available memory, a warning is printed and the start and end
- * PFNs will be 0.
- */
-void __init get_pfn_range_for_nid(unsigned int nid,
-			unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	unsigned long this_start_pfn, this_end_pfn;
-	int i;
-
-	*start_pfn = -1UL;
-	*end_pfn = 0;
-
-	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
-		*start_pfn = min(*start_pfn, this_start_pfn);
-		*end_pfn = max(*end_pfn, this_end_pfn);
-	}
-
-	if (*start_pfn == -1UL)
-		*start_pfn = 0;
-}
-
-/*
- * This finds a zone that can be used for ZONE_MOVABLE pages. The
- * assumption is made that zones within a node are ordered in monotonic
- * increasing memory addresses so that the "highest" populated zone is used
- */
-static void __init find_usable_zone_for_movable(void)
-{
-	int zone_index;
-	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
-		if (zone_index == ZONE_MOVABLE)
-			continue;
-
-		if (arch_zone_highest_possible_pfn[zone_index] >
-				arch_zone_lowest_possible_pfn[zone_index])
-			break;
-	}
-
-	VM_BUG_ON(zone_index == -1);
-	movable_zone = zone_index;
-}
-
-/*
- * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independent of architecture. Unlike the other zones,
- * the starting point for ZONE_MOVABLE is not fixed. It may be different
- * in each node depending on the size of each node and how evenly kernelcore
- * is distributed. This helper function adjusts the zone ranges
- * provided by the architecture for a given node by using the end of the
- * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
- * zones within a node are in order of monotonic increases memory addresses
- */
-static void __init adjust_zone_range_for_zone_movable(int nid,
-					unsigned long zone_type,
-					unsigned long node_start_pfn,
-					unsigned long node_end_pfn,
-					unsigned long *zone_start_pfn,
-					unsigned long *zone_end_pfn)
-{
-	/* Only adjust if ZONE_MOVABLE is on this node */
-	if (zone_movable_pfn[nid]) {
-		/* Size ZONE_MOVABLE */
-		if (zone_type == ZONE_MOVABLE) {
-			*zone_start_pfn = zone_movable_pfn[nid];
-			*zone_end_pfn = min(node_end_pfn,
-				arch_zone_highest_possible_pfn[movable_zone]);
-
-		/* Adjust for ZONE_MOVABLE starting within this range */
-		} else if (!mirrored_kernelcore &&
-			*zone_start_pfn < zone_movable_pfn[nid] &&
-			*zone_end_pfn > zone_movable_pfn[nid]) {
-			*zone_end_pfn = zone_movable_pfn[nid];
-
-		/* Check if this whole range is within ZONE_MOVABLE */
-		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
-			*zone_start_pfn = *zone_end_pfn;
-	}
-}
-
-/*
- * Return the number of pages a zone spans in a node, including holes
- * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
- */
-static unsigned long __init zone_spanned_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long node_start_pfn,
-					unsigned long node_end_pfn,
-					unsigned long *zone_start_pfn,
-					unsigned long *zone_end_pfn)
-{
-	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
-	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
-	/* When hotadd a new node from cpu_up(), the node should be empty */
-	if (!node_start_pfn && !node_end_pfn)
-		return 0;
-
-	/* Get the start and end of the zone */
-	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
-	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-	adjust_zone_range_for_zone_movable(nid, zone_type,
-				node_start_pfn, node_end_pfn,
-				zone_start_pfn, zone_end_pfn);
-
-	/* Check that this node has pages within the zone's required range */
-	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
-		return 0;
-
-	/* Move the zone boundaries inside the node if necessary */
-	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
-	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
-
-	/* Return the spanned pages */
-	return *zone_end_pfn - *zone_start_pfn;
-}
-
-/*
- * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for.
- */
-unsigned long __init __absent_pages_in_range(int nid,
-				unsigned long range_start_pfn,
-				unsigned long range_end_pfn)
-{
-	unsigned long nr_absent = range_end_pfn - range_start_pfn;
-	unsigned long start_pfn, end_pfn;
-	int i;
-
-	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
-		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
-		nr_absent -= end_pfn - start_pfn;
-	}
-	return nr_absent;
-}
-
-/**
- * absent_pages_in_range - Return number of page frames in holes within a range
- * @start_pfn: The start PFN to start searching for holes
- * @end_pfn: The end PFN to stop searching for holes
- *
- * Return: the number of pages frames in memory holes within a range.
- */
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
-							unsigned long end_pfn)
-{
-	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __init zone_absent_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long node_start_pfn,
-					unsigned long node_end_pfn)
-{
-	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
-	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
-	unsigned long zone_start_pfn, zone_end_pfn;
-	unsigned long nr_absent;
-
-	/* When hotadd a new node from cpu_up(), the node should be empty */
-	if (!node_start_pfn && !node_end_pfn)
-		return 0;
-
-	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
-	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
-	adjust_zone_range_for_zone_movable(nid, zone_type,
-			node_start_pfn, node_end_pfn,
-			&zone_start_pfn, &zone_end_pfn);
-	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
-
-	/*
-	 * ZONE_MOVABLE handling.
-	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
-	 * and vice versa.
-	 */
-	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
-		unsigned long start_pfn, end_pfn;
-		struct memblock_region *r;
-
-		for_each_mem_region(r) {
-			start_pfn = clamp(memblock_region_memory_base_pfn(r),
-					  zone_start_pfn, zone_end_pfn);
-			end_pfn = clamp(memblock_region_memory_end_pfn(r),
-					zone_start_pfn, zone_end_pfn);
-
-			if (zone_type == ZONE_MOVABLE &&
-			    memblock_is_mirror(r))
-				nr_absent += end_pfn - start_pfn;
-
-			if (zone_type == ZONE_NORMAL &&
-			    !memblock_is_mirror(r))
-				nr_absent += end_pfn - start_pfn;
-		}
-	}
-
-	return nr_absent;
-}
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
-						unsigned long node_start_pfn,
-						unsigned long node_end_pfn)
-{
-	unsigned long realtotalpages = 0, totalpages = 0;
-	enum zone_type i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-		unsigned long zone_start_pfn, zone_end_pfn;
-		unsigned long spanned, absent;
-		unsigned long size, real_size;
-
-		spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
-						     node_start_pfn,
-						     node_end_pfn,
-						     &zone_start_pfn,
-						     &zone_end_pfn);
-		absent = zone_absent_pages_in_node(pgdat->node_id, i,
-						   node_start_pfn,
-						   node_end_pfn);
-
-		size = spanned;
-		real_size = size - absent;
-
-		if (size)
-			zone->zone_start_pfn = zone_start_pfn;
-		else
-			zone->zone_start_pfn = 0;
-		zone->spanned_pages = size;
-		zone->present_pages = real_size;
-#if defined(CONFIG_MEMORY_HOTPLUG)
-		zone->present_early_pages = real_size;
-#endif
-
-		totalpages += size;
-		realtotalpages += real_size;
-	}
-
-	pgdat->node_spanned_pages = totalpages;
-	pgdat->node_present_pages = realtotalpages;
-	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-#ifndef CONFIG_SPARSEMEM
-/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
- */
-static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
-{
-	unsigned long usemapsize;
-
-	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
-	usemapsize = roundup(zonesize, pageblock_nr_pages);
-	usemapsize = usemapsize >> pageblock_order;
-	usemapsize *= NR_PAGEBLOCK_BITS;
-	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
-
-	return usemapsize / 8;
-}
-
-static void __ref setup_usemap(struct zone *zone)
-{
-	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
-					       zone->spanned_pages);
-	zone->pageblock_flags = NULL;
-	if (usemapsize) {
-		zone->pageblock_flags =
-			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
-					    zone_to_nid(zone));
-		if (!zone->pageblock_flags)
-			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
-			      usemapsize, zone->name, zone_to_nid(zone));
-	}
-}
-#else
-static inline void setup_usemap(struct zone *zone) {}
-#endif /* CONFIG_SPARSEMEM */
-
-#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-
-/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __init set_pageblock_order(void)
-{
-	unsigned int order = MAX_ORDER;
-
-	/* Check that pageblock_nr_pages has not already been setup */
-	if (pageblock_order)
-		return;
-
-	/* Don't let pageblocks exceed the maximum allocation granularity. */
-	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
-		order = HUGETLB_PAGE_ORDER;
-
-	/*
-	 * Assume the largest contiguous order of interest is a huge page.
-	 * This value may be variable depending on boot parameters on IA64 and
-	 * powerpc.
-	 */
-	pageblock_order = order;
-}
-#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-/*
- * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * is unused as pageblock_order is set at compile-time. See
- * include/linux/pageblock-flags.h for the values of pageblock_order based on
- * the kernel config
- */
-void __init set_pageblock_order(void)
-{
-}
-
-#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
-						unsigned long present_pages)
-{
-	unsigned long pages = spanned_pages;
-
-	/*
-	 * Provide a more accurate estimation if there are holes within
-	 * the zone and SPARSEMEM is in use. If there are holes within the
-	 * zone, each populated memory region may cost us one or two extra
-	 * memmap pages due to alignment because memmap pages for each
-	 * populated regions may not be naturally aligned on page boundary.
-	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
-	 */
-	if (spanned_pages > present_pages + (present_pages >> 4) &&
-	    IS_ENABLED(CONFIG_SPARSEMEM))
-		pages = present_pages;
-
-	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
-	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
-	spin_lock_init(&ds_queue->split_queue_lock);
-	INIT_LIST_HEAD(&ds_queue->split_queue);
-	ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
-#ifdef CONFIG_COMPACTION
-static void pgdat_init_kcompactd(struct pglist_data *pgdat)
-{
-	init_waitqueue_head(&pgdat->kcompactd_wait);
-}
-#else
-static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
-#endif
-
-static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
-{
-	int i;
-
-	pgdat_resize_init(pgdat);
-	pgdat_kswapd_lock_init(pgdat);
-
-	pgdat_init_split_queue(pgdat);
-	pgdat_init_kcompactd(pgdat);
-
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	init_waitqueue_head(&pgdat->pfmemalloc_wait);
-
-	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
-		init_waitqueue_head(&pgdat->reclaim_wait[i]);
-
-	pgdat_page_ext_init(pgdat);
-	lruvec_init(&pgdat->__lruvec);
-}
-
-static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
-							unsigned long remaining_pages)
-{
-	atomic_long_set(&zone->managed_pages, remaining_pages);
-	zone_set_nid(zone, nid);
-	zone->name = zone_names[idx];
-	zone->zone_pgdat = NODE_DATA(nid);
-	spin_lock_init(&zone->lock);
-	zone_seqlock_init(zone);
-	zone_pcp_init(zone);
-}
-
-/*
- * Set up the zone data structures
- * - init pgdat internals
- * - init all zones belonging to this node
- *
- * NOTE: this function is only called during memory hotplug
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
-{
-	int nid = pgdat->node_id;
-	enum zone_type z;
-	int cpu;
-
-	pgdat_init_internals(pgdat);
-
-	if (pgdat->per_cpu_nodestats == &boot_nodestats)
-		pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
-
-	/*
-	 * Reset the nr_zones, order and highest_zoneidx before reuse.
-	 * Note that kswapd will init kswapd_highest_zoneidx properly
-	 * when it starts in the near future.
-	 */
-	pgdat->nr_zones = 0;
-	pgdat->kswapd_order = 0;
-	pgdat->kswapd_highest_zoneidx = 0;
-	pgdat->node_start_pfn = 0;
-	for_each_online_cpu(cpu) {
-		struct per_cpu_nodestat *p;
-
-		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
-		memset(p, 0, sizeof(*p));
-	}
-
-	for (z = 0; z < MAX_NR_ZONES; z++)
-		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
-}
-#endif
-
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
-static void __init free_area_init_core(struct pglist_data *pgdat)
-{
-	enum zone_type j;
-	int nid = pgdat->node_id;
-
-	pgdat_init_internals(pgdat);
-	pgdat->per_cpu_nodestats = &boot_nodestats;
-
-	for (j = 0; j < MAX_NR_ZONES; j++) {
-		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, freesize, memmap_pages;
-
-		size = zone->spanned_pages;
-		freesize = zone->present_pages;
-
-		/*
-		 * Adjust freesize so that it accounts for how much memory
-		 * is used by this zone for memmap. This affects the watermark
-		 * and per-cpu initialisations
-		 */
-		memmap_pages = calc_memmap_size(size, freesize);
-		if (!is_highmem_idx(j)) {
-			if (freesize >= memmap_pages) {
-				freesize -= memmap_pages;
-				if (memmap_pages)
-					pr_debug("  %s zone: %lu pages used for memmap\n",
-						 zone_names[j], memmap_pages);
-			} else
-				pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
-					zone_names[j], memmap_pages, freesize);
-		}
-
-		/* Account for reserved pages */
-		if (j == 0 && freesize > dma_reserve) {
-			freesize -= dma_reserve;
-			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
-		}
-
-		if (!is_highmem_idx(j))
-			nr_kernel_pages += freesize;
-		/* Charge for highmem memmap if there are enough kernel pages */
-		else if (nr_kernel_pages > memmap_pages * 2)
-			nr_kernel_pages -= memmap_pages;
-		nr_all_pages += freesize;
-
-		/*
-		 * Set an approximate value for lowmem here, it will be adjusted
-		 * when the bootmem allocator frees pages into the buddy system.
-		 * And all highmem pages will be managed by the buddy system.
-		 */
-		zone_init_internals(zone, j, nid, freesize);
-
-		if (!size)
-			continue;
-
-		set_pageblock_order();
-		setup_usemap(zone);
-		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
-	}
-}
-
-#ifdef CONFIG_FLATMEM
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
-	unsigned long __maybe_unused start = 0;
-	unsigned long __maybe_unused offset = 0;
-
-	/* Skip empty nodes */
-	if (!pgdat->node_spanned_pages)
-		return;
-
-	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
-	offset = pgdat->node_start_pfn - start;
-	/* ia64 gets its own node_mem_map, before this, without bootmem */
-	if (!pgdat->node_mem_map) {
-		unsigned long size, end;
-		struct page *map;
-
-		/*
-		 * The zone's endpoints aren't required to be MAX_ORDER
-		 * aligned but the node_mem_map endpoints must be in order
-		 * for the buddy allocator to function correctly.
-		 */
-		end = pgdat_end_pfn(pgdat);
-		end = ALIGN(end, MAX_ORDER_NR_PAGES);
-		size =  (end - start) * sizeof(struct page);
-		map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
-				   pgdat->node_id, false);
-		if (!map)
-			panic("Failed to allocate %ld bytes for node %d memory map\n",
-			      size, pgdat->node_id);
-		pgdat->node_mem_map = map + offset;
-	}
-	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
-				__func__, pgdat->node_id, (unsigned long)pgdat,
-				(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
-	/*
-	 * With no DISCONTIG, the global mem_map is just set as node 0's
-	 */
-	if (pgdat == NODE_DATA(0)) {
-		mem_map = NODE_DATA(0)->node_mem_map;
-		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-			mem_map -= offset;
-	}
-#endif
-}
-#else
-static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
-#endif /* CONFIG_FLATMEM */
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
-{
-	pgdat->first_deferred_pfn = ULONG_MAX;
-}
-#else
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-#endif
-
-static void __init free_area_init_node(int nid)
-{
-	pg_data_t *pgdat = NODE_DATA(nid);
-	unsigned long start_pfn = 0;
-	unsigned long end_pfn = 0;
-
-	/* pg_data_t should be reset to zero when it's allocated */
-	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
-
-	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
-	pgdat->node_id = nid;
-	pgdat->node_start_pfn = start_pfn;
-	pgdat->per_cpu_nodestats = NULL;
-
-	if (start_pfn != end_pfn) {
-		pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
-			(u64)start_pfn << PAGE_SHIFT,
-			end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
-	} else {
-		pr_info("Initmem setup node %d as memoryless\n", nid);
-	}
-
-	calculate_node_totalpages(pgdat, start_pfn, end_pfn);
-
-	alloc_node_mem_map(pgdat);
-	pgdat_set_deferred_range(pgdat);
-
-	free_area_init_core(pgdat);
-	lru_gen_init_pgdat(pgdat);
-}
-
-static void __init free_area_init_memoryless_node(int nid)
-{
-	free_area_init_node(nid);
-}
-
-#if MAX_NUMNODES > 1
-/*
- * Figure out the number of possible node ids.
- */
-void __init setup_nr_node_ids(void)
-{
-	unsigned int highest;
-
-	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
-	nr_node_ids = highest + 1;
-}
-#endif
-
-/**
- * node_map_pfn_alignment - determine the maximum internode alignment
- *
- * This function should be called after node map is populated and sorted.
- * It calculates the maximum power of two alignment which can distinguish
- * all the nodes.
- *
- * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
- * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
- * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
- * shifted, 1GiB is enough and this function will indicate so.
- *
- * This is used to test whether pfn -> nid mapping of the chosen memory
- * model has fine enough granularity to avoid incorrect mapping for the
- * populated node map.
- *
- * Return: the determined alignment in pfn's.  0 if there is no alignment
- * requirement (single node).
- */
-unsigned long __init node_map_pfn_alignment(void)
-{
-	unsigned long accl_mask = 0, last_end = 0;
-	unsigned long start, end, mask;
-	int last_nid = NUMA_NO_NODE;
-	int i, nid;
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
-		if (!start || last_nid < 0 || last_nid == nid) {
-			last_nid = nid;
-			last_end = end;
-			continue;
-		}
-
-		/*
-		 * Start with a mask granular enough to pin-point to the
-		 * start pfn and tick off bits one-by-one until it becomes
-		 * too coarse to separate the current node from the last.
-		 */
-		mask = ~((1 << __ffs(start)) - 1);
-		while (mask && last_end <= (start & (mask << 1)))
-			mask <<= 1;
-
-		/* accumulate all internode masks */
-		accl_mask |= mask;
-	}
-
-	/* convert mask to number of pages */
-	return ~accl_mask + 1;
-}
-
-/*
- * early_calculate_totalpages()
- * Sum pages in active regions for movable zone.
- * Populate N_MEMORY for calculating usable_nodes.
- */
-static unsigned long __init early_calculate_totalpages(void)
-{
-	unsigned long totalpages = 0;
-	unsigned long start_pfn, end_pfn;
-	int i, nid;
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
-		unsigned long pages = end_pfn - start_pfn;
-
-		totalpages += pages;
-		if (pages)
-			node_set_state(nid, N_MEMORY);
-	}
-	return totalpages;
-}
-
-/*
- * Find the PFN the Movable zone begins in each node. Kernel memory
- * is spread evenly between nodes as long as the nodes have enough
- * memory. When they don't, some nodes will have more kernelcore than
- * others
- */
-static void __init find_zone_movable_pfns_for_nodes(void)
-{
-	int i, nid;
-	unsigned long usable_startpfn;
-	unsigned long kernelcore_node, kernelcore_remaining;
-	/* save the state before borrow the nodemask */
-	nodemask_t saved_node_state = node_states[N_MEMORY];
-	unsigned long totalpages = early_calculate_totalpages();
-	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
-	struct memblock_region *r;
-
-	/* Need to find movable_zone earlier when movable_node is specified. */
-	find_usable_zone_for_movable();
-
-	/*
-	 * If movable_node is specified, ignore kernelcore and movablecore
-	 * options.
-	 */
-	if (movable_node_is_enabled()) {
-		for_each_mem_region(r) {
-			if (!memblock_is_hotpluggable(r))
-				continue;
-
-			nid = memblock_get_region_node(r);
-
-			usable_startpfn = PFN_DOWN(r->base);
-			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
-				min(usable_startpfn, zone_movable_pfn[nid]) :
-				usable_startpfn;
-		}
-
-		goto out2;
-	}
-
-	/*
-	 * If kernelcore=mirror is specified, ignore movablecore option
-	 */
-	if (mirrored_kernelcore) {
-		bool mem_below_4gb_not_mirrored = false;
-
-		for_each_mem_region(r) {
-			if (memblock_is_mirror(r))
-				continue;
-
-			nid = memblock_get_region_node(r);
-
-			usable_startpfn = memblock_region_memory_base_pfn(r);
-
-			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
-				mem_below_4gb_not_mirrored = true;
-				continue;
-			}
-
-			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
-				min(usable_startpfn, zone_movable_pfn[nid]) :
-				usable_startpfn;
-		}
-
-		if (mem_below_4gb_not_mirrored)
-			pr_warn("This configuration results in unmirrored kernel memory.\n");
-
-		goto out2;
-	}
-
-	/*
-	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
-	 * amount of necessary memory.
-	 */
-	if (required_kernelcore_percent)
-		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
-				       10000UL;
-	if (required_movablecore_percent)
-		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
-					10000UL;
-
-	/*
-	 * If movablecore= was specified, calculate what size of
-	 * kernelcore that corresponds so that memory usable for
-	 * any allocation type is evenly spread. If both kernelcore
-	 * and movablecore are specified, then the value of kernelcore
-	 * will be used for required_kernelcore if it's greater than
-	 * what movablecore would have allowed.
-	 */
-	if (required_movablecore) {
-		unsigned long corepages;
-
-		/*
-		 * Round-up so that ZONE_MOVABLE is at least as large as what
-		 * was requested by the user
-		 */
-		required_movablecore =
-			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
-		required_movablecore = min(totalpages, required_movablecore);
-		corepages = totalpages - required_movablecore;
-
-		required_kernelcore = max(required_kernelcore, corepages);
-	}
-
-	/*
-	 * If kernelcore was not specified or kernelcore size is larger
-	 * than totalpages, there is no ZONE_MOVABLE.
-	 */
-	if (!required_kernelcore || required_kernelcore >= totalpages)
-		goto out;
-
-	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
-
-restart:
-	/* Spread kernelcore memory as evenly as possible throughout nodes */
-	kernelcore_node = required_kernelcore / usable_nodes;
-	for_each_node_state(nid, N_MEMORY) {
-		unsigned long start_pfn, end_pfn;
+#ifdef CONFIG_MMU
+	int high;
+	int nr_split_cpus;
+	unsigned long total_pages;
 
+	if (!percpu_pagelist_high_fraction) {
 		/*
-		 * Recalculate kernelcore_node if the division per node
-		 * now exceeds what is necessary to satisfy the requested
-		 * amount of memory for the kernel
+		 * By default, the high value of the pcp is based on the zone
+		 * low watermark so that if they are full then background
+		 * reclaim will not be started prematurely.
 		 */
-		if (required_kernelcore < kernelcore_node)
-			kernelcore_node = required_kernelcore / usable_nodes;
-
+		total_pages = low_wmark_pages(zone);
+	} else {
 		/*
-		 * As the map is walked, we track how much memory is usable
-		 * by the kernel using kernelcore_remaining. When it is
-		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 * If percpu_pagelist_high_fraction is configured, the high
+		 * value is based on a fraction of the managed pages in the
+		 * zone.
 		 */
-		kernelcore_remaining = kernelcore_node;
-
-		/* Go through each range of PFNs within this node */
-		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-			unsigned long size_pages;
-
-			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
-			if (start_pfn >= end_pfn)
-				continue;
-
-			/* Account for what is only usable for kernelcore */
-			if (start_pfn < usable_startpfn) {
-				unsigned long kernel_pages;
-				kernel_pages = min(end_pfn, usable_startpfn)
-								- start_pfn;
-
-				kernelcore_remaining -= min(kernel_pages,
-							kernelcore_remaining);
-				required_kernelcore -= min(kernel_pages,
-							required_kernelcore);
-
-				/* Continue if range is now fully accounted */
-				if (end_pfn <= usable_startpfn) {
-
-					/*
-					 * Push zone_movable_pfn to the end so
-					 * that if we have to rebalance
-					 * kernelcore across nodes, we will
-					 * not double account here
-					 */
-					zone_movable_pfn[nid] = end_pfn;
-					continue;
-				}
-				start_pfn = usable_startpfn;
-			}
-
-			/*
-			 * The usable PFN range for ZONE_MOVABLE is from
-			 * start_pfn->end_pfn. Calculate size_pages as the
-			 * number of pages used as kernelcore
-			 */
-			size_pages = end_pfn - start_pfn;
-			if (size_pages > kernelcore_remaining)
-				size_pages = kernelcore_remaining;
-			zone_movable_pfn[nid] = start_pfn + size_pages;
-
-			/*
-			 * Some kernelcore has been met, update counts and
-			 * break if the kernelcore for this node has been
-			 * satisfied
-			 */
-			required_kernelcore -= min(required_kernelcore,
-								size_pages);
-			kernelcore_remaining -= size_pages;
-			if (!kernelcore_remaining)
-				break;
-		}
+		total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
 	}
 
 	/*
-	 * If there is still required_kernelcore, we do another pass with one
-	 * less node in the count. This will push zone_movable_pfn[nid] further
-	 * along on the nodes that still have memory until kernelcore is
-	 * satisfied
+	 * Split the high value across all online CPUs local to the zone. Note
+	 * that early in boot that CPUs may not be online yet and that during
+	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
+	 * onlined. For memory nodes that have no CPUs, split pcp->high across
+	 * all online CPUs to mitigate the risk that reclaim is triggered
+	 * prematurely due to pages stored on pcp lists.
 	 */
-	usable_nodes--;
-	if (usable_nodes && required_kernelcore > usable_nodes)
-		goto restart;
-
-out2:
-	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
-	for (nid = 0; nid < MAX_NUMNODES; nid++) {
-		unsigned long start_pfn, end_pfn;
-
-		zone_movable_pfn[nid] =
-			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
-
-		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-		if (zone_movable_pfn[nid] >= end_pfn)
-			zone_movable_pfn[nid] = 0;
-	}
-
-out:
-	/* restore the node_state */
-	node_states[N_MEMORY] = saved_node_state;
-}
+	nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
+	if (!nr_split_cpus)
+		nr_split_cpus = num_online_cpus();
+	high = total_pages / nr_split_cpus;
 
-/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
-{
-	enum zone_type zone_type;
+	/*
+	 * Ensure high is at least batch*4. The multiple is based on the
+	 * historical relationship between high and batch.
+	 */
+	high = max(high, batch << 2);
 
-	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
-		struct zone *zone = &pgdat->node_zones[zone_type];
-		if (populated_zone(zone)) {
-			if (IS_ENABLED(CONFIG_HIGHMEM))
-				node_set_state(nid, N_HIGH_MEMORY);
-			if (zone_type <= ZONE_NORMAL)
-				node_set_state(nid, N_NORMAL_MEMORY);
-			break;
-		}
-	}
+	return high;
+#else
+	return 0;
+#endif
 }
 
 /*
- * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
- * such cases we allow max_zone_pfn sorted in the descending order
+ * pcp->high and pcp->batch values are related and generally batch is lower
+ * than high. They are also related to pcp->count such that count is lower
+ * than high, and as soon as it reaches high, the pcplist is flushed.
+ *
+ * However, guaranteeing these relations at all times would require e.g. write
+ * barriers here but also careful usage of read barriers at the read side, and
+ * thus be prone to error and bad for performance. Thus the update only prevents
+ * store tearing. Any new users of pcp->batch and pcp->high should ensure they
+ * can cope with those fields changing asynchronously, and fully trust only the
+ * pcp->count field on the local CPU with interrupts disabled.
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
  */
-bool __weak arch_has_descending_max_zone_pfns(void)
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+		unsigned long batch)
 {
-	return false;
+	WRITE_ONCE(pcp->batch, batch);
+	WRITE_ONCE(pcp->high, high);
 }
 
-/**
- * free_area_init - Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
- *
- * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by memblock_set_node(), the size of each
- * zone in each node and their holes is calculated. If the maximum PFN
- * between two adjacent zones match, it is assumed that the zone is empty.
- * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- * starts where the previous one ended. For example, ZONE_DMA32 starts
- * at arch_max_dma_pfn.
- */
-void __init free_area_init(unsigned long *max_zone_pfn)
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
 {
-	unsigned long start_pfn, end_pfn;
-	int i, nid, zone;
-	bool descending;
-
-	/* Record where the zone boundaries are */
-	memset(arch_zone_lowest_possible_pfn, 0,
-				sizeof(arch_zone_lowest_possible_pfn));
-	memset(arch_zone_highest_possible_pfn, 0,
-				sizeof(arch_zone_highest_possible_pfn));
-
-	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
-	descending = arch_has_descending_max_zone_pfns();
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		if (descending)
-			zone = MAX_NR_ZONES - i - 1;
-		else
-			zone = i;
-
-		if (zone == ZONE_MOVABLE)
-			continue;
+	int pindex;
 
-		end_pfn = max(max_zone_pfn[zone], start_pfn);
-		arch_zone_lowest_possible_pfn[zone] = start_pfn;
-		arch_zone_highest_possible_pfn[zone] = end_pfn;
+	memset(pcp, 0, sizeof(*pcp));
+	memset(pzstats, 0, sizeof(*pzstats));
 
-		start_pfn = end_pfn;
-	}
+	spin_lock_init(&pcp->lock);
+	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+		INIT_LIST_HEAD(&pcp->lists[pindex]);
 
-	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
-	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-	find_zone_movable_pfns_for_nodes();
+	/*
+	 * Set batch and high values safe for a boot pageset. A true percpu
+	 * pageset's initialization will update them subsequently. Here we don't
+	 * need to be as careful as pageset_update() as nobody can access the
+	 * pageset yet.
+	 */
+	pcp->high = BOOT_PAGESET_HIGH;
+	pcp->batch = BOOT_PAGESET_BATCH;
+	pcp->free_factor = 0;
+}
 
-	/* Print out the zone ranges */
-	pr_info("Zone ranges:\n");
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		if (i == ZONE_MOVABLE)
-			continue;
-		pr_info("  %-8s ", zone_names[i]);
-		if (arch_zone_lowest_possible_pfn[i] ==
-				arch_zone_highest_possible_pfn[i])
-			pr_cont("empty\n");
-		else
-			pr_cont("[mem %#018Lx-%#018Lx]\n",
-				(u64)arch_zone_lowest_possible_pfn[i]
-					<< PAGE_SHIFT,
-				((u64)arch_zone_highest_possible_pfn[i]
-					<< PAGE_SHIFT) - 1);
-	}
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+		unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+	int cpu;
 
-	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
-	pr_info("Movable zone start for each node\n");
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (zone_movable_pfn[i])
-			pr_info("  Node %d: %#018Lx\n", i,
-			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+	for_each_possible_cpu(cpu) {
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		pageset_update(pcp, high, batch);
 	}
+}
 
-	/*
-	 * Print out the early node map, and initialize the
-	 * subsection-map relative to active online memory ranges to
-	 * enable future "sub-section" extensions of the memory map.
-	 */
-	pr_info("Early memory node ranges\n");
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
-		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
-			(u64)start_pfn << PAGE_SHIFT,
-			((u64)end_pfn << PAGE_SHIFT) - 1);
-		subsection_map_init(start_pfn, end_pfn - start_pfn);
-	}
-
-	/* Initialise every node */
-	mminit_verify_pageflags_layout();
-	setup_nr_node_ids();
-	for_each_node(nid) {
-		pg_data_t *pgdat;
-
-		if (!node_online(nid)) {
-			pr_info("Initializing node %d as memoryless\n", nid);
-
-			/* Allocator not initialized yet */
-			pgdat = arch_alloc_nodedata(nid);
-			if (!pgdat)
-				panic("Cannot allocate %zuB for node %d.\n",
-				       sizeof(*pgdat), nid);
-			arch_refresh_nodedata(nid, pgdat);
-			free_area_init_memoryless_node(nid);
+/*
+ * Calculate and set new high and batch values for all per-cpu pagesets of a
+ * zone based on the zone's size.
+ */
+static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
+{
+	int new_high, new_batch;
 
-			/*
-			 * We do not want to confuse userspace by sysfs
-			 * files/directories for node without any memory
-			 * attached to it, so this node is not marked as
-			 * N_MEMORY and not marked online so that no sysfs
-			 * hierarchy will be created via register_one_node for
-			 * it. The pgdat will get fully initialized by
-			 * hotadd_init_pgdat() when memory is hotplugged into
-			 * this node.
-			 */
-			continue;
-		}
+	new_batch = max(1, zone_batchsize(zone));
+	new_high = zone_highsize(zone, new_batch, cpu_online);
 
-		pgdat = NODE_DATA(nid);
-		free_area_init_node(nid);
+	if (zone->pageset_high == new_high &&
+	    zone->pageset_batch == new_batch)
+		return;
 
-		/* Any memory on that node */
-		if (pgdat->node_present_pages)
-			node_set_state(nid, N_MEMORY);
-		check_for_memory(pgdat, nid);
-	}
+	zone->pageset_high = new_high;
+	zone->pageset_batch = new_batch;
 
-	memmap_init();
+	__zone_set_pageset_high_and_batch(zone, new_high, new_batch);
 }
 
-static int __init cmdline_parse_core(char *p, unsigned long *core,
-				     unsigned long *percent)
+void __meminit setup_zone_pageset(struct zone *zone)
 {
-	unsigned long long coremem;
-	char *endptr;
-
-	if (!p)
-		return -EINVAL;
+	int cpu;
 
-	/* Value may be a percentage of total memory, otherwise bytes */
-	coremem = simple_strtoull(p, &endptr, 0);
-	if (*endptr == '%') {
-		/* Paranoid check for percent values greater than 100 */
-		WARN_ON(coremem > 100);
+	/* Size may be 0 on !SMP && !NUMA */
+	if (sizeof(struct per_cpu_zonestat) > 0)
+		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
 
-		*percent = coremem;
-	} else {
-		coremem = memparse(p, &p);
-		/* Paranoid check that UL is enough for the coremem value */
-		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pages *pcp;
+		struct per_cpu_zonestat *pzstats;
 
-		*core = coremem >> PAGE_SHIFT;
-		*percent = 0UL;
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+		per_cpu_pages_init(pcp, pzstats);
 	}
-	return 0;
+
+	zone_set_pageset_high_and_batch(zone, 0);
 }
 
 /*
- * kernelcore=size sets the amount of memory for use for allocations that
- * cannot be reclaimed or migrated.
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
  */
-static int __init cmdline_parse_kernelcore(char *p)
+static void zone_pcp_update(struct zone *zone, int cpu_online)
 {
-	/* parse kernelcore=mirror */
-	if (parse_option_str(p, "mirror")) {
-		mirrored_kernelcore = true;
-		return 0;
-	}
-
-	return cmdline_parse_core(p, &required_kernelcore,
-				  &required_kernelcore_percent);
+	mutex_lock(&pcp_batch_high_lock);
+	zone_set_pageset_high_and_batch(zone, cpu_online);
+	mutex_unlock(&pcp_batch_high_lock);
 }
 
 /*
- * movablecore=size sets the amount of memory for use for allocations that
- * can be reclaimed or migrated.
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
  */
-static int __init cmdline_parse_movablecore(char *p)
+void __init setup_per_cpu_pageset(void)
 {
-	return cmdline_parse_core(p, &required_movablecore,
-				  &required_movablecore_percent);
+	struct pglist_data *pgdat;
+	struct zone *zone;
+	int __maybe_unused cpu;
+
+	for_each_populated_zone(zone)
+		setup_zone_pageset(zone);
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Unpopulated zones continue using the boot pagesets.
+	 * The numa stats for these pagesets need to be reset.
+	 * Otherwise, they will end up skewing the stats of
+	 * the nodes these zones are associated with.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+		memset(pzstats->vm_numa_event, 0,
+		       sizeof(pzstats->vm_numa_event));
+	}
+#endif
+
+	for_each_online_pgdat(pgdat)
+		pgdat->per_cpu_nodestats =
+			alloc_percpu(struct per_cpu_nodestat);
 }
 
-early_param("kernelcore", cmdline_parse_kernelcore);
-early_param("movablecore", cmdline_parse_movablecore);
+__meminit void zone_pcp_init(struct zone *zone)
+{
+	/*
+	 * per cpu subsystem is not up at this point. The following code
+	 * relies on the ability of the linker to provide the
+	 * offset of a (static) per cpu variable into the per cpu area.
+	 */
+	zone->per_cpu_pageset = &boot_pageset;
+	zone->per_cpu_zonestats = &boot_zonestats;
+	zone->pageset_high = BOOT_PAGESET_HIGH;
+	zone->pageset_batch = BOOT_PAGESET_BATCH;
+
+	if (populated_zone(zone))
+		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
+			 zone->present_pages, zone_batchsize(zone));
+}
 
 void adjust_managed_page_count(struct page *page, long count)
 {
@@ -8516,22 +6353,6 @@ void __init mem_init_print_info(void)
 		);
 }
 
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
-	dma_reserve = new_dma_reserve;
-}
-
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
 	struct zone *zone;
@@ -8976,149 +6797,6 @@ out:
 	return ret;
 }
 
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
-	return 0;
-}
-#endif
-
-/*
- * Adaptive scale is meant to reduce sizes of hash tables on large memory
- * machines. As memory size is increased the scale is also increased but at
- * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
- * quadruples the scale is increased by one, which means the size of hash table
- * only doubles, instead of quadrupling as well.
- * Because 32-bit systems cannot have large physical memory, where this scaling
- * makes sense, it is disabled on such platforms.
- */
-#if __BITS_PER_LONG > 32
-#define ADAPT_SCALE_BASE	(64ul << 30)
-#define ADAPT_SCALE_SHIFT	2
-#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
-#endif
-
-/*
- * allocate a large system hash table from bootmem
- * - it is assumed that the hash table must contain an exact power-of-2
- *   quantity of entries
- * - limit is the number of hash buckets, not the total allocation size
- */
-void *__init alloc_large_system_hash(const char *tablename,
-				     unsigned long bucketsize,
-				     unsigned long numentries,
-				     int scale,
-				     int flags,
-				     unsigned int *_hash_shift,
-				     unsigned int *_hash_mask,
-				     unsigned long low_limit,
-				     unsigned long high_limit)
-{
-	unsigned long long max = high_limit;
-	unsigned long log2qty, size;
-	void *table;
-	gfp_t gfp_flags;
-	bool virt;
-	bool huge;
-
-	/* allow the kernel cmdline to have a say */
-	if (!numentries) {
-		/* round applicable memory size up to nearest megabyte */
-		numentries = nr_kernel_pages;
-		numentries -= arch_reserved_kernel_pages();
-
-		/* It isn't necessary when PAGE_SIZE >= 1MB */
-		if (PAGE_SIZE < SZ_1M)
-			numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
-
-#if __BITS_PER_LONG > 32
-		if (!high_limit) {
-			unsigned long adapt;
-
-			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
-			     adapt <<= ADAPT_SCALE_SHIFT)
-				scale++;
-		}
-#endif
-
-		/* limit to 1 bucket per 2^scale bytes of low memory */
-		if (scale > PAGE_SHIFT)
-			numentries >>= (scale - PAGE_SHIFT);
-		else
-			numentries <<= (PAGE_SHIFT - scale);
-
-		/* Make sure we've got at least a 0-order allocation.. */
-		if (unlikely(flags & HASH_SMALL)) {
-			/* Makes no sense without HASH_EARLY */
-			WARN_ON(!(flags & HASH_EARLY));
-			if (!(numentries >> *_hash_shift)) {
-				numentries = 1UL << *_hash_shift;
-				BUG_ON(!numentries);
-			}
-		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
-			numentries = PAGE_SIZE / bucketsize;
-	}
-	numentries = roundup_pow_of_two(numentries);
-
-	/* limit allocation size to 1/16 total memory by default */
-	if (max == 0) {
-		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
-		do_div(max, bucketsize);
-	}
-	max = min(max, 0x80000000ULL);
-
-	if (numentries < low_limit)
-		numentries = low_limit;
-	if (numentries > max)
-		numentries = max;
-
-	log2qty = ilog2(numentries);
-
-	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
-	do {
-		virt = false;
-		size = bucketsize << log2qty;
-		if (flags & HASH_EARLY) {
-			if (flags & HASH_ZERO)
-				table = memblock_alloc(size, SMP_CACHE_BYTES);
-			else
-				table = memblock_alloc_raw(size,
-							   SMP_CACHE_BYTES);
-		} else if (get_order(size) > MAX_ORDER || hashdist) {
-			table = vmalloc_huge(size, gfp_flags);
-			virt = true;
-			if (table)
-				huge = is_vm_area_hugepages(table);
-		} else {
-			/*
-			 * If bucketsize is not a power-of-two, we may free
-			 * some pages at the end of hash table which
-			 * alloc_pages_exact() automatically does
-			 */
-			table = alloc_pages_exact(size, gfp_flags);
-			kmemleak_alloc(table, size, 1, gfp_flags);
-		}
-	} while (!table && size > PAGE_SIZE && --log2qty);
-
-	if (!table)
-		panic("Failed to allocate %s hash table\n", tablename);
-
-	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
-		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
-		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
-
-	if (_hash_shift)
-		*_hash_shift = log2qty;
-	if (_hash_mask)
-		*_hash_mask = (1 << log2qty) - 1;
-
-	return table;
-}
-
 #ifdef CONFIG_CONTIG_ALLOC
 #if defined(CONFIG_DYNAMIC_DEBUG) || \
 	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
-- 
cgit v1.2.3


From c4fbed4b0284d91797ee3cf60983b94c84c396b6 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:04 +0200
Subject: mm/page_alloc: rename page_alloc_init() to page_alloc_init_cpuhp()

The page_alloc_init() name is really misleading because all this function
does is sets up CPU hotplug callbacks for the page allocator.

Rename it to page_alloc_init_cpuhp() so that name will reflect what the
function does.

Link: https://lkml.kernel.org/r/20230321170513.2401534-6-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 2 +-
 init/main.c         | 2 +-
 mm/page_alloc.c     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7c554e4bd49f..ed8cb537c6a7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -319,7 +319,7 @@ extern void page_frag_free(void *addr);
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
-void page_alloc_init(void);
+void page_alloc_init_cpuhp(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/init/main.c b/init/main.c
index bb87b789c543..f285bc53c345 100644
--- a/init/main.c
+++ b/init/main.c
@@ -973,7 +973,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 	boot_cpu_hotplug_init();
 
 	build_all_zonelists(NULL);
-	page_alloc_init();
+	page_alloc_init_cpuhp();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
 	/* parameters may set static keys */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3abc994bcfa1..1131b87586d5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6393,7 +6393,7 @@ static int page_alloc_cpu_online(unsigned int cpu)
 	return 0;
 }
 
-void __init page_alloc_init(void)
+void __init page_alloc_init_cpuhp(void)
 {
 	int ret;
 
-- 
cgit v1.2.3


From b7ec1bf3e7b9dd9d3335c937f5d834680d74addf Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:06 +0200
Subject: init,mm: move mm_init() to mm/mm_init.c and rename it to
 mm_core_init()

Make mm_init() a part of mm/ codebase.  mm_core_init() better describes
what the function does and does not clash with mm_init() in kernel/fork.c

Link: https://lkml.kernel.org/r/20230321170513.2401534-8-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 +
 init/main.c        | 71 ++--------------------------------------------------
 mm/mm_init.c       | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..6be179cec081 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -38,6 +38,7 @@ struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
 
+void mm_core_init(void);
 void init_mm_internals(void);
 
 #ifndef CONFIG_NUMA		/* Don't use mapnrs, do it properly */
diff --git a/init/main.c b/init/main.c
index c240cce5bc86..f1b2d8d80987 100644
--- a/init/main.c
+++ b/init/main.c
@@ -807,73 +807,6 @@ static inline void initcall_debug_enable(void)
 }
 #endif
 
-/* Report memory auto-initialization states for this boot. */
-static void __init report_meminit(void)
-{
-	const char *stack;
-
-	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
-		stack = "all(pattern)";
-	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
-		stack = "all(zero)";
-	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
-		stack = "byref_all(zero)";
-	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
-		stack = "byref(zero)";
-	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
-		stack = "__user(zero)";
-	else
-		stack = "off";
-
-	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
-		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
-		want_init_on_free() ? "on" : "off");
-	if (want_init_on_free())
-		pr_info("mem auto-init: clearing system memory may take some time...\n");
-}
-
-/*
- * Set up kernel memory allocators
- */
-static void __init mm_init(void)
-{
-	/* Initializations relying on SMP setup */
-	build_all_zonelists(NULL);
-	page_alloc_init_cpuhp();
-
-	/*
-	 * page_ext requires contiguous pages,
-	 * bigger than MAX_ORDER unless SPARSEMEM.
-	 */
-	page_ext_init_flatmem();
-	init_mem_debugging_and_hardening();
-	kfence_alloc_pool();
-	report_meminit();
-	kmsan_init_shadow();
-	stack_depot_early_init();
-	mem_init();
-	mem_init_print_info();
-	kmem_cache_init();
-	/*
-	 * page_owner must be initialized after buddy is ready, and also after
-	 * slab is ready so that stack_depot_init() works properly
-	 */
-	page_ext_init_flatmem_late();
-	kmemleak_init();
-	pgtable_init();
-	debug_objects_mem_init();
-	vmalloc_init();
-	/* If no deferred init page_ext now, as vmap is fully initialized */
-	if (!deferred_struct_pages)
-		page_ext_init();
-	/* Should be run before the first non-init thread is created */
-	init_espfix_bsp();
-	/* Should be run after espfix64 is set up. */
-	pti_init();
-	kmsan_init_runtime();
-	mm_cache_init();
-}
-
 #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
 DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
 			   randomize_kstack_offset);
@@ -997,13 +930,13 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 
 	/*
 	 * These use large bootmem allocations and must precede
-	 * kmem_cache_init()
+	 * initalization of page allocator
 	 */
 	setup_log_buf(0);
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
-	mm_init();
+	mm_core_init();
 	poking_init();
 	ftrace_init();
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2e60c7186132..bba73f1fb277 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -20,9 +20,15 @@
 #include <linux/nmi.h>
 #include <linux/buffer_head.h>
 #include <linux/kmemleak.h>
+#include <linux/kfence.h>
+#include <linux/page_ext.h>
+#include <linux/pti.h>
+#include <linux/pgtable.h>
 #include "internal.h"
 #include "shuffle.h"
 
+#include <asm/setup.h>
+
 #ifdef CONFIG_DEBUG_MEMORY_INIT
 int __meminitdata mminit_loglevel;
 
@@ -2524,3 +2530,70 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
 	}
 	__free_pages_core(page, order);
 }
+
+/* Report memory auto-initialization states for this boot. */
+static void __init report_meminit(void)
+{
+	const char *stack;
+
+	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
+		stack = "all(pattern)";
+	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
+		stack = "all(zero)";
+	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
+		stack = "byref_all(zero)";
+	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
+		stack = "byref(zero)";
+	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
+		stack = "__user(zero)";
+	else
+		stack = "off";
+
+	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
+		want_init_on_free() ? "on" : "off");
+	if (want_init_on_free())
+		pr_info("mem auto-init: clearing system memory may take some time...\n");
+}
+
+/*
+ * Set up kernel memory allocators
+ */
+void __init mm_core_init(void)
+{
+	/* Initializations relying on SMP setup */
+	build_all_zonelists(NULL);
+	page_alloc_init_cpuhp();
+
+	/*
+	 * page_ext requires contiguous pages,
+	 * bigger than MAX_ORDER unless SPARSEMEM.
+	 */
+	page_ext_init_flatmem();
+	init_mem_debugging_and_hardening();
+	kfence_alloc_pool();
+	report_meminit();
+	kmsan_init_shadow();
+	stack_depot_early_init();
+	mem_init();
+	mem_init_print_info();
+	kmem_cache_init();
+	/*
+	 * page_owner must be initialized after buddy is ready, and also after
+	 * slab is ready so that stack_depot_init() works properly
+	 */
+	page_ext_init_flatmem_late();
+	kmemleak_init();
+	pgtable_init();
+	debug_objects_mem_init();
+	vmalloc_init();
+	/* If no deferred init page_ext now, as vmap is fully initialized */
+	if (!deferred_struct_pages)
+		page_ext_init();
+	/* Should be run before the first non-init thread is created */
+	init_espfix_bsp();
+	/* Should be run after espfix64 is set up. */
+	pti_init();
+	kmsan_init_runtime();
+	mm_cache_init();
+}
-- 
cgit v1.2.3


From 4cd1e9edf60efb20fad35cf5e9ade7ad75b34cd1 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:07 +0200
Subject: mm: call {ptlock,pgtable}_cache_init() directly from mm_core_init()

and drop pgtable_init() as it has no real value and its name is
misleading.

Link: https://lkml.kernel.org/r/20230321170513.2401534-9-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Sergei Shtylyov <sergei.shtylyov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 6 ------
 mm/mm_init.c       | 3 ++-
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6be179cec081..dfff8bcaa766 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2637,12 +2637,6 @@ static inline bool ptlock_init(struct page *page) { return true; }
 static inline void ptlock_free(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline void pgtable_init(void)
-{
-	ptlock_cache_init();
-	pgtable_cache_init();
-}
-
 static inline bool pgtable_pte_page_ctor(struct page *page)
 {
 	if (!ptlock_init(page))
diff --git a/mm/mm_init.c b/mm/mm_init.c
index bba73f1fb277..f1475413394d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2584,7 +2584,8 @@ void __init mm_core_init(void)
 	 */
 	page_ext_init_flatmem_late();
 	kmemleak_init();
-	pgtable_init();
+	ptlock_cache_init();
+	pgtable_cache_init();
 	debug_objects_mem_init();
 	vmalloc_init();
 	/* If no deferred init page_ext now, as vmap is fully initialized */
-- 
cgit v1.2.3


From f2fc4b44ec2bb94c51c7ae1af9b1177d72705992 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:08 +0200
Subject: mm: move init_mem_debugging_and_hardening() to mm/mm_init.c

init_mem_debugging_and_hardening() is only called from mm_core_init().

Move it close to the caller, make it static and rename it to
mem_debugging_and_hardening_init() for consistency with surrounding
convention.

Link: https://lkml.kernel.org/r/20230321170513.2401534-10-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 -
 mm/internal.h      |  8 +++++
 mm/mm_init.c       | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/page_alloc.c    | 95 ------------------------------------------------------
 4 files changed, 98 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dfff8bcaa766..0fa6a6f94eda 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3251,7 +3251,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
 				   unsigned long address, unsigned long size,
 				   pte_fn_t fn, void *data);
 
-extern void __init init_mem_debugging_and_hardening(void);
 #ifdef CONFIG_PAGE_POISONING
 extern void __kernel_poison_pages(struct page *page, int numpages);
 extern void __kernel_unpoison_pages(struct page *page, int numpages);
diff --git a/mm/internal.h b/mm/internal.h
index 22f1410a0ee3..a2934c610182 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -204,6 +204,14 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 
 extern char * const zone_names[MAX_NR_ZONES];
 
+/* perform sanity checks on struct pages being allocated or freed */
+DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
+static inline bool is_check_pages_enabled(void)
+{
+	return static_branch_unlikely(&check_pages_enabled);
+}
+
 /*
  * Structure for holding the mostly immutable allocation parameters passed
  * between functions involved in allocations, including the alloc_pages*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f1475413394d..43f6d3ed24ef 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2531,6 +2531,95 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
 	__free_pages_core(page, order);
 }
 
+static bool _init_on_alloc_enabled_early __read_mostly
+				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
+static int __init early_init_on_alloc(char *buf)
+{
+
+	return kstrtobool(buf, &_init_on_alloc_enabled_early);
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static bool _init_on_free_enabled_early __read_mostly
+				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
+static int __init early_init_on_free(char *buf)
+{
+	return kstrtobool(buf, &_init_on_free_enabled_early);
+}
+early_param("init_on_free", early_init_on_free);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
+/*
+ * Enable static keys related to various memory debugging and hardening options.
+ * Some override others, and depend on early params that are evaluated in the
+ * order of appearance. So we need to first gather the full picture of what was
+ * enabled, and then make decisions.
+ */
+static void __init mem_debugging_and_hardening_init(void)
+{
+	bool page_poisoning_requested = false;
+	bool want_check_pages = false;
+
+#ifdef CONFIG_PAGE_POISONING
+	/*
+	 * Page poisoning is debug page alloc for some arches. If
+	 * either of those options are enabled, enable poisoning.
+	 */
+	if (page_poisoning_enabled() ||
+	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+	      debug_pagealloc_enabled())) {
+		static_branch_enable(&_page_poisoning_enabled);
+		page_poisoning_requested = true;
+		want_check_pages = true;
+	}
+#endif
+
+	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+	    page_poisoning_requested) {
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+			"will take precedence over init_on_alloc and init_on_free\n");
+		_init_on_alloc_enabled_early = false;
+		_init_on_free_enabled_early = false;
+	}
+
+	if (_init_on_alloc_enabled_early) {
+		want_check_pages = true;
+		static_branch_enable(&init_on_alloc);
+	} else {
+		static_branch_disable(&init_on_alloc);
+	}
+
+	if (_init_on_free_enabled_early) {
+		want_check_pages = true;
+		static_branch_enable(&init_on_free);
+	} else {
+		static_branch_disable(&init_on_free);
+	}
+
+	if (IS_ENABLED(CONFIG_KMSAN) &&
+	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
+		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	if (debug_pagealloc_enabled()) {
+		want_check_pages = true;
+		static_branch_enable(&_debug_pagealloc_enabled);
+
+		if (debug_guardpage_minorder())
+			static_branch_enable(&_debug_guardpage_enabled);
+	}
+#endif
+
+	/*
+	 * Any page debugging or hardening option also enables sanity checking
+	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
+	 * enabled already.
+	 */
+	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
+		static_branch_enable(&check_pages_enabled);
+}
+
 /* Report memory auto-initialization states for this boot. */
 static void __init report_meminit(void)
 {
@@ -2570,7 +2659,7 @@ void __init mm_core_init(void)
 	 * bigger than MAX_ORDER unless SPARSEMEM.
 	 */
 	page_ext_init_flatmem();
-	init_mem_debugging_and_hardening();
+	mem_debugging_and_hardening_init();
 	kfence_alloc_pool();
 	report_meminit();
 	kmsan_init_shadow();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1131b87586d5..94bf3b7b5c49 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -240,31 +240,6 @@ EXPORT_SYMBOL(init_on_alloc);
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 EXPORT_SYMBOL(init_on_free);
 
-/* perform sanity checks on struct pages being allocated or freed */
-static DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
-
-static inline bool is_check_pages_enabled(void)
-{
-	return static_branch_unlikely(&check_pages_enabled);
-}
-
-static bool _init_on_alloc_enabled_early __read_mostly
-				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
-static int __init early_init_on_alloc(char *buf)
-{
-
-	return kstrtobool(buf, &_init_on_alloc_enabled_early);
-}
-early_param("init_on_alloc", early_init_on_alloc);
-
-static bool _init_on_free_enabled_early __read_mostly
-				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
-static int __init early_init_on_free(char *buf)
-{
-	return kstrtobool(buf, &_init_on_free_enabled_early);
-}
-early_param("init_on_free", early_init_on_free);
-
 /*
  * A cached value of the page's pageblock's migratetype, used when the page is
  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
@@ -798,76 +773,6 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
 				unsigned int order, int migratetype) {}
 #endif
 
-/*
- * Enable static keys related to various memory debugging and hardening options.
- * Some override others, and depend on early params that are evaluated in the
- * order of appearance. So we need to first gather the full picture of what was
- * enabled, and then make decisions.
- */
-void __init init_mem_debugging_and_hardening(void)
-{
-	bool page_poisoning_requested = false;
-	bool want_check_pages = false;
-
-#ifdef CONFIG_PAGE_POISONING
-	/*
-	 * Page poisoning is debug page alloc for some arches. If
-	 * either of those options are enabled, enable poisoning.
-	 */
-	if (page_poisoning_enabled() ||
-	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
-	      debug_pagealloc_enabled())) {
-		static_branch_enable(&_page_poisoning_enabled);
-		page_poisoning_requested = true;
-		want_check_pages = true;
-	}
-#endif
-
-	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
-	    page_poisoning_requested) {
-		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
-			"will take precedence over init_on_alloc and init_on_free\n");
-		_init_on_alloc_enabled_early = false;
-		_init_on_free_enabled_early = false;
-	}
-
-	if (_init_on_alloc_enabled_early) {
-		want_check_pages = true;
-		static_branch_enable(&init_on_alloc);
-	} else {
-		static_branch_disable(&init_on_alloc);
-	}
-
-	if (_init_on_free_enabled_early) {
-		want_check_pages = true;
-		static_branch_enable(&init_on_free);
-	} else {
-		static_branch_disable(&init_on_free);
-	}
-
-	if (IS_ENABLED(CONFIG_KMSAN) &&
-	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
-		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	if (debug_pagealloc_enabled()) {
-		want_check_pages = true;
-		static_branch_enable(&_debug_pagealloc_enabled);
-
-		if (debug_guardpage_minorder())
-			static_branch_enable(&_debug_guardpage_enabled);
-	}
-#endif
-
-	/*
-	 * Any page debugging or hardening option also enables sanity checking
-	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
-	 * enabled already.
-	 */
-	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
-		static_branch_enable(&check_pages_enabled);
-}
-
 static inline void set_buddy_order(struct page *page, unsigned int order)
 {
 	set_page_private(page, order);
-- 
cgit v1.2.3


From de57807e6f267a658a046dbca44dc40fe806d60f Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:09 +0200
Subject: init,mm: fold late call to page_ext_init() to page_alloc_init_late()

When deferred initialization of struct pages is enabled, page_ext_init()
must be called after all the deferred initialization is done, but there is
no point to keep it a separate call from kernel_init_freeable() right
after page_alloc_init_late().

Fold the call to page_ext_init() into page_alloc_init_late() and localize
deferred_struct_pages variable.

Link: https://lkml.kernel.org/r/20230321170513.2401534-11-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_ext.h | 2 --
 init/main.c              | 4 ----
 mm/mm_init.c             | 6 +++++-
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index bc2e39090a1f..67314f648aeb 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -29,8 +29,6 @@ struct page_ext_operations {
 	bool need_shared_flags;
 };
 
-extern bool deferred_struct_pages;
-
 #ifdef CONFIG_PAGE_EXTENSION
 
 /*
diff --git a/init/main.c b/init/main.c
index f1b2d8d80987..43bc4c82dc58 100644
--- a/init/main.c
+++ b/init/main.c
@@ -62,7 +62,6 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
-#include <linux/page_ext.h>
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
@@ -1565,9 +1564,6 @@ static noinline void __init kernel_init_freeable(void)
 
 	padata_init();
 	page_alloc_init_late();
-	/* Initialize page ext after all struct pages are initialized. */
-	if (deferred_struct_pages)
-		page_ext_init();
 
 	do_basic_setup();
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 43f6d3ed24ef..ff70da11e797 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -225,7 +225,7 @@ static unsigned long nr_kernel_pages __initdata;
 static unsigned long nr_all_pages __initdata;
 static unsigned long dma_reserve __initdata;
 
-bool deferred_struct_pages __meminitdata;
+static bool deferred_struct_pages __meminitdata;
 
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
@@ -2358,6 +2358,10 @@ void __init page_alloc_init_late(void)
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
+
+	/* Initialize page ext after all struct pages are initialized. */
+	if (deferred_struct_pages)
+		page_ext_init();
 }
 
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-- 
cgit v1.2.3


From eb8589b4f8c107c346421881963c0ee0b8367c2c Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:10 +0200
Subject: mm: move mem_init_print_info() to mm_init.c

mem_init_print_info() is only called from mm_core_init().

Move it close to the caller and make it static.

Link: https://lkml.kernel.org/r/20230321170513.2401534-12-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 -
 mm/internal.h      |  1 +
 mm/mm_init.c       | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c    | 53 -----------------------------------------------------
 4 files changed, 54 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0fa6a6f94eda..c3abad7e6ccb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2780,7 +2780,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, const char *s);
 
 extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(void);
 
 extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
diff --git a/mm/internal.h b/mm/internal.h
index a2934c610182..7cea7d833c50 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -201,6 +201,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 /*
  * in mm/page_alloc.c
  */
+#define K(x) ((x) << (PAGE_SHIFT-10))
 
 extern char * const zone_names[MAX_NR_ZONES];
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index ff70da11e797..8adadf51bbd2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -24,6 +24,8 @@
 #include <linux/page_ext.h>
 #include <linux/pti.h>
 #include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/cma.h>
 #include "internal.h"
 #include "shuffle.h"
 
@@ -2649,6 +2651,57 @@ static void __init report_meminit(void)
 		pr_info("mem auto-init: clearing system memory may take some time...\n");
 }
 
+static void __init mem_init_print_info(void)
+{
+	unsigned long physpages, codesize, datasize, rosize, bss_size;
+	unsigned long init_code_size, init_data_size;
+
+	physpages = get_num_physpages();
+	codesize = _etext - _stext;
+	datasize = _edata - _sdata;
+	rosize = __end_rodata - __start_rodata;
+	bss_size = __bss_stop - __bss_start;
+	init_data_size = __init_end - __init_begin;
+	init_code_size = _einittext - _sinittext;
+
+	/*
+	 * Detect special cases and adjust section sizes accordingly:
+	 * 1) .init.* may be embedded into .data sections
+	 * 2) .init.text.* may be out of [__init_begin, __init_end],
+	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
+	 * 3) .rodata.* may be embedded into .text or .data sections.
+	 */
+#define adj_init_size(start, end, size, pos, adj) \
+	do { \
+		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
+			size -= adj; \
+	} while (0)
+
+	adj_init_size(__init_begin, __init_end, init_data_size,
+		     _sinittext, init_code_size);
+	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef	adj_init_size
+
+	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
+#ifdef	CONFIG_HIGHMEM
+		", %luK highmem"
+#endif
+		")\n",
+		K(nr_free_pages()), K(physpages),
+		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
+		K(physpages - totalram_pages() - totalcma_pages),
+		K(totalcma_pages)
+#ifdef	CONFIG_HIGHMEM
+		, K(totalhigh_pages())
+#endif
+		);
+}
+
 /*
  * Set up kernel memory allocators
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94bf3b7b5c49..d0eb280ec7e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5246,8 +5246,6 @@ static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask
 	return !node_isset(nid, *nodemask);
 }
 
-#define K(x) ((x) << (PAGE_SHIFT-10))
-
 static void show_migration_types(unsigned char type)
 {
 	static const char types[MIGRATE_TYPES] = {
@@ -6207,57 +6205,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	return pages;
 }
 
-void __init mem_init_print_info(void)
-{
-	unsigned long physpages, codesize, datasize, rosize, bss_size;
-	unsigned long init_code_size, init_data_size;
-
-	physpages = get_num_physpages();
-	codesize = _etext - _stext;
-	datasize = _edata - _sdata;
-	rosize = __end_rodata - __start_rodata;
-	bss_size = __bss_stop - __bss_start;
-	init_data_size = __init_end - __init_begin;
-	init_code_size = _einittext - _sinittext;
-
-	/*
-	 * Detect special cases and adjust section sizes accordingly:
-	 * 1) .init.* may be embedded into .data sections
-	 * 2) .init.text.* may be out of [__init_begin, __init_end],
-	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
-	 * 3) .rodata.* may be embedded into .text or .data sections.
-	 */
-#define adj_init_size(start, end, size, pos, adj) \
-	do { \
-		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
-			size -= adj; \
-	} while (0)
-
-	adj_init_size(__init_begin, __init_end, init_data_size,
-		     _sinittext, init_code_size);
-	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
-	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
-	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
-	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
-
-#undef	adj_init_size
-
-	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
-#ifdef	CONFIG_HIGHMEM
-		", %luK highmem"
-#endif
-		")\n",
-		K(nr_free_pages()), K(physpages),
-		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
-		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
-		K(physpages - totalram_pages() - totalcma_pages),
-		K(totalcma_pages)
-#ifdef	CONFIG_HIGHMEM
-		, K(totalhigh_pages())
-#endif
-		);
-}
-
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
 	struct zone *zone;
-- 
cgit v1.2.3


From d5d2c02a4980c2e22037679457bf2d921b86a503 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:11 +0200
Subject: mm: move kmem_cache_init() declaration to mm/slab.h

kmem_cache_init() is called only from mm_core_init(), there is no need to
declare it in include/linux/slab.h

Move kmem_cache_init() declaration to mm/slab.h

Link: https://lkml.kernel.org/r/20230321170513.2401534-13-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h | 1 -
 mm/mm_init.c         | 1 +
 mm/slab.h            | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index aa4575ef2965..f8b1d63c63a3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -167,7 +167,6 @@ struct mem_cgroup;
 /*
  * struct kmem_cache related prototypes
  */
-void __init kmem_cache_init(void);
 bool slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8adadf51bbd2..53fb8e9d1e3b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/cma.h>
 #include "internal.h"
+#include "slab.h"
 #include "shuffle.h"
 
 #include <asm/setup.h>
diff --git a/mm/slab.h b/mm/slab.h
index 43966aa5fadf..3f8df2244f5a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,7 @@
 /*
  * Internal slab definitions
  */
+void __init kmem_cache_init(void);
 
 /* Reuses the bits in struct page */
 struct slab {
-- 
cgit v1.2.3


From b671491199acd3609f87754d268f2f5cb10de18d Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Tue, 21 Mar 2023 19:05:12 +0200
Subject: mm: move vmalloc_init() declaration to mm/internal.h

vmalloc_init() is called only from mm_core_init(), there is no need to
declare it in include/linux/vmalloc.h

Move vmalloc_init() declaration to mm/internal.h

Link: https://lkml.kernel.org/r/20230321170513.2401534-14-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Doug Berger <opendmb@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmalloc.h | 4 ----
 mm/internal.h           | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 69250efa03d1..351fc7697214 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -131,12 +131,8 @@ extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
 extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
-extern void __init vmalloc_init(void);
 extern unsigned long vmalloc_nr_pages(void);
 #else
-static inline void vmalloc_init(void)
-{
-}
 static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
diff --git a/mm/internal.h b/mm/internal.h
index 7cea7d833c50..82ba61d0ed6a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -879,9 +879,14 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
  * mm/vmalloc.c
  */
 #ifdef CONFIG_MMU
+void __init vmalloc_init(void);
 int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift);
 #else
+static inline void vmalloc_init(void)
+{
+}
+
 static inline
 int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift)
-- 
cgit v1.2.3


From bd23024b9774e681cbe6cc3afcb24244dfcb2390 Mon Sep 17 00:00:00 2001
From: Tomas Mudrunka <tomas.mudrunka@gmail.com>
Date: Tue, 21 Mar 2023 11:34:30 +0100
Subject: mm/memtest: add results of early memtest to /proc/meminfo

Currently the memtest results were only presented in dmesg.

When running a large fleet of devices without ECC RAM it's currently not
easy to do bulk monitoring for memory corruption.  You have to parse
dmesg, but that's a ring buffer so the error might disappear after some
time.  In general I do not consider dmesg to be a great API to query RAM
status.

In several companies I've seen such errors remain undetected and cause
issues for way too long.  So I think it makes sense to provide a
monitoring API, so that we can safely detect and act upon them.

This adds /proc/meminfo entry which can be easily used by scripts.

Link: https://lkml.kernel.org/r/20230321103430.7130-1-tomas.mudrunka@gmail.com
Signed-off-by: Tomas Mudrunka <tomas.mudrunka@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/filesystems/proc.rst |  8 ++++++++
 fs/proc/meminfo.c                  | 13 +++++++++++++
 include/linux/memblock.h           |  2 ++
 mm/memtest.c                       |  6 ++++++
 4 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 9d5fd9424e8b..8740362f31c6 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -996,6 +996,7 @@ Example output. You may not have all of these fields.
     VmallocUsed:       40444 kB
     VmallocChunk:          0 kB
     Percpu:            29312 kB
+    EarlyMemtestBad:       0 kB
     HardwareCorrupted:     0 kB
     AnonHugePages:   4149248 kB
     ShmemHugePages:        0 kB
@@ -1146,6 +1147,13 @@ VmallocChunk
 Percpu
               Memory allocated to the percpu allocator used to back percpu
               allocations. This stat excludes the cost of metadata.
+EarlyMemtestBad
+              The amount of RAM/memory in kB, that was identified as corrupted
+              by early memtest. If memtest was not run, this field will not
+              be displayed at all. Size is never rounded down to 0 kB.
+              That means if 0 kB is reported, you can safely assume
+              there was at least one pass of memtest and none of the passes
+              found a single faulty byte of RAM.
 HardwareCorrupted
               The amount of RAM/memory in KB, the kernel identifies as
               corrupted.
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
+#include <linux/memblock.h>
 #include <linux/proc_fs.h>
 #include <linux/percpu.h>
 #include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
 	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
+#ifdef CONFIG_MEMTEST
+	if (early_memtest_done) {
+		unsigned long early_memtest_bad_size_kb;
+
+		early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+		if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+			early_memtest_bad_size_kb = 1;
+		/* When 0 is reported, it means there actually was a successful test */
+		seq_printf(m, "EarlyMemtestBad:   %5lu kB\n", early_memtest_bad_size_kb);
+	}
+#endif
+
 #ifdef CONFIG_MEMORY_FAILURE
 	seq_printf(m, "HardwareCorrupted: %5lu kB\n",
 		   atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 50ad19662a32..f82ee3fac1cd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -597,6 +597,8 @@ extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 #endif
 
 #ifdef CONFIG_MEMTEST
+extern phys_addr_t early_memtest_bad_size;	/* Size of faulty ram found by memtest */
+extern bool early_memtest_done;			/* Was early memtest done? */
 extern void early_memtest(phys_addr_t start, phys_addr_t end);
 #else
 static inline void early_memtest(phys_addr_t start, phys_addr_t end)
diff --git a/mm/memtest.c b/mm/memtest.c
index f53ace709ccd..57149dfee438 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -4,6 +4,9 @@
 #include <linux/init.h>
 #include <linux/memblock.h>
 
+bool early_memtest_done;
+phys_addr_t early_memtest_bad_size;
+
 static u64 patterns[] __initdata = {
 	/* The first entry has to be 0 to leave memtest with zeroed memory */
 	0,
@@ -30,6 +33,7 @@ static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr
 	pr_info("  %016llx bad mem addr %pa - %pa reserved\n",
 		cpu_to_be64(pattern), &start_bad, &end_bad);
 	memblock_reserve(start_bad, end_bad - start_bad);
+	early_memtest_bad_size += (end_bad - start_bad);
 }
 
 static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
@@ -61,6 +65,8 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size
 	}
 	if (start_bad)
 		reserve_bad_mem(pattern, start_bad, last_bad + incr);
+
+	early_memtest_done = true;
 }
 
 static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
-- 
cgit v1.2.3


From 28d8b812e97b31231e95864f36a6b32f4b307daa Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Sun, 12 Mar 2023 23:40:13 +0000
Subject: mm: remove unused vmf_insert_mixed_prot()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Remove drm/ttm-specific mm changes".

Functionality was added specifically for the DRM TTM driver to support
mapping memory for VM_MIXEDMAP VMAs with customised protection flags,
however this has now been rolled back as issues were found with this
approach.

This series removes the mm changes too, retaining some of the useful
comments.


This patch (of 3):

The sole user of vmf_insert_mixed_prot(), the drm ttm module, stopped
using this in commit f91142c62161 ("drm/ttm: nuke VM_MIXEDMAP on BO
mappings v3") citing use of VM_MIXEDMAP in this case being terribly
broken.

Remove this now-dead code and references to it, but retain the useful
description of the prot != vma->vm_page_prot case, moving it to
vmf_insert_pfn_prot() instead.

Link: https://lkml.kernel.org/r/cover.1678661628.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/a069644388e6f1593a7020d15840e6fc9f39bcaf.1678661628.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: "Russell King (Oracle)" <linux@armlinux.org.uk>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       |  2 --
 include/linux/mm_types.h |  7 +-----
 mm/memory.c              | 57 +++++++++++++++---------------------------------
 3 files changed, 19 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c3abad7e6ccb..55d02356146c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3179,8 +3179,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			pfn_t pfn);
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
 		unsigned long addr, pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0722859c3647..651a5c256732 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -485,12 +485,7 @@ struct vm_area_struct {
 					   within vm_mm. */
 
 	struct mm_struct *vm_mm;	/* The address space we belong to. */
-
-	/*
-	 * Access permissions of this VMA.
-	 * See vmf_insert_mixed_prot() for discussion.
-	 */
-	pgprot_t vm_page_prot;
+	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
 
 	/*
 	 * Flags, see mm.h.
diff --git a/mm/memory.c b/mm/memory.c
index 0ee13d3de88c..7716f2cc4b68 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2165,8 +2165,20 @@ out_unlock:
  * vmf_insert_pfn_prot should only be used if using multiple VMAs is
  * impractical.
  *
- * See vmf_insert_mixed_prot() for a discussion of the implication of using
- * a value of @pgprot different from that of @vma->vm_page_prot.
+ * pgprot typically only differs from @vma->vm_page_prot when drivers set
+ * caching- and encryption bits different than those of @vma->vm_page_prot,
+ * because the caching- or encryption mode may not be known at mmap() time.
+ *
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ *
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
  *
  * Context: Process context.  May allocate using %GFP_KERNEL.
  * Return: vm_fault_t value.
@@ -2241,9 +2253,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
 }
 
 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn, pgprot_t pgprot,
-		bool mkwrite)
+		unsigned long addr, pfn_t pfn, bool mkwrite)
 {
+	pgprot_t pgprot = vma->vm_page_prot;
 	int err;
 
 	BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -2286,43 +2298,10 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 	return VM_FAULT_NOPAGE;
 }
 
-/**
- * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- * @pgprot: pgprot flags for the inserted page
- *
- * This is exactly like vmf_insert_mixed(), except that it allows drivers
- * to override pgprot on a per-page basis.
- *
- * Typically this function should be used by drivers to set caching- and
- * encryption bits different than those of @vma->vm_page_prot, because
- * the caching- or encryption mode may not be known at mmap() time.
- * This is ok as long as @vma->vm_page_prot is not used by the core vm
- * to set caching and encryption bits for those vmas (except for COW pages).
- * This is ensured by core vm only modifying these page table entries using
- * functions that don't touch caching- or encryption bits, using pte_modify()
- * if needed. (See for example mprotect()).
- * Also when new page-table entries are created, this is only done using the
- * fault() callback, and never using the value of vma->vm_page_prot,
- * except for page-table entries that point to anonymous pages as the result
- * of COW.
- *
- * Context: Process context.  May allocate using %GFP_KERNEL.
- * Return: vm_fault_t value.
- */
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
-				 pfn_t pfn, pgprot_t pgprot)
-{
-	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed_prot);
-
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		pfn_t pfn)
 {
-	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+	return __vm_insert_mixed(vma, addr, pfn, false);
 }
 EXPORT_SYMBOL(vmf_insert_mixed);
 
@@ -2334,7 +2313,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
 		unsigned long addr, pfn_t pfn)
 {
-	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+	return __vm_insert_mixed(vma, addr, pfn, true);
 }
 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
 
-- 
cgit v1.2.3


From 7b806d229ef151c2997c041b791dfa89593e0e1b Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Sun, 12 Mar 2023 23:40:14 +0000
Subject: mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This functionality's sole user, the drm ttm module, removed support for it
in commit 0d979509539e ("drm/ttm: remove ttm_bo_vm_insert_huge()") as the
whole approach is currently unworkable without a PMD/PUD special bit and
updates to GUP.

Link: https://lkml.kernel.org/r/604c2ad79659d4b8a6e3e1611c6219d5d3233988.1678661628.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: "Russell King (Oracle)" <linux@armlinux.org.uk>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 39 ++-------------------------------------
 mm/huge_memory.c        | 31 +++++++++++++------------------
 2 files changed, 15 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9a3a3af2dd80..20284387b841 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,44 +39,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
 		    unsigned long cp_flags);
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
-				   pgprot_t pgprot, bool write);
 
-/**
- * vmf_insert_pfn_pmd - insert a pmd size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
-					    bool write)
-{
-	return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
-				   pgprot_t pgprot, bool write);
-
-/**
- * vmf_insert_pfn_pud - insert a pud size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
-					    bool write)
-{
-	return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1df386d13d24..81a5689806af 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -885,23 +885,20 @@ out_unlock:
 }
 
 /**
- * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * vmf_insert_pfn_pmd - insert a pmd size pfn
  * @vmf: Structure describing the fault
  * @pfn: pfn to insert
- * @pgprot: page protection to use
  * @write: whether it's a write fault
  *
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
  *
  * Return: vm_fault_t value.
  */
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
-				   pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 {
 	unsigned long addr = vmf->address & PMD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
+	pgprot_t pgprot = vma->vm_page_prot;
 	pgtable_t pgtable = NULL;
 
 	/*
@@ -929,7 +926,7 @@ vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
 	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
 	return VM_FAULT_NOPAGE;
 }
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -940,9 +937,10 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 }
 
 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+		pud_t *pud, pfn_t pfn, bool write)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	pgprot_t prot = vma->vm_page_prot;
 	pud_t entry;
 	spinlock_t *ptl;
 
@@ -976,23 +974,20 @@ out_unlock:
 }
 
 /**
- * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * vmf_insert_pfn_pud - insert a pud size pfn
  * @vmf: Structure describing the fault
  * @pfn: pfn to insert
- * @pgprot: page protection to use
  * @write: whether it's a write fault
  *
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
  *
  * Return: vm_fault_t value.
  */
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
-				   pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 {
 	unsigned long addr = vmf->address & PUD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
+	pgprot_t pgprot = vma->vm_page_prot;
 
 	/*
 	 * If we had pud_special, we could avoid all these restrictions,
@@ -1010,10 +1005,10 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
 
 	track_pfn_insert(vma, &pgprot, pfn);
 
-	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
+	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
 	return VM_FAULT_NOPAGE;
 }
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
-- 
cgit v1.2.3


From 3f6dac0fd1b83178137e7b4e722d8f29612cbec1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 21 Mar 2023 03:24:15 +0300
Subject: mm/page_alloc: make deferred page init free pages in MAX_ORDER blocks

Normal page init path frees pages during the boot in MAX_ORDER chunks, but
deferred page init path does it in pageblock blocks.

Change deferred page init path to work in MAX_ORDER blocks.

For cases when MAX_ORDER is larger than pageblock, set migrate type to
MIGRATE_MOVABLE for all pageblocks covered by the page.

Link: https://lkml.kernel.org/r/20230321002415.20843-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  2 ++
 mm/mm_init.c           | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2d3d78d01283..2d22e47dc1eb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -32,6 +32,8 @@
 #endif
 #define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
 
+#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
+
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
  * costly to service.  That is between allocation orders which should
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 53fb8e9d1e3b..c88fbe74137c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1951,9 +1951,10 @@ static void __init deferred_free_range(unsigned long pfn,
 	page = pfn_to_page(pfn);
 
 	/* Free a large naturally-aligned chunk if possible */
-	if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
-		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_core(page, pageblock_order);
+	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
+		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
+			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+		__free_pages_core(page, MAX_ORDER);
 		return;
 	}
 
@@ -1977,19 +1978,19 @@ static inline void __init pgdat_init_report_one_done(void)
 /*
  * Returns true if page needs to be initialized or freed to buddy allocator.
  *
- * We check if a current large page is valid by only checking the validity
+ * We check if a current MAX_ORDER block is valid by only checking the validity
  * of the head pfn.
  */
 static inline bool __init deferred_pfn_valid(unsigned long pfn)
 {
-	if (pageblock_aligned(pfn) && !pfn_valid(pfn))
+	if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
 		return false;
 	return true;
 }
 
 /*
  * Free pages to buddy allocator. Try to free aligned pages in
- * pageblock_nr_pages sizes.
+ * MAX_ORDER_NR_PAGES sizes.
  */
 static void __init deferred_free_pages(unsigned long pfn,
 				       unsigned long end_pfn)
@@ -2000,7 +2001,7 @@ static void __init deferred_free_pages(unsigned long pfn,
 		if (!deferred_pfn_valid(pfn)) {
 			deferred_free_range(pfn - nr_free, nr_free);
 			nr_free = 0;
-		} else if (pageblock_aligned(pfn)) {
+		} else if (IS_MAX_ORDER_ALIGNED(pfn)) {
 			deferred_free_range(pfn - nr_free, nr_free);
 			nr_free = 1;
 		} else {
@@ -2013,7 +2014,7 @@ static void __init deferred_free_pages(unsigned long pfn,
 
 /*
  * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
- * by performing it only once every pageblock_nr_pages.
+ * by performing it only once every MAX_ORDER_NR_PAGES.
  * Return number of pages initialized.
  */
 static unsigned long  __init deferred_init_pages(struct zone *zone,
@@ -2029,7 +2030,7 @@ static unsigned long  __init deferred_init_pages(struct zone *zone,
 		if (!deferred_pfn_valid(pfn)) {
 			page = NULL;
 			continue;
-		} else if (!page || pageblock_aligned(pfn)) {
+		} else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
 			page = pfn_to_page(pfn);
 		} else {
 			page++;
-- 
cgit v1.2.3


From 4f80818b4a58c9458dce0df7cce9abe107da445e Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 22 Mar 2023 18:57:03 +0000
Subject: iov_iter: add copy_page_to_iter_nofault()

Provide a means to copy a page to user space from an iterator, aborting if
a page fault would occur.  This supports compound pages, but may be passed
a tail page with an offset extending further into the compound page, so we
cannot pass a folio.

This allows for this function to be called from atomic context and _try_
to user pages if they are faulted in, aborting if not.

The function does not use _copy_to_iter() in order to not specify
might_fault(), this is similar to copy_page_from_iter_atomic().

This is being added in order that an iteratable form of vread() can be
implemented while holding spinlocks.

Link: https://lkml.kernel.org/r/19734729defb0f498a76bdec1bef3ac48a3af3e8.1679511146.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/uio.h |  2 ++
 lib/iov_iter.c      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27e3fd942960..29eb18bb6feb 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -173,6 +173,8 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
 {
 	return copy_page_to_iter(&folio->page, offset, bytes, i);
 }
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
+				 size_t bytes, struct iov_iter *i);
 
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 274014e4eafe..34dd6bdf2fba 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -172,6 +172,18 @@ static int copyout(void __user *to, const void *from, size_t n)
 	return n;
 }
 
+static int copyout_nofault(void __user *to, const void *from, size_t n)
+{
+	long res;
+
+	if (should_fail_usercopy())
+		return n;
+
+	res = copy_to_user_nofault(to, from, n);
+
+	return res < 0 ? n : res;
+}
+
 static int copyin(void *to, const void __user *from, size_t n)
 {
 	size_t res = n;
@@ -734,6 +746,42 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_to_iter);
 
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
+				 struct iov_iter *i)
+{
+	size_t res = 0;
+
+	if (!page_copy_sane(page, offset, bytes))
+		return 0;
+	if (WARN_ON_ONCE(i->data_source))
+		return 0;
+	if (unlikely(iov_iter_is_pipe(i)))
+		return copy_page_to_iter_pipe(page, offset, bytes, i);
+	page += offset / PAGE_SIZE; // first subpage
+	offset %= PAGE_SIZE;
+	while (1) {
+		void *kaddr = kmap_local_page(page);
+		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
+
+		iterate_and_advance(i, n, base, len, off,
+			copyout_nofault(base, kaddr + offset + off, len),
+			memcpy(base, kaddr + offset + off, len)
+		)
+		kunmap_local(kaddr);
+		res += n;
+		bytes -= n;
+		if (!bytes || !n)
+			break;
+		offset += n;
+		if (offset == PAGE_SIZE) {
+			page++;
+			offset = 0;
+		}
+	}
+	return res;
+}
+EXPORT_SYMBOL(copy_page_to_iter_nofault);
+
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
-- 
cgit v1.2.3


From 4c91c07c93bbbdd7f2d9de2beb7ee5c2a48ad8e7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 22 Mar 2023 18:57:04 +0000
Subject: mm: vmalloc: convert vread() to vread_iter()

Having previously laid the foundation for converting vread() to an
iterator function, pull the trigger and do so.

This patch attempts to provide minimal refactoring and to reflect the
existing logic as best we can, for example we continue to zero portions of
memory not read, as before.

Overall, there should be no functional difference other than a performance
improvement in /proc/kcore access to vmalloc regions.

Now we have eliminated the need for a bounce buffer in read_kcore_iter(),
we dispense with it, and try to write to user memory optimistically but
with faults disabled via copy_page_to_iter_nofault().  We already have
preemption disabled by holding a spin lock.  We continue faulting in until
the operation is complete.

Additionally, we must account for the fact that at any point a copy may
fail (most likely due to a fault not being able to occur), we exit
indicating fewer bytes retrieved than expected.

[sfr@canb.auug.org.au: fix sparc64 warning]
  Link: https://lkml.kernel.org/r/20230320144721.663280c3@canb.auug.org.au
[lstoakes@gmail.com: redo Stephen's sparc build fix]
  Link: https://lkml.kernel.org/r/8506cbc667c39205e65a323f750ff9c11a463798.1679566220.git.lstoakes@gmail.com
[akpm@linux-foundation.org: unbreak uio.h includes]
Link: https://lkml.kernel.org/r/941f88bc5ab928e6656e1e2593b91bf0f8c81e1b.1679511146.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/kcore.c         |  44 ++++-----
 include/linux/vmalloc.h |   3 +-
 mm/nommu.c              |  10 +--
 mm/vmalloc.c            | 235 ++++++++++++++++++++++++++++++------------------
 4 files changed, 177 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 08b795fd80b4..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -307,13 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
 	*i = ALIGN(*i + descsz, 4);
 }
 
-static ssize_t
-read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
-	struct file *file = iocb->ki_filp;
-	char *buf = file->private_data;
 	loff_t *fpos = &iocb->ki_pos;
-
 	size_t phdrs_offset, notes_offset, data_offset;
 	size_t page_offline_frozen = 1;
 	size_t phdrs_len, notes_len;
@@ -507,13 +503,30 @@ read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 
 		switch (m->type) {
 		case KCORE_VMALLOC:
-			vread(buf, (char *)start, tsz);
-			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_iter(buf, tsz, iter) != tsz) {
-				ret = -EFAULT;
-				goto out;
+		{
+			const char *src = (char *)start;
+			size_t read = 0, left = tsz;
+
+			/*
+			 * vmalloc uses spinlocks, so we optimistically try to
+			 * read memory. If this fails, fault pages in and try
+			 * again until we are done.
+			 */
+			while (true) {
+				read += vread_iter(iter, src, left);
+				if (read == tsz)
+					break;
+
+				src += read;
+				left -= read;
+
+				if (fault_in_iov_iter_writeable(iter, left)) {
+					ret = -EFAULT;
+					goto out;
+				}
 			}
 			break;
+		}
 		case KCORE_USER:
 			/* User page is handled prior to normal kernel page: */
 			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
@@ -582,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
-	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!filp->private_data)
-		return -ENOMEM;
-
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
@@ -596,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int release_kcore(struct inode *inode, struct file *file)
-{
-	kfree(file->private_data);
-	return 0;
-}
-
 static const struct proc_ops kcore_proc_ops = {
 	.proc_read_iter	= read_kcore_iter,
 	.proc_open	= open_kcore,
-	.proc_release	= release_kcore,
 	.proc_lseek	= default_llseek,
 };
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 351fc7697214..c720be70c8dd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -14,6 +14,7 @@
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 struct notifier_block;		/* in notifier.h */
+struct iov_iter;		/* in uio.h */
 
 /* bits in flags of vmalloc's vm_struct below */
 #define VM_IOREMAP		0x00000001	/* ioremap() and friends */
@@ -247,7 +248,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 #endif
 
 /* for /proc/kcore */
-extern long vread(char *buf, char *addr, unsigned long count);
+extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
 
 /*
  *	Internals.  Don't use..
diff --git a/mm/nommu.c b/mm/nommu.c
index 57ba243c6a37..f670d9979a26 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,6 +36,7 @@
 #include <linux/printk.h>
 
 #include <linux/uaccess.h>
+#include <linux/uio.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -198,14 +199,13 @@ unsigned long vmalloc_to_pfn(const void *addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 {
 	/* Don't allow overflow */
-	if ((unsigned long) buf + count < count)
-		count = -(unsigned long) buf;
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
 
-	memcpy(buf, addr, count);
-	return count;
+	return copy_to_iter(addr, count, iter);
 }
 
 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 978194dc2bb8..5291c6f02cf7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -33,11 +33,11 @@
 #include <linux/compiler.h>
 #include <linux/memcontrol.h>
 #include <linux/llist.h>
+#include <linux/uio.h>
 #include <linux/bitops.h>
 #include <linux/rbtree_augmented.h>
 #include <linux/overflow.h>
 #include <linux/pgtable.h>
-#include <linux/uaccess.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/mm.h>
 #include <asm/tlbflush.h>
@@ -3442,62 +3442,96 @@ void *vmalloc_32_user(unsigned long size)
 EXPORT_SYMBOL(vmalloc_32_user);
 
 /*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
  */
+static size_t zero_iter(struct iov_iter *iter, size_t count)
+{
+	size_t remains = count;
+
+	while (remains > 0) {
+		size_t num, copied;
+
+		num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+		copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+		remains -= copied;
+
+		if (copied < num)
+			break;
+	}
 
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+	return count - remains;
+}
+
+/*
+ * small helper routine, copy contents to iter from addr.
+ * If the page is not present, fill zero.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+				 const char *addr, size_t count)
 {
-	struct page *p;
-	int copied = 0;
+	size_t remains = count;
+	struct page *page;
 
-	while (count) {
+	while (remains > 0) {
 		unsigned long offset, length;
+		size_t copied = 0;
 
 		offset = offset_in_page(addr);
 		length = PAGE_SIZE - offset;
-		if (length > count)
-			length = count;
-		p = vmalloc_to_page(addr);
+		if (length > remains)
+			length = remains;
+		page = vmalloc_to_page(addr);
 		/*
-		 * To do safe access to this _mapped_ area, we need
-		 * lock. But adding lock here means that we need to add
-		 * overhead of vmalloc()/vfree() calls for this _debug_
-		 * interface, rarely used. Instead of that, we'll use
-		 * kmap() and get small overhead in this access function.
+		 * To do safe access to this _mapped_ area, we need lock. But
+		 * adding lock here means that we need to add overhead of
+		 * vmalloc()/vfree() calls for this _debug_ interface, rarely
+		 * used. Instead of that, we'll use an local mapping via
+		 * copy_page_to_iter_nofault() and accept a small overhead in
+		 * this access function.
 		 */
-		if (p) {
-			/* We can expect USER0 is not used -- see vread() */
-			void *map = kmap_atomic(p);
-			memcpy(buf, map + offset, length);
-			kunmap_atomic(map);
-		} else
-			memset(buf, 0, length);
+		if (page)
+			copied = copy_page_to_iter_nofault(page, offset,
+							   length, iter);
+		else
+			copied = zero_iter(iter, length);
 
-		addr += length;
-		buf += length;
-		copied += length;
-		count -= length;
+		addr += copied;
+		remains -= copied;
+
+		if (copied != length)
+			break;
 	}
-	return copied;
+
+	return count - remains;
 }
 
-static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+/*
+ * Read from a vm_map_ram region of memory.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+				  size_t count, unsigned long flags)
 {
 	char *start;
 	struct vmap_block *vb;
 	unsigned long offset;
-	unsigned int rs, re, n;
+	unsigned int rs, re;
+	size_t remains, n;
 
 	/*
 	 * If it's area created by vm_map_ram() interface directly, but
 	 * not further subdividing and delegating management to vmap_block,
 	 * handle it here.
 	 */
-	if (!(flags & VMAP_BLOCK)) {
-		aligned_vread(buf, addr, count);
-		return;
-	}
+	if (!(flags & VMAP_BLOCK))
+		return aligned_vread_iter(iter, addr, count);
+
+	remains = count;
 
 	/*
 	 * Area is split into regions and tracked with vmap_block, read out
@@ -3505,50 +3539,64 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
 	 */
 	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
 	if (!vb)
-		goto finished;
+		goto finished_zero;
 
 	spin_lock(&vb->lock);
 	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
 		spin_unlock(&vb->lock);
-		goto finished;
+		goto finished_zero;
 	}
+
 	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
-		if (!count)
-			break;
+		size_t copied;
+
+		if (remains == 0)
+			goto finished;
+
 		start = vmap_block_vaddr(vb->va->va_start, rs);
-		while (addr < start) {
-			if (count == 0)
-				goto unlock;
-			*buf = '\0';
-			buf++;
-			addr++;
-			count--;
+
+		if (addr < start) {
+			size_t to_zero = min_t(size_t, start - addr, remains);
+			size_t zeroed = zero_iter(iter, to_zero);
+
+			addr += zeroed;
+			remains -= zeroed;
+
+			if (remains == 0 || zeroed != to_zero)
+				goto finished;
 		}
+
 		/*it could start reading from the middle of used region*/
 		offset = offset_in_page(addr);
 		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
-		if (n > count)
-			n = count;
-		aligned_vread(buf, start+offset, n);
+		if (n > remains)
+			n = remains;
+
+		copied = aligned_vread_iter(iter, start + offset, n);
 
-		buf += n;
-		addr += n;
-		count -= n;
+		addr += copied;
+		remains -= copied;
+
+		if (copied != n)
+			goto finished;
 	}
-unlock:
+
 	spin_unlock(&vb->lock);
 
-finished:
+finished_zero:
 	/* zero-fill the left dirty or free regions */
-	if (count)
-		memset(buf, 0, count);
+	return count - remains + zero_iter(iter, remains);
+finished:
+	/* We couldn't copy/zero everything */
+	spin_unlock(&vb->lock);
+	return count - remains;
 }
 
 /**
- * vread() - read vmalloc area in a safe way.
- * @buf:     buffer for reading data
- * @addr:    vm address.
- * @count:   number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter:         the iterator to which data should be written.
+ * @addr:         vm address.
+ * @count:        number of bytes to be read.
  *
  * This function checks that addr is a valid vmalloc'ed area, and
  * copy data from that area to a given buffer. If the given memory range
@@ -3568,13 +3616,12 @@ finished:
  * (same number as @count) or %0 if [addr...addr+count) doesn't
  * include any intersection with valid vmalloc area
  */
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 {
 	struct vmap_area *va;
 	struct vm_struct *vm;
-	char *vaddr, *buf_start = buf;
-	unsigned long buflen = count;
-	unsigned long n, size, flags;
+	char *vaddr;
+	size_t n, size, flags, remains;
 
 	addr = kasan_reset_tag(addr);
 
@@ -3582,18 +3629,22 @@ long vread(char *buf, char *addr, unsigned long count)
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 
+	remains = count;
+
 	spin_lock(&vmap_area_lock);
 	va = find_vmap_area_exceed_addr((unsigned long)addr);
 	if (!va)
-		goto finished;
+		goto finished_zero;
 
 	/* no intersects with alive vmap_area */
-	if ((unsigned long)addr + count <= va->va_start)
-		goto finished;
+	if ((unsigned long)addr + remains <= va->va_start)
+		goto finished_zero;
 
 	list_for_each_entry_from(va, &vmap_area_list, list) {
-		if (!count)
-			break;
+		size_t copied;
+
+		if (remains == 0)
+			goto finished;
 
 		vm = va->vm;
 		flags = va->flags & VMAP_FLAGS_MASK;
@@ -3608,6 +3659,7 @@ long vread(char *buf, char *addr, unsigned long count)
 
 		if (vm && (vm->flags & VM_UNINITIALIZED))
 			continue;
+
 		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
 		smp_rmb();
 
@@ -3616,38 +3668,45 @@ long vread(char *buf, char *addr, unsigned long count)
 
 		if (addr >= vaddr + size)
 			continue;
-		while (addr < vaddr) {
-			if (count == 0)
+
+		if (addr < vaddr) {
+			size_t to_zero = min_t(size_t, vaddr - addr, remains);
+			size_t zeroed = zero_iter(iter, to_zero);
+
+			addr += zeroed;
+			remains -= zeroed;
+
+			if (remains == 0 || zeroed != to_zero)
 				goto finished;
-			*buf = '\0';
-			buf++;
-			addr++;
-			count--;
 		}
+
 		n = vaddr + size - addr;
-		if (n > count)
-			n = count;
+		if (n > remains)
+			n = remains;
 
 		if (flags & VMAP_RAM)
-			vmap_ram_vread(buf, addr, n, flags);
+			copied = vmap_ram_vread_iter(iter, addr, n, flags);
 		else if (!(vm->flags & VM_IOREMAP))
-			aligned_vread(buf, addr, n);
+			copied = aligned_vread_iter(iter, addr, n);
 		else /* IOREMAP area is treated as memory hole */
-			memset(buf, 0, n);
-		buf += n;
-		addr += n;
-		count -= n;
+			copied = zero_iter(iter, n);
+
+		addr += copied;
+		remains -= copied;
+
+		if (copied != n)
+			goto finished;
 	}
-finished:
-	spin_unlock(&vmap_area_lock);
 
-	if (buf == buf_start)
-		return 0;
+finished_zero:
+	spin_unlock(&vmap_area_lock);
 	/* zero-fill memory holes */
-	if (buf != buf_start + buflen)
-		memset(buf, 0, buflen - (buf - buf_start));
+	return count - remains + zero_iter(iter, remains);
+finished:
+	/* Nothing remains, or We couldn't copy/zero everything. */
+	spin_unlock(&vmap_area_lock);
 
-	return buflen;
+	return count - remains;
 }
 
 /**
-- 
cgit v1.2.3


From c710fac6bfc8c2bcfedbaba27e31524c98eb6188 Mon Sep 17 00:00:00 2001
From: Wenchao Hao <haowenchao2@huawei.com>
Date: Thu, 23 Mar 2023 19:41:36 +0800
Subject: trace: cma: remove unnecessary event class cma_alloc_class

After commit cb6c33d4dc09 ("cma: tracing: print alloc result in
trace_cma_alloc_finish"), cma_alloc_class has only one event which is
cma_alloc_busy_retry.  So we can remove the cma_alloc_class.

Link: https://lkml.kernel.org/r/20230323114136.177677-1-haowenchao2@huawei.com
Signed-off-by: Wenchao Hao <haowenchao2@huawei.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Feilong Lin <linfeilong@huawei.com>
Cc: Hongxiang Lou <louhongxiang@huawei.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/cma.h | 58 ++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index ef75ea606ab2..25103e67737c 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -8,37 +8,6 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
-DECLARE_EVENT_CLASS(cma_alloc_class,
-
-	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
-		 unsigned long count, unsigned int align),
-
-	TP_ARGS(name, pfn, page, count, align),
-
-	TP_STRUCT__entry(
-		__string(name, name)
-		__field(unsigned long, pfn)
-		__field(const struct page *, page)
-		__field(unsigned long, count)
-		__field(unsigned int, align)
-	),
-
-	TP_fast_assign(
-		__assign_str(name, name);
-		__entry->pfn = pfn;
-		__entry->page = page;
-		__entry->count = count;
-		__entry->align = align;
-	),
-
-	TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u",
-		  __get_str(name),
-		  __entry->pfn,
-		  __entry->page,
-		  __entry->count,
-		  __entry->align)
-);
-
 TRACE_EVENT(cma_release,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
@@ -125,12 +94,35 @@ TRACE_EVENT(cma_alloc_finish,
 		  __entry->errorno)
 );
 
-DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
+TRACE_EVENT(cma_alloc_busy_retry,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
 		 unsigned long count, unsigned int align),
 
-	TP_ARGS(name, pfn, page, count, align)
+	TP_ARGS(name, pfn, page, count, align),
+
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(unsigned long, pfn)
+		__field(const struct page *, page)
+		__field(unsigned long, count)
+		__field(unsigned int, align)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->pfn = pfn;
+		__entry->page = page;
+		__entry->count = count;
+		__entry->align = align;
+	),
+
+	TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u",
+		  __get_str(name),
+		  __entry->pfn,
+		  __entry->page,
+		  __entry->count,
+		  __entry->align)
 );
 
 #endif /* _TRACE_CMA_H */
-- 
cgit v1.2.3


From 20cce633f4254cc0df39665449726e3172518f6c Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <michel@lespinasse.org>
Date: Mon, 27 Feb 2023 09:36:09 -0800
Subject: mm: rcu safe VMA freeing

This prepares for page faults handling under VMA lock, looking up VMAs
under protection of an rcu read lock, instead of the usual mmap read lock.

Link: https://lkml.kernel.org/r/20230227173632.3292573-11-surenb@google.com
Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 13 ++++++++++---
 kernel/fork.c            | 20 +++++++++++++++++++-
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 651a5c256732..f203e2562e00 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -480,9 +480,16 @@ struct anon_vma_name {
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
 
-	unsigned long vm_start;		/* Our start address within vm_mm. */
-	unsigned long vm_end;		/* The first byte after our end address
-					   within vm_mm. */
+	union {
+		struct {
+			/* VMA covers [vm_start; vm_end) addresses within mm */
+			unsigned long vm_start;
+			unsigned long vm_end;
+		};
+#ifdef CONFIG_PER_VMA_LOCK
+		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
+#endif
+	};
 
 	struct mm_struct *vm_mm;	/* The address space we belong to. */
 	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
diff --git a/kernel/fork.c b/kernel/fork.c
index cea99f003f24..93ec6e14bb65 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,12 +479,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
-void vm_area_free(struct vm_area_struct *vma)
+static void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+						  vm_rcu);
+	__vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+	__vm_area_free(vma);
+#endif
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-- 
cgit v1.2.3


From 438b6e12cd603a9416ca7c5462ab75b2a8c4d5b9 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:10 -0800
Subject: mm: move mmap_lock assert function definitions

Move mmap_lock assert function definitions up so that they can be used by
other mmap_lock routines.

Link: https://lkml.kernel.org/r/20230227173632.3292573-12-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmap_lock.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..e49ba91bb1f0 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,18 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
 
 #endif /* CONFIG_TRACING */
 
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held_write(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -150,18 +162,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 	up_read_non_owner(&mm->mmap_lock);
 }
 
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held_write(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
 static inline int mmap_lock_is_contended(struct mm_struct *mm)
 {
 	return rwsem_is_contended(&mm->mmap_lock);
-- 
cgit v1.2.3


From 5e31275cc997f8ec5d9e8d65fe9840ebed89db19 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:11 -0800
Subject: mm: add per-VMA lock and helper functions to control it

Introduce per-VMA locking.  The lock implementation relies on a per-vma
and per-mm sequence counters to note exclusive locking:

  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done before exclusive mmap_lock is released by incrementing mm
    sequence counter (mm_lock_seq).
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectivelly same as write
    unlock).

Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h        | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 +++++
 include/linux/mmap_lock.h | 13 ++++++++
 kernel/fork.c             |  4 +++
 mm/init-mm.c              |  3 ++
 5 files changed, 110 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 55d02356146c..53bfb42663ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -624,6 +624,87 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->lock);
+	rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -632,6 +713,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f203e2562e00..843a45893991 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -503,6 +503,11 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -639,6 +644,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 93ec6e14bb65..0907776d8ed4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1208,6 +1209,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
cgit v1.2.3


From c732293331a22187d59cd485879276a1da398387 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:12 -0800
Subject: mm: mark VMA as being written when changing vm_flags

Updates to vm_flags have to be done with VMA marked as being written for
preventing concurrent page faults or other modifications.

Link: https://lkml.kernel.org/r/20230227173632.3292573-14-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 53bfb42663ea..d0a6c99aba09 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -727,28 +727,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 static inline void vm_flags_reset(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	vm_flags_init(vma, flags);
 }
 
 static inline void vm_flags_reset_once(struct vm_area_struct *vma,
 				       vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
 }
 
 static inline void vm_flags_set(struct vm_area_struct *vma,
 				vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) |= flags;
 }
 
 static inline void vm_flags_clear(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
 }
 
@@ -769,7 +769,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
 static inline void vm_flags_mod(struct vm_area_struct *vma,
 				vm_flags_t set, vm_flags_t clear)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	__vm_flags_mod(vma, set, clear);
 }
 
-- 
cgit v1.2.3


From 55fd6fccad3172c0feaaa817f0a1283629ff183e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:14 -0800
Subject: mm/khugepaged: write-lock VMA while collapsing a huge page

Protect VMA from concurrent page fault handler while collapsing a huge
page.  Page fault handler needs a stable PMD to use PTL and relies on
per-VMA lock to prevent concurrent PMD changes.  pmdp_collapse_flush(),
set_huge_pmd() and collapse_and_free_pmd() can modify a PMD, which will
not be detected by a page fault handler without proper locking.

Before this patch, page tables can be walked under any one of the
mmap_lock, the mapping lock, and the anon_vma lock; so when khugepaged
unlinks and frees page tables, it must ensure that all of those either are
locked or don't exist.  This patch adds a fourth lock under which page
tables can be traversed, and so khugepaged must also lock out that one.

[surenb@google.com: vm_lock/i_mmap_rwsem inversion in retract_page_tables]
  Link: https://lkml.kernel.org/r/20230303213250.3555716-1-surenb@google.com
[surenb@google.com: build fix]
  Link: https://lkml.kernel.org/r/CAJuCfpFjWhtzRE1X=J+_JjgJzNKhq-=JT8yTBSTHthwp0pqWZw@mail.gmail.com
Link: https://lkml.kernel.org/r/20230227173632.3292573-16-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 41 ++++++++++++++++++++++++++++++-----------
 mm/khugepaged.c    |  8 ++++++++
 mm/rmap.c          | 31 ++++++++++++++++---------------
 3 files changed, 54 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d0a6c99aba09..d6a2abc51e3d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -665,18 +665,23 @@ static inline void vma_end_read(struct vm_area_struct *vma)
 	rcu_read_unlock();
 }
 
-static inline void vma_start_write(struct vm_area_struct *vma)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 {
-	int mm_lock_seq;
-
 	mmap_assert_write_locked(vma->vm_mm);
 
 	/*
 	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 	 * mm->mm_lock_seq can't be concurrently modified.
 	 */
-	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
-	if (vma->vm_lock_seq == mm_lock_seq)
+	*mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
 	down_write(&vma->lock);
@@ -684,14 +689,26 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	up_write(&vma->lock);
 }
 
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return true;
+
+	if (!down_write_trylock(&vma->vm_lock->lock))
+		return false;
+
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+	return true;
+}
+
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-	mmap_assert_write_locked(vma->vm_mm);
-	/*
-	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
-	 * mm->mm_lock_seq can't be concurrently modified.
-	 */
-	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+	int mm_lock_seq;
+
+	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
 #else /* CONFIG_PER_VMA_LOCK */
@@ -701,6 +718,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 		{ return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+		{ return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c7317678cb10..bee7fd7db380 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1056,6 +1056,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 
+	vma_start_write(vma);
 	anon_vma_lock_write(vma->anon_vma);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1517,6 +1518,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	/* Lock the vma before taking i_mmap and page table locks */
+	vma_start_write(vma);
+
 	/*
 	 * We need to lock the mapping so that from here on, only GUP-fast and
 	 * hardware page walks can access the parts of the page tables that
@@ -1694,6 +1698,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 		result = SCAN_PTE_MAPPED_HUGEPAGE;
 		if ((cc->is_khugepaged || is_target) &&
 		    mmap_write_trylock(mm)) {
+			/* trylock for the same lock inversion as above */
+			if (!vma_try_start_write(vma))
+				goto unlock_next;
+
 			/*
 			 * Re-check whether we have an ->anon_vma, because
 			 * collapse_and_free_pmd() requires that either no
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ea2756b2797..ba901c416785 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,21 +25,22 @@
  *     mapping->invalidate_lock (in filemap_fault)
  *       page->flags PG_locked (lock_page)
  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
- *           mapping->i_mmap_rwsem
- *             anon_vma->rwsem
- *               mm->page_table_lock or pte_lock
- *                 swap_lock (in swap_duplicate, swap_info_get)
- *                   mmlist_lock (in mmput, drain_mmlist and others)
- *                   mapping->private_lock (in block_dirty_folio)
- *                     folio_lock_memcg move_lock (in block_dirty_folio)
- *                       i_pages lock (widely used)
- *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
- *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                     sb_lock (within inode_lock in fs/fs-writeback.c)
- *                     i_pages lock (widely used, in set_page_dirty,
- *                               in arch-dependent flush_dcache_mmap_lock,
- *                               within bdi.wb->list_lock in __sync_single_inode)
+ *           vma_start_write
+ *             mapping->i_mmap_rwsem
+ *               anon_vma->rwsem
+ *                 mm->page_table_lock or pte_lock
+ *                   swap_lock (in swap_duplicate, swap_info_get)
+ *                     mmlist_lock (in mmput, drain_mmlist and others)
+ *                     mapping->private_lock (in block_dirty_folio)
+ *                       folio_lock_memcg move_lock (in block_dirty_folio)
+ *                         i_pages lock (widely used)
+ *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
+ *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                       sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                       i_pages lock (widely used, in set_page_dirty,
+ *                                 in arch-dependent flush_dcache_mmap_lock,
+ *                                 within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
-- 
cgit v1.2.3


From 457f67be5910a2b5f1fda8af06bfe4d3492a0a4f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:21 -0800
Subject: mm: introduce vma detached flag

Per-vma locking mechanism will search for VMA under RCU protection and
then after locking it, has to ensure it was not removed from the VMA tree
after we found it.  To make this check efficient, introduce a
vma->detached flag to mark VMAs which were removed from the VMA tree.

Link: https://lkml.kernel.org/r/20230227173632.3292573-23-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       | 11 +++++++++++
 include/linux/mm_types.h |  3 +++
 mm/mmap.c                |  2 ++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6a2abc51e3d..d0f289bfef01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -711,6 +711,14 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+	/* When detaching vma should be write-locked */
+	if (detached)
+		vma_assert_write_locked(vma);
+	vma->detached = detached;
+}
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline void vma_init_lock(struct vm_area_struct *vma) {}
@@ -721,6 +729,8 @@ static inline void vma_start_write(struct vm_area_struct *vma) {}
 static inline bool vma_try_start_write(struct vm_area_struct *vma)
 		{ return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+				     bool detached) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
 
@@ -732,6 +742,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_mark_detached(vma, false);
 	vma_init_lock(vma);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 843a45893991..3248ae45cb2e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -506,6 +506,9 @@ struct vm_area_struct {
 #ifdef CONFIG_PER_VMA_LOCK
 	int vm_lock_seq;
 	struct rw_semaphore lock;
+
+	/* Flag to indicate areas detached from the mm->mm_mt tree */
+	bool detached;
 #endif
 
 	/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 18aed0ea6bd3..b42f58591b9a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -599,6 +599,7 @@ static inline void vma_complete(struct vma_prepare *vp,
 
 	if (vp->remove) {
 again:
+		vma_mark_detached(vp->remove, true);
 		if (vp->file) {
 			uprobe_munmap(vp->remove, vp->remove->vm_start,
 				      vp->remove->vm_end);
@@ -2276,6 +2277,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
 	if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
 		return -ENOMEM;
 
+	vma_mark_detached(vma, true);
 	if (vma->vm_flags & VM_LOCKED)
 		vma->vm_mm->locked_vm -= vma_pages(vma);
 
-- 
cgit v1.2.3


From 50ee32537206140e4cf6e47024be29a84d458d49 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:22 -0800
Subject: mm: introduce lock_vma_under_rcu to be used from arch-specific code

Introduce lock_vma_under_rcu function to lookup and lock a VMA during page
fault handling.  When VMA is not found, can't be locked or changes after
being locked, the function returns NULL.  The lookup is performed under
RCU protection to prevent the found VMA from being destroyed before the
VMA lock is acquired.  VMA lock statistics are updated according to the
results.  For now only anonymous VMAs can be searched this way.  In other
cases the function returns NULL.

Link: https://lkml.kernel.org/r/20230227173632.3292573-24-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d0f289bfef01..0619f9fa7581 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -719,6 +719,9 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
 	vma->detached = detached;
 }
 
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address);
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline void vma_init_lock(struct vm_area_struct *vma) {}
diff --git a/mm/memory.c b/mm/memory.c
index 8ca78ae8bba7..1235de20af73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5232,6 +5232,52 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
 
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address)
+{
+	MA_STATE(mas, &mm->mm_mt, address, address);
+	struct vm_area_struct *vma;
+
+	rcu_read_lock();
+retry:
+	vma = mas_walk(&mas);
+	if (!vma)
+		goto inval;
+
+	/* Only anonymous vmas are supported for now */
+	if (!vma_is_anonymous(vma))
+		goto inval;
+
+	if (!vma_start_read(vma))
+		goto inval;
+
+	/* Check since vm_start/vm_end might change before we lock the VMA */
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		vma_end_read(vma);
+		goto inval;
+	}
+
+	/* Check if the VMA got isolated after we found it */
+	if (vma->detached) {
+		vma_end_read(vma);
+		/* The area was replaced with another one */
+		goto retry;
+	}
+
+	rcu_read_unlock();
+	return vma;
+inval:
+	rcu_read_unlock();
+	return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
  * Allocate p4d page table.
-- 
cgit v1.2.3


From 55324e46eb8b886ecd08c9a089d3a694c705b3b0 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:24 -0800
Subject: mm: add FAULT_FLAG_VMA_LOCK flag

Add a new flag to distinguish page faults handled under protection of
per-vma lock.

[surenb@google.com: document FAULT_FLAG_VMA_LOCK flag]
  Link: https://lkml.kernel.org/r/20230301022720.1380780-2-surenb@google.com
  Link: https://lore.kernel.org/all/20230301113648.7c279865@canb.auug.org.au/
Link: https://lkml.kernel.org/r/20230227173632.3292573-26-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Laurent Dufour <laurent.dufour@fr.ibm.com>
Cc: Dan Carpenter <error27@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       | 3 ++-
 include/linux/mm_types.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0619f9fa7581..1876825d6700 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -479,7 +479,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
+	{ FAULT_FLAG_VMA_LOCK,		"VMA_LOCK" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3248ae45cb2e..3a6cf4c2edd0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1049,6 +1049,7 @@ typedef struct {
  *                      mapped after the fault.
  * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
  *                        We should only access orig_pte if this flag set.
+ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -1086,6 +1087,7 @@ enum fault_flag {
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
 	FAULT_FLAG_UNSHARE =		1 << 10,
 	FAULT_FLAG_ORIG_PTE_VALID =	1 << 11,
+	FAULT_FLAG_VMA_LOCK =		1 << 12,
 };
 
 typedef unsigned int __bitwise zap_flags_t;
-- 
cgit v1.2.3


From 52f238653e452e0fda61e880f263a173d219acd1 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:27 -0800
Subject: mm: introduce per-VMA lock statistics

Add a new CONFIG_PER_VMA_LOCK_STATS config option to dump extra statistics
about handling page fault under VMA lock.

Link: https://lkml.kernel.org/r/20230227173632.3292573-29-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 6 ++++++
 include/linux/vmstat.h        | 6 ++++++
 mm/Kconfig.debug              | 6 ++++++
 mm/memory.c                   | 2 ++
 mm/vmstat.c                   | 6 ++++++
 5 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
 		DIRECT_MAP_LEVEL3_SPLIT,
+#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+		VMA_LOCK_SUCCESS,
+		VMA_LOCK_ABORT,
+		VMA_LOCK_RETRY,
+		VMA_LOCK_MISS,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
 #endif
 
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
 #define __count_zid_vm_events(item, zid, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
 
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c3547a373c9c..4965a7333a3f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
 
 	  If unsure, say Y.
 
+config PER_VMA_LOCK_STATS
+	bool "Statistics for per-vma locks"
+	depends on PER_VMA_LOCK
+	default y
+	help
+	  Statistics for per-vma locks.
diff --git a/mm/memory.c b/mm/memory.c
index 55ac9cdfd398..9999574a9636 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5284,6 +5284,7 @@ retry:
 	/* Check if the VMA got isolated after we found it */
 	if (vma->detached) {
 		vma_end_read(vma);
+		count_vm_vma_lock_event(VMA_LOCK_MISS);
 		/* The area was replaced with another one */
 		goto retry;
 	}
@@ -5292,6 +5293,7 @@ retry:
 	return vma;
 inval:
 	rcu_read_unlock();
+	count_vm_vma_lock_event(VMA_LOCK_ABORT);
 	return NULL;
 }
 #endif /* CONFIG_PER_VMA_LOCK */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b7307627772d..c28046371b45 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = {
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
 #endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+	"vma_lock_success",
+	"vma_lock_abort",
+	"vma_lock_retry",
+	"vma_lock_miss",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
cgit v1.2.3


From 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:31 -0800
Subject: mm/mmap: free vm_area_struct without call_rcu in exit_mmap

call_rcu() can take a long time when callback offloading is enabled.  Its
use in the vm_area_free can cause regressions in the exit path when
multiple VMAs are being freed.

Because exit_mmap() is called only after the last mm user drops its
refcount, the page fault handlers can't be racing with it.  Any other
possible user like oom-reaper or process_mrelease are already synchronized
using mmap_lock.  Therefore exit_mmap() can free VMAs directly, without
the use of call_rcu().

Expose __vm_area_free() and use it from exit_mmap() to avoid possible
call_rcu() floods and performance regressions caused by it.

Link: https://lkml.kernel.org/r/20230227173632.3292573-33-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 kernel/fork.c      |  2 +-
 mm/mmap.c          | 11 +++++++----
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1876825d6700..f31c75dc190e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -257,6 +257,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
diff --git a/kernel/fork.c b/kernel/fork.c
index 76fcc08984d4..f86f7e917954 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -480,7 +480,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
-static void __vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/mmap.c b/mm/mmap.c
index b42f58591b9a..511f656eb423 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
 /*
  * Close a vm structure and free it.
  */
-static void remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
 {
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	vm_area_free(vma);
+	if (unreachable)
+		__vm_area_free(vma);
+	else
+		vm_area_free(vma);
 }
 
 static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@ -2145,7 +2148,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += nrpages;
 		vm_stat_account(mm, vma->vm_flags, -nrpages);
-		remove_vma(vma);
+		remove_vma(vma, false);
 	}
 	vm_unacct_memory(nr_accounted);
 	validate_mm(mm);
@@ -3078,7 +3081,7 @@ void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
-		remove_vma(vma);
+		remove_vma(vma, true);
 		count++;
 		cond_resched();
 	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
-- 
cgit v1.2.3


From c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:32 -0800
Subject: mm: separate vma->lock from vm_area_struct

vma->lock being part of the vm_area_struct causes performance regression
during page faults because during contention its count and owner fields
are constantly updated and having other parts of vm_area_struct used
during page fault handling next to them causes constant cache line
bouncing.  Fix that by moving the lock outside of the vm_area_struct.

All attempts to keep vma->lock inside vm_area_struct in a separate cache
line still produce performance regression especially on NUMA machines.
Smallest regression was achieved when lock is placed in the fourth cache
line but that bloats vm_area_struct to 256 bytes.

Considering performance and memory impact, separate lock looks like the
best option.  It increases memory footprint of each VMA but that can be
optimized later if the new size causes issues.  Note that after this
change vma_init() does not allocate or initialize vma->lock anymore.  A
number of drivers allocate a pseudo VMA on the stack but they never use
the VMA's lock, therefore it does not need to be allocated.  The future
drivers which might need the VMA lock should use
vm_area_alloc()/vm_area_free() to allocate the VMA.

Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       | 23 +++++++--------
 include/linux/mm_types.h |  6 +++-
 kernel/fork.c            | 73 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 74 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f31c75dc190e..f5487a86b154 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -628,12 +628,6 @@ struct vm_operations_struct {
 };
 
 #ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
-{
-	init_rwsem(&vma->lock);
-	vma->vm_lock_seq = -1;
-}
-
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -645,17 +639,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 		return false;
 
 	/*
 	 * Overflow might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 */
 	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->lock);
+		up_read(&vma->vm_lock->lock);
 		return false;
 	}
 	return true;
@@ -664,7 +658,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->lock);
+	up_read(&vma->vm_lock->lock);
 	rcu_read_unlock();
 }
 
@@ -687,9 +681,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	down_write(&vma->lock);
+	down_write(&vma->vm_lock->lock);
 	vma->vm_lock_seq = mm_lock_seq;
-	up_write(&vma->lock);
+	up_write(&vma->vm_lock->lock);
 }
 
 static inline bool vma_try_start_write(struct vm_area_struct *vma)
@@ -740,6 +734,10 @@ static inline void vma_mark_detached(struct vm_area_struct *vma,
 
 #endif /* CONFIG_PER_VMA_LOCK */
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -749,7 +747,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_mark_detached(vma, false);
-	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3a6cf4c2edd0..1b3031e95215 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };
 
+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -505,7 +509,7 @@ struct vm_area_struct {
 
 #ifdef CONFIG_PER_VMA_LOCK
 	int vm_lock_seq;
-	struct rw_semaphore lock;
+	struct vma_lock *vm_lock;
 
 	/* Flag to indicate areas detached from the mm->mm_mt tree */
 	bool detached;
diff --git a/kernel/fork.c b/kernel/fork.c
index f86f7e917954..c75088367bb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3152,6 +3194,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
-- 
cgit v1.2.3


From ef6a22b70f6d90449a5c797b8968a682824e2011 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 1 Mar 2023 17:49:00 +0530
Subject: sched/numa: apply the scan delay to every new vma

Pach series "sched/numa: Enhance vma scanning", v3.

The patchset proposes one of the enhancements to numa vma scanning
suggested by Mel.  This is continuation of [3].

Reposting the rebased patchset to akpm mm-unstable tree (March 1)

Existing mechanism of scan period involves, scan period derived from
per-thread stats.  Process Adaptive autoNUMA [1] proposed to gather NUMA
fault stats at per-process level to capture aplication behaviour better.

During that course of discussion, Mel proposed several ideas to enhance
current numa balancing.  One of the suggestion was below

Track what threads access a VMA.  The suggestion was to use an unsigned
long pid_mask and use the lower bits to tag approximately what threads
access a VMA.  Skip VMAs that did not trap a fault.  This would be
approximate because of PID collisions but would reduce scanning of areas
the thread is not interested in.  The above suggestion intends not to
penalize threads that has no interest in the vma, thus reduce scanning
overhead.

V3 changes are mostly based on PeterZ comments (details below in changes)

Summary of patchset:

Current patchset implements:

1. Delay the vma scanning logic for newly created VMA's so that
   additional overhead of scanning is not incurred for short lived tasks
   (implementation by Mel)

2. Store the information of tasks accessing VMA in 2 windows.  It is
   regularly cleared in (4*sysctl_numa_balancing_scan_delay) interval.
   The above time is derived from experimenting (Suggested by PeterZ) to
   balance between frequent clearing vs obsolete access data

3. hash_32 used to encode task index accessing VMA information

4. VMA's acess information is used to skip scanning for the tasks
   which had not accessed VMA

Changes since V2:
patch1:
 - Renaming of structure, macro to function,
 - Add explanation to heuristics
 - Adding more details from result (PeterZ)
 Patch2:
 - Usage of test and set bit (PeterZ)
 - Move storing access PID info to numa_migrate_prep()
 - Add a note on fainess among tasks allowed to scan
   (PeterZ)
 Patch3:
 - Maintain two windows of access PID information
  (PeterZ supported implementation and Gave idea to extend
   to N if needed)
 Patch4:
 - Apply hash_32 function to track VMA accessing PIDs (PeterZ)

Changes since RFC V1:
 - Include Mel's vma scan delay patch
 - Change the accessing pid store logic (Thanks Mel)
 - Fencing structure / code to NUMA_BALANCING (David, Mel)
 - Adding clearing access PID logic (Mel)
 - Descriptive change log ( Mike Rapoport)

Things to ponder over:
==========================================

- Improvement to clearing accessing PIDs logic (discussed in-detail in
  patch3 itself (Done in this patchset by implementing 2 window history)

- Current scan period is not changed in the patchset, so we do see
  frequent tries to scan.  Relaxing scan period dynamically could improve
  results further.

[1] sched/numa: Process Adaptive autoNUMA
 Link: https://lore.kernel.org/lkml/20220128052851.17162-1-bharata@amd.com/T/

[2] RFC V1 Link:
  https://lore.kernel.org/all/cover.1673610485.git.raghavendra.kt@amd.com/

[3] V2 Link:
  https://lore.kernel.org/lkml/cover.1675159422.git.raghavendra.kt@amd.com/


Results:
Summary: Huge autonuma cost reduction seen in mmtest. Kernbench improvement
is more than 5% and huge system time (80%+) improvement from mmtest autonuma.
(dbench had huge std deviation to post)

kernbench
===========
                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Amean     user-256    22002.51 (   0.00%)    22649.95 *  -2.94%*
Amean     syst-256    10162.78 (   0.00%)     8214.13 *  19.17%*
Amean     elsp-256      160.74 (   0.00%)      156.92 *   2.38%*

Duration User       66017.43    67959.84
Duration System     30503.15    24657.03
Duration Elapsed      504.61      493.12

                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Ops NUMA alloc hit                1738835089.00  1738780310.00
Ops NUMA alloc local              1738834448.00  1738779711.00
Ops NUMA base-page range updates      477310.00      392566.00
Ops NUMA PTE updates                  477310.00      392566.00
Ops NUMA hint faults                   96817.00       87555.00
Ops NUMA hint local faults %           10150.00        2192.00
Ops NUMA hint local percent               10.48           2.50
Ops NUMA pages migrated                86660.00       85363.00
Ops AutoNUMA cost                        489.07         442.14

autonumabench
===============
                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Amean     syst-NUMA01                  399.50 (   0.00%)       52.05 *  86.97%*
Amean     syst-NUMA01_THREADLOCAL        0.21 (   0.00%)        0.22 *  -5.41%*
Amean     syst-NUMA02                    0.80 (   0.00%)        0.78 *   2.68%*
Amean     syst-NUMA02_SMT                0.65 (   0.00%)        0.68 *  -3.95%*
Amean     elsp-NUMA01                  313.26 (   0.00%)      313.11 *   0.05%*
Amean     elsp-NUMA01_THREADLOCAL        1.06 (   0.00%)        1.08 *  -1.76%*
Amean     elsp-NUMA02                    3.19 (   0.00%)        3.24 *  -1.52%*
Amean     elsp-NUMA02_SMT                3.72 (   0.00%)        3.61 *   2.92%*

Duration User      396433.47   324835.96
Duration System      2808.70      376.66
Duration Elapsed     2258.61     2258.12

                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Ops NUMA alloc hit                  59921806.00    49623489.00
Ops NUMA alloc miss                        0.00           0.00
Ops NUMA interleave hit                    0.00           0.00
Ops NUMA alloc local                59920880.00    49622594.00
Ops NUMA base-page range updates   152259275.00       50075.00
Ops NUMA PTE updates               152259275.00       50075.00
Ops NUMA PMD updates                       0.00           0.00
Ops NUMA hint faults               154660352.00       39014.00
Ops NUMA hint local faults %       138550501.00       23139.00
Ops NUMA hint local percent               89.58          59.31
Ops NUMA pages migrated              8179067.00       14147.00
Ops AutoNUMA cost                     774522.98         195.69


This patch (of 4):

Currently whenever a new task is created we wait for
sysctl_numa_balancing_scan_delay to avoid unnessary scanning overhead.
Extend the same logic to new or very short-lived VMAs.

[raghavendra.kt@amd.com: add initialization in vm_area_dup())]
Link: https://lkml.kernel.org/r/cover.1677672277.git.raghavendra.kt@amd.com
Link: https://lkml.kernel.org/r/7a6fbba87c8b51e67efd3e74285bb4cb311a16ca.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       | 16 ++++++++++++++++
 include/linux/mm_types.h |  7 +++++++
 kernel/fork.c            |  2 ++
 kernel/sched/fair.c      | 19 +++++++++++++++++++
 4 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5487a86b154..9f08a11b355c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
 #include <linux/memremap.h>
+#include <linux/slab.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -627,6 +628,20 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+	vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+	kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
 #ifdef CONFIG_PER_VMA_LOCK
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
@@ -747,6 +762,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_mark_detached(vma, false);
+	vma_numab_state_init(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1b3031e95215..3e1a42673769 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -475,6 +475,10 @@ struct vma_lock {
 	struct rw_semaphore lock;
 };
 
+struct vma_numab_state {
+	unsigned long next_scan;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -560,6 +564,9 @@ struct vm_area_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
diff --git a/kernel/fork.c b/kernel/fork.c
index c75088367bb4..2bde99037652 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -516,6 +516,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		return NULL;
 	}
 	INIT_LIST_HEAD(&new->anon_vma_chain);
+	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
 	return new;
@@ -523,6 +524,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
+	vma_numab_state_free(vma);
 	free_anon_vma_name(vma);
 	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..7072de1686d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3027,6 +3027,25 @@ static void task_numa_work(struct callback_head *work)
 		if (!vma_is_accessible(vma))
 			continue;
 
+		/* Initialise new per-VMA NUMAB state. */
+		if (!vma->numab_state) {
+			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+				GFP_KERNEL);
+			if (!vma->numab_state)
+				continue;
+
+			vma->numab_state->next_scan = now +
+				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+		}
+
+		/*
+		 * Scanning the VMA's of short lived tasks add more overhead. So
+		 * delay the scan for new VMAs.
+		 */
+		if (mm->numa_scan_seq && time_before(jiffies,
+						vma->numab_state->next_scan))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
-- 
cgit v1.2.3


From fc137c0ddab29b591db6a091dc6d7ce20ccb73f2 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Wed, 1 Mar 2023 17:49:01 +0530
Subject: sched/numa: enhance vma scanning logic

During Numa scanning make sure only relevant vmas of the tasks are
scanned.

Before:
 All the tasks of a process participate in scanning the vma even if they
 do not access vma in it's lifespan.

Now:
 Except cases of first few unconditional scans, if a process do
 not touch vma (exluding false positive cases of PID collisions)
 tasks no longer scan all vma

Logic used:

1) 6 bits of PID used to mark active bit in vma numab status during
   fault to remember PIDs accessing vma.  (Thanks Mel)

2) Subsequently in scan path, vma scanning is skipped if current PID
   had not accessed vma.

3) First two times we do allow unconditional scan to preserve earlier
   behaviour of scanning.

Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch to
store pid information and Peter Zijlstra <peterz@infradead.org> (Usage of
test and set bit)

Link: https://lkml.kernel.org/r/092f03105c7c1d3450f4636b1ea350407f07640e.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Cc: David Hildenbrand <david@redhat.com>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       | 14 ++++++++++++++
 include/linux/mm_types.h |  1 +
 kernel/sched/fair.c      | 19 +++++++++++++++++++
 mm/memory.c              |  3 +++
 4 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9f08a11b355c..215327daffae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1686,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
 	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+	unsigned int pid_bit;
+
+	pid_bit = current->pid % BITS_PER_LONG;
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) {
+		__set_bit(pid_bit, &vma->numab_state->access_pids);
+	}
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1735,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
 	return false;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3e1a42673769..f8cbd8efc7cb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -477,6 +477,7 @@ struct vma_lock {
 
 struct vma_numab_state {
 	unsigned long next_scan;
+	unsigned long access_pids;
 };
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7072de1686d5..ef27b5931480 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2928,6 +2928,21 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+	/*
+	 * Allow unconditional access first two times, so that all the (pages)
+	 * of VMAs get prot_none fault introduced irrespective of accesses.
+	 * This is also done to avoid any side effect of task scanning
+	 * amplifying the unfairness of disjoint set of VMAs' access.
+	 */
+	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+		return true;
+
+	return test_bit(current->pid % BITS_PER_LONG,
+				&vma->numab_state->access_pids);
+}
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3046,6 +3061,10 @@ static void task_numa_work(struct callback_head *work)
 						vma->numab_state->next_scan))
 			continue;
 
+		/* Do not scan the VMA if task has not accessed */
+		if (!vma_is_accessed(vma))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
diff --git a/mm/memory.c b/mm/memory.c
index 9999574a9636..f77fccb5310c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4661,6 +4661,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 {
 	get_page(page);
 
+	/* Record the current PID acceesing VMA */
+	vma_set_access_pid_bit(vma);
+
 	count_vm_numa_event(NUMA_HINT_FAULTS);
 	if (page_nid == numa_node_id()) {
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-- 
cgit v1.2.3


From 20f586486b87dcfe10b8c79398e24e720885588a Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Wed, 1 Mar 2023 17:49:02 +0530
Subject: sched/numa: implement access PID reset logic

This helps to ensure that only recently accessed PIDs scan the VMAs.

Current implementation: (idea supported by PeterZ)

 1. Accessing PID information is maintained in two windows.
    access_pids[1] being newest.

 2. Reset old access PID info i.e.  access_pid[0] every (4 *
    sysctl_numa_balancing_scan_delay) interval after initial scan delay
    period expires.

The above interval seemed to be experimentally optimum since it avoids
frequent reset of access info as well as helps clearing the old access
info regularly.  The reset logic is implemented in scan path.

Link: https://lkml.kernel.org/r/f7a675f66d1442d048b4216b2baf94515012c405.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h       |  4 ++--
 include/linux/mm_types.h |  3 ++-
 kernel/sched/fair.c      | 23 +++++++++++++++++++++--
 3 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 215327daffae..e05a878e186e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1692,8 +1692,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = current->pid % BITS_PER_LONG;
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8cbd8efc7cb..092f842a854f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -477,7 +477,8 @@ struct vma_lock {
 
 struct vma_numab_state {
 	unsigned long next_scan;
-	unsigned long access_pids;
+	unsigned long next_pid_reset;
+	unsigned long access_pids[2];
 };
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef27b5931480..a962d4b60cd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2930,6 +2930,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
 
 static bool vma_is_accessed(struct vm_area_struct *vma)
 {
+	unsigned long pids;
 	/*
 	 * Allow unconditional access first two times, so that all the (pages)
 	 * of VMAs get prot_none fault introduced irrespective of accesses.
@@ -2939,10 +2940,12 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
 		return true;
 
-	return test_bit(current->pid % BITS_PER_LONG,
-				&vma->numab_state->access_pids);
+	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+	return test_bit(current->pid % BITS_PER_LONG, &pids);
 }
 
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3051,6 +3054,10 @@ static void task_numa_work(struct callback_head *work)
 
 			vma->numab_state->next_scan = now +
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+			/* Reset happens after 4 times scan delay of scan start */
+			vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
 		}
 
 		/*
@@ -3065,6 +3072,18 @@ static void task_numa_work(struct callback_head *work)
 		if (!vma_is_accessed(vma))
 			continue;
 
+		/*
+		 * RESET access PIDs regularly for old VMAs. Resetting after checking
+		 * vma for recent access to avoid clearing PID info before access..
+		 */
+		if (mm->numa_scan_seq &&
+				time_after(jiffies, vma->numab_state->next_pid_reset)) {
+			vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+			vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+			vma->numab_state->access_pids[1] = 0;
+		}
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
-- 
cgit v1.2.3


From d46031f40e0f7f7bf63914bb3f2e404ad3886ecd Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Wed, 1 Mar 2023 17:49:03 +0530
Subject: sched/numa: use hash_32 to mix up PIDs accessing VMA

before: last 6 bits of PID is used as index to store information about
tasks accessing VMA's.

after: hash_32 is used to take of cases where tasks are created over a
period of time, and thus improve collision probability.

Result:
The patch series overall improves autonuma cost.

Kernbench around more than 5% improvement and system time in mmtest
autonuma showed more than 80% improvement

Link: https://lkml.kernel.org/r/d5a9f75513300caed74e5c8570bba9317b963c2b.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  | 2 +-
 kernel/sched/fair.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e05a878e186e..e249208f8fbe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1691,7 +1691,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 	unsigned int pid_bit;
 
-	pid_bit = current->pid % BITS_PER_LONG;
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
 	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
 		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a962d4b60cd7..db6fc9d978ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2941,7 +2941,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 		return true;
 
 	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
-	return test_bit(current->pid % BITS_PER_LONG, &pids);
+	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
-- 
cgit v1.2.3


From 810c03d9d7c978b4ee5287d8987043a9be1d614e Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 21 Mar 2023 16:40:37 +0100
Subject: rpmsg: qcom_smd: Make qcom_smd_unregister_edge() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function returned zero unconditionally. Convert it to return no
value instead. This makes it more obvious what happens in the callers.

One caller is converted to return zero explicitly. The only other caller
(smd_subdev_stop() in drivers/remoteproc/qcom_common.c) already ignored
the return value before.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230321154039.355098-2-u.kleine-koenig@pengutronix.de
---
 drivers/rpmsg/qcom_smd.c       | 8 ++++----
 include/linux/rpmsg/qcom_smd.h | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c
index 1044cf03c542..38352f5792f4 100644
--- a/drivers/rpmsg/qcom_smd.c
+++ b/drivers/rpmsg/qcom_smd.c
@@ -1533,7 +1533,7 @@ static int qcom_smd_remove_device(struct device *dev, void *data)
  * qcom_smd_unregister_edge() - release an edge and its children
  * @edge:      edge reference acquired from qcom_smd_register_edge
  */
-int qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
+void qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
 {
 	int ret;
 
@@ -1547,8 +1547,6 @@ int qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
 
 	mbox_free_channel(edge->mbox_chan);
 	device_unregister(&edge->dev);
-
-	return 0;
 }
 EXPORT_SYMBOL(qcom_smd_unregister_edge);
 
@@ -1572,7 +1570,9 @@ static int qcom_smd_remove_edge(struct device *dev, void *data)
 {
 	struct qcom_smd_edge *edge = to_smd_edge(dev);
 
-	return qcom_smd_unregister_edge(edge);
+	qcom_smd_unregister_edge(edge);
+
+	return 0;
 }
 
 /*
diff --git a/include/linux/rpmsg/qcom_smd.h b/include/linux/rpmsg/qcom_smd.h
index 2e92d7407a85..3379bf4e1cb1 100644
--- a/include/linux/rpmsg/qcom_smd.h
+++ b/include/linux/rpmsg/qcom_smd.h
@@ -11,7 +11,7 @@ struct qcom_smd_edge;
 
 struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent,
 					     struct device_node *node);
-int qcom_smd_unregister_edge(struct qcom_smd_edge *edge);
+void qcom_smd_unregister_edge(struct qcom_smd_edge *edge);
 
 #else
 
@@ -22,9 +22,8 @@ qcom_smd_register_edge(struct device *parent,
 	return NULL;
 }
 
-static inline int qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
+static inline void qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
 {
-	return 0;
 }
 
 #endif
-- 
cgit v1.2.3


From 34bf93472f8fb60b4189aa2872471017e739cf0a Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Thu, 30 Mar 2023 12:20:01 +0200
Subject: kallsyms: move module-related functions under correct configs

Functions for searching module kallsyms should have non-empty
definitions only if CONFIG_MODULES=y and CONFIG_KALLSYMS=y. Until now,
only CONFIG_MODULES check was used for many of these, which may have
caused complilation errors on some configs.

This patch moves all relevant functions under the correct configs.

Fixes: bd5314f8dd2d ("kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header")
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202303181535.RFDCnz3E-lkp@intel.com/
Link: https://lore.kernel.org/r/20230330102001.2183693-1-vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/module.h | 135 +++++++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 41cfd3be57e5..886d24877c7c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -608,16 +608,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 /* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
-/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
-   symnum out of range. */
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-			char *name, char *module_name, int *exported);
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name);
-
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
-
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -664,17 +654,6 @@ static inline void __module_get(struct module *module)
 /* Dereference module function descriptor */
 void *dereference_module_function_descriptor(struct module *mod, void *ptr);
 
-/* For kallsyms to ask for address resolution.  namebuf should be at
- * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
- * found, otherwise NULL. */
-const char *module_address_lookup(unsigned long addr,
-			    unsigned long *symbolsize,
-			    unsigned long *offset,
-			    char **modname, const unsigned char **modbuildid,
-			    char *namebuf);
-int lookup_module_symbol_name(unsigned long addr, char *symname);
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);
-
 int register_module_notifier(struct notifier_block *nb);
 int unregister_module_notifier(struct notifier_block *nb);
 
@@ -765,45 +744,6 @@ static inline void module_put(struct module *module)
 
 #define module_name(mod) "kernel"
 
-/* For kallsyms to ask for address resolution.  NULL means not found. */
-static inline const char *module_address_lookup(unsigned long addr,
-					  unsigned long *symbolsize,
-					  unsigned long *offset,
-					  char **modname,
-					  const unsigned char **modbuildid,
-					  char *namebuf)
-{
-	return NULL;
-}
-
-static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
-	return -ERANGE;
-}
-
-static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name)
-{
-	return -ERANGE;
-}
-
-static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
-					char *type, char *name,
-					char *module_name, int *exported)
-{
-	return -ERANGE;
-}
-
-static inline unsigned long module_kallsyms_lookup_name(const char *name)
-{
-	return 0;
-}
-
-static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
-						       const char *name)
-{
-	return 0;
-}
-
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
@@ -899,7 +839,36 @@ int module_kallsyms_on_each_symbol(const char *modname,
 				   int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
 				   void *data);
-#else
+
+/* For kallsyms to ask for address resolution.  namebuf should be at
+ * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
+ * found, otherwise NULL.
+ */
+const char *module_address_lookup(unsigned long addr,
+				  unsigned long *symbolsize,
+				  unsigned long *offset,
+				  char **modname, const unsigned char **modbuildid,
+				  char *namebuf);
+int lookup_module_symbol_name(unsigned long addr, char *symname);
+int lookup_module_symbol_attrs(unsigned long addr,
+			       unsigned long *size,
+			       unsigned long *offset,
+			       char *modname,
+			       char *name);
+
+/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
+ * symnum out of range.
+ */
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *name, char *module_name, int *exported);
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name);
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
+#else	/* CONFIG_MODULES && CONFIG_KALLSYMS */
+
 static inline int module_kallsyms_on_each_symbol(const char *modname,
 						 int (*fn)(void *, const char *,
 						 struct module *, unsigned long),
@@ -907,6 +876,50 @@ static inline int module_kallsyms_on_each_symbol(const char *modname,
 {
 	return -EOPNOTSUPP;
 }
+
+/* For kallsyms to ask for address resolution.  NULL means not found. */
+static inline const char *module_address_lookup(unsigned long addr,
+						unsigned long *symbolsize,
+						unsigned long *offset,
+						char **modname,
+						const unsigned char **modbuildid,
+						char *namebuf)
+{
+	return NULL;
+}
+
+static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+	return -ERANGE;
+}
+
+static inline int lookup_module_symbol_attrs(unsigned long addr,
+					     unsigned long *size,
+					     unsigned long *offset,
+					     char *modname,
+					     char *name)
+{
+	return -ERANGE;
+}
+
+static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
+				     char *type, char *name,
+				     char *module_name, int *exported)
+{
+	return -ERANGE;
+}
+
+static inline unsigned long module_kallsyms_lookup_name(const char *name)
+{
+	return 0;
+}
+
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 #endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */
 
 #endif /* _LINUX_MODULE_H */
-- 
cgit v1.2.3


From 9697b328d11152d7b918ee82438d4283d4edb563 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Mar 2023 11:57:09 +0800
Subject: crypto: hash - Remove maximum statesize limit

Remove the HASH_MAX_STATESIZE limit now that it is unused.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/shash.c        | 3 +--
 include/crypto/hash.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/crypto/shash.c b/crypto/shash.c
index dcc6a7170ce4..4cefa614dbbd 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -569,8 +569,7 @@ int hash_prepare_alg(struct hash_alg_common *alg)
 	struct crypto_istat_hash *istat = hash_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
-	if (alg->digestsize > HASH_MAX_DIGESTSIZE ||
-	    alg->statesize > HASH_MAX_STATESIZE)
+	if (alg->digestsize > HASH_MAX_DIGESTSIZE)
 		return -EINVAL;
 
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 1ed674ba8429..3a04e601ad6a 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -183,8 +183,6 @@ struct shash_desc {
  */
 #define HASH_MAX_DESCSIZE	(sizeof(struct shash_desc) + 360)
 
-#define HASH_MAX_STATESIZE	512
-
 #define SHASH_DESC_ON_STACK(shash, ctx)					     \
 	char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \
 		__aligned(__alignof__(struct shash_desc));		     \
-- 
cgit v1.2.3


From 94623f579ce338b5fa61b5acaa5beb8aa657fb9e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 3 Apr 2023 13:54:37 +0200
Subject: netfilter: br_netfilter: fix recent physdev match breakage

Recent attempt to ensure PREROUTING hook is executed again when a
decrypted ipsec packet received on a bridge passes through the network
stack a second time broke the physdev match in INPUT hook.

We can't discard the nf_bridge info strct from sabotage_in hook, as
this is needed by the physdev match.

Keep the struct around and handle this with another conditional instead.

Fixes: 2b272bb558f1 ("netfilter: br_netfilter: disable sabotage_in hook after first suppression")
Reported-and-tested-by: Farid BENAMROUCHE <fariouche@yahoo.fr>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h          |  1 +
 net/bridge/br_netfilter_hooks.c | 17 +++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff7ad331fb82..1ec3530c8191 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -294,6 +294,7 @@ struct nf_bridge_info {
 	u8			pkt_otherhost:1;
 	u8			in_prerouting:1;
 	u8			bridged_dnat:1;
+	u8			sabotage_in_done:1;
 	__u16			frag_max_size;
 	struct net_device	*physindev;
 
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 638a4d5359db..4bc6761517bb 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv,
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
-	if (nf_bridge && !nf_bridge->in_prerouting &&
-	    !netif_is_l3_master(skb->dev) &&
-	    !netif_is_l3_slave(skb->dev)) {
-		nf_bridge_info_free(skb);
-		state->okfn(state->net, state->sk, skb);
-		return NF_STOLEN;
+	if (nf_bridge) {
+		if (nf_bridge->sabotage_in_done)
+			return NF_ACCEPT;
+
+		if (!nf_bridge->in_prerouting &&
+		    !netif_is_l3_master(skb->dev) &&
+		    !netif_is_l3_slave(skb->dev)) {
+			nf_bridge->sabotage_in_done = 1;
+			state->okfn(state->net, state->sk, skb);
+			return NF_STOLEN;
+		}
 	}
 
 	return NF_ACCEPT;
-- 
cgit v1.2.3


From fb131c472df92b2885262e747906c0a4038c82e5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 4 Apr 2023 13:19:55 +0200
Subject: platform/x86: apple-gmux: Fix iomem_base __iomem annotation

Fix the __iomem annotation of the iomem_base pointers in the apple-gmux
code. The __iomem should go before the *.

This fixes a bunch of sparse warnings like this one:

drivers/platform/x86/apple-gmux.c:224:48: sparse:
 expected void const [noderef] __iomem *
 got unsigned char [usertype] *

Fixes: 0c18184de990 ("platform/x86: apple-gmux: support MMIO gmux on T2 Macs")
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202304040401.IMxt7Ubi-lkp@intel.com/
Suggested-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Orlando Chamberlain <orlandoch.dev@gmail.com>
Link: https://lore.kernel.org/r/20230404111955.43266-1-hdegoede@redhat.com
---
 drivers/platform/x86/apple-gmux.c | 2 +-
 include/linux/apple-gmux.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index 4c311e1dedad..e02b4aea4f1e 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -57,7 +57,7 @@
 struct apple_gmux_config;
 
 struct apple_gmux_data {
-	u8 *__iomem iomem_base;
+	u8 __iomem *iomem_base;
 	unsigned long iostart;
 	unsigned long iolen;
 	const struct apple_gmux_config *config;
diff --git a/include/linux/apple-gmux.h b/include/linux/apple-gmux.h
index a7a7d430024b..206d97ffda79 100644
--- a/include/linux/apple-gmux.h
+++ b/include/linux/apple-gmux.h
@@ -66,7 +66,7 @@ static inline bool apple_gmux_is_indexed(unsigned long iostart)
 
 static inline bool apple_gmux_is_mmio(unsigned long iostart)
 {
-	u8 *__iomem iomem_base = ioremap(iostart, 16);
+	u8 __iomem *iomem_base = ioremap(iostart, 16);
 	u8 val;
 
 	if (!iomem_base)
-- 
cgit v1.2.3


From d0a4564bd024eaa81cab8c7255e7c44230bdd8a2 Mon Sep 17 00:00:00 2001
From: Olivier Moysan <olivier.moysan@foss.st.com>
Date: Tue, 13 Dec 2022 11:27:07 +0100
Subject: pwm: stm32: Enforce settings for PWM capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PWM capture assumes that the input selector is set to default
input and that the slave mode is disabled. Force reset state for
TISEL and SMCR registers to match this requirement.

Note that slave mode disabling is not a pre-requisite by itself
for capture mode, as hardware supports it for PWM capture.
However, the current implementation of the driver does not
allow slave mode for PWM capture. Setting slave mode for PWM
capture results in wrong capture values.

Signed-off-by: Olivier Moysan <olivier.moysan@foss.st.com>
Acked-by: Lee Jones <lee@kernel.org>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/pwm-stm32.c          | 4 ++++
 include/linux/mfd/stm32-timers.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a482f7e0e4ab..62e397aeb9aa 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -207,6 +207,10 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm,
 	regmap_write(priv->regmap, TIM_ARR, priv->max_arr);
 	regmap_write(priv->regmap, TIM_PSC, psc);
 
+	/* Reset input selector to its default input and disable slave mode */
+	regmap_write(priv->regmap, TIM_TISEL, 0x0);
+	regmap_write(priv->regmap, TIM_SMCR, 0x0);
+
 	/* Map TI1 or TI2 PWM input to IC1 & IC2 (or TI3/4 to IC3 & IC4) */
 	regmap_update_bits(priv->regmap,
 			   pwm->hwpwm < 2 ? TIM_CCMR1 : TIM_CCMR2,
diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
index 5f5c43fd69dd..1b94325febb3 100644
--- a/include/linux/mfd/stm32-timers.h
+++ b/include/linux/mfd/stm32-timers.h
@@ -31,6 +31,7 @@
 #define TIM_BDTR	0x44	/* Break and Dead-Time Reg */
 #define TIM_DCR		0x48	/* DMA control register    */
 #define TIM_DMAR	0x4C	/* DMA register for transfer */
+#define TIM_TISEL	0x68	/* Input Selection         */
 
 #define TIM_CR1_CEN	BIT(0)	/* Counter Enable	   */
 #define TIM_CR1_DIR	BIT(4)  /* Counter Direction	   */
-- 
cgit v1.2.3


From 102882b5c62f6bfe403178bb36adef3ba542d148 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Thu, 6 Apr 2023 15:25:21 +0200
Subject: ALSA: document that struct __snd_pcm_mmap_control64 is messed up

I'm not the first one to run into this, see e.g.
https://lore.kernel.org/all/29QBMJU8DE71E.2YZSH8IHT5HMH@mforney.org/

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230406132521.2252019-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/asound.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index de6810e94abe..7eecc99ddef7 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -570,7 +570,8 @@ struct __snd_pcm_mmap_status64 {
 struct __snd_pcm_mmap_control64 {
 	__pad_before_uframe __pad1;
 	snd_pcm_uframes_t appl_ptr;	 /* RW: appl ptr (0...boundary-1) */
-	__pad_before_uframe __pad2;
+	__pad_before_uframe __pad2;	 // This should be __pad_after_uframe, but binary
+					 // backwards compatibility constraints prevent a fix.
 
 	__pad_before_uframe __pad3;
 	snd_pcm_uframes_t  avail_min;	 /* RW: min available frames for wakeup */
-- 
cgit v1.2.3


From 539adfedbd2d5cda2f9e2d83b35b364834b67d58 Mon Sep 17 00:00:00 2001
From: Bastien Nocera <hadess@hadess.net>
Date: Wed, 5 Apr 2023 11:27:54 +0200
Subject: USB: core: Fix docs warning caused by wireless_status feature

Fix wrongly named 'dev' parameter in doc block, should have been iface:
drivers/usb/core/message.c:1939: warning: Function parameter or member 'iface' not described in 'usb_set_wireless_status'
drivers/usb/core/message.c:1939: warning: Excess function parameter 'dev' description in 'usb_set_wireless_status'

And fix missing struct member doc in kernel API, and reorder to
match struct:
include/linux/usb.h:270: warning: Function parameter or member 'wireless_status_work' not described in 'usb_interface'

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/linux-next/20230405114807.5a57bf46@canb.auug.org.au/T/#t
Fixes: 0a4db185f078 ("USB: core: Add API to change the wireless_status")
Signed-off-by: Bastien Nocera <hadess@hadess.net>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20230405092754.36579-1-hadess@hadess.net
[bentiss: fix checkpatch warning]
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/usb/core/message.c | 2 +-
 include/linux/usb.h        | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 33392a6f4972..b5811620f1de 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1926,7 +1926,7 @@ static void __usb_wireless_status_intf(struct work_struct *ws)
 
 /**
  * usb_set_wireless_status - sets the wireless_status struct member
- * @dev: the device to modify
+ * @iface: the interface to modify
  * @status: the new wireless status
  *
  * Set the wireless_status struct member to the new value, and emit
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 6dfd59d3d695..0a81e0f5beb4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -203,14 +203,16 @@ enum usb_wireless_status {
  *	following a reset or suspend operation it doesn't support.
  * @authorized: This allows to (de)authorize individual interfaces instead
  *	a whole device in contrast to the device authorization.
+ * @wireless_status: if the USB device uses a receiver/emitter combo, whether
+ *	the emitter is connected.
+ * @wireless_status_work: Used for scheduling wireless status changes
+ *	from atomic context.
  * @dev: driver model's view of this device
  * @usb_dev: if an interface is bound to the USB major, this will point
  *	to the sysfs representation for that device.
  * @reset_ws: Used for scheduling resets from atomic context.
  * @resetting_device: USB core reset the device, so use alt setting 0 as
  *	current; needs bandwidth alloc after reset.
- * @wireless_status: if the USB device uses a receiver/emitter combo, whether
- *	the emitter is connected.
  *
  * USB device drivers attach to interfaces on a physical device.  Each
  * interface encapsulates a single high level function, such as feeding
-- 
cgit v1.2.3


From 34e582b559c78746c544442b3925554e0665351b Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:11 +0300
Subject: ALSA: hda: add HDaudio Extended link definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add new definitions for the HDaudio Extended link support,
specifically new registers for SoundWire, Intel DMIC and INTEL SSP
interfaces.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda_register.h | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h
index d37cf43546eb..9c7872c0ca79 100644
--- a/include/sound/hda_register.h
+++ b/include/sound/hda_register.h
@@ -258,14 +258,27 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 };
 #define AZX_ML_BASE			0x40
 #define AZX_ML_INTERVAL			0x40
 
+/* HDaudio registers valid for HDaudio and HDaudio extended links */
 #define AZX_REG_ML_LCAP			0x00
-#define AZX_REG_ML_LCTL			0x04
 
+#define AZX_ML_HDA_LCAP_ALT		BIT(28)
+#define AZX_ML_HDA_LCAP_ALT_HDA		0x0
+#define AZX_ML_HDA_LCAP_ALT_HDA_EXT	0x1
+
+#define AZX_ML_HDA_LCAP_INTC		BIT(27)		/* only used if ALT == 1 */
+#define AZX_ML_HDA_LCAP_OFLS		BIT(26)		/* only used if ALT == 1 */
+#define AZX_ML_HDA_LCAP_LSS		BIT(23)		/* only used if ALT == 1 */
+#define AZX_ML_HDA_LCAP_SLCOUNT		GENMASK(22, 20)	/* only used if ALT == 1 */
+
+#define AZX_REG_ML_LCTL			0x04
+#define AZX_ML_LCTL_INTSTS		BIT(31)		/* only used if ALT == 1 */
 #define AZX_ML_LCTL_CPA			BIT(23)
 #define AZX_ML_LCTL_CPA_SHIFT		23
 #define AZX_ML_LCTL_SPA			BIT(16)
 #define AZX_ML_LCTL_SPA_SHIFT		16
-#define AZX_ML_LCTL_SCF			GENMASK(3, 0)
+#define AZX_ML_LCTL_INTEN		BIT(5)		/* only used if ALT == 1 */
+#define AZX_ML_LCTL_OFLEN		BIT(4)		/* only used if ALT == 1 */
+#define AZX_ML_LCTL_SCF			GENMASK(3, 0)	/* only used if ALT == 0 */
 
 #define AZX_REG_ML_LOSIDV		0x08
 
@@ -273,12 +286,35 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 };
 #define AZX_ML_LOSIDV_STREAM_MASK	0xFFFE
 
 #define AZX_REG_ML_LSDIID		0x0C
+#define AZX_REG_ML_LSDIID_OFFSET(x)	(0x0C + (x) * 0x02)	/* only used if ALT == 1 */
+
+/* HDaudio registers only valid if LCAP.ALT == 0 */
 #define AZX_REG_ML_LPSOO		0x10
 #define AZX_REG_ML_LPSIO		0x12
 #define AZX_REG_ML_LWALFC		0x18
 #define AZX_REG_ML_LOUTPAY		0x20
 #define AZX_REG_ML_LINPAY		0x30
 
+/* HDaudio Extended link registers only valid if LCAP.ALT == 1 */
+#define AZX_REG_ML_LSYNC		0x1C
+
+#define AZX_REG_ML_LSYNC_CMDSYNC	BIT(24)
+#define AZX_REG_ML_LSYNC_CMDSYNC_SHIFT	24
+#define AZX_REG_ML_LSYNC_SYNCGO		BIT(23)
+#define AZX_REG_ML_LSYNC_SYNCPU		BIT(20)
+#define AZX_REG_ML_LSYNC_SYNCPRD	GENMASK(19, 0)
+
+#define AZX_REG_ML_LEPTR		0x20
+
+#define AZX_REG_ML_LEPTR_ID		GENMASK(31, 24)
+#define AZX_REG_ML_LEPTR_ID_SHIFT	24
+#define AZX_REG_ML_LEPTR_ID_SDW		0x00
+#define AZX_REG_ML_LEPTR_ID_INTEL_SSP	0xC0
+#define AZX_REG_ML_LEPTR_ID_INTEL_DMIC  0xC1
+#define AZX_REG_ML_LEPTR_ID_INTEL_UAOL  0xC2
+#define AZX_REG_ML_LEPTR_VER		GENMASK(23, 20)
+#define AZX_REG_ML_LEPTR_PTR		GENMASK(19, 0)
+
 /* registers for DMA Resume Capability Structure */
 #define AZX_DRSM_CAP_ID			0x5
 #define AZX_REG_DRSM_CTL		0x4
-- 
cgit v1.2.3


From 18227585d8374ce16d3a2792deb125794710de43 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:14 +0300
Subject: ASoC: SOF: Intel: hda-mlink: move to a dedicated module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some of the functions will be used for SoundWire enumeration and power
management, to avoid cycles in module dependencies and simplify
integration all the HDaudio multi-link needs to move to a dedicated
module.

Drop no longer needed headers at the same time.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-6-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       | 31 +++++++++++++++++++++++++++++++
 sound/soc/sof/intel/Kconfig     |  7 +++++++
 sound/soc/sof/intel/Makefile    |  5 ++++-
 sound/soc/sof/intel/hda-ctrl.c  |  1 +
 sound/soc/sof/intel/hda-dsp.c   |  1 +
 sound/soc/sof/intel/hda-mlink.c | 24 +++++++++++-------------
 sound/soc/sof/intel/hda.c       |  2 ++
 sound/soc/sof/intel/hda.h       | 20 --------------------
 8 files changed, 57 insertions(+), 34 deletions(-)
 create mode 100644 include/sound/hda-mlink.h

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
new file mode 100644
index 000000000000..beef5f509e47
--- /dev/null
+++ b/include/sound/hda-mlink.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Copyright(c) 2022-2023 Intel Corporation. All rights reserved.
+ */
+
+struct hdac_bus;
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_MLINK)
+
+int hda_bus_ml_get_capabilities(struct hdac_bus *bus);
+void hda_bus_ml_free(struct hdac_bus *bus);
+void hda_bus_ml_put_all(struct hdac_bus *bus);
+void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
+int hda_bus_ml_resume(struct hdac_bus *bus);
+int hda_bus_ml_suspend(struct hdac_bus *bus);
+
+#else
+
+static inline int
+hda_bus_ml_get_capabilities(struct hdac_bus *bus) { return 0; }
+
+static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
+static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
+static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
+static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
+static inline int hda_bus_ml_suspend(struct hdac_bus *bus) { return 0; }
+
+#endif /* CONFIG_SND_SOC_SOF_HDA */
diff --git a/sound/soc/sof/intel/Kconfig b/sound/soc/sof/intel/Kconfig
index 715ba8a7f2f8..f4eeacf1f281 100644
--- a/sound/soc/sof/intel/Kconfig
+++ b/sound/soc/sof/intel/Kconfig
@@ -269,6 +269,13 @@ config SND_SOC_SOF_HDA_COMMON
 	select SND_INTEL_DSP_CONFIG
 	select SND_SOC_SOF_HDA_LINK_BASELINE
 	select SND_SOC_SOF_HDA_PROBES
+	select SND_SOC_SOF_HDA_MLINK if SND_SOC_SOF_HDA_LINK
+	help
+	  This option is not user-selectable but automagically handled by
+	  'select' statements at a higher level.
+
+config SND_SOC_SOF_HDA_MLINK
+	tristate
 	help
 	  This option is not user-selectable but automagically handled by
 	  'select' statements at a higher level.
diff --git a/sound/soc/sof/intel/Makefile b/sound/soc/sof/intel/Makefile
index 38ab86b6a9fe..fdb463c12e91 100644
--- a/sound/soc/sof/intel/Makefile
+++ b/sound/soc/sof/intel/Makefile
@@ -5,10 +5,12 @@ snd-sof-acpi-intel-bdw-objs := bdw.o
 
 snd-sof-intel-hda-common-objs := hda.o hda-loader.o hda-stream.o hda-trace.o \
 				 hda-dsp.o hda-ipc.o hda-ctrl.o hda-pcm.o \
-				 hda-dai.o hda-dai-ops.o hda-bus.o hda-mlink.o \
+				 hda-dai.o hda-dai-ops.o hda-bus.o \
 				 skl.o hda-loader-skl.o \
 				 apl.o cnl.o tgl.o icl.o mtl.o hda-common-ops.o
 
+snd-sof-intel-hda-mlink-objs := hda-mlink.o
+
 snd-sof-intel-hda-common-$(CONFIG_SND_SOC_SOF_HDA_PROBES) += hda-probes.o
 
 snd-sof-intel-hda-objs := hda-codec.o
@@ -19,6 +21,7 @@ obj-$(CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP) += snd-sof-intel-atom.o
 obj-$(CONFIG_SND_SOC_SOF_BAYTRAIL) += snd-sof-acpi-intel-byt.o
 obj-$(CONFIG_SND_SOC_SOF_BROADWELL) += snd-sof-acpi-intel-bdw.o
 obj-$(CONFIG_SND_SOC_SOF_HDA_COMMON) += snd-sof-intel-hda-common.o
+obj-$(CONFIG_SND_SOC_SOF_HDA_MLINK) += snd-sof-intel-hda-mlink.o
 obj-$(CONFIG_SND_SOC_SOF_HDA) += snd-sof-intel-hda.o
 
 snd-sof-pci-intel-tng-objs := pci-tng.o
diff --git a/sound/soc/sof/intel/hda-ctrl.c b/sound/soc/sof/intel/hda-ctrl.c
index e1dba6b79065..84bf01bd360a 100644
--- a/sound/soc/sof/intel/hda-ctrl.c
+++ b/sound/soc/sof/intel/hda-ctrl.c
@@ -19,6 +19,7 @@
 #include <sound/hdaudio_ext.h>
 #include <sound/hda_register.h>
 #include <sound/hda_component.h>
+#include <sound/hda-mlink.h>
 #include "../ops.h"
 #include "hda.h"
 
diff --git a/sound/soc/sof/intel/hda-dsp.c b/sound/soc/sof/intel/hda-dsp.c
index 23c05e6f424a..77df536cf901 100644
--- a/sound/soc/sof/intel/hda-dsp.c
+++ b/sound/soc/sof/intel/hda-dsp.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <sound/hdaudio_ext.h>
 #include <sound/hda_register.h>
+#include <sound/hda-mlink.h>
 #include <trace/events/sof_intel.h>
 #include "../sof-audio.h"
 #include "../ops.h"
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index e426d5e41e52..fbf86f2350fb 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -12,21 +12,11 @@
 
 #include <sound/hdaudio_ext.h>
 #include <sound/hda_register.h>
+#include <sound/hda-mlink.h>
 
-#include <linux/acpi.h>
 #include <linux/module.h>
-#include <linux/soundwire/sdw.h>
-#include <linux/soundwire/sdw_intel.h>
-#include <sound/intel-dsp-config.h>
-#include <sound/intel-nhlt.h>
-#include <sound/sof.h>
-#include <sound/sof/xtensa.h>
-#include "../sof-audio.h"
-#include "../sof-pci-dev.h"
-#include "../ops.h"
-#include "hda.h"
-
-#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_MLINK)
 
 int hda_bus_ml_get_capabilities(struct hdac_bus *bus)
 {
@@ -34,6 +24,7 @@ int hda_bus_ml_get_capabilities(struct hdac_bus *bus)
 		return snd_hdac_ext_bus_get_ml_capabilities(bus);
 	return 0;
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_get_capabilities, SND_SOC_SOF_HDA_MLINK);
 
 void hda_bus_ml_free(struct hdac_bus *bus)
 {
@@ -47,6 +38,7 @@ void hda_bus_ml_free(struct hdac_bus *bus)
 		kfree(hlink);
 	}
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_free, SND_SOC_SOF_HDA_MLINK);
 
 void hda_bus_ml_put_all(struct hdac_bus *bus)
 {
@@ -55,6 +47,7 @@ void hda_bus_ml_put_all(struct hdac_bus *bus)
 	list_for_each_entry(hlink, &bus->hlink_list, list)
 		snd_hdac_ext_bus_link_put(bus, hlink);
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_put_all, SND_SOC_SOF_HDA_MLINK);
 
 void hda_bus_ml_reset_losidv(struct hdac_bus *bus)
 {
@@ -64,6 +57,7 @@ void hda_bus_ml_reset_losidv(struct hdac_bus *bus)
 	list_for_each_entry(hlink, &bus->hlink_list, list)
 		writel(0, hlink->ml_addr + AZX_REG_ML_LOSIDV);
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_reset_losidv, SND_SOC_SOF_HDA_MLINK);
 
 int hda_bus_ml_resume(struct hdac_bus *bus)
 {
@@ -80,10 +74,14 @@ int hda_bus_ml_resume(struct hdac_bus *bus)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_resume, SND_SOC_SOF_HDA_MLINK);
 
 int hda_bus_ml_suspend(struct hdac_bus *bus)
 {
 	return snd_hdac_ext_bus_link_power_down_all(bus);
 }
+EXPORT_SYMBOL_NS(hda_bus_ml_suspend, SND_SOC_SOF_HDA_MLINK);
 
 #endif
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 483ec80f3160..8845ae255f1e 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -26,6 +26,7 @@
 #include <sound/intel-nhlt.h>
 #include <sound/sof.h>
 #include <sound/sof/xtensa.h>
+#include <sound/hda-mlink.h>
 #include "../sof-audio.h"
 #include "../sof-pci-dev.h"
 #include "../ops.h"
@@ -1730,3 +1731,4 @@ MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA);
 MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL);
+MODULE_IMPORT_NS(SND_SOC_SOF_HDA_MLINK);
diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h
index 259b34eea677..0e0cfa81a8f2 100644
--- a/sound/soc/sof/intel/hda.h
+++ b/sound/soc/sof/intel/hda.h
@@ -763,26 +763,6 @@ static inline int hda_codec_i915_exit(struct snd_sof_dev *sdev) { return 0; }
 
 #endif
 
-#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
-
-int hda_bus_ml_get_capabilities(struct hdac_bus *bus);
-void hda_bus_ml_free(struct hdac_bus *bus);
-void hda_bus_ml_put_all(struct hdac_bus *bus);
-void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
-int hda_bus_ml_resume(struct hdac_bus *bus);
-int hda_bus_ml_suspend(struct hdac_bus *bus);
-
-#else
-
-static inline int hda_bus_ml_get_capabilities(struct hdac_bus *bus) { return 0; }
-static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
-static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
-static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
-static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
-static inline int hda_bus_ml_suspend(struct hdac_bus *bus) { return 0; }
-
-#endif /* CONFIG_SND_SOC_SOF_HDA */
-
 /*
  * Trace Control.
  */
-- 
cgit v1.2.3


From 17c9b6ec35c0d6228db4f94590702a0cb03cd96a Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:15 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add structures to parse ALT links
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend hdac_ext_link to store information needed for ALT
links. Follow-up patches will include more functional patches for
power-up and down.

Note that this patch suggests the use of an 'eml_lock' to serialize
access to shared registers. SoundWire-specific sequence require the
lock to be taken at a higher level, as a result the helpers added in
follow-up patches will provide 'unlocked' versions when needed.

Also note that the low-level sequences with the 'hdaml_' prefix are
taken directly from the hardware specifications - naming conventions
included. The code will be split in two, with locking and linked-list
management handled separately to avoid mixing required hardware setup
and Linux-based resource management.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-7-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |   4 +-
 sound/soc/sof/intel/hda-mlink.c | 221 +++++++++++++++++++++++++++++++++++++++-
 sound/soc/sof/intel/hda.c       |   2 +-
 3 files changed, 219 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index beef5f509e47..8048bf01c133 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -10,7 +10,7 @@ struct hdac_bus;
 
 #if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_MLINK)
 
-int hda_bus_ml_get_capabilities(struct hdac_bus *bus);
+int hda_bus_ml_init(struct hdac_bus *bus);
 void hda_bus_ml_free(struct hdac_bus *bus);
 void hda_bus_ml_put_all(struct hdac_bus *bus);
 void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
@@ -20,7 +20,7 @@ int hda_bus_ml_suspend(struct hdac_bus *bus);
 #else
 
 static inline int
-hda_bus_ml_get_capabilities(struct hdac_bus *bus) { return 0; }
+hda_bus_ml_init(struct hdac_bus *bus) { return 0; }
 
 static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
 static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index fbf86f2350fb..e6b20182f099 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -14,28 +14,239 @@
 #include <sound/hda_register.h>
 #include <sound/hda-mlink.h>
 
+#include <linux/bitfield.h>
 #include <linux/module.h>
 
 #if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_MLINK)
 
-int hda_bus_ml_get_capabilities(struct hdac_bus *bus)
+/**
+ * struct hdac_ext2_link - HDAudio extended+alternate link
+ *
+ * @hext_link:		hdac_ext_link
+ * @alt:		flag set for alternate extended links
+ * @intc:		boolean for interrupt capable
+ * @ofls:		boolean for offload support
+ * @lss:		boolean for link synchronization capabilities
+ * @slcount:		sublink count
+ * @elid:		extended link ID (AZX_REG_ML_LEPTR_ID_ defines)
+ * @elver:		extended link version
+ * @leptr:		extended link pointer
+ * @eml_lock:		mutual exclusion to access shared registers e.g. CPA/SPA bits
+ * in LCTL register
+ * @base_ptr:		pointer to shim/ip/shim_vs space
+ * @instance_offset:	offset between each of @slcount instances managed by link
+ * @shim_offset:	offset to SHIM register base
+ * @ip_offset:		offset to IP register base
+ * @shim_vs_offset:	offset to vendor-specific (VS) SHIM base
+ */
+struct hdac_ext2_link {
+	struct hdac_ext_link hext_link;
+
+	/* read directly from LCAP register */
+	bool alt;
+	bool intc;
+	bool ofls;
+	bool lss;
+	int slcount;
+	int elid;
+	int elver;
+	u32 leptr;
+
+	struct mutex eml_lock; /* prevent concurrent access to e.g. CPA/SPA */
+
+	/* internal values computed from LCAP contents */
+	void __iomem *base_ptr;
+	u32 instance_offset;
+	u32 shim_offset;
+	u32 ip_offset;
+	u32 shim_vs_offset;
+};
+
+#define hdac_ext_link_to_ext2(h) container_of(h, struct hdac_ext2_link, hext_link)
+
+#define AZX_REG_SDW_INSTANCE_OFFSET			0x8000
+#define AZX_REG_SDW_SHIM_OFFSET				0x0
+#define AZX_REG_SDW_IP_OFFSET				0x100
+#define AZX_REG_SDW_VS_SHIM_OFFSET			0x6000
+
+/* only one instance supported */
+#define AZX_REG_INTEL_DMIC_SHIM_OFFSET			0x0
+#define AZX_REG_INTEL_DMIC_IP_OFFSET			0x100
+#define AZX_REG_INTEL_DMIC_VS_SHIM_OFFSET		0x6000
+
+#define AZX_REG_INTEL_SSP_INSTANCE_OFFSET		0x1000
+#define AZX_REG_INTEL_SSP_SHIM_OFFSET			0x0
+#define AZX_REG_INTEL_SSP_IP_OFFSET			0x100
+#define AZX_REG_INTEL_SSP_VS_SHIM_OFFSET		0xC00
+
+/* only one instance supported */
+#define AZX_REG_INTEL_UAOL_SHIM_OFFSET			0x0
+#define AZX_REG_INTEL_UAOL_IP_OFFSET			0x100
+#define AZX_REG_INTEL_UAOL_VS_SHIM_OFFSET		0xC00
+
+/* HDAML section - this part follows sequences in the hardware specification,
+ * including naming conventions and the use of the hdaml_ prefix.
+ * The code is intentionally minimal with limited dependencies on frameworks or
+ * helpers. Locking and scanning lists is handled at a higher level
+ */
+
+static int hdaml_lnk_enum(struct device *dev, struct hdac_ext2_link *h2link,
+			  void __iomem *ml_addr, int link_idx)
 {
-	if (bus->mlcap)
-		return snd_hdac_ext_bus_get_ml_capabilities(bus);
+	struct hdac_ext_link *hlink = &h2link->hext_link;
+	u32 base_offset;
+
+	hlink->lcaps  = readl(ml_addr + AZX_REG_ML_LCAP);
+
+	h2link->alt = FIELD_GET(AZX_ML_HDA_LCAP_ALT, hlink->lcaps);
+
+	/* handle alternate extensions */
+	if (!h2link->alt) {
+		h2link->slcount = 1;
+
+		/*
+		 * LSDIID is initialized by hardware for HDaudio link,
+		 * it needs to be setup by software for alternate links
+		 */
+		hlink->lsdiid = readw(ml_addr + AZX_REG_ML_LSDIID);
+
+		dev_dbg(dev, "Link %d: HDAudio - lsdiid=%d\n",
+			link_idx, hlink->lsdiid);
+
+		return 0;
+	}
+
+	h2link->intc = FIELD_GET(AZX_ML_HDA_LCAP_INTC, hlink->lcaps);
+	h2link->ofls = FIELD_GET(AZX_ML_HDA_LCAP_OFLS, hlink->lcaps);
+	h2link->lss = FIELD_GET(AZX_ML_HDA_LCAP_LSS, hlink->lcaps);
+
+	/* read slcount (increment due to zero-based hardware representation */
+	h2link->slcount = FIELD_GET(AZX_ML_HDA_LCAP_SLCOUNT, hlink->lcaps) + 1;
+	dev_dbg(dev, "Link %d: HDAudio extended - sublink count %d\n",
+		link_idx, h2link->slcount);
+
+	/* find IP ID and offsets */
+	h2link->leptr = readl(hlink->ml_addr + AZX_REG_ML_LEPTR);
+
+	h2link->elid = FIELD_GET(AZX_REG_ML_LEPTR_ID, h2link->leptr);
+
+	base_offset = FIELD_GET(AZX_REG_ML_LEPTR_PTR, h2link->leptr);
+	h2link->base_ptr = hlink->ml_addr + base_offset;
+
+	switch (h2link->elid) {
+	case AZX_REG_ML_LEPTR_ID_SDW:
+		h2link->shim_offset = AZX_REG_SDW_SHIM_OFFSET;
+		h2link->ip_offset = AZX_REG_SDW_IP_OFFSET;
+		h2link->shim_vs_offset = AZX_REG_SDW_VS_SHIM_OFFSET;
+		dev_dbg(dev, "Link %d: HDAudio extended - SoundWire alternate link, leptr.ptr %#x\n",
+			link_idx, base_offset);
+		break;
+	case AZX_REG_ML_LEPTR_ID_INTEL_DMIC:
+		h2link->shim_offset = AZX_REG_INTEL_DMIC_SHIM_OFFSET;
+		h2link->ip_offset = AZX_REG_INTEL_DMIC_IP_OFFSET;
+		h2link->shim_vs_offset = AZX_REG_INTEL_DMIC_VS_SHIM_OFFSET;
+		dev_dbg(dev, "Link %d: HDAudio extended - INTEL DMIC alternate link, leptr.ptr %#x\n",
+			link_idx, base_offset);
+		break;
+	case AZX_REG_ML_LEPTR_ID_INTEL_SSP:
+		h2link->shim_offset = AZX_REG_INTEL_SSP_SHIM_OFFSET;
+		h2link->ip_offset = AZX_REG_INTEL_SSP_IP_OFFSET;
+		h2link->shim_vs_offset = AZX_REG_INTEL_SSP_VS_SHIM_OFFSET;
+		dev_dbg(dev, "Link %d: HDAudio extended - INTEL SSP alternate link, leptr.ptr %#x\n",
+			link_idx, base_offset);
+		break;
+	case AZX_REG_ML_LEPTR_ID_INTEL_UAOL:
+		h2link->shim_offset = AZX_REG_INTEL_UAOL_SHIM_OFFSET;
+		h2link->ip_offset = AZX_REG_INTEL_UAOL_IP_OFFSET;
+		h2link->shim_vs_offset = AZX_REG_INTEL_UAOL_VS_SHIM_OFFSET;
+		dev_dbg(dev, "Link %d: HDAudio extended - INTEL UAOL alternate link, leptr.ptr %#x\n",
+			link_idx, base_offset);
+		break;
+	default:
+		dev_err(dev, "Link %d: HDAudio extended - Unsupported alternate link, leptr.id=%#02x value\n",
+			link_idx, h2link->elid);
+		return -EINVAL;
+	}
 	return 0;
 }
-EXPORT_SYMBOL_NS(hda_bus_ml_get_capabilities, SND_SOC_SOF_HDA_MLINK);
+
+/* END HDAML section */
+
+static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+	int ret;
+
+	h2link  = kzalloc(sizeof(*h2link), GFP_KERNEL);
+	if (!h2link)
+		return -ENOMEM;
+
+	/* basic initialization */
+	hlink = &h2link->hext_link;
+
+	hlink->index = index;
+	hlink->bus = bus;
+	hlink->ml_addr = bus->mlcap + AZX_ML_BASE + (AZX_ML_INTERVAL * index);
+
+	ret = hdaml_lnk_enum(bus->dev, h2link, hlink->ml_addr, index);
+	if (ret < 0) {
+		kfree(h2link);
+		return ret;
+	}
+
+	mutex_init(&h2link->eml_lock);
+
+	list_add_tail(&hlink->list, &bus->hlink_list);
+
+	/*
+	 * HDaudio regular links are powered-on by default, the
+	 * refcount needs to be initialized.
+	 */
+	if (!h2link->alt)
+		hlink->ref_count = 1;
+
+	return 0;
+}
+
+int hda_bus_ml_init(struct hdac_bus *bus)
+{
+	u32 link_count;
+	int ret;
+	int i;
+
+	if (!bus->mlcap)
+		return 0;
+
+	link_count = readl(bus->mlcap + AZX_REG_ML_MLCD) + 1;
+
+	dev_dbg(bus->dev, "HDAudio Multi-Link count: %d\n", link_count);
+
+	for (i = 0; i < link_count; i++) {
+		ret = hda_ml_alloc_h2link(bus, i);
+		if (ret < 0) {
+			hda_bus_ml_free(bus);
+			return ret;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_NS(hda_bus_ml_init, SND_SOC_SOF_HDA_MLINK);
 
 void hda_bus_ml_free(struct hdac_bus *bus)
 {
 	struct hdac_ext_link *hlink, *_h;
+	struct hdac_ext2_link *h2link;
 
 	if (!bus->mlcap)
 		return;
 
 	list_for_each_entry_safe(hlink, _h, &bus->hlink_list, list) {
 		list_del(&hlink->list);
-		kfree(hlink);
+		h2link = hdac_ext_link_to_ext2(hlink);
+
+		mutex_destroy(&h2link->eml_lock);
+		kfree(h2link);
 	}
 }
 EXPORT_SYMBOL_NS(hda_bus_ml_free, SND_SOC_SOF_HDA_MLINK);
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 8845ae255f1e..5980cf1e27a3 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -918,7 +918,7 @@ static int hda_init_caps(struct snd_sof_dev *sdev)
 		return ret;
 	}
 
-	hda_bus_ml_get_capabilities(bus);
+	hda_bus_ml_init(bus);
 
 	/* Skip SoundWire if it is not supported */
 	if (!(interface_mask & BIT(SOF_DAI_INTEL_ALH)))
-- 
cgit v1.2.3


From fc7dab8ec0b4a451dd5737aa4f06a0e8bd6efa32 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:17 +0300
Subject: ASoC: SOF: Intel: hda-mlink: introduce helpers for 'extended links'
 PM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add helpers to program SPA/CPA bits, using a mutex to access the
shared LCTL register if required.

All links are managed with the same LCTLx.SPA bits. However there are
quite a few implementation details to be aware of:

Legacy HDaudio multi-links are powered-up when exiting reset, which
requires the ref_count to be manually set to one when initializing the
link.

Alternate links for SoundWire/DMIC/SSP need to be explicitly
powered-up before accessing the SHIM/IP/Vendor-Specific SHIM space for
each sublink. DMIC/SSP/SoundWire are all different cases with a
different device/dai/hlink relationship.

SoundWire will handle power management with the auxiliary device
resume/suspend routine. The ref_count is not necessary in this case.

The DMIC/SSP will by contrast handle the power management from DAI
.startup and .shutdown callbacks.

The SSP has a 1:1 mapping between sublink and DAI, but it's
bidirectional so the ref_count will help avoid turning off the sublink
when one of the two directions is still in use.

The DMIC has a single link but two DAIs for data generated at
different sampling frequencies, again the ref_count will make sure the
two DAIs can be used concurrently.

And last the SoundWire Intel require power-up/down and bank switch to
be handled with a lock already taken, so the 'eml_lock' is made
optional with the _unlocked versions of the helpers.

Note that the _check_power_active() implementation is similar to
previous helpers in sound/hda/ext, with sleep duration and timeout
aligned with hardware recommendations. If desired, this helper could
be modified in a second step with .e.g. readl_poll_timeout()

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-9-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  32 ++++++++
 sound/soc/sof/intel/hda-mlink.c | 163 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 8048bf01c133..6ecb4c29e472 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -12,6 +12,13 @@ struct hdac_bus;
 
 int hda_bus_ml_init(struct hdac_bus *bus);
 void hda_bus_ml_free(struct hdac_bus *bus);
+
+int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
+int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
+
+int hdac_bus_eml_power_down(struct hdac_bus *bus, bool alt, int elid, int sublink);
+int hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus);
 void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
 int hda_bus_ml_resume(struct hdac_bus *bus);
@@ -23,6 +30,31 @@ static inline int
 hda_bus_ml_init(struct hdac_bus *bus) { return 0; }
 
 static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
+
+static inline int
+hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_power_down(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return 0;
+}
+
 static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
 static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
 static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 8a66c0a67589..4d016f6ab773 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -170,6 +170,68 @@ static int hdaml_lnk_enum(struct device *dev, struct hdac_ext2_link *h2link,
 	return 0;
 }
 
+/*
+ * Hardware recommendations are to wait ~10us before checking any hardware transition
+ * reported by bits changing status.
+ * This value does not need to be super-precise, a slack of 5us is perfectly acceptable.
+ * The worst-case is about 1ms before reporting an issue
+ */
+#define HDAML_POLL_DELAY_MIN_US 10
+#define HDAML_POLL_DELAY_SLACK_US 5
+#define HDAML_POLL_DELAY_RETRY  100
+
+static int check_sublink_power(u32 __iomem *lctl, int sublink, bool enabled)
+{
+	int mask = BIT(sublink) << AZX_ML_LCTL_CPA_SHIFT;
+	int retry = HDAML_POLL_DELAY_RETRY;
+	u32 val;
+
+	usleep_range(HDAML_POLL_DELAY_MIN_US,
+		     HDAML_POLL_DELAY_MIN_US + HDAML_POLL_DELAY_SLACK_US);
+	do {
+		val = readl(lctl);
+		if (enabled) {
+			if (val & mask)
+				return 0;
+		} else {
+			if (!(val & mask))
+				return 0;
+		}
+		usleep_range(HDAML_POLL_DELAY_MIN_US,
+			     HDAML_POLL_DELAY_MIN_US + HDAML_POLL_DELAY_SLACK_US);
+
+	} while (--retry);
+
+	return -EIO;
+}
+
+static int hdaml_link_init(u32 __iomem *lctl, int sublink)
+{
+	u32 val;
+	u32 mask = BIT(sublink) << AZX_ML_LCTL_SPA_SHIFT;
+
+	val = readl(lctl);
+	val |= mask;
+
+	writel(val, lctl);
+
+	return check_sublink_power(lctl, sublink, true);
+}
+
+static int hdaml_link_shutdown(u32 __iomem *lctl, int sublink)
+{
+	u32 val;
+	u32 mask;
+
+	val = readl(lctl);
+	mask = BIT(sublink) << AZX_ML_LCTL_SPA_SHIFT;
+	val &= ~mask;
+
+	writel(val, lctl);
+
+	return check_sublink_power(lctl, sublink, false);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -251,6 +313,107 @@ void hda_bus_ml_free(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hda_bus_ml_free, SND_SOC_SOF_HDA_MLINK);
 
+static struct hdac_ext2_link *
+find_ext2_link(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext_link *hlink;
+
+	list_for_each_entry(hlink, &bus->hlink_list, list) {
+		struct hdac_ext2_link *h2link = hdac_ext_link_to_ext2(hlink);
+
+		if (h2link->alt == alt && h2link->elid == elid)
+			return h2link;
+	}
+
+	return NULL;
+}
+
+static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
+				      bool eml_lock)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+	int ret = 0;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return -ENODEV;
+
+	if (sublink >= h2link->slcount)
+		return -EINVAL;
+
+	hlink = &h2link->hext_link;
+
+	if (eml_lock)
+		mutex_lock(&h2link->eml_lock);
+
+	if (++hlink->ref_count > 1)
+		goto skip_init;
+
+	ret = hdaml_link_init(hlink->ml_addr + AZX_REG_ML_LCTL, sublink);
+
+skip_init:
+	if (eml_lock)
+		mutex_unlock(&h2link->eml_lock);
+
+	return ret;
+}
+
+int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return hdac_bus_eml_power_up_base(bus, alt, elid, sublink, true);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_power_up, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return hdac_bus_eml_power_up_base(bus, alt, elid, sublink, false);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_power_up_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+static int hdac_bus_eml_power_down_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
+					bool eml_lock)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+	int ret = 0;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return -ENODEV;
+
+	if (sublink >= h2link->slcount)
+		return -EINVAL;
+
+	hlink = &h2link->hext_link;
+
+	if (eml_lock)
+		mutex_lock(&h2link->eml_lock);
+
+	if (--hlink->ref_count > 0)
+		goto skip_shutdown;
+
+	ret = hdaml_link_shutdown(hlink->ml_addr + AZX_REG_ML_LCTL, sublink);
+
+skip_shutdown:
+	if (eml_lock)
+		mutex_unlock(&h2link->eml_lock);
+
+	return ret;
+}
+
+int hdac_bus_eml_power_down(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return hdac_bus_eml_power_down_base(bus, alt, elid, sublink, true);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_power_down, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	return hdac_bus_eml_power_down_base(bus, alt, elid, sublink, false);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_power_down_unlocked, SND_SOC_SOF_HDA_MLINK);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus)
 {
 	struct hdac_ext_link *hlink;
-- 
cgit v1.2.3


From 725218f1d8210ee6eba2c2e923ea26a312fb3f46 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:18 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add convenience helpers for SoundWire PM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The updated SoundWire Intel driver will need to rely on Extended
HDaudio links for power management, but it doesn't need to be aware of
all the HDaudio structures. Add convenience helpers to avoid polluting
SoundWire drivers too much with HDaudio information.

Since the SoundWire/Intel solution already takes the lock at a higher
level, the _unlocked PM helpers are used.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-10-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  9 +++++++++
 sound/soc/sof/intel/hda-mlink.c | 12 ++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 6ecb4c29e472..c47008a603d0 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -19,6 +19,9 @@ int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int
 int hdac_bus_eml_power_down(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
 
+int hdac_bus_eml_sdw_power_up_unlocked(struct hdac_bus *bus, int sublink);
+int hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus);
 void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
 int hda_bus_ml_resume(struct hdac_bus *bus);
@@ -55,6 +58,12 @@ hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, int s
 	return 0;
 }
 
+static inline int
+hdac_bus_eml_sdw_power_up_unlocked(struct hdac_bus *bus, int sublink) { return 0; }
+
+static inline int
+hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink) { return 0; }
+
 static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
 static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
 static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 4d016f6ab773..5045c10bed76 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -414,6 +414,18 @@ int hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, i
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_power_down_unlocked, SND_SOC_SOF_HDA_MLINK);
 
+int hdac_bus_eml_sdw_power_up_unlocked(struct hdac_bus *bus, int sublink)
+{
+	return hdac_bus_eml_power_up_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW, sublink);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_power_up_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink)
+{
+	return hdac_bus_eml_power_down_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW, sublink);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_power_down_unlocked, SND_SOC_SOF_HDA_MLINK);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus)
 {
 	struct hdac_ext_link *hlink;
-- 
cgit v1.2.3


From 6857c7ee202cf4b8224882bcee8e3b97c26b6484 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:19 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helper to return sublink count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is needed for SoundWire integration.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-11-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  5 +++++
 sound/soc/sof/intel/hda-mlink.c | 12 ++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index c47008a603d0..d7f65cb6e175 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -13,6 +13,8 @@ struct hdac_bus;
 int hda_bus_ml_init(struct hdac_bus *bus);
 void hda_bus_ml_free(struct hdac_bus *bus);
 
+int hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid);
+
 int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
 
@@ -34,6 +36,9 @@ hda_bus_ml_init(struct hdac_bus *bus) { return 0; }
 
 static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
 
+static inline int
+hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid) { return 0; }
+
 static inline int
 hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 5045c10bed76..02743c342f28 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -328,6 +328,18 @@ find_ext2_link(struct hdac_bus *bus, bool alt, int elid)
 	return NULL;
 }
 
+int hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return 0;
+
+	return h2link->slcount;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_get_count, SND_SOC_SOF_HDA_MLINK);
+
 static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
 				      bool eml_lock)
 {
-- 
cgit v1.2.3


From 2e4288319ad3e18a266461d3e63c86eb36f7a152 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:20 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helpers to enable/check interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When INTC is set, LCTL exposes INTEN and INTSTS fields.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-12-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  8 ++++++
 sound/soc/sof/intel/hda-mlink.c | 62 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index d7f65cb6e175..872652209a47 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -14,6 +14,8 @@ int hda_bus_ml_init(struct hdac_bus *bus);
 void hda_bus_ml_free(struct hdac_bus *bus);
 
 int hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid);
+void hdac_bus_eml_enable_interrupt(struct hdac_bus *bus, bool alt, int elid, bool enable);
+bool hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid);
 
 int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
@@ -39,6 +41,12 @@ static inline void hda_bus_ml_free(struct hdac_bus *bus) { }
 static inline int
 hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid) { return 0; }
 
+static inline void
+hdac_bus_eml_enable_interrupt(struct hdac_bus *bus, bool alt, int elid, bool enable) { }
+
+static inline bool
+hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid) { return false; }
+
 static inline int
 hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 02743c342f28..026642659037 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -232,6 +232,28 @@ static int hdaml_link_shutdown(u32 __iomem *lctl, int sublink)
 	return check_sublink_power(lctl, sublink, false);
 }
 
+static void hdaml_link_enable_interrupt(u32 __iomem *lctl, bool enable)
+{
+	u32 val;
+
+	val = readl(lctl);
+	if (enable)
+		val |= AZX_ML_LCTL_INTEN;
+	else
+		val &= ~AZX_ML_LCTL_INTEN;
+
+	writel(val, lctl);
+}
+
+static bool hdaml_link_check_interrupt(u32 __iomem *lctl)
+{
+	u32 val;
+
+	val = readl(lctl);
+
+	return val & AZX_ML_LCTL_INTSTS;
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -340,6 +362,46 @@ int hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_get_count, SND_SOC_SOF_HDA_MLINK);
 
+void hdac_bus_eml_enable_interrupt(struct hdac_bus *bus, bool alt, int elid, bool enable)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return;
+
+	if (!h2link->intc)
+		return;
+
+	hlink = &h2link->hext_link;
+
+	mutex_lock(&h2link->eml_lock);
+
+	hdaml_link_enable_interrupt(hlink->ml_addr + AZX_REG_ML_LCTL, enable);
+
+	mutex_unlock(&h2link->eml_lock);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_enable_interrupt, SND_SOC_SOF_HDA_MLINK);
+
+bool hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return false;
+
+	if (!h2link->intc)
+		return false;
+
+	hlink = &h2link->hext_link;
+
+	return hdaml_link_check_interrupt(hlink->ml_addr + AZX_REG_ML_LCTL);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_check_interrupt, SND_SOC_SOF_HDA_MLINK);
+
 static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
 				      bool eml_lock)
 {
-- 
cgit v1.2.3


From 02ba1b021c28670011b361be3be4bda49f348c43 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:21 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helpers to set link SYNC frequency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These helpers configure the ratio between the base clock and the
hardware signal used for link synchronization.

The SYNCPRD is written before the first sublink is powered-up. The
SYNCPU bit is set, but it will only be cleared after the link is
powered-up, hence the implementation with a set/wait pattern.

These helpers are currently only needed by SoundWire support, where
the lock is taken at a higher level, so only the _unlocked versions
are exposed for now.

Note that the _wait_bit() implementation is similar to previous
helpers in drivers/soundwire, but with sleep duration and timeout
aligned with hardware recommendations. If desired, this helper could
be modified in a second step with e.g. readl_poll_timeout().

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-13-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       | 27 +++++++++++++
 sound/soc/sof/intel/hda-mlink.c | 90 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 872652209a47..80af2a3996d7 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -17,6 +17,12 @@ int hdac_bus_eml_get_count(struct hdac_bus *bus, bool alt, int elid);
 void hdac_bus_eml_enable_interrupt(struct hdac_bus *bus, bool alt, int elid, bool enable);
 bool hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid);
 
+int hdac_bus_eml_set_syncprd_unlocked(struct hdac_bus *bus, bool alt, int elid, u32 syncprd);
+int hdac_bus_eml_sdw_set_syncprd_unlocked(struct hdac_bus *bus, u32 syncprd);
+
+int hdac_bus_eml_wait_syncpu_unlocked(struct hdac_bus *bus, bool alt, int elid);
+int hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus);
+
 int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
 
@@ -47,6 +53,27 @@ hdac_bus_eml_enable_interrupt(struct hdac_bus *bus, bool alt, int elid, bool ena
 static inline bool
 hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid) { return false; }
 
+static inline int
+hdac_bus_eml_set_syncprd_unlocked(struct hdac_bus *bus, bool alt, int elid, u32 syncprd)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_sdw_set_syncprd_unlocked(struct hdac_bus *bus, u32 syncprd)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_wait_syncpu_unlocked(struct hdac_bus *bus, bool alt, int elid)
+{
+	return 0;
+}
+
+static inline int
+hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus) { return 0; }
+
 static inline int
 hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 026642659037..a364d6b5935d 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -254,6 +254,46 @@ static bool hdaml_link_check_interrupt(u32 __iomem *lctl)
 	return val & AZX_ML_LCTL_INTSTS;
 }
 
+static int hdaml_wait_bit(void __iomem *base, int offset, u32 mask, u32 target)
+{
+	int timeout = HDAML_POLL_DELAY_RETRY;
+	u32 reg_read;
+
+	do {
+		reg_read = readl(base + offset);
+		if ((reg_read & mask) == target)
+			return 0;
+
+		timeout--;
+		usleep_range(HDAML_POLL_DELAY_MIN_US,
+			     HDAML_POLL_DELAY_MIN_US + HDAML_POLL_DELAY_SLACK_US);
+	} while (timeout != 0);
+
+	return -EAGAIN;
+}
+
+static void hdaml_link_set_syncprd(u32 __iomem *lsync, u32 syncprd)
+{
+	u32 val;
+
+	val = readl(lsync);
+	val &= ~AZX_REG_ML_LSYNC_SYNCPRD;
+	val |= (syncprd & AZX_REG_ML_LSYNC_SYNCPRD);
+
+	/*
+	 * set SYNCPU but do not wait. The bit is cleared by hardware when
+	 * the link becomes active.
+	 */
+	val |= AZX_REG_ML_LSYNC_SYNCPU;
+
+	writel(val, lsync);
+}
+
+static int hdaml_link_wait_syncpu(u32 __iomem *lsync)
+{
+	return hdaml_wait_bit(lsync, 0, AZX_REG_ML_LSYNC_SYNCPU, 0);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -402,6 +442,56 @@ bool hdac_bus_eml_check_interrupt(struct hdac_bus *bus, bool alt, int elid)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_check_interrupt, SND_SOC_SOF_HDA_MLINK);
 
+int hdac_bus_eml_set_syncprd_unlocked(struct hdac_bus *bus, bool alt, int elid, u32 syncprd)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return 0;
+
+	if (!h2link->lss)
+		return 0;
+
+	hlink = &h2link->hext_link;
+
+	hdaml_link_set_syncprd(hlink->ml_addr + AZX_REG_ML_LSYNC, syncprd);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_set_syncprd_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_sdw_set_syncprd_unlocked(struct hdac_bus *bus, u32 syncprd)
+{
+	return hdac_bus_eml_set_syncprd_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW, syncprd);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_set_syncprd_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_wait_syncpu_unlocked(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return 0;
+
+	if (!h2link->lss)
+		return 0;
+
+	hlink = &h2link->hext_link;
+
+	return hdaml_link_wait_syncpu(hlink->ml_addr + AZX_REG_ML_LSYNC);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_wait_syncpu_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus)
+{
+	return hdac_bus_eml_wait_syncpu_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_wait_syncpu_unlocked, SND_SOC_SOF_HDA_MLINK);
+
 static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
 				      bool eml_lock)
 {
-- 
cgit v1.2.3


From 1f5a6e8b5147b7bc49c0e091f8b458e45d8ee56c Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:22 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helpers for sync_arm/sync_go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The multi-link synchronization uses the same concept and registers,
but moved to the HDAudio extended links.

Add helpers for sync_arm and sync_go which are the basic for the bus
reset, bank switch and clock stop.

Since SoundWire is the only user of those helpers, only expose the
_unlocked versions for now.

Note that SYNCGO is a write-only bit, so no error can be reported. We
still return 0 for compatibility with the SoundWire stream management
headers.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-14-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       | 18 +++++++++++
 sound/soc/sof/intel/hda-mlink.c | 70 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 80af2a3996d7..234af269495e 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -23,6 +23,12 @@ int hdac_bus_eml_sdw_set_syncprd_unlocked(struct hdac_bus *bus, u32 syncprd);
 int hdac_bus_eml_wait_syncpu_unlocked(struct hdac_bus *bus, bool alt, int elid);
 int hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus);
 
+void hdac_bus_eml_sync_arm_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
+void hdac_bus_eml_sdw_sync_arm_unlocked(struct hdac_bus *bus, int sublink);
+
+int hdac_bus_eml_sync_go_unlocked(struct hdac_bus *bus, bool alt, int elid);
+int hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus);
+
 int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
 
@@ -74,6 +80,18 @@ hdac_bus_eml_wait_syncpu_unlocked(struct hdac_bus *bus, bool alt, int elid)
 static inline int
 hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus) { return 0; }
 
+static inline void
+hdac_bus_eml_sync_arm_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink) { }
+
+static inline void
+hdac_bus_eml_sdw_sync_arm_unlocked(struct hdac_bus *bus, int sublink) { }
+
+static inline int
+hdac_bus_eml_sync_go_unlocked(struct hdac_bus *bus, bool alt, int elid) { return 0; }
+
+static inline int
+hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus) { return 0; }
+
 static inline int
 hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index a364d6b5935d..c8371ab03c0a 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -294,6 +294,26 @@ static int hdaml_link_wait_syncpu(u32 __iomem *lsync)
 	return hdaml_wait_bit(lsync, 0, AZX_REG_ML_LSYNC_SYNCPU, 0);
 }
 
+static void hdaml_link_sync_arm(u32 __iomem *lsync, int sublink)
+{
+	u32 val;
+
+	val = readl(lsync);
+	val |= (AZX_REG_ML_LSYNC_CMDSYNC << sublink);
+
+	writel(val, lsync);
+}
+
+static void hdaml_link_sync_go(u32 __iomem *lsync)
+{
+	u32 val;
+
+	val = readl(lsync);
+	val |= AZX_REG_ML_LSYNC_SYNCGO;
+
+	writel(val, lsync);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -492,6 +512,56 @@ int hdac_bus_eml_sdw_wait_syncpu_unlocked(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_wait_syncpu_unlocked, SND_SOC_SOF_HDA_MLINK);
 
+void hdac_bus_eml_sync_arm_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return;
+
+	if (!h2link->lss)
+		return;
+
+	hlink = &h2link->hext_link;
+
+	hdaml_link_sync_arm(hlink->ml_addr + AZX_REG_ML_LSYNC, sublink);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sync_arm_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+void hdac_bus_eml_sdw_sync_arm_unlocked(struct hdac_bus *bus, int sublink)
+{
+	hdac_bus_eml_sync_arm_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW, sublink);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_sync_arm_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_sync_go_unlocked(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return 0;
+
+	if (!h2link->lss)
+		return 0;
+
+	hlink = &h2link->hext_link;
+
+	hdaml_link_sync_go(hlink->ml_addr + AZX_REG_ML_LSYNC);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sync_go_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+int hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus)
+{
+	return hdac_bus_eml_sync_go_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_sync_go_unlocked, SND_SOC_SOF_HDA_MLINK);
+
 static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
 				      bool eml_lock)
 {
-- 
cgit v1.2.3


From d56d205857a2f6e10a1047532134321072f758b7 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:23 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helper to check cmdsync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helper is an optimization where sync_go is only called when the
cmdsync field is actually set to a non-zero value.

Since this is also only used by SoundWire for now, only expose the
_unlocked version.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-15-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  9 +++++++++
 sound/soc/sof/intel/hda-mlink.c | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 234af269495e..0c71b843f2cf 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -29,6 +29,9 @@ void hdac_bus_eml_sdw_sync_arm_unlocked(struct hdac_bus *bus, int sublink);
 int hdac_bus_eml_sync_go_unlocked(struct hdac_bus *bus, bool alt, int elid);
 int hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus);
 
+bool hdac_bus_eml_check_cmdsync_unlocked(struct hdac_bus *bus, bool alt, int elid);
+bool hdac_bus_eml_sdw_check_cmdsync_unlocked(struct hdac_bus *bus);
+
 int hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink);
 int hdac_bus_eml_power_up_unlocked(struct hdac_bus *bus, bool alt, int elid, int sublink);
 
@@ -92,6 +95,12 @@ hdac_bus_eml_sync_go_unlocked(struct hdac_bus *bus, bool alt, int elid) { return
 static inline int
 hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus) { return 0; }
 
+static inline bool
+hdac_bus_eml_check_cmdsync_unlocked(struct hdac_bus *bus, bool alt, int elid) { return false; }
+
+static inline bool
+hdac_bus_eml_sdw_check_cmdsync_unlocked(struct hdac_bus *bus) { return false; }
+
 static inline int
 hdac_bus_eml_power_up(struct hdac_bus *bus, bool alt, int elid, int sublink)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index c8371ab03c0a..cbbc8639691c 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -314,6 +314,15 @@ static void hdaml_link_sync_go(u32 __iomem *lsync)
 	writel(val, lsync);
 }
 
+static bool hdaml_link_check_cmdsync(u32 __iomem *lsync, u32 cmdsync_mask)
+{
+	u32 val;
+
+	val = readl(lsync);
+
+	return !!(val & cmdsync_mask);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -562,6 +571,35 @@ int hdac_bus_eml_sdw_sync_go_unlocked(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_sync_go_unlocked, SND_SOC_SOF_HDA_MLINK);
 
+bool hdac_bus_eml_check_cmdsync_unlocked(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+	u32 cmdsync_mask;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return 0;
+
+	if (!h2link->lss)
+		return 0;
+
+	hlink = &h2link->hext_link;
+
+	cmdsync_mask = GENMASK(AZX_REG_ML_LSYNC_CMDSYNC_SHIFT + h2link->slcount - 1,
+			       AZX_REG_ML_LSYNC_CMDSYNC_SHIFT);
+
+	return hdaml_link_check_cmdsync(hlink->ml_addr + AZX_REG_ML_LSYNC,
+					cmdsync_mask);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_check_cmdsync_unlocked, SND_SOC_SOF_HDA_MLINK);
+
+bool hdac_bus_eml_sdw_check_cmdsync_unlocked(struct hdac_bus *bus)
+{
+	return hdac_bus_eml_check_cmdsync_unlocked(bus, true, AZX_REG_ML_LEPTR_ID_SDW);
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_check_cmdsync_unlocked, SND_SOC_SOF_HDA_MLINK);
+
 static int hdac_bus_eml_power_up_base(struct hdac_bus *bus, bool alt, int elid, int sublink,
 				      bool eml_lock)
 {
-- 
cgit v1.2.3


From 87a6ddc0cf1c62dbc7c2cc4b5f764a2e992c5ba6 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:24 +0300
Subject: ASoC: SOF: Intel: hda-mlink: program SoundWire LSDIID registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each SoundWire peripheral can be programmed from the manager side
either with a regular command FIFO, or with the HDaudio CORB/RIRB
DMA-based mechanism. The mapping between SoundWire peripheral and SDI
address is handled with the LSDIID register.

This mapping only works of course if each peripheral has a unique
address across all links. This has already been enforced in previous
Intel contributions allowing for an IDA-based solution for the device
number allocation.

The checks on the dev_num are handled at the SoundWire level, but the
locking is handled at the hda-mlink level.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-16-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  5 +++++
 sound/soc/sof/intel/hda-mlink.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index 0c71b843f2cf..d626ef964ea7 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -41,6 +41,8 @@ int hdac_bus_eml_power_down_unlocked(struct hdac_bus *bus, bool alt, int elid, i
 int hdac_bus_eml_sdw_power_up_unlocked(struct hdac_bus *bus, int sublink);
 int hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink);
 
+int hdac_bus_eml_sdw_set_lsdiid(struct hdac_bus *bus, int sublink, int dev_num);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus);
 void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
 int hda_bus_ml_resume(struct hdac_bus *bus);
@@ -131,6 +133,9 @@ hdac_bus_eml_sdw_power_up_unlocked(struct hdac_bus *bus, int sublink) { return 0
 static inline int
 hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink) { return 0; }
 
+static inline int
+hdac_bus_eml_sdw_set_lsdiid(struct hdac_bus *bus, int sublink, int dev_num) { return 0; }
+
 static inline void hda_bus_ml_put_all(struct hdac_bus *bus) { }
 static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
 static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index cbbc8639691c..5810d6f8719c 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -323,6 +323,16 @@ static bool hdaml_link_check_cmdsync(u32 __iomem *lsync, u32 cmdsync_mask)
 	return !!(val & cmdsync_mask);
 }
 
+static void hdaml_link_set_lsdiid(u32 __iomem *lsdiid, int dev_num)
+{
+	u32 val;
+
+	val = readl(lsdiid);
+	val |= BIT(dev_num);
+
+	writel(val, lsdiid);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -698,6 +708,26 @@ int hdac_bus_eml_sdw_power_down_unlocked(struct hdac_bus *bus, int sublink)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_power_down_unlocked, SND_SOC_SOF_HDA_MLINK);
 
+int hdac_bus_eml_sdw_set_lsdiid(struct hdac_bus *bus, int sublink, int dev_num)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, true, AZX_REG_ML_LEPTR_ID_SDW);
+	if (!h2link)
+		return -ENODEV;
+
+	hlink = &h2link->hext_link;
+
+	mutex_lock(&h2link->eml_lock);
+
+	hdaml_link_set_lsdiid(hlink->ml_addr + AZX_REG_ML_LSDIID_OFFSET(sublink), dev_num);
+
+	mutex_unlock(&h2link->eml_lock);
+
+	return 0;
+} EXPORT_SYMBOL_NS(hdac_bus_eml_sdw_set_lsdiid, SND_SOC_SOF_HDA_MLINK);
+
 void hda_bus_ml_put_all(struct hdac_bus *bus)
 {
 	struct hdac_ext_link *hlink;
-- 
cgit v1.2.3


From 2b864e969be29958eb810fb611d1eac50eb7104b Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:25 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helpers to retrieve DMIC/SSP hlink
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Small helpers to make DAI ops simpler.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-17-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       | 10 ++++++++++
 sound/soc/sof/intel/hda-mlink.c | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index d626ef964ea7..bfd2c77c184d 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -7,6 +7,7 @@
  */
 
 struct hdac_bus;
+struct hdac_ext_link;
 
 #if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_MLINK)
 
@@ -48,6 +49,9 @@ void hda_bus_ml_reset_losidv(struct hdac_bus *bus);
 int hda_bus_ml_resume(struct hdac_bus *bus);
 int hda_bus_ml_suspend(struct hdac_bus *bus);
 
+struct hdac_ext_link *hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus);
+struct hdac_ext_link *hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus);
+
 #else
 
 static inline int
@@ -141,4 +145,10 @@ static inline void hda_bus_ml_reset_losidv(struct hdac_bus *bus) { }
 static inline int hda_bus_ml_resume(struct hdac_bus *bus) { return 0; }
 static inline int hda_bus_ml_suspend(struct hdac_bus *bus) { return 0; }
 
+static inline struct hdac_ext_link *
+hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus) { return NULL; }
+
+static inline struct hdac_ext_link *
+hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus) { return NULL; }
+
 #endif /* CONFIG_SND_SOC_SOF_HDA */
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 5810d6f8719c..06613c081410 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -788,6 +788,30 @@ int hda_bus_ml_suspend(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hda_bus_ml_suspend, SND_SOC_SOF_HDA_MLINK);
 
+struct hdac_ext_link *hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus)
+{
+	struct hdac_ext2_link *h2link;
+
+	h2link = find_ext2_link(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_SSP);
+	if (!h2link)
+		return NULL;
+
+	return &h2link->hext_link;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_ssp_get_hlink, SND_SOC_SOF_HDA_MLINK);
+
+struct hdac_ext_link *hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus)
+{
+	struct hdac_ext2_link *h2link;
+
+	h2link = find_ext2_link(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_DMIC);
+	if (!h2link)
+		return NULL;
+
+	return &h2link->hext_link;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_dmic_get_hlink, SND_SOC_SOF_HDA_MLINK);
+
 #endif
 
 MODULE_LICENSE("Dual BSD/GPL");
-- 
cgit v1.2.3


From 82958c406da4fec8f818826624c33cf2e62f4147 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:26 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helper to offload link ownership
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For DMIC and SSP, the DSP will be responsible for programming the
blobs and link registers.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-18-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  7 +++++++
 sound/soc/sof/intel/hda-mlink.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index bfd2c77c184d..fee42c432fa3 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -52,6 +52,8 @@ int hda_bus_ml_suspend(struct hdac_bus *bus);
 struct hdac_ext_link *hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus);
 struct hdac_ext_link *hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus);
 
+int hdac_bus_eml_enable_offload(struct hdac_bus *bus, bool alt, int elid, bool enable);
+
 #else
 
 static inline int
@@ -151,4 +153,9 @@ hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus) { return NULL; }
 static inline struct hdac_ext_link *
 hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus) { return NULL; }
 
+static inline int
+hdac_bus_eml_enable_offload(struct hdac_bus *bus, bool alt, int elid, bool enable)
+{
+	return 0;
+}
 #endif /* CONFIG_SND_SOC_SOF_HDA */
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index 06613c081410..b9ce07c38bb5 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -333,6 +333,18 @@ static void hdaml_link_set_lsdiid(u32 __iomem *lsdiid, int dev_num)
 	writel(val, lsdiid);
 }
 
+static void hdaml_lctl_offload_enable(u32 __iomem *lctl, bool enable)
+{
+	u32 val = readl(lctl);
+
+	if (enable)
+		val |=  AZX_ML_LCTL_OFLEN;
+	else
+		val &=  ~AZX_ML_LCTL_OFLEN;
+
+	writel(val, lctl);
+}
+
 /* END HDAML section */
 
 static int hda_ml_alloc_h2link(struct hdac_bus *bus, int index)
@@ -812,6 +824,30 @@ struct hdac_ext_link *hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hdac_bus_eml_dmic_get_hlink, SND_SOC_SOF_HDA_MLINK);
 
+int hdac_bus_eml_enable_offload(struct hdac_bus *bus, bool alt, int elid, bool enable)
+{
+	struct hdac_ext2_link *h2link;
+	struct hdac_ext_link *hlink;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return -ENODEV;
+
+	if (!h2link->ofls)
+		return 0;
+
+	hlink = &h2link->hext_link;
+
+	mutex_lock(&h2link->eml_lock);
+
+	hdaml_lctl_offload_enable(hlink->ml_addr + AZX_REG_ML_LCTL, enable);
+
+	mutex_unlock(&h2link->eml_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_enable_offload, SND_SOC_SOF_HDA_MLINK);
+
 #endif
 
 MODULE_LICENSE("Dual BSD/GPL");
-- 
cgit v1.2.3


From 681f27f302ff85ddf3f6e7fd0231059f99c0f26e Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 4 Apr 2023 13:41:27 +0300
Subject: ASoC: SOF: Intel: hda-mlink: add helper to retrieve eml_lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For SoundWire usages, we need to use the global eml_lock to
serialize/protect all accesses to shared registers.  Due to the split
implementation across two subsystems, we need to pass a pointer
around.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20230404104127.5629-19-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/hda-mlink.h       |  5 +++++
 sound/soc/sof/intel/hda-mlink.c | 12 ++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index fee42c432fa3..dbc47af08135 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -52,6 +52,8 @@ int hda_bus_ml_suspend(struct hdac_bus *bus);
 struct hdac_ext_link *hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus);
 struct hdac_ext_link *hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus);
 
+struct mutex *hdac_bus_eml_get_mutex(struct hdac_bus *bus, bool alt, int elid);
+
 int hdac_bus_eml_enable_offload(struct hdac_bus *bus, bool alt, int elid, bool enable);
 
 #else
@@ -153,6 +155,9 @@ hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus) { return NULL; }
 static inline struct hdac_ext_link *
 hdac_bus_eml_dmic_get_hlink(struct hdac_bus *bus) { return NULL; }
 
+static inline struct mutex *
+hdac_bus_eml_get_mutex(struct hdac_bus *bus, bool alt, int elid) { return NULL; }
+
 static inline int
 hdac_bus_eml_enable_offload(struct hdac_bus *bus, bool alt, int elid, bool enable)
 {
diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c
index b9ce07c38bb5..775582ab7494 100644
--- a/sound/soc/sof/intel/hda-mlink.c
+++ b/sound/soc/sof/intel/hda-mlink.c
@@ -800,6 +800,18 @@ int hda_bus_ml_suspend(struct hdac_bus *bus)
 }
 EXPORT_SYMBOL_NS(hda_bus_ml_suspend, SND_SOC_SOF_HDA_MLINK);
 
+struct mutex *hdac_bus_eml_get_mutex(struct hdac_bus *bus, bool alt, int elid)
+{
+	struct hdac_ext2_link *h2link;
+
+	h2link = find_ext2_link(bus, alt, elid);
+	if (!h2link)
+		return NULL;
+
+	return &h2link->eml_lock;
+}
+EXPORT_SYMBOL_NS(hdac_bus_eml_get_mutex, SND_SOC_SOF_HDA_MLINK);
+
 struct hdac_ext_link *hdac_bus_eml_ssp_get_hlink(struct hdac_bus *bus)
 {
 	struct hdac_ext2_link *h2link;
-- 
cgit v1.2.3


From d769ccaf957fe7391f357c0a923de71f594b8a2b Mon Sep 17 00:00:00 2001
From: Kal Conley <kal.conley@dectris.com>
Date: Thu, 6 Apr 2023 01:59:18 +0200
Subject: xsk: Fix unaligned descriptor validation

Make sure unaligned descriptors that straddle the end of the UMEM are
considered invalid. Currently, descriptor validation is broken for
zero-copy mode which only checks descriptors at page granularity.
For example, descriptors in zero-copy mode that overrun the end of the
UMEM but not a page boundary are (incorrectly) considered valid. The
UMEM boundary check needs to happen before the page boundary and
contiguity checks in xp_desc_crosses_non_contig_pg(). Do this check in
xp_unaligned_validate_desc() instead like xp_check_unaligned() already
does.

Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
Signed-off-by: Kal Conley <kal.conley@dectris.com>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/r/20230405235920.7305-2-kal.conley@dectris.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/xsk_buff_pool.h | 9 ++-------
 net/xdp/xsk_queue.h         | 1 +
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 3e952e569418..d318c769b445 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -180,13 +180,8 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
 	if (likely(!cross_pg))
 		return false;
 
-	if (pool->dma_pages_cnt) {
-		return !(pool->dma_pages[addr >> PAGE_SHIFT] &
-			 XSK_NEXT_PG_CONTIG_MASK);
-	}
-
-	/* skb path */
-	return addr + len > pool->addrs_cnt;
+	return pool->dma_pages_cnt &&
+	       !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK);
 }
 
 static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bfb2a7e50c26..66c6f57c9c44 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -162,6 +162,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 		return false;
 
 	if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+	    addr + desc->len > pool->addrs_cnt ||
 	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
 		return false;
 
-- 
cgit v1.2.3


From e999a5c5a19cf3b679f3d93c49ad5f5c04e4806c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 24 Mar 2023 18:01:01 +0000
Subject: fs: Add FGP_WRITEBEGIN

This particular combination of flags is used by most filesystems
in their ->write_begin method, although it does find use in a
few other places.  Before folios, it warranted its own function
(grab_cache_page_write_begin()), but I think that just having specialised
flags is enough.  It certainly helps the few places that have been
converted from grab_cache_page_write_begin() to __filemap_get_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20230324180129.1220691-2-willy@infradead.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/move_extent.c    |  5 ++---
 fs/iomap/buffered-io.c   |  2 +-
 fs/netfs/buffered_read.c |  3 +--
 fs/nfs/file.c            | 12 ++----------
 include/linux/pagemap.h  |  2 ++
 mm/folio-compat.c        |  4 +---
 6 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2de9829aed63..0cb361f0a4fe 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -126,7 +126,6 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
 {
 	struct address_space *mapping[2];
 	unsigned int flags;
-	unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
 
 	BUG_ON(!inode1 || !inode2);
 	if (inode1 < inode2) {
@@ -139,14 +138,14 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
 	}
 
 	flags = memalloc_nofs_save();
-	folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
+	folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping[0]));
 	if (!folio[0]) {
 		memalloc_nofs_restore(flags);
 		return -ENOMEM;
 	}
 
-	folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
+	folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping[1]));
 	memalloc_nofs_restore(flags);
 	if (!folio[1]) {
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6f4c97a6d7e9..10a203515583 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -467,7 +467,7 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  */
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
 {
-	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
+	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
 	struct folio *folio;
 
 	if (iter->flags & IOMAP_NOWAIT)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 7679a68e8193..e3d754a9e1b0 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -341,14 +341,13 @@ int netfs_write_begin(struct netfs_inode *ctx,
 {
 	struct netfs_io_request *rreq;
 	struct folio *folio;
-	unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
 	pgoff_t index = pos >> PAGE_SHIFT;
 	int ret;
 
 	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
 
 retry:
-	folio = __filemap_get_folio(mapping, index, fgp_flags,
+	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
 				    mapping_gfp_mask(mapping));
 	if (!folio)
 		return -ENOMEM;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 893625eacab9..2474cbc30712 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -306,15 +306,6 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
 	return false;
 }
 
-static struct folio *
-nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index)
-{
-	unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
-
-	return __filemap_get_folio(mapping, index, fgp_flags,
-				   mapping_gfp_mask(mapping));
-}
-
 /*
  * This does the "real" work of the write. We must allocate and lock the
  * page to be sent back to the generic routine, which then copies the
@@ -335,7 +326,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file, mapping->host->i_ino, len, (long long) pos);
 
 start:
-	folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT);
+	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
+				    mapping_gfp_mask(mapping));
 	if (!folio)
 		return -ENOMEM;
 	*pagep = &folio->page;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..51b75b89730e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -507,6 +507,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_ENTRY		0x00000080
 #define FGP_STABLE		0x00000100
 
+#define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		int fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index cabcd1de9ecb..a71523a06ccd 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -106,9 +106,7 @@ EXPORT_SYMBOL(pagecache_get_page);
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
 					pgoff_t index)
 {
-	unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
-
-	return pagecache_get_page(mapping, index, fgp_flags,
+	return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping));
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
-- 
cgit v1.2.3


From c76e14dc13bcf89f3b55fd9dcd036a453a822d79 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 24 Mar 2023 18:01:02 +0000
Subject: fscrypt: Add some folio helper functions

fscrypt_is_bounce_folio() is the equivalent of fscrypt_is_bounce_page()
and fscrypt_pagecache_folio() is the equivalent of fscrypt_pagecache_page().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Link: https://lore.kernel.org/r/20230324180129.1220691-3-willy@infradead.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/fscrypt.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index e0a49c3125eb..c068cb487aaa 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -273,6 +273,16 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
 	return (struct page *)page_private(bounce_page);
 }
 
+static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+{
+	return folio->mapping == NULL;
+}
+
+static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+{
+	return bounce_folio->private;
+}
+
 void fscrypt_free_bounce_page(struct page *bounce_page);
 
 /* policy.c */
@@ -445,6 +455,17 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+{
+	return false;
+}
+
+static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+{
+	WARN_ON_ONCE(1);
+	return ERR_PTR(-EINVAL);
+}
+
 static inline void fscrypt_free_bounce_page(struct page *bounce_page)
 {
 }
-- 
cgit v1.2.3


From cd57b77197a434709aec0e7fb8b2e6ec8479aa4e Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 24 Mar 2023 18:01:03 +0000
Subject: ext4: Convert ext4_bio_write_page() to use a folio

Remove several calls to compound_head() and the last caller of
set_page_writeback_keepwrite(), so remove the wrapper too.

Also export bio_add_folio() as this is the first caller from a module.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20230324180129.1220691-4-willy@infradead.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 block/bio.c                |  1 +
 fs/ext4/page-io.c          | 58 +++++++++++++++++++++-------------------------
 include/linux/page-flags.h |  5 ----
 3 files changed, 28 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index fd11614bba4d..043944fd46eb 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1159,6 +1159,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
 		return false;
 	return bio_add_page(bio, &folio->page, len, off) > 0;
 }
+EXPORT_SYMBOL(bio_add_folio);
 
 void __bio_release_pages(struct bio *bio, bool mark_dirty)
 {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 8703fd732abb..7850d2cb2e08 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -409,12 +409,10 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
 
 static void io_submit_add_bh(struct ext4_io_submit *io,
 			     struct inode *inode,
-			     struct page *pagecache_page,
-			     struct page *bounce_page,
+			     struct folio *folio,
+			     struct folio *io_folio,
 			     struct buffer_head *bh)
 {
-	int ret;
-
 	if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
 			   !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
 submit_and_retry:
@@ -422,11 +420,9 @@ submit_and_retry:
 	}
 	if (io->io_bio == NULL)
 		io_submit_init_bio(io, bh);
-	ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page,
-			   bh->b_size, bh_offset(bh));
-	if (ret != bh->b_size)
+	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
 		goto submit_and_retry;
-	wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
 	io->io_next_block++;
 }
 
@@ -434,8 +430,9 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			struct page *page,
 			int len)
 {
-	struct page *bounce_page = NULL;
-	struct inode *inode = page->mapping->host;
+	struct folio *folio = page_folio(page);
+	struct folio *io_folio = folio;
+	struct inode *inode = folio->mapping->host;
 	unsigned block_start;
 	struct buffer_head *bh, *head;
 	int ret = 0;
@@ -443,30 +440,30 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	struct writeback_control *wbc = io->io_wbc;
 	bool keep_towrite = false;
 
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
+	BUG_ON(!folio_test_locked(folio));
+	BUG_ON(folio_test_writeback(folio));
 
-	ClearPageError(page);
+	folio_clear_error(folio);
 
 	/*
 	 * Comments copied from block_write_full_page:
 	 *
-	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * The folio straddles i_size.  It must be zeroed out on each and every
 	 * writepage invocation because it may be mmapped.  "A file is mapped
 	 * in multiples of the page size.  For a file that is not a multiple of
 	 * the page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	if (len < PAGE_SIZE)
-		zero_user_segment(page, len, PAGE_SIZE);
+	if (len < folio_size(folio))
+		folio_zero_segment(folio, len, folio_size(folio));
 	/*
 	 * In the first loop we prepare and mark buffers to submit. We have to
-	 * mark all buffers in the page before submitting so that
-	 * end_page_writeback() cannot be called from ext4_end_bio() when IO
+	 * mark all buffers in the folio before submitting so that
+	 * folio_end_writeback() cannot be called from ext4_end_bio() when IO
 	 * on the first buffer finishes and we are still working on submitting
 	 * the second buffer.
 	 */
-	bh = head = page_buffers(page);
+	bh = head = folio_buffers(folio);
 	do {
 		block_start = bh_offset(bh);
 		if (block_start >= len) {
@@ -481,14 +478,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 				clear_buffer_dirty(bh);
 			/*
 			 * Keeping dirty some buffer we cannot write? Make sure
-			 * to redirty the page and keep TOWRITE tag so that
-			 * racing WB_SYNC_ALL writeback does not skip the page.
+			 * to redirty the folio and keep TOWRITE tag so that
+			 * racing WB_SYNC_ALL writeback does not skip the folio.
 			 * This happens e.g. when doing writeout for
 			 * transaction commit.
 			 */
 			if (buffer_dirty(bh)) {
-				if (!PageDirty(page))
-					redirty_page_for_writepage(wbc, page);
+				if (!folio_test_dirty(folio))
+					folio_redirty_for_writepage(wbc, folio);
 				keep_towrite = true;
 			}
 			continue;
@@ -500,11 +497,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		nr_to_submit++;
 	} while ((bh = bh->b_this_page) != head);
 
-	/* Nothing to submit? Just unlock the page... */
+	/* Nothing to submit? Just unlock the folio... */
 	if (!nr_to_submit)
 		return 0;
 
-	bh = head = page_buffers(page);
+	bh = head = folio_buffers(folio);
 
 	/*
 	 * If any blocks are being written to an encrypted file, encrypt them
@@ -516,6 +513,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
 		gfp_t gfp_flags = GFP_NOFS;
 		unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+		struct page *bounce_page;
 
 		/*
 		 * Since bounce page allocation uses a mempool, we can only use
@@ -542,7 +540,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			}
 
 			printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
-			redirty_page_for_writepage(wbc, page);
+			folio_redirty_for_writepage(wbc, folio);
 			do {
 				if (buffer_async_write(bh)) {
 					clear_buffer_async_write(bh);
@@ -553,18 +551,16 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 
 			return ret;
 		}
+		io_folio = page_folio(bounce_page);
 	}
 
-	if (keep_towrite)
-		set_page_writeback_keepwrite(page);
-	else
-		set_page_writeback(page);
+	__folio_start_writeback(folio, keep_towrite);
 
 	/* Now submit buffers to write */
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		io_submit_add_bh(io, inode, page, bounce_page, bh);
+		io_submit_add_bh(io, inode, folio, io_folio, bh);
 	} while ((bh = bh->b_this_page) != head);
 
 	return 0;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a7e3a3405520..2e00cc14a81c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -766,11 +766,6 @@ bool set_page_writeback(struct page *page);
 #define folio_start_writeback_keepwrite(folio)	\
 	__folio_start_writeback(folio, true)
 
-static inline void set_page_writeback_keepwrite(struct page *page)
-{
-	folio_start_writeback_keepwrite(page_folio(page));
-}
-
 static inline bool test_set_page_writeback(struct page *page)
 {
 	return set_page_writeback(page);
-- 
cgit v1.2.3


From 86f240a2f7e953840f3b837bc23e39d515a653ed Mon Sep 17 00:00:00 2001
From: Jessica Clarke <jrtc27@jrtc27.com>
Date: Wed, 5 Apr 2023 15:32:06 +0200
Subject: ACPICA: Headers: Delete bogus node_array array of pointers from AEST
 table

ACPICA commit f0c4a06f1dfc4886d4e0c2aa30bc57b10c5a8c53

Like many tables, this is a header followed by multiple subtables of
varying self-identifying types, and ACPICA does not normally add a field
for the subtables, instead relying on pointer arithmetic past the end of
the first header struct, since indexing a flexible array member is
meaningless for variable-length entries. If we really wanted a field for
this, we could use a u8 flexible array member, but it contradicts the
current style. Using void *, however, is categorically wrong, as ACPI
tables never contain native C-language pointers.

Link: https://github.com/acpica/acpica/commit/f0c4a06f
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index b2973dbe37ee..2e7e0d5674f6 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -82,7 +82,6 @@
 
 struct acpi_table_aest {
 	struct acpi_table_header header;
-	void *node_array[];
 };
 
 /* Common Subtable header - one per Node Structure (Subtable) */
-- 
cgit v1.2.3


From 377421fcfb9716cbaef57dbc3d37ddeccd41f9fe Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Date: Wed, 5 Apr 2023 15:33:07 +0200
Subject: ACPICA: ACPI 6.5: MADT: add support for trace buffer extension in
 GICC

ACPICA commit 1363e35dc6976143d118588b5124d72017365588

Link: https://github.com/acpica/acpica/commit/1363e35d
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 2e7e0d5674f6..65f9e834e921 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1012,7 +1012,7 @@ struct acpi_madt_local_x2apic_nmi {
 	u8 reserved[3];		/* reserved - must be zero */
 };
 
-/* 11: Generic interrupt - GICC (ACPI 5.0 + ACPI 6.0 + ACPI 6.3 changes) */
+/* 11: Generic interrupt - GICC (ACPI 5.0 + ACPI 6.0 + ACPI 6.3 + ACPI 6.5 changes) */
 
 struct acpi_madt_generic_interrupt {
 	struct acpi_subtable_header header;
@@ -1032,6 +1032,7 @@ struct acpi_madt_generic_interrupt {
 	u8 efficiency_class;
 	u8 reserved2[1];
 	u16 spe_interrupt;	/* ACPI 6.3 */
+	u16 trbe_interrupt;	/* ACPI 6.5 */
 };
 
 /* Masks for Flags field above */
-- 
cgit v1.2.3


From 9737ff46f7dc16753d7f276c476d0211d1ac33c1 Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pedro.falcato@gmail.com>
Date: Wed, 5 Apr 2023 15:35:31 +0200
Subject: ACPICA: acpisrc: Add missing tables to astable

ACPICA commit d4a2c93198cdd9c6f4a83798345851fee96d5ca5

Also renames struct acpi_data_table_mapping's struct to
struct acpi_data_table_mapping, just so conversion goes smoothly.

Link: https://github.com/acpica/acpica/commit/d4a2c931
Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/evrgnini.c | 4 ++--
 drivers/acpi/acpica/exregion.c | 4 ++--
 include/acpi/actypes.h         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpica/evrgnini.c b/drivers/acpi/acpica/evrgnini.c
index ca4ba6b351fe..792d1a5f0052 100644
--- a/drivers/acpi/acpica/evrgnini.c
+++ b/drivers/acpi/acpica/evrgnini.c
@@ -430,7 +430,7 @@ acpi_ev_data_table_region_setup(acpi_handle handle,
 {
 	union acpi_operand_object *region_desc =
 	    (union acpi_operand_object *)handle;
-	struct acpi_data_table_space_context *local_region_context;
+	struct acpi_data_table_mapping *local_region_context;
 
 	ACPI_FUNCTION_TRACE(ev_data_table_region_setup);
 
@@ -445,7 +445,7 @@ acpi_ev_data_table_region_setup(acpi_handle handle,
 	/* Create a new context */
 
 	local_region_context =
-	    ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_data_table_space_context));
+	    ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_data_table_mapping));
 	if (!(local_region_context)) {
 		return_ACPI_STATUS(AE_NO_MEMORY);
 	}
diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c
index 4ff35852c0b3..6d4a4cc14850 100644
--- a/drivers/acpi/acpica/exregion.c
+++ b/drivers/acpi/acpica/exregion.c
@@ -509,12 +509,12 @@ acpi_ex_data_table_space_handler(u32 function,
 				 u64 *value,
 				 void *handler_context, void *region_context)
 {
-	struct acpi_data_table_space_context *mapping;
+	struct acpi_data_table_mapping *mapping;
 	char *pointer;
 
 	ACPI_FUNCTION_TRACE(ex_data_table_space_handler);
 
-	mapping = (struct acpi_data_table_space_context *) region_context;
+	mapping = (struct acpi_data_table_mapping *) region_context;
 	pointer = ACPI_CAST_PTR(char, mapping->pointer) +
 	    (address - ACPI_PTR_TO_PHYSADDR(mapping->pointer));
 
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 95e4f56f9754..46878cbb7d39 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -1240,7 +1240,7 @@ struct acpi_mem_space_context {
 	struct acpi_mem_mapping *first_mm;
 };
 
-struct acpi_data_table_space_context {
+struct acpi_data_table_mapping {
 	void *pointer;
 };
 
-- 
cgit v1.2.3


From a47a0c2a1fe1e4975c7214e1efeabc68171dc5c5 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Wed, 5 Apr 2023 15:36:09 +0200
Subject: ACPICA: Add support for 64 bit loong_arch compilation

ACPICA commit d809b69cf43c632c8fe5b42372a891216fdd9223

Add 64 bit loong_arch architecture by defining ACPI_MACHINE_WIDTH to 64.
Useful for acpica tools and incorporating ACPICA into the Firmware Test
Suite.

Link: https://github.com/acpica/acpica/commit/d809b69c
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/platform/aclinux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index a5550dd4d507..14ac657a7ded 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -199,7 +199,7 @@
 
 #if defined(__ia64__)    || (defined(__x86_64__) && !defined(__ILP32__)) ||\
 	defined(__aarch64__) || defined(__PPC64__) ||\
-	defined(__s390x__) ||\
+	defined(__s390x__)   || defined(__loongarch__) ||\
 	(defined(__riscv) && (defined(__LP64__) || defined(_LP64)))
 #define ACPI_MACHINE_WIDTH          64
 #define COMPILER_DEPENDENT_INT64    long
-- 
cgit v1.2.3


From f5325cb1041479b2ae7d78173dfd51092c20c16b Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Date: Wed, 5 Apr 2023 15:36:51 +0200
Subject: ACPICA: Add support for ASPT table in disassembler

ACPICA commit 6771f8b758299bd383bab145d5fd36ec229b2d70

ASPT is the AMD Secure Processor table, found in Hyper-V VMs when SNP
isolation is exposed to the VM and in some high-end AMD servers. This
commit adds support for rev 1 of the ASPT spec in the disassembler.

Link: https://github.com/acpica/acpica/commit/6771f8b7
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl1.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 81b9e794424d..b5e011da5271 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -26,6 +26,7 @@
  */
 #define ACPI_SIG_AEST           "AEST"	/* Arm Error Source Table */
 #define ACPI_SIG_ASF            "ASF!"	/* Alert Standard Format table */
+#define ACPI_SIG_ASPT           "ASPT"	/* AMD Secure Processor Table */
 #define ACPI_SIG_BERT           "BERT"	/* Boot Error Record Table */
 #define ACPI_SIG_BGRT           "BGRT"	/* Boot Graphics Resource Table */
 #define ACPI_SIG_BOOT           "BOOT"	/* Simple Boot Flag Table */
@@ -109,6 +110,51 @@ struct acpi_whea_header {
 	u64 mask;		/* Bitmask required for this register instruction */
 };
 
+/* https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/acpitabl/ns-acpitabl-aspt_table */
+#define ASPT_REVISION_ID 0x01
+struct acpi_table_aspt {
+	struct acpi_table_header header;
+	u32 num_entries;
+};
+
+struct acpi_aspt_header {
+	u16 type;
+	u16 length;
+};
+
+enum acpi_aspt_type {
+	ACPI_ASPT_TYPE_GLOBAL_REGS = 0,
+	ACPI_ASPT_TYPE_SEV_MBOX_REGS = 1,
+	ACPI_ASPT_TYPE_ACPI_MBOX_REGS = 2,
+};
+
+/* 0: ASPT Global Registers */
+struct acpi_aspt_global_regs {
+	struct acpi_aspt_header header;
+	u32 reserved;
+	u64 feature_reg_addr;
+	u64 irq_en_reg_addr;
+	u64 irq_st_reg_addr;
+};
+
+/* 1: ASPT SEV Mailbox Registers */
+struct acpi_aspt_sev_mbox_regs {
+	struct acpi_aspt_header header;
+	u8 mbox_irq_id;
+	u8 reserved[3];
+	u64 cmd_resp_reg_addr;
+	u64 cmd_buf_lo_reg_addr;
+	u64 cmd_buf_hi_reg_addr;
+};
+
+/* 2: ASPT ACPI Mailbox Registers */
+struct acpi_aspt_acpi_mbox_regs {
+	struct acpi_aspt_header header;
+	u32 reserved1;
+	u64 cmd_resp_reg_addr;
+	u64 reserved2[2];
+};
+
 /*******************************************************************************
  *
  * ASF - Alert Standard Format table (Signature "ASF!")
-- 
cgit v1.2.3


From 47920aae34e295f4ffbeac0b10698ceda52eec99 Mon Sep 17 00:00:00 2001
From: Hesham Almatary <hesham.almatary@huawei.com>
Date: Wed, 5 Apr 2023 15:37:35 +0200
Subject: ACPICA: Add support for Arm's MPAM ACPI table version 2

ACPICA commit 005e24bcaa6e4c7db327b4f81fb63b2715aac7e6

Complies with ACPI for Memory System Resource Partitioning and
Monitoring 2.0 [1]. Document number: DEN0065, as of December 2022.

Support for all types of MPAM resources. No support yet for:
1) MPAM PCC Interface Type
2) The optional Resource-specific data per MSC node, introduced in v2 of the
MPAM ACPI spec.

[1] https://developer.arm.com/documentation/den0065/latest

Link: https://github.com/acpica/acpica/commit/005e24bc
Signed-off-by: Hesham Almatary <hesham.almatary@huawei.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 65f9e834e921..b082175906ba 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -35,6 +35,7 @@
 #define ACPI_SIG_MADT           "APIC"	/* Multiple APIC Description Table */
 #define ACPI_SIG_MCFG           "MCFG"	/* PCI Memory Mapped Configuration table */
 #define ACPI_SIG_MCHI           "MCHI"	/* Management Controller Host Interface table */
+#define ACPI_SIG_MPAM           "MPAM"	/* Memory System Resource Partitioning and Monitoring Table */
 #define ACPI_SIG_MPST           "MPST"	/* Memory Power State Table */
 #define ACPI_SIG_MSDM           "MSDM"	/* Microsoft Data Management Table */
 #define ACPI_SIG_NFIT           "NFIT"	/* NVDIMM Firmware Interface Table */
@@ -1332,6 +1333,121 @@ struct acpi_table_mchi {
 	u8 pci_function;
 };
 
+/*******************************************************************************
+ *
+ * MPAM - Memory System Resource Partitioning and Monitoring
+ *
+ * Conforms to "ACPI for Memory System Resource Partitioning and Monitoring 2.0"
+ * Document number: ARM DEN 0065, December, 2022.
+ *
+ ******************************************************************************/
+
+/* MPAM RIS locator types. Table 11, Location types */
+enum acpi_mpam_locator_type {
+	ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE = 0,
+	ACPI_MPAM_LOCATION_TYPE_MEMORY = 1,
+	ACPI_MPAM_LOCATION_TYPE_SMMU = 2,
+	ACPI_MPAM_LOCATION_TYPE_MEMORY_CACHE = 3,
+	ACPI_MPAM_LOCATION_TYPE_ACPI_DEVICE = 4,
+	ACPI_MPAM_LOCATION_TYPE_INTERCONNECT = 5,
+	ACPI_MPAM_LOCATION_TYPE_UNKNOWN = 0xFF
+};
+
+/* MPAM Functional dependency descriptor. Table 10 */
+struct acpi_mpam_func_deps {
+	u32 producer;
+	u32 reserved;
+};
+
+/* MPAM Processor cache locator descriptor. Table 13 */
+struct acpi_mpam_resource_cache_locator {
+	u64 cache_reference;
+	u32 reserved;
+};
+
+/* MPAM Memory locator descriptor. Table 14 */
+struct acpi_mpam_resource_memory_locator {
+	u64 proximity_domain;
+	u32 reserved;
+};
+
+/* MPAM SMMU locator descriptor. Table 15 */
+struct acpi_mpam_resource_smmu_locator {
+	u64 smmu_interface;
+	u32 reserved;
+};
+
+/* MPAM Memory-side cache locator descriptor. Table 16 */
+struct acpi_mpam_resource_memcache_locator {
+	u8 reserved[7];
+	u8 level;
+	u32 reference;
+};
+
+/* MPAM ACPI device locator descriptor. Table 17 */
+struct acpi_mpam_resource_acpi_locator {
+	u64 acpi_hw_id;
+	u32 acpi_unique_id;
+};
+
+/* MPAM Interconnect locator descriptor. Table 18 */
+struct acpi_mpam_resource_interconnect_locator {
+	u64 inter_connect_desc_tbl_off;
+	u32 reserved;
+};
+
+/* MPAM Locator structure. Table 12 */
+struct acpi_mpam_resource_generic_locator {
+	u64 descriptor1;
+	u32 descriptor2;
+};
+
+union acpi_mpam_resource_locator {
+	struct acpi_mpam_resource_cache_locator cache_locator;
+	struct acpi_mpam_resource_memory_locator memory_locator;
+	struct acpi_mpam_resource_smmu_locator smmu_locator;
+	struct acpi_mpam_resource_memcache_locator mem_cache_locator;
+	struct acpi_mpam_resource_acpi_locator acpi_locator;
+	struct acpi_mpam_resource_interconnect_locator interconnect_ifc_locator;
+	struct acpi_mpam_resource_generic_locator generic_locator;
+};
+
+/* Memory System Component Resource Node Structure Table 9 */
+struct acpi_mpam_resource_node {
+	u32 identifier;
+	u8 ris_index;
+	u16 reserved1;
+	u8 locator_type;
+	union acpi_mpam_resource_locator locator;
+	u32 num_functional_deps;
+};
+
+/* Memory System Component (MSC) Node Structure. Table 4 */
+struct acpi_mpam_msc_node {
+	u16 length;
+	u8 interface_type;
+	u8 reserved;
+	u32 identifier;
+	u64 base_address;
+	u32 mmio_size;
+	u32 overflow_interrupt;
+	u32 overflow_interrupt_flags;
+	u32 reserved1;
+	u32 overflow_interrupt_affinity;
+	u32 error_interrupt;
+	u32 error_interrupt_flags;
+	u32 reserved2;
+	u32 error_interrupt_affinity;
+	u32 max_nrdy_usec;
+	u64 hardware_id_linked_device;
+	u32 instance_id_linked_device;
+	u32 num_resouce_nodes;
+};
+
+struct acpi_table_mpam {
+	struct acpi_table_header header;	/* Common ACPI table header */
+};
+
 /*******************************************************************************
  *
  * MPST - Memory Power State Table (ACPI 5.0)
-- 
cgit v1.2.3


From 612c29328466bdc1454ce76959fc03a1e2f7087a Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 5 Apr 2023 15:38:21 +0200
Subject: ACPICA: Update all copyrights/signons to 2023

ACPICA commit 25bddd1824b1e450829468a64bbdcb38074ba3d2

Copyright updates to 2023.

Link: https://github.com/acpica/acpica/commit/25bddd18
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/acapps.h                             | 2 +-
 drivers/acpi/acpica/accommon.h                           | 2 +-
 drivers/acpi/acpica/acconvert.h                          | 2 +-
 drivers/acpi/acpica/acdebug.h                            | 2 +-
 drivers/acpi/acpica/acdispat.h                           | 2 +-
 drivers/acpi/acpica/acevents.h                           | 2 +-
 drivers/acpi/acpica/acglobal.h                           | 2 +-
 drivers/acpi/acpica/achware.h                            | 2 +-
 drivers/acpi/acpica/acinterp.h                           | 2 +-
 drivers/acpi/acpica/aclocal.h                            | 2 +-
 drivers/acpi/acpica/acmacros.h                           | 2 +-
 drivers/acpi/acpica/acnamesp.h                           | 2 +-
 drivers/acpi/acpica/acobject.h                           | 2 +-
 drivers/acpi/acpica/acopcode.h                           | 2 +-
 drivers/acpi/acpica/acparser.h                           | 2 +-
 drivers/acpi/acpica/acpredef.h                           | 2 +-
 drivers/acpi/acpica/acresrc.h                            | 2 +-
 drivers/acpi/acpica/acstruct.h                           | 2 +-
 drivers/acpi/acpica/actables.h                           | 2 +-
 drivers/acpi/acpica/acutils.h                            | 2 +-
 drivers/acpi/acpica/amlcode.h                            | 2 +-
 drivers/acpi/acpica/amlresrc.h                           | 2 +-
 drivers/acpi/acpica/dbhistry.c                           | 2 +-
 drivers/acpi/acpica/dsargs.c                             | 2 +-
 drivers/acpi/acpica/dscontrol.c                          | 2 +-
 drivers/acpi/acpica/dsdebug.c                            | 2 +-
 drivers/acpi/acpica/dsfield.c                            | 2 +-
 drivers/acpi/acpica/dsinit.c                             | 2 +-
 drivers/acpi/acpica/dsmethod.c                           | 2 +-
 drivers/acpi/acpica/dsobject.c                           | 2 +-
 drivers/acpi/acpica/dsopcode.c                           | 2 +-
 drivers/acpi/acpica/dspkginit.c                          | 2 +-
 drivers/acpi/acpica/dswexec.c                            | 2 +-
 drivers/acpi/acpica/dswload.c                            | 2 +-
 drivers/acpi/acpica/dswload2.c                           | 2 +-
 drivers/acpi/acpica/dswscope.c                           | 2 +-
 drivers/acpi/acpica/dswstate.c                           | 2 +-
 drivers/acpi/acpica/evevent.c                            | 2 +-
 drivers/acpi/acpica/evglock.c                            | 2 +-
 drivers/acpi/acpica/evgpe.c                              | 2 +-
 drivers/acpi/acpica/evgpeblk.c                           | 2 +-
 drivers/acpi/acpica/evgpeinit.c                          | 2 +-
 drivers/acpi/acpica/evgpeutil.c                          | 2 +-
 drivers/acpi/acpica/evhandler.c                          | 2 +-
 drivers/acpi/acpica/evmisc.c                             | 2 +-
 drivers/acpi/acpica/evregion.c                           | 2 +-
 drivers/acpi/acpica/evrgnini.c                           | 2 +-
 drivers/acpi/acpica/evxface.c                            | 2 +-
 drivers/acpi/acpica/evxfevnt.c                           | 2 +-
 drivers/acpi/acpica/evxfgpe.c                            | 2 +-
 drivers/acpi/acpica/evxfregn.c                           | 2 +-
 drivers/acpi/acpica/exconcat.c                           | 2 +-
 drivers/acpi/acpica/exconfig.c                           | 2 +-
 drivers/acpi/acpica/exconvrt.c                           | 2 +-
 drivers/acpi/acpica/excreate.c                           | 2 +-
 drivers/acpi/acpica/exdebug.c                            | 2 +-
 drivers/acpi/acpica/exdump.c                             | 2 +-
 drivers/acpi/acpica/exfield.c                            | 2 +-
 drivers/acpi/acpica/exfldio.c                            | 2 +-
 drivers/acpi/acpica/exmisc.c                             | 2 +-
 drivers/acpi/acpica/exmutex.c                            | 2 +-
 drivers/acpi/acpica/exnames.c                            | 2 +-
 drivers/acpi/acpica/exoparg1.c                           | 2 +-
 drivers/acpi/acpica/exoparg2.c                           | 2 +-
 drivers/acpi/acpica/exoparg3.c                           | 2 +-
 drivers/acpi/acpica/exoparg6.c                           | 2 +-
 drivers/acpi/acpica/exprep.c                             | 2 +-
 drivers/acpi/acpica/exregion.c                           | 2 +-
 drivers/acpi/acpica/exresnte.c                           | 2 +-
 drivers/acpi/acpica/exresolv.c                           | 2 +-
 drivers/acpi/acpica/exresop.c                            | 2 +-
 drivers/acpi/acpica/exserial.c                           | 2 +-
 drivers/acpi/acpica/exstore.c                            | 2 +-
 drivers/acpi/acpica/exstoren.c                           | 2 +-
 drivers/acpi/acpica/exstorob.c                           | 2 +-
 drivers/acpi/acpica/exsystem.c                           | 2 +-
 drivers/acpi/acpica/extrace.c                            | 2 +-
 drivers/acpi/acpica/exutils.c                            | 2 +-
 drivers/acpi/acpica/hwacpi.c                             | 2 +-
 drivers/acpi/acpica/hwesleep.c                           | 2 +-
 drivers/acpi/acpica/hwgpe.c                              | 2 +-
 drivers/acpi/acpica/hwsleep.c                            | 2 +-
 drivers/acpi/acpica/hwtimer.c                            | 2 +-
 drivers/acpi/acpica/hwvalid.c                            | 2 +-
 drivers/acpi/acpica/hwxface.c                            | 2 +-
 drivers/acpi/acpica/hwxfsleep.c                          | 2 +-
 drivers/acpi/acpica/nsarguments.c                        | 2 +-
 drivers/acpi/acpica/nsconvert.c                          | 2 +-
 drivers/acpi/acpica/nsdump.c                             | 2 +-
 drivers/acpi/acpica/nsdumpdv.c                           | 2 +-
 drivers/acpi/acpica/nsinit.c                             | 2 +-
 drivers/acpi/acpica/nsload.c                             | 2 +-
 drivers/acpi/acpica/nsparse.c                            | 2 +-
 drivers/acpi/acpica/nspredef.c                           | 2 +-
 drivers/acpi/acpica/nsprepkg.c                           | 2 +-
 drivers/acpi/acpica/nsrepair.c                           | 2 +-
 drivers/acpi/acpica/nsrepair2.c                          | 2 +-
 drivers/acpi/acpica/nsutils.c                            | 2 +-
 drivers/acpi/acpica/nswalk.c                             | 2 +-
 drivers/acpi/acpica/nsxfname.c                           | 2 +-
 drivers/acpi/acpica/psargs.c                             | 2 +-
 drivers/acpi/acpica/psloop.c                             | 2 +-
 drivers/acpi/acpica/psobject.c                           | 2 +-
 drivers/acpi/acpica/psopcode.c                           | 2 +-
 drivers/acpi/acpica/psopinfo.c                           | 2 +-
 drivers/acpi/acpica/psparse.c                            | 2 +-
 drivers/acpi/acpica/psscope.c                            | 2 +-
 drivers/acpi/acpica/pstree.c                             | 2 +-
 drivers/acpi/acpica/psutils.c                            | 2 +-
 drivers/acpi/acpica/pswalk.c                             | 2 +-
 drivers/acpi/acpica/psxface.c                            | 2 +-
 drivers/acpi/acpica/tbdata.c                             | 2 +-
 drivers/acpi/acpica/tbfadt.c                             | 2 +-
 drivers/acpi/acpica/tbfind.c                             | 2 +-
 drivers/acpi/acpica/tbinstal.c                           | 2 +-
 drivers/acpi/acpica/tbprint.c                            | 2 +-
 drivers/acpi/acpica/tbutils.c                            | 2 +-
 drivers/acpi/acpica/tbxface.c                            | 2 +-
 drivers/acpi/acpica/tbxfload.c                           | 2 +-
 drivers/acpi/acpica/tbxfroot.c                           | 2 +-
 drivers/acpi/acpica/utaddress.c                          | 2 +-
 drivers/acpi/acpica/utalloc.c                            | 2 +-
 drivers/acpi/acpica/utascii.c                            | 2 +-
 drivers/acpi/acpica/utbuffer.c                           | 2 +-
 drivers/acpi/acpica/utcache.c                            | 2 +-
 drivers/acpi/acpica/utcksum.c                            | 2 +-
 drivers/acpi/acpica/utcopy.c                             | 2 +-
 drivers/acpi/acpica/utdebug.c                            | 2 +-
 drivers/acpi/acpica/utdecode.c                           | 2 +-
 drivers/acpi/acpica/uteval.c                             | 2 +-
 drivers/acpi/acpica/utglobal.c                           | 2 +-
 drivers/acpi/acpica/uthex.c                              | 2 +-
 drivers/acpi/acpica/utids.c                              | 2 +-
 drivers/acpi/acpica/utinit.c                             | 2 +-
 drivers/acpi/acpica/utlock.c                             | 2 +-
 drivers/acpi/acpica/utobject.c                           | 2 +-
 drivers/acpi/acpica/utosi.c                              | 2 +-
 drivers/acpi/acpica/utpredef.c                           | 2 +-
 drivers/acpi/acpica/utprint.c                            | 2 +-
 drivers/acpi/acpica/uttrack.c                            | 2 +-
 drivers/acpi/acpica/utuuid.c                             | 2 +-
 drivers/acpi/acpica/utxface.c                            | 2 +-
 drivers/acpi/acpica/utxfinit.c                           | 2 +-
 include/acpi/acbuffer.h                                  | 2 +-
 include/acpi/acconfig.h                                  | 2 +-
 include/acpi/acexcep.h                                   | 2 +-
 include/acpi/acnames.h                                   | 2 +-
 include/acpi/acoutput.h                                  | 2 +-
 include/acpi/acpi.h                                      | 2 +-
 include/acpi/acpiosxf.h                                  | 2 +-
 include/acpi/acpixf.h                                    | 2 +-
 include/acpi/acrestyp.h                                  | 2 +-
 include/acpi/actbl.h                                     | 2 +-
 include/acpi/actbl1.h                                    | 2 +-
 include/acpi/actbl2.h                                    | 2 +-
 include/acpi/actbl3.h                                    | 2 +-
 include/acpi/actypes.h                                   | 2 +-
 include/acpi/acuuid.h                                    | 2 +-
 include/acpi/platform/acenv.h                            | 2 +-
 include/acpi/platform/acenvex.h                          | 2 +-
 include/acpi/platform/acgcc.h                            | 2 +-
 include/acpi/platform/acgccex.h                          | 2 +-
 include/acpi/platform/aclinux.h                          | 2 +-
 include/acpi/platform/aclinuxex.h                        | 2 +-
 tools/power/acpi/common/cmfsize.c                        | 2 +-
 tools/power/acpi/common/getopt.c                         | 2 +-
 tools/power/acpi/os_specific/service_layers/oslinuxtbl.c | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixdir.c  | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixmap.c  | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixxf.c   | 2 +-
 tools/power/acpi/tools/acpidump/acpidump.h               | 2 +-
 tools/power/acpi/tools/acpidump/apdump.c                 | 2 +-
 tools/power/acpi/tools/acpidump/apfiles.c                | 2 +-
 tools/power/acpi/tools/acpidump/apmain.c                 | 2 +-
 174 files changed, 174 insertions(+), 174 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpica/acapps.h b/drivers/acpi/acpica/acapps.h
index 0a50b4912515..9d4cbd956627 100644
--- a/drivers/acpi/acpica/acapps.h
+++ b/drivers/acpi/acpica/acapps.h
@@ -3,7 +3,7 @@
  *
  * Module Name: acapps - common include for ACPI applications/tools
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/accommon.h b/drivers/acpi/acpica/accommon.h
index bb329e34ee7d..4536dc9d3979 100644
--- a/drivers/acpi/acpica/accommon.h
+++ b/drivers/acpi/acpica/accommon.h
@@ -3,7 +3,7 @@
  *
  * Name: accommon.h - Common include files for generation of ACPICA source
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acconvert.h b/drivers/acpi/acpica/acconvert.h
index 476d21e67767..c6ba6a36cfb5 100644
--- a/drivers/acpi/acpica/acconvert.h
+++ b/drivers/acpi/acpica/acconvert.h
@@ -3,7 +3,7 @@
  *
  * Module Name: acapps - common include for ACPI applications/tools
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h
index d629716aa5b2..22f1f7a9e5a3 100644
--- a/drivers/acpi/acpica/acdebug.h
+++ b/drivers/acpi/acpica/acdebug.h
@@ -3,7 +3,7 @@
  *
  * Name: acdebug.h - ACPI/AML debugger
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acdispat.h b/drivers/acpi/acpica/acdispat.h
index fe2c3630a38d..73eecbf62f06 100644
--- a/drivers/acpi/acpica/acdispat.h
+++ b/drivers/acpi/acpica/acdispat.h
@@ -3,7 +3,7 @@
  *
  * Name: acdispat.h - dispatcher (parser to interpreter interface)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h
index 922f559a3e59..ddd072cbc738 100644
--- a/drivers/acpi/acpica/acevents.h
+++ b/drivers/acpi/acpica/acevents.h
@@ -3,7 +3,7 @@
  *
  * Name: acevents.h - Event subcomponent prototypes and defines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 777457a58340..778241173ed4 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -3,7 +3,7 @@
  *
  * Name: acglobal.h - Declarations for global variables
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h
index 6f2787506b50..ebf8fd373cf7 100644
--- a/drivers/acpi/acpica/achware.h
+++ b/drivers/acpi/acpica/achware.h
@@ -3,7 +3,7 @@
  *
  * Name: achware.h -- hardware specific interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acinterp.h b/drivers/acpi/acpica/acinterp.h
index 6bdf133a2767..955114c926bd 100644
--- a/drivers/acpi/acpica/acinterp.h
+++ b/drivers/acpi/acpica/acinterp.h
@@ -3,7 +3,7 @@
  *
  * Name: acinterp.h - Interpreter subcomponent prototypes and defines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h
index 901b1543b869..2b876dac7558 100644
--- a/drivers/acpi/acpica/aclocal.h
+++ b/drivers/acpi/acpica/aclocal.h
@@ -3,7 +3,7 @@
  *
  * Name: aclocal.h - Internal data types used across the ACPI subsystem
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acmacros.h b/drivers/acpi/acpica/acmacros.h
index 2f3e609df47d..de83dd22292b 100644
--- a/drivers/acpi/acpica/acmacros.h
+++ b/drivers/acpi/acpica/acmacros.h
@@ -3,7 +3,7 @@
  *
  * Name: acmacros.h - C macros for the entire subsystem.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h
index 7b27b9cc5916..9448bc026b9b 100644
--- a/drivers/acpi/acpica/acnamesp.h
+++ b/drivers/acpi/acpica/acnamesp.h
@@ -3,7 +3,7 @@
  *
  * Name: acnamesp.h - Namespace subcomponent prototypes and defines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h
index 6af5dc995085..1bdfeee5d7c5 100644
--- a/drivers/acpi/acpica/acobject.h
+++ b/drivers/acpi/acpica/acobject.h
@@ -3,7 +3,7 @@
  *
  * Name: acobject.h - Definition of union acpi_operand_object  (Internal object only)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acopcode.h b/drivers/acpi/acpica/acopcode.h
index a224926bd9c8..da96d80e6b3a 100644
--- a/drivers/acpi/acpica/acopcode.h
+++ b/drivers/acpi/acpica/acopcode.h
@@ -3,7 +3,7 @@
  *
  * Name: acopcode.h - AML opcode information for the AML parser and interpreter
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acparser.h b/drivers/acpi/acpica/acparser.h
index 4511c2bd8bc3..6dad786a382c 100644
--- a/drivers/acpi/acpica/acparser.h
+++ b/drivers/acpi/acpica/acparser.h
@@ -3,7 +3,7 @@
  *
  * Module Name: acparser.h - AML Parser subcomponent prototypes and defines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h
index f7d65a20026b..e64aabe3d33a 100644
--- a/drivers/acpi/acpica/acpredef.h
+++ b/drivers/acpi/acpica/acpredef.h
@@ -3,7 +3,7 @@
  *
  * Name: acpredef - Information table for ACPI predefined methods and objects
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acresrc.h b/drivers/acpi/acpica/acresrc.h
index f7749c63d277..c2e6c8455587 100644
--- a/drivers/acpi/acpica/acresrc.h
+++ b/drivers/acpi/acpica/acresrc.h
@@ -3,7 +3,7 @@
  *
  * Name: acresrc.h - Resource Manager function prototypes
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acstruct.h b/drivers/acpi/acpica/acstruct.h
index b859de96a1e4..f8fee94ba708 100644
--- a/drivers/acpi/acpica/acstruct.h
+++ b/drivers/acpi/acpica/acstruct.h
@@ -3,7 +3,7 @@
  *
  * Name: acstruct.h - Internal structs
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h
index 1c29325e4c7f..b6ae979b01b6 100644
--- a/drivers/acpi/acpica/actables.h
+++ b/drivers/acpi/acpica/actables.h
@@ -3,7 +3,7 @@
  *
  * Name: actables.h - ACPI table management
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h
index 71175b664f49..dd060844189e 100644
--- a/drivers/acpi/acpica/acutils.h
+++ b/drivers/acpi/acpica/acutils.h
@@ -3,7 +3,7 @@
  *
  * Name: acutils.h -- prototypes for the common (subsystem-wide) procedures
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/amlcode.h b/drivers/acpi/acpica/amlcode.h
index 62a7ec277513..effe52b40dce 100644
--- a/drivers/acpi/acpica/amlcode.h
+++ b/drivers/acpi/acpica/amlcode.h
@@ -5,7 +5,7 @@
  *                   Declarations and definitions contained herein are derived
  *                   directly from the ACPI specification.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h
index b31779ce204a..89093111bfd2 100644
--- a/drivers/acpi/acpica/amlresrc.h
+++ b/drivers/acpi/acpica/amlresrc.h
@@ -3,7 +3,7 @@
  *
  * Module Name: amlresrc.h - AML resource descriptors
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dbhistry.c b/drivers/acpi/acpica/dbhistry.c
index 105e6ceaa887..e874c1dddefa 100644
--- a/drivers/acpi/acpica/dbhistry.c
+++ b/drivers/acpi/acpica/dbhistry.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dbhistry - debugger HISTORY command
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsargs.c b/drivers/acpi/acpica/dsargs.c
index 2963d1579c05..4354c175e12e 100644
--- a/drivers/acpi/acpica/dsargs.c
+++ b/drivers/acpi/acpica/dsargs.c
@@ -4,7 +4,7 @@
  * Module Name: dsargs - Support for execution of dynamic arguments for static
  *                       objects (regions, fields, buffer fields, etc.)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dscontrol.c b/drivers/acpi/acpica/dscontrol.c
index 8492619149d1..80c69af06948 100644
--- a/drivers/acpi/acpica/dscontrol.c
+++ b/drivers/acpi/acpica/dscontrol.c
@@ -4,7 +4,7 @@
  * Module Name: dscontrol - Support for execution control opcodes -
  *                          if/else/while/return
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsdebug.c b/drivers/acpi/acpica/dsdebug.c
index 2d99ccf5bde7..c5c8380a3114 100644
--- a/drivers/acpi/acpica/dsdebug.c
+++ b/drivers/acpi/acpica/dsdebug.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsdebug - Parser/Interpreter interface - debugging
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c
index de175f1b4beb..532401ecdab0 100644
--- a/drivers/acpi/acpica/dsfield.c
+++ b/drivers/acpi/acpica/dsfield.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsfield - Dispatcher field routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsinit.c b/drivers/acpi/acpica/dsinit.c
index dffd54fdbd51..6e0e362e461f 100644
--- a/drivers/acpi/acpica/dsinit.c
+++ b/drivers/acpi/acpica/dsinit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsinit - Object initialization namespace walk
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c
index 9332bc688713..e809c2aed78a 100644
--- a/drivers/acpi/acpica/dsmethod.c
+++ b/drivers/acpi/acpica/dsmethod.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsmethod - Parser/Interpreter interface - control method parsing
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsobject.c b/drivers/acpi/acpica/dsobject.c
index e3dfc734ace9..555f148d666b 100644
--- a/drivers/acpi/acpica/dsobject.c
+++ b/drivers/acpi/acpica/dsobject.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsobject - Dispatcher object management routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c
index 2b9b6a974ca9..dd3059000885 100644
--- a/drivers/acpi/acpica/dsopcode.c
+++ b/drivers/acpi/acpica/dsopcode.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dsopcode - Dispatcher support for regions and fields
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dspkginit.c b/drivers/acpi/acpica/dspkginit.c
index 1624d6e7dc46..ecf793fe9919 100644
--- a/drivers/acpi/acpica/dspkginit.c
+++ b/drivers/acpi/acpica/dspkginit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dspkginit - Completion of deferred package initialization
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c
index b082eb942a0f..a43336f05206 100644
--- a/drivers/acpi/acpica/dswexec.c
+++ b/drivers/acpi/acpica/dswexec.c
@@ -4,7 +4,7 @@
  * Module Name: dswexec - Dispatcher method execution callbacks;
  *                        dispatch to interpreter.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c
index 9f6573646ab5..f7b8496c8bdd 100644
--- a/drivers/acpi/acpica/dswload.c
+++ b/drivers/acpi/acpica/dswload.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dswload - Dispatcher first pass namespace load callbacks
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dswload2.c b/drivers/acpi/acpica/dswload2.c
index 778df616aaa0..541235f498c2 100644
--- a/drivers/acpi/acpica/dswload2.c
+++ b/drivers/acpi/acpica/dswload2.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dswload2 - Dispatcher second pass namespace load callbacks
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dswscope.c b/drivers/acpi/acpica/dswscope.c
index 634b9100f674..1fdd07ae862c 100644
--- a/drivers/acpi/acpica/dswscope.c
+++ b/drivers/acpi/acpica/dswscope.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dswscope - Scope stack manipulation
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/dswstate.c b/drivers/acpi/acpica/dswstate.c
index 0aa735d3b93c..95af370a7dce 100644
--- a/drivers/acpi/acpica/dswstate.c
+++ b/drivers/acpi/acpica/dswstate.c
@@ -3,7 +3,7 @@
  *
  * Module Name: dswstate - Dispatcher parse tree walk management routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c
index 82d1728b9bc6..015a3338a732 100644
--- a/drivers/acpi/acpica/evevent.c
+++ b/drivers/acpi/acpica/evevent.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evevent - Fixed Event handling and dispatch
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c
index 9aab54797ded..989dc01af03f 100644
--- a/drivers/acpi/acpica/evglock.c
+++ b/drivers/acpi/acpica/evglock.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evglock - Global Lock support
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c
index a6bb480d631c..934b201d3820 100644
--- a/drivers/acpi/acpica/evgpe.c
+++ b/drivers/acpi/acpica/evgpe.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evgpe - General Purpose Event handling and dispatch
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c
index 39fe4566310b..58e1890ab25b 100644
--- a/drivers/acpi/acpica/evgpeblk.c
+++ b/drivers/acpi/acpica/evgpeblk.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evgpeblk - GPE block creation and initialization.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c
index 2f1a75fee61c..0dbc4d88919a 100644
--- a/drivers/acpi/acpica/evgpeinit.c
+++ b/drivers/acpi/acpica/evgpeinit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evgpeinit - System GPE initialization and update
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evgpeutil.c b/drivers/acpi/acpica/evgpeutil.c
index c32eb57aa21d..ee3b1ea656d4 100644
--- a/drivers/acpi/acpica/evgpeutil.c
+++ b/drivers/acpi/acpica/evgpeutil.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evgpeutil - GPE utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evhandler.c b/drivers/acpi/acpica/evhandler.c
index be9a05498adc..1c8cb6d924df 100644
--- a/drivers/acpi/acpica/evhandler.c
+++ b/drivers/acpi/acpica/evhandler.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evhandler - Support for Address Space handlers
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c
index 6172cddc1b39..e68e876d3b84 100644
--- a/drivers/acpi/acpica/evmisc.c
+++ b/drivers/acpi/acpica/evmisc.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evmisc - Miscellaneous event manager support functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c
index d035092799eb..18fdf2bc2d49 100644
--- a/drivers/acpi/acpica/evregion.c
+++ b/drivers/acpi/acpica/evregion.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evregion - Operation Region support
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evrgnini.c b/drivers/acpi/acpica/evrgnini.c
index 792d1a5f0052..46d1b3f5582d 100644
--- a/drivers/acpi/acpica/evrgnini.c
+++ b/drivers/acpi/acpica/evrgnini.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evrgnini- ACPI address_space (op_region) init
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c
index 18219abba108..24fa6433d562 100644
--- a/drivers/acpi/acpica/evxface.c
+++ b/drivers/acpi/acpica/evxface.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evxface - External interfaces for ACPI events
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c
index 8187b081e0a6..48bf845191d2 100644
--- a/drivers/acpi/acpica/evxfevnt.c
+++ b/drivers/acpi/acpica/evxfevnt.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evxfevnt - External Interfaces, ACPI event disable/enable
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c
index 340947e412bb..4eeeb3b7ab7e 100644
--- a/drivers/acpi/acpica/evxfgpe.c
+++ b/drivers/acpi/acpica/evxfgpe.c
@@ -3,7 +3,7 @@
  *
  * Module Name: evxfgpe - External Interfaces for General Purpose Events (GPEs)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c
index a5c19f46ec17..3197e6303c5b 100644
--- a/drivers/acpi/acpica/evxfregn.c
+++ b/drivers/acpi/acpica/evxfregn.c
@@ -4,7 +4,7 @@
  * Module Name: evxfregn - External Interfaces, ACPI Operation Regions and
  *                         Address Spaces.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exconcat.c b/drivers/acpi/acpica/exconcat.c
index 66201742f499..2fb78b35565b 100644
--- a/drivers/acpi/acpica/exconcat.c
+++ b/drivers/acpi/acpica/exconcat.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exconcat - Concatenate-type AML operators
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c
index e82faabdf907..473115309860 100644
--- a/drivers/acpi/acpica/exconfig.c
+++ b/drivers/acpi/acpica/exconfig.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exconfig - Namespace reconfiguration (Load/Unload opcodes)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exconvrt.c b/drivers/acpi/acpica/exconvrt.c
index 8de5d47ad485..3729bf3b74f7 100644
--- a/drivers/acpi/acpica/exconvrt.c
+++ b/drivers/acpi/acpica/exconvrt.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exconvrt - Object conversion routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/excreate.c b/drivers/acpi/acpica/excreate.c
index fb2453fa9442..1bea9d97652c 100644
--- a/drivers/acpi/acpica/excreate.c
+++ b/drivers/acpi/acpica/excreate.c
@@ -3,7 +3,7 @@
  *
  * Module Name: excreate - Named object creation
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exdebug.c b/drivers/acpi/acpica/exdebug.c
index 8a99aadb9d15..3f86bfada510 100644
--- a/drivers/acpi/acpica/exdebug.c
+++ b/drivers/acpi/acpica/exdebug.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exdebug - Support for stores to the AML Debug Object
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exdump.c b/drivers/acpi/acpica/exdump.c
index 24b3d041b3e5..2e2da8790224 100644
--- a/drivers/acpi/acpica/exdump.c
+++ b/drivers/acpi/acpica/exdump.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exdump - Interpreter debug output routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c
index 657f4002f9dc..61ff36189ace 100644
--- a/drivers/acpi/acpica/exfield.c
+++ b/drivers/acpi/acpica/exfield.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exfield - AML execution - field_unit read/write
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c
index d769cea1468b..cf6c812a8b6d 100644
--- a/drivers/acpi/acpica/exfldio.c
+++ b/drivers/acpi/acpica/exfldio.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exfldio - Aml Field I/O
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exmisc.c b/drivers/acpi/acpica/exmisc.c
index b4bac8c60a13..c6f2a9166ac0 100644
--- a/drivers/acpi/acpica/exmisc.c
+++ b/drivers/acpi/acpica/exmisc.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exmisc - ACPI AML (p-code) execution - specific opcodes
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exmutex.c b/drivers/acpi/acpica/exmutex.c
index e9dcfa1e93eb..65c487facdda 100644
--- a/drivers/acpi/acpica/exmutex.c
+++ b/drivers/acpi/acpica/exmutex.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exmutex - ASL Mutex Acquire/Release functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exnames.c b/drivers/acpi/acpica/exnames.c
index 318eb769058d..9a448165bfeb 100644
--- a/drivers/acpi/acpica/exnames.c
+++ b/drivers/acpi/acpica/exnames.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exnames - interpreter/scanner name load/execute
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c
index d108a1a86f12..20fb34b68bee 100644
--- a/drivers/acpi/acpica/exoparg1.c
+++ b/drivers/acpi/acpica/exoparg1.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exoparg1 - AML execution - opcodes with 1 argument
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exoparg2.c b/drivers/acpi/acpica/exoparg2.c
index ebf7c89d52d9..743c258bf2e8 100644
--- a/drivers/acpi/acpica/exoparg2.c
+++ b/drivers/acpi/acpica/exoparg2.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exoparg2 - AML execution - opcodes with 2 arguments
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exoparg3.c b/drivers/acpi/acpica/exoparg3.c
index 4b069bd6bc71..d3091f619909 100644
--- a/drivers/acpi/acpica/exoparg3.c
+++ b/drivers/acpi/acpica/exoparg3.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exoparg3 - AML execution - opcodes with 3 arguments
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exoparg6.c b/drivers/acpi/acpica/exoparg6.c
index 2a506ef386cf..1af35e143ba9 100644
--- a/drivers/acpi/acpica/exoparg6.c
+++ b/drivers/acpi/acpica/exoparg6.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exoparg6 - AML execution - opcodes with 6 arguments
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exprep.c b/drivers/acpi/acpica/exprep.c
index 08f06797386a..08196fa17080 100644
--- a/drivers/acpi/acpica/exprep.c
+++ b/drivers/acpi/acpica/exprep.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exprep - ACPI AML field prep utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c
index 6d4a4cc14850..8907b8bf4267 100644
--- a/drivers/acpi/acpica/exregion.c
+++ b/drivers/acpi/acpica/exregion.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exregion - ACPI default op_region (address space) handlers
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exresnte.c b/drivers/acpi/acpica/exresnte.c
index b81506d73447..873de01b8ad2 100644
--- a/drivers/acpi/acpica/exresnte.c
+++ b/drivers/acpi/acpica/exresnte.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exresnte - AML Interpreter object resolution
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exresolv.c b/drivers/acpi/acpica/exresolv.c
index 61ee7fb46006..24a78b5e266c 100644
--- a/drivers/acpi/acpica/exresolv.c
+++ b/drivers/acpi/acpica/exresolv.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exresolv - AML Interpreter object resolution
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exresop.c b/drivers/acpi/acpica/exresop.c
index 3342780230af..3a437e6ace5c 100644
--- a/drivers/acpi/acpica/exresop.c
+++ b/drivers/acpi/acpica/exresop.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exresop - AML Interpreter operand/object resolution
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exserial.c b/drivers/acpi/acpica/exserial.c
index fd63f2042514..5d99b1a76c83 100644
--- a/drivers/acpi/acpica/exserial.c
+++ b/drivers/acpi/acpica/exserial.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exserial - field_unit support for serial address spaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c
index f99e8cf27a6c..575c7a39f1aa 100644
--- a/drivers/acpi/acpica/exstore.c
+++ b/drivers/acpi/acpica/exstore.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exstore - AML Interpreter object store support
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exstoren.c b/drivers/acpi/acpica/exstoren.c
index c848b328e760..b01ae015e1b5 100644
--- a/drivers/acpi/acpica/exstoren.c
+++ b/drivers/acpi/acpica/exstoren.c
@@ -4,7 +4,7 @@
  * Module Name: exstoren - AML Interpreter object store support,
  *                        Store to Node (namespace object)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exstorob.c b/drivers/acpi/acpica/exstorob.c
index 45c757bbf9a9..37c3131a82fa 100644
--- a/drivers/acpi/acpica/exstorob.c
+++ b/drivers/acpi/acpica/exstorob.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exstorob - AML object store support, store to object
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exsystem.c b/drivers/acpi/acpica/exsystem.c
index 7b5470f404f3..f665ffd9a396 100644
--- a/drivers/acpi/acpica/exsystem.c
+++ b/drivers/acpi/acpica/exsystem.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exsystem - Interface to OS services
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/extrace.c b/drivers/acpi/acpica/extrace.c
index b570d7a7e134..f1730221ff13 100644
--- a/drivers/acpi/acpica/extrace.c
+++ b/drivers/acpi/acpica/extrace.c
@@ -3,7 +3,7 @@
  *
  * Module Name: extrace - Support for interpreter execution tracing
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/exutils.c b/drivers/acpi/acpica/exutils.c
index 87f01ce1c1aa..f4d4a033f166 100644
--- a/drivers/acpi/acpica/exutils.c
+++ b/drivers/acpi/acpica/exutils.c
@@ -3,7 +3,7 @@
  *
  * Module Name: exutils - interpreter/scanner utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwacpi.c b/drivers/acpi/acpica/hwacpi.c
index 2f1c2fc8bd2a..790f342dcd25 100644
--- a/drivers/acpi/acpica/hwacpi.c
+++ b/drivers/acpi/acpica/hwacpi.c
@@ -3,7 +3,7 @@
  *
  * Module Name: hwacpi - ACPI Hardware Initialization/Mode Interface
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwesleep.c b/drivers/acpi/acpica/hwesleep.c
index d8597e052912..a9ba9190408b 100644
--- a/drivers/acpi/acpica/hwesleep.c
+++ b/drivers/acpi/acpica/hwesleep.c
@@ -4,7 +4,7 @@
  * Name: hwesleep.c - ACPI Hardware Sleep/Wake Support functions for the
  *                    extended FADT-V5 sleep registers.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c
index 13d54a48e6e9..e0c847ab8324 100644
--- a/drivers/acpi/acpica/hwgpe.c
+++ b/drivers/acpi/acpica/hwgpe.c
@@ -3,7 +3,7 @@
  *
  * Module Name: hwgpe - Low level GPE enable/disable/clear functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index 37b3f641feaa..c4d7c29f398e 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -4,7 +4,7 @@
  * Name: hwsleep.c - ACPI Hardware Sleep/Wake Support functions for the
  *                   original/legacy sleep/PM registers.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwtimer.c b/drivers/acpi/acpica/hwtimer.c
index 46f3ae03ab99..192c04b5a599 100644
--- a/drivers/acpi/acpica/hwtimer.c
+++ b/drivers/acpi/acpica/hwtimer.c
@@ -3,7 +3,7 @@
  *
  * Name: hwtimer.c - ACPI Power Management Timer Interface
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwvalid.c b/drivers/acpi/acpica/hwvalid.c
index 0d392e7b0747..b8de458f0368 100644
--- a/drivers/acpi/acpica/hwvalid.c
+++ b/drivers/acpi/acpica/hwvalid.c
@@ -3,7 +3,7 @@
  *
  * Module Name: hwvalid - I/O request validation
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
index 55d9b897e70f..c31f803995c6 100644
--- a/drivers/acpi/acpica/hwxface.c
+++ b/drivers/acpi/acpica/hwxface.c
@@ -3,7 +3,7 @@
  *
  * Module Name: hwxface - Public ACPICA hardware interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c
index aff51ceea02c..36ea48f64110 100644
--- a/drivers/acpi/acpica/hwxfsleep.c
+++ b/drivers/acpi/acpica/hwxfsleep.c
@@ -3,7 +3,7 @@
  *
  * Name: hwxfsleep.c - ACPI Hardware Sleep/Wake External Interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsarguments.c b/drivers/acpi/acpica/nsarguments.c
index 22586b90e532..3efb46f0dc54 100644
--- a/drivers/acpi/acpica/nsarguments.c
+++ b/drivers/acpi/acpica/nsarguments.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsarguments - Validation of args for ACPI predefined methods
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsconvert.c b/drivers/acpi/acpica/nsconvert.c
index b02555fe38f0..7e5a683ae957 100644
--- a/drivers/acpi/acpica/nsconvert.c
+++ b/drivers/acpi/acpica/nsconvert.c
@@ -4,7 +4,7 @@
  * Module Name: nsconvert - Object conversions for objects returned by
  *                          predefined methods
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsdump.c b/drivers/acpi/acpica/nsdump.c
index f154824d4eb6..90a26cb0c472 100644
--- a/drivers/acpi/acpica/nsdump.c
+++ b/drivers/acpi/acpica/nsdump.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsdump - table dumping routines for debug
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsdumpdv.c b/drivers/acpi/acpica/nsdumpdv.c
index b9a88b7b518b..fa116ebe49a3 100644
--- a/drivers/acpi/acpica/nsdumpdv.c
+++ b/drivers/acpi/acpica/nsdumpdv.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsdump - table dumping routines for debug
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c
index 3e6207ad18d8..86d126fdb27d 100644
--- a/drivers/acpi/acpica/nsinit.c
+++ b/drivers/acpi/acpica/nsinit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsinit - namespace initialization
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsload.c b/drivers/acpi/acpica/nsload.c
index 880260a30c0c..fcb9de0f77a2 100644
--- a/drivers/acpi/acpica/nsload.c
+++ b/drivers/acpi/acpica/nsload.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsload - namespace loading/expanding/contracting procedures
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsparse.c b/drivers/acpi/acpica/nsparse.c
index 4b893676ab5c..31e551cf4ea6 100644
--- a/drivers/acpi/acpica/nsparse.c
+++ b/drivers/acpi/acpica/nsparse.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsparse - namespace interface to AML parser
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c
index c0db6690bb32..cf57bd69616d 100644
--- a/drivers/acpi/acpica/nspredef.c
+++ b/drivers/acpi/acpica/nspredef.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nspredef - Validation of ACPI predefined methods and objects
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsprepkg.c b/drivers/acpi/acpica/nsprepkg.c
index 82932c9a774b..dd37fc108fce 100644
--- a/drivers/acpi/acpica/nsprepkg.c
+++ b/drivers/acpi/acpica/nsprepkg.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsprepkg - Validation of package objects for predefined names
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsrepair.c b/drivers/acpi/acpica/nsrepair.c
index ec512e06a48e..b8657004190d 100644
--- a/drivers/acpi/acpica/nsrepair.c
+++ b/drivers/acpi/acpica/nsrepair.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nsrepair - Repair for objects returned by predefined methods
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c
index 957d7eb4861f..1bb7b71f07f1 100644
--- a/drivers/acpi/acpica/nsrepair2.c
+++ b/drivers/acpi/acpica/nsrepair2.c
@@ -4,7 +4,7 @@
  * Module Name: nsrepair2 - Repair for objects returned by specific
  *                          predefined methods
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c
index ef531b145add..06ffdb6808f5 100644
--- a/drivers/acpi/acpica/nsutils.c
+++ b/drivers/acpi/acpica/nsutils.c
@@ -4,7 +4,7 @@
  * Module Name: nsutils - Utilities for accessing ACPI namespace, accessing
  *                        parents and siblings and Scope manipulation
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nswalk.c b/drivers/acpi/acpica/nswalk.c
index 82a0dae349e2..eee396a77bae 100644
--- a/drivers/acpi/acpica/nswalk.c
+++ b/drivers/acpi/acpica/nswalk.c
@@ -3,7 +3,7 @@
  *
  * Module Name: nswalk - Functions for walking the ACPI namespace
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/nsxfname.c b/drivers/acpi/acpica/nsxfname.c
index a0592d15dd37..5d5bcf165298 100644
--- a/drivers/acpi/acpica/nsxfname.c
+++ b/drivers/acpi/acpica/nsxfname.c
@@ -4,7 +4,7 @@
  * Module Name: nsxfname - Public interfaces to the ACPI subsystem
  *                         ACPI Namespace oriented interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psargs.c b/drivers/acpi/acpica/psargs.c
index f7ec5606098c..422c074ed289 100644
--- a/drivers/acpi/acpica/psargs.c
+++ b/drivers/acpi/acpica/psargs.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psargs - Parse AML opcode arguments
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psloop.c b/drivers/acpi/acpica/psloop.c
index 840512fa9fc6..d0fd55636129 100644
--- a/drivers/acpi/acpica/psloop.c
+++ b/drivers/acpi/acpica/psloop.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psloop - Main AML parse loop
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psobject.c b/drivers/acpi/acpica/psobject.c
index bca249e67c6b..54471083ba54 100644
--- a/drivers/acpi/acpica/psobject.c
+++ b/drivers/acpi/acpica/psobject.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psobject - Support for parse objects
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psopcode.c b/drivers/acpi/acpica/psopcode.c
index bef69e87a0a2..09029fe545f1 100644
--- a/drivers/acpi/acpica/psopcode.c
+++ b/drivers/acpi/acpica/psopcode.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psopcode - Parser/Interpreter opcode information table
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c
index f10afe699ad7..bccf606e08b4 100644
--- a/drivers/acpi/acpica/psopinfo.c
+++ b/drivers/acpi/acpica/psopinfo.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psopinfo - AML opcode information functions and dispatch tables
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psparse.c b/drivers/acpi/acpica/psparse.c
index ba93f359760a..10a072953d78 100644
--- a/drivers/acpi/acpica/psparse.c
+++ b/drivers/acpi/acpica/psparse.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psparse - Parser top level AML parse routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psscope.c b/drivers/acpi/acpica/psscope.c
index 400f001631ea..a0035bde7556 100644
--- a/drivers/acpi/acpica/psscope.c
+++ b/drivers/acpi/acpica/psscope.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psscope - Parser scope stack management routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/pstree.c b/drivers/acpi/acpica/pstree.c
index 3012a9342367..7f7f5ecd4011 100644
--- a/drivers/acpi/acpica/pstree.c
+++ b/drivers/acpi/acpica/pstree.c
@@ -3,7 +3,7 @@
  *
  * Module Name: pstree - Parser op tree manipulation/traversal/search
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psutils.c b/drivers/acpi/acpica/psutils.c
index 49b39aeded12..d550c4af4702 100644
--- a/drivers/acpi/acpica/psutils.c
+++ b/drivers/acpi/acpica/psutils.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psutils - Parser miscellaneous utilities (Parser only)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/pswalk.c b/drivers/acpi/acpica/pswalk.c
index 7735a01dab90..d92817c72b8d 100644
--- a/drivers/acpi/acpica/pswalk.c
+++ b/drivers/acpi/acpica/pswalk.c
@@ -3,7 +3,7 @@
  *
  * Module Name: pswalk - Parser routines to walk parsed op tree(s)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/psxface.c b/drivers/acpi/acpica/psxface.c
index a6509aeb2955..6f4eace0ba69 100644
--- a/drivers/acpi/acpica/psxface.c
+++ b/drivers/acpi/acpica/psxface.c
@@ -3,7 +3,7 @@
  *
  * Module Name: psxface - Parser external interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c
index 1f7677e0dbbe..a1f10e4409a3 100644
--- a/drivers/acpi/acpica/tbdata.c
+++ b/drivers/acpi/acpica/tbdata.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbdata - Table manager data structure functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
index f04dc6051320..44267a92bce5 100644
--- a/drivers/acpi/acpica/tbfadt.c
+++ b/drivers/acpi/acpica/tbfadt.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbfadt   - FADT table utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbfind.c b/drivers/acpi/acpica/tbfind.c
index c31a5ddb0ffd..1c1b2e284bd9 100644
--- a/drivers/acpi/acpica/tbfind.c
+++ b/drivers/acpi/acpica/tbfind.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbfind   - find table
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c
index 499efcaf798d..0dc003c20e4d 100644
--- a/drivers/acpi/acpica/tbinstal.c
+++ b/drivers/acpi/acpica/tbinstal.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbinstal - ACPI table installation and removal
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c
index f07aa9b46f3f..58b02e4b254b 100644
--- a/drivers/acpi/acpica/tbprint.c
+++ b/drivers/acpi/acpica/tbprint.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbprint - Table output utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 17ad9c227d42..cfe50c53ad4d 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbutils - ACPI Table utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c
index 37da09dca940..275b52dc42e9 100644
--- a/drivers/acpi/acpica/tbxface.c
+++ b/drivers/acpi/acpica/tbxface.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbxface - ACPI table-oriented external interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c
index 258796e02be1..0f2a7343de3a 100644
--- a/drivers/acpi/acpica/tbxfload.c
+++ b/drivers/acpi/acpica/tbxfload.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbxfload - Table load/unload external interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c
index 53afd75bbc06..5b413bbab338 100644
--- a/drivers/acpi/acpica/tbxfroot.c
+++ b/drivers/acpi/acpica/tbxfroot.c
@@ -3,7 +3,7 @@
  *
  * Module Name: tbxfroot - Find the root ACPI table (RSDT)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utaddress.c b/drivers/acpi/acpica/utaddress.c
index 915321806cd7..be94d2fd99a7 100644
--- a/drivers/acpi/acpica/utaddress.c
+++ b/drivers/acpi/acpica/utaddress.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utaddress - op_region address range check
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utalloc.c b/drivers/acpi/acpica/utalloc.c
index 2bab6017d827..c1fb70457e20 100644
--- a/drivers/acpi/acpica/utalloc.c
+++ b/drivers/acpi/acpica/utalloc.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utalloc - local memory allocation routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utascii.c b/drivers/acpi/acpica/utascii.c
index 72fb7e9ec485..2be37676edd7 100644
--- a/drivers/acpi/acpica/utascii.c
+++ b/drivers/acpi/acpica/utascii.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utascii - Utility ascii functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utbuffer.c b/drivers/acpi/acpica/utbuffer.c
index 59c4050b8e91..b054bb5eeaf0 100644
--- a/drivers/acpi/acpica/utbuffer.c
+++ b/drivers/acpi/acpica/utbuffer.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utbuffer - Buffer dump routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utcache.c b/drivers/acpi/acpica/utcache.c
index 5425968dd2a8..85a85f7cf750 100644
--- a/drivers/acpi/acpica/utcache.c
+++ b/drivers/acpi/acpica/utcache.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utcache - local cache allocation routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utcksum.c b/drivers/acpi/acpica/utcksum.c
index c166e4c05ab6..b483894c3629 100644
--- a/drivers/acpi/acpica/utcksum.c
+++ b/drivers/acpi/acpica/utcksum.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utcksum - Support generating table checksums
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utcopy.c b/drivers/acpi/acpica/utcopy.c
index 63c17f420fb8..2e17e657dfa4 100644
--- a/drivers/acpi/acpica/utcopy.c
+++ b/drivers/acpi/acpica/utcopy.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utcopy - Internal to external object translation utilities
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c
index 64ed546cf19c..1bbba8585fa6 100644
--- a/drivers/acpi/acpica/utdebug.c
+++ b/drivers/acpi/acpica/utdebug.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utdebug - Debug print/trace routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c
index 3176393a729d..95a4b7509e01 100644
--- a/drivers/acpi/acpica/utdecode.c
+++ b/drivers/acpi/acpica/utdecode.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utdecode - Utility decoding routines (value-to-string)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c
index df20d46ed8b7..3e5173d03953 100644
--- a/drivers/acpi/acpica/uteval.c
+++ b/drivers/acpi/acpica/uteval.c
@@ -3,7 +3,7 @@
  *
  * Module Name: uteval - Object evaluation
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c
index 53afa5edb6ec..ae46cef891e2 100644
--- a/drivers/acpi/acpica/utglobal.c
+++ b/drivers/acpi/acpica/utglobal.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utglobal - Global variables for the ACPI subsystem
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/uthex.c b/drivers/acpi/acpica/uthex.c
index c811ee2a8160..e62802791dcf 100644
--- a/drivers/acpi/acpica/uthex.c
+++ b/drivers/acpi/acpica/uthex.c
@@ -3,7 +3,7 @@
  *
  * Module Name: uthex -- Hex/ASCII support functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utids.c b/drivers/acpi/acpica/utids.c
index b6caab49f1bd..15c2ce91d403 100644
--- a/drivers/acpi/acpica/utids.c
+++ b/drivers/acpi/acpica/utids.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utids - support for device Ids - HID, UID, CID, SUB, CLS
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utinit.c b/drivers/acpi/acpica/utinit.c
index 18177e4f26f7..92fbaef161a7 100644
--- a/drivers/acpi/acpica/utinit.c
+++ b/drivers/acpi/acpica/utinit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utinit - Common ACPI subsystem initialization
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utlock.c b/drivers/acpi/acpica/utlock.c
index 84abdbf5cfca..ee6d72385c5c 100644
--- a/drivers/acpi/acpica/utlock.c
+++ b/drivers/acpi/acpica/utlock.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utlock - Reader/Writer lock interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utobject.c b/drivers/acpi/acpica/utobject.c
index d3667bfff401..f4aae8f0d3a8 100644
--- a/drivers/acpi/acpica/utobject.c
+++ b/drivers/acpi/acpica/utobject.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utobject - ACPI object create/delete/size/cache routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utosi.c b/drivers/acpi/acpica/utosi.c
index b8ab0a3cb5b9..251bd396c6fd 100644
--- a/drivers/acpi/acpica/utosi.c
+++ b/drivers/acpi/acpica/utosi.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utosi - Support for the _OSI predefined control method
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utpredef.c b/drivers/acpi/acpica/utpredef.c
index 2524f013be7a..29d2977d0746 100644
--- a/drivers/acpi/acpica/utpredef.c
+++ b/drivers/acpi/acpica/utpredef.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utpredef - support functions for predefined names
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utprint.c b/drivers/acpi/acpica/utprint.c
index d5aa2109847f..42b30b9f9312 100644
--- a/drivers/acpi/acpica/utprint.c
+++ b/drivers/acpi/acpica/utprint.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utprint - Formatted printing routines
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/uttrack.c b/drivers/acpi/acpica/uttrack.c
index a06988ac409d..f5f5da441458 100644
--- a/drivers/acpi/acpica/uttrack.c
+++ b/drivers/acpi/acpica/uttrack.c
@@ -3,7 +3,7 @@
  *
  * Module Name: uttrack - Memory allocation tracking routines (debug only)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utuuid.c b/drivers/acpi/acpica/utuuid.c
index e24bc670b53e..8f10b413e928 100644
--- a/drivers/acpi/acpica/utuuid.c
+++ b/drivers/acpi/acpica/utuuid.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utuuid -- UUID support functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utxface.c b/drivers/acpi/acpica/utxface.c
index 86e76b443da7..aa2e923462b7 100644
--- a/drivers/acpi/acpica/utxface.c
+++ b/drivers/acpi/acpica/utxface.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utxface - External interfaces, miscellaneous utility functions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c
index f2acec3a0ee3..1915bec2b279 100644
--- a/drivers/acpi/acpica/utxfinit.c
+++ b/drivers/acpi/acpica/utxfinit.c
@@ -3,7 +3,7 @@
  *
  * Module Name: utxfinit - External interfaces for ACPICA initialization
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acbuffer.h b/include/acpi/acbuffer.h
index 8cbfcbca7b7e..252b235dce5a 100644
--- a/include/acpi/acbuffer.h
+++ b/include/acpi/acbuffer.h
@@ -3,7 +3,7 @@
  *
  * Name: acbuffer.h - Support for buffers returned by ACPI predefined names
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h
index 151e40385673..d768d9c568cf 100644
--- a/include/acpi/acconfig.h
+++ b/include/acpi/acconfig.h
@@ -3,7 +3,7 @@
  *
  * Name: acconfig.h - Global configuration constants
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h
index 28943c900be7..c5ecd0a0170c 100644
--- a/include/acpi/acexcep.h
+++ b/include/acpi/acexcep.h
@@ -3,7 +3,7 @@
  *
  * Name: acexcep.h - Exception codes returned by the ACPI subsystem
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h
index 6f22e92b1744..d71291f25a80 100644
--- a/include/acpi/acnames.h
+++ b/include/acpi/acnames.h
@@ -3,7 +3,7 @@
  *
  * Name: acnames.h - Global names and strings
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h
index 73781aae2119..b1571dd96310 100644
--- a/include/acpi/acoutput.h
+++ b/include/acpi/acoutput.h
@@ -3,7 +3,7 @@
  *
  * Name: acoutput.h -- debug output
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h
index 416e59bcf149..8b4a497c1300 100644
--- a/include/acpi/acpi.h
+++ b/include/acpi/acpi.h
@@ -3,7 +3,7 @@
  *
  * Name: acpi.h - Master public include file used to interface to ACPICA
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index 52844cc5eeb5..914c029f64c9 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -5,7 +5,7 @@
  *                    interfaces must be implemented by OSL to interface the
  *                    ACPI components to the host operating system.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index 8e364cbdd14a..c961f0d99c5e 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -3,7 +3,7 @@
  *
  * Name: acpixf.h - External interfaces to the ACPI subsystem
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index a7fb8ddb3dc6..e5c875f91203 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -3,7 +3,7 @@
  *
  * Name: acrestyp.h - Defines, types, and structures for resource descriptors
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index c6af579f74f4..e5dfb6f4de52 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -3,7 +3,7 @@
  *
  * Name: actbl.h - Basic ACPI Table Definitions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index b5e011da5271..7a3ad28b86c6 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -3,7 +3,7 @@
  *
  * Name: actbl1.h - Additional ACPI table definitions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index b082175906ba..e0f205f9ab4c 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -3,7 +3,7 @@
  *
  * Name: actbl2.h - ACPI Table Definitions (tables not in ACPI spec)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h
index 832c6464f063..f51c46f4e3e4 100644
--- a/include/acpi/actbl3.h
+++ b/include/acpi/actbl3.h
@@ -3,7 +3,7 @@
  *
  * Name: actbl3.h - ACPI Table Definitions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 46878cbb7d39..ce1db1c2e9d3 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -3,7 +3,7 @@
  *
  * Name: actypes.h - Common data types for the entire ACPI subsystem
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/acuuid.h b/include/acpi/acuuid.h
index 171bb0b708a2..52a84523bfac 100644
--- a/include/acpi/acuuid.h
+++ b/include/acpi/acuuid.h
@@ -3,7 +3,7 @@
  *
  * Name: acuuid.h - ACPI-related UUID/GUID definitions
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h
index 9e4f7564201a..0fa7bbf85c83 100644
--- a/include/acpi/platform/acenv.h
+++ b/include/acpi/platform/acenv.h
@@ -3,7 +3,7 @@
  *
  * Name: acenv.h - Host and compiler configuration
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/acenvex.h b/include/acpi/platform/acenvex.h
index 72cc7bab469e..7e67e3503f7b 100644
--- a/include/acpi/platform/acenvex.h
+++ b/include/acpi/platform/acenvex.h
@@ -3,7 +3,7 @@
  *
  * Name: acenvex.h - Extra host and compiler configuration
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h
index ac80111f503c..c9d418f9395e 100644
--- a/include/acpi/platform/acgcc.h
+++ b/include/acpi/platform/acgcc.h
@@ -3,7 +3,7 @@
  *
  * Name: acgcc.h - GCC specific defines, etc.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/acgccex.h b/include/acpi/platform/acgccex.h
index 302ea1b724b9..7c9f10e9633a 100644
--- a/include/acpi/platform/acgccex.h
+++ b/include/acpi/platform/acgccex.h
@@ -3,7 +3,7 @@
  *
  * Name: acgccex.h - Extra GCC specific defines, etc.
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 14ac657a7ded..66285e054e1e 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -3,7 +3,7 @@
  *
  * Name: aclinux.h - OS specific defines, etc. for Linux
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h
index 28c72744decf..600d4e2641da 100644
--- a/include/acpi/platform/aclinuxex.h
+++ b/include/acpi/platform/aclinuxex.h
@@ -3,7 +3,7 @@
  *
  * Name: aclinuxex.h - Extra OS specific defines, etc. for Linux
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c
index 38f9b9da8170..68b9ea86b86c 100644
--- a/tools/power/acpi/common/cmfsize.c
+++ b/tools/power/acpi/common/cmfsize.c
@@ -3,7 +3,7 @@
  *
  * Module Name: cmfsize - Common get file size function
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/common/getopt.c b/tools/power/acpi/common/getopt.c
index 96fd6cec78e2..6a0cdba6fdfd 100644
--- a/tools/power/acpi/common/getopt.c
+++ b/tools/power/acpi/common/getopt.c
@@ -3,7 +3,7 @@
  *
  * Module Name: getopt
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
index bd08f36df4a7..9d70d8c945af 100644
--- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
+++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
@@ -3,7 +3,7 @@
  *
  * Module Name: oslinuxtbl - Linux OSL for obtaining ACPI tables
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/os_specific/service_layers/osunixdir.c b/tools/power/acpi/os_specific/service_layers/osunixdir.c
index 5107892d054b..39f3bffd9355 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixdir.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixdir.c
@@ -3,7 +3,7 @@
  *
  * Module Name: osunixdir - Unix directory access interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/os_specific/service_layers/osunixmap.c b/tools/power/acpi/os_specific/service_layers/osunixmap.c
index 6ff4edd8dc3b..2b7d56252684 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixmap.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixmap.c
@@ -3,7 +3,7 @@
  *
  * Module Name: osunixmap - Unix OSL for file mappings
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c
index b3651a04d68c..46429417c71a 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixxf.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c
@@ -3,7 +3,7 @@
  *
  * Module Name: osunixxf - UNIX OSL interfaces
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/tools/acpidump/acpidump.h b/tools/power/acpi/tools/acpidump/acpidump.h
index 153249c87fd7..643e3e722340 100644
--- a/tools/power/acpi/tools/acpidump/acpidump.h
+++ b/tools/power/acpi/tools/acpidump/acpidump.h
@@ -3,7 +3,7 @@
  *
  * Module Name: acpidump.h - Include file for acpi_dump utility
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c
index ea44b0ed5dcb..0742b00b61a1 100644
--- a/tools/power/acpi/tools/acpidump/apdump.c
+++ b/tools/power/acpi/tools/acpidump/apdump.c
@@ -3,7 +3,7 @@
  *
  * Module Name: apdump - Dump routines for ACPI tables (acpidump)
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c
index 2d9b45a9b526..13817f9112c0 100644
--- a/tools/power/acpi/tools/acpidump/apfiles.c
+++ b/tools/power/acpi/tools/acpidump/apfiles.c
@@ -3,7 +3,7 @@
  *
  * Module Name: apfiles - File-related functions for acpidump utility
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
diff --git a/tools/power/acpi/tools/acpidump/apmain.c b/tools/power/acpi/tools/acpidump/apmain.c
index 44b23fc53dd9..666a9675e743 100644
--- a/tools/power/acpi/tools/acpidump/apmain.c
+++ b/tools/power/acpi/tools/acpidump/apmain.c
@@ -3,7 +3,7 @@
  *
  * Module Name: apmain - Main module for the acpidump utility
  *
- * Copyright (C) 2000 - 2022, Intel Corp.
+ * Copyright (C) 2000 - 2023, Intel Corp.
  *
  *****************************************************************************/
 
-- 
cgit v1.2.3


From 520d4a0ee5b6d9c7a1258ace6caa13a94ac35ef8 Mon Sep 17 00:00:00 2001
From: Niyas Sait <niyas.sait@linaro.org>
Date: Wed, 5 Apr 2023 15:39:23 +0200
Subject: ACPICA: add support for ClockInput resource (v6.5)

ACPICA commit 661feab5ee01a34af95a389a18c82e79f1aba05a

Link: https://github.com/acpica/acpica/commit/661feab5
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/aclocal.h     |  3 ++-
 drivers/acpi/acpica/acresrc.h     |  2 ++
 drivers/acpi/acpica/acutils.h     |  2 ++
 drivers/acpi/acpica/amlresrc.h    | 17 ++++++++++++++
 drivers/acpi/acpica/rscalc.c      | 17 ++++++++++++++
 drivers/acpi/acpica/rsdumpinfo.c  | 17 ++++++++++++++
 drivers/acpi/acpica/rsinfo.c      |  5 ++++
 drivers/acpi/acpica/rsserial.c    | 49 +++++++++++++++++++++++++++++++++++++++
 drivers/acpi/acpica/utresdecode.c | 11 +++++++++
 drivers/acpi/acpica/utresrc.c     |  3 +++
 include/acpi/acrestyp.h           | 13 ++++++++++-
 11 files changed, 137 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h
index 2b876dac7558..12d4a024f029 100644
--- a/drivers/acpi/acpica/aclocal.h
+++ b/drivers/acpi/acpica/aclocal.h
@@ -1122,7 +1122,8 @@ struct acpi_port_info {
 #define ACPI_RESOURCE_NAME_PIN_GROUP            0x90
 #define ACPI_RESOURCE_NAME_PIN_GROUP_FUNCTION   0x91
 #define ACPI_RESOURCE_NAME_PIN_GROUP_CONFIG     0x92
-#define ACPI_RESOURCE_NAME_LARGE_MAX            0x92
+#define ACPI_RESOURCE_NAME_CLOCK_INPUT          0x93
+#define ACPI_RESOURCE_NAME_LARGE_MAX            0x94
 
 /*****************************************************************************
  *
diff --git a/drivers/acpi/acpica/acresrc.h b/drivers/acpi/acpica/acresrc.h
index c2e6c8455587..d772ff9ca07d 100644
--- a/drivers/acpi/acpica/acresrc.h
+++ b/drivers/acpi/acpica/acresrc.h
@@ -306,6 +306,7 @@ extern struct acpi_rsconvert_info acpi_rs_convert_pin_config[];
 extern struct acpi_rsconvert_info acpi_rs_convert_pin_group[];
 extern struct acpi_rsconvert_info acpi_rs_convert_pin_group_function[];
 extern struct acpi_rsconvert_info acpi_rs_convert_pin_group_config[];
+extern struct acpi_rsconvert_info acpi_rs_convert_clock_input[];
 
 /* These resources require separate get/set tables */
 
@@ -361,6 +362,7 @@ extern struct acpi_rsdump_info acpi_rs_dump_pin_config[];
 extern struct acpi_rsdump_info acpi_rs_dump_pin_group[];
 extern struct acpi_rsdump_info acpi_rs_dump_pin_group_function[];
 extern struct acpi_rsdump_info acpi_rs_dump_pin_group_config[];
+extern struct acpi_rsdump_info acpi_rs_dump_clock_input[];
 #endif
 
 #endif				/* __ACRESRC_H__ */
diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h
index dd060844189e..edfdbbef81c1 100644
--- a/drivers/acpi/acpica/acutils.h
+++ b/drivers/acpi/acpica/acutils.h
@@ -53,6 +53,8 @@ extern const char *acpi_gbl_sb_decode[];
 extern const char *acpi_gbl_fc_decode[];
 extern const char *acpi_gbl_pt_decode[];
 extern const char *acpi_gbl_ptyp_decode[];
+extern const char *acpi_gbl_clock_input_mode[];
+extern const char *acpi_gbl_clock_input_scale[];
 #endif
 
 /*
diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h
index 89093111bfd2..48df447ef5bb 100644
--- a/drivers/acpi/acpica/amlresrc.h
+++ b/drivers/acpi/acpica/amlresrc.h
@@ -70,6 +70,8 @@
 #define ACPI_RESTAG_TYPE                        "_TTP"	/* Translation(1), Static (0) */
 #define ACPI_RESTAG_XFERTYPE                    "_SIZ"	/* 8(0), 8And16(1), 16(2) */
 #define ACPI_RESTAG_VENDORDATA                  "_VEN"
+#define ACPI_RESTAG_FQN                         "_FQN"
+#define ACPI_RESTAG_FQD                         "_FQD"
 
 /* Default sizes for "small" resource descriptors */
 
@@ -427,6 +429,20 @@ struct aml_resource_pin_config {
 	 */
 };
 
+#define AML_RESOURCE_CLOCK_INPUT_REVISION      1	/* ACPI 6.5 */
+
+struct aml_resource_clock_input {
+	AML_RESOURCE_LARGE_HEADER_COMMON u8 revision_id;
+	u16 flags;
+	u16 frequency_divisor;
+	u32 frequency_numerator;
+	/*
+	 * Optional fields follow immediately:
+	 * 1) Resource Source index
+	 * 2) Resource Source String
+	 */
+};
+
 #define AML_RESOURCE_PIN_CONFIG_REVISION      1	/* ACPI 6.2 */
 
 struct aml_resource_pin_group {
@@ -533,6 +549,7 @@ union aml_resource {
 	struct aml_resource_pin_group pin_group;
 	struct aml_resource_pin_group_function pin_group_function;
 	struct aml_resource_pin_group_config pin_group_config;
+	struct aml_resource_clock_input clock_input;
 
 	/* Utility overlays */
 
diff --git a/drivers/acpi/acpica/rscalc.c b/drivers/acpi/acpica/rscalc.c
index 90583db459a2..ffb448fa51fd 100644
--- a/drivers/acpi/acpica/rscalc.c
+++ b/drivers/acpi/acpica/rscalc.c
@@ -320,6 +320,16 @@ acpi_rs_get_aml_length(struct acpi_resource *resource,
 
 			break;
 
+		case ACPI_RESOURCE_TYPE_CLOCK_INPUT:
+
+			total_size = (acpi_rs_length)(total_size +
+						      resource->data.
+						      clock_input.
+						      resource_source.
+						      string_length);
+
+			break;
+
 		case ACPI_RESOURCE_TYPE_SERIAL_BUS:
 
 			total_size =
@@ -650,6 +660,13 @@ acpi_rs_get_list_length(u8 *aml_buffer,
 
 			break;
 
+		case ACPI_RESOURCE_NAME_CLOCK_INPUT:
+			extra_struct_bytes =
+			    acpi_rs_stream_option_length(resource_length,
+							 minimum_aml_resource_length);
+
+			break;
+
 		default:
 
 			break;
diff --git a/drivers/acpi/acpica/rsdumpinfo.c b/drivers/acpi/acpica/rsdumpinfo.c
index b8b37449011b..998a79cc09c2 100644
--- a/drivers/acpi/acpica/rsdumpinfo.c
+++ b/drivers/acpi/acpica/rsdumpinfo.c
@@ -301,6 +301,23 @@ struct acpi_rsdump_info acpi_rs_dump_pin_function[10] = {
 	 "VendorData", NULL},
 };
 
+struct acpi_rsdump_info acpi_rs_dump_clock_input[7] = {
+	{ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_clock_input),
+	 "ClockInput", NULL},
+	{ACPI_RSD_UINT8, ACPI_RSD_OFFSET(clock_input.revision_id), "RevisionId",
+	 NULL},
+	{ACPI_RSD_UINT32, ACPI_RSD_OFFSET(clock_input.frequency_numerator),
+	 "FrequencyNumerator", NULL},
+	{ACPI_RSD_UINT32, ACPI_RSD_OFFSET(clock_input.frequency_divisor),
+	 "FrequencyDivisor", NULL},
+	{ACPI_RSD_1BITFLAG, ACPI_RSD_OFFSET(clock_input.scale), "Scale",
+	 acpi_gbl_clock_input_scale},
+	{ACPI_RSD_1BITFLAG, ACPI_RSD_OFFSET(clock_input.mode), "Mode",
+	 acpi_gbl_clock_input_mode},
+	{ACPI_RSD_SOURCE, ACPI_RSD_OFFSET(clock_input.resource_source),
+	 "ResourceSource", NULL},
+};
+
 struct acpi_rsdump_info acpi_rs_dump_pin_config[11] = {
 	{ACPI_RSD_TITLE, ACPI_RSD_TABLE_SIZE(acpi_rs_dump_pin_config),
 	 "PinConfig", NULL},
diff --git a/drivers/acpi/acpica/rsinfo.c b/drivers/acpi/acpica/rsinfo.c
index eaeb7ab58c2a..ad7465ddfe13 100644
--- a/drivers/acpi/acpica/rsinfo.c
+++ b/drivers/acpi/acpica/rsinfo.c
@@ -49,6 +49,7 @@ struct acpi_rsconvert_info *acpi_gbl_set_resource_dispatch[] = {
 	acpi_rs_convert_pin_group,	/* 0x16, ACPI_RESOURCE_TYPE_PIN_GROUP */
 	acpi_rs_convert_pin_group_function,	/* 0x17, ACPI_RESOURCE_TYPE_PIN_GROUP_FUNCTION */
 	acpi_rs_convert_pin_group_config,	/* 0x18, ACPI_RESOURCE_TYPE_PIN_GROUP_CONFIG */
+	acpi_rs_convert_clock_input,	/* 0x19, ACPI_RESOURCE_TYPE_CLOCK_INPUT */
 };
 
 /* Dispatch tables for AML-to-resource (Get Resource) conversion functions */
@@ -94,6 +95,7 @@ struct acpi_rsconvert_info *acpi_gbl_get_resource_dispatch[] = {
 	acpi_rs_convert_pin_group,	/* 0x10, ACPI_RESOURCE_NAME_PIN_GROUP */
 	acpi_rs_convert_pin_group_function,	/* 0x11, ACPI_RESOURCE_NAME_PIN_GROUP_FUNCTION */
 	acpi_rs_convert_pin_group_config,	/* 0x12, ACPI_RESOURCE_NAME_PIN_GROUP_CONFIG */
+	acpi_rs_convert_clock_input,	/* 0x13, ACPI_RESOURCE_NAME_CLOCK_INPUT */
 };
 
 /* Subtype table for serial_bus -- I2C, SPI, UART, and CSI2 */
@@ -136,6 +138,7 @@ struct acpi_rsdump_info *acpi_gbl_dump_resource_dispatch[] = {
 	acpi_rs_dump_pin_group,	/* ACPI_RESOURCE_TYPE_PIN_GROUP */
 	acpi_rs_dump_pin_group_function,	/* ACPI_RESOURCE_TYPE_PIN_GROUP_FUNCTION */
 	acpi_rs_dump_pin_group_config,	/* ACPI_RESOURCE_TYPE_PIN_GROUP_CONFIG */
+	acpi_rs_dump_clock_input,	/* ACPI_RESOURCE_TYPE_CLOCK_INPUT */
 };
 
 struct acpi_rsdump_info *acpi_gbl_dump_serial_bus_dispatch[] = {
@@ -178,6 +181,7 @@ const u8 acpi_gbl_aml_resource_sizes[] = {
 	sizeof(struct aml_resource_pin_group),	/* ACPI_RESOURCE_TYPE_PIN_GROUP */
 	sizeof(struct aml_resource_pin_group_function),	/* ACPI_RESOURCE_TYPE_PIN_GROUP_FUNCTION */
 	sizeof(struct aml_resource_pin_group_config),	/* ACPI_RESOURCE_TYPE_PIN_GROUP_CONFIG */
+	sizeof(struct aml_resource_clock_input),	/* ACPI_RESOURCE_TYPE_CLOCK_INPUT */
 };
 
 const u8 acpi_gbl_resource_struct_sizes[] = {
@@ -221,6 +225,7 @@ const u8 acpi_gbl_resource_struct_sizes[] = {
 	ACPI_RS_SIZE(struct acpi_resource_pin_group),
 	ACPI_RS_SIZE(struct acpi_resource_pin_group_function),
 	ACPI_RS_SIZE(struct acpi_resource_pin_group_config),
+	ACPI_RS_SIZE(struct acpi_resource_clock_input),
 };
 
 const u8 acpi_gbl_aml_resource_serial_bus_sizes[] = {
diff --git a/drivers/acpi/acpica/rsserial.c b/drivers/acpi/acpica/rsserial.c
index f9267956535c..279bfa27da94 100644
--- a/drivers/acpi/acpica/rsserial.c
+++ b/drivers/acpi/acpica/rsserial.c
@@ -109,6 +109,55 @@ struct acpi_rsconvert_info acpi_rs_convert_gpio[18] = {
 	 0},
 };
 
+/*******************************************************************************
+ *
+ * acpi_rs_convert_clock_input
+ *
+ ******************************************************************************/
+
+struct acpi_rsconvert_info acpi_rs_convert_clock_input[8] = {
+	{ACPI_RSC_INITGET, ACPI_RESOURCE_TYPE_CLOCK_INPUT,
+	 ACPI_RS_SIZE(struct acpi_resource_clock_input),
+	 ACPI_RSC_TABLE_SIZE(acpi_rs_convert_clock_input)},
+
+	{ACPI_RSC_INITSET, ACPI_RESOURCE_NAME_CLOCK_INPUT,
+	 sizeof(struct aml_resource_clock_input),
+	 0}
+	,
+
+	{ACPI_RSC_MOVE8, ACPI_RS_OFFSET(data.clock_input.revision_id),
+	 AML_OFFSET(clock_input.revision_id),
+	 1}
+	,
+
+	{ACPI_RSC_1BITFLAG, ACPI_RS_OFFSET(data.clock_input.mode),
+	 AML_OFFSET(clock_input.flags),
+	 0}
+	,
+
+	{ACPI_RSC_2BITFLAG, ACPI_RS_OFFSET(data.clock_input.scale),
+	 AML_OFFSET(clock_input.flags),
+	 1}
+	,
+
+	{ACPI_RSC_MOVE16, ACPI_RS_OFFSET(data.clock_input.frequency_divisor),
+	 AML_OFFSET(clock_input.frequency_divisor),
+	 2}
+	,
+
+	{ACPI_RSC_MOVE32, ACPI_RS_OFFSET(data.clock_input.frequency_numerator),
+	 AML_OFFSET(clock_input.frequency_numerator),
+	 4}
+	,
+
+	/* Resource Source */
+	{ACPI_RSC_SOURCE, ACPI_RS_OFFSET(data.clock_input.resource_source),
+	 0,
+	 sizeof(struct aml_resource_clock_input)}
+	,
+
+};
+
 /*******************************************************************************
  *
  * acpi_rs_convert_pinfunction
diff --git a/drivers/acpi/acpica/utresdecode.c b/drivers/acpi/acpica/utresdecode.c
index 85730fcd7d00..d801d9069841 100644
--- a/drivers/acpi/acpica/utresdecode.c
+++ b/drivers/acpi/acpica/utresdecode.c
@@ -284,4 +284,15 @@ const char *acpi_gbl_ptyp_decode[] = {
 	"Input Schmitt Trigger",
 };
 
+const char *acpi_gbl_clock_input_mode[] = {
+	"Fixed",
+	"Variable",
+};
+
+const char *acpi_gbl_clock_input_scale[] = {
+	"Hz",
+	"KHz",
+	"MHz",
+};
+
 #endif
diff --git a/drivers/acpi/acpica/utresrc.c b/drivers/acpi/acpica/utresrc.c
index 16f9a7035b39..e1cc3d348750 100644
--- a/drivers/acpi/acpica/utresrc.c
+++ b/drivers/acpi/acpica/utresrc.c
@@ -57,6 +57,8 @@ const u8 acpi_gbl_resource_aml_sizes[] = {
 	ACPI_AML_SIZE_LARGE(struct aml_resource_pin_group),
 	ACPI_AML_SIZE_LARGE(struct aml_resource_pin_group_function),
 	ACPI_AML_SIZE_LARGE(struct aml_resource_pin_group_config),
+	ACPI_AML_SIZE_LARGE(struct aml_resource_clock_input),
+
 };
 
 const u8 acpi_gbl_resource_aml_serial_bus_sizes[] = {
@@ -114,6 +116,7 @@ static const u8 acpi_gbl_resource_types[] = {
 	ACPI_VARIABLE_LENGTH,	/* 10 pin_group */
 	ACPI_VARIABLE_LENGTH,	/* 11 pin_group_function */
 	ACPI_VARIABLE_LENGTH,	/* 12 pin_group_config */
+	ACPI_VARIABLE_LENGTH,	/* 13 clock_input */
 };
 
 /*******************************************************************************
diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index e5c875f91203..f913983ef55e 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -536,6 +536,15 @@ struct acpi_resource_pin_config {
 	u8 *vendor_data;
 };
 
+struct acpi_resource_clock_input {
+	u8 revision_id;
+	u8 mode;
+	u8 scale;
+	u16 frequency_divisor;
+	u32 frequency_numerator;
+	struct acpi_resource_source resource_source;
+};
+
 /* Values for pin_config_type field above */
 
 #define ACPI_PIN_CONFIG_DEFAULT                 0
@@ -613,7 +622,8 @@ struct acpi_resource_pin_group_config {
 #define ACPI_RESOURCE_TYPE_PIN_GROUP            22	/* ACPI 6.2 */
 #define ACPI_RESOURCE_TYPE_PIN_GROUP_FUNCTION   23	/* ACPI 6.2 */
 #define ACPI_RESOURCE_TYPE_PIN_GROUP_CONFIG     24	/* ACPI 6.2 */
-#define ACPI_RESOURCE_TYPE_MAX                  24
+#define ACPI_RESOURCE_TYPE_CLOCK_INPUT          25	/* ACPI 6.5 */
+#define ACPI_RESOURCE_TYPE_MAX                  25
 
 /* Master union for resource descriptors */
 
@@ -647,6 +657,7 @@ union acpi_resource_data {
 	struct acpi_resource_pin_group pin_group;
 	struct acpi_resource_pin_group_function pin_group_function;
 	struct acpi_resource_pin_group_config pin_group_config;
+	struct acpi_resource_clock_input clock_input;
 
 	/* Common fields */
 
-- 
cgit v1.2.3


From f2ca92d08325b8d4668a6b3ee4b3d5622b75b952 Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Wed, 5 Apr 2023 15:40:12 +0200
Subject: ACPICA: MADT: Add RISC-V INTC interrupt controller

ACPICA commit bd6d1ae1e13abe78e149c8b61b4bc7bc7feab015

The ECR to add RISC-V INTC interrupt controller is approved by
the UEFI forum and will be available in the next revision of
the ACPI specification.

Link: https://github.com/acpica/acpica/commit/bd6d1ae1
Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index e0f205f9ab4c..9af17cb66faa 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -891,7 +891,8 @@ enum acpi_madt_type {
 	ACPI_MADT_TYPE_MSI_PIC = 21,
 	ACPI_MADT_TYPE_BIO_PIC = 22,
 	ACPI_MADT_TYPE_LPC_PIC = 23,
-	ACPI_MADT_TYPE_RESERVED = 24,	/* 24 to 0x7F are reserved */
+	ACPI_MADT_TYPE_RINTC = 24,
+	ACPI_MADT_TYPE_RESERVED = 25,	/* 25 to 0x7F are reserved */
 	ACPI_MADT_TYPE_OEM_RESERVED = 0x80	/* 0x80 to 0xFF are reserved for OEM use */
 };
 
@@ -1251,6 +1252,24 @@ enum acpi_madt_lpc_pic_version {
 	ACPI_MADT_LPC_PIC_VERSION_RESERVED = 2	/* 2 and greater are reserved */
 };
 
+/* 24: RISC-V INTC */
+struct acpi_madt_rintc {
+	struct acpi_subtable_header header;
+	u8 version;
+	u8 reserved;
+	u32 flags;
+	u64 hart_id;
+	u32 uid;		/* ACPI processor UID */
+};
+
+/* Values for RISC-V INTC Version field above */
+
+enum acpi_madt_rintc_version {
+	ACPI_MADT_RINTC_VERSION_NONE = 0,
+	ACPI_MADT_RINTC_VERSION_V1 = 1,
+	ACPI_MADT_RINTC_VERSION_RESERVED = 2	/* 2 and greater are reserved */
+};
+
 /* 80: OEM data */
 
 struct acpi_madt_oem_data {
-- 
cgit v1.2.3


From 003567a3b70d0dc5adf71ca0b2bd43d2b4a1ee26 Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Wed, 5 Apr 2023 15:40:57 +0200
Subject: ACPICA: Add structure definitions for RISC-V RHCT

ACPICA commit 82afd0434e79f74b96a6be88115ddc8343a1ba40

RISC-V Hart Capabilities Table (RHCT) is a new static table.
The ECR to add RHCT is approved by the UEFI forum and will be
available in the next version of the ACPI spec.

Link: https://github.com/acpica/acpica/commit/82afd043
Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 9af17cb66faa..db292f325696 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -48,6 +48,7 @@
 #define ACPI_SIG_PRMT           "PRMT"	/* Platform Runtime Mechanism Table */
 #define ACPI_SIG_RASF           "RASF"	/* RAS Feature table */
 #define ACPI_SIG_RGRT           "RGRT"	/* Regulatory Graphics Resource Table */
+#define ACPI_SIG_RHCT           "RHCT"	/* RISC-V Hart Capabilities Table */
 #define ACPI_SIG_SBST           "SBST"	/* Smart Battery Specification Table */
 #define ACPI_SIG_SDEI           "SDEI"	/* Software Delegated Exception Interface Table */
 #define ACPI_SIG_SDEV           "SDEV"	/* Secure Devices table */
@@ -2720,6 +2721,53 @@ enum acpi_rgrt_image_type {
 	ACPI_RGRT_TYPE_RESERVED = 2	/* 2 and greater are reserved */
 };
 
+/*******************************************************************************
+ *
+ * RHCT - RISC-V Hart Capabilities Table
+ *        Version 1
+ *
+ ******************************************************************************/
+
+struct acpi_table_rhct {
+	struct acpi_table_header header;	/* Common ACPI table header */
+	u32 reserved;
+	u64 time_base_freq;
+	u32 node_count;
+	u32 node_offset;
+};
+
+/*
+ * RHCT subtables
+ */
+struct acpi_rhct_node_header {
+	u16 type;
+	u16 length;
+	u16 revision;
+};
+
+/* Values for RHCT subtable Type above */
+
+enum acpi_rhct_node_type {
+	ACPI_RHCT_NODE_TYPE_ISA_STRING = 0x0000,
+	ACPI_RHCT_NODE_TYPE_HART_INFO = 0xFFFF,
+};
+
+/*
+ * RHCT node specific subtables
+ */
+
+/* ISA string node structure */
+struct acpi_rhct_isa_string {
+	u16 isa_length;
+	char isa[];
+};
+
+/* Hart Info node structure */
+struct acpi_rhct_hart_info {
+	u16 num_offsets;
+	u32 uid;		/* ACPI processor UID */
+};
+
 /*******************************************************************************
  *
  * SBST - Smart Battery Specification Table
-- 
cgit v1.2.3


From 4cf8a60602f724d07e0c8d4bb639dfd38b36aaa4 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 5 Apr 2023 15:43:28 +0200
Subject: ACPICA: Avoid undefined behavior: member access within null pointer

ACPICA commit 2411e11ef88f42b08f33c38ed9c0d40282780e8c

84449c1eef1c0d092b037dc4c2c60cec5d5cc6c4 fixed this for Linux kernel
builds, but not Linux userspace builds.

Before this change we see the following UBSAN stack trace in Fuchsia:

  ../../third_party/acpica/source/components/tables/tbfadt.c:536:39: runtime error: member access within null pointer of type 'struct acpi_table_fadt' (aka 'struct acpi_table_fadt')
      #0 0x564860b5ee9b in acpi_tb_convert_fadt ../../third_party/acpica/source/components/tables/tbfadt.c:536:39
      #1 0x564860b5edb4 in acpi_tb_create_local_fadt ../../third_party/acpica/source/components/tables/tbfadt.c:461:5
      #2 0x564860b5e5c6 in acpi_tb_parse_fadt ../../third_party/acpica/source/components/tables/tbfadt.c:371:5
      #3 0x564860b5c485 in acpi_tb_parse_root_table ../../third_party/acpica/source/components/tables/tbutils.c:407:13
      #4 0x564860b6401a in acpi_initialize_tables ../../third_party/acpica/source/components/tables/tbxface.c:160:14
      #5 0x5648608fb417 in acpi_host_test::acpi_host_test::init_acpi_with_tables(char const*) ../../src/devices/board/tests/acpi-host-tests/acpi-host-test.cc:36:5
      #6 0x5648608f9095 in acpi_host_test::acpi_host_test_device_is_child_of_scope_test_Test::test_body() ../../src/devices/board/tests/acpi-host-tests/acpi-host-test.cc:85:3
      #7 0x564860c6007e in void testing::internal::handle_seh_exceptions_in_method_if_supported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) ../../third_party/googletest/src/googletest/src/gtest.cc:2609:10
      #8 0x564860bbd5df in void testing::internal::handle_exceptions_in_method_if_supported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) ../../third_party/googletest/src/googletest/src/gtest.cc:2664:12 #9 0x564860bbd141 in testing::Test::Run() ../../third_party/googletest/src/googletest/src/gtest.cc:2684:5  #10 0x564860bbff0a in testing::test_info::Run() ../../third_party/googletest/src/googletest/src/gtest.cc:2864:11   #11 0x564860bc40f1 in testing::test_suite::Run() ../../third_party/googletest/src/googletest/src/gtest.cc:3023:30   #12 0x564860beba40 in testing::internal::unit_test_impl::run_all_tests() ../../third_party/googletest/src/googletest/src/gtest.cc:5882:44
      #13 0x564860c7db6e in bool testing::internal::handle_seh_exceptions_in_method_if_supported<testing::internal::unit_test_impl, bool>(testing::internal::unit_test_impl*, bool (testing::internal::unit_test_impl::*)(), char const*) ../../third_party/googletest/src/googletest/src/gtest.cc:2609:10
      #14 0x564860bea71f in bool testing::internal::handle_exceptions_in_method_if_supported<testing::internal::unit_test_impl, bool>(testing::internal::unit_test_impl*, bool (testing::internal::unit_test_impl::*)(), char const*) ../../third_party/googletest/src/googletest/src/gtest.cc:2664:12 #15 0x564860bea1c5 in testing::unit_test::Run() ../../third_party/googletest/src/googletest/src/gtest.cc:5456:10 #16 0x5648608fccc0 in RUN_ALL_TESTS() ../../third_party/googletest/src/googletest/include/gtest/gtest.h:2304:73 #17 0x5648608fcb7e in main ../../src/devices/board/tests/acpi-host-tests/acpi-host-test.cc:121:10 #18 0x7f6defa2d189  (/lib/x86_64-linux-gnu/libc.so.6+0x27189) (build_id: c4f6727c560b1c33527ff9e0ca0cef13a7db64d2)
      #19 0x7f6defa2d244 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x27244) (build_id: c4f6727c560b1c33527ff9e0ca0cef13a7db64d2)
      #20 0x56486082e598  (/usr/local/google/home/tamird/src/fuchsia/out/core.x64/host_x64/acpi-host-test-bin+0x359598) (build_id: 851423b0e664df6a)

Link: https://github.com/acpica/acpica/commit/2411e11e
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/platform/aclinux.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 66285e054e1e..1ca450e35c0d 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -180,7 +180,10 @@
 #define ACPI_USE_STANDARD_HEADERS
 
 #ifdef ACPI_USE_STANDARD_HEADERS
+#include <stddef.h>
 #include <unistd.h>
+
+#define ACPI_OFFSET(d, f)   offsetof(d, f)
 #endif
 
 /* Define/disable kernel-specific declarators */
-- 
cgit v1.2.3


From 94bf7c8a626d71a7c206b3e0cdfb43aca305c862 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:48:10 +0200
Subject: ACPICA: struct acpi_resource_vendor: Replace 1-element array with
 flexible array

ACPICA commit 446d05d5ea77946b8b3b8d0c638d1a446b18503e

Similar to commit 7ba2f3d91a32 ("Replace one-element array with
flexible-array"), replace the 1-element array with a proper
flexible array member as defined by C99. This allows the code to
operate without tripping compile-time and run-time bounds checkers
(e.g. via __builtin_object_size(), -fsanitize=bounds, and/or
-fstrict-flex-arrays=3).

No binary changes appear in the .text nor .data sections.

Link: https://github.com/acpica/acpica/commit/446d05d5
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acrestyp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index f913983ef55e..77d735594ac2 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -194,7 +194,7 @@ struct acpi_resource_fixed_dma {
 
 struct acpi_resource_vendor {
 	u16 byte_length;
-	u8 byte_data[1];
+	u8 byte_data[];
 };
 
 /* Vendor resource with UUID info (introduced in ACPI 3.0) */
@@ -203,7 +203,7 @@ struct acpi_resource_vendor_typed {
 	u16 byte_length;
 	u8 uuid_subtype;
 	u8 uuid[ACPI_UUID_LENGTH];
-	u8 byte_data[1];
+	u8 byte_data[];
 };
 
 struct acpi_resource_end_tag {
-- 
cgit v1.2.3


From 48ff467c338b184bc309ea2e0d2a9524f569d245 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:48:56 +0200
Subject: ACPICA: actbl1: Replace 1-element arrays with flexible arrays

ACPICA commit 8c9bd5d151f77767b2fd937911848b7159dc8ee9

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3).

No .text nor .data differences result from this change.

Link: https://github.com/acpica/acpica/commit/8c9bd5d1
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl1.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 7a3ad28b86c6..6ee7437537db 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -944,7 +944,7 @@ struct acpi_table_drtm {
 
 struct acpi_drtm_vtable_list {
 	u32 validated_table_count;
-	u64 validated_tables[1];
+	u64 validated_tables[];
 };
 
 /* 2) Resources List (of Resource Descriptors) */
@@ -959,7 +959,7 @@ struct acpi_drtm_resource {
 
 struct acpi_drtm_resource_list {
 	u32 resource_count;
-	struct acpi_drtm_resource resources[1];
+	struct acpi_drtm_resource resources[];
 };
 
 /* 3) Platform-specific Identifiers List */
@@ -982,7 +982,7 @@ struct acpi_table_ecdt {
 	struct acpi_generic_address data;	/* Address of EC data register */
 	u32 uid;		/* Unique ID - must be same as the EC _UID method */
 	u8 gpe;			/* The GPE for the EC */
-	u8 id[1];		/* Full namepath of the EC in the ACPI namespace */
+	u8 id[];		/* Full namepath of the EC in the ACPI namespace */
 };
 
 /*******************************************************************************
-- 
cgit v1.2.3


From 74522fea27f8a0695f529089130da64ac98f1761 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:50:18 +0200
Subject: ACPICA: actbl2: Replace 1-element arrays with flexible arrays

ACPICA commit 44f1af0664599e87bebc3a1260692baa27b2f264

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3).

The sizeof() uses with struct acpi_nfit_flush_address and struct
acpi_nfit_smbios have been adjusted to drop the open-coded subtraction
of the trailing single element. The result is no binary differences in
.text nor .data sections.

Link: https://github.com/acpica/acpica/commit/44f1af06
Signed-off-by: Bob Moore <robert.moore@intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/nfit/core.c         |  4 ++--
 include/acpi/actbl2.h            | 16 ++++++++--------
 tools/testing/nvdimm/test/nfit.c |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 4e48d6db05eb..85d9d67e38a4 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -894,7 +894,7 @@ static size_t sizeof_flush(struct acpi_nfit_flush_address *flush)
 {
 	if (flush->header.length < sizeof(*flush))
 		return 0;
-	return sizeof(*flush) + sizeof(u64) * (flush->hint_count - 1);
+	return struct_size(flush, hint_address, flush->hint_count);
 }
 
 static bool add_flush(struct acpi_nfit_desc *acpi_desc,
@@ -3477,7 +3477,7 @@ static __init int nfit_init(void)
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 64);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20);
-	BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 8);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_capabilities) != 16);
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index db292f325696..6d3251ea4c53 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -397,7 +397,7 @@ struct acpi_iort_node {
 	u32 identifier;
 	u32 mapping_count;
 	u32 mapping_offset;
-	char node_data[1];
+	char node_data[];
 };
 
 /* Values for subtable Type above */
@@ -453,14 +453,14 @@ struct acpi_iort_memory_access {
  */
 struct acpi_iort_its_group {
 	u32 its_count;
-	u32 identifiers[1];	/* GIC ITS identifier array */
+	u32 identifiers[];	/* GIC ITS identifier array */
 };
 
 struct acpi_iort_named_component {
 	u32 node_flags;
 	u64 memory_properties;	/* Memory access properties */
 	u8 memory_address_limit;	/* Memory address size limit */
-	char device_name[1];	/* Path of namespace object */
+	char device_name[];	/* Path of namespace object */
 };
 
 /* Masks for Flags field above */
@@ -474,7 +474,7 @@ struct acpi_iort_root_complex {
 	u32 pci_segment_number;
 	u8 memory_address_limit;	/* Memory address size limit */
 	u16 pasid_capabilities;	/* PASID Capabilities */
-	u8 reserved[1];		/* Reserved, must be zero */
+	u8 reserved[];		/* Reserved, must be zero */
 };
 
 /* Masks for ats_attribute field above */
@@ -496,7 +496,7 @@ struct acpi_iort_smmu {
 	u32 context_interrupt_offset;
 	u32 pmu_interrupt_count;
 	u32 pmu_interrupt_offset;
-	u64 interrupts[1];	/* Interrupt array */
+	u64 interrupts[];	/* Interrupt array */
 };
 
 /* Values for Model field above */
@@ -975,7 +975,7 @@ struct acpi_madt_local_sapic {
 	u8 reserved[3];		/* Reserved, must be zero */
 	u32 lapic_flags;
 	u32 uid;		/* Numeric UID - ACPI 3.0 */
-	char uid_string[1];	/* String UID  - ACPI 3.0 */
+	char uid_string[];	/* String UID  - ACPI 3.0 */
 };
 
 /* 8: Platform Interrupt Source */
@@ -1708,7 +1708,7 @@ struct acpi_nfit_interleave {
 struct acpi_nfit_smbios {
 	struct acpi_nfit_header header;
 	u32 reserved;		/* Reserved, must be zero */
-	u8 data[1];		/* Variable length */
+	u8 data[];		/* Variable length */
 };
 
 /* 4: NVDIMM Control Region Structure */
@@ -1765,7 +1765,7 @@ struct acpi_nfit_flush_address {
 	u32 device_handle;
 	u16 hint_count;
 	u8 reserved[6];		/* Reserved, must be zero */
-	u64 hint_address[1];	/* Variable length */
+	u64 hint_address[];	/* Variable length */
 };
 
 /* 7: Platform Capabilities Structure */
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index c75abb497a1a..745c4a27bc35 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1878,14 +1878,14 @@ static size_t sizeof_spa(struct acpi_nfit_system_address *spa)
 static int nfit_test0_alloc(struct nfit_test *t)
 {
 	struct acpi_nfit_system_address *spa = NULL;
+	struct acpi_nfit_flush_address *flush;
 	size_t nfit_size = sizeof_spa(spa) * NUM_SPA
 			+ sizeof(struct acpi_nfit_memory_map) * NUM_MEM
 			+ sizeof(struct acpi_nfit_control_region) * NUM_DCR
 			+ offsetof(struct acpi_nfit_control_region,
 					window_size) * NUM_DCR
 			+ sizeof(struct acpi_nfit_data_region) * NUM_BDW
-			+ (sizeof(struct acpi_nfit_flush_address)
-					+ sizeof(u64) * NUM_HINTS) * NUM_DCR
+			+ struct_size(flush, hint_address, NUM_HINTS) * NUM_DCR
 			+ sizeof(struct acpi_nfit_capabilities);
 	int i;
 
-- 
cgit v1.2.3


From 2a5ab99847bd41ad5f0461f519d0825f163874a6 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:51:25 +0200
Subject: ACPICA: struct acpi_nfit_interleave: Replace 1-element array with
 flexible array

ACPICA commit e66decc6fca36b59194b0947d87d6a9bec078bc3

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3).

Unlike struct acpi_nfit_flush_address and struct acpi_nfit_smbios, which
had their sizeof() uses adjusted in code, struct acpi_nfit_interleave did
not. This appears to have been a bug. After this change, there is a binary
difference in acpi_dm_dump_nfit() since the size of the structure now has
the correct size, as the prior result was including the trailing U32:

-       mov    $0x14,%ebp
+       mov    $0x10,%ebp

Link: https://github.com/acpica/acpica/commit/e66decc6
Signed-off-by: Bob Moore <robert.moore@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/nfit/core.c | 2 +-
 include/acpi/actbl2.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 85d9d67e38a4..07204d482968 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -3476,7 +3476,7 @@ static __init int nfit_init(void)
 	BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 64);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48);
-	BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 16);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 8);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80);
 	BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40);
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 6d3251ea4c53..a51fd4090d27 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1700,7 +1700,7 @@ struct acpi_nfit_interleave {
 	u16 reserved;		/* Reserved, must be zero */
 	u32 line_count;
 	u32 line_size;
-	u32 line_offset[1];	/* Variable length */
+	u32 line_offset[];	/* Variable length */
 };
 
 /* 3: SMBIOS Management Information Structure */
-- 
cgit v1.2.3


From 11132ad0176e9a6a847a9bfca77c39c2857cf85b Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:52:23 +0200
Subject: ACPICA: Introduce ACPI_FLEX_ARRAY

ACPICA commit e73b227e8e475c20cc394f237ea35d592fdf9ec3

In order to enable using -fstrict-flex-arrays with GCC and Clang in the
Linux kernel, each trailing dynamically sized array must be defined as
proper C99 "flexible array members" (FAM). Unfortunately, ACPICA has a
bunch of technical debt, dating back to before even the GNU extension of
0-length arrays, meaning the code base has many 1-element and 0-length
arrays defined at the end of structures that should actually be FAMs.

One limitation of the C99 FAM specification is the accidental requirement
that they cannot be in unions or alone in structs. There is no real-world
reason for this, though, and, actually, the existing GNU extension
permits this for 0-length arrays (which get treated as FAMs).

Add the ACPI_FLEX_ARRAY() helper macro to work around this requirement
so that FAMs can be defined in unions or alone in structs. Since this
behavior still depends on GNU extensions, keep the macro specific to GCC
(and Clang) builds. In this way, MSVC will continue to use 0-length
arrays (since it does not support the union work-around). When MSVC
grows support for this in the future, the macro can be updated.

Link: https://github.com/acpica/acpica/commit/e73b227e
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actypes.h        |  4 ++++
 include/acpi/platform/acgcc.h | 11 +++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index ce1db1c2e9d3..cf6ee2cb0eb9 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -1323,4 +1323,8 @@ typedef enum {
 #define ACPI_FALLTHROUGH do {} while(0)
 #endif
 
+#ifndef ACPI_FLEX_ARRAY
+#define ACPI_FLEX_ARRAY(TYPE, NAME)     TYPE NAME[0]
+#endif
+
 #endif				/* __ACTYPES_H__ */
diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h
index c9d418f9395e..04b4bf620517 100644
--- a/include/acpi/platform/acgcc.h
+++ b/include/acpi/platform/acgcc.h
@@ -61,4 +61,15 @@
 #define ACPI_FALLTHROUGH __attribute__((__fallthrough__))
 #endif
 
+/*
+ * Flexible array members are not allowed to be part of a union under
+ * C99, but this is not for any technical reason. Work around the
+ * limitation.
+ */
+#define ACPI_FLEX_ARRAY(TYPE, NAME)             \
+        struct {                                \
+                struct { } __Empty_ ## NAME;    \
+                TYPE NAME[];                    \
+        }
+
 #endif				/* __ACGCC_H__ */
-- 
cgit v1.2.3


From 6671709c6ee5a9970ad94fe80823dc2fab711f84 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:53:20 +0200
Subject: ACPICA: struct acpi_resource_dma: Replace 1-element array with
 flexible array

ACPICA commit 8409bb869a1790f6e02391c3f0eaf9c5fa63e33f

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99,
but without changing the structure size.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3). As with IRQs, leave a single element in
a union.

No binary changes appear in the .text nor .data sections.

Link: https://github.com/acpica/acpica/commit/8409bb86
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acrestyp.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index 77d735594ac2..bfba293787ba 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -150,7 +150,10 @@ struct acpi_resource_dma {
 	u8 bus_master;
 	u8 transfer;
 	u8 channel_count;
-	u8 channels[1];
+	union {
+		u8 channel;
+		 ACPI_FLEX_ARRAY(u8, channels);
+	};
 };
 
 struct acpi_resource_start_dependent {
-- 
cgit v1.2.3


From 3a287932da6a97c84437669a6fec1486401893d2 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:54:25 +0200
Subject: ACPICA: acpi_pci_routing_table: Replace fixed-size array with flex
 array member

ACPICA commit f4a3afd78c28dede0907f47951f0b73c9a776d4e

The "Source" array is actually a dynamically sized array, but it
is defined as a fixed-size 4 byte array. This results in tripping
both compile-time and run-time bounds checkers (e.g. via either
__builtin_object_size() or -fsanitize=bounds).

To retain the padding, create a union with an unused Pad variable of
size 4, and redefine Source as a proper flexible array member.

No binary changes appear in the .text nor .data sections.

Link: https://github.com/acpica/acpica/commit/f4a3afd7
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acrestyp.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index bfba293787ba..87d58afca79e 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -693,7 +693,10 @@ struct acpi_pci_routing_table {
 	u32 pin;
 	u64 address;		/* here for 64-bit alignment */
 	u32 source_index;
-	char source[4];		/* pad to 64 bits so sizeof() works in all cases */
+	union {
+		char pad[4];	/* pad to 64 bits so sizeof() works in all cases */
+		 ACPI_FLEX_ARRAY(char, source);
+	};
 };
 
 #endif				/* __ACRESTYP_H__ */
-- 
cgit v1.2.3


From 49bd783e4fe19c03d3ce97ab066e865acc6c7b15 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:55:13 +0200
Subject: ACPICA: acpi_dmar_andd: Replace 1-element array with flexible array

ACPICA commit 3c19ae70424e9ab1e1b805203d300d2660f9a2f7

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3).

The handling of struct acpi_dmar_andd by acpi_dm_dump_dmar() appears to
expect a single trailing char for calculating table offsets. Keep a char
in the union to avoid any code changes appearing in the .text or .data
sections.

Link: https://github.com/acpica/acpica/commit/3c19ae70
Signed-off-by: Kees Cook <kees@outflux.net>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl1.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 6ee7437537db..58b0490a2ad1 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -901,7 +901,10 @@ struct acpi_dmar_andd {
 	struct acpi_dmar_header header;
 	u8 reserved[3];
 	u8 device_number;
-	char device_name[1];
+	union {
+		char __pad;
+		 ACPI_FLEX_ARRAY(char, device_name);
+	};
 };
 
 /* 5: SOC Integrated Address Translation Cache Reporting Structure */
-- 
cgit v1.2.3


From 2a85fc5626797f9057311a419b1d5d847d86c527 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:56:06 +0200
Subject: ACPICA: acpi_madt_oem_data: Fix flexible array member definition

ACPICA commit e7f6d8c1b7f79eb4b9b07f1bc09c549a2acbd6e8

Use ACPI_FLEX_ARRAY() helper to define flexible array member alone in a
struct. Fixes issue #812.

No binary changes appear in the .text nor .data sections.

Link: https://github.com/acpica/acpica/commit/e7f6d8c1
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index a51fd4090d27..0029336775a9 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1274,7 +1274,7 @@ enum acpi_madt_rintc_version {
 /* 80: OEM data */
 
 struct acpi_madt_oem_data {
-	u8 oem_data[0];
+	ACPI_FLEX_ARRAY(u8, oem_data);
 };
 
 /*
-- 
cgit v1.2.3


From 376b0fb3ad621ca507c1bdf5636232959bf827e6 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Wed, 5 Apr 2023 15:56:59 +0200
Subject: ACPICA: acpi_resource_irq: Replace 1-element arrays with flexible
 array

ACPICA commit bfdd3446e7caf795c85c70326c137023942972c5

Similar to "Replace one-element array with flexible-array", replace the
1-element array with a proper flexible array member as defined by C99.

This allows the code to operate without tripping compile-time and run-
time bounds checkers (e.g. via __builtin_object_size(), -fsanitize=bounds,
and/or -fstrict-flex-arrays=3). Note that the spec requires there be at
least one interrupt, so use a union to keep space allocated for this.

The only binary change in .text and .data sections is some rearrangement
by the compiler of acpi_dm_address_common(), but appears to be harmless.

Link: https://github.com/acpica/acpica/commit/bfdd3446
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/amlresrc.h |  5 ++++-
 include/acpi/acrestyp.h        | 10 ++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h
index 48df447ef5bb..4e88f9fc2a28 100644
--- a/drivers/acpi/acpica/amlresrc.h
+++ b/drivers/acpi/acpica/amlresrc.h
@@ -261,7 +261,10 @@ struct aml_resource_address16 {
 struct aml_resource_extended_irq {
 	AML_RESOURCE_LARGE_HEADER_COMMON u8 flags;
 	u8 interrupt_count;
-	u32 interrupts[1];
+	union {
+		u32 interrupt;
+		 ACPI_FLEX_ARRAY(u32, interrupts);
+	};
 	/* res_source_index, res_source optional fields follow */
 };
 
diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index 87d58afca79e..efef208b0324 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -142,7 +142,10 @@ struct acpi_resource_irq {
 	u8 shareable;
 	u8 wake_capable;
 	u8 interrupt_count;
-	u8 interrupts[1];
+	union {
+		u8 interrupt;
+		 ACPI_FLEX_ARRAY(u8, interrupts);
+	};
 };
 
 struct acpi_resource_dma {
@@ -335,7 +338,10 @@ struct acpi_resource_extended_irq {
 	u8 wake_capable;
 	u8 interrupt_count;
 	struct acpi_resource_source resource_source;
-	u32 interrupts[1];
+	union {
+		u32 interrupt;
+		 ACPI_FLEX_ARRAY(u32, interrupts);
+	};
 };
 
 struct acpi_resource_generic_register {
-- 
cgit v1.2.3


From beadd51f74f606105b558ec69bbbe2305066d0ea Mon Sep 17 00:00:00 2001
From: Najumon <najumon.ba@intel.com>
Date: Wed, 5 Apr 2023 15:59:00 +0200
Subject: ACPICA: add os specific support for Zephyr RTOS

ACPICA commit 463d30f0a8edc5dccad20c2d189dc55111d51aae

The acpica will be integrated as module into Zephyr project for
enable acpi bus driver. This patch is for enable os specific
support layer for Zephyr.

Link: https://github.com/acpica/acpica/commit/463d30f0
Signed-off-by: Najumon <najumon.ba@intel.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/platform/acenv.h    |  2 ++
 include/acpi/platform/aczephyr.h | 48 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 include/acpi/platform/aczephyr.h

(limited to 'include')

diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h
index 0fa7bbf85c83..337ffa931ee8 100644
--- a/include/acpi/platform/acenv.h
+++ b/include/acpi/platform/acenv.h
@@ -209,6 +209,8 @@
 #elif defined(_AED_EFI) || defined(_GNU_EFI) || defined(_EDK2_EFI)
 #include "acefi.h"
 
+#elif defined(__ZEPHYR__)
+#include "aczephyr.h"
 #else
 
 /* Unknown environment */
diff --git a/include/acpi/platform/aczephyr.h b/include/acpi/platform/aczephyr.h
new file mode 100644
index 000000000000..2f0d30c3c5fd
--- /dev/null
+++ b/include/acpi/platform/aczephyr.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/******************************************************************************
+ *
+ * Module Name: aczephyr.h - OS specific defines, etc.
+ *
+ * Copyright (C) 2000 - 2023, Intel Corp.
+ *
+ *****************************************************************************/
+
+#ifndef __ACZEPHYR_H__
+#define __ACZEPHYR_H__
+
+#define SEEK_SET FS_SEEK_SET
+#define SEEK_END FS_SEEK_END
+
+#define ACPI_MACHINE_WIDTH      64
+
+#define ACPI_NO_ERROR_MESSAGES
+#undef ACPI_DEBUG_OUTPUT
+#define ACPI_USE_SYSTEM_CLIBRARY
+#undef ACPI_DBG_TRACK_ALLOCATIONS
+#define ACPI_SINGLE_THREADED
+#define ACPI_USE_NATIVE_RSDP_POINTER
+
+#include <zephyr/kernel.h>
+#include <zephyr/device.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <zephyr/fs/fs.h>
+#include <zephyr/sys/printk.h>
+#include <zephyr/sys/__assert.h>
+
+/******************************************************************************
+ *
+ * FUNCTION:    acpi_enable_dbg_print
+ *
+ * PARAMETERS:  Enable, 	            - Enable/Disable debug print
+ *
+ * RETURN:      None
+ *
+ * DESCRIPTION: Enable/disable debug print
+ *
+ *****************************************************************************/
+
+void acpi_enable_dbg_print(bool enable);
+#endif
-- 
cgit v1.2.3


From df2286655ce126ed6f54a76e1cb66d53cd185242 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 5 Apr 2023 15:59:37 +0200
Subject: ACPICA: Update version to 20230331

ACPICA commit 4578e0e94d945e56547749316691017880c8ee74

Version 20230331.

Link: https://github.com/acpica/acpica/commit/4578e0e9
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpixf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index c961f0d99c5e..e6098a08c914 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -12,7 +12,7 @@
 
 /* Current ACPICA subsystem version in YYYYMMDD format */
 
-#define ACPI_CA_VERSION                 0x20221020
+#define ACPI_CA_VERSION                 0x20230331
 
 #include <acpi/acconfig.h>
 #include <acpi/actypes.h>
-- 
cgit v1.2.3


From 45166620b7254a9e9329a2b6cdfa369bc3ca9c5b Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 5 Apr 2023 15:27:24 -0500
Subject: ACPI: Replace irqdomain.h include with struct declarations

linux/acpi.h includes irqdomain.h which includes of.h. Break the include
chain by replacing the irqdomain include with forward declarations for
struct irq_domain and irq_domain_ops which is sufficient for acpi.h.

of.h also includes mod_devicetable.h which many drivers implicitly
depend on. As acpi.h already includes it, just move it out of the
'#ifdef CONFIG_ACPI'.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index efff750f326d..96bd672dc336 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -10,12 +10,15 @@
 
 #include <linux/errno.h>
 #include <linux/ioport.h>	/* for struct resource */
-#include <linux/irqdomain.h>
 #include <linux/resource_ext.h>
 #include <linux/device.h>
+#include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
 
+struct irq_domain;
+struct irq_domain_ops;
+
 #ifndef _LINUX
 #define _LINUX
 #endif
@@ -24,7 +27,6 @@
 #ifdef	CONFIG_ACPI
 
 #include <linux/list.h>
-#include <linux/mod_devicetable.h>
 #include <linux/dynamic_debug.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-- 
cgit v1.2.3


From 195d8e5da3acb17c5357526494f818a21e97cd10 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Wed, 29 Mar 2023 13:13:11 -0700
Subject: PCI/MSI: Provide missing stub for pci_msix_can_alloc_dyn()

pci_msix_can_alloc_dyn() is not declared when CONFIG_PCI_MSI is disabled.

There is no existing user of pci_msix_can_alloc_dyn() but work is in
progress to change this. This work encounters the following error when
CONFIG_PCI_MSI is disabled:

  drivers/vfio/pci/vfio_pci_intrs.c:427:21: error: implicit declaration of function 'pci_msix_can_alloc_dyn' [-Werror=implicit-function-declaration]

Provide definition for pci_msix_can_alloc_dyn() in preparation for users
that need to compile when CONFIG_PCI_MSI is disabled.

[bhelgaas: Also reported by Arnd Bergmann <arnd@kernel.org> in
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c; added his Fixes: line]

Fixes: fb0a6a268dcd ("net/mlx5: Provide external API for allocating vectors")
Fixes: 34026364df8e ("PCI/MSI: Provide post-enable dynamic allocation interfaces for MSI-X")
Link: https://lore.kernel.org/oe-kbuild-all/202303291000.PWFqGCxH-lkp@intel.com/
Link: https://lore.kernel.org/r/310ecc4815dae4174031062f525245f0755c70e2.1680119924.git.reinette.chatre@intel.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Cc: stable@vger.kernel.org	# v6.2+
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index fafd8020c6d7..5e2dab675943 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1623,6 +1623,8 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
 					      flags, NULL);
 }
 
+static inline bool pci_msix_can_alloc_dyn(struct pci_dev *dev)
+{ return false; }
 static inline struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index,
 						   const struct irq_affinity_desc *affdesc)
 {
-- 
cgit v1.2.3


From 8751d15426a31baaf40f7570263c27c3e5d1dc44 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 6 Apr 2023 14:20:12 +0100
Subject: io_uring: reduce scheduling due to tw

Every task_work will try to wake the task to be executed, which causes
excessive scheduling and additional overhead. For some tw it's
justified, but others won't do much but post a single CQE.

When a task waits for multiple cqes, every such task_work will wake it
up. Instead, the task may give a hint about how many cqes it waits for,
io_req_local_work_add() will compare against it and skip wake ups
if #cqes + #tw is not enough to satisfy the waiting condition. Task_work
that uses the optimisation should be simple enough and never post more
than one CQE. It's also ignored for non DEFER_TASKRUN rings.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +-
 io_uring/io_uring.c            | 68 +++++++++++++++++++++++++++++-------------
 io_uring/io_uring.h            |  9 ++++++
 io_uring/notif.c               |  2 +-
 io_uring/notif.h               |  2 +-
 io_uring/rw.c                  |  2 +-
 6 files changed, 61 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4a6ce03a4903..fa621a508a01 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -296,7 +296,7 @@ struct io_ring_ctx {
 		spinlock_t		completion_lock;
 
 		bool			poll_multi_queue;
-		bool			cq_waiting;
+		atomic_t		cq_wait_nr;
 
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
@@ -566,6 +566,7 @@ struct io_kiocb {
 	atomic_t			refs;
 	atomic_t			poll_refs;
 	struct io_task_work		io_task_work;
+	unsigned			nr_tw;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	union {
 		struct hlist_node	hash_node;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 786ecfa01c54..8a327a81beaf 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1300,35 +1300,59 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
 	}
 }
 
-static void io_req_local_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	unsigned nr_wait, nr_tw, nr_tw_prev;
 	struct llist_node *first;
 
+	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+		flags &= ~IOU_F_TWQ_LAZY_WAKE;
+
 	first = READ_ONCE(ctx->work_llist.first);
 	do {
+		nr_tw_prev = 0;
+		if (first) {
+			struct io_kiocb *first_req = container_of(first,
+							struct io_kiocb,
+							io_task_work.node);
+			/*
+			 * Might be executed at any moment, rely on
+			 * SLAB_TYPESAFE_BY_RCU to keep it alive.
+			 */
+			nr_tw_prev = READ_ONCE(first_req->nr_tw);
+		}
+		nr_tw = nr_tw_prev + 1;
+		/* Large enough to fail the nr_wait comparison below */
+		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
+			nr_tw = -1U;
+
+		req->nr_tw = nr_tw;
 		req->io_task_work.node.next = first;
 	} while (!try_cmpxchg(&ctx->work_llist.first, &first,
 			      &req->io_task_work.node));
 
-	if (first)
-		return;
-
-	/* needed for the following wake up */
-	smp_mb__after_atomic();
-
-	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
-		io_move_task_work_from_local(ctx);
-		return;
+	if (!first) {
+		if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
+			io_move_task_work_from_local(ctx);
+			return;
+		}
+		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+		if (ctx->has_evfd)
+			io_eventfd_signal(ctx);
 	}
 
-	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
-
-	if (READ_ONCE(ctx->cq_waiting))
-		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+	nr_wait = atomic_read(&ctx->cq_wait_nr);
+	/* no one is waiting */
+	if (!nr_wait)
+		return;
+	/* either not enough or the previous add has already woken it up */
+	if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
+		return;
+	/* pairs with set_current_state() in io_cqring_wait() */
+	smp_mb__after_atomic();
+	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
@@ -1339,7 +1363,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 	if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
 	    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
 		rcu_read_lock();
-		io_req_local_work_add(req);
+		io_req_local_work_add(req, flags);
 		rcu_read_unlock();
 		return;
 	}
@@ -2625,7 +2649,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			WRITE_ONCE(ctx->cq_waiting, 1);
+			int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
+
+			atomic_set(&ctx->cq_wait_nr, nr_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
 			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
@@ -2634,7 +2660,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 		ret = io_cqring_wait_schedule(ctx, &iowq);
 		__set_current_state(TASK_RUNNING);
-		WRITE_ONCE(ctx->cq_waiting, 0);
+		atomic_set(&ctx->cq_wait_nr, 0);
 
 		if (ret < 0)
 			break;
@@ -4517,7 +4543,7 @@ static int __init io_uring_init(void)
 	io_uring_optable_init();
 
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
-				SLAB_ACCOUNT);
+				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index cb4309a2acdc..ef449e43d493 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -18,6 +18,15 @@
 enum {
 	/* don't use deferred task_work */
 	IOU_F_TWQ_FORCE_NORMAL			= 1,
+
+	/*
+	 * A hint to not wake right away but delay until there are enough of
+	 * tw's queued to match the number of CQEs the task is waiting for.
+	 *
+	 * Must not be used wirh requests generating more than one CQE.
+	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+	 */
+	IOU_F_TWQ_LAZY_WAKE			= 2,
 };
 
 enum {
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 172105eb347d..e1846a25dde1 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	struct io_kiocb *notif = cmd_to_io_kiocb(nd);
 
 	if (refcount_dec_and_test(&uarg->refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
diff --git a/io_uring/notif.h b/io_uring/notif.h
index c88c800cd89d..6dd1b30a468f 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
 
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index f14868624f41..6c7d2654770e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
 	req->io_task_work.func = io_req_rw_complete;
-	io_req_task_work_add(req);
+	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
-- 
cgit v1.2.3


From fe4e5efa401fe15eab9259e358730145449e83a2 Mon Sep 17 00:00:00 2001
From: Jiaxun Yang <jiaxun.yang@flygoat.com>
Date: Sat, 1 Apr 2023 10:15:29 +0100
Subject: dma-mapping: provide a fallback dma_default_coherent

dma_default_coherent was decleared unconditionally at kernel/dma/mapping.c
but only decleared when any of non-coherent options is enabled in
dma-map-ops.h.

Guard the declaration in mapping.c with non-coherent options and provide
a fallback definition.

Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-map-ops.h | 2 ++
 kernel/dma/mapping.c        | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 41bf4bdb117a..31f114f486c4 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -269,6 +269,8 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 	return dev->dma_coherent;
 }
 #else
+#define dma_default_coherent true
+
 static inline bool dev_is_dma_coherent(struct device *dev)
 {
 	return true;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 68106e3791f6..80f9663ffe26 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -17,7 +17,11 @@
 #include "debug.h"
 #include "direct.h"
 
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 bool dma_default_coherent;
+#endif
 
 /*
  * Managed DMA API
-- 
cgit v1.2.3


From 05f3ab7780b3c0cfe26a8134606bdf641c4f4bb2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 5 Apr 2023 17:10:25 +0200
Subject: net: ethernet: mtk_eth_soc: add code for offloading flows from wlan
 devices

WED version 2 (on MT7986 and later) can offload flows originating from
wireless devices.
In order to make that work, ndo_setup_tc needs to be implemented on the
netdevs. This adds the required code to offload flows coming in from WED,
while keeping track of the incoming wed index used for selecting the
correct PPE device.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.h     |   3 +
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |  40 ++++++----
 drivers/net/ethernet/mediatek/mtk_wed.c         | 101 ++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk_wed.h            |   6 ++
 4 files changed, 136 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index 23c7abeb5c14..cdcf8534283e 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -1276,6 +1276,9 @@ int mtk_gmac_rgmii_path_setup(struct mtk_eth *eth, int mac_id);
 int mtk_eth_offload_init(struct mtk_eth *eth);
 int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
 		     void *type_data);
+int mtk_flow_offload_cmd(struct mtk_eth *eth, struct flow_cls_offload *cls,
+			 int ppe_index);
+void mtk_flow_offload_cleanup(struct mtk_eth *eth, struct list_head *list);
 void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev);
 
 
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index 46634dc29d2f..02eebff02d45 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -235,7 +235,8 @@ out:
 }
 
 static int
-mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f)
+mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f,
+			 int ppe_index)
 {
 	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_action_entry *act;
@@ -452,6 +453,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f)
 	entry->cookie = f->cookie;
 	memcpy(&entry->data, &foe, sizeof(entry->data));
 	entry->wed_index = wed_index;
+	entry->ppe_index = ppe_index;
 
 	err = mtk_foe_entry_commit(eth->ppe[entry->ppe_index], entry);
 	if (err < 0)
@@ -520,25 +522,15 @@ mtk_flow_offload_stats(struct mtk_eth *eth, struct flow_cls_offload *f)
 
 static DEFINE_MUTEX(mtk_flow_offload_mutex);
 
-static int
-mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+int mtk_flow_offload_cmd(struct mtk_eth *eth, struct flow_cls_offload *cls,
+			 int ppe_index)
 {
-	struct flow_cls_offload *cls = type_data;
-	struct net_device *dev = cb_priv;
-	struct mtk_mac *mac = netdev_priv(dev);
-	struct mtk_eth *eth = mac->hw;
 	int err;
 
-	if (!tc_can_offload(dev))
-		return -EOPNOTSUPP;
-
-	if (type != TC_SETUP_CLSFLOWER)
-		return -EOPNOTSUPP;
-
 	mutex_lock(&mtk_flow_offload_mutex);
 	switch (cls->command) {
 	case FLOW_CLS_REPLACE:
-		err = mtk_flow_offload_replace(eth, cls);
+		err = mtk_flow_offload_replace(eth, cls, ppe_index);
 		break;
 	case FLOW_CLS_DESTROY:
 		err = mtk_flow_offload_destroy(eth, cls);
@@ -555,6 +547,26 @@ mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri
 	return err;
 }
 
+static int
+mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{
+	struct flow_cls_offload *cls = type_data;
+	struct net_device *dev = cb_priv;
+	struct mtk_mac *mac;
+	struct mtk_eth *eth;
+
+	mac = netdev_priv(dev);
+	eth = mac->hw;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	return mtk_flow_offload_cmd(eth, cls, 0);
+}
+
 static int
 mtk_eth_setup_tc_block(struct net_device *dev, struct flow_block_offload *f)
 {
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 95d890870984..4c205afbd230 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -13,6 +13,8 @@
 #include <linux/mfd/syscon.h>
 #include <linux/debugfs.h>
 #include <linux/soc/mediatek/mtk_wed.h>
+#include <net/flow_offload.h>
+#include <net/pkt_cls.h>
 #include "mtk_eth_soc.h"
 #include "mtk_wed_regs.h"
 #include "mtk_wed.h"
@@ -41,6 +43,11 @@
 static struct mtk_wed_hw *hw_list[2];
 static DEFINE_MUTEX(hw_lock);
 
+struct mtk_wed_flow_block_priv {
+	struct mtk_wed_hw *hw;
+	struct net_device *dev;
+};
+
 static void
 wed_m32(struct mtk_wed_device *dev, u32 reg, u32 mask, u32 val)
 {
@@ -1745,6 +1752,99 @@ out:
 	mutex_unlock(&hw_lock);
 }
 
+static int
+mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{
+	struct mtk_wed_flow_block_priv *priv = cb_priv;
+	struct flow_cls_offload *cls = type_data;
+	struct mtk_wed_hw *hw = priv->hw;
+
+	if (!tc_can_offload(priv->dev))
+		return -EOPNOTSUPP;
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	return mtk_flow_offload_cmd(hw->eth, cls, hw->index);
+}
+
+static int
+mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev,
+		       struct flow_block_offload *f)
+{
+	struct mtk_wed_flow_block_priv *priv;
+	static LIST_HEAD(block_cb_list);
+	struct flow_block_cb *block_cb;
+	struct mtk_eth *eth = hw->eth;
+	flow_setup_cb_t *cb;
+
+	if (!eth->soc->offload_version)
+		return -EOPNOTSUPP;
+
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	cb = mtk_wed_setup_tc_block_cb;
+	f->driver_block_list = &block_cb_list;
+
+	switch (f->command) {
+	case FLOW_BLOCK_BIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (block_cb) {
+			flow_block_cb_incref(block_cb);
+			return 0;
+		}
+
+		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+		if (!priv)
+			return -ENOMEM;
+
+		priv->hw = hw;
+		priv->dev = dev;
+		block_cb = flow_block_cb_alloc(cb, dev, priv, NULL);
+		if (IS_ERR(block_cb)) {
+			kfree(priv);
+			return PTR_ERR(block_cb);
+		}
+
+		flow_block_cb_incref(block_cb);
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &block_cb_list);
+		return 0;
+	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		if (!flow_block_cb_decref(block_cb)) {
+			flow_block_cb_remove(block_cb, f);
+			list_del(&block_cb->driver_list);
+			kfree(block_cb->cb_priv);
+		}
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+mtk_wed_setup_tc(struct mtk_wed_device *wed, struct net_device *dev,
+		 enum tc_setup_type type, void *type_data)
+{
+	struct mtk_wed_hw *hw = wed->hw;
+
+	if (hw->version < 2)
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+	case TC_SETUP_FT:
+		return mtk_wed_setup_tc_block(hw, dev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		    void __iomem *wdma, phys_addr_t wdma_phy,
 		    int index)
@@ -1764,6 +1864,7 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		.irq_set_mask = mtk_wed_irq_set_mask,
 		.detach = mtk_wed_detach,
 		.ppe_check = mtk_wed_ppe_check,
+		.setup_tc = mtk_wed_setup_tc,
 	};
 	struct device_node *eth_np = eth->dev->of_node;
 	struct platform_device *pdev;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index fd0b0605cf90..b2b28180dff7 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -6,6 +6,7 @@
 #include <linux/regmap.h>
 #include <linux/pci.h>
 #include <linux/skbuff.h>
+#include <linux/netdevice.h>
 
 #define MTK_WED_TX_QUEUES		2
 #define MTK_WED_RX_QUEUES		2
@@ -179,6 +180,8 @@ struct mtk_wed_ops {
 
 	u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask);
 	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
+	int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev,
+			enum tc_setup_type type, void *type_data);
 };
 
 extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
@@ -237,6 +240,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 	(_dev)->ops->msg_update(_dev, _id, _msg, _len)
 #define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev)
 #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev)
+#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \
+	(_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data)
 #else
 static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 {
@@ -255,6 +260,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV
 #define mtk_wed_device_stop(_dev) do {} while (0)
 #define mtk_wed_device_dma_reset(_dev) do {} while (0)
+#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP
 #endif
 
 #endif
-- 
cgit v1.2.3


From 4598380f9c548aa161eb4e990a1583f0a7d1e0d7 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 6 Apr 2023 16:23:50 +0800
Subject: bonding: fix ns validation on backup slaves

When arp_validate is set to 2, 3, or 6, validation is performed for
backup slaves as well. As stated in the bond documentation, validation
involves checking the broadcast ARP request sent out via the active
slave. This helps determine which slaves are more likely to function in
the event of an active slave failure.

However, when the target is an IPv6 address, the NS message sent from
the active interface is not checked on backup slaves. Additionally,
based on the bond_arp_rcv() rule b, we must reverse the saddr and daddr
when checking the NS message.

Note that when checking the NS message, the destination address is a
multicast address. Therefore, we must convert the target address to
solicited multicast in the bond_get_targets_ip6() function.

Prior to the fix, the backup slaves had a mii status of "down", but
after the fix, all of the slaves' mii status was updated to "UP".

Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets")
Reviewed-by: Jonathan Toppins <jtoppins@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 5 +++--
 include/net/bonding.h           | 8 ++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 236e5219c811..8cc9a74789b7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3269,7 +3269,8 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
 
 	combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined);
 	if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP ||
-	    combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)
+	    (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
+	     combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
 		goto out;
 
 	saddr = &combined->ip6.saddr;
@@ -3291,7 +3292,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
 	else if (curr_active_slave &&
 		 time_after(slave_last_rx(bond, curr_active_slave),
 			    curr_active_slave->last_link_up))
-		bond_validate_na(bond, slave, saddr, daddr);
+		bond_validate_na(bond, slave, daddr, saddr);
 	else if (curr_arp_slave &&
 		 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
 		bond_validate_na(bond, slave, saddr, daddr);
diff --git a/include/net/bonding.h b/include/net/bonding.h
index ea36ab7f9e72..c3843239517d 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -761,13 +761,17 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 #if IS_ENABLED(CONFIG_IPV6)
 static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip)
 {
+	struct in6_addr mcaddr;
 	int i;
 
-	for (i = 0; i < BOND_MAX_NS_TARGETS; i++)
-		if (ipv6_addr_equal(&targets[i], ip))
+	for (i = 0; i < BOND_MAX_NS_TARGETS; i++) {
+		addrconf_addr_solict_mult(&targets[i], &mcaddr);
+		if ((ipv6_addr_equal(&targets[i], ip)) ||
+		    (ipv6_addr_equal(&mcaddr, ip)))
 			return i;
 		else if (ipv6_addr_any(&targets[i]))
 			break;
+	}
 
 	return -1;
 }
-- 
cgit v1.2.3


From ded2d383b1b441f380552897791ffb85a1377f08 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 30 Mar 2023 18:45:26 +0800
Subject: thermal/core: Remove thermal_bind_params structure

Remove struct thermal_bind_params because no one is using it for thermal
binding now.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20230330104526.3196-1-rui.zhang@intel.com
---
 Documentation/driver-api/thermal/sysfs-api.rst |  40 --------
 drivers/thermal/thermal_core.c                 | 129 +++----------------------
 include/linux/thermal.h                        |  38 --------
 3 files changed, 11 insertions(+), 196 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst
index 2e0f79a9e2ee..6c1175c6afba 100644
--- a/Documentation/driver-api/thermal/sysfs-api.rst
+++ b/Documentation/driver-api/thermal/sysfs-api.rst
@@ -304,42 +304,6 @@ temperature) and throttle appropriate devices.
 1.4 Thermal Zone Parameters
 ---------------------------
 
-    ::
-
-	struct thermal_bind_params
-
-    This structure defines the following parameters that are used to bind
-    a zone with a cooling device for a particular trip point.
-
-    .cdev:
-	     The cooling device pointer
-    .weight:
-	     The 'influence' of a particular cooling device on this
-	     zone. This is relative to the rest of the cooling
-	     devices. For example, if all cooling devices have a
-	     weight of 1, then they all contribute the same. You can
-	     use percentages if you want, but it's not mandatory. A
-	     weight of 0 means that this cooling device doesn't
-	     contribute to the cooling of this zone unless all cooling
-	     devices have a weight of 0. If all weights are 0, then
-	     they all contribute the same.
-    .trip_mask:
-	       This is a bit mask that gives the binding relation between
-	       this thermal zone and cdev, for a particular trip point.
-	       If nth bit is set, then the cdev and thermal zone are bound
-	       for trip point n.
-    .binding_limits:
-		     This is an array of cooling state limits. Must have
-		     exactly 2 * thermal_zone.number_of_trip_points. It is an
-		     array consisting of tuples <lower-state upper-state> of
-		     state limits. Each trip will be associated with one state
-		     limit tuple when binding. A NULL pointer means
-		     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
-		     These limits are used when binding a cdev to a trip point.
-    .match:
-	    This call back returns success(0) if the 'tz and cdev' need to
-	    be bound, as per platform data.
-
     ::
 
 	struct thermal_zone_params
@@ -357,10 +321,6 @@ temperature) and throttle appropriate devices.
 	       will be created. when no_hwmon == true, nothing will be done.
 	       In case the thermal_zone_params is NULL, the hwmon interface
 	       will be created (for backward compatibility).
-    .num_tbps:
-	       Number of thermal_bind_params entries for this zone
-    .tbp:
-	       thermal_bind_params entries
 
 2. sysfs attributes structure
 =============================
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index d02470a4bdb1..237430f1b565 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -794,62 +794,16 @@ void print_bind_err_msg(struct thermal_zone_device *tz,
 		tz->type, cdev->type, ret);
 }
 
-static void __bind(struct thermal_zone_device *tz, int mask,
-		   struct thermal_cooling_device *cdev,
-		   unsigned long *limits,
-		   unsigned int weight)
-{
-	int i, ret;
-
-	for (i = 0; i < tz->num_trips; i++) {
-		if (mask & (1 << i)) {
-			unsigned long upper, lower;
-
-			upper = THERMAL_NO_LIMIT;
-			lower = THERMAL_NO_LIMIT;
-			if (limits) {
-				lower = limits[i * 2];
-				upper = limits[i * 2 + 1];
-			}
-			ret = thermal_zone_bind_cooling_device(tz, i, cdev,
-							       upper, lower,
-							       weight);
-			if (ret)
-				print_bind_err_msg(tz, cdev, ret);
-		}
-	}
-}
-
 static void bind_cdev(struct thermal_cooling_device *cdev)
 {
-	int i, ret;
-	const struct thermal_zone_params *tzp;
+	int ret;
 	struct thermal_zone_device *pos = NULL;
 
 	list_for_each_entry(pos, &thermal_tz_list, node) {
-		if (!pos->tzp && !pos->ops->bind)
-			continue;
-
 		if (pos->ops->bind) {
 			ret = pos->ops->bind(pos, cdev);
 			if (ret)
 				print_bind_err_msg(pos, cdev, ret);
-			continue;
-		}
-
-		tzp = pos->tzp;
-		if (!tzp || !tzp->tbp)
-			continue;
-
-		for (i = 0; i < tzp->num_tbps; i++) {
-			if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
-				continue;
-			if (tzp->tbp[i].match(pos, cdev))
-				continue;
-			tzp->tbp[i].cdev = cdev;
-			__bind(pos, tzp->tbp[i].trip_mask, cdev,
-			       tzp->tbp[i].binding_limits,
-			       tzp->tbp[i].weight);
 		}
 	}
 }
@@ -1134,16 +1088,6 @@ unlock_list:
 }
 EXPORT_SYMBOL_GPL(thermal_cooling_device_update);
 
-static void __unbind(struct thermal_zone_device *tz, int mask,
-		     struct thermal_cooling_device *cdev)
-{
-	int i;
-
-	for (i = 0; i < tz->num_trips; i++)
-		if (mask & (1 << i))
-			thermal_zone_unbind_cooling_device(tz, i, cdev);
-}
-
 /**
  * thermal_cooling_device_unregister - removes a thermal cooling device
  * @cdev:	the thermal cooling device to remove.
@@ -1153,8 +1097,6 @@ static void __unbind(struct thermal_zone_device *tz, int mask,
  */
 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 {
-	int i;
-	const struct thermal_zone_params *tzp;
 	struct thermal_zone_device *tz;
 
 	if (!cdev)
@@ -1171,21 +1113,8 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 
 	/* Unbind all thermal zones associated with 'this' cdev */
 	list_for_each_entry(tz, &thermal_tz_list, node) {
-		if (tz->ops->unbind) {
+		if (tz->ops->unbind)
 			tz->ops->unbind(tz, cdev);
-			continue;
-		}
-
-		if (!tz->tzp || !tz->tzp->tbp)
-			continue;
-
-		tzp = tz->tzp;
-		for (i = 0; i < tzp->num_tbps; i++) {
-			if (tzp->tbp[i].cdev == cdev) {
-				__unbind(tz, tzp->tbp[i].trip_mask, cdev);
-				tzp->tbp[i].cdev = NULL;
-			}
-		}
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -1196,41 +1125,20 @@ EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
 
 static void bind_tz(struct thermal_zone_device *tz)
 {
-	int i, ret;
+	int ret;
 	struct thermal_cooling_device *pos = NULL;
-	const struct thermal_zone_params *tzp = tz->tzp;
 
-	if (!tzp && !tz->ops->bind)
+	if (!tz->ops->bind)
 		return;
 
 	mutex_lock(&thermal_list_lock);
 
-	/* If there is ops->bind, try to use ops->bind */
-	if (tz->ops->bind) {
-		list_for_each_entry(pos, &thermal_cdev_list, node) {
-			ret = tz->ops->bind(tz, pos);
-			if (ret)
-				print_bind_err_msg(tz, pos, ret);
-		}
-		goto exit;
-	}
-
-	if (!tzp || !tzp->tbp)
-		goto exit;
-
 	list_for_each_entry(pos, &thermal_cdev_list, node) {
-		for (i = 0; i < tzp->num_tbps; i++) {
-			if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
-				continue;
-			if (tzp->tbp[i].match(tz, pos))
-				continue;
-			tzp->tbp[i].cdev = pos;
-			__bind(tz, tzp->tbp[i].trip_mask, pos,
-			       tzp->tbp[i].binding_limits,
-			       tzp->tbp[i].weight);
-		}
+		ret = tz->ops->bind(tz, pos);
+		if (ret)
+			print_bind_err_msg(tz, pos, ret);
 	}
-exit:
+
 	mutex_unlock(&thermal_list_lock);
 }
 
@@ -1487,15 +1395,13 @@ EXPORT_SYMBOL_GPL(thermal_zone_device_id);
  */
 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 {
-	int i, tz_id;
-	const struct thermal_zone_params *tzp;
+	int tz_id;
 	struct thermal_cooling_device *cdev;
 	struct thermal_zone_device *pos = NULL;
 
 	if (!tz)
 		return;
 
-	tzp = tz->tzp;
 	tz_id = tz->id;
 
 	mutex_lock(&thermal_list_lock);
@@ -1510,22 +1416,9 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 	list_del(&tz->node);
 
 	/* Unbind all cdevs associated with 'this' thermal zone */
-	list_for_each_entry(cdev, &thermal_cdev_list, node) {
-		if (tz->ops->unbind) {
+	list_for_each_entry(cdev, &thermal_cdev_list, node)
+		if (tz->ops->unbind)
 			tz->ops->unbind(tz, cdev);
-			continue;
-		}
-
-		if (!tzp || !tzp->tbp)
-			break;
-
-		for (i = 0; i < tzp->num_tbps; i++) {
-			if (tzp->tbp[i].cdev == cdev) {
-				__unbind(tz, tzp->tbp[i].trip_mask, cdev);
-				tzp->tbp[i].cdev = NULL;
-			}
-		}
-	}
 
 	mutex_unlock(&thermal_list_lock);
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index fef625f799ae..ab7460bfdcf6 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -207,41 +207,6 @@ struct thermal_governor {
 	struct list_head	governor_list;
 };
 
-/* Structure that holds binding parameters for a zone */
-struct thermal_bind_params {
-	struct thermal_cooling_device *cdev;
-
-	/*
-	 * This is a measure of 'how effectively these devices can
-	 * cool 'this' thermal zone. It shall be determined by
-	 * platform characterization. This value is relative to the
-	 * rest of the weights so a cooling device whose weight is
-	 * double that of another cooling device is twice as
-	 * effective. See Documentation/driver-api/thermal/sysfs-api.rst for more
-	 * information.
-	 */
-	int weight;
-
-	/*
-	 * This is a bit mask that gives the binding relation between this
-	 * thermal zone and cdev, for a particular trip point.
-	 * See Documentation/driver-api/thermal/sysfs-api.rst for more information.
-	 */
-	int trip_mask;
-
-	/*
-	 * This is an array of cooling state limits. Must have exactly
-	 * 2 * thermal_zone.number_of_trip_points. It is an array consisting
-	 * of tuples <lower-state upper-state> of state limits. Each trip
-	 * will be associated with one state limit tuple when binding.
-	 * A NULL pointer means <THERMAL_NO_LIMITS THERMAL_NO_LIMITS>
-	 * on all trips.
-	 */
-	unsigned long *binding_limits;
-	int (*match) (struct thermal_zone_device *tz,
-			struct thermal_cooling_device *cdev);
-};
-
 /* Structure to define Thermal Zone parameters */
 struct thermal_zone_params {
 	char governor_name[THERMAL_NAME_LENGTH];
@@ -253,9 +218,6 @@ struct thermal_zone_params {
 	 */
 	bool no_hwmon;
 
-	int num_tbps;	/* Number of tbp entries */
-	struct thermal_bind_params *tbp;
-
 	/*
 	 * Sustainable power (heat) that this thermal zone can dissipate in
 	 * mW
-- 
cgit v1.2.3


From 05aaa7fdb0736262e224369b9b9f1410320fc71b Mon Sep 17 00:00:00 2001
From: Balsam CHIHI <bchihi@baylibre.com>
Date: Tue, 7 Mar 2023 16:45:21 +0100
Subject: dt-bindings: thermal: mediatek: Add AP domain to LVTS thermal
 controllers for mt8195

Add AP Domain to LVTS thermal controllers dt-binding definition for mt8195.

Signed-off-by: Balsam CHIHI <bchihi@baylibre.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20230307154524.118541-2-bchihi@baylibre.com
---
 include/dt-bindings/thermal/mediatek,lvts-thermal.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/thermal/mediatek,lvts-thermal.h b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
index c09398920468..8fa5a46675c4 100644
--- a/include/dt-bindings/thermal/mediatek,lvts-thermal.h
+++ b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
@@ -16,4 +16,14 @@
 #define MT8195_MCU_LITTLE_CPU2  6
 #define MT8195_MCU_LITTLE_CPU3  7
 
+#define MT8195_AP_VPU0  8
+#define MT8195_AP_VPU1  9
+#define MT8195_AP_GPU0  10
+#define MT8195_AP_GPU1  11
+#define MT8195_AP_VDEC  12
+#define MT8195_AP_IMG   13
+#define MT8195_AP_INFRA 14
+#define MT8195_AP_CAM0  15
+#define MT8195_AP_CAM1  16
+
 #endif /* __MEDIATEK_LVTS_DT_H */
-- 
cgit v1.2.3


From a038895e25b296ca1ef0254f92673ea64bc1a2ee Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 4 Apr 2023 05:09:10 +0530
Subject: cpufreq: drivers with target_index() must set freq_table

Since the cpufreq core directly uses freq_table, for cpufreq drivers
that set their target_index() callback, make it mandatory for them to
set the same.

Since this is set per policy and normally from policy->init(), do this
from cpufreq_table_validate_and_sort() which gets called right after
->init().

Reported-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c    | 5 +++++
 drivers/cpufreq/freq_table.c | 7 ++++++-
 include/linux/cpufreq.h      | 1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d61f7308f63c..b24ed81ba8bd 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -73,6 +73,11 @@ static inline bool has_target(void)
 	return cpufreq_driver->target_index || cpufreq_driver->target;
 }
 
+bool has_target_index(void)
+{
+	return !!cpufreq_driver->target_index;
+}
+
 /* internal prototypes */
 static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
 static int cpufreq_init_governor(struct cpufreq_policy *policy);
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index 67e56cf638ef..ddd4832bcc0d 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -355,8 +355,13 @@ int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy)
 {
 	int ret;
 
-	if (!policy->freq_table)
+	if (!policy->freq_table) {
+		/* Freq table must be passed by drivers with target_index() */
+		if (has_target_index())
+			return -EINVAL;
+
 		return 0;
+	}
 
 	ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table);
 	if (ret)
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 65623233ab2f..541013487a0e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -237,6 +237,7 @@ bool cpufreq_supports_freq_invariance(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
+bool has_target_index(void);
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
-- 
cgit v1.2.3


From d8cc9415a40f479b666fc03e7b1f4601868e6dc8 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Thu, 6 Apr 2023 22:29:56 +0200
Subject: hwmon: constify pointers to hwmon_channel_info

HWmon core receives an array of pointers to hwmon_channel_info and it
does not modify it, thus it can be array of const pointers for safety.
This allows drivers to make them also const.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/hwmon/hwmon-kernel-api.rst | 6 +++---
 drivers/accel/habanalabs/common/hwmon.c  | 2 +-
 drivers/hwmon/hwmon.c                    | 4 ++--
 include/linux/hwmon.h                    | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst
index 5451a6d4c874..f00e4a01c240 100644
--- a/Documentation/hwmon/hwmon-kernel-api.rst
+++ b/Documentation/hwmon/hwmon-kernel-api.rst
@@ -133,7 +133,7 @@ The hwmon_chip_info structure looks as follows::
 
 	struct hwmon_chip_info {
 		const struct hwmon_ops *ops;
-		const struct hwmon_channel_info **info;
+		const struct hwmon_channel_info * const *info;
 	};
 
 It contains the following fields:
@@ -229,7 +229,7 @@ register (HWMON_T_MAX) as well as a maximum temperature hysteresis register
 		.config = lm75_temp_config,
 	};
 
-	static const struct hwmon_channel_info *lm75_info[] = {
+	static const struct hwmon_channel_info * const lm75_info[] = {
 		&lm75_chip,
 		&lm75_temp,
 		NULL
@@ -238,7 +238,7 @@ register (HWMON_T_MAX) as well as a maximum temperature hysteresis register
 	The HWMON_CHANNEL_INFO() macro can and should be used when possible.
 	With this macro, the above example can be simplified to
 
-	static const struct hwmon_channel_info *lm75_info[] = {
+	static const struct hwmon_channel_info * const lm75_info[] = {
 		HWMON_CHANNEL_INFO(chip,
 				HWMON_C_REGISTER_TZ | HWMON_C_UPDATE_INTERVAL),
 		HWMON_CHANNEL_INFO(temp,
diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c
index 55eb0203817f..8598056216e7 100644
--- a/drivers/accel/habanalabs/common/hwmon.c
+++ b/drivers/accel/habanalabs/common/hwmon.c
@@ -914,7 +914,7 @@ void hl_hwmon_fini(struct hl_device *hdev)
 
 void hl_hwmon_release_resources(struct hl_device *hdev)
 {
-	const struct hwmon_channel_info **channel_info_arr;
+	const struct hwmon_channel_info * const *channel_info_arr;
 	int i = 0;
 
 	if (!hdev->hl_chip_info->info)
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index d193ed3cb35e..c7ebef1990c8 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -174,7 +174,7 @@ static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int
 	struct hwmon_thermal_data *tdata = tz->devdata;
 	struct hwmon_device *hwdev = to_hwmon_device(tdata->dev);
 	const struct hwmon_chip_info *chip = hwdev->chip;
-	const struct hwmon_channel_info **info = chip->info;
+	const struct hwmon_channel_info * const *info = chip->info;
 	unsigned int i;
 	int err;
 
@@ -253,7 +253,7 @@ static int hwmon_thermal_register_sensors(struct device *dev)
 {
 	struct hwmon_device *hwdev = to_hwmon_device(dev);
 	const struct hwmon_chip_info *chip = hwdev->chip;
-	const struct hwmon_channel_info **info = chip->info;
+	const struct hwmon_channel_info * const *info = chip->info;
 	void *drvdata = dev_get_drvdata(dev);
 	int i;
 
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index c1b62384b6ee..492dd27a5dd8 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -430,7 +430,7 @@ struct hwmon_channel_info {
  */
 struct hwmon_chip_info {
 	const struct hwmon_ops *ops;
-	const struct hwmon_channel_info **info;
+	const struct hwmon_channel_info * const *info;
 };
 
 /* hwmon_device_register() is deprecated */
-- 
cgit v1.2.3


From 4a670ac3e75e517c96cbd01ef870dbd598c3ce71 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Fri, 7 Apr 2023 17:26:04 +0200
Subject: regmap: allow upshifting register addresses before performing
 operations

Similar to the existing reg_downshift mechanism, that is used to
translate register addresses on busses that have a smaller address
stride, it's also possible to want to upshift register addresses.

Such a case was encountered when network PHYs and PCS that usually sit
on a MDIO bus (16-bits register with a stride of 1) are integrated
directly as memory-mapped devices. Here, the same register layout
defined in 802.3 is used, but the register now have a larger stride.

Introduce a mechanism to also allow upshifting register addresses.
Re-purpose reg_downshift into a more generic, signed reg_shift, whose
sign indicates the direction of the shift. To avoid confusion, also
introduce macros to explicitly indicate if we want to downshift or
upshift.

For bisectability, change any use of reg_downshift to use reg_shift.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/20230407152604.105467-1-maxime.chevallier@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h |  2 +-
 drivers/base/regmap/regmap.c   | 10 ++++++++--
 drivers/mfd/ocelot-spi.c       |  2 +-
 include/linux/regmap.h         | 15 ++++++++++++---
 4 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 6361df6f553a..9bd0dfd1e259 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -31,8 +31,8 @@ struct regmap_format {
 	size_t buf_size;
 	size_t reg_bytes;
 	size_t pad_bytes;
-	size_t reg_downshift;
 	size_t val_bytes;
+	s8 reg_shift;
 	void (*format_write)(struct regmap *map,
 			     unsigned int reg, unsigned int val);
 	void (*format_reg)(void *buf, unsigned int reg, unsigned int shift);
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 473b65b102db..db7851f0e3b8 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -814,7 +814,7 @@ struct regmap *__regmap_init(struct device *dev,
 
 	map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
 	map->format.pad_bytes = config->pad_bits / 8;
-	map->format.reg_downshift = config->reg_downshift;
+	map->format.reg_shift = config->reg_shift;
 	map->format.val_bytes = DIV_ROUND_UP(config->val_bits, 8);
 	map->format.buf_size = DIV_ROUND_UP(config->reg_bits +
 			config->val_bits + config->pad_bits, 8);
@@ -1679,7 +1679,13 @@ static void regmap_set_work_buf_flag_mask(struct regmap *map, int max_bytes,
 static unsigned int regmap_reg_addr(struct regmap *map, unsigned int reg)
 {
 	reg += map->reg_base;
-	return reg >> map->format.reg_downshift;
+
+	if (map->format.reg_shift > 0)
+		reg >>= map->format.reg_shift;
+	else if (map->format.reg_shift < 0)
+		reg <<= -(map->format.reg_shift);
+
+	return reg;
 }
 
 static int _regmap_raw_write_impl(struct regmap *map, unsigned int reg,
diff --git a/drivers/mfd/ocelot-spi.c b/drivers/mfd/ocelot-spi.c
index 2ecd271de2fb..2d1349a10ca9 100644
--- a/drivers/mfd/ocelot-spi.c
+++ b/drivers/mfd/ocelot-spi.c
@@ -125,7 +125,7 @@ static int ocelot_spi_initialize(struct device *dev)
 static const struct regmap_config ocelot_spi_regmap_config = {
 	.reg_bits = 24,
 	.reg_stride = 4,
-	.reg_downshift = 2,
+	.reg_shift = REGMAP_DOWNSHIFT(2),
 	.val_bits = 32,
 
 	.write_flag_mask = 0x80,
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 8743b177399e..c2b9cc5db824 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -46,6 +46,14 @@ struct sdw_slave;
 #define REGMAP_MDIO_C45_DEVAD_MASK	GENMASK(20, 16)
 #define REGMAP_MDIO_C45_REGNUM_MASK	GENMASK(15, 0)
 
+/*
+ * regmap.reg_shift indicates by how much we must shift registers prior to
+ * performing any operation. It's a signed value, positive numbers means
+ * downshifting the register's address, while negative numbers means upshifting.
+ */
+#define REGMAP_UPSHIFT(s)	(-(s))
+#define REGMAP_DOWNSHIFT(s)	(s)
+
 /* An enum of all the supported cache types */
 enum regcache_type {
 	REGCACHE_NONE,
@@ -246,8 +254,9 @@ typedef void (*regmap_unlock)(void *);
  * @reg_stride: The register address stride. Valid register addresses are a
  *              multiple of this value. If set to 0, a value of 1 will be
  *              used.
- * @reg_downshift: The number of bits to downshift the register before
- *		   performing any operations.
+ * @reg_shift: The number of bits to shift the register before performing any
+ *	       operations. Any positive number will be downshifted, and negative
+ *	       values will be upshifted
  * @reg_base: Value to be added to every register address before performing any
  *	      operation.
  * @pad_bits: Number of bits of padding between register and value.
@@ -381,7 +390,7 @@ struct regmap_config {
 
 	int reg_bits;
 	int reg_stride;
-	int reg_downshift;
+	int reg_shift;
 	unsigned int reg_base;
 	int pad_bits;
 	int val_bits;
-- 
cgit v1.2.3


From ac614a9b4c35bf766dec40d73cbbc7f37c1734f7 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 4 Apr 2023 09:51:36 +0200
Subject: thermal/of: Unexport unused OF functions

The functions thermal_of_zone_register() and
thermal_of_zone_unregister() are no longer needed from the drivers as
the devm_ variant is always used.

Make them static in the C file and remove their declaration from thermal.h

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20230404075138.2914680-2-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_of.c |  8 +++-----
 include/linux/thermal.h      | 17 -----------------
 2 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index ff4d12ef51bc..6fb14e521197 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -439,7 +439,7 @@ static int thermal_of_unbind(struct thermal_zone_device *tz,
  *
  * @tz: a pointer to the thermal zone structure
  */
-void thermal_of_zone_unregister(struct thermal_zone_device *tz)
+static void thermal_of_zone_unregister(struct thermal_zone_device *tz)
 {
 	struct thermal_trip *trips = tz->trips;
 	struct thermal_zone_params *tzp = tz->tzp;
@@ -451,7 +451,6 @@ void thermal_of_zone_unregister(struct thermal_zone_device *tz)
 	kfree(tzp);
 	kfree(ops);
 }
-EXPORT_SYMBOL_GPL(thermal_of_zone_unregister);
 
 /**
  * thermal_of_zone_register - Register a thermal zone with device node
@@ -473,8 +472,8 @@ EXPORT_SYMBOL_GPL(thermal_of_zone_unregister);
  *	- ENOMEM: if one structure can not be allocated
  *	- Other negative errors are returned by the underlying called functions
  */
-struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
-						     const struct thermal_zone_device_ops *ops)
+static struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
+							    const struct thermal_zone_device_ops *ops)
 {
 	struct thermal_zone_device *tz;
 	struct thermal_trip *trips;
@@ -550,7 +549,6 @@ out_kfree_of_ops:
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(thermal_of_zone_register);
 
 static void devm_thermal_of_zone_release(struct device *dev, void *res)
 {
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ab7460bfdcf6..82ddb32f9876 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -259,25 +259,12 @@ struct thermal_zone_params {
 
 /* Function declarations */
 #ifdef CONFIG_THERMAL_OF
-struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
-						     const struct thermal_zone_device_ops *ops);
-
 struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data,
 							  const struct thermal_zone_device_ops *ops);
 
-void thermal_of_zone_unregister(struct thermal_zone_device *tz);
-
 void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz);
 
-void thermal_of_zone_unregister(struct thermal_zone_device *tz);
-
 #else
-static inline
-struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
-						     const struct thermal_zone_device_ops *ops)
-{
-	return ERR_PTR(-ENOTSUPP);
-}
 
 static inline
 struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data,
@@ -286,10 +273,6 @@ struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, in
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz)
-{
-}
-
 static inline void devm_thermal_of_zone_unregister(struct device *dev,
 						   struct thermal_zone_device *tz)
 {
-- 
cgit v1.2.3


From b065b23d3c3bc91f7e54f9bff4294a7bfbd2afb6 Mon Sep 17 00:00:00 2001
From: Devi Priya <quic_devipriy@quicinc.com>
Date: Thu, 16 Mar 2023 12:59:35 +0530
Subject: dt-bindings: clock: Add ipq9574 clock and reset definitions

Add clock and reset ID definitions for ipq9574

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Co-developed-by: Anusha Rao <quic_anusha@quicinc.com>
Signed-off-by: Anusha Rao <quic_anusha@quicinc.com>
Signed-off-by: Devi Priya <quic_devipriy@quicinc.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230316072940.29137-2-quic_devipriy@quicinc.com
---
 .../bindings/clock/qcom,ipq9574-gcc.yaml           |  61 ++++++
 include/dt-bindings/clock/qcom,ipq9574-gcc.h       | 213 +++++++++++++++++++++
 include/dt-bindings/reset/qcom,ipq9574-gcc.h       | 164 ++++++++++++++++
 3 files changed, 438 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,ipq9574-gcc.h
 create mode 100644 include/dt-bindings/reset/qcom,ipq9574-gcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml
new file mode 100644
index 000000000000..afc68eb9d7cc
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,ipq9574-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ9574
+
+maintainers:
+  - Anusha Rao <quic_anusha@quicinc.com>
+
+description: |
+  Qualcomm global clock control module provides the clocks, resets and power
+  domains on IPQ9574
+
+  See also::
+    include/dt-bindings/clock/qcom,ipq9574-gcc.h
+    include/dt-bindings/reset/qcom,ipq9574-gcc.h
+
+properties:
+  compatible:
+    const: qcom,ipq9574-gcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Sleep clock source
+      - description: Bias PLL ubi clock source
+      - description: PCIE30 PHY0 pipe clock source
+      - description: PCIE30 PHY1 pipe clock source
+      - description: PCIE30 PHY2 pipe clock source
+      - description: PCIE30 PHY3 pipe clock source
+      - description: USB3 PHY pipe clock source
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    clock-controller@1800000 {
+      compatible = "qcom,ipq9574-gcc";
+      reg = <0x01800000 0x80000>;
+      clocks = <&xo_board_clk>,
+               <&sleep_clk>,
+               <&bias_pll_ubi_nc_clk>,
+               <&pcie30_phy0_pipe_clk>,
+               <&pcie30_phy1_pipe_clk>,
+               <&pcie30_phy2_pipe_clk>,
+               <&pcie30_phy3_pipe_clk>,
+               <&usb3phy_0_cc_pipe_clk>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,ipq9574-gcc.h b/include/dt-bindings/clock/qcom,ipq9574-gcc.h
new file mode 100644
index 000000000000..5a2961bfe893
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,ipq9574-gcc.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2018-2023 The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLOCK_IPQ_GCC_9574_H
+#define _DT_BINDINGS_CLOCK_IPQ_GCC_9574_H
+
+#define GPLL0_MAIN					0
+#define GPLL0						1
+#define GPLL2_MAIN					2
+#define GPLL2						3
+#define GPLL4_MAIN					4
+#define GPLL4						5
+#define GCC_SLEEP_CLK_SRC				6
+#define APSS_AHB_CLK_SRC				7
+#define APSS_AXI_CLK_SRC				8
+#define BLSP1_QUP1_I2C_APPS_CLK_SRC			9
+#define BLSP1_QUP1_SPI_APPS_CLK_SRC			10
+#define BLSP1_QUP2_I2C_APPS_CLK_SRC			11
+#define BLSP1_QUP2_SPI_APPS_CLK_SRC			12
+#define BLSP1_QUP3_I2C_APPS_CLK_SRC			13
+#define BLSP1_QUP3_SPI_APPS_CLK_SRC			14
+#define BLSP1_QUP4_I2C_APPS_CLK_SRC			15
+#define BLSP1_QUP4_SPI_APPS_CLK_SRC			16
+#define BLSP1_QUP5_I2C_APPS_CLK_SRC			17
+#define BLSP1_QUP5_SPI_APPS_CLK_SRC			18
+#define BLSP1_QUP6_I2C_APPS_CLK_SRC			19
+#define BLSP1_QUP6_SPI_APPS_CLK_SRC			20
+#define BLSP1_UART1_APPS_CLK_SRC			21
+#define BLSP1_UART2_APPS_CLK_SRC			22
+#define BLSP1_UART3_APPS_CLK_SRC			23
+#define BLSP1_UART4_APPS_CLK_SRC			24
+#define BLSP1_UART5_APPS_CLK_SRC			25
+#define BLSP1_UART6_APPS_CLK_SRC			26
+#define GCC_APSS_AHB_CLK				27
+#define GCC_APSS_AXI_CLK				28
+#define GCC_BLSP1_QUP1_I2C_APPS_CLK			29
+#define GCC_BLSP1_QUP1_SPI_APPS_CLK			30
+#define GCC_BLSP1_QUP2_I2C_APPS_CLK			31
+#define GCC_BLSP1_QUP2_SPI_APPS_CLK			32
+#define GCC_BLSP1_QUP3_I2C_APPS_CLK			33
+#define GCC_BLSP1_QUP3_SPI_APPS_CLK			34
+#define GCC_BLSP1_QUP4_I2C_APPS_CLK			35
+#define GCC_BLSP1_QUP4_SPI_APPS_CLK			36
+#define GCC_BLSP1_QUP5_I2C_APPS_CLK			37
+#define GCC_BLSP1_QUP5_SPI_APPS_CLK			38
+#define GCC_BLSP1_QUP6_I2C_APPS_CLK			39
+#define GCC_BLSP1_QUP6_SPI_APPS_CLK			40
+#define GCC_BLSP1_UART1_APPS_CLK			41
+#define GCC_BLSP1_UART2_APPS_CLK			42
+#define GCC_BLSP1_UART3_APPS_CLK			43
+#define GCC_BLSP1_UART4_APPS_CLK			44
+#define GCC_BLSP1_UART5_APPS_CLK			45
+#define GCC_BLSP1_UART6_APPS_CLK			46
+#define PCIE0_AXI_M_CLK_SRC				47
+#define GCC_PCIE0_AXI_M_CLK				48
+#define PCIE1_AXI_M_CLK_SRC				49
+#define GCC_PCIE1_AXI_M_CLK				50
+#define PCIE2_AXI_M_CLK_SRC				51
+#define GCC_PCIE2_AXI_M_CLK				52
+#define PCIE3_AXI_M_CLK_SRC				53
+#define GCC_PCIE3_AXI_M_CLK				54
+#define PCIE0_AXI_S_CLK_SRC				55
+#define GCC_PCIE0_AXI_S_BRIDGE_CLK			56
+#define GCC_PCIE0_AXI_S_CLK				57
+#define PCIE1_AXI_S_CLK_SRC				58
+#define GCC_PCIE1_AXI_S_BRIDGE_CLK			59
+#define GCC_PCIE1_AXI_S_CLK				60
+#define PCIE2_AXI_S_CLK_SRC				61
+#define GCC_PCIE2_AXI_S_BRIDGE_CLK			62
+#define GCC_PCIE2_AXI_S_CLK				63
+#define PCIE3_AXI_S_CLK_SRC				64
+#define GCC_PCIE3_AXI_S_BRIDGE_CLK			65
+#define GCC_PCIE3_AXI_S_CLK				66
+#define PCIE0_PIPE_CLK_SRC				67
+#define PCIE1_PIPE_CLK_SRC				68
+#define PCIE2_PIPE_CLK_SRC				69
+#define PCIE3_PIPE_CLK_SRC				70
+#define PCIE_AUX_CLK_SRC				71
+#define GCC_PCIE0_AUX_CLK				72
+#define GCC_PCIE1_AUX_CLK				73
+#define GCC_PCIE2_AUX_CLK				74
+#define GCC_PCIE3_AUX_CLK				75
+#define PCIE0_RCHNG_CLK_SRC				76
+#define GCC_PCIE0_RCHNG_CLK				77
+#define PCIE1_RCHNG_CLK_SRC				78
+#define GCC_PCIE1_RCHNG_CLK				79
+#define PCIE2_RCHNG_CLK_SRC				80
+#define GCC_PCIE2_RCHNG_CLK				81
+#define PCIE3_RCHNG_CLK_SRC				82
+#define GCC_PCIE3_RCHNG_CLK				83
+#define GCC_PCIE0_AHB_CLK				84
+#define GCC_PCIE1_AHB_CLK				85
+#define GCC_PCIE2_AHB_CLK				86
+#define GCC_PCIE3_AHB_CLK				87
+#define USB0_AUX_CLK_SRC				88
+#define GCC_USB0_AUX_CLK				89
+#define USB0_MASTER_CLK_SRC				90
+#define GCC_USB0_MASTER_CLK				91
+#define GCC_SNOC_USB_CLK				92
+#define GCC_ANOC_USB_AXI_CLK				93
+#define USB0_MOCK_UTMI_CLK_SRC				94
+#define USB0_MOCK_UTMI_DIV_CLK_SRC			95
+#define GCC_USB0_MOCK_UTMI_CLK				96
+#define USB0_PIPE_CLK_SRC				97
+#define GCC_USB0_PHY_CFG_AHB_CLK			98
+#define SDCC1_APPS_CLK_SRC				99
+#define GCC_SDCC1_APPS_CLK				100
+#define SDCC1_ICE_CORE_CLK_SRC				101
+#define GCC_SDCC1_ICE_CORE_CLK				102
+#define GCC_SDCC1_AHB_CLK				103
+#define PCNOC_BFDCD_CLK_SRC				104
+#define GCC_NSSCFG_CLK					105
+#define GCC_NSSNOC_NSSCC_CLK				106
+#define GCC_NSSCC_CLK					107
+#define GCC_NSSNOC_PCNOC_1_CLK				108
+#define GCC_QDSS_DAP_AHB_CLK				109
+#define GCC_QDSS_CFG_AHB_CLK				110
+#define GCC_QPIC_AHB_CLK				111
+#define GCC_QPIC_CLK					112
+#define GCC_BLSP1_AHB_CLK				113
+#define GCC_MDIO_AHB_CLK				114
+#define GCC_PRNG_AHB_CLK				115
+#define GCC_UNIPHY0_AHB_CLK				116
+#define GCC_UNIPHY1_AHB_CLK				117
+#define GCC_UNIPHY2_AHB_CLK				118
+#define GCC_CMN_12GPLL_AHB_CLK				119
+#define GCC_CMN_12GPLL_APU_CLK				120
+#define SYSTEM_NOC_BFDCD_CLK_SRC			121
+#define GCC_NSSNOC_SNOC_CLK				122
+#define GCC_NSSNOC_SNOC_1_CLK				123
+#define GCC_QDSS_ETR_USB_CLK				124
+#define WCSS_AHB_CLK_SRC				125
+#define GCC_Q6_AHB_CLK					126
+#define GCC_Q6_AHB_S_CLK				127
+#define GCC_WCSS_ECAHB_CLK				128
+#define GCC_WCSS_ACMT_CLK				129
+#define GCC_SYS_NOC_WCSS_AHB_CLK			130
+#define WCSS_AXI_M_CLK_SRC				131
+#define GCC_ANOC_WCSS_AXI_M_CLK				132
+#define QDSS_AT_CLK_SRC					133
+#define GCC_Q6SS_ATBM_CLK				134
+#define GCC_WCSS_DBG_IFC_ATB_CLK			135
+#define GCC_NSSNOC_ATB_CLK				136
+#define GCC_QDSS_AT_CLK					137
+#define GCC_SYS_NOC_AT_CLK				138
+#define GCC_PCNOC_AT_CLK				139
+#define GCC_USB0_EUD_AT_CLK				140
+#define GCC_QDSS_EUD_AT_CLK				141
+#define QDSS_STM_CLK_SRC				142
+#define GCC_QDSS_STM_CLK				143
+#define GCC_SYS_NOC_QDSS_STM_AXI_CLK			144
+#define QDSS_TRACECLKIN_CLK_SRC				145
+#define GCC_QDSS_TRACECLKIN_CLK				146
+#define QDSS_TSCTR_CLK_SRC				147
+#define GCC_Q6_TSCTR_1TO2_CLK				148
+#define GCC_WCSS_DBG_IFC_NTS_CLK			149
+#define GCC_QDSS_TSCTR_DIV2_CLK				150
+#define GCC_QDSS_TS_CLK					151
+#define GCC_QDSS_TSCTR_DIV4_CLK				152
+#define GCC_NSS_TS_CLK					153
+#define GCC_QDSS_TSCTR_DIV8_CLK				154
+#define GCC_QDSS_TSCTR_DIV16_CLK			155
+#define GCC_Q6SS_PCLKDBG_CLK				156
+#define GCC_Q6SS_TRIG_CLK				157
+#define GCC_WCSS_DBG_IFC_APB_CLK			158
+#define GCC_WCSS_DBG_IFC_DAPBUS_CLK			159
+#define GCC_QDSS_DAP_CLK				160
+#define GCC_QDSS_APB2JTAG_CLK				161
+#define GCC_QDSS_TSCTR_DIV3_CLK				162
+#define QPIC_IO_MACRO_CLK_SRC				163
+#define GCC_QPIC_IO_MACRO_CLK                           164
+#define Q6_AXI_CLK_SRC					165
+#define GCC_Q6_AXIM_CLK					166
+#define GCC_WCSS_Q6_TBU_CLK				167
+#define GCC_MEM_NOC_Q6_AXI_CLK				168
+#define Q6_AXIM2_CLK_SRC				169
+#define NSSNOC_MEMNOC_BFDCD_CLK_SRC			170
+#define GCC_NSSNOC_MEMNOC_CLK				171
+#define GCC_NSSNOC_MEM_NOC_1_CLK			172
+#define GCC_NSS_TBU_CLK					173
+#define GCC_MEM_NOC_NSSNOC_CLK				174
+#define LPASS_AXIM_CLK_SRC				175
+#define LPASS_SWAY_CLK_SRC				176
+#define ADSS_PWM_CLK_SRC				177
+#define GCC_ADSS_PWM_CLK				178
+#define GP1_CLK_SRC					179
+#define GP2_CLK_SRC					180
+#define GP3_CLK_SRC					181
+#define DDRSS_SMS_SLOW_CLK_SRC				182
+#define GCC_XO_CLK_SRC					183
+#define GCC_XO_CLK					184
+#define GCC_NSSNOC_QOSGEN_REF_CLK			185
+#define GCC_NSSNOC_TIMEOUT_REF_CLK			186
+#define GCC_XO_DIV4_CLK					187
+#define GCC_UNIPHY0_SYS_CLK				188
+#define GCC_UNIPHY1_SYS_CLK				189
+#define GCC_UNIPHY2_SYS_CLK				190
+#define GCC_CMN_12GPLL_SYS_CLK				191
+#define GCC_NSSNOC_XO_DCD_CLK				192
+#define GCC_Q6SS_BOOT_CLK				193
+#define UNIPHY_SYS_CLK_SRC				194
+#define NSS_TS_CLK_SRC					195
+#define GCC_ANOC_PCIE0_1LANE_M_CLK			196
+#define GCC_ANOC_PCIE1_1LANE_M_CLK			197
+#define GCC_ANOC_PCIE2_2LANE_M_CLK			198
+#define GCC_ANOC_PCIE3_2LANE_M_CLK			199
+#define GCC_SNOC_PCIE0_1LANE_S_CLK			200
+#define GCC_SNOC_PCIE1_1LANE_S_CLK			201
+#define GCC_SNOC_PCIE2_2LANE_S_CLK			202
+#define GCC_SNOC_PCIE3_2LANE_S_CLK			203
+#endif
diff --git a/include/dt-bindings/reset/qcom,ipq9574-gcc.h b/include/dt-bindings/reset/qcom,ipq9574-gcc.h
new file mode 100644
index 000000000000..d01dc6a24cf1
--- /dev/null
+++ b/include/dt-bindings/reset/qcom,ipq9574-gcc.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2018-2023, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_RESET_IPQ_GCC_9574_H
+#define _DT_BINDINGS_RESET_IPQ_GCC_9574_H
+
+#define GCC_ADSS_BCR						0
+#define GCC_APC0_VOLTAGE_DROOP_DETECTOR_BCR			1
+#define GCC_BLSP1_BCR						2
+#define GCC_BLSP1_QUP1_BCR					3
+#define GCC_BLSP1_QUP2_BCR					4
+#define GCC_BLSP1_QUP3_BCR					5
+#define GCC_BLSP1_QUP4_BCR					6
+#define GCC_BLSP1_QUP5_BCR					7
+#define GCC_BLSP1_QUP6_BCR					8
+#define GCC_BLSP1_UART1_BCR					9
+#define GCC_BLSP1_UART2_BCR					10
+#define GCC_BLSP1_UART3_BCR					11
+#define GCC_BLSP1_UART4_BCR					12
+#define GCC_BLSP1_UART5_BCR					13
+#define GCC_BLSP1_UART6_BCR					14
+#define GCC_BOOT_ROM_BCR					15
+#define GCC_MDIO_BCR						16
+#define GCC_NSS_BCR						17
+#define GCC_NSS_TBU_BCR						18
+#define GCC_PCIE0_BCR						19
+#define GCC_PCIE0_LINK_DOWN_BCR					20
+#define GCC_PCIE0_PHY_BCR					21
+#define GCC_PCIE0PHY_PHY_BCR					22
+#define GCC_PCIE1_BCR						23
+#define GCC_PCIE1_LINK_DOWN_BCR					24
+#define GCC_PCIE1_PHY_BCR					25
+#define GCC_PCIE1PHY_PHY_BCR					26
+#define GCC_PCIE2_BCR						27
+#define GCC_PCIE2_LINK_DOWN_BCR					28
+#define GCC_PCIE2_PHY_BCR					29
+#define GCC_PCIE2PHY_PHY_BCR					30
+#define GCC_PCIE3_BCR						31
+#define GCC_PCIE3_LINK_DOWN_BCR					32
+#define GCC_PCIE3_PHY_BCR					33
+#define GCC_PCIE3PHY_PHY_BCR					34
+#define GCC_PRNG_BCR						35
+#define GCC_QUSB2_0_PHY_BCR					36
+#define GCC_SDCC_BCR						37
+#define GCC_TLMM_BCR						38
+#define GCC_UNIPHY0_BCR						39
+#define GCC_UNIPHY1_BCR						40
+#define GCC_UNIPHY2_BCR						41
+#define GCC_USB0_PHY_BCR					42
+#define GCC_USB3PHY_0_PHY_BCR					43
+#define GCC_USB_BCR						44
+#define GCC_ANOC0_TBU_BCR					45
+#define GCC_ANOC1_TBU_BCR					46
+#define GCC_ANOC_BCR						47
+#define GCC_APSS_TCU_BCR					48
+#define GCC_CMN_BLK_BCR						49
+#define GCC_CMN_BLK_AHB_ARES					50
+#define GCC_CMN_BLK_SYS_ARES					51
+#define GCC_CMN_BLK_APU_ARES					52
+#define GCC_DCC_BCR						53
+#define GCC_DDRSS_BCR						54
+#define GCC_IMEM_BCR						55
+#define GCC_LPASS_BCR						56
+#define GCC_MPM_BCR						57
+#define GCC_MSG_RAM_BCR						58
+#define GCC_NSSNOC_MEMNOC_1_ARES				59
+#define GCC_NSSNOC_PCNOC_1_ARES					60
+#define GCC_NSSNOC_SNOC_1_ARES					61
+#define GCC_NSSNOC_XO_DCD_ARES					62
+#define GCC_NSSNOC_TS_ARES					63
+#define GCC_NSSCC_ARES						64
+#define GCC_NSSNOC_NSSCC_ARES					65
+#define GCC_NSSNOC_ATB_ARES					66
+#define GCC_NSSNOC_MEMNOC_ARES					67
+#define GCC_NSSNOC_QOSGEN_REF_ARES				68
+#define GCC_NSSNOC_SNOC_ARES					69
+#define GCC_NSSNOC_TIMEOUT_REF_ARES				70
+#define GCC_NSS_CFG_ARES					71
+#define GCC_UBI0_DBG_ARES					72
+#define GCC_PCIE0_AHB_ARES					73
+#define GCC_PCIE0_AUX_ARES					74
+#define GCC_PCIE0_AXI_M_ARES					75
+#define GCC_PCIE0_AXI_M_STICKY_ARES				76
+#define GCC_PCIE0_AXI_S_ARES					77
+#define GCC_PCIE0_AXI_S_STICKY_ARES				78
+#define GCC_PCIE0_CORE_STICKY_ARES				79
+#define GCC_PCIE0_PIPE_ARES					80
+#define GCC_PCIE1_AHB_ARES					81
+#define GCC_PCIE1_AUX_ARES					82
+#define GCC_PCIE1_AXI_M_ARES					83
+#define GCC_PCIE1_AXI_M_STICKY_ARES				84
+#define GCC_PCIE1_AXI_S_ARES					85
+#define GCC_PCIE1_AXI_S_STICKY_ARES				86
+#define GCC_PCIE1_CORE_STICKY_ARES				87
+#define GCC_PCIE1_PIPE_ARES					88
+#define GCC_PCIE2_AHB_ARES					89
+#define GCC_PCIE2_AUX_ARES					90
+#define GCC_PCIE2_AXI_M_ARES					91
+#define GCC_PCIE2_AXI_M_STICKY_ARES				92
+#define GCC_PCIE2_AXI_S_ARES					93
+#define GCC_PCIE2_AXI_S_STICKY_ARES				94
+#define GCC_PCIE2_CORE_STICKY_ARES				95
+#define GCC_PCIE2_PIPE_ARES					96
+#define GCC_PCIE3_AHB_ARES					97
+#define GCC_PCIE3_AUX_ARES					98
+#define GCC_PCIE3_AXI_M_ARES					99
+#define GCC_PCIE3_AXI_M_STICKY_ARES				100
+#define GCC_PCIE3_AXI_S_ARES					101
+#define GCC_PCIE3_AXI_S_STICKY_ARES				102
+#define GCC_PCIE3_CORE_STICKY_ARES				103
+#define GCC_PCIE3_PIPE_ARES					104
+#define GCC_PCNOC_BCR						105
+#define GCC_PCNOC_BUS_TIMEOUT0_BCR				106
+#define GCC_PCNOC_BUS_TIMEOUT1_BCR				107
+#define GCC_PCNOC_BUS_TIMEOUT2_BCR				108
+#define GCC_PCNOC_BUS_TIMEOUT3_BCR				109
+#define GCC_PCNOC_BUS_TIMEOUT4_BCR				110
+#define GCC_PCNOC_BUS_TIMEOUT5_BCR				111
+#define GCC_PCNOC_BUS_TIMEOUT6_BCR				112
+#define GCC_PCNOC_BUS_TIMEOUT7_BCR				113
+#define GCC_PCNOC_BUS_TIMEOUT8_BCR				114
+#define GCC_PCNOC_BUS_TIMEOUT9_BCR				115
+#define GCC_PCNOC_TBU_BCR					116
+#define GCC_Q6SS_DBG_ARES					117
+#define GCC_Q6_AHB_ARES						118
+#define GCC_Q6_AHB_S_ARES					119
+#define GCC_Q6_AXIM2_ARES					120
+#define GCC_Q6_AXIM_ARES					121
+#define GCC_QDSS_BCR						122
+#define GCC_QPIC_BCR						123
+#define GCC_QPIC_AHB_ARES					124
+#define GCC_QPIC_ARES						125
+#define GCC_RBCPR_BCR						126
+#define GCC_RBCPR_MX_BCR					127
+#define GCC_SEC_CTRL_BCR					128
+#define GCC_SMMU_CFG_BCR					129
+#define GCC_SNOC_BCR						130
+#define GCC_SPDM_BCR						131
+#define GCC_TME_BCR						132
+#define GCC_UNIPHY0_SYS_RESET					133
+#define GCC_UNIPHY0_AHB_RESET					134
+#define GCC_UNIPHY0_XPCS_RESET					135
+#define GCC_UNIPHY1_SYS_RESET					136
+#define GCC_UNIPHY1_AHB_RESET					137
+#define GCC_UNIPHY1_XPCS_RESET					138
+#define GCC_UNIPHY2_SYS_RESET					139
+#define GCC_UNIPHY2_AHB_RESET					140
+#define GCC_UNIPHY2_XPCS_RESET					141
+#define GCC_USB_MISC_RESET					142
+#define GCC_WCSSAON_RESET					143
+#define GCC_WCSS_ACMT_ARES					144
+#define GCC_WCSS_AHB_S_ARES					145
+#define GCC_WCSS_AXI_M_ARES					146
+#define GCC_WCSS_BCR						147
+#define GCC_WCSS_DBG_ARES					148
+#define GCC_WCSS_DBG_BDG_ARES					149
+#define GCC_WCSS_ECAHB_ARES					150
+#define GCC_WCSS_Q6_BCR						151
+#define GCC_WCSS_Q6_TBU_BCR					152
+#define GCC_TCSR_BCR						153
+
+#endif
-- 
cgit v1.2.3


From 77c7a41e052502f0d8827c63eddae6dd5d717b20 Mon Sep 17 00:00:00 2001
From: Dylan Van Assche <me@dylanvanassche.be>
Date: Thu, 6 Apr 2023 19:31:45 +0200
Subject: dt-bindings: firmware: qcom: scm: add SSC_Q6 and ADSP_Q6 VMIDs

SSC_Q6 and ADSP_Q6 are used in the FastRPC driver for accessing
the secure world.

Signed-off-by: Dylan Van Assche <me@dylanvanassche.be>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230406173148.28309-3-me@dylanvanassche.be
---
 include/dt-bindings/firmware/qcom,scm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/firmware/qcom,scm.h b/include/dt-bindings/firmware/qcom,scm.h
index 1a4e68fa0744..d1dc09e72923 100644
--- a/include/dt-bindings/firmware/qcom,scm.h
+++ b/include/dt-bindings/firmware/qcom,scm.h
@@ -8,6 +8,8 @@
 #define _DT_BINDINGS_FIRMWARE_QCOM_SCM_H
 
 #define QCOM_SCM_VMID_HLOS		0x3
+#define QCOM_SCM_VMID_SSC_Q6		0x5
+#define QCOM_SCM_VMID_ADSP_Q6		0x6
 #define QCOM_SCM_VMID_MSS_MSA		0xF
 #define QCOM_SCM_VMID_WLAN		0x18
 #define QCOM_SCM_VMID_WLAN_CE		0x19
-- 
cgit v1.2.3


From 2afbf43a4aec6e31dac7835e65d52c867f2be400 Mon Sep 17 00:00:00 2001
From: Abel Vesa <abel.vesa@linaro.org>
Date: Fri, 7 Apr 2023 13:50:26 +0300
Subject: soc: qcom: Make the Qualcomm UFS/SDCC ICE a dedicated driver

This takes the already existing duplicated support in both ufs-qcom
and sdhci-msm drivers and makes it a dedicated driver that can be used
by both mentioned drivers.

The reason for this is because, staring with SM8550, the ICE IP block
is shared between UFS and SDCC, which means we need to probe a dedicated
device and share it between those two consumers.

So let's add the ICE dedicated driver as a soc driver.

Platforms that already have ICE supported, will use it as a library
as the of_qcom_ice_get will return an ICE instance created for the
consumer device. This allows the backwards compatibility with old-style
devicetree approach.

Also, add support to HW version 4.x since it works out-of-the-box with
the current driver. The 4.x HW version is found on SM8550 platform.

Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230407105029.2274111-4-abel.vesa@linaro.org
---
 drivers/soc/qcom/Kconfig  |   4 +
 drivers/soc/qcom/Makefile |   1 +
 drivers/soc/qcom/ice.c    | 366 ++++++++++++++++++++++++++++++++++++++++++++++
 include/soc/qcom/ice.h    |  37 +++++
 4 files changed, 408 insertions(+)
 create mode 100644 drivers/soc/qcom/ice.c
 create mode 100644 include/soc/qcom/ice.h

(limited to 'include')

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index a8f283086a21..64a977e75ec4 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -275,4 +275,8 @@ config QCOM_ICC_BWMON
 	  the fixed bandwidth votes from cpufreq (CPU nodes) thus achieve high
 	  memory throughput even with lower CPU frequencies.
 
+config QCOM_INLINE_CRYPTO_ENGINE
+	tristate
+	select QCOM_SCM
+
 endmenu
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index 6e88da899f60..0f43a88b4894 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_QCOM_RPMHPD) += rpmhpd.o
 obj-$(CONFIG_QCOM_RPMPD) += rpmpd.o
 obj-$(CONFIG_QCOM_KRYO_L2_ACCESSORS) +=	kryo-l2-accessors.o
 obj-$(CONFIG_QCOM_ICC_BWMON)	+= icc-bwmon.o
+obj-$(CONFIG_QCOM_INLINE_CRYPTO_ENGINE)	+= ice.o
diff --git a/drivers/soc/qcom/ice.c b/drivers/soc/qcom/ice.c
new file mode 100644
index 000000000000..a6123ea96272
--- /dev/null
+++ b/drivers/soc/qcom/ice.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Qualcomm ICE (Inline Crypto Engine) support.
+ *
+ * Copyright (c) 2013-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2019, Google LLC
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/iopoll.h>
+#include <linux/of_platform.h>
+
+#include <linux/firmware/qcom/qcom_scm.h>
+
+#include <soc/qcom/ice.h>
+
+#define AES_256_XTS_KEY_SIZE			64
+
+/* QCOM ICE registers */
+#define QCOM_ICE_REG_VERSION			0x0008
+#define QCOM_ICE_REG_FUSE_SETTING		0x0010
+#define QCOM_ICE_REG_BIST_STATUS		0x0070
+#define QCOM_ICE_REG_ADVANCED_CONTROL		0x1000
+
+/* BIST ("built-in self-test") status flags */
+#define QCOM_ICE_BIST_STATUS_MASK		GENMASK(31, 28)
+
+#define QCOM_ICE_FUSE_SETTING_MASK		0x1
+#define QCOM_ICE_FORCE_HW_KEY0_SETTING_MASK	0x2
+#define QCOM_ICE_FORCE_HW_KEY1_SETTING_MASK	0x4
+
+#define qcom_ice_writel(engine, val, reg)	\
+	writel((val), (engine)->base + (reg))
+
+#define qcom_ice_readl(engine, reg)	\
+	readl((engine)->base + (reg))
+
+struct qcom_ice {
+	struct device *dev;
+	void __iomem *base;
+	struct device_link *link;
+
+	struct clk *core_clk;
+};
+
+static bool qcom_ice_check_supported(struct qcom_ice *ice)
+{
+	u32 regval = qcom_ice_readl(ice, QCOM_ICE_REG_VERSION);
+	struct device *dev = ice->dev;
+	int major = FIELD_GET(GENMASK(31, 24), regval);
+	int minor = FIELD_GET(GENMASK(23, 16), regval);
+	int step = FIELD_GET(GENMASK(15, 0), regval);
+
+	/* For now this driver only supports ICE version 3 and 4. */
+	if (major != 3 && major != 4) {
+		dev_warn(dev, "Unsupported ICE version: v%d.%d.%d\n",
+			 major, minor, step);
+		return false;
+	}
+
+	dev_info(dev, "Found QC Inline Crypto Engine (ICE) v%d.%d.%d\n",
+		 major, minor, step);
+
+	/* If fuses are blown, ICE might not work in the standard way. */
+	regval = qcom_ice_readl(ice, QCOM_ICE_REG_FUSE_SETTING);
+	if (regval & (QCOM_ICE_FUSE_SETTING_MASK |
+		      QCOM_ICE_FORCE_HW_KEY0_SETTING_MASK |
+		      QCOM_ICE_FORCE_HW_KEY1_SETTING_MASK)) {
+		dev_warn(dev, "Fuses are blown; ICE is unusable!\n");
+		return false;
+	}
+
+	return true;
+}
+
+static void qcom_ice_low_power_mode_enable(struct qcom_ice *ice)
+{
+	u32 regval;
+
+	regval = qcom_ice_readl(ice, QCOM_ICE_REG_ADVANCED_CONTROL);
+
+	/* Enable low power mode sequence */
+	regval |= 0x7000;
+	qcom_ice_writel(ice, regval, QCOM_ICE_REG_ADVANCED_CONTROL);
+}
+
+static void qcom_ice_optimization_enable(struct qcom_ice *ice)
+{
+	u32 regval;
+
+	/* ICE Optimizations Enable Sequence */
+	regval = qcom_ice_readl(ice, QCOM_ICE_REG_ADVANCED_CONTROL);
+	regval |= 0xd807100;
+	/* ICE HPG requires delay before writing */
+	udelay(5);
+	qcom_ice_writel(ice, regval, QCOM_ICE_REG_ADVANCED_CONTROL);
+	udelay(5);
+}
+
+/*
+ * Wait until the ICE BIST (built-in self-test) has completed.
+ *
+ * This may be necessary before ICE can be used.
+ * Note that we don't really care whether the BIST passed or failed;
+ * we really just want to make sure that it isn't still running. This is
+ * because (a) the BIST is a FIPS compliance thing that never fails in
+ * practice, (b) ICE is documented to reject crypto requests if the BIST
+ * fails, so we needn't do it in software too, and (c) properly testing
+ * storage encryption requires testing the full storage stack anyway,
+ * and not relying on hardware-level self-tests.
+ */
+static int qcom_ice_wait_bist_status(struct qcom_ice *ice)
+{
+	u32 regval;
+	int err;
+
+	err = readl_poll_timeout(ice->base + QCOM_ICE_REG_BIST_STATUS,
+				 regval, !(regval & QCOM_ICE_BIST_STATUS_MASK),
+				 50, 5000);
+	if (err)
+		dev_err(ice->dev, "Timed out waiting for ICE self-test to complete\n");
+
+	return err;
+}
+
+int qcom_ice_enable(struct qcom_ice *ice)
+{
+	qcom_ice_low_power_mode_enable(ice);
+	qcom_ice_optimization_enable(ice);
+
+	return qcom_ice_wait_bist_status(ice);
+}
+EXPORT_SYMBOL_GPL(qcom_ice_enable);
+
+int qcom_ice_resume(struct qcom_ice *ice)
+{
+	struct device *dev = ice->dev;
+	int err;
+
+	err = clk_prepare_enable(ice->core_clk);
+	if (err) {
+		dev_err(dev, "failed to enable core clock (%d)\n",
+			err);
+		return err;
+	}
+
+	return qcom_ice_wait_bist_status(ice);
+}
+EXPORT_SYMBOL_GPL(qcom_ice_resume);
+
+int qcom_ice_suspend(struct qcom_ice *ice)
+{
+	clk_disable_unprepare(ice->core_clk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qcom_ice_suspend);
+
+int qcom_ice_program_key(struct qcom_ice *ice,
+			 u8 algorithm_id, u8 key_size,
+			 const u8 crypto_key[], u8 data_unit_size,
+			 int slot)
+{
+	struct device *dev = ice->dev;
+	union {
+		u8 bytes[AES_256_XTS_KEY_SIZE];
+		u32 words[AES_256_XTS_KEY_SIZE / sizeof(u32)];
+	} key;
+	int i;
+	int err;
+
+	/* Only AES-256-XTS has been tested so far. */
+	if (algorithm_id != QCOM_ICE_CRYPTO_ALG_AES_XTS ||
+	    key_size != QCOM_ICE_CRYPTO_KEY_SIZE_256) {
+		dev_err_ratelimited(dev,
+				    "Unhandled crypto capability; algorithm_id=%d, key_size=%d\n",
+				    algorithm_id, key_size);
+		return -EINVAL;
+	}
+
+	memcpy(key.bytes, crypto_key, AES_256_XTS_KEY_SIZE);
+
+	/* The SCM call requires that the key words are encoded in big endian */
+	for (i = 0; i < ARRAY_SIZE(key.words); i++)
+		__cpu_to_be32s(&key.words[i]);
+
+	err = qcom_scm_ice_set_key(slot, key.bytes, AES_256_XTS_KEY_SIZE,
+				   QCOM_SCM_ICE_CIPHER_AES_256_XTS,
+				   data_unit_size);
+
+	memzero_explicit(&key, sizeof(key));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(qcom_ice_program_key);
+
+int qcom_ice_evict_key(struct qcom_ice *ice, int slot)
+{
+	return qcom_scm_ice_invalidate_key(slot);
+}
+EXPORT_SYMBOL_GPL(qcom_ice_evict_key);
+
+static struct qcom_ice *qcom_ice_create(struct device *dev,
+					void __iomem *base)
+{
+	struct qcom_ice *engine;
+
+	if (!qcom_scm_is_available())
+		return ERR_PTR(-EPROBE_DEFER);
+
+	if (!qcom_scm_ice_available()) {
+		dev_warn(dev, "ICE SCM interface not found\n");
+		return NULL;
+	}
+
+	engine = devm_kzalloc(dev, sizeof(*engine), GFP_KERNEL);
+	if (!engine)
+		return ERR_PTR(-ENOMEM);
+
+	engine->dev = dev;
+	engine->base = base;
+
+	/*
+	 * Legacy DT binding uses different clk names for each consumer,
+	 * so lets try those first. If none of those are a match, it means
+	 * the we only have one clock and it is part of the dedicated DT node.
+	 * Also, enable the clock before we check what HW version the driver
+	 * supports.
+	 */
+	engine->core_clk = devm_clk_get_optional_enabled(dev, "ice_core_clk");
+	if (!engine->core_clk)
+		engine->core_clk = devm_clk_get_optional_enabled(dev, "ice");
+	if (!engine->core_clk)
+		engine->core_clk = devm_clk_get_enabled(dev, NULL);
+	if (IS_ERR(engine->core_clk))
+		return ERR_CAST(engine->core_clk);
+
+	if (!qcom_ice_check_supported(engine))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	dev_dbg(dev, "Registered Qualcomm Inline Crypto Engine\n");
+
+	return engine;
+}
+
+/**
+ * of_qcom_ice_get() - get an ICE instance from a DT node
+ * @dev: device pointer for the consumer device
+ *
+ * This function will provide an ICE instance either by creating one for the
+ * consumer device if its DT node provides the 'ice' reg range and the 'ice'
+ * clock (for legacy DT style). On the other hand, if consumer provides a
+ * phandle via 'qcom,ice' property to an ICE DT, the ICE instance will already
+ * be created and so this function will return that instead.
+ *
+ * Return: ICE pointer on success, NULL if there is no ICE data provided by the
+ * consumer or ERR_PTR() on error.
+ */
+struct qcom_ice *of_qcom_ice_get(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct qcom_ice *ice;
+	struct device_node *node;
+	struct resource *res;
+	void __iomem *base;
+
+	if (!dev || !dev->of_node)
+		return ERR_PTR(-ENODEV);
+
+	/*
+	 * In order to support legacy style devicetree bindings, we need
+	 * to create the ICE instance using the consumer device and the reg
+	 * range called 'ice' it provides.
+	 */
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ice");
+	if (res) {
+		base = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(base))
+			return ERR_CAST(base);
+
+		/* create ICE instance using consumer dev */
+		return qcom_ice_create(&pdev->dev, base);
+	}
+
+	/*
+	 * If the consumer node does not provider an 'ice' reg range
+	 * (legacy DT binding), then it must at least provide a phandle
+	 * to the ICE devicetree node, otherwise ICE is not supported.
+	 */
+	node = of_parse_phandle(dev->of_node, "qcom,ice", 0);
+	if (!node)
+		return NULL;
+
+	pdev = of_find_device_by_node(node);
+	if (!pdev) {
+		dev_err(dev, "Cannot find device node %s\n", node->name);
+		ice = ERR_PTR(-EPROBE_DEFER);
+		goto out;
+	}
+
+	ice = platform_get_drvdata(pdev);
+	if (!ice) {
+		dev_err(dev, "Cannot get ice instance from %s\n",
+			dev_name(&pdev->dev));
+		platform_device_put(pdev);
+		ice = ERR_PTR(-EPROBE_DEFER);
+		goto out;
+	}
+
+	ice->link = device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER);
+	if (!ice->link) {
+		dev_err(&pdev->dev,
+			"Failed to create device link to consumer %s\n",
+			dev_name(dev));
+		platform_device_put(pdev);
+		ice = ERR_PTR(-EINVAL);
+	}
+
+out:
+	of_node_put(node);
+
+	return ice;
+}
+EXPORT_SYMBOL_GPL(of_qcom_ice_get);
+
+static int qcom_ice_probe(struct platform_device *pdev)
+{
+	struct qcom_ice *engine;
+	void __iomem *base;
+
+	base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(base)) {
+		dev_warn(&pdev->dev, "ICE registers not found\n");
+		return PTR_ERR(base);
+	}
+
+	engine = qcom_ice_create(&pdev->dev, base);
+	if (IS_ERR(engine))
+		return PTR_ERR(engine);
+
+	platform_set_drvdata(pdev, engine);
+
+	return 0;
+}
+
+static const struct of_device_id qcom_ice_of_match_table[] = {
+	{ .compatible = "qcom,inline-crypto-engine" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, qcom_ice_of_match_table);
+
+static struct platform_driver qcom_ice_driver = {
+	.probe	= qcom_ice_probe,
+	.driver = {
+		.name = "qcom-ice",
+		.of_match_table = qcom_ice_of_match_table,
+	},
+};
+
+module_platform_driver(qcom_ice_driver);
+
+MODULE_DESCRIPTION("Qualcomm Inline Crypto Engine driver");
+MODULE_LICENSE("GPL");
diff --git a/include/soc/qcom/ice.h b/include/soc/qcom/ice.h
new file mode 100644
index 000000000000..5870a94599a2
--- /dev/null
+++ b/include/soc/qcom/ice.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#ifndef __QCOM_ICE_H__
+#define __QCOM_ICE_H__
+
+#include <linux/types.h>
+
+struct qcom_ice;
+
+enum qcom_ice_crypto_key_size {
+	QCOM_ICE_CRYPTO_KEY_SIZE_INVALID	= 0x0,
+	QCOM_ICE_CRYPTO_KEY_SIZE_128		= 0x1,
+	QCOM_ICE_CRYPTO_KEY_SIZE_192		= 0x2,
+	QCOM_ICE_CRYPTO_KEY_SIZE_256		= 0x3,
+	QCOM_ICE_CRYPTO_KEY_SIZE_512		= 0x4,
+};
+
+enum qcom_ice_crypto_alg {
+	QCOM_ICE_CRYPTO_ALG_AES_XTS		= 0x0,
+	QCOM_ICE_CRYPTO_ALG_BITLOCKER_AES_CBC	= 0x1,
+	QCOM_ICE_CRYPTO_ALG_AES_ECB		= 0x2,
+	QCOM_ICE_CRYPTO_ALG_ESSIV_AES_CBC	= 0x3,
+};
+
+int qcom_ice_enable(struct qcom_ice *ice);
+int qcom_ice_resume(struct qcom_ice *ice);
+int qcom_ice_suspend(struct qcom_ice *ice);
+int qcom_ice_program_key(struct qcom_ice *ice,
+			 u8 algorithm_id, u8 key_size,
+			 const u8 crypto_key[], u8 data_unit_size,
+			 int slot);
+int qcom_ice_evict_key(struct qcom_ice *ice, int slot);
+struct qcom_ice *of_qcom_ice_get(struct device *dev);
+#endif /* __QCOM_ICE_H__ */
-- 
cgit v1.2.3


From dee234032e767b3d6823fe122517770757306f04 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 15 Mar 2023 13:02:18 +0000
Subject: irqchip/gic: Drop support for board files

With the last non-OF, non-ACPI user of the GIC being removed in
e73307b9ebc4 ("ARM: cns3xxx: remove entire platform"), we can finally
drop the entry point and do some minor cleanup.

We also make the driver depend on CONFIG_OF, which is required
even when CONFIG_ACPI is selected.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230315130218.3212033-1-maz@kernel.org
---
 drivers/irqchip/Kconfig         |  1 +
 drivers/irqchip/irq-gic.c       | 60 +++--------------------------------------
 include/linux/irqchip/arm-gic.h |  6 -----
 3 files changed, 4 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 7dc990eb2c9b..b744fd905c92 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -7,6 +7,7 @@ config IRQCHIP
 
 config ARM_GIC
 	bool
+	depends on OF
 	select IRQ_DOMAIN_HIERARCHY
 	select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
 
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 95e3d2a71db6..412196a7dad5 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1081,10 +1081,6 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static void gic_irq_domain_unmap(struct irq_domain *d, unsigned int irq)
-{
-}
-
 static int gic_irq_domain_translate(struct irq_domain *d,
 				    struct irq_fwspec *fwspec,
 				    unsigned long *hwirq,
@@ -1167,11 +1163,6 @@ static const struct irq_domain_ops gic_irq_domain_hierarchy_ops = {
 	.free = irq_domain_free_irqs_top,
 };
 
-static const struct irq_domain_ops gic_irq_domain_ops = {
-	.map = gic_irq_domain_map,
-	.unmap = gic_irq_domain_unmap,
-};
-
 static int gic_init_bases(struct gic_chip_data *gic,
 			  struct fwnode_handle *handle)
 {
@@ -1219,30 +1210,9 @@ static int gic_init_bases(struct gic_chip_data *gic,
 		gic_irqs = 1020;
 	gic->gic_irqs = gic_irqs;
 
-	if (handle) {		/* DT/ACPI */
-		gic->domain = irq_domain_create_linear(handle, gic_irqs,
-						       &gic_irq_domain_hierarchy_ops,
-						       gic);
-	} else {		/* Legacy support */
-		/*
-		 * For primary GICs, skip over SGIs.
-		 * No secondary GIC support whatsoever.
-		 */
-		int irq_base;
-
-		gic_irqs -= 16; /* calculate # of irqs to allocate */
-
-		irq_base = irq_alloc_descs(16, 16, gic_irqs,
-					   numa_node_id());
-		if (irq_base < 0) {
-			WARN(1, "Cannot allocate irq_descs @ IRQ16, assuming pre-allocated\n");
-			irq_base = 16;
-		}
-
-		gic->domain = irq_domain_add_legacy(NULL, gic_irqs, irq_base,
-						    16, &gic_irq_domain_ops, gic);
-	}
-
+	gic->domain = irq_domain_create_linear(handle, gic_irqs,
+					       &gic_irq_domain_hierarchy_ops,
+					       gic);
 	if (WARN_ON(!gic->domain)) {
 		ret = -ENODEV;
 		goto error;
@@ -1297,23 +1267,6 @@ static int __init __gic_init_bases(struct gic_chip_data *gic,
 	return ret;
 }
 
-void __init gic_init(void __iomem *dist_base, void __iomem *cpu_base)
-{
-	struct gic_chip_data *gic;
-
-	/*
-	 * Non-DT/ACPI systems won't run a hypervisor, so let's not
-	 * bother with these...
-	 */
-	static_branch_disable(&supports_deactivate_key);
-
-	gic = &gic_data[0];
-	gic->raw_dist_base = dist_base;
-	gic->raw_cpu_base = cpu_base;
-
-	__gic_init_bases(gic, NULL);
-}
-
 static void gic_teardown(struct gic_chip_data *gic)
 {
 	if (WARN_ON(!gic))
@@ -1325,7 +1278,6 @@ static void gic_teardown(struct gic_chip_data *gic)
 		iounmap(gic->raw_cpu_base);
 }
 
-#ifdef CONFIG_OF
 static int gic_cnt __initdata;
 static bool gicv2_force_probe;
 
@@ -1570,12 +1522,6 @@ IRQCHIP_DECLARE(cortex_a7_gic, "arm,cortex-a7-gic", gic_of_init);
 IRQCHIP_DECLARE(msm_8660_qgic, "qcom,msm-8660-qgic", gic_of_init);
 IRQCHIP_DECLARE(msm_qgic2, "qcom,msm-qgic2", gic_of_init);
 IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init);
-#else
-int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq)
-{
-	return -ENOTSUPP;
-}
-#endif
 
 #ifdef CONFIG_ACPI
 static struct
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 5686711b0f40..2223f95079ce 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -151,12 +151,6 @@ int gic_of_init(struct device_node *node, struct device_node *parent);
  */
 int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq);
 
-/*
- * Legacy platforms not converted to DT yet must use this to init
- * their GIC
- */
-void gic_init(void __iomem *dist , void __iomem *cpu);
-
 void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
 int gic_get_cpu_id(unsigned int cpu);
 void gic_migrate_target(unsigned int new_cpu_id);
-- 
cgit v1.2.3


From 35727af2b15d98a2dd2811d631d3a3886111312e Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <sdonthineni@nvidia.com>
Date: Sat, 18 Mar 2023 21:43:14 -0500
Subject: irqchip/gicv3: Workaround for NVIDIA erratum T241-FABRIC-4

The T241 platform suffers from the T241-FABRIC-4 erratum which causes
unexpected behavior in the GIC when multiple transactions are received
simultaneously from different sources. This hardware issue impacts
NVIDIA server platforms that use more than two T241 chips
interconnected. Each chip has support for 320 {E}SPIs.

This issue occurs when multiple packets from different GICs are
incorrectly interleaved at the target chip. The erratum text below
specifies exactly what can cause multiple transfer packets susceptible
to interleaving and GIC state corruption. GIC state corruption can
lead to a range of problems, including kernel panics, and unexpected
behavior.

>From the erratum text:
  "In some cases, inter-socket AXI4 Stream packets with multiple
  transfers, may be interleaved by the fabric when presented to ARM
  Generic Interrupt Controller. GIC expects all transfers of a packet
  to be delivered without any interleaving.

  The following GICv3 commands may result in multiple transfer packets
  over inter-socket AXI4 Stream interface:
   - Register reads from GICD_I* and GICD_N*
   - Register writes to 64-bit GICD registers other than GICD_IROUTERn*
   - ITS command MOVALL

  Multiple commands in GICv4+ utilize multiple transfer packets,
  including VMOVP, VMOVI, VMAPP, and 64-bit register accesses."

  This issue impacts system configurations with more than 2 sockets,
  that require multi-transfer packets to be sent over inter-socket
  AXI4 Stream interface between GIC instances on different sockets.
  GICv4 cannot be supported. GICv3 SW model can only be supported
  with the workaround. Single and Dual socket configurations are not
  impacted by this issue and support GICv3 and GICv4."

Link: https://developer.nvidia.com/docs/t241-fabric-4/nvidia-t241-fabric-4-errata.pdf

Writing to the chip alias region of the GICD_In{E} registers except
GICD_ICENABLERn has an equivalent effect as writing to the global
distributor. The SPI interrupt deactivate path is not impacted by
the erratum.

To fix this problem, implement a workaround that ensures read accesses
to the GICD_In{E} registers are directed to the chip that owns the
SPI, and disable GICv4.x features. To simplify code changes, the
gic_configure_irq() function uses the same alias region for both read
and write operations to GICD_ICFGR.

Co-developed-by: Vikram Sethi <vsethi@nvidia.com>
Signed-off-by: Vikram Sethi <vsethi@nvidia.com>
Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com> (for SMCCC/SOC ID bits)
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230319024314.3540573-2-sdonthineni@nvidia.com
---
 Documentation/arm64/silicon-errata.rst |   2 +
 drivers/firmware/smccc/smccc.c         |  26 ++++++++
 drivers/firmware/smccc/soc_id.c        |  28 ++------
 drivers/irqchip/Kconfig                |   1 +
 drivers/irqchip/irq-gic-v3.c           | 115 +++++++++++++++++++++++++++++----
 include/linux/arm-smccc.h              |  18 ++++++
 6 files changed, 156 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index ec5f889d7681..e31f6c068704 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -172,6 +172,8 @@ stable kernels.
 +----------------+-----------------+-----------------+-----------------------------+
 | NVIDIA         | Carmel Core     | N/A             | NVIDIA_CARMEL_CNP_ERRATUM   |
 +----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA         | T241 GICv3/4.x  | T241-FABRIC-4   | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
 +----------------+-----------------+-----------------+-----------------------------+
 | Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
 +----------------+-----------------+-----------------+-----------------------------+
diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
index 60ccf3e90d7d..db818f9dcb8e 100644
--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -17,9 +17,13 @@ static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
 
 bool __ro_after_init smccc_trng_available = false;
 u64 __ro_after_init smccc_has_sve_hint = false;
+s32 __ro_after_init smccc_soc_id_version = SMCCC_RET_NOT_SUPPORTED;
+s32 __ro_after_init smccc_soc_id_revision = SMCCC_RET_NOT_SUPPORTED;
 
 void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
 {
+	struct arm_smccc_res res;
+
 	smccc_version = version;
 	smccc_conduit = conduit;
 
@@ -27,6 +31,18 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    smccc_version >= ARM_SMCCC_VERSION_1_3)
 		smccc_has_sve_hint = true;
+
+	if ((smccc_version >= ARM_SMCCC_VERSION_1_2) &&
+	    (smccc_conduit != SMCCC_CONDUIT_NONE)) {
+		arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+				     ARM_SMCCC_ARCH_SOC_ID, &res);
+		if ((s32)res.a0 >= 0) {
+			arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &res);
+			smccc_soc_id_version = (s32)res.a0;
+			arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 1, &res);
+			smccc_soc_id_revision = (s32)res.a0;
+		}
+	}
 }
 
 enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void)
@@ -44,6 +60,16 @@ u32 arm_smccc_get_version(void)
 }
 EXPORT_SYMBOL_GPL(arm_smccc_get_version);
 
+s32 arm_smccc_get_soc_id_version(void)
+{
+	return smccc_soc_id_version;
+}
+
+s32 arm_smccc_get_soc_id_revision(void)
+{
+	return smccc_soc_id_revision;
+}
+
 static int __init smccc_devices_init(void)
 {
 	struct platform_device *pdev;
diff --git a/drivers/firmware/smccc/soc_id.c b/drivers/firmware/smccc/soc_id.c
index dd7c3d5e8b0b..890eb454599a 100644
--- a/drivers/firmware/smccc/soc_id.c
+++ b/drivers/firmware/smccc/soc_id.c
@@ -42,41 +42,23 @@ static int __init smccc_soc_init(void)
 	if (arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2)
 		return 0;
 
-	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) {
-		pr_err("%s: invalid SMCCC conduit\n", __func__);
-		return -EOPNOTSUPP;
-	}
-
-	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
-			     ARM_SMCCC_ARCH_SOC_ID, &res);
-
-	if ((int)res.a0 == SMCCC_RET_NOT_SUPPORTED) {
+	soc_id_version = arm_smccc_get_soc_id_version();
+	if (soc_id_version == SMCCC_RET_NOT_SUPPORTED) {
 		pr_info("ARCH_SOC_ID not implemented, skipping ....\n");
 		return 0;
 	}
 
-	if ((int)res.a0 < 0) {
-		pr_info("ARCH_FEATURES(ARCH_SOC_ID) returned error: %lx\n",
-			res.a0);
-		return -EINVAL;
-	}
-
-	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &res);
-	if ((int)res.a0 < 0) {
+	if (soc_id_version < 0) {
 		pr_err("ARCH_SOC_ID(0) returned error: %lx\n", res.a0);
 		return -EINVAL;
 	}
 
-	soc_id_version = res.a0;
-
-	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 1, &res);
-	if ((int)res.a0 < 0) {
+	soc_id_rev = arm_smccc_get_soc_id_revision();
+	if (soc_id_rev < 0) {
 		pr_err("ARCH_SOC_ID(1) returned error: %lx\n", res.a0);
 		return -EINVAL;
 	}
 
-	soc_id_rev = res.a0;
-
 	soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL);
 	if (!soc_dev_attr)
 		return -ENOMEM;
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index b744fd905c92..e7bb3554c544 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -36,6 +36,7 @@ config ARM_GIC_V3
 	select IRQ_DOMAIN_HIERARCHY
 	select PARTITION_PERCPU
 	select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
+	select HAVE_ARM_SMCCC_DISCOVERY
 
 config ARM_GIC_V3_ITS
 	bool
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index fd134e1f481a..6fcee221f201 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -24,6 +24,9 @@
 #include <linux/irqchip/arm-gic-common.h>
 #include <linux/irqchip/arm-gic-v3.h>
 #include <linux/irqchip/irq-partition-percpu.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/arm-smccc.h>
 
 #include <asm/cputype.h>
 #include <asm/exception.h>
@@ -47,6 +50,7 @@ struct redist_region {
 
 struct gic_chip_data {
 	struct fwnode_handle	*fwnode;
+	phys_addr_t		dist_phys_base;
 	void __iomem		*dist_base;
 	struct redist_region	*redist_regions;
 	struct rdists		rdists;
@@ -59,6 +63,10 @@ struct gic_chip_data {
 	struct partition_desc	**ppi_descs;
 };
 
+#define T241_CHIPS_MAX		4
+static void __iomem *t241_dist_base_alias[T241_CHIPS_MAX] __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(gic_nvidia_t241_erratum);
+
 static struct gic_chip_data gic_data __read_mostly;
 static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
 
@@ -179,6 +187,39 @@ static inline bool gic_irq_in_rdist(struct irq_data *d)
 	}
 }
 
+static inline void __iomem *gic_dist_base_alias(struct irq_data *d)
+{
+	if (static_branch_unlikely(&gic_nvidia_t241_erratum)) {
+		irq_hw_number_t hwirq = irqd_to_hwirq(d);
+		u32 chip;
+
+		/*
+		 * For the erratum T241-FABRIC-4, read accesses to GICD_In{E}
+		 * registers are directed to the chip that owns the SPI. The
+		 * the alias region can also be used for writes to the
+		 * GICD_In{E} except GICD_ICENABLERn. Each chip has support
+		 * for 320 {E}SPIs. Mappings for all 4 chips:
+		 *    Chip0 = 32-351
+		 *    Chip1 = 352-671
+		 *    Chip2 = 672-991
+		 *    Chip3 = 4096-4415
+		 */
+		switch (__get_intid_range(hwirq)) {
+		case SPI_RANGE:
+			chip = (hwirq - 32) / 320;
+			break;
+		case ESPI_RANGE:
+			chip = 3;
+			break;
+		default:
+			unreachable();
+		}
+		return t241_dist_base_alias[chip];
+	}
+
+	return gic_data.dist_base;
+}
+
 static inline void __iomem *gic_dist_base(struct irq_data *d)
 {
 	switch (get_intid_range(d)) {
@@ -337,7 +378,7 @@ static int gic_peek_irq(struct irq_data *d, u32 offset)
 	if (gic_irq_in_rdist(d))
 		base = gic_data_rdist_sgi_base();
 	else
-		base = gic_data.dist_base;
+		base = gic_dist_base_alias(d);
 
 	return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask);
 }
@@ -588,7 +629,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
 	if (gic_irq_in_rdist(d))
 		base = gic_data_rdist_sgi_base();
 	else
-		base = gic_data.dist_base;
+		base = gic_dist_base_alias(d);
 
 	offset = convert_offset_index(d, GICD_ICFGR, &index);
 
@@ -1708,6 +1749,43 @@ static bool gic_enable_quirk_hip06_07(void *data)
 	return false;
 }
 
+#define T241_CHIPN_MASK		GENMASK_ULL(45, 44)
+#define T241_CHIP_GICDA_OFFSET	0x1580000
+#define SMCCC_SOC_ID_T241	0x036b0241
+
+static bool gic_enable_quirk_nvidia_t241(void *data)
+{
+	s32 soc_id = arm_smccc_get_soc_id_version();
+	unsigned long chip_bmask = 0;
+	phys_addr_t phys;
+	u32 i;
+
+	/* Check JEP106 code for NVIDIA T241 chip (036b:0241) */
+	if ((soc_id < 0) || (soc_id != SMCCC_SOC_ID_T241))
+		return false;
+
+	/* Find the chips based on GICR regions PHYS addr */
+	for (i = 0; i < gic_data.nr_redist_regions; i++) {
+		chip_bmask |= BIT(FIELD_GET(T241_CHIPN_MASK,
+				  (u64)gic_data.redist_regions[i].phys_base));
+	}
+
+	if (hweight32(chip_bmask) < 3)
+		return false;
+
+	/* Setup GICD alias regions */
+	for (i = 0; i < ARRAY_SIZE(t241_dist_base_alias); i++) {
+		if (chip_bmask & BIT(i)) {
+			phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET;
+			phys |= FIELD_PREP(T241_CHIPN_MASK, i);
+			t241_dist_base_alias[i] = ioremap(phys, SZ_64K);
+			WARN_ON_ONCE(!t241_dist_base_alias[i]);
+		}
+	}
+	static_branch_enable(&gic_nvidia_t241_erratum);
+	return true;
+}
+
 static const struct gic_quirk gic_quirks[] = {
 	{
 		.desc	= "GICv3: Qualcomm MSM8996 broken firmware",
@@ -1739,6 +1817,12 @@ static const struct gic_quirk gic_quirks[] = {
 		.mask	= 0xe8f00fff,
 		.init	= gic_enable_quirk_cavium_38539,
 	},
+	{
+		.desc	= "GICv3: NVIDIA erratum T241-FABRIC-4",
+		.iidr	= 0x0402043b,
+		.mask	= 0xffffffff,
+		.init	= gic_enable_quirk_nvidia_t241,
+	},
 	{
 	}
 };
@@ -1798,7 +1882,8 @@ static void gic_enable_nmi_support(void)
 		gic_chip.flags |= IRQCHIP_SUPPORTS_NMI;
 }
 
-static int __init gic_init_bases(void __iomem *dist_base,
+static int __init gic_init_bases(phys_addr_t dist_phys_base,
+				 void __iomem *dist_base,
 				 struct redist_region *rdist_regs,
 				 u32 nr_redist_regions,
 				 u64 redist_stride,
@@ -1814,6 +1899,7 @@ static int __init gic_init_bases(void __iomem *dist_base,
 		pr_info("GIC: Using split EOI/Deactivate mode\n");
 
 	gic_data.fwnode = handle;
+	gic_data.dist_phys_base = dist_phys_base;
 	gic_data.dist_base = dist_base;
 	gic_data.redist_regions = rdist_regs;
 	gic_data.nr_redist_regions = nr_redist_regions;
@@ -1841,10 +1927,13 @@ static int __init gic_init_bases(void __iomem *dist_base,
 	gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
 						 &gic_data);
 	gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
-	gic_data.rdists.has_rvpeid = true;
-	gic_data.rdists.has_vlpis = true;
-	gic_data.rdists.has_direct_lpi = true;
-	gic_data.rdists.has_vpend_valid_dirty = true;
+	if (!static_branch_unlikely(&gic_nvidia_t241_erratum)) {
+		/* Disable GICv4.x features for the erratum T241-FABRIC-4 */
+		gic_data.rdists.has_rvpeid = true;
+		gic_data.rdists.has_vlpis = true;
+		gic_data.rdists.has_direct_lpi = true;
+		gic_data.rdists.has_vpend_valid_dirty = true;
+	}
 
 	if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
 		err = -ENOMEM;
@@ -2050,6 +2139,7 @@ static void __iomem *gic_of_iomap(struct device_node *node, int idx,
 
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
 {
+	phys_addr_t dist_phys_base;
 	void __iomem *dist_base;
 	struct redist_region *rdist_regs;
 	struct resource res;
@@ -2063,6 +2153,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 		return PTR_ERR(dist_base);
 	}
 
+	dist_phys_base = res.start;
+
 	err = gic_validate_dist_version(dist_base);
 	if (err) {
 		pr_err("%pOF: no distributor detected, giving up\n", node);
@@ -2094,8 +2186,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 
 	gic_enable_of_quirks(node, gic_quirks, &gic_data);
 
-	err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
-			     redist_stride, &node->fwnode);
+	err = gic_init_bases(dist_phys_base, dist_base, rdist_regs,
+			     nr_redist_regions, redist_stride, &node->fwnode);
 	if (err)
 		goto out_unmap_rdist;
 
@@ -2411,8 +2503,9 @@ gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
 		goto out_redist_unmap;
 	}
 
-	err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
-			     acpi_data.nr_redist_regions, 0, gsi_domain_handle);
+	err = gic_init_bases(dist->base_address, acpi_data.dist_base,
+			     acpi_data.redist_regs, acpi_data.nr_redist_regions,
+			     0, gsi_domain_handle);
 	if (err)
 		goto out_fwhandle_free;
 
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 220c8c60e021..f196c19f8e55 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -226,6 +226,24 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit);
 
 extern u64 smccc_has_sve_hint;
 
+/**
+ * arm_smccc_get_soc_id_version()
+ *
+ * Returns the SOC ID version.
+ *
+ * When ARM_SMCCC_ARCH_SOC_ID is not present, returns SMCCC_RET_NOT_SUPPORTED.
+ */
+s32 arm_smccc_get_soc_id_version(void);
+
+/**
+ * arm_smccc_get_soc_id_revision()
+ *
+ * Returns the SOC ID revision.
+ *
+ * When ARM_SMCCC_ARCH_SOC_ID is not present, returns SMCCC_RET_NOT_SUPPORTED.
+ */
+s32 arm_smccc_get_soc_id_revision(void);
+
 /**
  * struct arm_smccc_res - Result from SMC/HVC call
  * @a0-a3 result values from registers 0 to 3
-- 
cgit v1.2.3


From f4708a82dc45fbf08bd3c68fc3eb32397e9121a1 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Tue, 14 Mar 2023 17:00:58 -0300
Subject: notifiers: add tracepoints to the notifiers infrastructure

Currently there is no way to show the callback names for registered,
unregistered or executed notifiers. This is very useful for debug
purposes, hence add this functionality here in the form of notifiers'
tracepoints, one per operation.

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20230314200058.1326909-1-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Cc: Guilherme G. Piccoli <gpiccoli@igalia.com>
Cc: Guilherme G. Piccoli <kernel@gpiccoli.net>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/notifier.h | 69 +++++++++++++++++++++++++++++++++++++++++
 kernel/notifier.c               |  6 ++++
 2 files changed, 75 insertions(+)
 create mode 100644 include/trace/events/notifier.h

(limited to 'include')

diff --git a/include/trace/events/notifier.h b/include/trace/events/notifier.h
new file mode 100644
index 000000000000..26b298a31950
--- /dev/null
+++ b/include/trace/events/notifier.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM notifier
+
+#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NOTIFIERS_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(notifier_info,
+
+	TP_PROTO(void *cb),
+
+	TP_ARGS(cb),
+
+	TP_STRUCT__entry(
+		__field(void *, cb)
+	),
+
+	TP_fast_assign(
+		__entry->cb = cb;
+	),
+
+	TP_printk("%ps", __entry->cb)
+);
+
+/*
+ * notifier_register - called upon notifier callback registration
+ *
+ * @cb:		callback pointer
+ *
+ */
+DEFINE_EVENT(notifier_info, notifier_register,
+
+	TP_PROTO(void *cb),
+
+	TP_ARGS(cb)
+);
+
+/*
+ * notifier_unregister - called upon notifier callback unregistration
+ *
+ * @cb:		callback pointer
+ *
+ */
+DEFINE_EVENT(notifier_info, notifier_unregister,
+
+	TP_PROTO(void *cb),
+
+	TP_ARGS(cb)
+);
+
+/*
+ * notifier_run - called upon notifier callback execution
+ *
+ * @cb:		callback pointer
+ *
+ */
+DEFINE_EVENT(notifier_info, notifier_run,
+
+	TP_PROTO(void *cb),
+
+	TP_ARGS(cb)
+);
+
+#endif /* _TRACE_NOTIFIERS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d353e4b5402d..b3ce28f39eb6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -7,6 +7,9 @@
 #include <linux/vmalloc.h>
 #include <linux/reboot.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
+
 /*
  *	Notifier list for kernel code which wants to be called
  *	at shutdown. This is used to stop any idling DMA operations
@@ -37,6 +40,7 @@ static int notifier_chain_register(struct notifier_block **nl,
 	}
 	n->next = *nl;
 	rcu_assign_pointer(*nl, n);
+	trace_notifier_register((void *)n->notifier_call);
 	return 0;
 }
 
@@ -46,6 +50,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
 	while ((*nl) != NULL) {
 		if ((*nl) == n) {
 			rcu_assign_pointer(*nl, n->next);
+			trace_notifier_unregister((void *)n->notifier_call);
 			return 0;
 		}
 		nl = &((*nl)->next);
@@ -84,6 +89,7 @@ static int notifier_call_chain(struct notifier_block **nl,
 			continue;
 		}
 #endif
+		trace_notifier_run((void *)nb->notifier_call);
 		ret = nb->notifier_call(nb, val, v);
 
 		if (nr_calls)
-- 
cgit v1.2.3


From 890a3ee3ce416e2f1aed41718a8c1d42c82cf1b2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 23 Mar 2023 17:50:29 +0200
Subject: kernel.h: split the hexadecimal related helpers to hex.h

For the sake of cleaning up the kernel.h split the hexadecimal related
helpers to own header called 'hex.h'.

Link: https://lkml.kernel.org/r/20230323155029.40000-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hex.h    | 35 +++++++++++++++++++++++++++++++++++
 include/linux/kernel.h | 29 +----------------------------
 2 files changed, 36 insertions(+), 28 deletions(-)
 create mode 100644 include/linux/hex.h

(limited to 'include')

diff --git a/include/linux/hex.h b/include/linux/hex.h
new file mode 100644
index 000000000000..2618382e5b0c
--- /dev/null
+++ b/include/linux/hex.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HEX_H
+#define _LINUX_HEX_H
+
+#include <linux/types.h>
+
+extern const char hex_asc[];
+#define hex_asc_lo(x)	hex_asc[((x) & 0x0f)]
+#define hex_asc_hi(x)	hex_asc[((x) & 0xf0) >> 4]
+
+static inline char *hex_byte_pack(char *buf, u8 byte)
+{
+	*buf++ = hex_asc_hi(byte);
+	*buf++ = hex_asc_lo(byte);
+	return buf;
+}
+
+extern const char hex_asc_upper[];
+#define hex_asc_upper_lo(x)	hex_asc_upper[((x) & 0x0f)]
+#define hex_asc_upper_hi(x)	hex_asc_upper[((x) & 0xf0) >> 4]
+
+static inline char *hex_byte_pack_upper(char *buf, u8 byte)
+{
+	*buf++ = hex_asc_upper_hi(byte);
+	*buf++ = hex_asc_upper_lo(byte);
+	return buf;
+}
+
+extern int hex_to_bin(unsigned char ch);
+extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
+extern char *bin2hex(char *dst, const void *src, size_t count);
+
+bool mac_pton(const char *s, u8 *mac);
+
+#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 40bce7495af8..0d91e0af0125 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -20,6 +20,7 @@
 #include <linux/compiler.h>
 #include <linux/container_of.h>
 #include <linux/bitops.h>
+#include <linux/hex.h>
 #include <linux/kstrtox.h>
 #include <linux/log2.h>
 #include <linux/math.h>
@@ -263,34 +264,6 @@ extern enum system_states {
 	SYSTEM_SUSPEND,
 } system_state;
 
-extern const char hex_asc[];
-#define hex_asc_lo(x)	hex_asc[((x) & 0x0f)]
-#define hex_asc_hi(x)	hex_asc[((x) & 0xf0) >> 4]
-
-static inline char *hex_byte_pack(char *buf, u8 byte)
-{
-	*buf++ = hex_asc_hi(byte);
-	*buf++ = hex_asc_lo(byte);
-	return buf;
-}
-
-extern const char hex_asc_upper[];
-#define hex_asc_upper_lo(x)	hex_asc_upper[((x) & 0x0f)]
-#define hex_asc_upper_hi(x)	hex_asc_upper[((x) & 0xf0) >> 4]
-
-static inline char *hex_byte_pack_upper(char *buf, u8 byte)
-{
-	*buf++ = hex_asc_upper_hi(byte);
-	*buf++ = hex_asc_upper_lo(byte);
-	return buf;
-}
-
-extern int hex_to_bin(unsigned char ch);
-extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-extern char *bin2hex(char *dst, const void *src, size_t count);
-
-bool mac_pton(const char *s, u8 *mac);
-
 /*
  * General tracing related utility functions - trace_printk(),
  * tracing_on/tracing_off and tracing_start()/tracing_stop
-- 
cgit v1.2.3


From 7982722ff7286596a1e2f343ec4219e309ddc82a Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 7 Mar 2023 16:44:15 -0600
Subject: x86/kexec: remove unnecessary arch_kexec_kernel_image_load()

Patch series "kexec: Remove unnecessary arch hook", v2.

There are no arch-specific things in arch_kexec_kernel_image_load(), so
remove it and just use the generic version.


This patch (of 2):

The x86 implementation of arch_kexec_kernel_image_load() is functionally
identical to the generic arch_kexec_kernel_image_load():

  arch_kexec_kernel_image_load                # x86
    if (!image->fops || !image->fops->load)
      return ERR_PTR(-ENOEXEC);
    return image->fops->load(image, image->kernel_buf, ...)

  arch_kexec_kernel_image_load                # generic
    kexec_image_load_default
      if (!image->fops || !image->fops->load)
	return ERR_PTR(-ENOEXEC);
      return image->fops->load(image, image->kernel_buf, ...)

Remove the x86-specific version and use the generic
arch_kexec_kernel_image_load().  No functional change intended.

Link: https://lkml.kernel.org/r/20230307224416.907040-1-helgaas@kernel.org
Link: https://lkml.kernel.org/r/20230307224416.907040-2-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/kexec.h       |  3 ---
 arch/x86/kernel/machine_kexec_64.c | 11 -----------
 include/linux/kexec.h              |  2 --
 3 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index a3760ca796aa..5b77bbc28f96 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -200,9 +200,6 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 				     const Elf_Shdr *symtab);
 #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
 
-void *arch_kexec_kernel_image_load(struct kimage *image);
-#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load
-
 int arch_kimage_file_post_load_cleanup(struct kimage *image);
 #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
 #endif
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 0611fd83858e..1a3e2c05a8a5 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -374,17 +374,6 @@ void machine_kexec(struct kimage *image)
 /* arch-dependent functionality related to kexec file-based syscall */
 
 #ifdef CONFIG_KEXEC_FILE
-void *arch_kexec_kernel_image_load(struct kimage *image)
-{
-	if (!image->fops || !image->fops->load)
-		return ERR_PTR(-ENOEXEC);
-
-	return image->fops->load(image, image->kernel_buf,
-				 image->kernel_buf_len, image->initrd_buf,
-				 image->initrd_buf_len, image->cmdline_buf,
-				 image->cmdline_buf_len);
-}
-
 /*
  * Apply purgatory relocations.
  *
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 6883c5922701..4746bc9d39c9 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -207,12 +207,10 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image)
 }
 #endif
 
-#ifndef arch_kexec_kernel_image_load
 static inline void *arch_kexec_kernel_image_load(struct kimage *image)
 {
 	return kexec_image_load_default(image);
 }
-#endif
 
 #ifdef CONFIG_KEXEC_SIG
 #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
-- 
cgit v1.2.3


From fb15abdca64503511bb32cb6ff70da306f24fa06 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 7 Mar 2023 16:44:16 -0600
Subject: kexec: remove unnecessary arch_kexec_kernel_image_load()

arch_kexec_kernel_image_load() only calls kexec_image_load_default(), and
there are no arch-specific implementations.

Remove the unnecessary arch_kexec_kernel_image_load() and make
kexec_image_load_default() static.

No functional change intended.

Link: https://lkml.kernel.org/r/20230307224416.907040-3-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h | 6 ------
 kernel/kexec_file.c   | 6 +++---
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 4746bc9d39c9..22b5cd24f581 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -190,7 +190,6 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 				   void *buf, unsigned int size,
 				   bool get_value);
 void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
-void *kexec_image_load_default(struct kimage *image);
 
 #ifndef arch_kexec_kernel_image_probe
 static inline int
@@ -207,11 +206,6 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image)
 }
 #endif
 
-static inline void *arch_kexec_kernel_image_load(struct kimage *image)
-{
-	return kexec_image_load_default(image);
-}
-
 #ifdef CONFIG_KEXEC_SIG
 #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
 int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1a0e4e3fb5c..f989f5f1933b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -65,7 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
 	return ret;
 }
 
-void *kexec_image_load_default(struct kimage *image)
+static void *kexec_image_load_default(struct kimage *image)
 {
 	if (!image->fops || !image->fops->load)
 		return ERR_PTR(-ENOEXEC);
@@ -249,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 	/* IMA needs to pass the measurement list to the next kernel. */
 	ima_add_kexec_buffer(image);
 
-	/* Call arch image load handlers */
-	ldata = arch_kexec_kernel_image_load(image);
+	/* Call image load handler */
+	ldata = kexec_image_load_default(image);
 
 	if (IS_ERR(ldata)) {
 		ret = PTR_ERR(ldata);
-- 
cgit v1.2.3


From 79643567cc34ebd0743f4da3ac8f853e26202453 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 3 Apr 2023 17:46:31 +0800
Subject: dt-bindings: clock: imx8mp: Add LDB clock entry

Add LDB clock entry for i.MX8MP

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Link: https://lore.kernel.org/r/20230403094633.3366446-2-peng.fan@oss.nxp.com
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
---
 include/dt-bindings/clock/imx8mp-clock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mp-clock.h b/include/dt-bindings/clock/imx8mp-clock.h
index ede1f65a3147..3f28ce685f41 100644
--- a/include/dt-bindings/clock/imx8mp-clock.h
+++ b/include/dt-bindings/clock/imx8mp-clock.h
@@ -334,8 +334,8 @@
 #define IMX8MP_CLK_SAI6_ROOT			326
 #define IMX8MP_CLK_SAI7_ROOT			327
 #define IMX8MP_CLK_PDM_ROOT			328
-
-#define IMX8MP_CLK_END				329
+#define IMX8MP_CLK_MEDIA_LDB_ROOT		329
+#define IMX8MP_CLK_END				330
 
 #define IMX8MP_CLK_AUDIOMIX_SAI1_IPG		0
 #define IMX8MP_CLK_AUDIOMIX_SAI1_MCLK1		1
-- 
cgit v1.2.3


From 5fd7b00ca2361c81f2026f82dff93e52afd97a0b Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 3 Apr 2023 17:52:59 +0800
Subject: dt-bindings: clock: imx93: add NIC, A55 and ARM PLL CLK

Add i.MX93 NIC, A55 and ARM PLL CLK.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230403095300.3386988-7-peng.fan@oss.nxp.com
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
---
 include/dt-bindings/clock/imx93-clock.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx93-clock.h b/include/dt-bindings/clock/imx93-clock.h
index 8e02859d8ce2..35a1f62053a5 100644
--- a/include/dt-bindings/clock/imx93-clock.h
+++ b/include/dt-bindings/clock/imx93-clock.h
@@ -199,6 +199,10 @@
 #define IMX93_CLK_MU1_B_GATE		194
 #define IMX93_CLK_MU2_A_GATE		195
 #define IMX93_CLK_MU2_B_GATE		196
-#define IMX93_CLK_END			197
+#define IMX93_CLK_NIC_AXI		197
+#define IMX93_CLK_ARM_PLL		198
+#define IMX93_CLK_A55_SEL		199
+#define IMX93_CLK_A55_CORE		200
+#define IMX93_CLK_END			201
 
 #endif
-- 
cgit v1.2.3


From 5a17818682cf43ad0fdd6035945f3b7a8c9dc5e9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 6 Apr 2023 14:42:46 +0300
Subject: net: dsa: replace NETDEV_PRE_CHANGE_HWTSTAMP notifier with a stub

There was a sort of rush surrounding commit 88c0a6b503b7 ("net: create a
netdev notifier for DSA to reject PTP on DSA master"), due to a desire
to convert DSA's attempt to deny TX timestamping on a DSA master to
something that doesn't block the kernel-wide API conversion from
ndo_eth_ioctl() to ndo_hwtstamp_set().

What was required was a mechanism that did not depend on ndo_eth_ioctl(),
and what was provided was a mechanism that did not depend on
ndo_eth_ioctl(), while at the same time introducing something that
wasn't absolutely necessary - a new netdev notifier.

There have been objections from Jakub Kicinski that using notifiers in
general when they are not absolutely necessary creates complications to
the control flow and difficulties to maintainers who look at the code.
So there is a desire to not use notifiers.

In addition to that, the notifier chain gets called even if there is no
DSA in the system and no one is interested in applying any restriction.

Take the model of udp_tunnel_nic_ops and introduce a stub mechanism,
through which net/core/dev_ioctl.c can call into DSA even when
CONFIG_NET_DSA=m.

Compared to the code that existed prior to the notifier conversion, aka
what was added in commits:
- 4cfab3566710 ("net: dsa: Add wrappers for overloaded ndo_ops")
- 3369afba1e46 ("net: Call into DSA netdevice_ops wrappers")

this is different because we are not overloading any struct
net_device_ops of the DSA master anymore, but rather, we are exposing a
rather specific functionality which is orthogonal to which API is used
to enable it - ndo_eth_ioctl() or ndo_hwtstamp_set().

Also, what is similar is that both approaches use function pointers to
get from built-in code to DSA.

There is no point in replicating the function pointers towards
__dsa_master_hwtstamp_validate() once for every CPU port (dev->dsa_ptr).
Instead, it is sufficient to introduce a singleton struct dsa_stubs,
built into the kernel, which contains a single function pointer to
__dsa_master_hwtstamp_validate().

I find this approach preferable to what we had originally, because
dev->dsa_ptr->netdev_ops->ndo_do_ioctl() used to require going through
struct dsa_port (dev->dsa_ptr), and so, this was incompatible with any
attempts to add any data encapsulation and hide DSA data structures from
the outside world.

Link: https://lore.kernel.org/netdev/20230403083019.120b72fd@kernel.org/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ------
 include/net/dsa_stubs.h   | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 net/Makefile              |  2 +-
 net/core/dev.c            |  2 +-
 net/core/dev_ioctl.c      | 12 ++----------
 net/dsa/Makefile          |  6 ++++++
 net/dsa/dsa.c             | 19 +++++++++++++++++++
 net/dsa/master.c          |  2 +-
 net/dsa/master.h          |  2 +-
 net/dsa/slave.c           | 11 -----------
 net/dsa/stubs.c           | 10 ++++++++++
 11 files changed, 89 insertions(+), 31 deletions(-)
 create mode 100644 include/net/dsa_stubs.h
 create mode 100644 net/dsa/stubs.c

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a740be3bb911..1c25b39681b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2878,7 +2878,6 @@ enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 	NETDEV_XDP_FEAT_CHANGE,
-	NETDEV_PRE_CHANGE_HWTSTAMP,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
@@ -2929,11 +2928,6 @@ struct netdev_notifier_pre_changeaddr_info {
 	const unsigned char *dev_addr;
 };
 
-struct netdev_notifier_hwtstamp_info {
-	struct netdev_notifier_info info; /* must be first */
-	struct kernel_hwtstamp_config *config;
-};
-
 enum netdev_offload_xstats_type {
 	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
 };
diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h
new file mode 100644
index 000000000000..361811750a54
--- /dev/null
+++ b/include/net/dsa_stubs.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework
+ */
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <net/dsa.h>
+
+#if IS_ENABLED(CONFIG_NET_DSA)
+
+extern const struct dsa_stubs *dsa_stubs;
+
+struct dsa_stubs {
+	int (*master_hwtstamp_validate)(struct net_device *dev,
+					const struct kernel_hwtstamp_config *config,
+					struct netlink_ext_ack *extack);
+};
+
+static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
+					       const struct kernel_hwtstamp_config *config,
+					       struct netlink_ext_ack *extack)
+{
+	if (!netdev_uses_dsa(dev))
+		return 0;
+
+	/* rtnl_lock() is a sufficient guarantee, because as long as
+	 * netdev_uses_dsa() returns true, the dsa_core module is still
+	 * registered, and so, dsa_unregister_stubs() couldn't have run.
+	 * For netdev_uses_dsa() to start returning false, it would imply that
+	 * dsa_master_teardown() has executed, which requires rtnl_lock().
+	 */
+	ASSERT_RTNL();
+
+	return dsa_stubs->master_hwtstamp_validate(dev, config, extack);
+}
+
+#else
+
+static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
+					       const struct kernel_hwtstamp_config *config,
+					       struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+#endif
diff --git a/net/Makefile b/net/Makefile
index 0914bea9c335..87592009366f 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
 obj-$(CONFIG_NET_DEVLINK)	+= devlink/
-obj-$(CONFIG_NET_DSA)		+= dsa/
+obj-y				+= dsa/
 obj-$(CONFIG_ATALK)		+= appletalk/
 obj-$(CONFIG_X25)		+= x25/
 obj-$(CONFIG_LAPB)		+= lapb/
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ce5985be84b..480600a075ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1612,7 +1612,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
-	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
+	N(XDP_FEAT_CHANGE)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 6d772837eb3f..3730945ee294 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -7,7 +7,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/wireless.h>
 #include <linux/if_bridge.h>
-#include <net/dsa.h>
+#include <net/dsa_stubs.h>
 #include <net/wext.h>
 
 #include "dev.h"
@@ -259,9 +259,6 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
-	struct netdev_notifier_hwtstamp_info info = {
-		.info.dev = dev,
-	};
 	struct kernel_hwtstamp_config kernel_cfg;
 	struct netlink_ext_ack extack = {};
 	struct hwtstamp_config cfg;
@@ -276,12 +273,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	info.info.extack = &extack;
-	info.config = &kernel_cfg;
-
-	err = call_netdevice_notifiers_info(NETDEV_PRE_CHANGE_HWTSTAMP,
-					    &info.info);
-	err = notifier_to_errno(err);
+	err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
 	if (err) {
 		if (extack._msg)
 			netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index cc7e93a562fe..3835de286116 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+
+# the stubs are built-in whenever DSA is built-in or module
+ifdef CONFIG_NET_DSA
+obj-y := stubs.o
+endif
+
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += \
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index e5f156940c67..ab1afe67fd18 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -17,6 +17,7 @@
 #include <linux/of.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
+#include <net/dsa_stubs.h>
 #include <net/sch_generic.h>
 
 #include "devlink.h"
@@ -1702,6 +1703,20 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
 
+static const struct dsa_stubs __dsa_stubs = {
+	.master_hwtstamp_validate = __dsa_master_hwtstamp_validate,
+};
+
+static void dsa_register_stubs(void)
+{
+	dsa_stubs = &__dsa_stubs;
+}
+
+static void dsa_unregister_stubs(void)
+{
+	dsa_stubs = NULL;
+}
+
 static int __init dsa_init_module(void)
 {
 	int rc;
@@ -1721,6 +1736,8 @@ static int __init dsa_init_module(void)
 	if (rc)
 		goto netlink_register_fail;
 
+	dsa_register_stubs();
+
 	return 0;
 
 netlink_register_fail:
@@ -1735,6 +1752,8 @@ module_init(dsa_init_module);
 
 static void __exit dsa_cleanup_module(void)
 {
+	dsa_unregister_stubs();
+
 	rtnl_link_unregister(&dsa_link_ops);
 
 	dsa_slave_unregister_notifier();
diff --git a/net/dsa/master.c b/net/dsa/master.c
index c2cabe6248b1..6be89ab0cc01 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -198,7 +198,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 /* Deny PTP operations on master if there is at least one switch in the tree
  * that is PTP capable.
  */
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
 				   const struct kernel_hwtstamp_config *config,
 				   struct netlink_ext_ack *extack)
 {
diff --git a/net/dsa/master.h b/net/dsa/master.h
index 80842f4e27f7..76e39d3ec909 100644
--- a/net/dsa/master.h
+++ b/net/dsa/master.h
@@ -15,7 +15,7 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
 			 struct netlink_ext_ack *extack);
 void dsa_master_lag_teardown(struct net_device *lag_dev,
 			     struct dsa_port *cpu_dp);
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
 				   const struct kernel_hwtstamp_config *config,
 				   struct netlink_ext_ack *extack);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8abc1658ac47..165bb2cb8431 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -3289,7 +3289,6 @@ static int dsa_master_changeupper(struct net_device *dev,
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
 {
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
@@ -3419,16 +3418,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 
 		return NOTIFY_OK;
 	}
-	case NETDEV_PRE_CHANGE_HWTSTAMP: {
-		struct netdev_notifier_hwtstamp_info *info = ptr;
-		int err;
-
-		if (!netdev_uses_dsa(dev))
-			return NOTIFY_DONE;
-
-		err = dsa_master_pre_change_hwtstamp(dev, info->config, extack);
-		return notifier_from_errno(err);
-	}
 	default:
 		break;
 	}
diff --git a/net/dsa/stubs.c b/net/dsa/stubs.c
new file mode 100644
index 000000000000..2ed8a6c85fbf
--- /dev/null
+++ b/net/dsa/stubs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Stubs for DSA functionality called by the core network stack.
+ * These are necessary because CONFIG_NET_DSA can be a module, and built-in
+ * code cannot directly call symbols exported by modules.
+ */
+#include <net/dsa_stubs.h>
+
+const struct dsa_stubs *dsa_stubs;
+EXPORT_SYMBOL_GPL(dsa_stubs);
-- 
cgit v1.2.3


From 38416c28e16890b52fdd5eb73479299ec3f062f3 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Fri, 31 Mar 2023 15:40:28 +0300
Subject: iio: light: Add gain-time-scale helpers

Some light sensors can adjust both the HW-gain and integration time.
There are cases where adjusting the integration time has similar impact
to the scale of the reported values as gain setting has.

IIO users do typically expect to handle scale by a single writable 'scale'
entry. Driver should then adjust the gain/time accordingly.

It however is difficult for a driver to know whether it should change
gain or integration time to meet the requested scale. Usually it is
preferred to have longer integration time which usually improves
accuracy, but there may be use-cases where long measurement times can be
an issue. Thus it can be preferable to allow also changing the
integration time - but mitigate the scale impact by also changing the gain
underneath. Eg, if integration time change doubles the measured values,
the driver can reduce the HW-gain to half.

The theory of the computations of gain-time-scale is simple. However,
some people (undersigned) got that implemented wrong for more than once.

Add some gain-time-scale helpers in order to not dublicate errors in all
drivers needing these computations.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/268d418e7cffcdaa2ece6738478bbc57692c213e.1680263956.git.mazziesaccount@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/Kconfig                   |    3 +
 drivers/iio/Makefile                  |    1 +
 drivers/iio/industrialio-gts-helper.c | 1077 +++++++++++++++++++++++++++++++++
 include/linux/iio/iio-gts-helper.h    |  206 +++++++
 4 files changed, 1287 insertions(+)
 create mode 100644 drivers/iio/industrialio-gts-helper.c
 create mode 100644 include/linux/iio/iio-gts-helper.h

(limited to 'include')

diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
index b190846c3dc2..52eb46ef84c1 100644
--- a/drivers/iio/Kconfig
+++ b/drivers/iio/Kconfig
@@ -30,6 +30,9 @@ config IIO_CONFIGFS
 	  (e.g. software triggers). For more info see
 	  Documentation/iio/iio_configfs.rst.
 
+config IIO_GTS_HELPER
+	tristate
+
 config IIO_TRIGGER
 	bool "Enable triggered sampling support"
 	help
diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
index 3be08cdadd7e..9622347a1c1b 100644
--- a/drivers/iio/Makefile
+++ b/drivers/iio/Makefile
@@ -9,6 +9,7 @@ industrialio-$(CONFIG_IIO_BUFFER) += industrialio-buffer.o
 industrialio-$(CONFIG_IIO_TRIGGER) += industrialio-trigger.o
 
 obj-$(CONFIG_IIO_CONFIGFS) += industrialio-configfs.o
+obj-$(CONFIG_IIO_GTS_HELPER) += industrialio-gts-helper.o
 obj-$(CONFIG_IIO_SW_DEVICE) += industrialio-sw-device.o
 obj-$(CONFIG_IIO_SW_TRIGGER) += industrialio-sw-trigger.o
 obj-$(CONFIG_IIO_TRIGGERED_EVENT) += industrialio-triggered-event.o
diff --git a/drivers/iio/industrialio-gts-helper.c b/drivers/iio/industrialio-gts-helper.c
new file mode 100644
index 000000000000..8bb68975b259
--- /dev/null
+++ b/drivers/iio/industrialio-gts-helper.c
@@ -0,0 +1,1077 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* gain-time-scale conversion helpers for IIO light sensors
+ *
+ * Copyright (c) 2023 Matti Vaittinen <mazziesaccount@gmail.com>
+ */
+
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/types.h>
+#include <linux/units.h>
+
+#include <linux/iio/iio-gts-helper.h>
+#include <linux/iio/types.h>
+
+/**
+ * iio_gts_get_gain - Convert scale to total gain
+ *
+ * Internal helper for converting scale to total gain.
+ *
+ * @max:	Maximum linearized scale. As an example, when scale is created
+ *		in magnitude of NANOs and max scale is 64.1 - The linearized
+ *		scale is 64 100 000 000.
+ * @scale:	Linearized scale to compute the gain for.
+ *
+ * Return:	(floored) gain corresponding to the scale. -EINVAL if scale
+ *		is invalid.
+ */
+static int iio_gts_get_gain(const u64 max, const u64 scale)
+{
+	u64 full = max;
+	int tmp = 1;
+
+	if (scale > full || !scale)
+		return -EINVAL;
+
+	if (U64_MAX - full < scale) {
+		/* Risk of overflow */
+		if (full - scale < scale)
+			return 1;
+
+		full -= scale;
+		tmp++;
+	}
+
+	while (full > scale * (u64)tmp)
+		tmp++;
+
+	return tmp;
+}
+
+/**
+ * gain_get_scale_fraction - get the gain or time based on scale and known one
+ *
+ * @max:	Maximum linearized scale. As an example, when scale is created
+ *		in magnitude of NANOs and max scale is 64.1 - The linearized
+ *		scale is 64 100 000 000.
+ * @scale:	Linearized scale to compute the gain/time for.
+ * @known:	Either integration time or gain depending on which one is known
+ * @unknown:	Pointer to variable where the computed gain/time is stored
+ *
+ * Internal helper for computing unknown fraction of total gain.
+ * Compute either gain or time based on scale and either the gain or time
+ * depending on which one is known.
+ *
+ * Return:	0 on success.
+ */
+static int gain_get_scale_fraction(const u64 max, u64 scale, int known,
+				   int *unknown)
+{
+	int tot_gain;
+
+	tot_gain = iio_gts_get_gain(max, scale);
+	if (tot_gain < 0)
+		return tot_gain;
+
+	*unknown = tot_gain / known;
+
+	/* We require total gain to be exact multiple of known * unknown */
+	if (!*unknown || *unknown * known != tot_gain)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int iio_gts_delinearize(u64 lin_scale, unsigned long scaler,
+			       int *scale_whole, int *scale_nano)
+{
+	int frac;
+
+	if (scaler > NANO)
+		return -EOVERFLOW;
+
+	if (!scaler)
+		return -EINVAL;
+
+	frac = do_div(lin_scale, scaler);
+
+	*scale_whole = lin_scale;
+	*scale_nano = frac * (NANO / scaler);
+
+	return 0;
+}
+
+static int iio_gts_linearize(int scale_whole, int scale_nano,
+			     unsigned long scaler, u64 *lin_scale)
+{
+	/*
+	 * Expect scale to be (mostly) NANO or MICRO. Divide divider instead of
+	 * multiplication followed by division to avoid overflow.
+	 */
+	if (scaler > NANO || !scaler)
+		return -EINVAL;
+
+	*lin_scale = (u64)scale_whole * (u64)scaler +
+		     (u64)(scale_nano / (NANO / scaler));
+
+	return 0;
+}
+
+/**
+ * iio_gts_total_gain_to_scale - convert gain to scale
+ * @gts:	Gain time scale descriptor
+ * @total_gain:	the gain to be converted
+ * @scale_int:	Pointer to integral part of the scale (typically val1)
+ * @scale_nano:	Pointer to fractional part of the scale (nano or ppb)
+ *
+ * Convert the total gain value to scale. NOTE: This does not separate gain
+ * generated by HW-gain or integration time. It is up to caller to decide what
+ * part of the total gain is due to integration time and what due to HW-gain.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int iio_gts_total_gain_to_scale(struct iio_gts *gts, int total_gain,
+				int *scale_int, int *scale_nano)
+{
+	u64 tmp;
+
+	tmp = gts->max_scale;
+
+	do_div(tmp, total_gain);
+
+	return iio_gts_delinearize(tmp, NANO, scale_int, scale_nano);
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_total_gain_to_scale, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_purge_avail_scale_table - free-up the available scale tables
+ * @gts:	Gain time scale descriptor
+ *
+ * Free the space reserved by iio_gts_build_avail_scale_table().
+ */
+static void iio_gts_purge_avail_scale_table(struct iio_gts *gts)
+{
+	int i;
+
+	if (gts->per_time_avail_scale_tables) {
+		for (i = 0; i < gts->num_itime; i++)
+			kfree(gts->per_time_avail_scale_tables[i]);
+
+		kfree(gts->per_time_avail_scale_tables);
+		gts->per_time_avail_scale_tables = NULL;
+	}
+
+	kfree(gts->avail_all_scales_table);
+	gts->avail_all_scales_table = NULL;
+
+	gts->num_avail_all_scales = 0;
+}
+
+static int iio_gts_gain_cmp(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+static int gain_to_scaletables(struct iio_gts *gts, int **gains, int **scales)
+{
+	int ret, i, j, new_idx, time_idx;
+	int *all_gains;
+	size_t gain_bytes;
+
+	for (i = 0; i < gts->num_itime; i++) {
+		/*
+		 * Sort the tables for nice output and for easier finding of
+		 * unique values.
+		 */
+		sort(gains[i], gts->num_hwgain, sizeof(int), iio_gts_gain_cmp,
+		     NULL);
+
+		/* Convert gains to scales */
+		for (j = 0; j < gts->num_hwgain; j++) {
+			ret = iio_gts_total_gain_to_scale(gts, gains[i][j],
+							  &scales[i][2 * j],
+							  &scales[i][2 * j + 1]);
+			if (ret)
+				return ret;
+		}
+	}
+
+	gain_bytes = array_size(gts->num_hwgain, sizeof(int));
+	all_gains = kcalloc(gts->num_itime, gain_bytes, GFP_KERNEL);
+	if (!all_gains)
+		return -ENOMEM;
+
+	/*
+	 * We assume all the gains for same integration time were unique.
+	 * It is likely the first time table had greatest time multiplier as
+	 * the times are in the order of preference and greater times are
+	 * usually preferred. Hence we start from the last table which is likely
+	 * to have the smallest total gains.
+	 */
+	time_idx = gts->num_itime - 1;
+	memcpy(all_gains, gains[time_idx], gain_bytes);
+	new_idx = gts->num_hwgain;
+
+	while (time_idx--) {
+		for (j = 0; j < gts->num_hwgain; j++) {
+			int candidate = gains[time_idx][j];
+			int chk;
+
+			if (candidate > all_gains[new_idx - 1]) {
+				all_gains[new_idx] = candidate;
+				new_idx++;
+
+				continue;
+			}
+			for (chk = 0; chk < new_idx; chk++)
+				if (candidate <= all_gains[chk])
+					break;
+
+			if (candidate == all_gains[chk])
+				continue;
+
+			memmove(&all_gains[chk + 1], &all_gains[chk],
+				(new_idx - chk) * sizeof(int));
+			all_gains[chk] = candidate;
+			new_idx++;
+		}
+	}
+
+	gts->avail_all_scales_table = kcalloc(new_idx, 2 * sizeof(int),
+					      GFP_KERNEL);
+	if (!gts->avail_all_scales_table) {
+		ret = -ENOMEM;
+		goto free_out;
+	}
+	gts->num_avail_all_scales = new_idx;
+
+	for (i = 0; i < gts->num_avail_all_scales; i++) {
+		ret = iio_gts_total_gain_to_scale(gts, all_gains[i],
+					&gts->avail_all_scales_table[i * 2],
+					&gts->avail_all_scales_table[i * 2 + 1]);
+
+		if (ret) {
+			kfree(gts->avail_all_scales_table);
+			gts->num_avail_all_scales = 0;
+			goto free_out;
+		}
+	}
+
+free_out:
+	kfree(all_gains);
+
+	return ret;
+}
+
+/**
+ * iio_gts_build_avail_scale_table - create tables of available scales
+ * @gts:	Gain time scale descriptor
+ *
+ * Build the tables which can represent the available scales based on the
+ * originally given gain and time tables. When both time and gain tables are
+ * given this results:
+ * 1. A set of tables representing available scales for each supported
+ *    integration time.
+ * 2. A single table listing all the unique scales that any combination of
+ *    supported gains and times can provide.
+ *
+ * NOTE: Space allocated for the tables must be freed using
+ * iio_gts_purge_avail_scale_table() when the tables are no longer needed.
+ *
+ * Return: 0 on success.
+ */
+static int iio_gts_build_avail_scale_table(struct iio_gts *gts)
+{
+	int **per_time_gains, **per_time_scales, i, j, ret = -ENOMEM;
+
+	per_time_gains = kcalloc(gts->num_itime, sizeof(*per_time_gains), GFP_KERNEL);
+	if (!per_time_gains)
+		return ret;
+
+	per_time_scales = kcalloc(gts->num_itime, sizeof(*per_time_scales), GFP_KERNEL);
+	if (!per_time_scales)
+		goto free_gains;
+
+	for (i = 0; i < gts->num_itime; i++) {
+		per_time_scales[i] = kcalloc(gts->num_hwgain, 2 * sizeof(int),
+					     GFP_KERNEL);
+		if (!per_time_scales[i])
+			goto err_free_out;
+
+		per_time_gains[i] = kcalloc(gts->num_hwgain, sizeof(int),
+					    GFP_KERNEL);
+		if (!per_time_gains[i]) {
+			kfree(per_time_scales[i]);
+			goto err_free_out;
+		}
+
+		for (j = 0; j < gts->num_hwgain; j++)
+			per_time_gains[i][j] = gts->hwgain_table[j].gain *
+					       gts->itime_table[i].mul;
+	}
+
+	ret = gain_to_scaletables(gts, per_time_gains, per_time_scales);
+	if (ret)
+		goto err_free_out;
+
+	kfree(per_time_gains);
+	gts->per_time_avail_scale_tables = per_time_scales;
+
+	return 0;
+
+err_free_out:
+	for (i--; i; i--) {
+		kfree(per_time_scales[i]);
+		kfree(per_time_gains[i]);
+	}
+	kfree(per_time_scales);
+free_gains:
+	kfree(per_time_gains);
+
+	return ret;
+}
+
+/**
+ * iio_gts_build_avail_time_table - build table of available integration times
+ * @gts:	Gain time scale descriptor
+ *
+ * Build the table which can represent the available times to be returned
+ * to users using the read_avail-callback.
+ *
+ * NOTE: Space allocated for the tables must be freed using
+ * iio_gts_purge_avail_time_table() when the tables are no longer needed.
+ *
+ * Return: 0 on success.
+ */
+static int iio_gts_build_avail_time_table(struct iio_gts *gts)
+{
+	int *times, i, j, idx = 0;
+
+	if (!gts->num_itime)
+		return 0;
+
+	times = kcalloc(gts->num_itime, sizeof(int), GFP_KERNEL);
+	if (!times)
+		return -ENOMEM;
+
+	/* Sort times from all tables to one and remove duplicates */
+	for (i = gts->num_itime - 1; i >= 0; i--) {
+		int new = gts->itime_table[i].time_us;
+
+		if (times[idx] < new) {
+			times[idx++] = new;
+			continue;
+		}
+
+		for (j = 0; j <= idx; j++) {
+			if (times[j] > new) {
+				memmove(&times[j + 1], &times[j],
+					(idx - j) * sizeof(int));
+				times[j] = new;
+				idx++;
+			}
+		}
+	}
+	gts->avail_time_tables = times;
+	/*
+	 * This is just to survive a unlikely corner-case where times in the
+	 * given time table were not unique. Else we could just trust the
+	 * gts->num_itime.
+	 */
+	gts->num_avail_time_tables = idx;
+
+	return 0;
+}
+
+/**
+ * iio_gts_purge_avail_time_table - free-up the available integration time table
+ * @gts:	Gain time scale descriptor
+ *
+ * Free the space reserved by iio_gts_build_avail_time_table().
+ */
+static void iio_gts_purge_avail_time_table(struct iio_gts *gts)
+{
+	if (gts->num_avail_time_tables) {
+		kfree(gts->avail_time_tables);
+		gts->avail_time_tables = NULL;
+		gts->num_avail_time_tables = 0;
+	}
+}
+
+/**
+ * iio_gts_build_avail_tables - create tables of available scales and int times
+ * @gts:	Gain time scale descriptor
+ *
+ * Build the tables which can represent the available scales and available
+ * integration times. Availability tables are built based on the originally
+ * given gain and given time tables.
+ *
+ * When both time and gain tables are
+ * given this results:
+ * 1. A set of sorted tables representing available scales for each supported
+ *    integration time.
+ * 2. A single sorted table listing all the unique scales that any combination
+ *    of supported gains and times can provide.
+ * 3. A sorted table of supported integration times
+ *
+ * After these tables are built one can use the iio_gts_all_avail_scales(),
+ * iio_gts_avail_scales_for_time() and iio_gts_avail_times() helpers to
+ * implement the read_avail operations.
+ *
+ * NOTE: Space allocated for the tables must be freed using
+ * iio_gts_purge_avail_tables() when the tables are no longer needed.
+ *
+ * Return: 0 on success.
+ */
+static int iio_gts_build_avail_tables(struct iio_gts *gts)
+{
+	int ret;
+
+	ret = iio_gts_build_avail_scale_table(gts);
+	if (ret)
+		return ret;
+
+	ret = iio_gts_build_avail_time_table(gts);
+	if (ret)
+		iio_gts_purge_avail_scale_table(gts);
+
+	return ret;
+}
+
+/**
+ * iio_gts_purge_avail_tables - free-up the availability tables
+ * @gts:	Gain time scale descriptor
+ *
+ * Free the space reserved by iio_gts_build_avail_tables(). Frees both the
+ * integration time and scale tables.
+ */
+static void iio_gts_purge_avail_tables(struct iio_gts *gts)
+{
+	iio_gts_purge_avail_time_table(gts);
+	iio_gts_purge_avail_scale_table(gts);
+}
+
+static void devm_iio_gts_avail_all_drop(void *res)
+{
+	iio_gts_purge_avail_tables(res);
+}
+
+/**
+ * devm_iio_gts_build_avail_tables - manged add availability tables
+ * @dev:	Pointer to the device whose lifetime tables are bound
+ * @gts:	Gain time scale descriptor
+ *
+ * Build the tables which can represent the available scales and available
+ * integration times. Availability tables are built based on the originally
+ * given gain and given time tables.
+ *
+ * When both time and gain tables are given this results:
+ * 1. A set of sorted tables representing available scales for each supported
+ *    integration time.
+ * 2. A single sorted table listing all the unique scales that any combination
+ *    of supported gains and times can provide.
+ * 3. A sorted table of supported integration times
+ *
+ * After these tables are built one can use the iio_gts_all_avail_scales(),
+ * iio_gts_avail_scales_for_time() and iio_gts_avail_times() helpers to
+ * implement the read_avail operations.
+ *
+ * The tables are automatically released upon device detach.
+ *
+ * Return: 0 on success.
+ */
+static int devm_iio_gts_build_avail_tables(struct device *dev,
+					   struct iio_gts *gts)
+{
+	int ret;
+
+	ret = iio_gts_build_avail_tables(gts);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, devm_iio_gts_avail_all_drop, gts);
+}
+
+static int sanity_check_time(const struct iio_itime_sel_mul *t)
+{
+	if (t->sel < 0 || t->time_us < 0 || t->mul <= 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int sanity_check_gain(const struct iio_gain_sel_pair *g)
+{
+	if (g->sel < 0 || g->gain <= 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int iio_gts_sanity_check(struct iio_gts *gts)
+{
+	int g, t, ret;
+
+	if (!gts->num_hwgain && !gts->num_itime)
+		return -EINVAL;
+
+	for (t = 0; t < gts->num_itime; t++) {
+		ret = sanity_check_time(&gts->itime_table[t]);
+		if (ret)
+			return ret;
+	}
+
+	for (g = 0; g < gts->num_hwgain; g++) {
+		ret = sanity_check_gain(&gts->hwgain_table[g]);
+		if (ret)
+			return ret;
+	}
+
+	for (g = 0; g < gts->num_hwgain; g++) {
+		for (t = 0; t < gts->num_itime; t++) {
+			int gain, mul, res;
+
+			gain = gts->hwgain_table[g].gain;
+			mul = gts->itime_table[t].mul;
+
+			if (check_mul_overflow(gain, mul, &res))
+				return -EOVERFLOW;
+		}
+	}
+
+	return 0;
+}
+
+static int iio_init_iio_gts(int max_scale_int, int max_scale_nano,
+			const struct iio_gain_sel_pair *gain_tbl, int num_gain,
+			const struct iio_itime_sel_mul *tim_tbl, int num_times,
+			struct iio_gts *gts)
+{
+	int ret;
+
+	memset(gts, 0, sizeof(*gts));
+
+	ret = iio_gts_linearize(max_scale_int, max_scale_nano, NANO,
+				   &gts->max_scale);
+	if (ret)
+		return ret;
+
+	gts->hwgain_table = gain_tbl;
+	gts->num_hwgain = num_gain;
+	gts->itime_table = tim_tbl;
+	gts->num_itime = num_times;
+
+	return iio_gts_sanity_check(gts);
+}
+
+/**
+ * devm_iio_init_iio_gts - Initialize the gain-time-scale helper
+ * @dev:		Pointer to the device whose lifetime gts resources are
+ *			bound
+ * @max_scale_int:	integer part of the maximum scale value
+ * @max_scale_nano:	fraction part of the maximum scale value
+ * @gain_tbl:		table describing supported gains
+ * @num_gain:		number of gains in the gain table
+ * @tim_tbl:		table describing supported integration times. Provide
+ *			the integration time table sorted so that the preferred
+ *			integration time is in the first array index. The search
+ *			functions like the
+ *			iio_gts_find_time_and_gain_sel_for_scale() start search
+ *			from first provided time.
+ * @num_times:		number of times in the time table
+ * @gts:		pointer to the helper struct
+ *
+ * Initialize the gain-time-scale helper for use. Note, gains, times, selectors
+ * and multipliers must be positive. Negative values are reserved for error
+ * checking. The total gain (maximum gain * maximum time multiplier) must not
+ * overflow int. The allocated resources will be released upon device detach.
+ *
+ * Return: 0 on success.
+ */
+int devm_iio_init_iio_gts(struct device *dev, int max_scale_int, int max_scale_nano,
+			  const struct iio_gain_sel_pair *gain_tbl, int num_gain,
+			  const struct iio_itime_sel_mul *tim_tbl, int num_times,
+			  struct iio_gts *gts)
+{
+	int ret;
+
+	ret = iio_init_iio_gts(max_scale_int, max_scale_nano, gain_tbl,
+			       num_gain, tim_tbl, num_times, gts);
+	if (ret)
+		return ret;
+
+	return devm_iio_gts_build_avail_tables(dev, gts);
+}
+EXPORT_SYMBOL_NS_GPL(devm_iio_init_iio_gts, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_all_avail_scales - helper for listing all available scales
+ * @gts:	Gain time scale descriptor
+ * @vals:	Returned array of supported scales
+ * @type:	Type of returned scale values
+ * @length:	Amount of returned values in array
+ *
+ * Return: a value suitable to be returned from read_avail or a negative error.
+ */
+int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type,
+			     int *length)
+{
+	if (!gts->num_avail_all_scales)
+		return -EINVAL;
+
+	*vals = gts->avail_all_scales_table;
+	*type = IIO_VAL_INT_PLUS_NANO;
+	*length = gts->num_avail_all_scales * 2;
+
+	return IIO_AVAIL_LIST;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_all_avail_scales, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_avail_scales_for_time - list scales for integration time
+ * @gts:	Gain time scale descriptor
+ * @time:	Integration time for which the scales are listed
+ * @vals:	Returned array of supported scales
+ * @type:	Type of returned scale values
+ * @length:	Amount of returned values in array
+ *
+ * Drivers which do not allow scale setting to change integration time can
+ * use this helper to list only the scales which are valid for given integration
+ * time.
+ *
+ * Return: a value suitable to be returned from read_avail or a negative error.
+ */
+int iio_gts_avail_scales_for_time(struct iio_gts *gts, int time,
+				  const int **vals, int *type, int *length)
+{
+	int i;
+
+	for (i = 0; i < gts->num_itime; i++)
+		if (gts->itime_table[i].time_us == time)
+			break;
+
+	if (i == gts->num_itime)
+		return -EINVAL;
+
+	*vals = gts->per_time_avail_scale_tables[i];
+	*type = IIO_VAL_INT_PLUS_NANO;
+	*length = gts->num_hwgain * 2;
+
+	return IIO_AVAIL_LIST;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_avail_scales_for_time, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_avail_times - helper for listing available integration times
+ * @gts:	Gain time scale descriptor
+ * @vals:	Returned array of supported times
+ * @type:	Type of returned scale values
+ * @length:	Amount of returned values in array
+ *
+ * Return: a value suitable to be returned from read_avail or a negative error.
+ */
+int iio_gts_avail_times(struct iio_gts *gts,  const int **vals, int *type,
+			int *length)
+{
+	if (!gts->num_avail_time_tables)
+		return -EINVAL;
+
+	*vals = gts->avail_time_tables;
+	*type = IIO_VAL_INT;
+	*length = gts->num_avail_time_tables;
+
+	return IIO_AVAIL_LIST;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_avail_times, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_find_sel_by_gain - find selector corresponding to a HW-gain
+ * @gts:	Gain time scale descriptor
+ * @gain:	HW-gain for which matching selector is searched for
+ *
+ * Return:	a selector matching given HW-gain or -EINVAL if selector was
+ *		not found.
+ */
+int iio_gts_find_sel_by_gain(struct iio_gts *gts, int gain)
+{
+	int i;
+
+	for (i = 0; i < gts->num_hwgain; i++)
+		if (gts->hwgain_table[i].gain == gain)
+			return gts->hwgain_table[i].sel;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_find_sel_by_gain, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_find_gain_by_sel - find HW-gain corresponding to a selector
+ * @gts:	Gain time scale descriptor
+ * @sel:	selector for which matching HW-gain is searched for
+ *
+ * Return:	a HW-gain matching given selector or -EINVAL if HW-gain was not
+ *		found.
+ */
+int iio_gts_find_gain_by_sel(struct iio_gts *gts, int sel)
+{
+	int i;
+
+	for (i = 0; i < gts->num_hwgain; i++)
+		if (gts->hwgain_table[i].sel == sel)
+			return gts->hwgain_table[i].gain;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_find_gain_by_sel, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_get_min_gain - find smallest valid HW-gain
+ * @gts:	Gain time scale descriptor
+ *
+ * Return:	The smallest HW-gain -EINVAL if no HW-gains were in the tables.
+ */
+int iio_gts_get_min_gain(struct iio_gts *gts)
+{
+	int i, min = -EINVAL;
+
+	for (i = 0; i < gts->num_hwgain; i++) {
+		int gain = gts->hwgain_table[i].gain;
+
+		if (min == -EINVAL)
+			min = gain;
+		else
+			min = min(min, gain);
+	}
+
+	return min;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_get_min_gain, IIO_GTS_HELPER);
+
+/**
+ * iio_find_closest_gain_low - Find the closest lower matching gain
+ * @gts:	Gain time scale descriptor
+ * @gain:	HW-gain for which the closest match is searched
+ * @in_range:	indicate if the @gain was actually in the range of
+ *		supported gains.
+ *
+ * Search for closest supported gain that is lower than or equal to the
+ * gain given as a parameter. This is usable for drivers which do not require
+ * user to request exact matching gain but rather for rounding to a supported
+ * gain value which is equal or lower (setting lower gain is typical for
+ * avoiding saturation)
+ *
+ * Return:	The closest matching supported gain or -EINVAL if @gain
+ *		was smaller than the smallest supported gain.
+ */
+int iio_find_closest_gain_low(struct iio_gts *gts, int gain, bool *in_range)
+{
+	int i, diff = 0;
+	int best = -1;
+
+	*in_range = false;
+
+	for (i = 0; i < gts->num_hwgain; i++) {
+		if (gain == gts->hwgain_table[i].gain) {
+			*in_range = true;
+			return gain;
+		}
+
+		if (gain > gts->hwgain_table[i].gain) {
+			if (!diff) {
+				diff = gain - gts->hwgain_table[i].gain;
+				best = i;
+			} else {
+				int tmp = gain - gts->hwgain_table[i].gain;
+
+				if (tmp < diff) {
+					diff = tmp;
+					best = i;
+				}
+			}
+		} else {
+			/*
+			 * We found valid HW-gain which is greater than
+			 * reference. So, unless we return a failure below we
+			 * will have found an in-range gain
+			 */
+			*in_range = true;
+		}
+	}
+	/* The requested gain was smaller than anything we support */
+	if (!diff) {
+		*in_range = false;
+
+		return -EINVAL;
+	}
+
+	return gts->hwgain_table[best].gain;
+}
+EXPORT_SYMBOL_NS_GPL(iio_find_closest_gain_low, IIO_GTS_HELPER);
+
+static int iio_gts_get_int_time_gain_multiplier_by_sel(struct iio_gts *gts,
+						       int sel)
+{
+	const struct iio_itime_sel_mul *time;
+
+	time = iio_gts_find_itime_by_sel(gts, sel);
+	if (!time)
+		return -EINVAL;
+
+	return time->mul;
+}
+
+/**
+ * iio_gts_find_gain_for_scale_using_time - Find gain by time and scale
+ * @gts:	Gain time scale descriptor
+ * @time_sel:	Integration time selector corresponding to the time gain is
+ *		searched for
+ * @scale_int:	Integral part of the scale (typically val1)
+ * @scale_nano:	Fractional part of the scale (nano or ppb)
+ * @gain:	Pointer to value where gain is stored.
+ *
+ * In some cases the light sensors may want to find a gain setting which
+ * corresponds given scale and integration time. Sensors which fill the
+ * gain and time tables may use this helper to retrieve the gain.
+ *
+ * Return:	0 on success. -EINVAL if gain matching the parameters is not
+ *		found.
+ */
+static int iio_gts_find_gain_for_scale_using_time(struct iio_gts *gts, int time_sel,
+						  int scale_int, int scale_nano,
+						  int *gain)
+{
+	u64 scale_linear;
+	int ret, mul;
+
+	ret = iio_gts_linearize(scale_int, scale_nano, NANO, &scale_linear);
+	if (ret)
+		return ret;
+
+	ret = iio_gts_get_int_time_gain_multiplier_by_sel(gts, time_sel);
+	if (ret < 0)
+		return ret;
+
+	mul = ret;
+
+	ret = gain_get_scale_fraction(gts->max_scale, scale_linear, mul, gain);
+	if (ret)
+		return ret;
+
+	if (!iio_gts_valid_gain(gts, *gain))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * iio_gts_find_gain_sel_for_scale_using_time - Fetch gain selector.
+ * @gts:	Gain time scale descriptor
+ * @time_sel:	Integration time selector corresponding to the time gain is
+ *		searched for
+ * @scale_int:	Integral part of the scale (typically val1)
+ * @scale_nano:	Fractional part of the scale (nano or ppb)
+ * @gain_sel:	Pointer to value where gain selector is stored.
+ *
+ * See iio_gts_find_gain_for_scale_using_time() for more information
+ */
+int iio_gts_find_gain_sel_for_scale_using_time(struct iio_gts *gts, int time_sel,
+					       int scale_int, int scale_nano,
+					       int *gain_sel)
+{
+	int gain, ret;
+
+	ret = iio_gts_find_gain_for_scale_using_time(gts, time_sel, scale_int,
+						     scale_nano, &gain);
+	if (ret)
+		return ret;
+
+	ret = iio_gts_find_sel_by_gain(gts, gain);
+	if (ret < 0)
+		return ret;
+
+	*gain_sel = ret;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_find_gain_sel_for_scale_using_time, IIO_GTS_HELPER);
+
+static int iio_gts_get_total_gain(struct iio_gts *gts, int gain, int time)
+{
+	const struct iio_itime_sel_mul *itime;
+
+	if (!iio_gts_valid_gain(gts, gain))
+		return -EINVAL;
+
+	if (!gts->num_itime)
+		return gain;
+
+	itime = iio_gts_find_itime_by_time(gts, time);
+	if (!itime)
+		return -EINVAL;
+
+	return gain * itime->mul;
+}
+
+static int iio_gts_get_scale_linear(struct iio_gts *gts, int gain, int time,
+				    u64 *scale)
+{
+	int total_gain;
+	u64 tmp;
+
+	total_gain = iio_gts_get_total_gain(gts, gain, time);
+	if (total_gain < 0)
+		return total_gain;
+
+	tmp = gts->max_scale;
+
+	do_div(tmp, total_gain);
+
+	*scale = tmp;
+
+	return 0;
+}
+
+/**
+ * iio_gts_get_scale - get scale based on integration time and HW-gain
+ * @gts:	Gain time scale descriptor
+ * @gain:	HW-gain for which the scale is computed
+ * @time:	Integration time for which the scale is computed
+ * @scale_int:	Integral part of the scale (typically val1)
+ * @scale_nano:	Fractional part of the scale (nano or ppb)
+ *
+ * Compute scale matching the integration time and HW-gain given as parameter.
+ *
+ * Return: 0 on success.
+ */
+int iio_gts_get_scale(struct iio_gts *gts, int gain, int time, int *scale_int,
+		      int *scale_nano)
+{
+	u64 lin_scale;
+	int ret;
+
+	ret = iio_gts_get_scale_linear(gts, gain, time, &lin_scale);
+	if (ret)
+		return ret;
+
+	return iio_gts_delinearize(lin_scale, NANO, scale_int, scale_nano);
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_get_scale, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_find_new_gain_sel_by_old_gain_time - compensate for time change
+ * @gts:		Gain time scale descriptor
+ * @old_gain:		Previously set gain
+ * @old_time_sel:	Selector corresponding previously set time
+ * @new_time_sel:	Selector corresponding new time to be set
+ * @new_gain:		Pointer to value where new gain is to be written
+ *
+ * We may want to mitigate the scale change caused by setting a new integration
+ * time (for a light sensor) by also updating the (HW)gain. This helper computes
+ * new gain value to maintain the scale with new integration time.
+ *
+ * Return: 0 if an exactly matching supported new gain was found. When a
+ * non-zero value is returned, the @new_gain will be set to a negative or
+ * positive value. The negative value means that no gain could be computed.
+ * Positive value will be the "best possible new gain there could be". There
+ * can be two reasons why finding the "best possible" new gain is not deemed
+ * successful. 1) This new value cannot be supported by the hardware. 2) The new
+ * gain required to maintain the scale would not be an integer. In this case,
+ * the "best possible" new gain will be a floored optimal gain, which may or
+ * may not be supported by the hardware.
+ */
+int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts,
+					       int old_gain, int old_time_sel,
+					       int new_time_sel, int *new_gain)
+{
+	const struct iio_itime_sel_mul *itime_old, *itime_new;
+	u64 scale;
+	int ret;
+
+	*new_gain = -1;
+
+	itime_old = iio_gts_find_itime_by_sel(gts, old_time_sel);
+	if (!itime_old)
+		return -EINVAL;
+
+	itime_new = iio_gts_find_itime_by_sel(gts, new_time_sel);
+	if (!itime_new)
+		return -EINVAL;
+
+	ret = iio_gts_get_scale_linear(gts, old_gain, itime_old->time_us,
+				       &scale);
+	if (ret)
+		return ret;
+
+	ret = gain_get_scale_fraction(gts->max_scale, scale, itime_new->mul,
+				      new_gain);
+	if (ret)
+		return ret;
+
+	if (!iio_gts_valid_gain(gts, *new_gain))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_find_new_gain_sel_by_old_gain_time, IIO_GTS_HELPER);
+
+/**
+ * iio_gts_find_new_gain_by_old_gain_time - compensate for time change
+ * @gts:		Gain time scale descriptor
+ * @old_gain:		Previously set gain
+ * @old_time:		Selector corresponding previously set time
+ * @new_time:		Selector corresponding new time to be set
+ * @new_gain:		Pointer to value where new gain is to be written
+ *
+ * We may want to mitigate the scale change caused by setting a new integration
+ * time (for a light sensor) by also updating the (HW)gain. This helper computes
+ * new gain value to maintain the scale with new integration time.
+ *
+ * Return: 0 if an exactly matching supported new gain was found. When a
+ * non-zero value is returned, the @new_gain will be set to a negative or
+ * positive value. The negative value means that no gain could be computed.
+ * Positive value will be the "best possible new gain there could be". There
+ * can be two reasons why finding the "best possible" new gain is not deemed
+ * successful. 1) This new value cannot be supported by the hardware. 2) The new
+ * gain required to maintain the scale would not be an integer. In this case,
+ * the "best possible" new gain will be a floored optimal gain, which may or
+ * may not be supported by the hardware.
+ */
+int iio_gts_find_new_gain_by_old_gain_time(struct iio_gts *gts, int old_gain,
+					   int old_time, int new_time,
+					   int *new_gain)
+{
+	const struct iio_itime_sel_mul *itime_new;
+	u64 scale;
+	int ret;
+
+	*new_gain = -1;
+
+	itime_new = iio_gts_find_itime_by_time(gts, new_time);
+	if (!itime_new)
+		return -EINVAL;
+
+	ret = iio_gts_get_scale_linear(gts, old_gain, old_time, &scale);
+	if (ret)
+		return ret;
+
+	ret = gain_get_scale_fraction(gts->max_scale, scale, itime_new->mul,
+				      new_gain);
+	if (ret)
+		return ret;
+
+	if (!iio_gts_valid_gain(gts, *new_gain))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iio_gts_find_new_gain_by_old_gain_time, IIO_GTS_HELPER);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Matti Vaittinen <mazziesaccount@gmail.com>");
+MODULE_DESCRIPTION("IIO light sensor gain-time-scale helpers");
diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h
new file mode 100644
index 000000000000..dd64e544a3da
--- /dev/null
+++ b/include/linux/iio/iio-gts-helper.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* gain-time-scale conversion helpers for IIO light sensors
+ *
+ * Copyright (c) 2023 Matti Vaittinen <mazziesaccount@gmail.com>
+ */
+
+#ifndef __IIO_GTS_HELPER__
+#define __IIO_GTS_HELPER__
+
+#include <linux/types.h>
+
+struct device;
+
+/**
+ * struct iio_gain_sel_pair - gain - selector values
+ *
+ * In many cases devices like light sensors allow setting signal amplification
+ * (gain) using a register interface. This structure describes amplification
+ * and corresponding selector (register value)
+ *
+ * @gain:	Gain (multiplication) value. Gain must be positive, negative
+ *		values are reserved for error handling.
+ * @sel:	Selector (usually register value) used to indicate this gain.
+ *		NOTE: Only selectors >= 0 supported.
+ */
+struct iio_gain_sel_pair {
+	int gain;
+	int sel;
+};
+
+/**
+ * struct iio_itime_sel_mul - integration time description
+ *
+ * In many cases devices like light sensors allow setting the duration of
+ * collecting data. Typically this duration has also an impact to the magnitude
+ * of measured values (gain). This structure describes the relation of
+ * integration time and amplification as well as corresponding selector
+ * (register value).
+ *
+ * An example could be a sensor allowing 50, 100, 200 and 400 mS times. The
+ * respective multiplication values could be 50 mS => 1, 100 mS => 2,
+ * 200 mS => 4 and 400 mS => 8 assuming the impact of integration time would be
+ * linear in a way that when collecting data for 50 mS caused value X, doubling
+ * the data collection time caused value 2X etc.
+ *
+ * @time_us:	Integration time in microseconds. Time values must be positive,
+ *		negative values are reserved for error handling.
+ * @sel:	Selector (usually register value) used to indicate this time
+ *		NOTE: Only selectors >= 0 supported.
+ * @mul:	Multiplication to the values caused by this time.
+ *		NOTE: Only multipliers > 0 supported.
+ */
+struct iio_itime_sel_mul {
+	int time_us;
+	int sel;
+	int mul;
+};
+
+struct iio_gts {
+	u64 max_scale;
+	const struct iio_gain_sel_pair *hwgain_table;
+	int num_hwgain;
+	const struct iio_itime_sel_mul *itime_table;
+	int num_itime;
+	int **per_time_avail_scale_tables;
+	int *avail_all_scales_table;
+	int num_avail_all_scales;
+	int *avail_time_tables;
+	int num_avail_time_tables;
+};
+
+#define GAIN_SCALE_GAIN(_gain, _sel)			\
+{							\
+	.gain = (_gain),				\
+	.sel = (_sel),					\
+}
+
+#define GAIN_SCALE_ITIME_US(_itime, _sel, _mul)		\
+{							\
+	.time_us = (_itime),				\
+	.sel = (_sel),					\
+	.mul = (_mul),					\
+}
+
+static inline const struct iio_itime_sel_mul *
+iio_gts_find_itime_by_time(struct iio_gts *gts, int time)
+{
+	int i;
+
+	if (!gts->num_itime)
+		return NULL;
+
+	for (i = 0; i < gts->num_itime; i++)
+		if (gts->itime_table[i].time_us == time)
+			return &gts->itime_table[i];
+
+	return NULL;
+}
+
+static inline const struct iio_itime_sel_mul *
+iio_gts_find_itime_by_sel(struct iio_gts *gts, int sel)
+{
+	int i;
+
+	for (i = 0; i < gts->num_itime; i++)
+		if (gts->itime_table[i].sel == sel)
+			return &gts->itime_table[i];
+
+	return NULL;
+}
+
+int devm_iio_init_iio_gts(struct device *dev, int max_scale_int, int max_scale_nano,
+			  const struct iio_gain_sel_pair *gain_tbl, int num_gain,
+			  const struct iio_itime_sel_mul *tim_tbl, int num_times,
+			  struct iio_gts *gts);
+/**
+ * iio_gts_find_int_time_by_sel - find integration time matching a selector
+ * @gts:	Gain time scale descriptor
+ * @sel:	selector for which matching integration time is searched for
+ *
+ * Return:	integration time matching given selector or -EINVAL if
+ *		integration time was not found.
+ */
+static inline int iio_gts_find_int_time_by_sel(struct iio_gts *gts, int sel)
+{
+	const struct iio_itime_sel_mul *itime;
+
+	itime = iio_gts_find_itime_by_sel(gts, sel);
+	if (!itime)
+		return -EINVAL;
+
+	return itime->time_us;
+}
+
+/**
+ * iio_gts_find_sel_by_int_time - find selector matching integration time
+ * @gts:	Gain time scale descriptor
+ * @gain:	HW-gain for which matching selector is searched for
+ *
+ * Return:	a selector matching given integration time or -EINVAL if
+ *		selector was not found.
+ */
+static inline int iio_gts_find_sel_by_int_time(struct iio_gts *gts, int time)
+{
+	const struct iio_itime_sel_mul *itime;
+
+	itime = iio_gts_find_itime_by_time(gts, time);
+	if (!itime)
+		return -EINVAL;
+
+	return itime->sel;
+}
+
+/**
+ * iio_gts_valid_time - check if given integration time is valid
+ * @gts:	Gain time scale descriptor
+ * @time_us:	Integration time to check
+ *
+ * Return:	True if given time is supported by device. False if not.
+ */
+static inline bool iio_gts_valid_time(struct iio_gts *gts, int time_us)
+{
+	return iio_gts_find_itime_by_time(gts, time_us) != NULL;
+}
+
+int iio_gts_find_sel_by_gain(struct iio_gts *gts, int gain);
+
+/**
+ * iio_gts_valid_gain - check if given HW-gain is valid
+ * @gts:	Gain time scale descriptor
+ * @gain:	HW-gain to check
+ *
+ * Return:	True if given time is supported by device. False if not.
+ */
+static inline bool iio_gts_valid_gain(struct iio_gts *gts, int gain)
+{
+	return iio_gts_find_sel_by_gain(gts, gain) >= 0;
+}
+
+int iio_find_closest_gain_low(struct iio_gts *gts, int gain, bool *in_range);
+int iio_gts_find_gain_by_sel(struct iio_gts *gts, int sel);
+int iio_gts_get_min_gain(struct iio_gts *gts);
+int iio_gts_find_int_time_by_sel(struct iio_gts *gts, int sel);
+int iio_gts_find_sel_by_int_time(struct iio_gts *gts, int time);
+
+int iio_gts_total_gain_to_scale(struct iio_gts *gts, int total_gain,
+				int *scale_int, int *scale_nano);
+int iio_gts_find_gain_sel_for_scale_using_time(struct iio_gts *gts, int time_sel,
+					       int scale_int, int scale_nano,
+					       int *gain_sel);
+int iio_gts_get_scale(struct iio_gts *gts, int gain, int time, int *scale_int,
+		      int *scale_nano);
+int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts,
+					       int old_gain, int old_time_sel,
+					       int new_time_sel, int *new_gain);
+int iio_gts_find_new_gain_by_old_gain_time(struct iio_gts *gts, int old_gain,
+					   int old_time, int new_time,
+					   int *new_gain);
+int iio_gts_avail_times(struct iio_gts *gts,  const int **vals, int *type,
+			int *length);
+int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type,
+			     int *length);
+int iio_gts_avail_scales_for_time(struct iio_gts *gts, int time,
+				  const int **vals, int *type, int *length);
+
+#endif
-- 
cgit v1.2.3


From ae77d1391445f1357d888990c07b5288a4cacac5 Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Thu, 12 Jan 2023 09:04:47 +0000
Subject: media: add Sorenson Spark video format

Sorenson Spark is an implementation of H.263 for use
in Flash Video and Adobe Flash files.
Sorenson Spark is an incomplete implementation of H.263.
It differs mostly in header structure and ranges of the coefficients.

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 5 +++++
 drivers/media/v4l2-core/v4l2-ioctl.c                        | 1 +
 include/uapi/linux/videodev2.h                              | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index 506dd3c98884..a0230f357680 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -88,6 +88,11 @@ Compressed Formats
       - ``V4L2_PIX_FMT_H263``
       - 'H263'
       - H263 video elementary stream.
+    * .. _V4L2-PIX-FMT-SPK:
+
+      - ``V4L2_PIX_FMT_SPK``
+      - 'SPK0'
+      - Sorenson Spark is an implementation of H.263 for use in Flash Video and Adobe Flash files
     * .. _V4L2-PIX-FMT-MPEG1:
 
       - ``V4L2_PIX_FMT_MPEG1``
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index caeed2e62b71..f0bf5d5d7e1c 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1475,6 +1475,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 		case V4L2_PIX_FMT_HEVC_SLICE:	descr = "HEVC Parsed Slice Data"; break;
 		case V4L2_PIX_FMT_FWHT:		descr = "FWHT"; break; /* used in vicodec */
 		case V4L2_PIX_FMT_FWHT_STATELESS:	descr = "FWHT Stateless"; break; /* used in vicodec */
+		case V4L2_PIX_FMT_SPK:		descr = "Sorenson Spark"; break;
 		case V4L2_PIX_FMT_CPIA1:	descr = "GSPCA CPiA YUV"; break;
 		case V4L2_PIX_FMT_WNVA:		descr = "WNVA"; break;
 		case V4L2_PIX_FMT_SN9C10X:	descr = "GSPCA SN9C10X"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index b5b3d1fddea2..f943d58ec9df 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -750,6 +750,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_FWHT_STATELESS     v4l2_fourcc('S', 'F', 'W', 'H') /* Stateless FWHT (vicodec) */
 #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
 #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+#define V4L2_PIX_FMT_SPK      v4l2_fourcc('S', 'P', 'K', '0') /* Sorenson Spark */
 
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_CPIA1    v4l2_fourcc('C', 'P', 'I', 'A') /* cpia1 YUV */
-- 
cgit v1.2.3


From ec9aa62a1e4d151e9f14b7bda0b13438a901f904 Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Tue, 17 Jan 2023 02:31:54 +0000
Subject: media: add RealVideo format RV30 and RV40

RealVideo, or also spelled as Real Video, is a suite of proprietary
video compression formats developed by RealNetworks -
the specific format changes with the version.
RealVideo codecs are identified by four-character codes.
RV30 and RV40 are RealNetworks' proprietary H.264-based codecs.

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-compressed.rst    | 20 ++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c                 |  2 ++
 include/uapi/linux/videodev2.h                       |  2 ++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index a0230f357680..06b78e5589d2 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -237,6 +237,26 @@ Compressed Formats
         Metadata associated with the frame to decode is required to be passed
         through the ``V4L2_CID_STATELESS_FWHT_PARAMS`` control.
 	See the :ref:`associated Codec Control ID <codec-stateless-fwht>`.
+    * .. _V4L2-PIX-FMT-RV30:
+
+      - ``V4L2_PIX_FMT_RV30``
+      - 'RV30'
+      - RealVideo, or also spelled as Real Video, is a suite of
+        proprietary video compression formats developed by
+        RealNetworks - the specific format changes with the version.
+        RealVideo codecs are identified by four-character codes.
+        RV30 corresponds to RealVideo 8, suspected to be based
+        largely on an early draft of H.264
+    * .. _V4L2-PIX-FMT-RV40:
+
+      - ``V4L2_PIX_FMT_RV40``
+      - 'RV40'
+      - RV40 represents RealVideo 9 and RealVideo 10.
+        RealVideo 9, suspected to be based on H.264.
+        RealVideo 10, aka RV9 EHQ, This refers to an improved encoder
+        for the RV9 format that is fully backwards compatible with
+        RV9 players - the format and decoder did not change, only
+        the encoder did. As a result, it uses the same FourCC.
 
 .. raw:: latex
 
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index f0bf5d5d7e1c..592410ad6e8a 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1476,6 +1476,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 		case V4L2_PIX_FMT_FWHT:		descr = "FWHT"; break; /* used in vicodec */
 		case V4L2_PIX_FMT_FWHT_STATELESS:	descr = "FWHT Stateless"; break; /* used in vicodec */
 		case V4L2_PIX_FMT_SPK:		descr = "Sorenson Spark"; break;
+		case V4L2_PIX_FMT_RV30:		descr = "RealVideo 8"; break;
+		case V4L2_PIX_FMT_RV40:		descr = "RealVideo 9 & 10"; break;
 		case V4L2_PIX_FMT_CPIA1:	descr = "GSPCA CPiA YUV"; break;
 		case V4L2_PIX_FMT_WNVA:		descr = "WNVA"; break;
 		case V4L2_PIX_FMT_SN9C10X:	descr = "GSPCA SN9C10X"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index f943d58ec9df..d452481a5316 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -751,6 +751,8 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
 #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
 #define V4L2_PIX_FMT_SPK      v4l2_fourcc('S', 'P', 'K', '0') /* Sorenson Spark */
+#define V4L2_PIX_FMT_RV30     v4l2_fourcc('R', 'V', '3', '0') /* RealVideo 8 */
+#define V4L2_PIX_FMT_RV40     v4l2_fourcc('R', 'V', '4', '0') /* RealVideo 9 & 10 */
 
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_CPIA1    v4l2_fourcc('C', 'P', 'I', 'A') /* cpia1 YUV */
-- 
cgit v1.2.3


From b62e72200eaad523f08d8319bba50fc652e032a8 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 24 Mar 2023 13:18:20 -0700
Subject: Bluetooth: Fix printing errors if LE Connection times out

This fixes errors like bellow when LE Connection times out since that
is actually not a controller error:

 Bluetooth: hci0: Opcode 0x200d failed: -110
 Bluetooth: hci0: request failed to create LE connection: err -110

Instead the code shall properly detect if -ETIMEDOUT is returned and
send HCI_OP_LE_CREATE_CONN_CANCEL to give up on the connection.

Link: https://github.com/bluez/bluez/issues/340
Fixes: 8e8b92ee60de ("Bluetooth: hci_sync: Add hci_le_create_conn_sync")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_conn.c         |  7 +++++--
 net/bluetooth/hci_event.c        | 16 ++++++----------
 net/bluetooth/hci_sync.c         | 13 ++++++++++---
 4 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6ed9b4d546a7..d5311ceb21c6 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -954,6 +954,7 @@ enum {
 	HCI_CONN_STK_ENCRYPT,
 	HCI_CONN_AUTH_INITIATOR,
 	HCI_CONN_DROP,
+	HCI_CONN_CANCEL,
 	HCI_CONN_PARAM_REMOVAL_PEND,
 	HCI_CONN_NEW_LINK_KEY,
 	HCI_CONN_SCANNING,
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 5af3f6b011c9..e4aee5950c36 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1233,6 +1233,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
 {
 	struct hci_conn *conn = data;
 
+	bt_dev_dbg(hdev, "err %d", err);
+
 	hci_dev_lock(hdev);
 
 	if (!err) {
@@ -1240,8 +1242,6 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
 		goto done;
 	}
 
-	bt_dev_err(hdev, "request failed to create LE connection: err %d", err);
-
 	/* Check if connection is still pending */
 	if (conn != hci_lookup_le_connect(hdev))
 		goto done;
@@ -2771,6 +2771,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 {
 	int r = 0;
 
+	if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+		return 0;
+
 	switch (conn->state) {
 	case BT_CONNECTED:
 	case BT_CONFIG:
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ad92a4be5851..e68f2a7d863a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2881,16 +2881,6 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
 
 	conn->resp_addr_type = peer_addr_type;
 	bacpy(&conn->resp_addr, peer_addr);
-
-	/* We don't want the connection attempt to stick around
-	 * indefinitely since LE doesn't have a page timeout concept
-	 * like BR/EDR. Set a timer for any connection that doesn't use
-	 * the accept list for connecting.
-	 */
-	if (filter_policy == HCI_LE_USE_PEER_ADDR)
-		queue_delayed_work(conn->hdev->workqueue,
-				   &conn->le_conn_timeout,
-				   conn->conn_timeout);
 }
 
 static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
@@ -5902,6 +5892,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	if (status)
 		goto unlock;
 
+	/* Drop the connection if it has been aborted */
+	if (test_bit(HCI_CONN_CANCEL, &conn->flags)) {
+		hci_conn_drop(conn);
+		goto unlock;
+	}
+
 	if (conn->dst_type == ADDR_LE_DEV_PUBLIC)
 		addr_type = BDADDR_LE_PUBLIC;
 	else
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5a6aa1627791..632be1267288 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -246,8 +246,9 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
 
 	skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk);
 	if (IS_ERR(skb)) {
-		bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode,
-				PTR_ERR(skb));
+		if (!event)
+			bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode,
+				   PTR_ERR(skb));
 		return PTR_ERR(skb);
 	}
 
@@ -5126,8 +5127,11 @@ static int hci_le_connect_cancel_sync(struct hci_dev *hdev,
 	if (test_bit(HCI_CONN_SCANNING, &conn->flags))
 		return 0;
 
+	if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+		return 0;
+
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL,
-				     6, &conn->dst, HCI_CMD_TIMEOUT);
+				     0, NULL, HCI_CMD_TIMEOUT);
 }
 
 static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn)
@@ -6102,6 +6106,9 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn)
 				       conn->conn_timeout, NULL);
 
 done:
+	if (err == -ETIMEDOUT)
+		hci_le_connect_cancel_sync(hdev, conn);
+
 	/* Re-enable advertising after the connection attempt is finished. */
 	hci_resume_advertising_sync(hdev);
 	return err;
-- 
cgit v1.2.3


From 0c9f4521958ff52b74967c8a39a8b5747ba8df41 Mon Sep 17 00:00:00 2001
From: Weizhao Ouyang <o451686892@gmail.com>
Date: Tue, 4 Apr 2023 11:28:44 +0800
Subject: f2fs: use common implementation of file type

Use common implementation of file type conversion helpers.

Signed-off-by: Weizhao Ouyang <o451686892@gmail.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/dir.c           | 39 +++------------------------------------
 fs/f2fs/f2fs.h          |  1 -
 fs/f2fs/inline.c        |  2 +-
 include/linux/f2fs_fs.h | 15 ---------------
 4 files changed, 4 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 73c338db5808..d6dd8327e82d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -42,39 +42,6 @@ static unsigned int bucket_blocks(unsigned int level)
 		return 4;
 }
 
-static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
-	[F2FS_FT_UNKNOWN]	= DT_UNKNOWN,
-	[F2FS_FT_REG_FILE]	= DT_REG,
-	[F2FS_FT_DIR]		= DT_DIR,
-	[F2FS_FT_CHRDEV]	= DT_CHR,
-	[F2FS_FT_BLKDEV]	= DT_BLK,
-	[F2FS_FT_FIFO]		= DT_FIFO,
-	[F2FS_FT_SOCK]		= DT_SOCK,
-	[F2FS_FT_SYMLINK]	= DT_LNK,
-};
-
-static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= F2FS_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= F2FS_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= F2FS_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= F2FS_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
-};
-
-static void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
-{
-	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
-}
-
-unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de)
-{
-	if (de->file_type < F2FS_FT_MAX)
-		return f2fs_filetype_table[de->file_type];
-	return DT_UNKNOWN;
-}
-
 /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */
 int f2fs_init_casefolded_name(const struct inode *dir,
 			      struct f2fs_filename *fname)
@@ -485,7 +452,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	lock_page(page);
 	f2fs_wait_on_page_writeback(page, type, true, true);
 	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode->i_mode);
+	de->file_type = fs_umode_to_ftype(inode->i_mode);
 	set_page_dirty(page);
 
 	dir->i_mtime = dir->i_ctime = current_time(dir);
@@ -699,7 +666,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
 	de->name_len = cpu_to_le16(name->len);
 	memcpy(d->filename[bit_pos], name->name, name->len);
 	de->ino = cpu_to_le32(ino);
-	set_de_type(de, mode);
+	de->file_type = fs_umode_to_ftype(mode);
 	for (i = 0; i < slots; i++) {
 		__set_bit_le(bit_pos + i, (void *)d->bitmap);
 		/* avoid wrong garbage data for readdir */
@@ -1036,7 +1003,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 			continue;
 		}
 
-		d_type = f2fs_get_de_type(de);
+		d_type = fs_ftype_to_dtype(de->file_type);
 
 		de_name.name = d->filename[bit_pos];
 		de_name.len = le16_to_cpu(de->name_len);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c2516e3273ad..5b9993219087 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3474,7 +3474,6 @@ int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 /*
  * dir.c
  */
-unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de);
 int f2fs_init_casefolded_name(const struct inode *dir,
 			      struct f2fs_filename *fname);
 int f2fs_setup_filename(struct inode *dir, const struct qstr *iname,
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 72269e7efd26..4638fee16a91 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -497,7 +497,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
 		fname.hash = de->hash_code;
 
 		ino = le32_to_cpu(de->ino);
-		fake_mode = f2fs_get_de_type(de) << S_SHIFT;
+		fake_mode = fs_ftype_to_dtype(de->file_type) << S_DT_SHIFT;
 
 		err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode);
 		if (err)
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 881eb9321967..1d6402529d10 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -584,21 +584,6 @@ struct f2fs_dentry_block {
 	__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
 } __packed;
 
-/* file types used in inode_info->flags */
-enum {
-	F2FS_FT_UNKNOWN,
-	F2FS_FT_REG_FILE,
-	F2FS_FT_DIR,
-	F2FS_FT_CHRDEV,
-	F2FS_FT_BLKDEV,
-	F2FS_FT_FIFO,
-	F2FS_FT_SOCK,
-	F2FS_FT_SYMLINK,
-	F2FS_FT_MAX
-};
-
-#define S_SHIFT 12
-
 #define	F2FS_DEF_PROJID		0	/* default project ID */
 
 #endif  /* _LINUX_F2FS_FS_H */
-- 
cgit v1.2.3


From 1dd5474ee6ee1d6ddc95f1423966ab8d4afda448 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:51:58 -0500
Subject: of: Make devtree_lock declaration private

Sparc is the only place devtree_lock is used outside of drivers/of/.
Move the devtree_lock declaration into of_private.h and Sparc's prom.h
so pulling in spinlock.h to of.h can be avoided for everything besides
Sparc.

Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-1-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 arch/sparc/include/asm/prom.h | 3 +++
 drivers/of/of_private.h       | 1 +
 include/linux/of.h            | 2 --
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index 587edb8b5a65..8184575b1336 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -19,11 +19,14 @@
 #include <linux/mutex.h>
 #include <linux/atomic.h>
 #include <linux/irqdomain.h>
+#include <linux/spinlock.h>
 
 #define of_compat_cmp(s1, s2, l)	strncmp((s1), (s2), (l))
 #define of_prop_cmp(s1, s2)		strcasecmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcmp((s1), (s2))
 
+extern raw_spinlock_t devtree_lock;
+
 struct of_irq_controller {
 	unsigned int	(*irq_build)(struct device_node *, unsigned int, void *);
 	void		*data;
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index fb6792d381a6..b57f1014e419 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -38,6 +38,7 @@ struct alias_prop {
 #define OF_ROOT_NODE_SIZE_CELLS_DEFAULT 1
 
 extern struct mutex of_mutex;
+extern raw_spinlock_t devtree_lock;
 extern struct list_head aliases_lookup;
 extern struct kset *of_kset;
 
diff --git a/include/linux/of.h b/include/linux/of.h
index 0af611307db2..36cf94596eba 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -16,7 +16,6 @@
 #include <linux/errno.h>
 #include <linux/kobject.h>
 #include <linux/mod_devicetable.h>
-#include <linux/spinlock.h>
 #include <linux/topology.h>
 #include <linux/notifier.h>
 #include <linux/property.h>
@@ -145,7 +144,6 @@ extern struct device_node *of_root;
 extern struct device_node *of_chosen;
 extern struct device_node *of_aliases;
 extern struct device_node *of_stdout;
-extern raw_spinlock_t devtree_lock;
 
 /*
  * struct device_node flag descriptions
-- 
cgit v1.2.3


From 4c32fb7dcf65b16cc7b2837ccfc35d00be92bddb Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:51:59 -0500
Subject: of: Move of_device_(add|register|unregister) to of_platform.h

As of_device_(add|register|unregister) functions work on struct
platform_device, they should be declared in of_platform.h instead.

This move is transparent for now as both headers include each other.

Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-2-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h   | 4 ----
 include/linux/of_platform.h | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index f4b57614979d..e4aa61cb2bd0 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -26,10 +26,6 @@ static inline int of_driver_match_device(struct device *dev,
 	return of_match_device(drv->of_match_table, dev) != NULL;
 }
 
-extern int of_device_add(struct platform_device *pdev);
-extern int of_device_register(struct platform_device *ofdev);
-extern void of_device_unregister(struct platform_device *ofdev);
-
 extern const void *of_device_get_match_data(const struct device *dev);
 
 extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len);
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index d15b6cd5e1c3..8ac5cb933dc3 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -52,6 +52,11 @@ extern const struct of_device_id of_default_bus_match_table[];
 extern struct platform_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
+
+extern int of_device_add(struct platform_device *pdev);
+extern int of_device_register(struct platform_device *ofdev);
+extern void of_device_unregister(struct platform_device *ofdev);
+
 #ifdef CONFIG_OF
 extern struct platform_device *of_find_device_by_node(struct device_node *np);
 #else
-- 
cgit v1.2.3


From c91c46de6bbc1147ae5dfe046b87f5f3d6593215 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 6 Apr 2023 18:25:33 -0700
Subject: net: provide macros for commonly copied lockless queue stop/wake code

A lot of drivers follow the same scheme to stop / start queues
without introducing locks between xmit and NAPI tx completions.
I'm guessing they all copy'n'paste each other's code.
The original code dates back all the way to e1000 and Linux 2.6.19.

Smaller drivers shy away from the scheme and introduce a lock
which may cause deadlocks in netpoll.

Provide macros which encapsulate the necessary logic.

The macros do not prevent false wake ups, the extra barrier
required to close that race is not worth it. See discussion in:
https://lore.kernel.org/all/c39312a2-4537-14b4-270c-9fe1fbb91e89@gmail.com/

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/driver.rst |   6 ++
 include/linux/netdevice.h           |   1 +
 include/net/netdev_queues.h         | 144 ++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 include/net/netdev_queues.h

(limited to 'include')

diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst
index 19c363291d04..4071f2c00f8b 100644
--- a/Documentation/networking/driver.rst
+++ b/Documentation/networking/driver.rst
@@ -104,6 +104,12 @@ and:
 	    TX_BUFFS_AVAIL(dp) > 0)
 		netif_wake_queue(dp->dev);
 
+Lockless queue stop / wake helper macros
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: include/net/netdev_queues.h
+   :doc: Lockless queue stopping / waking helpers.
+
 No exclusive ownership
 ----------------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c25b39681b3..7bec9a2be8ef 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3335,6 +3335,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 
 static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
+	/* Must be an atomic op see netif_txq_try_stop() */
 	set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
 
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
new file mode 100644
index 000000000000..5236d78bbdeb
--- /dev/null
+++ b/include/net/netdev_queues.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NET_QUEUES_H
+#define _LINUX_NET_QUEUES_H
+
+#include <linux/netdevice.h>
+
+/**
+ * DOC: Lockless queue stopping / waking helpers.
+ *
+ * The netif_txq_maybe_stop() and __netif_txq_completed_wake()
+ * macros are designed to safely implement stopping
+ * and waking netdev queues without full lock protection.
+ *
+ * We assume that there can be no concurrent stop attempts and no concurrent
+ * wake attempts. The try-stop should happen from the xmit handler,
+ * while wake up should be triggered from NAPI poll context.
+ * The two may run concurrently (single producer, single consumer).
+ *
+ * The try-stop side is expected to run from the xmit handler and therefore
+ * it does not reschedule Tx (netif_tx_start_queue() instead of
+ * netif_tx_wake_queue()). Uses of the ``stop`` macros outside of the xmit
+ * handler may lead to xmit queue being enabled but not run.
+ * The waking side does not have similar context restrictions.
+ *
+ * The macros guarantee that rings will not remain stopped if there's
+ * space available, but they do *not* prevent false wake ups when
+ * the ring is full! Drivers should check for ring full at the start
+ * for the xmit handler.
+ *
+ * All descriptor ring indexes (and other relevant shared state) must
+ * be updated before invoking the macros.
+ */
+
+#define netif_txq_try_stop(txq, get_desc, start_thrs)			\
+	({								\
+		int _res;						\
+									\
+		netif_tx_stop_queue(txq);				\
+		/* Producer index and stop bit must be visible		\
+		 * to consumer before we recheck.			\
+		 * Pairs with a barrier in __netif_txq_maybe_wake().	\
+		 */							\
+		smp_mb__after_atomic();					\
+									\
+		/* We need to check again in a case another		\
+		 * CPU has just made room available.			\
+		 */							\
+		_res = 0;						\
+		if (unlikely(get_desc >= start_thrs)) {			\
+			netif_tx_start_queue(txq);			\
+			_res = -1;					\
+		}							\
+		_res;							\
+	})								\
+
+/**
+ * netif_txq_maybe_stop() - locklessly stop a Tx queue, if needed
+ * @txq:	struct netdev_queue to stop/start
+ * @get_desc:	get current number of free descriptors (see requirements below!)
+ * @stop_thrs:	minimal number of available descriptors for queue to be left
+ *		enabled
+ * @start_thrs:	minimal number of descriptors to re-enable the queue, can be
+ *		equal to @stop_thrs or higher to avoid frequent waking
+ *
+ * All arguments may be evaluated multiple times, beware of side effects.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ * Expected to be used from ndo_start_xmit, see the comment on top of the file.
+ *
+ * Returns:
+ *	 0 if the queue was stopped
+ *	 1 if the queue was left enabled
+ *	-1 if the queue was re-enabled (raced with waking)
+ */
+#define netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs)	\
+	({								\
+		int _res;						\
+									\
+		_res = 1;						\
+		if (unlikely(get_desc < stop_thrs))			\
+			_res = netif_txq_try_stop(txq, get_desc, start_thrs); \
+		_res;							\
+	})								\
+
+
+/**
+ * __netif_txq_maybe_wake() - locklessly wake a Tx queue, if needed
+ * @txq:	struct netdev_queue to stop/start
+ * @get_desc:	get current number of free descriptors (see requirements below!)
+ * @start_thrs:	minimal number of descriptors to re-enable the queue
+ * @down_cond:	down condition, predicate indicating that the queue should
+ *		not be woken up even if descriptors are available
+ *
+ * All arguments may be evaluated multiple times.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ *
+ * Returns:
+ *	 0 if the queue was woken up
+ *	 1 if the queue was already enabled (or disabled but @down_cond is true)
+ *	-1 if the queue was left unchanged (@start_thrs not reached)
+ */
+#define __netif_txq_maybe_wake(txq, get_desc, start_thrs, down_cond)	\
+	({								\
+		int _res;						\
+									\
+		_res = -1;						\
+		if (likely(get_desc > start_thrs)) {			\
+			/* Make sure that anybody stopping the queue after \
+			 * this sees the new next_to_clean.		\
+			 */						\
+			smp_mb();					\
+			_res = 1;					\
+			if (unlikely(netif_tx_queue_stopped(txq)) &&	\
+			    !(down_cond)) {				\
+				netif_tx_wake_queue(txq);		\
+				_res = 0;				\
+			}						\
+		}							\
+		_res;							\
+	})
+
+#define netif_txq_maybe_wake(txq, get_desc, start_thrs)		\
+	__netif_txq_maybe_wake(txq, get_desc, start_thrs, false)
+
+/* subqueue variants follow */
+
+#define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs)		\
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_try_stop(txq, get_desc, start_thrs);		\
+	})
+
+#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
+	})
+
+#endif
-- 
cgit v1.2.3


From 301f227fc860624d37ba5dae9da57dcf371268db Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 6 Apr 2023 18:25:36 -0700
Subject: net: piggy back on the memory barrier in bql when waking queues

Drivers call netdev_tx_completed_queue() right before
netif_txq_maybe_wake(). If BQL is enabled netdev_tx_completed_queue()
should issue a memory barrier, so we can depend on that separating
the stop check from the consumer index update, instead of adding
another barrier in netif_txq_maybe_wake().

This matters more than the barriers on the xmit path, because
the wake condition is almost always true. So we issue the
consumer side barrier often.

Wrap netdev_tx_completed_queue() in a local helper to issue
the barrier even if BQL is disabled. Keep the same semantics
as netdev_tx_completed_queue() (barrier only if bytes != 0)
to make it clear that the barrier is conditional.

Plus since macro gets pkt/byte counts as arguments now -
we can skip waking if there were no packets completed.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  6 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 ++++-----
 include/linux/netdevice.h                     |  2 +-
 include/net/netdev_queues.h                   | 39 ++++++++++++++++++++-------
 4 files changed, 38 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index de97bee25249..f7602d8d79e3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -688,11 +688,11 @@ next_tx_int:
 		dev_kfree_skb_any(skb);
 	}
 
-	netdev_tx_completed_queue(txq, nr_pkts, tx_bytes);
 	txr->tx_cons = cons;
 
-	__netif_txq_maybe_wake(txq, bnxt_tx_avail(bp, txr), bp->tx_wake_thresh,
-			       READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING);
+	__netif_txq_completed_wake(txq, nr_pkts, tx_bytes,
+				   bnxt_tx_avail(bp, txr), bp->tx_wake_thresh,
+				   READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING);
 }
 
 static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cbbddee55db1..f2604fc05991 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1251,15 +1251,13 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 	if (ring_is_xdp(tx_ring))
 		return !!budget;
 
-	netdev_tx_completed_queue(txring_txq(tx_ring),
-				  total_packets, total_bytes);
-
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
 	txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
-	if (total_packets && netif_carrier_ok(tx_ring->netdev) &&
-	    !__netif_txq_maybe_wake(txq, ixgbe_desc_unused(tx_ring),
-				    TX_WAKE_THRESHOLD,
-				    test_bit(__IXGBE_DOWN, &adapter->state)))
+	if (!__netif_txq_completed_wake(txq, total_packets, total_bytes,
+					ixgbe_desc_unused(tx_ring),
+					TX_WAKE_THRESHOLD,
+					netif_carrier_ok(tx_ring->netdev) &&
+					test_bit(__IXGBE_DOWN, &adapter->state)))
 		++tx_ring->tx_stats.restart_queue;
 
 	return !!budget;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7bec9a2be8ef..fe355592dfde 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3532,7 +3532,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
 	 * netdev_tx_sent_queue will miss the update and cause the queue to
 	 * be stopped forever
 	 */
-	smp_mb();
+	smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */
 
 	if (unlikely(dql_avail(&dev_queue->dql) < 0))
 		return;
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 5236d78bbdeb..b26fdb441e39 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -38,7 +38,7 @@
 		netif_tx_stop_queue(txq);				\
 		/* Producer index and stop bit must be visible		\
 		 * to consumer before we recheck.			\
-		 * Pairs with a barrier in __netif_txq_maybe_wake().	\
+		 * Pairs with a barrier in __netif_txq_completed_wake(). \
 		 */							\
 		smp_mb__after_atomic();					\
 									\
@@ -82,10 +82,24 @@
 		_res;							\
 	})								\
 
+/* Variant of netdev_tx_completed_queue() which guarantees smp_mb() if
+ * @bytes != 0, regardless of kernel config.
+ */
+static inline void
+netdev_txq_completed_mb(struct netdev_queue *dev_queue,
+			unsigned int pkts, unsigned int bytes)
+{
+	if (IS_ENABLED(CONFIG_BQL))
+		netdev_tx_completed_queue(dev_queue, pkts, bytes);
+	else if (bytes)
+		smp_mb();
+}
 
 /**
- * __netif_txq_maybe_wake() - locklessly wake a Tx queue, if needed
+ * __netif_txq_completed_wake() - locklessly wake a Tx queue, if needed
  * @txq:	struct netdev_queue to stop/start
+ * @pkts:	number of packets completed
+ * @bytes:	number of bytes completed
  * @get_desc:	get current number of free descriptors (see requirements below!)
  * @start_thrs:	minimal number of descriptors to re-enable the queue
  * @down_cond:	down condition, predicate indicating that the queue should
@@ -94,22 +108,27 @@
  * All arguments may be evaluated multiple times.
  * @get_desc must be a formula or a function call, it must always
  * return up-to-date information when evaluated!
+ * Reports completed pkts/bytes to BQL.
  *
  * Returns:
  *	 0 if the queue was woken up
  *	 1 if the queue was already enabled (or disabled but @down_cond is true)
  *	-1 if the queue was left unchanged (@start_thrs not reached)
  */
-#define __netif_txq_maybe_wake(txq, get_desc, start_thrs, down_cond)	\
+#define __netif_txq_completed_wake(txq, pkts, bytes,			\
+				   get_desc, start_thrs, down_cond)	\
 	({								\
 		int _res;						\
 									\
+		/* Report to BQL and piggy back on its barrier.		\
+		 * Barrier makes sure that anybody stopping the queue	\
+		 * after this point sees the new consumer index.	\
+		 * Pairs with barrier in netif_txq_try_stop().		\
+		 */							\
+		netdev_txq_completed_mb(txq, pkts, bytes);		\
+									\
 		_res = -1;						\
-		if (likely(get_desc > start_thrs)) {			\
-			/* Make sure that anybody stopping the queue after \
-			 * this sees the new next_to_clean.		\
-			 */						\
-			smp_mb();					\
+		if (pkts && likely(get_desc > start_thrs)) {		\
 			_res = 1;					\
 			if (unlikely(netif_tx_queue_stopped(txq)) &&	\
 			    !(down_cond)) {				\
@@ -120,8 +139,8 @@
 		_res;							\
 	})
 
-#define netif_txq_maybe_wake(txq, get_desc, start_thrs)		\
-	__netif_txq_maybe_wake(txq, get_desc, start_thrs, false)
+#define netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs) \
+	__netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs, false)
 
 /* subqueue variants follow */
 
-- 
cgit v1.2.3


From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:47 -0700
Subject: bpf: Split off basic BPF verifier log into separate file

kernel/bpf/verifier.c file is large and growing larger all the time. So
it's good to start splitting off more or less self-contained parts into
separate files to keep source code size (somewhat) somewhat under
control.

This patch is a one step in this direction, moving some of BPF verifier log
routines into a separate kernel/bpf/log.c. Right now it's most low-level
and isolated routines to append data to log, reset log to previous
position, etc. Eventually we could probably move verifier state
printing logic here as well, but this patch doesn't attempt to do that
yet.

Subsequent patches will add more logic to verifier log management, so
having basics in a separate file will make sure verifier.c doesn't grow
more with new changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 19 ++++------
 kernel/bpf/Makefile          |  3 +-
 kernel/bpf/log.c             | 85 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 69 -----------------------------------
 4 files changed, 94 insertions(+), 82 deletions(-)
 create mode 100644 kernel/bpf/log.c

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 81d525d057c7..83dff25545ee 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -498,11 +498,6 @@ struct bpf_verifier_log {
 	u32 len_total;
 };
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	return log->len_used >= log->len_total - 1;
-}
-
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
@@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log &&
@@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 		 log->level == BPF_LOG_KERNEL);
 }
 
-static inline bool
-bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
-{
-	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
-	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
-}
-
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
@@ -608,12 +601,14 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..920061e38d2e
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
+	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+		       va_list args)
+{
+	unsigned int n;
+
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	if (log->level == BPF_LOG_KERNEL) {
+		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+		return;
+	}
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+	char zero = 0;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	log->len_used = new_pos;
+	if (put_user(zero, log->ubuf + new_pos))
+		log->ubuf = NULL;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+					   const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(&env->log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+			    const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3660b573048a..745ae0cd01d4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -335,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
 	return &linfo[i - 1];
 }
 
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
-		       va_list args)
-{
-	unsigned int n;
-
-	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
-	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
-		  "verifier log line truncated - local buffer too short\n");
-
-	if (log->level == BPF_LOG_KERNEL) {
-		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
-		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
-		return;
-	}
-
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
-	char zero = 0;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
-		log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
-					   const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(&env->log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(&env->log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
 __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 {
 	struct bpf_verifier_env *env = private_data;
@@ -403,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
-			    const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
 static const char *ltrim(const char *s)
 {
 	while (isspace(*s))
-- 
cgit v1.2.3


From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:49 -0700
Subject: bpf: Switch BPF verifier log to be a rotating log by default

Currently, if user-supplied log buffer to collect BPF verifier log turns
out to be too small to contain full log, bpf() syscall returns -ENOSPC,
fails BPF program verification/load, and preserves first N-1 bytes of
the verifier log (where N is the size of user-supplied buffer).

This is problematic in a bunch of common scenarios, especially when
working with real-world BPF programs that tend to be pretty complex as
far as verification goes and require big log buffers. Typically, it's
when debugging tricky cases at log level 2 (verbose). Also, when BPF program
is successfully validated, log level 2 is the only way to actually see
verifier state progression and all the important details.

Even with log level 1, it's possible to get -ENOSPC even if the final
verifier log fits in log buffer, if there is a code path that's deep
enough to fill up entire log, even if normally it would be reset later
on (there is a logic to chop off successfully validated portions of BPF
verifier log).

In short, it's not always possible to pre-size log buffer. Also, what's
worse, in practice, the end of the log most often is way more important
than the beginning, but verifier stops emitting log as soon as initial
log buffer is filled up.

This patch switches BPF verifier log behavior to effectively behave as
rotating log. That is, if user-supplied log buffer turns out to be too
short, verifier will keep overwriting previously written log,
effectively treating user's log buffer as a ring buffer. -ENOSPC is
still going to be returned at the end, to notify user that log contents
was truncated, but the important last N bytes of the log would be
returned, which might be all that user really needs. This consistent
-ENOSPC behavior, regardless of rotating or fixed log behavior, allows
to prevent backwards compatibility breakage. The only user-visible
change is which portion of verifier log user ends up seeing *if buffer
is too small*. Given contents of verifier log itself is not an ABI,
there is no breakage due to this behavior change. Specialized tools that
rely on specific contents of verifier log in -ENOSPC scenario are
expected to be easily adapted to accommodate old and new behaviors.

Importantly, though, to preserve good user experience and not require
every user-space application to adopt to this new behavior, before
exiting to user-space verifier will rotate log (in place) to make it
start at the very beginning of user buffer as a continuous
zero-terminated string. The contents will be a chopped off N-1 last
bytes of full verifier log, of course.

Given beginning of log is sometimes important as well, we add
BPF_LOG_FIXED (which equals 8) flag to force old behavior, which allows
tools like veristat to request first part of verifier log, if necessary.
BPF_LOG_FIXED flag is also a simple and straightforward way to check if
BPF verifier supports rotating behavior.

On the implementation side, conceptually, it's all simple. We maintain
64-bit logical start and end positions. If we need to truncate the log,
start position will be adjusted accordingly to lag end position by
N bytes. We then use those logical positions to calculate their matching
actual positions in user buffer and handle wrap around the end of the
buffer properly. Finally, right before returning from bpf_check(), we
rotate user log buffer contents in-place as necessary, to make log
contents contiguous. See comments in relevant functions for details.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org
---
 include/linux/bpf_verifier.h                       |  33 +++-
 kernel/bpf/btf.c                                   |   3 +-
 kernel/bpf/log.c                                   | 198 ++++++++++++++++++++-
 kernel/bpf/verifier.c                              |  19 +-
 tools/testing/selftests/bpf/prog_tests/log_fixup.c |   1 +
 5 files changed, 228 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 83dff25545ee..4c926227f612 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -491,25 +491,42 @@ struct bpf_insn_aux_data {
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
 struct bpf_verifier_log {
-	u32 level;
-	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
+	/* Logical start and end positions of a "log window" of the verifier log.
+	 * start_pos == 0 means we haven't truncated anything.
+	 * Once truncation starts to happen, start_pos + len_total == end_pos,
+	 * except during log reset situations, in which (end_pos - start_pos)
+	 * might get smaller than len_total (see bpf_vlog_reset()).
+	 * Generally, (end_pos - start_pos) gives number of useful data in
+	 * user log buffer.
+	 */
+	u64 start_pos;
+	u64 end_pos;
 	char __user *ubuf;
-	u32 len_used;
+	u32 level;
 	u32 len_total;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
+#define BPF_LOG_FIXED	8
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
-#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
+#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED)
 #define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
+{
+	return log->end_pos - log->start_pos;
+}
+
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
-	return log->len_used >= log->len_total - 1;
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
@@ -596,7 +613,7 @@ struct bpf_verifier_env {
 	u32 scratched_regs;
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
-	u32 prev_log_len, prev_insn_print_len;
+	u64 prev_log_pos, prev_insn_print_pos;
 	/* buffer used in reg_type_str() to generate reg_type string */
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
@@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
+void bpf_vlog_finalize(struct bpf_verifier_log *log);
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 593c45a294d0..20a05b8932db 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5593,7 +5593,8 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		}
 	}
 
-	if (log->level && bpf_verifier_log_full(log)) {
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
 	}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 1974891fc324..92b1c8ad6601 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
+#include <linux/math64.h>
 
 bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
@@ -32,23 +33,202 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
+	if (log->level & BPF_LOG_FIXED) {
+		n = min(log->len_total - bpf_log_used(log) - 1, n);
+		log->kbuf[n] = '\0';
+		n += 1;
+
+		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
+			goto fail;
+
+		log->end_pos += n - 1; /* don't count terminating '\0' */
+	} else {
+		u64 new_end, new_start, cur_pos;
+		u32 buf_start, buf_end, new_n;
+
+		n += 1;
+
+		new_end = log->end_pos + n;
+		if (new_end - log->start_pos >= log->len_total)
+			new_start = new_end - log->len_total;
+		else
+			new_start = log->start_pos;
+		new_n = min(n, log->len_total);
+		cur_pos = new_end - new_n;
+
+		div_u64_rem(cur_pos, log->len_total, &buf_start);
+		div_u64_rem(new_end, log->len_total, &buf_end);
+		/* new_end and buf_end are exclusive indices, so if buf_end is
+		 * exactly zero, then it actually points right to the end of
+		 * ubuf and there is no wrap around
+		 */
+		if (buf_end == 0)
+			buf_end = log->len_total;
+
+		/* if buf_start > buf_end, we wrapped around;
+		 * if buf_start == buf_end, then we fill ubuf completely; we
+		 * can't have buf_start == buf_end to mean that there is
+		 * nothing to write, because we always write at least
+		 * something, even if terminal '\0'
+		 */
+		if (buf_start < buf_end) {
+			/* message fits within contiguous chunk of ubuf */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 buf_end - buf_start))
+				goto fail;
+		} else {
+			/* message wraps around the end of ubuf, copy in two chunks */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 log->len_total - buf_start))
+				goto fail;
+			if (copy_to_user(log->ubuf,
+					 log->kbuf + n - buf_end,
+					 buf_end))
+				goto fail;
+		}
+
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+	}
+
+	return;
+fail:
+	log->ubuf = NULL;
 }
 
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 {
 	char zero = 0;
+	u32 pos;
+
+	if (WARN_ON_ONCE(new_pos > log->end_pos))
+		return;
 
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
+	/* if position to which we reset is beyond current log window,
+	 * then we didn't preserve any useful content and should adjust
+	 * start_pos to end up with an empty log (start_pos == end_pos)
+	 */
+	log->end_pos = new_pos;
+	if (log->end_pos < log->start_pos)
+		log->start_pos = log->end_pos;
+	div_u64_rem(new_pos, log->len_total, &pos);
+	if (put_user(zero, log->ubuf + pos))
+		log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+	int i, j;
+
+	for (i = 0, j = len - 1; i < j; i++, j--)
+		swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+	/* we split log->kbuf into two equal parts for both ends of array */
+	int n = sizeof(log->kbuf) / 2, nn;
+	char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+	/* Read ubuf's section [start, end) two chunks at a time, from left
+	 * and right side; within each chunk, swap all the bytes; after that
+	 * reverse the order of lbuf and rbuf and write result back to ubuf.
+	 * This way we'll end up with swapped contents of specified
+	 * [start, end) ubuf segment.
+	 */
+	while (end - start > 1) {
+		nn = min(n, (end - start ) / 2);
+
+		if (copy_from_user(lbuf, log->ubuf + start, nn))
+			return -EFAULT;
+		if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+			return -EFAULT;
+
+		bpf_vlog_reverse_kbuf(lbuf, nn);
+		bpf_vlog_reverse_kbuf(rbuf, nn);
+
+		/* we write lbuf to the right end of ubuf, while rbuf to the
+		 * left one to end up with properly reversed overall ubuf
+		 */
+		if (copy_to_user(log->ubuf + start, rbuf, nn))
+			return -EFAULT;
+		if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+			return -EFAULT;
+
+		start += nn;
+		end -= nn;
+	}
+
+	return 0;
+}
+
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+{
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	else
+		return log->start_pos > 0;
+}
+
+void bpf_vlog_finalize(struct bpf_verifier_log *log)
+{
+	u32 sublen;
+	int err;
+
+	if (!log || !log->level || !log->ubuf)
+		return;
+	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
+		return;
+
+	/* If we never truncated log, there is nothing to move around. */
+	if (log->start_pos == 0)
+		return;
+
+	/* Otherwise we need to rotate log contents to make it start from the
+	 * buffer beginning and be a continuous zero-terminated string. Note
+	 * that if log->start_pos != 0 then we definitely filled up entire log
+	 * buffer with no gaps, and we just need to shift buffer contents to
+	 * the left by (log->start_pos % log->len_total) bytes.
+	 *
+	 * Unfortunately, user buffer could be huge and we don't want to
+	 * allocate temporary kernel memory of the same size just to shift
+	 * contents in a straightforward fashion. Instead, we'll be clever and
+	 * do in-place array rotation. This is a leetcode-style problem, which
+	 * could be solved by three rotations.
+	 *
+	 * Let's say we have log buffer that has to be shifted left by 7 bytes
+	 * (spaces and vertical bar is just for demonstrative purposes):
+	 *   E F G H I J K | A B C D
+	 *
+	 * First, we reverse entire array:
+	 *   D C B A | K J I H G F E
+	 *
+	 * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+	 * (KJIHGFE), resulting in a properly rotated array:
+	 *   A B C D | E F G H I J K
+	 *
+	 * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+	 * bytes, and write them back. Doing it byte-by-byte would be
+	 * unnecessarily inefficient. Altogether we are going to read and
+	 * write each byte twice, for total 4 memory copies between kernel and
+	 * user space.
+	 */
+
+	/* length of the chopped off part that will be the beginning;
+	 * len(ABCD) in the example above
+	 */
+	div_u64_rem(log->start_pos, log->len_total, &sublen);
+	sublen = log->len_total - sublen;
+
+	err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+	err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+	if (err)
 		log->ubuf = NULL;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 745ae0cd01d4..a476bb319685 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1439,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos)
 static void print_insn_state(struct bpf_verifier_env *env,
 			     const struct bpf_func_state *state)
 {
-	if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
 		/* remove new line character */
-		bpf_vlog_reset(&env->log, env->prev_log_len - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
 	} else {
 		verbose(env, "%d:", env->insn_idx);
 	}
@@ -1750,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	err = copy_verifier_state(&elem->st, cur);
@@ -2286,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -15638,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env)
 				print_insn_state(env, state->frame[state->curframe]);
 
 			verbose_linfo(env, env->insn_idx, "; ");
-			env->prev_log_len = env->log.len_used;
+			env->prev_log_pos = env->log.end_pos;
 			verbose(env, "%d: ", env->insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
-			env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
-			env->prev_log_len = env->log.len_used;
+			env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+			env->prev_log_pos = env->log.end_pos;
 		}
 
 		if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -18860,7 +18860,8 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	if (log->level && bpf_verifier_log_full(log))
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index 239e1c5753b0..bc27170bdeb0 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -24,6 +24,7 @@ static void bad_core_relo(size_t log_buf_size, enum trunc_type trunc_type)
 	bpf_program__set_autoload(skel->progs.bad_relo, true);
 	memset(log_buf, 0, sizeof(log_buf));
 	bpf_program__set_log_buf(skel->progs.bad_relo, log_buf, log_buf_size ?: sizeof(log_buf));
+	bpf_program__set_log_level(skel->progs.bad_relo, 1 | 8); /* BPF_LOG_FIXED to force truncation */
 
 	err = test_log_fixup__load(skel);
 	if (!ASSERT_ERR(err, "load_fail"))
-- 
cgit v1.2.3


From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:57 -0700
Subject: bpf: Keep track of total log content size in both fixed and rolling
 modes

Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos
as *logical* log position. This means that we can go beyond physical log
buffer size now and be able to tell what log buffer size should be to
fit entire log contents without -ENOSPC.

To do this for BPF_LOG_FIXED mode, we need to remove a short-circuiting
logic of not vsnprintf()'ing further log content once we filled up
user-provided buffer, which is done by bpf_verifier_log_needed() checks.
We modify these checks to always keep going if log->level is non-zero
(i.e., log is requested), even if log->ubuf was NULL'ed out due to
copying data to user-space, or if entire log buffer is physically full.
We adopt bpf_verifier_vlog() routine to work correctly with
log->ubuf == NULL condition, performing log formatting into temporary
kernel buffer, doing all the necessary accounting, but just avoiding
copying data out if buffer is full or NULL'ed out.

With these changes, it's now possible to do this sort of determination of
log contents size in both BPF_LOG_FIXED and default rolling log mode.
We need to keep in mind bpf_vlog_reset(), though, which shrinks log
contents after successful verification of a particular code path. This
log reset means that log->end_pos isn't always increasing, so to return
back to users what should be the log buffer size to fit all log content
without causing -ENOSPC even in the presence of log resetting, we need
to keep maximum over "lifetime" of logging. We do this accounting in
bpf_vlog_update_len_max() helper.

A related and subtle aspect is that with this logical log->end_pos even in
BPF_LOG_FIXED mode we could temporary "overflow" buffer, but then reset
it back with bpf_vlog_reset() to a position inside user-supplied
log_buf. In such situation we still want to properly maintain
terminating zero. We will eventually return -ENOSPC even if final log
buffer is small (we detect this through log->len_max check). This
behavior is simpler to reason about and is consistent with current
behavior of verifier log. Handling of this required a small addition to
bpf_vlog_reset() logic to avoid doing put_user() beyond physical log
buffer dimensions.

Another issue to keep in mind is that we limit log buffer size to 32-bit
value and keep such log length as u32, but theoretically verifier could
produce huge log stretching beyond 4GB. Instead of keeping (and later
returning) 64-bit log length, we cap it at UINT_MAX. Current UAPI makes
it impossible to specify log buffer size bigger than 4GB anyways, so we
don't really loose anything here and keep everything consistently 32-bit
in UAPI. This property will be utilized in next patch.

Doing the same determination of maximum log buffer for rolling mode is
trivial, as log->end_pos and log->start_pos are already logical
positions, so there is nothing new there.

These changes do incidentally fix one small issue with previous logging
logic. Previously, if use provided log buffer of size N, and actual log
output was exactly N-1 bytes + terminating \0, kernel logic coun't
distinguish this condition from log truncation scenario which would end
up with truncated log contents of N-1 bytes + terminating \0 as well.

But now with log->end_pos being logical position that could go beyond
actual log buffer size, we can distinguish these two conditions, which
we do in this patch. This plays nicely with returning log_size_actual
(implemented in UAPI in the next patch), as we can now guarantee that if
user takes such log_size_actual and provides log buffer of that exact
size, they will not get -ENOSPC in return.

All in all, all these changes do conceptually unify fixed and rolling
log modes much better, and allow a nice feature requested by users:
knowing what should be the size of the buffer to avoid -ENOSPC.

We'll plumb this through the UAPI and the code in the next patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 12 ++------
 kernel/bpf/log.c             | 67 ++++++++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4c926227f612..98d2eb382dbb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -504,6 +504,7 @@ struct bpf_verifier_log {
 	char __user *ubuf;
 	u32 level;
 	u32 len_total;
+	u32 len_max;
 	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
@@ -517,23 +518,16 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
-{
-	return log->end_pos - log->start_pos;
-}
-
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
 	if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
+		return log->end_pos >= log->len_total;
 	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
-	return log &&
-		((log->level && log->ubuf && !bpf_verifier_log_full(log)) ||
-		 log->level == BPF_LOG_KERNEL);
+	return log && log->level;
 }
 
 #define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index c778f3b290cb..47bea2fad6fe 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -16,10 +16,26 @@ bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+	/* add_len includes terminal \0, so no need for +1. */
+	u64 len = log->end_pos + add_len;
+
+	/* log->len_max could be larger than our current len due to
+	 * bpf_vlog_reset() calls, so we maintain the max of any length at any
+	 * previous point
+	 */
+	if (len > UINT_MAX)
+		log->len_max = UINT_MAX;
+	else if (len > log->len_max)
+		log->len_max = len;
+}
+
 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		       va_list args)
 {
-	unsigned int n;
+	u64 cur_pos;
+	u32 new_n, n;
 
 	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 
@@ -33,21 +49,27 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	if (log->level & BPF_LOG_FIXED) {
-		n = min(log->len_total - bpf_log_used(log) - 1, n);
-		log->kbuf[n] = '\0';
-		n += 1;
+	n += 1; /* include terminating zero */
+	bpf_vlog_update_len_max(log, n);
 
-		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
-			goto fail;
+	if (log->level & BPF_LOG_FIXED) {
+		/* check if we have at least something to put into user buf */
+		new_n = 0;
+		if (log->end_pos < log->len_total) {
+			new_n = min_t(u32, log->len_total - log->end_pos, n);
+			log->kbuf[new_n - 1] = '\0';
+		}
 
+		cur_pos = log->end_pos;
 		log->end_pos += n - 1; /* don't count terminating '\0' */
+
+		if (log->ubuf && new_n &&
+		    copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+			goto fail;
 	} else {
-		u64 new_end, new_start, cur_pos;
+		u64 new_end, new_start;
 		u32 buf_start, buf_end, new_n;
 
-		n += 1;
-
 		new_end = log->end_pos + n;
 		if (new_end - log->start_pos >= log->len_total)
 			new_start = new_end - log->len_total;
@@ -65,6 +87,12 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		if (buf_end == 0)
 			buf_end = log->len_total;
 
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+		if (!log->ubuf)
+			return;
+
 		/* if buf_start > buf_end, we wrapped around;
 		 * if buf_start == buf_end, then we fill ubuf completely; we
 		 * can't have buf_start == buf_end to mean that there is
@@ -88,9 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 					 buf_end))
 				goto fail;
 		}
-
-		log->start_pos = new_start;
-		log->end_pos = new_end - 1; /* don't count terminating '\0' */
 	}
 
 	return;
@@ -116,8 +141,13 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 	log->end_pos = new_pos;
 	if (log->end_pos < log->start_pos)
 		log->start_pos = log->end_pos;
-	div_u64_rem(new_pos, log->len_total, &pos);
-	if (put_user(zero, log->ubuf + pos))
+
+	if (log->level & BPF_LOG_FIXED)
+		pos = log->end_pos + 1;
+	else
+		div_u64_rem(new_pos, log->len_total, &pos);
+
+	if (log->ubuf && pos < log->len_total && put_user(zero, log->ubuf + pos))
 		log->ubuf = NULL;
 }
 
@@ -169,12 +199,7 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 
 bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
-	if (!log->level)
-		return false;
-	else if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
-	else
-		return log->start_pos > 0;
+	return log->len_max > log->len_total;
 }
 
 void bpf_vlog_finalize(struct bpf_verifier_log *log)
-- 
cgit v1.2.3


From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:58 -0700
Subject: bpf: Add log_true_size output field to return necessary log buffer
 size

Add output-only log_true_size and btf_log_true_size field to
BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return
the size of log buffer necessary to fit in all the log contents at
specified log_level. This is very useful for BPF loader libraries like
libbpf to be able to size log buffer correctly, but could be used by
users directly, if necessary, as well.

This patch plumbs all this through the code, taking into account actual
bpf_attr size provided by user to determine if these new fields are
expected by users. And if they are, set them from kernel on return.

We refactory btf_parse() function to accommodate this, moving attr and
uattr handling inside it. The rest is very straightforward code, which
is split from the logging accounting changes in the previous patch to
make it simpler to review logic vs UAPI changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org
---
 include/linux/bpf.h            |  2 +-
 include/linux/btf.h            |  2 +-
 include/uapi/linux/bpf.h       | 10 ++++++++++
 kernel/bpf/btf.c               | 32 ++++++++++++++++++--------------
 kernel/bpf/syscall.c           | 16 ++++++++--------
 kernel/bpf/verifier.c          |  8 +++++++-
 tools/include/uapi/linux/bpf.h | 12 +++++++++++-
 7 files changed, 56 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 002a811b6b90..2c6095bd7d69 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index d53b10cc55f2..495250162422 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -125,7 +125,7 @@ extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr);
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
 struct btf *btf_get_by_fd(int fd);
 int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e3d3b5160d26..3823100b7934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5aa540ee611f..0748cf4b8ab6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,9 +5504,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
-			     u32 log_level, char __user *log_ubuf, u32 log_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
+	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
@@ -5514,7 +5515,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	u8 *data;
 	int err;
 
-	if (btf_data_size > BTF_MAX_SIZE)
+	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -5522,13 +5523,13 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		return ERR_PTR(-ENOMEM);
 
 	log = &env->log;
-	if (log_level || log_ubuf || log_size) {
+	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log->level = log_level;
+		log->level = attr->btf_log_level;
 		log->ubuf = log_ubuf;
-		log->len_total = log_size;
+		log->len_total = attr->btf_log_size;
 
 		/* log attributes have to be sane */
 		if (!bpf_verifier_log_attr_valid(log)) {
@@ -5544,16 +5545,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 	env->btf = btf;
 
-	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
 		err = -ENOMEM;
 		goto errout;
 	}
 
 	btf->data = data;
-	btf->data_size = btf_data_size;
+	btf->data_size = attr->btf_size;
 
-	if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+	if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
@@ -5594,6 +5595,12 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		err = -EFAULT;
+		goto errout_meta;
+	}
 	if (bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
@@ -7218,15 +7225,12 @@ static int __btf_new_fd(struct btf *btf)
 	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
 }
 
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	struct btf *btf;
 	int ret;
 
-	btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
-			attr->btf_size, attr->btf_log_level,
-			u64_to_user_ptr(attr->btf_log_buf),
-			attr->btf_log_size);
+	btf = btf_parse(attr, uattr, uattr_size);
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e18ac7fdc210..6d575505f89c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2501,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
 
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
@@ -2653,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr, uattr);
+	err = bpf_check(&prog, attr, uattr, uattr_size);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -4371,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
 
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
@@ -4381,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
 	if (!bpf_capable())
 		return -EPERM;
 
-	return btf_new_fd(attr, uattr);
+	return btf_new_fd(attr, uattr, uattr_size);
 }
 
 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -5059,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = map_freeze(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr, uattr);
+		err = bpf_prog_load(&attr, uattr, size);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
@@ -5104,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
 	case BPF_BTF_LOAD:
-		err = bpf_btf_load(&attr, uattr);
+		err = bpf_btf_load(&attr, uattr, size);
 		break;
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a98cbc046d1e..308e7abeb979 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18694,7 +18694,7 @@ struct btf *bpf_get_btf_vmlinux(void)
 	return btf_vmlinux;
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
@@ -18861,6 +18861,12 @@ skip_full_check:
 	env->prog->aux->verified_insns = env->insn_processed;
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		ret = -EFAULT;
+		goto err_release_maps;
+	}
 	if (bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d6c5a022ae28..3823100b7934 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
@@ -1513,7 +1523,7 @@ union bpf_attr {
 	struct { /* struct used by BPF_LINK_CREATE command */
 		union {
 			__u32		prog_fd;	/* eBPF program to attach */
-			__u32		map_fd;		/* eBPF struct_ops to attach */
+			__u32		map_fd;		/* struct_ops to attach */
 		};
 		union {
 			__u32		target_fd;	/* object to attach to */
-- 
cgit v1.2.3


From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:59 -0700
Subject: bpf: Simplify internal verifier log interface

Simplify internal verifier log API down to bpf_vlog_init() and
bpf_vlog_finalize(). The former handles input arguments validation in
one place and makes it easier to change it. The latter subsumes -ENOSPC
(truncation) and -EFAULT handling and simplifies both caller's code
(bpf_check() and btf_parse()).

For btf_parse(), this patch also makes sure that verifier log
finalization happens even if there is some error condition during BTF
verification process prior to normal finalization step.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 13 ++-------
 kernel/bpf/btf.c             | 65 ++++++++++++++++++++++----------------------
 kernel/bpf/log.c             | 48 ++++++++++++++++++++++++++------
 kernel/bpf/verifier.c        | 39 +++++++++++---------------
 4 files changed, 90 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 98d2eb382dbb..f03852b89d28 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -518,13 +518,6 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	if (log->level & BPF_LOG_FIXED)
-		return log->end_pos >= log->len_total;
-	return false;
-}
-
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log && log->level;
@@ -612,16 +605,16 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size);
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
-void bpf_vlog_finalize(struct bpf_verifier_log *log);
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0748cf4b8ab6..ffc31a1c84af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,16 +5504,30 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+	u32 log_true_size;
+	int err;
+
+	err = bpf_vlog_finalize(log, &log_true_size);
+
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log_true_size, sizeof(log_true_size)))
+		err = -EFAULT;
+
+	return err;
+}
+
 static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
 	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
-	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
 	u8 *data;
-	int err;
+	int err, ret;
 
 	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
@@ -5522,21 +5536,13 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (!env)
 		return ERR_PTR(-ENOMEM);
 
-	log = &env->log;
-	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->btf_log_level;
-		log->ubuf = log_ubuf;
-		log->len_total = attr->btf_log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			err = -EINVAL;
-			goto errout;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	err = bpf_vlog_init(&env->log, attr->btf_log_level,
+			    log_ubuf, attr->btf_log_size);
+	if (err)
+		goto errout_free;
 
 	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
 	if (!btf) {
@@ -5577,7 +5583,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (err)
 		goto errout;
 
-	struct_meta_tab = btf_parse_struct_metas(log, btf);
+	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
 	if (IS_ERR(struct_meta_tab)) {
 		err = PTR_ERR(struct_meta_tab);
 		goto errout;
@@ -5594,21 +5600,9 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		}
 	}
 
-	bpf_vlog_finalize(log);
-	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
-	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
-	if (bpf_vlog_truncated(log)) {
-		err = -ENOSPC;
-		goto errout_meta;
-	}
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
+	err = finalize_log(&env->log, uattr, uattr_size);
+	if (err)
+		goto errout_free;
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
@@ -5617,6 +5611,11 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 errout_meta:
 	btf_free_struct_meta_tab(btf);
 errout:
+	/* overwrite err with -ENOSPC or -EFAULT */
+	ret = finalize_log(&env->log, uattr, uattr_size);
+	if (ret)
+		err = ret;
+errout_free:
 	btf_verifier_env_free(env);
 	if (btf)
 		btf_free(btf);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 47bea2fad6fe..1fae2c5d7ae4 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,12 +10,26 @@
 #include <linux/bpf_verifier.h>
 #include <linux/math64.h>
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
 	return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 &&
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size)
+{
+	log->level = log_level;
+	log->ubuf = log_buf;
+	log->len_total = log_size;
+
+	/* log attributes have to be sane */
+	if (!bpf_verifier_log_attr_valid(log))
+		return -EINVAL;
+
+	return 0;
+}
+
 static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
 {
 	/* add_len includes terminal \0, so no need for +1. */
@@ -197,24 +211,25 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 	return 0;
 }
 
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+static bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
 	return log->len_max > log->len_total;
 }
 
-void bpf_vlog_finalize(struct bpf_verifier_log *log)
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
 {
 	u32 sublen;
 	int err;
 
-	if (!log || !log->level || !log->ubuf)
-		return;
-	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
-		return;
+	*log_size_actual = 0;
+	if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+		return 0;
 
+	if (!log->ubuf)
+		goto skip_log_rotate;
 	/* If we never truncated log, there is nothing to move around. */
-	if (log->start_pos == 0)
-		return;
+	if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0)
+		goto skip_log_rotate;
 
 	/* Otherwise we need to rotate log contents to make it start from the
 	 * buffer beginning and be a continuous zero-terminated string. Note
@@ -257,6 +272,21 @@ void bpf_vlog_finalize(struct bpf_verifier_log *log)
 	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
 	if (err)
 		log->ubuf = NULL;
+
+skip_log_rotate:
+	*log_size_actual = log->len_max;
+
+	/* properly initialized log has either both ubuf!=NULL and len_total>0
+	 * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+	 * we got a fault somewhere along the way, so report it back
+	 */
+	if (!!log->ubuf != !!log->len_total)
+		return -EFAULT;
+
+	if (bpf_vlog_truncated(log))
+		return -ENOSPC;
+
+	return 0;
 }
 
 /* log_level controls verbosity level of eBPF verifier.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 308e7abeb979..d6db6de3e9ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18698,8 +18698,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
-	struct bpf_verifier_log *log;
-	int i, len, ret = -EINVAL;
+	int i, len, ret = -EINVAL, err;
+	u32 log_true_size;
 	bool is_priv;
 
 	/* no program is valid */
@@ -18712,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
-	log = &env->log;
 
 	len = (*prog)->len;
 	env->insn_aux_data =
@@ -18733,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (!is_priv)
 		mutex_lock(&bpf_verifier_lock);
 
-	if (attr->log_level || attr->log_buf || attr->log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->log_level;
-		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log->len_total = attr->log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			ret = -EINVAL;
-			goto err_unlock;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	ret = bpf_vlog_init(&env->log, attr->log_level,
+			    (char __user *) (unsigned long) attr->log_buf,
+			    attr->log_size);
+	if (ret)
+		goto err_unlock;
 
 	mark_verifier_state_clean(env);
 
@@ -18860,17 +18853,17 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	bpf_vlog_finalize(log);
+	/* preserve original error even if log finalization is successful */
+	err = bpf_vlog_finalize(&env->log, &log_true_size);
+	if (err)
+		ret = err;
+
 	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
 	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
+				  &log_true_size, sizeof(log_true_size))) {
 		ret = -EFAULT;
 		goto err_release_maps;
 	}
-	if (bpf_vlog_truncated(log))
-		ret = -ENOSPC;
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
-		ret = -EFAULT;
 
 	if (ret)
 		goto err_release_maps;
-- 
cgit v1.2.3


From 3664ff82dae1ef9f14f7763d3dd30565e7ef9e14 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank.li@vivo.com>
Date: Mon, 10 Apr 2023 00:43:37 +0800
Subject: dm: add helper macro for simple DM target module init and exit

Eliminate duplicate boilerplate code for simple modules that contain
a single DM target driver without any additional setup code.

Add a new module_dm() macro, which replaces the module_init() and
module_exit() with template functions that call dm_register_target()
and dm_unregister_target() respectively.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-crypt.c         | 14 +-------------
 drivers/md/dm-delay.c         | 15 +--------------
 drivers/md/dm-dust.c          | 14 +-------------
 drivers/md/dm-ebs-target.c    | 14 +-------------
 drivers/md/dm-era-target.c    | 14 +-------------
 drivers/md/dm-flakey.c        | 15 +--------------
 drivers/md/dm-log-writes.c    | 14 +-------------
 drivers/md/dm-raid.c          | 18 +-----------------
 drivers/md/dm-switch.c        | 14 +-------------
 drivers/md/dm-unstripe.c      | 14 +-------------
 drivers/md/dm-verity-target.c | 14 +-------------
 drivers/md/dm-writecache.c    | 14 +-------------
 drivers/md/dm-zero.c          | 14 +-------------
 drivers/md/dm-zoned-target.c  | 16 ++--------------
 include/linux/device-mapper.h | 20 ++++++++++++++++++++
 15 files changed, 35 insertions(+), 189 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 52615a258e13..8b47b913ee83 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3659,19 +3659,7 @@ static struct target_type crypt_target = {
 	.iterate_devices = crypt_iterate_devices,
 	.io_hints = crypt_io_hints,
 };
-
-static int __init dm_crypt_init(void)
-{
-	return dm_register_target(&crypt_target);
-}
-
-static void __exit dm_crypt_exit(void)
-{
-	dm_unregister_target(&crypt_target);
-}
-
-module_init(dm_crypt_init);
-module_exit(dm_crypt_exit);
+module_dm(crypt);
 
 MODULE_AUTHOR("Jana Saout <jana@saout.de>");
 MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 00d18b2070a5..7433525e5985 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -367,20 +367,7 @@ static struct target_type delay_target = {
 	.status	     = delay_status,
 	.iterate_devices = delay_iterate_devices,
 };
-
-static int __init dm_delay_init(void)
-{
-	return dm_register_target(&delay_target);
-}
-
-static void __exit dm_delay_exit(void)
-{
-	dm_unregister_target(&delay_target);
-}
-
-/* Module hooks */
-module_init(dm_delay_init);
-module_exit(dm_delay_exit);
+module_dm(delay);
 
 MODULE_DESCRIPTION(DM_NAME " delay target");
 MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 9bf3cdf548de..12a377e06d02 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -570,19 +570,7 @@ static struct target_type dust_target = {
 	.status = dust_status,
 	.prepare_ioctl = dust_prepare_ioctl,
 };
-
-static int __init dm_dust_init(void)
-{
-	return dm_register_target(&dust_target);
-}
-
-static void __exit dm_dust_exit(void)
-{
-	dm_unregister_target(&dust_target);
-}
-
-module_init(dm_dust_init);
-module_exit(dm_dust_exit);
+module_dm(dust);
 
 MODULE_DESCRIPTION(DM_NAME " dust test target");
 MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 38da4de3ecbf..435b45201f4d 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -452,19 +452,7 @@ static struct target_type ebs_target = {
 	.prepare_ioctl	 = ebs_prepare_ioctl,
 	.iterate_devices = ebs_iterate_devices,
 };
-
-static int __init dm_ebs_init(void)
-{
-	return dm_register_target(&ebs_target);
-}
-
-static void dm_ebs_exit(void)
-{
-	dm_unregister_target(&ebs_target);
-}
-
-module_init(dm_ebs_init);
-module_exit(dm_ebs_exit);
+module_dm(ebs);
 
 MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
 MODULE_DESCRIPTION(DM_NAME " emulated block size target");
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 554d234fca18..0d70914217ee 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1753,19 +1753,7 @@ static struct target_type era_target = {
 	.iterate_devices = era_iterate_devices,
 	.io_hints = era_io_hints
 };
-
-static int __init dm_era_init(void)
-{
-	return dm_register_target(&era_target);
-}
-
-static void __exit dm_era_exit(void)
-{
-	dm_unregister_target(&era_target);
-}
-
-module_init(dm_era_init);
-module_exit(dm_era_exit);
+module_dm(era);
 
 MODULE_DESCRIPTION(DM_NAME " era target");
 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 14179355e6a1..ebcfb99b186b 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -506,20 +506,7 @@ static struct target_type flakey_target = {
 	.prepare_ioctl = flakey_prepare_ioctl,
 	.iterate_devices = flakey_iterate_devices,
 };
-
-static int __init dm_flakey_init(void)
-{
-	return dm_register_target(&flakey_target);
-}
-
-static void __exit dm_flakey_exit(void)
-{
-	dm_unregister_target(&flakey_target);
-}
-
-/* Module hooks */
-module_init(dm_flakey_init);
-module_exit(dm_flakey_exit);
+module_dm(flakey);
 
 MODULE_DESCRIPTION(DM_NAME " flakey target");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 6d7436d2fd7f..f17a6cf2284e 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -937,19 +937,7 @@ static struct target_type log_writes_target = {
 	.dax_zero_page_range = log_writes_dax_zero_page_range,
 	.dax_recovery_write = log_writes_dax_recovery_write,
 };
-
-static int __init dm_log_writes_init(void)
-{
-	return dm_register_target(&log_writes_target);
-}
-
-static void __exit dm_log_writes_exit(void)
-{
-	dm_unregister_target(&log_writes_target);
-}
-
-module_init(dm_log_writes_init);
-module_exit(dm_log_writes_exit);
+module_dm(log_writes);
 
 MODULE_DESCRIPTION(DM_NAME " log writes target");
 MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2dfd51509647..c8821fcb8299 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -4077,23 +4077,7 @@ static struct target_type raid_target = {
 	.preresume = raid_preresume,
 	.resume = raid_resume,
 };
-
-static int __init dm_raid_init(void)
-{
-	DMINFO("Loading target version %u.%u.%u",
-	       raid_target.version[0],
-	       raid_target.version[1],
-	       raid_target.version[2]);
-	return dm_register_target(&raid_target);
-}
-
-static void __exit dm_raid_exit(void)
-{
-	dm_unregister_target(&raid_target);
-}
-
-module_init(dm_raid_init);
-module_exit(dm_raid_exit);
+module_dm(raid);
 
 module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 5a5976b0cfb8..dfd9fb52a6f3 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -565,19 +565,7 @@ static struct target_type switch_target = {
 	.prepare_ioctl = switch_prepare_ioctl,
 	.iterate_devices = switch_iterate_devices,
 };
-
-static int __init dm_switch_init(void)
-{
-	return dm_register_target(&switch_target);
-}
-
-static void __exit dm_switch_exit(void)
-{
-	dm_unregister_target(&switch_target);
-}
-
-module_init(dm_switch_init);
-module_exit(dm_switch_exit);
+module_dm(switch);
 
 MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
 MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c
index e7b7d5983a16..48587c16c445 100644
--- a/drivers/md/dm-unstripe.c
+++ b/drivers/md/dm-unstripe.c
@@ -192,19 +192,7 @@ static struct target_type unstripe_target = {
 	.iterate_devices = unstripe_iterate_devices,
 	.io_hints = unstripe_io_hints,
 };
-
-static int __init dm_unstripe_init(void)
-{
-	return dm_register_target(&unstripe_target);
-}
-
-static void __exit dm_unstripe_exit(void)
-{
-	dm_unregister_target(&unstripe_target);
-}
-
-module_init(dm_unstripe_init);
-module_exit(dm_unstripe_exit);
+module_dm(unstripe);
 
 MODULE_DESCRIPTION(DM_NAME " unstriped target");
 MODULE_ALIAS("dm-unstriped");
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 260f04d12982..e35c16e06d06 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1514,19 +1514,7 @@ static struct target_type verity_target = {
 	.iterate_devices = verity_iterate_devices,
 	.io_hints	= verity_io_hints,
 };
-
-static int __init dm_verity_init(void)
-{
-	return dm_register_target(&verity_target);
-}
-
-static void __exit dm_verity_exit(void)
-{
-	dm_unregister_target(&verity_target);
-}
-
-module_init(dm_verity_init);
-module_exit(dm_verity_exit);
+module_dm(verity);
 
 MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
 MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 81b60b75a9fa..074cb785eafc 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -2773,19 +2773,7 @@ static struct target_type writecache_target = {
 	.iterate_devices	= writecache_iterate_devices,
 	.io_hints		= writecache_io_hints,
 };
-
-static int __init dm_writecache_init(void)
-{
-	return dm_register_target(&writecache_target);
-}
-
-static void __exit dm_writecache_exit(void)
-{
-	dm_unregister_target(&writecache_target);
-}
-
-module_init(dm_writecache_init);
-module_exit(dm_writecache_exit);
+module_dm(writecache);
 
 MODULE_DESCRIPTION(DM_NAME " writecache target");
 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 884ac726a743..3b13e6eb1aa4 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -75,19 +75,7 @@ static struct target_type zero_target = {
 	.map    = zero_map,
 	.io_hints = zero_io_hints,
 };
-
-static int __init dm_zero_init(void)
-{
-	return dm_register_target(&zero_target);
-}
-
-static void __exit dm_zero_exit(void)
-{
-	dm_unregister_target(&zero_target);
-}
-
-module_init(dm_zero_init)
-module_exit(dm_zero_exit)
+module_dm(zero);
 
 MODULE_AUTHOR("Jana Saout <jana@saout.de>");
 MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index ad4764dcd013..ad8e670a2f9b 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1138,7 +1138,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
 	return r;
 }
 
-static struct target_type dmz_type = {
+static struct target_type zoned_target = {
 	.name		 = "zoned",
 	.version	 = {2, 0, 0},
 	.features	 = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
@@ -1154,19 +1154,7 @@ static struct target_type dmz_type = {
 	.status		 = dmz_status,
 	.message	 = dmz_message,
 };
-
-static int __init dmz_init(void)
-{
-	return dm_register_target(&dmz_type);
-}
-
-static void __exit dmz_exit(void)
-{
-	dm_unregister_target(&dmz_type);
-}
-
-module_init(dmz_init);
-module_exit(dmz_exit);
+module_dm(zoned);
 
 MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
 MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 8aa6b3ea91fa..f2d9afb8a7e9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -631,6 +631,26 @@ void dm_destroy_crypto_profile(struct blk_crypto_profile *profile);
 		DMEMIT("target_name=%s,target_version=%u.%u.%u", \
 		       (y)->name, (y)->version[0], (y)->version[1], (y)->version[2])
 
+/**
+ * module_dm() - Helper macro for DM targets that don't do anything
+ * special in their module_init and module_exit.
+ * Each module may only use this macro once, and calling it replaces
+ * module_init() and module_exit().
+ *
+ * @name: DM target's name
+ */
+#define module_dm(name) \
+static int __init dm_##name##_init(void) \
+{ \
+	return dm_register_target(&(name##_target)); \
+} \
+module_init(dm_##name##_init) \
+static void __exit dm_##name##_exit(void) \
+{ \
+	dm_unregister_target(&(name##_target)); \
+} \
+module_exit(dm_##name##_exit)
+
 /*
  * Definitions of return values from target end_io function.
  */
-- 
cgit v1.2.3


From a7687fec197596f23954225c99e183e90208bcbb Mon Sep 17 00:00:00 2001
From: Jason Kim <sukbeom.kim@gmail.com>
Date: Tue, 28 Mar 2023 12:14:59 +0900
Subject: media: mc-device: remove unnecessary __must_check

In the file mc-device.c, the function
media_device_register_entity_notify() does not need to have the
__must_check attribute since it returns only a value of 0.
Therefore, we can remove this attribute and change the function's
return type.

Signed-off-by: Jason Kim <sukbeom.kim@gmail.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/mc/mc-device.c           | 3 +--
 drivers/media/usb/au0828/au0828-core.c | 9 ++-------
 include/media/media-device.h           | 5 ++---
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/media/mc/mc-device.c b/drivers/media/mc/mc-device.c
index 25020d58eb06..8cee956e38d4 100644
--- a/drivers/media/mc/mc-device.c
+++ b/drivers/media/mc/mc-device.c
@@ -756,13 +756,12 @@ int __must_check __media_device_register(struct media_device *mdev,
 }
 EXPORT_SYMBOL_GPL(__media_device_register);
 
-int __must_check media_device_register_entity_notify(struct media_device *mdev,
+void media_device_register_entity_notify(struct media_device *mdev,
 					struct media_entity_notify *nptr)
 {
 	mutex_lock(&mdev->graph_mutex);
 	list_add_tail(&nptr->list, &mdev->entity_notify);
 	mutex_unlock(&mdev->graph_mutex);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(media_device_register_entity_notify);
 
diff --git a/drivers/media/usb/au0828/au0828-core.c b/drivers/media/usb/au0828/au0828-core.c
index 400cf9bd26b9..b3a09d3ac7d2 100644
--- a/drivers/media/usb/au0828/au0828-core.c
+++ b/drivers/media/usb/au0828/au0828-core.c
@@ -627,14 +627,9 @@ static int au0828_media_device_register(struct au0828_dev *dev,
 	/* register entity_notify callback */
 	dev->entity_notify.notify_data = (void *) dev;
 	dev->entity_notify.notify = (void *) au0828_media_graph_notify;
-	ret = media_device_register_entity_notify(dev->media_dev,
+	media_device_register_entity_notify(dev->media_dev,
 						  &dev->entity_notify);
-	if (ret) {
-		dev_err(&udev->dev,
-			"Media Device register entity_notify Error: %d\n",
-			ret);
-		return ret;
-	}
+
 	/* set enable_source */
 	mutex_lock(&dev->media_dev->graph_mutex);
 	dev->media_dev->source_priv = (void *) dev;
diff --git a/include/media/media-device.h b/include/media/media-device.h
index 86716ee7cc6c..2c146d0b2b1c 100644
--- a/include/media/media-device.h
+++ b/include/media/media-device.h
@@ -364,7 +364,7 @@ void media_device_unregister_entity(struct media_entity *entity);
  *    media_entity_notify callbacks are invoked.
  */
 
-int __must_check media_device_register_entity_notify(struct media_device *mdev,
+void media_device_register_entity_notify(struct media_device *mdev,
 					struct media_entity_notify *nptr);
 
 /**
@@ -444,11 +444,10 @@ static inline int media_device_register_entity(struct media_device *mdev,
 static inline void media_device_unregister_entity(struct media_entity *entity)
 {
 }
-static inline int media_device_register_entity_notify(
+static inline void media_device_register_entity_notify(
 					struct media_device *mdev,
 					struct media_entity_notify *nptr)
 {
-	return 0;
 }
 static inline void media_device_unregister_entity_notify(
 					struct media_device *mdev,
-- 
cgit v1.2.3


From bd5a03bc5be8cc74c607896cb780a2819389a4dd Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Fri, 24 Mar 2023 12:35:29 +0200
Subject: media: Accept non-subdev sinks in v4l2_create_fwnode_links_to_pad()

The v4l2_create_fwnode_links_to_pad() helper requires the sink pad
passed to it to belong to a subdev. This requirement can be lifted
easily. Make the function usable for non-subdev sinks, which allows
using it with video_device sinks.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-mc.c | 15 ++++++---------
 include/media/v4l2-mc.h           |  8 ++++----
 2 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-mc.c b/drivers/media/v4l2-core/v4l2-mc.c
index b01474717dca..bf0c18100664 100644
--- a/drivers/media/v4l2-core/v4l2-mc.c
+++ b/drivers/media/v4l2-core/v4l2-mc.c
@@ -313,14 +313,11 @@ int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd,
 				    struct media_pad *sink, u32 flags)
 {
 	struct fwnode_handle *endpoint;
-	struct v4l2_subdev *sink_sd;
 
 	if (!(sink->flags & MEDIA_PAD_FL_SINK) ||
 	    !is_media_entity_v4l2_subdev(sink->entity))
 		return -EINVAL;
 
-	sink_sd = media_entity_to_v4l2_subdev(sink->entity);
-
 	fwnode_graph_for_each_endpoint(dev_fwnode(src_sd->dev), endpoint) {
 		struct fwnode_handle *remote_ep;
 		int src_idx, sink_idx, ret;
@@ -340,7 +337,7 @@ int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd,
 		 * ask the sink to verify it owns the remote endpoint,
 		 * and translate to a sink pad.
 		 */
-		sink_idx = media_entity_get_fwnode_pad(&sink_sd->entity,
+		sink_idx = media_entity_get_fwnode_pad(sink->entity,
 						       remote_ep,
 						       MEDIA_PAD_FL_SINK);
 		fwnode_handle_put(remote_ep);
@@ -362,17 +359,17 @@ int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd,
 		if (media_entity_find_link(src, sink))
 			continue;
 
-		dev_dbg(sink_sd->dev, "creating link %s:%d -> %s:%d\n",
+		dev_dbg(src_sd->dev, "creating link %s:%d -> %s:%d\n",
 			src_sd->entity.name, src_idx,
-			sink_sd->entity.name, sink_idx);
+			sink->entity->name, sink_idx);
 
 		ret = media_create_pad_link(&src_sd->entity, src_idx,
-					    &sink_sd->entity, sink_idx, flags);
+					    sink->entity, sink_idx, flags);
 		if (ret) {
-			dev_err(sink_sd->dev,
+			dev_err(src_sd->dev,
 				"link %s:%d -> %s:%d failed with %d\n",
 				src_sd->entity.name, src_idx,
-				sink_sd->entity.name, sink_idx, ret);
+				sink->entity->name, sink_idx, ret);
 
 			fwnode_handle_put(endpoint);
 			return ret;
diff --git a/include/media/v4l2-mc.h b/include/media/v4l2-mc.h
index c181685923d5..b39586dfba35 100644
--- a/include/media/v4l2-mc.h
+++ b/include/media/v4l2-mc.h
@@ -87,17 +87,17 @@ int v4l_vb2q_enable_media_source(struct vb2_queue *q);
 
 /**
  * v4l2_create_fwnode_links_to_pad - Create fwnode-based links from a
- *                                   source subdev to a sink subdev pad.
+ *                                   source subdev to a sink pad.
  *
  * @src_sd: pointer to a source subdev
- * @sink:  pointer to a subdev sink pad
+ * @sink:  pointer to a sink pad
  * @flags: the link flags
  *
  * This function searches for fwnode endpoint connections from a source
  * subdevice to a single sink pad, and if suitable connections are found,
  * translates them into media links to that pad. The function can be
- * called by the sink subdevice, in its v4l2-async notifier subdev bound
- * callback, to create links from a bound source subdevice.
+ * called by the sink, in its v4l2-async notifier bound callback, to create
+ * links from a bound source subdevice.
  *
  * The @flags argument specifies the link flags. The caller shall ensure that
  * the flags are valid regardless of the number of links that may be created.
-- 
cgit v1.2.3


From 88a4d7bdeec97890cd543b58dd3588f1f879f51b Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Mon, 20 Feb 2023 08:43:05 -0500
Subject: NFS: Configure support for netfs when NFS fscache is configured

As first steps for support of the netfs library when NFS_FSCACHE is
configured, add NETFS_SUPPORT to Kconfig and add the required netfs_inode
into struct nfs_inode.

Using netfs requires we move the VFS inode structure to be stored
inside struct netfs_inode, along with the fscache_cookie.
Thus, if NFS_FSCACHE is configured, place netfs_inode inside an
anonymous union so the vfs_inode memory is the same and we do
not need to modify other non-fscache areas of NFS.
In addition, inside the NFS fscache code, use the new helpers,
netfs_inode() and netfs_i_cookie() helpers, and remove our own
helper, nfs_i_fscache().

Later patches will convert NFS fscache to fully use netfs.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-by: Daire Byrne <daire@dneg.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/Kconfig         |  1 +
 fs/nfs/fscache.c       | 20 +++++++++-----------
 fs/nfs/fscache.h       | 15 ++++++---------
 include/linux/nfs_fs.h | 24 ++++++++++--------------
 4 files changed, 26 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c1c7ed2fd860..b6fc169be1b1 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -170,6 +170,7 @@ config ROOT_NFS
 config NFS_FSCACHE
 	bool "Provide NFS client caching support"
 	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+	select NETFS_SUPPORT
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
 	  the general filesystem cache manager
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ea5f2976dfab..3b4dafa771bd 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -163,13 +163,14 @@ void nfs_fscache_init_inode(struct inode *inode)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	nfsi->fscache = NULL;
+	netfs_inode(inode)->cache = NULL;
 	if (!(nfss->fscache && S_ISREG(inode->i_mode)))
 		return;
 
 	nfs_fscache_update_auxdata(&auxdata, inode);
 
-	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+	netfs_inode(inode)->cache = fscache_acquire_cookie(
+					       nfss->fscache,
 					       0,
 					       nfsi->fh.data, /* index_key */
 					       nfsi->fh.size,
@@ -183,11 +184,8 @@ void nfs_fscache_init_inode(struct inode *inode)
  */
 void nfs_fscache_clear_inode(struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
-
-	fscache_relinquish_cookie(cookie, false);
-	nfsi->fscache = NULL;
+	fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false);
+	netfs_inode(inode)->cache = NULL;
 }
 
 /*
@@ -212,7 +210,7 @@ void nfs_fscache_clear_inode(struct inode *inode)
 void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	bool open_for_write = inode_is_open_for_write(inode);
 
 	if (!fscache_cookie_valid(cookie))
@@ -230,7 +228,7 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	loff_t i_size = i_size_read(inode);
 
 	nfs_fscache_update_auxdata(&auxdata, inode);
@@ -243,7 +241,7 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 static int fscache_fallback_read_page(struct inode *inode, struct page *page)
 {
 	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
 	struct iov_iter iter;
 	struct bio_vec bvec;
 	int ret;
@@ -269,7 +267,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
 				       bool no_space_allocated_yet)
 {
 	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
 	struct iov_iter iter;
 	struct bio_vec bvec;
 	loff_t start = page_offset(page);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2a37af880978..38614ed8f951 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -54,7 +54,7 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
 		folio_wait_fscache(folio);
-		fscache_note_page_release(nfs_i_fscache(folio->mapping->host));
+		fscache_note_page_release(netfs_i_cookie(&NFS_I(folio->mapping->host)->netfs));
 		nfs_inc_fscache_stats(folio->mapping->host,
 				      NFSIOS_FSCACHE_PAGES_UNCACHED);
 	}
@@ -66,7 +66,7 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
  */
 static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
 {
-	if (nfs_i_fscache(inode))
+	if (netfs_inode(inode)->cache)
 		return __nfs_fscache_read_page(inode, page);
 	return -ENOBUFS;
 }
@@ -78,7 +78,7 @@ static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
 static inline void nfs_fscache_write_page(struct inode *inode,
 					   struct page *page)
 {
-	if (nfs_i_fscache(inode))
+	if (netfs_inode(inode)->cache)
 		__nfs_fscache_write_page(inode, page);
 }
 
@@ -101,13 +101,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie =  netfs_i_cookie(&NFS_I(inode)->netfs);
 
-	if (nfsi->fscache) {
-		nfs_fscache_update_auxdata(&auxdata, inode);
-		fscache_invalidate(nfsi->fscache, &auxdata,
-				   i_size_read(inode), flags);
-	}
+	nfs_fscache_update_auxdata(&auxdata, inode);
+	fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags);
 }
 
 /*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index bf89fe6fc3ba..041e79a48f7a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -31,6 +31,10 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/clnt.h>
 
+#ifdef CONFIG_NFS_FSCACHE
+#include <linux/netfs.h>
+#endif
+
 #include <linux/nfs.h>
 #include <linux/nfs2.h>
 #include <linux/nfs3.h>
@@ -204,14 +208,15 @@ struct nfs_inode {
 	/* how many bytes have been written/read and how many bytes queued up */
 	__u64 write_io;
 	__u64 read_io;
-#ifdef CONFIG_NFS_FSCACHE
-	struct fscache_cookie	*fscache;
-#endif
-	struct inode		vfs_inode;
-
 #ifdef CONFIG_NFS_V4_2
 	struct nfs4_xattr_cache *xattr_cache;
 #endif
+	union {
+		struct inode		vfs_inode;
+#ifdef CONFIG_NFS_FSCACHE
+		struct netfs_inode	netfs; /* netfs context and VFS inode */
+#endif
+	};
 };
 
 struct nfs4_copy_state {
@@ -329,15 +334,6 @@ static inline int NFS_STALE(const struct inode *inode)
 	return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 }
 
-static inline struct fscache_cookie *nfs_i_fscache(struct inode *inode)
-{
-#ifdef CONFIG_NFS_FSCACHE
-	return NFS_I(inode)->fscache;
-#else
-	return NULL;
-#endif
-}
-
 static inline __u64 NFS_FILEID(const struct inode *inode)
 {
 	return NFS_I(inode)->fileid;
-- 
cgit v1.2.3


From 000dbe0bec058cbf2ca9e156e4a5584f5158b0f9 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Mon, 20 Feb 2023 08:43:06 -0500
Subject: NFS: Convert buffered read paths to use netfs when fscache is enabled

Convert the NFS buffered read code paths to corresponding netfs APIs,
but only when fscache is configured and enabled.

The netfs API defines struct netfs_request_ops which must be filled
in by the network filesystem.  For NFS, we only need to define 5 of
the functions, the main one being the issue_read() function.
The issue_read() function is called by the netfs layer when a read
cannot be fulfilled locally, and must be sent to the server (either
the cache is not active, or it is active but the data is not available).
Once the read from the server is complete, netfs requires a call to
netfs_subreq_terminated() which conveys either how many bytes were read
successfully, or an error.  Note that issue_read() is called with a
structure, netfs_io_subrequest, which defines the IO requested, and
contains a start and a length (both in bytes), and assumes the underlying
netfs will return a either an error on the whole region, or the number
of bytes successfully read.

The NFS IO path is page based and the main APIs are the pgio APIs defined
in pagelist.c.  For the pgio APIs, there is no way for the caller to
know how many RPCs will be sent and how the pages will be broken up
into underlying RPCs, each of which will have their own completion and
return code.  In contrast, netfs is subrequest based, a single
subrequest may contain multiple pages, and a single subrequest is
initiated with issue_read() and terminated with netfs_subreq_terminated().
Thus, to utilze the netfs APIs, NFS needs some way to accommodate
the netfs API requirement on the single response to the whole
subrequest, while also minimizing disruptive changes to the NFS
pgio layer.

The approach taken with this patch is to allocate a small structure
for each nfs_netfs_issue_read() call, store the final error and number
of bytes successfully transferred in the structure, and update these values
as each RPC completes.  The refcount on the structure is used as a marker
for the last RPC completion, is incremented in nfs_netfs_read_initiate(),
and decremented inside nfs_netfs_read_completion(), when a nfs_pgio_header
contains a valid pointer to the data.  On the final put (which signals
the final outstanding RPC is complete) in nfs_netfs_read_completion(),
call netfs_subreq_terminated() with either the final error value (if
one or more READs complete with an error) or the number of bytes
successfully transferred (if all RPCs complete successfully).  Note
that when all RPCs complete successfully, the number of bytes transferred
is capped to the length of the subrequest.  Capping the transferred length
to the subrequest length prevents "Subreq overread" warnings from netfs.
This is due to the "aligned_len" in nfs_pageio_add_page(), and the
corner case where NFS requests a full page at the end of the file,
even when i_size reflects only a partial page (NFS overread).

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-by: Daire Byrne <daire@dneg.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/fscache.c         | 222 ++++++++++++++++++++++++++++-------------------
 fs/nfs/fscache.h         | 122 +++++++++++++++++++-------
 fs/nfs/inode.c           |   2 +
 fs/nfs/internal.h        |   9 ++
 fs/nfs/pagelist.c        |   4 +
 fs/nfs/read.c            |  61 +++++++------
 include/linux/nfs_page.h |   3 +
 include/linux/nfs_xdr.h  |   3 +
 8 files changed, 274 insertions(+), 152 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 3b4dafa771bd..95c2b3056e2b 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -15,6 +15,9 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/xarray.h>
+#include <linux/fscache.h>
+#include <linux/netfs.h>
 
 #include "internal.h"
 #include "iostat.h"
@@ -235,108 +238,153 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 	fscache_unuse_cookie(cookie, &auxdata, &i_size);
 }
 
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
+int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
-	struct iov_iter iter;
-	struct bio_vec bvec;
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
-	iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE);
-
-	ret = fscache_begin_read_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
-			   NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	if (!netfs_inode(folio_inode(folio))->cache)
+		return -ENOBUFS;
+
+	return netfs_read_folio(file, folio);
 }
 
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_page(struct inode *inode, struct page *page,
-				       bool no_space_allocated_yet)
+int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
-	struct iov_iter iter;
-	struct bio_vec bvec;
-	loff_t start = page_offset(page);
-	size_t len = PAGE_SIZE;
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
-	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
-
-	ret = fscache_begin_write_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
-				      no_space_allocated_yet);
-	if (ret == 0)
-		ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	struct inode *inode = ractl->mapping->host;
+
+	if (!netfs_inode(inode)->cache)
+		return -ENOBUFS;
+
+	netfs_readahead(ractl);
+	return 0;
 }
 
-/*
- * Retrieve a page from fscache
- */
-int __nfs_fscache_read_page(struct inode *inode, struct page *page)
+atomic_t nfs_netfs_debug_id;
+static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
-	int ret;
+	rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+	rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
 
-	trace_nfs_fscache_read_page(inode, page);
-	if (PageChecked(page)) {
-		ClearPageChecked(page);
-		ret = 1;
-		goto out;
-	}
+	return 0;
+}
 
-	ret = fscache_fallback_read_page(inode, page);
-	if (ret < 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
-		SetPageChecked(page);
-		goto out;
-	}
+static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+{
+	put_nfs_open_context(rreq->netfs_priv);
+}
 
-	/* Read completed synchronously */
-	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
-	SetPageUptodate(page);
-	ret = 0;
-out:
-	trace_nfs_fscache_read_page_exit(inode, page, ret);
-	return ret;
+static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+	return fscache_begin_read_operation(&rreq->cache_resources,
+					    netfs_i_cookie(netfs_inode(rreq->inode)));
 }
 
-/*
- * Store a newly fetched page in fscache.  We can be certain there's no page
- * stored in the cache as yet otherwise we would've read it from there.
- */
-void __nfs_fscache_write_page(struct inode *inode, struct page *page)
+static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
 {
-	int ret;
+	struct nfs_netfs_io_data *netfs;
+
+	netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT);
+	if (!netfs)
+		return NULL;
+	netfs->sreq = sreq;
+	refcount_set(&netfs->refcount, 1);
+	return netfs;
+}
 
-	trace_nfs_fscache_write_page(inode, page);
+static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
+{
+	size_t	rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
 
-	ret = fscache_fallback_write_page(inode, page, true);
+	sreq->len = min(sreq->len, rsize);
+	return true;
+}
 
-	if (ret != 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
-	} else {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
+{
+	struct nfs_netfs_io_data	*netfs;
+	struct nfs_pageio_descriptor	pgio;
+	struct inode *inode = sreq->rreq->inode;
+	struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
+	struct page *page;
+	int err;
+	pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+	pgoff_t last = ((sreq->start + sreq->len -
+			 sreq->transferred - 1) >> PAGE_SHIFT);
+	XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	netfs = nfs_netfs_alloc(sreq);
+	if (!netfs)
+		return netfs_subreq_terminated(sreq, -ENOMEM, false);
+
+	pgio.pg_netfs = netfs; /* used in completion */
+
+	xas_lock(&xas);
+	xas_for_each(&xas, page, last) {
+		/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs  */
+		xas_pause(&xas);
+		xas_unlock(&xas);
+		err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
+		if (err < 0) {
+			netfs->error = err;
+			goto out;
+		}
+		xas_lock(&xas);
 	}
-	trace_nfs_fscache_write_page_exit(inode, page, ret);
+	xas_unlock(&xas);
+out:
+	nfs_pageio_complete_read(&pgio);
+	nfs_netfs_put(netfs);
 }
+
+void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+
+	if (!netfs)
+		return;
+
+	nfs_netfs_get(netfs);
+}
+
+int nfs_netfs_folio_unlock(struct folio *folio)
+{
+	struct inode *inode = folio_file_mapping(folio)->host;
+
+	/*
+	 * If fscache is enabled, netfs will unlock pages.
+	 */
+	if (netfs_inode(inode)->cache)
+		return 0;
+
+	return 1;
+}
+
+void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+	struct netfs_io_subrequest      *sreq;
+
+	if (!netfs)
+		return;
+
+	sreq = netfs->sreq;
+	if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
+
+	if (hdr->error)
+		netfs->error = hdr->error;
+	else
+		atomic64_add(hdr->res.count, &netfs->transferred);
+
+	nfs_netfs_put(netfs);
+	hdr->netfs = NULL;
+}
+
+const struct netfs_request_ops nfs_netfs_ops = {
+	.init_request		= nfs_netfs_init_request,
+	.free_request		= nfs_netfs_free_request,
+	.begin_cache_operation	= nfs_netfs_begin_cache_operation,
+	.issue_read		= nfs_netfs_issue_read,
+	.clamp_length		= nfs_netfs_clamp_length
+};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 38614ed8f951..e1706e736c64 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -34,6 +34,58 @@ struct nfs_fscache_inode_auxdata {
 	u64	change_attr;
 };
 
+struct nfs_netfs_io_data {
+	/*
+	 * NFS may split a netfs_io_subrequest into multiple RPCs, each
+	 * with their own read completion.  In netfs, we can only call
+	 * netfs_subreq_terminated() once for each subrequest.  Use the
+	 * refcount here to double as a marker of the last RPC completion,
+	 * and only call netfs via netfs_subreq_terminated() once.
+	 */
+	refcount_t			refcount;
+	struct netfs_io_subrequest	*sreq;
+
+	/*
+	 * Final disposition of the netfs_io_subrequest, sent in
+	 * netfs_subreq_terminated()
+	 */
+	atomic64_t	transferred;
+	int		error;
+};
+
+static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
+{
+	refcount_inc(&netfs->refcount);
+}
+
+static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
+{
+	ssize_t final_len;
+
+	/* Only the last RPC completion should call netfs_subreq_terminated() */
+	if (!refcount_dec_and_test(&netfs->refcount))
+		return;
+
+	/*
+	 * The NFS pageio interface may read a complete page, even when netfs
+	 * only asked for a partial page.  Specifically, this may be seen when
+	 * one thread is truncating a file while another one is reading the last
+	 * page of the file.
+	 * Correct the final length here to be no larger than the netfs subrequest
+	 * length, and thus avoid netfs's "Subreq overread" warning message.
+	 */
+	final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
+	netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+	kfree(netfs);
+}
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
+{
+	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+}
+extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
+extern int nfs_netfs_folio_unlock(struct folio *folio);
+
 /*
  * fscache.c
  */
@@ -44,9 +96,8 @@ extern void nfs_fscache_init_inode(struct inode *);
 extern void nfs_fscache_clear_inode(struct inode *);
 extern void nfs_fscache_open_file(struct inode *, struct file *);
 extern void nfs_fscache_release_file(struct inode *, struct file *);
-
-extern int __nfs_fscache_read_page(struct inode *, struct page *);
-extern void __nfs_fscache_write_page(struct inode *, struct page *);
+extern int nfs_netfs_readahead(struct readahead_control *ractl);
+extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
 
 static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 {
@@ -54,34 +105,11 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
 		folio_wait_fscache(folio);
-		fscache_note_page_release(netfs_i_cookie(&NFS_I(folio->mapping->host)->netfs));
-		nfs_inc_fscache_stats(folio->mapping->host,
-				      NFSIOS_FSCACHE_PAGES_UNCACHED);
 	}
+	fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
 	return true;
 }
 
-/*
- * Retrieve a page from an inode data storage object.
- */
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
-{
-	if (netfs_inode(inode)->cache)
-		return __nfs_fscache_read_page(inode, page);
-	return -ENOBUFS;
-}
-
-/*
- * Store a page newly fetched from the server in an inode data storage object
- * in the cache.
- */
-static inline void nfs_fscache_write_page(struct inode *inode,
-					   struct page *page)
-{
-	if (netfs_inode(inode)->cache)
-		__nfs_fscache_write_page(inode, page);
-}
-
 static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
 					      struct inode *inode)
 {
@@ -117,7 +145,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 	return "no ";
 }
 
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+					     struct nfs_pageio_descriptor *desc)
+{
+	hdr->netfs = desc->pg_netfs;
+}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+						   struct nfs_pgio_header *hdr)
+{
+	desc->pg_netfs = hdr->netfs;
+}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc)
+{
+	desc->pg_netfs = NULL;
+}
 #else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline int nfs_netfs_folio_unlock(struct folio *folio)
+{
+	return 1;
+}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
 static inline void nfs_fscache_init_inode(struct inode *inode) {}
@@ -125,22 +174,29 @@ static inline void nfs_fscache_clear_inode(struct inode *inode) {}
 static inline void nfs_fscache_open_file(struct inode *inode,
 					 struct file *filp) {}
 static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-
-static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+static inline int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	return true; /* may release folio */
+	return -ENOBUFS;
 }
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
+static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
 	return -ENOBUFS;
 }
-static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+	return true; /* may release folio */
+}
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
 
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 {
 	return "no ";
 }
-
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+					     struct nfs_pageio_descriptor *desc) {}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+						   struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {}
 #endif /* CONFIG_NFS_FSCACHE */
 #endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 222a28320e1c..5c8027e3c961 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2254,6 +2254,8 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_NFS_V4_2
 	nfsi->xattr_cache = NULL;
 #endif
+	nfs_netfs_inode_init(nfsi);
+
 	return &nfsi->vfs_inode;
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a65fe2a63ab..9ba07553eab8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -452,6 +452,10 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern int nfs_client_for_each_server(struct nfs_client *clp,
 				      int (*fn)(struct nfs_server *, void *),
 				      void *data);
+#ifdef CONFIG_NFS_FSCACHE
+extern const struct netfs_request_ops nfs_netfs_ops;
+#endif
+
 /* io.c */
 extern void nfs_start_io_read(struct inode *inode);
 extern void nfs_end_io_read(struct inode *inode);
@@ -481,9 +485,14 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
 
 struct nfs_pgio_completion_ops;
 /* read.c */
+extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 			struct inode *inode, bool force_mds,
 			const struct nfs_pgio_completion_ops *compl_ops);
+extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+			       struct nfs_open_context *ctx,
+			       struct folio *folio);
+extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 64fa8de199de..6efb5068c116 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,7 @@
 #include "internal.h"
 #include "pnfs.h"
 #include "nfstrace.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -105,6 +106,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->good_bytes = mirror->pg_count;
 	hdr->io_completion = desc->pg_io_completion;
 	hdr->dreq = desc->pg_dreq;
+	nfs_netfs_set_pgio_header(hdr, desc);
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
@@ -941,6 +943,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_lseg = NULL;
 	desc->pg_io_completion = NULL;
 	desc->pg_dreq = NULL;
+	nfs_netfs_reset_pageio_descriptor(desc);
 	desc->pg_bsize = bsize;
 
 	desc->pg_mirror_count = 1;
@@ -1477,6 +1480,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 
 	desc->pg_io_completion = hdr->io_completion;
 	desc->pg_dreq = hdr->dreq;
+	nfs_netfs_set_pageio_descriptor(desc, hdr);
 	list_splice_init(&hdr->pages, &pages);
 	while (!list_empty(&pages)) {
 		struct nfs_page *req = nfs_list_entry(pages.next);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8cb9e8d3c51d..f71eeee67e20 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -31,7 +31,7 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
@@ -74,7 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
-static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
+void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
 {
 	struct nfs_pgio_mirror *pgm;
 	unsigned long npages;
@@ -110,20 +110,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
 static void nfs_readpage_release(struct nfs_page *req, int error)
 {
-	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
 	struct folio *folio = nfs_page_to_folio(req);
 
-	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-		(long long)req_offset(req));
-
 	if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
 		folio_set_error(folio);
-	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-		if (folio_test_uptodate(folio))
-			nfs_fscache_write_page(inode, &folio->page);
-		folio_unlock(folio);
-	}
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+		if (nfs_netfs_folio_unlock(folio))
+			folio_unlock(folio);
+
 	nfs_release_request(req);
 }
 
@@ -177,6 +171,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		nfs_readpage_release(req, error);
 	}
+	nfs_netfs_read_completion(hdr);
+
 out:
 	hdr->release(hdr);
 }
@@ -187,6 +183,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
 	rpc_ops->read_setup(hdr, msg);
+	nfs_netfs_initiate_read(hdr);
 	trace_nfs_initiate_read(hdr);
 }
 
@@ -202,7 +199,7 @@ nfs_async_read_error(struct list_head *head, int error)
 	}
 }
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
 	.error_cleanup = nfs_async_read_error,
 	.completion = nfs_read_completion,
 };
@@ -277,9 +274,9 @@ static void nfs_readpage_result(struct rpc_task *task,
 		nfs_readpage_retry(task, hdr);
 }
 
-static int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
-			      struct nfs_open_context *ctx,
-			      struct folio *folio)
+int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+		       struct nfs_open_context *ctx,
+		       struct folio *folio)
 {
 	struct inode *inode = folio_file_mapping(folio)->host;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -295,15 +292,11 @@ static int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
 
 	aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize);
 
-	if (!IS_SYNC(inode)) {
-		error = nfs_fscache_read_page(inode, &folio->page);
-		if (error == 0)
-			goto out_unlock;
-	}
-
 	new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
-	if (IS_ERR(new))
-		goto out_error;
+	if (IS_ERR(new)) {
+		error = PTR_ERR(new);
+		goto out;
+	}
 
 	if (len < fsize)
 		folio_zero_segment(folio, len, fsize);
@@ -314,10 +307,6 @@ static int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
 		goto out;
 	}
 	return 0;
-out_error:
-	error = PTR_ERR(new);
-out_unlock:
-	folio_unlock(folio);
 out:
 	return error;
 }
@@ -356,6 +345,10 @@ int nfs_read_folio(struct file *file, struct folio *folio)
 	if (NFS_STALE(inode))
 		goto out_unlock;
 
+	ret = nfs_netfs_read_folio(file, folio);
+	if (!ret)
+		goto out;
+
 	ctx = get_nfs_open_context(nfs_file_open_context(file));
 
 	xchg(&ctx->error, 0);
@@ -364,7 +357,7 @@ int nfs_read_folio(struct file *file, struct folio *folio)
 
 	ret = nfs_read_add_folio(&pgio, ctx, folio);
 	if (ret)
-		goto out;
+		goto out_put;
 
 	nfs_pageio_complete_read(&pgio);
 	ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
@@ -373,14 +366,14 @@ int nfs_read_folio(struct file *file, struct folio *folio)
 		if (!folio_test_uptodate(folio) && !ret)
 			ret = xchg(&ctx->error, 0);
 	}
-out:
+out_put:
 	put_nfs_open_context(ctx);
+out:
 	trace_nfs_aop_readpage_done(inode, folio, ret);
 	return ret;
 out_unlock:
 	folio_unlock(folio);
-	trace_nfs_aop_readpage_done(inode, folio, ret);
-	return ret;
+	goto out;
 }
 
 void nfs_readahead(struct readahead_control *ractl)
@@ -401,6 +394,10 @@ void nfs_readahead(struct readahead_control *ractl)
 	if (NFS_STALE(inode))
 		goto out;
 
+	ret = nfs_netfs_readahead(ractl);
+	if (!ret)
+		goto out;
+
 	if (file == NULL) {
 		ret = -EBADF;
 		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index a2f1ca657623..aa9f4c6ebe26 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -105,6 +105,9 @@ struct nfs_pageio_descriptor {
 	struct pnfs_layout_segment *pg_lseg;
 	struct nfs_io_completion *pg_io_completion;
 	struct nfs_direct_req	*pg_dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	void			*pg_netfs;
+#endif
 	unsigned int		pg_bsize;	/* default bsize for mirrors */
 
 	u32			pg_mirror_count;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e86cf6642d21..e196ef595908 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
 	const struct nfs_rw_ops	*rw_ops;
 	struct nfs_io_completion *io_completion;
 	struct nfs_direct_req	*dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	void			*netfs;
+#endif
 
 	int			pnfs_error;
 	int			error;		/* merge with pnfs_error */
-- 
cgit v1.2.3


From 0631d5e02a1c722b90c11a47dcb6d83ef88ab47b Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Mon, 20 Feb 2023 08:43:07 -0500
Subject: NFS: Remove all NFSIOS_FSCACHE counters due to conversion to netfs
 API

The old NFSIOS_FSCACHE counters are no longer accurate or useful with
the conversion to the new netfs API.  The new API does not have a page
based interface, and so the counters in nfs_stat_fscachecounters are
no longer obtainable.  The new netfs the API has extensive statistics
inside /proc/fs/fscache/stats so we no longer need NFS specific fscache
stats.

Note this also removes the 'fsc:' line from /proc/self/mountstats so
it will be a user-visible change.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Daire Byrne <daire@dneg.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/iostat.h            | 17 -----------------
 fs/nfs/super.c             | 11 -----------
 include/linux/nfs_iostat.h | 12 ------------
 3 files changed, 40 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 2ddaab1ac653..5aa776b5a3e7 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -17,9 +17,6 @@
 
 struct nfs_iostats {
 	unsigned long long	bytes[__NFSIOS_BYTESMAX];
-#ifdef CONFIG_NFS_FSCACHE
-	unsigned long long	fscache[__NFSIOS_FSCACHEMAX];
-#endif
 	unsigned long		events[__NFSIOS_COUNTSMAX];
 } ____cacheline_aligned;
 
@@ -49,20 +46,6 @@ static inline void nfs_add_stats(const struct inode *inode,
 	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
 
-#ifdef CONFIG_NFS_FSCACHE
-static inline void nfs_add_fscache_stats(struct inode *inode,
-					 enum nfs_stat_fscachecounters stat,
-					 long addend)
-{
-	this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
-}
-static inline void nfs_inc_fscache_stats(struct inode *inode,
-					 enum nfs_stat_fscachecounters stat)
-{
-	this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
-}
-#endif
-
 static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 05ae23657527..90796db8c205 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -692,10 +692,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 			totals.events[i] += stats->events[i];
 		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 			totals.bytes[i] += stats->bytes[i];
-#ifdef CONFIG_NFS_FSCACHE
-		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
-			totals.fscache[i] += stats->fscache[i];
-#endif
 
 		preempt_enable();
 	}
@@ -706,13 +702,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 	seq_puts(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
-#ifdef CONFIG_NFS_FSCACHE
-	if (nfss->options & NFS_OPTION_FSCACHE) {
-		seq_puts(m, "\n\tfsc:\t");
-		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
-			seq_printf(m, "%Lu ", totals.fscache[i]);
-	}
-#endif
 	seq_putc(m, '\n');
 
 	rpc_clnt_show_stats(m, nfss->client);
diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
index 027874c36c88..8d946089d151 100644
--- a/include/linux/nfs_iostat.h
+++ b/include/linux/nfs_iostat.h
@@ -119,16 +119,4 @@ enum nfs_stat_eventcounters {
 	__NFSIOS_COUNTSMAX,
 };
 
-/*
- * NFS local caching servicing counters
- */
-enum nfs_stat_fscachecounters {
-	NFSIOS_FSCACHE_PAGES_READ_OK,
-	NFSIOS_FSCACHE_PAGES_READ_FAIL,
-	NFSIOS_FSCACHE_PAGES_WRITTEN_OK,
-	NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL,
-	NFSIOS_FSCACHE_PAGES_UNCACHED,
-	__NFSIOS_FSCACHEMAX,
-};
-
 #endif	/* _LINUX_NFS_IOSTAT */
-- 
cgit v1.2.3


From 03f5bd75a4c19714a929a0f4e7bbf36b49e874c1 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Mon, 20 Feb 2023 08:43:08 -0500
Subject: NFS: Remove fscache specific trace points and NFS_INO_FSCACHE bit

The NFS specific trace points are no longer needed as tracing is well
covered by netfs and fscache.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Daire Byrne <daire@dneg.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfstrace.h      | 91 --------------------------------------------------
 include/linux/nfs_fs.h |  1 -
 2 files changed, 92 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a778713343df..4e90ca531176 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
 			{ BIT(NFS_INO_STALE), "STALE" }, \
 			{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
 			{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
-			{ BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
 			{ BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
 			{ BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
 			{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
@@ -1243,96 +1242,6 @@ TRACE_EVENT(nfs_readpage_short,
 		)
 );
 
-DECLARE_EVENT_CLASS(nfs_fscache_page_event,
-		TP_PROTO(
-			const struct inode *inode,
-			struct page *page
-		),
-
-		TP_ARGS(inode, page),
-
-		TP_STRUCT__entry(
-			__field(dev_t, dev)
-			__field(u32, fhandle)
-			__field(u64, fileid)
-			__field(loff_t, offset)
-		),
-
-		TP_fast_assign(
-			const struct nfs_inode *nfsi = NFS_I(inode);
-			const struct nfs_fh *fh = &nfsi->fh;
-
-			__entry->offset = page_index(page) << PAGE_SHIFT;
-			__entry->dev = inode->i_sb->s_dev;
-			__entry->fileid = nfsi->fileid;
-			__entry->fhandle = nfs_fhandle_hash(fh);
-		),
-
-		TP_printk(
-			"fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld",
-			MAJOR(__entry->dev), MINOR(__entry->dev),
-			(unsigned long long)__entry->fileid,
-			__entry->fhandle,
-			(long long)__entry->offset
-		)
-);
-DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
-		TP_PROTO(
-			const struct inode *inode,
-			struct page *page,
-			int error
-		),
-
-		TP_ARGS(inode, page, error),
-
-		TP_STRUCT__entry(
-			__field(int, error)
-			__field(dev_t, dev)
-			__field(u32, fhandle)
-			__field(u64, fileid)
-			__field(loff_t, offset)
-		),
-
-		TP_fast_assign(
-			const struct nfs_inode *nfsi = NFS_I(inode);
-			const struct nfs_fh *fh = &nfsi->fh;
-
-			__entry->offset = page_index(page) << PAGE_SHIFT;
-			__entry->dev = inode->i_sb->s_dev;
-			__entry->fileid = nfsi->fileid;
-			__entry->fhandle = nfs_fhandle_hash(fh);
-			__entry->error = error;
-		),
-
-		TP_printk(
-			"fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld error=%d",
-			MAJOR(__entry->dev), MINOR(__entry->dev),
-			(unsigned long long)__entry->fileid,
-			__entry->fhandle,
-			(long long)__entry->offset, __entry->error
-		)
-);
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
-	DEFINE_EVENT(nfs_fscache_page_event, name, \
-			TP_PROTO( \
-				const struct inode *inode, \
-				struct page *page \
-			), \
-			TP_ARGS(inode, page))
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
-	DEFINE_EVENT(nfs_fscache_page_event_done, name, \
-			TP_PROTO( \
-				const struct inode *inode, \
-				struct page *page, \
-				int error \
-			), \
-			TP_ARGS(inode, page, error))
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
 
 TRACE_EVENT(nfs_pgio_error,
 	TP_PROTO(
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 041e79a48f7a..12bb868f9a18 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -281,7 +281,6 @@ struct nfs4_copy_state {
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_PRESERVE_UNLINKED (4)		/* preserve file if removed while open */
-#define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
-- 
cgit v1.2.3


From 67abe9c6a8077819aae490dcd3b9629c2e87bfc2 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 4 Apr 2023 13:02:50 +0200
Subject: ACPI: video: Remove register_backlight_delay module option and code

Since commit 5aa9d943e9b6 ("ACPI: video: Don't enable fallback path for
creating ACPI backlight by default"), the delayed registering of
acpi_video# backlight devices has been disabled by default.

The few bugreports where this option was used as a workaround were all
cases where the GPU driver did not call acpi_video_register_backlight()
and the workaround was to pass video.register_backlight_delay=1.

With the recent "ACPI: video: Make acpi_backlight=video work independent
from GPU driver" changes acpi_backlight=video can be used to achieve
the same result. So there is no need for the register_backlight_delay
option + code anymore.

Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_video.c                         | 38 -----------------------
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  4 ---
 include/acpi/video.h                              |  2 --
 3 files changed, 44 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index c7a6d0b69dab..62f4364e4460 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -70,12 +70,6 @@ module_param(device_id_scheme, bool, 0444);
 static int only_lcd = -1;
 module_param(only_lcd, int, 0444);
 
-static int register_backlight_delay;
-module_param(register_backlight_delay, int, 0444);
-MODULE_PARM_DESC(register_backlight_delay,
-	"Delay in seconds before doing fallback (non GPU driver triggered) "
-	"backlight registration, set to 0 to disable.");
-
 static bool may_report_brightness_keys;
 static int register_count;
 static DEFINE_MUTEX(register_count_mutex);
@@ -84,9 +78,6 @@ static LIST_HEAD(video_bus_head);
 static int acpi_video_bus_add(struct acpi_device *device);
 static void acpi_video_bus_remove(struct acpi_device *device);
 static void acpi_video_bus_notify(struct acpi_device *device, u32 event);
-static void acpi_video_bus_register_backlight_work(struct work_struct *ignored);
-static DECLARE_DELAYED_WORK(video_bus_register_backlight_work,
-			    acpi_video_bus_register_backlight_work);
 
 /*
  * Indices in the _BCL method response: the first two items are special,
@@ -2096,11 +2087,6 @@ static void acpi_video_bus_remove(struct acpi_device *device)
 	kfree(video);
 }
 
-static void acpi_video_bus_register_backlight_work(struct work_struct *ignored)
-{
-	acpi_video_register_backlight();
-}
-
 static int __init is_i740(struct pci_dev *dev)
 {
 	if (dev->device == 0x00D1)
@@ -2183,17 +2169,6 @@ static bool should_check_lcd_flag(void)
 	return false;
 }
 
-/*
- * At least one graphics driver has reported that no LCD is connected
- * via the native interface. cancel the registration for fallback acpi_video0.
- * If another driver still deems this necessary, it can explicitly register it.
- */
-void acpi_video_report_nolcd(void)
-{
-	cancel_delayed_work(&video_bus_register_backlight_work);
-}
-EXPORT_SYMBOL(acpi_video_report_nolcd);
-
 int acpi_video_register(void)
 {
 	int ret = 0;
@@ -2222,18 +2197,6 @@ int acpi_video_register(void)
 	 */
 	register_count = 1;
 
-	/*
-	 * acpi_video_bus_add() skips registering the userspace visible
-	 * backlight_device. The intend is for this to be registered by the
-	 * drm/kms driver calling acpi_video_register_backlight() *after* it is
-	 * done setting up its own native backlight device. The delayed work
-	 * ensures that acpi_video_register_backlight() always gets called
-	 * eventually, in case there is no drm/kms driver or it is disabled.
-	 */
-	if (register_backlight_delay)
-		schedule_delayed_work(&video_bus_register_backlight_work,
-				      register_backlight_delay * HZ);
-
 leave:
 	mutex_unlock(&register_count_mutex);
 	return ret;
@@ -2244,7 +2207,6 @@ void acpi_video_unregister(void)
 {
 	mutex_lock(&register_count_mutex);
 	if (register_count) {
-		cancel_delayed_work_sync(&video_bus_register_backlight_work);
 		acpi_bus_unregister_driver(&acpi_video_bus);
 		register_count = 0;
 		may_report_brightness_keys = false;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index a01fd41643fc..d8b3e1224ec2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4445,10 +4445,6 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev)
 		amdgpu_set_panel_orientation(&aconnector->base);
 	}
 
-	/* If we didn't find a panel, notify the acpi video detection */
-	if (dm->adev->flags & AMD_IS_APU && dm->num_of_edps == 0)
-		acpi_video_report_nolcd();
-
 	/* Software is initialized. Now we can register interrupt handlers. */
 	switch (adev->asic_type) {
 #if defined(CONFIG_DRM_AMD_DC_SI)
diff --git a/include/acpi/video.h b/include/acpi/video.h
index ff5a8da5d883..4230392b5b0b 100644
--- a/include/acpi/video.h
+++ b/include/acpi/video.h
@@ -53,7 +53,6 @@ enum acpi_backlight_type {
 };
 
 #if IS_ENABLED(CONFIG_ACPI_VIDEO)
-extern void acpi_video_report_nolcd(void);
 extern int acpi_video_register(void);
 extern void acpi_video_unregister(void);
 extern void acpi_video_register_backlight(void);
@@ -81,7 +80,6 @@ static inline bool acpi_video_backlight_use_native(void)
 	return __acpi_video_get_backlight_type(true, NULL) == acpi_backlight_native;
 }
 #else
-static inline void acpi_video_report_nolcd(void) { return; };
 static inline int acpi_video_register(void) { return -ENODEV; }
 static inline void acpi_video_unregister(void) { return; }
 static inline void acpi_video_register_backlight(void) { return; }
-- 
cgit v1.2.3


From 3db63daabe210af32a09533fe7d8d47c711a103c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 22 Mar 2023 09:27:04 +1100
Subject: NFSv3: handle out-of-order write replies.

NFSv3 includes pre/post wcc attributes which allow the client to
determine if all changes to the file have been made by the client
itself, or if any might have been made by some other client.

If there are gaps in the pre/post ctime sequence it must be assumed that
some other client changed the file in that gap and the local cache must
be suspect.  The next time the file is opened the cache should be
invalidated.

Since Commit 1c341b777501 ("NFS: Add deferred cache invalidation for
close-to-open consistency violations") in linux 5.3 the Linux client has
been triggering this invalidation.  The chunk in nfs_update_inode() in
particularly triggers.

Unfortunately Linux NFS assumes that all replies will be processed in
the order sent, and will arrive in the order processed.  This is not
true in general.  Consequently Linux NFS might ignore the wcc info in a
WRITE reply because the reply is in response to a WRITE that was sent
before some other request for which a reply has already been seen.  This
is detected by Linux using the gencount tests in nfs_inode_attr_cmp().

Also, when the gencount tests pass it is still possible that the request
were processed on the server in a different order, and a gap seen in
the ctime sequence might be filled in by a subsequent reply, so gaps
should not immediately trigger delayed invalidation.

The net result is that writing to a server and then reading the file
back can result in going to the server for the read rather than serving
it from cache - all because a couple of replies arrived out-of-order.
This is a performance regression over kernels before 5.3, though the
change in 5.3 is a correctness improvement.

This has been seen with Linux writing to a Netapp server which
occasionally re-orders requests.  In testing the majority of requests
were in-order, but a few (maybe 2 or three at a time) could be
re-ordered.

This patch addresses the problem by recording any gaps seen in the
pre/post ctime sequence and not triggering invalidation until either
there are too many gaps to fit in the table, or until there are no more
active writes and the remaining gaps cannot be resolved.

We allocate a table of 16 gaps on demand.  If the allocation fails we
revert to current behaviour which is of little cost as we are unlikely
to be able to cache the writes anyway.

In the table we store "start->end" pair when iversion is updated and
"end<-start" pairs pre/post pairs reported by the server.  Usually these
exactly cancel out and so nothing is stored.  When there are
out-of-order replies we do store gaps and these will eventually be
cancelled against later replies when this client is the only writer.

If the final write is out-of-order there may be one gap remaining when
the file is closed.  This will be noticed and if there is precisely on
gap and if the iversion can be advanced to match it, then we do so.

This patch makes no attempt to handle directories correctly.  The same
problem potentially exists in the out-of-order replies to create/unlink
requests can cause future lookup requires to be sent to the server
unnecessarily.  A similar scheme using the same primitives could be used
to notice and handle out-of-order replies.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/inode.c         | 112 ++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/nfs_fs.h |  47 +++++++++++++++++++++
 2 files changed, 144 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5c8027e3c961..eb8af1e404d9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 
 	nfsi->cache_validity |= flags;
 
-	if (inode->i_mapping->nrpages == 0)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
-					  NFS_INO_DATA_INVAL_DEFER);
-	else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
-		nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+	if (inode->i_mapping->nrpages == 0) {
+		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+		nfs_ooo_clear(nfsi);
+	} else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+		nfs_ooo_clear(nfsi);
+	}
 	trace_nfs_set_cache_invalid(inode, 0);
 }
 EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 	trace_nfs_size_truncate(inode, offset);
 	i_size_write(inode, offset);
 	/* Optimisation */
-	if (offset == 0)
-		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
-				NFS_INO_DATA_INVAL_DEFER);
+	if (offset == 0) {
+		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+		nfs_ooo_clear(NFS_I(inode));
+	}
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
 
 	spin_unlock(&inode->i_lock);
@@ -1109,7 +1111,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 
 	spin_lock(&inode->i_lock);
 	if (list_empty(&nfsi->open_files) &&
-	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+	    nfs_ooo_test(nfsi))
 		nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
 						     NFS_INO_REVAL_FORCED);
 	list_add_tail_rcu(&ctx->list, &nfsi->open_files);
@@ -1353,8 +1355,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &=
-		~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	nfs_ooo_clear(nfsi);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1816,6 +1818,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
 	return 0;
 }
 
+static void nfs_ooo_merge(struct nfs_inode *nfsi,
+			  u64 start, u64 end)
+{
+	int i, cnt;
+
+	if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)
+		/* No point merging anything */
+		return;
+
+	if (!nfsi->ooo) {
+		nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC);
+		if (!nfsi->ooo) {
+			nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			return;
+		}
+		nfsi->ooo->cnt = 0;
+	}
+
+	/* add this range, merging if possible */
+	cnt = nfsi->ooo->cnt;
+	for (i = 0; i < cnt; i++) {
+		if (end == nfsi->ooo->gap[i].start)
+			end = nfsi->ooo->gap[i].end;
+		else if (start == nfsi->ooo->gap[i].end)
+			start = nfsi->ooo->gap[i].start;
+		else
+			continue;
+		/* Remove 'i' from table and loop to insert the new range */
+		cnt -= 1;
+		nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt];
+		i = -1;
+	}
+	if (start != end) {
+		if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) {
+			nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			kfree(nfsi->ooo);
+			nfsi->ooo = NULL;
+			return;
+		}
+		nfsi->ooo->gap[cnt].start = start;
+		nfsi->ooo->gap[cnt].end = end;
+		cnt += 1;
+	}
+	nfsi->ooo->cnt = cnt;
+}
+
+static void nfs_ooo_record(struct nfs_inode *nfsi,
+			   struct nfs_fattr *fattr)
+{
+	/* This reply was out-of-order, so record in the
+	 * pre/post change id, possibly cancelling
+	 * gaps created when iversion was jumpped forward.
+	 */
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+	    (fattr->valid & NFS_ATTR_FATTR_PRECHANGE))
+		nfs_ooo_merge(nfsi,
+			      fattr->change_attr,
+			      fattr->pre_change_attr);
+}
+
 static int nfs_refresh_inode_locked(struct inode *inode,
 				    struct nfs_fattr *fattr)
 {
@@ -1826,8 +1888,12 @@ static int nfs_refresh_inode_locked(struct inode *inode,
 
 	if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
 		ret = nfs_update_inode(inode, fattr);
-	else if (attr_cmp == 0)
-		ret = nfs_check_inode_attributes(inode, fattr);
+	else {
+		nfs_ooo_record(NFS_I(inode), fattr);
+
+		if (attr_cmp == 0)
+			ret = nfs_check_inode_attributes(inode, fattr);
+	}
 
 	trace_nfs_refresh_inode_exit(inode, ret);
 	return ret;
@@ -1918,6 +1984,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
 	if (attr_cmp < 0)
 		return 0;
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
+		/* Record the pre/post change info before clearing PRECHANGE */
+		nfs_ooo_record(NFS_I(inode), fattr);
 		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
 				| NFS_ATTR_FATTR_PRESIZE
 				| NFS_ATTR_FATTR_PREMTIME
@@ -2072,6 +2140,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 &&
+		    nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) {
+			/* There is one remaining gap that hasn't been
+			 * merged into iversion - do that now.
+			 */
+			inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start);
+			kfree(nfsi->ooo);
+			nfsi->ooo = NULL;
+		}
 		if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
 			/* Could it be a race with writeback? */
 			if (!(have_writers || have_delegation)) {
@@ -2093,8 +2170,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				dprintk("NFS: change_attr change on server for file %s/%ld\n",
 						inode->i_sb->s_id,
 						inode->i_ino);
-			} else if (!have_delegation)
-				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			} else if (!have_delegation) {
+				nfs_ooo_record(nfsi, fattr);
+				nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode),
+					      fattr->change_attr);
+			}
 			inode_set_iversion_raw(inode, fattr->change_attr);
 		}
 	} else {
@@ -2248,6 +2328,7 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	nfsi->flags = 0UL;
 	nfsi->cache_validity = 0UL;
+	nfsi->ooo = NULL;
 #if IS_ENABLED(CONFIG_NFS_V4)
 	nfsi->nfs4_acl = NULL;
 #endif /* CONFIG_NFS_V4 */
@@ -2262,6 +2343,7 @@ EXPORT_SYMBOL_GPL(nfs_alloc_inode);
 
 void nfs_free_inode(struct inode *inode)
 {
+	kfree(NFS_I(inode)->ooo);
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 EXPORT_SYMBOL_GPL(nfs_free_inode);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 12bb868f9a18..279262057a92 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -195,6 +195,39 @@ struct nfs_inode {
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
 
+	/* Keep track of out-of-order replies.
+	 * The ooo array contains start/end pairs of
+	 * numbers from the changeid sequence when
+	 * the inode's iversion has been updated.
+	 * It also contains end/start pair (i.e. reverse order)
+	 * of sections of the changeid sequence that have
+	 * been seen in replies from the server.
+	 * Normally these should match and when both
+	 * A:B and B:A are found in ooo, they are both removed.
+	 * And if a reply with A:B causes an iversion update
+	 * of A:B, then neither are added.
+	 * When a reply has pre_change that doesn't match
+	 * iversion, then the changeid pair and any consequent
+	 * change in iversion ARE added.  Later replies
+	 * might fill in the gaps, or possibly a gap is caused
+	 * by a change from another client.
+	 * When a file or directory is opened, if the ooo table
+	 * is not empty, then we assume the gaps were due to
+	 * another client and we invalidate the cached data.
+	 *
+	 * We can only track a limited number of concurrent gaps.
+	 * Currently that limit is 16.
+	 * We allocate the table on demand.  If there is insufficient
+	 * memory, then we probably cannot cache the file anyway
+	 * so there is no loss.
+	 */
+	struct {
+		int cnt;
+		struct {
+			u64 start, end;
+		} gap[16];
+	} *ooo;
+
 #if IS_ENABLED(CONFIG_NFS_V4)
 	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
@@ -612,6 +645,20 @@ nfs_fileid_to_ino_t(u64 fileid)
 	return ino;
 }
 
+static inline void nfs_ooo_clear(struct nfs_inode *nfsi)
+{
+	nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+	kfree(nfsi->ooo);
+	nfsi->ooo = NULL;
+}
+
+static inline bool nfs_ooo_test(struct nfs_inode *nfsi)
+{
+	return (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) ||
+		(nfsi->ooo && nfsi->ooo->cnt > 0);
+
+}
+
 #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
 
 /* We need to block new opens while a file is being unlinked.
-- 
cgit v1.2.3


From 3c85f087faeca3ca9ec9e7b085e1eff370e3f0db Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Wed, 29 Mar 2023 13:13:03 +0300
Subject: scsi: ufs: mcq: Use pointer arithmetic in ufshcd_send_command()

Make sqe_base_addr the UTRD pointer it is, instead of an opaque void *.

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20230329101303.18377-3-avri.altman@wdc.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 5 +++--
 include/ufs/ufshcd.h      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 03c47f9a2750..9434328ba323 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2242,10 +2242,11 @@ void ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag,
 
 	if (is_mcq_enabled(hba)) {
 		int utrd_size = sizeof(struct utp_transfer_req_desc);
+		struct utp_transfer_req_desc *src = lrbp->utr_descriptor_ptr;
+		struct utp_transfer_req_desc *dest = hwq->sqe_base_addr + hwq->sq_tail_slot;
 
 		spin_lock(&hwq->sq_lock);
-		memcpy(hwq->sqe_base_addr + (hwq->sq_tail_slot * utrd_size),
-		       lrbp->utr_descriptor_ptr, utrd_size);
+		memcpy(dest, src, utrd_size);
 		ufshcd_inc_sq_tail(hwq);
 		spin_unlock(&hwq->sq_lock);
 	} else {
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 8b1046c21960..721ae4cd3702 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1095,7 +1095,7 @@ struct ufs_hw_queue {
 	void __iomem *mcq_cq_head;
 	void __iomem *mcq_cq_tail;
 
-	void *sqe_base_addr;
+	struct utp_transfer_req_desc *sqe_base_addr;
 	dma_addr_t sqe_dma_addr;
 	struct cq_entry *cqe_base_addr;
 	dma_addr_t cqe_dma_addr;
-- 
cgit v1.2.3


From 8eb8af4b3df5965dc65a24a32768043f39d82d59 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 27 Mar 2023 21:03:26 -0700
Subject: fsverity: use WARN_ON_ONCE instead of WARN_ON

As per Linus's suggestion
(https://lore.kernel.org/r/CAHk-=whefxRGyNGzCzG6BVeM=5vnvgb-XhSeFJVxJyAxAF8XRA@mail.gmail.com),
use WARN_ON_ONCE instead of WARN_ON.  This barely adds any extra
overhead, and it makes it so that if any of these ever becomes reachable
(they shouldn't, but that's the point), the logs can't be flooded.

Link: https://lore.kernel.org/r/20230406181542.38894-1-ebiggers@kernel.org
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/enable.c       | 4 ++--
 fs/verity/hash_algs.c    | 4 ++--
 fs/verity/open.c         | 2 +-
 include/linux/fsverity.h | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 7a0e3a84d370..541c2a277c5c 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -165,7 +165,7 @@ static int build_merkle_tree(struct file *filp,
 		}
 	}
 	/* The root hash was filled by the last call to hash_one_block(). */
-	if (WARN_ON(buffers[num_levels].filled != params->digest_size)) {
+	if (WARN_ON_ONCE(buffers[num_levels].filled != params->digest_size)) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -277,7 +277,7 @@ static int enable_verity(struct file *filp,
 		fsverity_err(inode, "%ps() failed with err %d",
 			     vops->end_enable_verity, err);
 		fsverity_free_info(vi);
-	} else if (WARN_ON(!IS_VERITY(inode))) {
+	} else if (WARN_ON_ONCE(!IS_VERITY(inode))) {
 		err = -EINVAL;
 		fsverity_free_info(vi);
 	} else {
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 13fcf31be844..ea00dbedf756 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -84,9 +84,9 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
 	}
 
 	err = -EINVAL;
-	if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm)))
+	if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm)))
 		goto err_free_tfm;
-	if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm)))
+	if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm)))
 		goto err_free_tfm;
 
 	err = mempool_init_kmalloc_pool(&alg->req_pool, 1,
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 9366b441d01c..52048b7630dc 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -83,7 +83,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 	params->log_blocks_per_page = PAGE_SHIFT - log_blocksize;
 	params->blocks_per_page = 1 << params->log_blocks_per_page;
 
-	if (WARN_ON(!is_power_of_2(params->digest_size))) {
+	if (WARN_ON_ONCE(!is_power_of_2(params->digest_size))) {
 		err = -EINVAL;
 		goto out_err;
 	}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 119a3266791f..e76605d5b36e 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -233,18 +233,18 @@ static inline int fsverity_ioctl_read_metadata(struct file *filp,
 static inline bool fsverity_verify_blocks(struct folio *folio, size_t len,
 					  size_t offset)
 {
-	WARN_ON(1);
+	WARN_ON_ONCE(1);
 	return false;
 }
 
 static inline void fsverity_verify_bio(struct bio *bio)
 {
-	WARN_ON(1);
+	WARN_ON_ONCE(1);
 }
 
 static inline void fsverity_enqueue_verify_work(struct work_struct *work)
 {
-	WARN_ON(1);
+	WARN_ON_ONCE(1);
 }
 
 #endif	/* !CONFIG_FS_VERITY */
-- 
cgit v1.2.3


From e5688f6fb9e3d0254a8fb1c714c38e407f0baa64 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Wed, 1 Mar 2023 10:33:55 +0100
Subject: net/mlx5: Add mlx5_ifc definitions for bridge multicast support

Add the required hardware definitions to mlx5_ifc: fdb_uplink_hairpin,
fdb_multi_path_any_table_limit_regc, fdb_multi_path_any_table.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e47d6c58da35..02c628f4fe26 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -880,7 +880,12 @@ enum {
 
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      fdb_to_vport_reg_c_id[0x8];
-	u8      reserved_at_8[0xd];
+	u8      reserved_at_8[0x5];
+	u8      fdb_uplink_hairpin[0x1];
+	u8      fdb_multi_path_any_table_limit_regc[0x1];
+	u8      reserved_at_f[0x3];
+	u8      fdb_multi_path_any_table[0x1];
+	u8      reserved_at_13[0x2];
 	u8      fdb_modify_header_fwd_to_table[0x1];
 	u8      fdb_ipv4_ttl_modify[0x1];
 	u8      flow_source[0x1];
-- 
cgit v1.2.3


From 9df839a711aee437390b16ee39cf0b5c1620be6a Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 23 Apr 2020 08:27:59 -0500
Subject: net/mlx5: Create a new profile for SFs

Create a new profile for SFs in order to disable the command cache.
Each function command cache consumes ~500KB of memory, when using a
large number of SFs this savings is notable on memory constarined
systems.

Use a new profile to provide for future differences between SFs and PFs.

The mr_cache not used for non-PF functions, so it is excluded from the
new profile.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Bodong Wang <bodong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c           | 6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c          | 9 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h     | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 2 +-
 include/linux/mlx5/driver.h                             | 1 +
 5 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b00e33ed05e9..d53de39539a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1802,7 +1802,7 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
 	if (in_size <= 16)
 		goto cache_miss;
 
-	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+	for (i = 0; i < dev->profile.num_cmd_caches; i++) {
 		ch = &cmd->cache[i];
 		if (in_size > ch->max_inbox_size)
 			continue;
@@ -2097,7 +2097,7 @@ static void destroy_msg_cache(struct mlx5_core_dev *dev)
 	struct mlx5_cmd_msg *n;
 	int i;
 
-	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+	for (i = 0; i < dev->profile.num_cmd_caches; i++) {
 		ch = &dev->cmd.cache[i];
 		list_for_each_entry_safe(msg, n, &ch->head, list) {
 			list_del(&msg->list);
@@ -2127,7 +2127,7 @@ static void create_msg_cache(struct mlx5_core_dev *dev)
 	int k;
 
 	/* Initialize and fill the caches with initial entries */
-	for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) {
+	for (k = 0; k < dev->profile.num_cmd_caches; k++) {
 		ch = &cmd->cache[k];
 		spin_lock_init(&ch->lock);
 		INIT_LIST_HEAD(&ch->head);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f95df73d1089..a95d1218def9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -100,15 +100,19 @@ enum {
 static struct mlx5_profile profile[] = {
 	[0] = {
 		.mask           = 0,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
 	},
 	[1] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE,
 		.log_max_qp	= 12,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
+
 	},
 	[2] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE |
 				  MLX5_PROF_MASK_MR_CACHE,
 		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
 		.mr_cache[0]	= {
 			.size	= 500,
 			.limit	= 250
@@ -174,6 +178,11 @@ static struct mlx5_profile profile[] = {
 			.limit	= 4
 		},
 	},
+	[3] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
+		.num_cmd_caches = 0,
+	},
 };
 
 static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index be0785f83083..5eaab99678ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -142,6 +142,7 @@ enum mlx5_semaphore_space_address {
 };
 
 #define MLX5_DEFAULT_PROF       2
+#define MLX5_SF_PROF		3
 
 static inline int mlx5_flexible_inlen(struct mlx5_core_dev *dev, size_t fixed,
 				      size_t item_size, size_t num_items,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
index a7377619ba6f..e2f26d0bc615 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -28,7 +28,7 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
 	mdev->priv.adev_idx = adev->id;
 	sf_dev->mdev = mdev;
 
-	err = mlx5_mdev_init(mdev, MLX5_DEFAULT_PROF);
+	err = mlx5_mdev_init(mdev, MLX5_SF_PROF);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_mdev_init on err=%d\n", err);
 		goto mdev_err;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f243bd10a5e1..135a3c8d8237 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -751,6 +751,7 @@ enum {
 struct mlx5_profile {
 	u64	mask;
 	u8	log_max_qp;
+	u8	num_cmd_caches;
 	struct {
 		int	size;
 		int	limit;
-- 
cgit v1.2.3


From 9fa7f1de3dda0bc74c964b5127b3f22bd8ef8fd2 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 30 Aug 2022 01:20:30 +0300
Subject: net/mlx5: Add mlx5_ifc bits for modify header argument

Add enum value for modify-header argument object and mlx5_bits
for the related capabilities.

Signed-off-by: Muhammad Sammar <muhammads@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 02c628f4fe26..6c84bf6eec85 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -78,12 +78,15 @@ enum {
 
 enum {
 	MLX5_OBJ_TYPE_SW_ICM = 0x0008,
+	MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT  = 0x23,
 };
 
 enum {
 	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
 	MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11),
 	MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13),
+	MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT =
+		(1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT),
 	MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39),
 };
 
@@ -321,6 +324,10 @@ enum {
 	MLX5_FT_NIC_TX_RDMA_2_NIC_TX = BIT(1),
 };
 
+enum {
+	MLX5_CMD_OP_MOD_UPDATE_HEADER_MODIFY_ARGUMENT = 0x1,
+};
+
 struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_dmac[0x1];
 	u8         outer_smac[0x1];
@@ -1927,7 +1934,14 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_750[0x4];
 	u8	   max_dynamic_vf_msix_table_size[0xc];
 
-	u8	   reserved_at_760[0x20];
+	u8         reserved_at_760[0x3];
+	u8         log_max_num_header_modify_argument[0x5];
+	u8         reserved_at_768[0x4];
+	u8         log_header_modify_argument_granularity[0x4];
+	u8         reserved_at_770[0x3];
+	u8         log_header_modify_argument_max_alloc[0x5];
+	u8         reserved_at_778[0x8];
+
 	u8	   vhca_tunnel_commands[0x40];
 	u8         match_definer_format_supported[0x40];
 };
@@ -6361,6 +6375,18 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_modify_header_arg_bits {
+	u8         reserved_at_0[0x80];
+
+	u8         reserved_at_80[0x8];
+	u8         access_pd[0x18];
+};
+
+struct mlx5_ifc_create_modify_header_arg_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_modify_header_arg_bits arg;
+};
+
 struct mlx5_ifc_create_match_definer_in_bits {
 	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
 
-- 
cgit v1.2.3


From 977c4a3e7c89d5b48fdc548773fb9ed3e6c50cf8 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 30 Aug 2022 01:20:40 +0300
Subject: net/mlx5: Add new WQE for updating flow table

Add new WQE type: FLOW_TBL_ACCESS, which will be used for
writing modify header arguments.
This type has specific control segment and special data segment.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h |  2 ++
 include/linux/mlx5/qp.h     | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index af4dd536a52c..e4aa147ab390 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -442,6 +442,8 @@ enum {
 
 	MLX5_OPCODE_UMR			= 0x25,
 
+	MLX5_OPCODE_FLOW_TBL_ACCESS	= 0x2c,
+
 	MLX5_OPCODE_ACCESS_ASO		= 0x2d,
 };
 
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index df55fbb65717..bd53cf4be7bd 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -499,6 +499,16 @@ struct mlx5_stride_block_ctrl_seg {
 	__be16		num_entries;
 };
 
+struct mlx5_wqe_flow_update_ctrl_seg {
+	__be32		flow_idx_update;
+	__be32		dest_handle;
+	u8		reserved0[40];
+};
+
+struct mlx5_wqe_header_modify_argument_update_seg {
+	u8		argument_list[64];
+};
+
 struct mlx5_core_qp {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	void (*event)		(struct mlx5_core_qp *, int);
-- 
cgit v1.2.3


From d8f48fbdfd9af268e92b33462472559d2840228c Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 21 Mar 2023 10:38:55 +0530
Subject: soundwire: amd: Add support for AMD Manager driver

AMD ACP(v6.x) IP block has two SoundWire manager devices.
Add support for
  - Manager driver probe & remove sequence
  - Helper functions to enable/disable interrupts,
    Initialize sdw manager, enable sdw pads
  - Manager driver sdw_master_ops & port_ops callbacks

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/lkml/20230310162554.699766-3-Vijendar.Mukunda@amd.com
Link: https://lore.kernel.org/r/20230321050901.115439-3-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 670 ++++++++++++++++++++++++++++++++++++++
 drivers/soundwire/amd_manager.h   | 235 +++++++++++++
 include/linux/soundwire/sdw_amd.h |  67 ++++
 3 files changed, 972 insertions(+)
 create mode 100644 drivers/soundwire/amd_manager.c
 create mode 100644 drivers/soundwire/amd_manager.h
 create mode 100644 include/linux/soundwire/sdw_amd.h

(limited to 'include')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
new file mode 100644
index 000000000000..f02522d11417
--- /dev/null
+++ b/drivers/soundwire/amd_manager.c
@@ -0,0 +1,670 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * SoundWire AMD Manager driver
+ *
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_registers.h>
+#include <linux/wait.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include "bus.h"
+#include "amd_manager.h"
+
+#define DRV_NAME "amd_sdw_manager"
+
+#define to_amd_sdw(b)	container_of(b, struct amd_sdw_manager, bus)
+
+static void amd_enable_sdw_pads(struct amd_sdw_manager *amd_manager)
+{
+	u32 sw_pad_pulldown_val;
+	u32 val;
+
+	mutex_lock(amd_manager->acp_sdw_lock);
+	val = readl(amd_manager->acp_mmio + ACP_SW_PAD_KEEPER_EN);
+	val |= amd_manager->reg_mask->sw_pad_enable_mask;
+	writel(val, amd_manager->acp_mmio + ACP_SW_PAD_KEEPER_EN);
+	usleep_range(1000, 1500);
+
+	sw_pad_pulldown_val = readl(amd_manager->acp_mmio + ACP_PAD_PULLDOWN_CTRL);
+	sw_pad_pulldown_val &= amd_manager->reg_mask->sw_pad_pulldown_mask;
+	writel(sw_pad_pulldown_val, amd_manager->acp_mmio + ACP_PAD_PULLDOWN_CTRL);
+	mutex_unlock(amd_manager->acp_sdw_lock);
+}
+
+static int amd_init_sdw_manager(struct amd_sdw_manager *amd_manager)
+{
+	u32 val;
+	int ret;
+
+	writel(AMD_SDW_ENABLE, amd_manager->mmio + ACP_SW_EN);
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_EN_STATUS, val, val, ACP_DELAY_US,
+				 AMD_SDW_TIMEOUT);
+	if (ret)
+		return ret;
+
+	/* SoundWire manager bus reset */
+	writel(AMD_SDW_BUS_RESET_REQ, amd_manager->mmio + ACP_SW_BUS_RESET_CTRL);
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_BUS_RESET_CTRL, val,
+				 (val & AMD_SDW_BUS_RESET_DONE), ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret)
+		return ret;
+
+	writel(AMD_SDW_BUS_RESET_CLEAR_REQ, amd_manager->mmio + ACP_SW_BUS_RESET_CTRL);
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_BUS_RESET_CTRL, val, !val,
+				 ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret) {
+		dev_err(amd_manager->dev, "Failed to reset SoundWire manager instance%d\n",
+			amd_manager->instance);
+		return ret;
+	}
+
+	writel(AMD_SDW_DISABLE, amd_manager->mmio + ACP_SW_EN);
+	return readl_poll_timeout(amd_manager->mmio + ACP_SW_EN_STATUS, val, !val, ACP_DELAY_US,
+				  AMD_SDW_TIMEOUT);
+}
+
+static int amd_enable_sdw_manager(struct amd_sdw_manager *amd_manager)
+{
+	u32 val;
+
+	writel(AMD_SDW_ENABLE, amd_manager->mmio + ACP_SW_EN);
+	return readl_poll_timeout(amd_manager->mmio + ACP_SW_EN_STATUS, val, val, ACP_DELAY_US,
+				  AMD_SDW_TIMEOUT);
+}
+
+static int amd_disable_sdw_manager(struct amd_sdw_manager *amd_manager)
+{
+	u32 val;
+
+	writel(AMD_SDW_DISABLE, amd_manager->mmio + ACP_SW_EN);
+	/*
+	 * After invoking manager disable sequence, check whether
+	 * manager has executed clock stop sequence. In this case,
+	 * manager should ignore checking enable status register.
+	 */
+	val = readl(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+	if (val)
+		return 0;
+	return readl_poll_timeout(amd_manager->mmio + ACP_SW_EN_STATUS, val, !val, ACP_DELAY_US,
+				  AMD_SDW_TIMEOUT);
+}
+
+static void amd_enable_sdw_interrupts(struct amd_sdw_manager *amd_manager)
+{
+	struct sdw_manager_reg_mask *reg_mask = amd_manager->reg_mask;
+	u32 val;
+
+	mutex_lock(amd_manager->acp_sdw_lock);
+	val = readl(amd_manager->acp_mmio + ACP_EXTERNAL_INTR_CNTL(amd_manager->instance));
+	val |= reg_mask->acp_sdw_intr_mask;
+	writel(val, amd_manager->acp_mmio + ACP_EXTERNAL_INTR_CNTL(amd_manager->instance));
+	mutex_unlock(amd_manager->acp_sdw_lock);
+
+	writel(AMD_SDW_IRQ_MASK_0TO7, amd_manager->mmio +
+		       ACP_SW_STATE_CHANGE_STATUS_MASK_0TO7);
+	writel(AMD_SDW_IRQ_MASK_8TO11, amd_manager->mmio +
+		       ACP_SW_STATE_CHANGE_STATUS_MASK_8TO11);
+	writel(AMD_SDW_IRQ_ERROR_MASK, amd_manager->mmio + ACP_SW_ERROR_INTR_MASK);
+}
+
+static void amd_disable_sdw_interrupts(struct amd_sdw_manager *amd_manager)
+{
+	struct sdw_manager_reg_mask *reg_mask = amd_manager->reg_mask;
+	u32 val;
+
+	mutex_lock(amd_manager->acp_sdw_lock);
+	val = readl(amd_manager->acp_mmio + ACP_EXTERNAL_INTR_CNTL(amd_manager->instance));
+	val &= ~reg_mask->acp_sdw_intr_mask;
+	writel(val, amd_manager->acp_mmio + ACP_EXTERNAL_INTR_CNTL(amd_manager->instance));
+	mutex_unlock(amd_manager->acp_sdw_lock);
+
+	writel(0x00, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_MASK_0TO7);
+	writel(0x00, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_MASK_8TO11);
+	writel(0x00, amd_manager->mmio + ACP_SW_ERROR_INTR_MASK);
+}
+
+static void amd_sdw_set_frameshape(struct amd_sdw_manager *amd_manager)
+{
+	u32 frame_size;
+
+	frame_size = (amd_manager->rows_index << 3) | amd_manager->cols_index;
+	writel(frame_size, amd_manager->mmio + ACP_SW_FRAMESIZE);
+}
+
+static void amd_sdw_ctl_word_prep(u32 *lower_word, u32 *upper_word, struct sdw_msg *msg,
+				  int cmd_offset)
+{
+	u32 upper_data;
+	u32 lower_data = 0;
+	u16 addr;
+	u8 upper_addr, lower_addr;
+	u8 data = 0;
+
+	addr = msg->addr + cmd_offset;
+	upper_addr = (addr & 0xFF00) >> 8;
+	lower_addr = addr & 0xFF;
+
+	if (msg->flags == SDW_MSG_FLAG_WRITE)
+		data = msg->buf[cmd_offset];
+
+	upper_data = FIELD_PREP(AMD_SDW_MCP_CMD_DEV_ADDR, msg->dev_num);
+	upper_data |= FIELD_PREP(AMD_SDW_MCP_CMD_COMMAND, msg->flags + 2);
+	upper_data |= FIELD_PREP(AMD_SDW_MCP_CMD_REG_ADDR_HIGH, upper_addr);
+	lower_data |= FIELD_PREP(AMD_SDW_MCP_CMD_REG_ADDR_LOW, lower_addr);
+	lower_data |= FIELD_PREP(AMD_SDW_MCP_CMD_REG_DATA, data);
+
+	*upper_word = upper_data;
+	*lower_word = lower_data;
+}
+
+static u64 amd_sdw_send_cmd_get_resp(struct amd_sdw_manager *amd_manager, u32 lower_data,
+				     u32 upper_data)
+{
+	u64 resp;
+	u32 lower_resp, upper_resp;
+	u32 sts;
+	int ret;
+
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_IMM_CMD_STS, sts,
+				 !(sts & AMD_SDW_IMM_CMD_BUSY), ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret) {
+		dev_err(amd_manager->dev, "SDW%x previous cmd status clear failed\n",
+			amd_manager->instance);
+		return ret;
+	}
+
+	if (sts & AMD_SDW_IMM_RES_VALID) {
+		dev_err(amd_manager->dev, "SDW%x manager is in bad state\n", amd_manager->instance);
+		writel(0x00, amd_manager->mmio + ACP_SW_IMM_CMD_STS);
+	}
+	writel(upper_data, amd_manager->mmio + ACP_SW_IMM_CMD_UPPER_WORD);
+	writel(lower_data, amd_manager->mmio + ACP_SW_IMM_CMD_LOWER_QWORD);
+
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_IMM_CMD_STS, sts,
+				 (sts & AMD_SDW_IMM_RES_VALID), ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret) {
+		dev_err(amd_manager->dev, "SDW%x cmd response timeout occurred\n",
+			amd_manager->instance);
+		return ret;
+	}
+	upper_resp = readl(amd_manager->mmio + ACP_SW_IMM_RESP_UPPER_WORD);
+	lower_resp = readl(amd_manager->mmio + ACP_SW_IMM_RESP_LOWER_QWORD);
+
+	writel(AMD_SDW_IMM_RES_VALID, amd_manager->mmio + ACP_SW_IMM_CMD_STS);
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_IMM_CMD_STS, sts,
+				 !(sts & AMD_SDW_IMM_RES_VALID), ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret) {
+		dev_err(amd_manager->dev, "SDW%x cmd status retry failed\n",
+			amd_manager->instance);
+		return ret;
+	}
+	resp = upper_resp;
+	resp = (resp << 32) | lower_resp;
+	return resp;
+}
+
+static enum sdw_command_response
+amd_program_scp_addr(struct amd_sdw_manager *amd_manager, struct sdw_msg *msg)
+{
+	struct sdw_msg scp_msg = {0};
+	u64 response_buf[2] = {0};
+	u32 upper_data = 0, lower_data = 0;
+	int index;
+
+	scp_msg.dev_num = msg->dev_num;
+	scp_msg.addr = SDW_SCP_ADDRPAGE1;
+	scp_msg.buf = &msg->addr_page1;
+	scp_msg.flags = SDW_MSG_FLAG_WRITE;
+	amd_sdw_ctl_word_prep(&lower_data, &upper_data, &scp_msg, 0);
+	response_buf[0] = amd_sdw_send_cmd_get_resp(amd_manager, lower_data, upper_data);
+	scp_msg.addr = SDW_SCP_ADDRPAGE2;
+	scp_msg.buf = &msg->addr_page2;
+	amd_sdw_ctl_word_prep(&lower_data, &upper_data, &scp_msg, 0);
+	response_buf[1] = amd_sdw_send_cmd_get_resp(amd_manager, lower_data, upper_data);
+
+	for (index = 0; index < 2; index++) {
+		if (response_buf[index] == -ETIMEDOUT) {
+			dev_err_ratelimited(amd_manager->dev,
+					    "SCP_addrpage command timeout for Slave %d\n",
+					    msg->dev_num);
+			return SDW_CMD_TIMEOUT;
+		} else if (!(response_buf[index] & AMD_SDW_MCP_RESP_ACK)) {
+			if (response_buf[index] & AMD_SDW_MCP_RESP_NACK) {
+				dev_err_ratelimited(amd_manager->dev,
+						    "SCP_addrpage NACKed for Slave %d\n",
+						    msg->dev_num);
+				return SDW_CMD_FAIL;
+			}
+			dev_dbg_ratelimited(amd_manager->dev, "SCP_addrpage ignored for Slave %d\n",
+					    msg->dev_num);
+			return SDW_CMD_IGNORED;
+		}
+	}
+	return SDW_CMD_OK;
+}
+
+static int amd_prep_msg(struct amd_sdw_manager *amd_manager, struct sdw_msg *msg)
+{
+	int ret;
+
+	if (msg->page) {
+		ret = amd_program_scp_addr(amd_manager, msg);
+		if (ret) {
+			msg->len = 0;
+			return ret;
+		}
+	}
+	switch (msg->flags) {
+	case SDW_MSG_FLAG_READ:
+	case SDW_MSG_FLAG_WRITE:
+		break;
+	default:
+		dev_err(amd_manager->dev, "Invalid msg cmd: %d\n", msg->flags);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static enum sdw_command_response amd_sdw_fill_msg_resp(struct amd_sdw_manager *amd_manager,
+						       struct sdw_msg *msg, u64 response,
+						       int offset)
+{
+	if (response & AMD_SDW_MCP_RESP_ACK) {
+		if (msg->flags == SDW_MSG_FLAG_READ)
+			msg->buf[offset] = FIELD_GET(AMD_SDW_MCP_RESP_RDATA, response);
+	} else {
+		if (response == -ETIMEDOUT) {
+			dev_err_ratelimited(amd_manager->dev, "command timeout for Slave %d\n",
+					    msg->dev_num);
+			return SDW_CMD_TIMEOUT;
+		} else if (response & AMD_SDW_MCP_RESP_NACK) {
+			dev_err_ratelimited(amd_manager->dev,
+					    "command response NACK received for Slave %d\n",
+					    msg->dev_num);
+			return SDW_CMD_FAIL;
+		}
+		dev_err_ratelimited(amd_manager->dev, "command is ignored for Slave %d\n",
+				    msg->dev_num);
+		return SDW_CMD_IGNORED;
+	}
+	return SDW_CMD_OK;
+}
+
+static unsigned int _amd_sdw_xfer_msg(struct amd_sdw_manager *amd_manager, struct sdw_msg *msg,
+				      int cmd_offset)
+{
+	u64 response;
+	u32 upper_data = 0, lower_data = 0;
+
+	amd_sdw_ctl_word_prep(&lower_data, &upper_data, msg, cmd_offset);
+	response = amd_sdw_send_cmd_get_resp(amd_manager, lower_data, upper_data);
+	return amd_sdw_fill_msg_resp(amd_manager, msg, response, cmd_offset);
+}
+
+static enum sdw_command_response amd_sdw_xfer_msg(struct sdw_bus *bus, struct sdw_msg *msg)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	int ret, i;
+
+	ret = amd_prep_msg(amd_manager, msg);
+	if (ret)
+		return SDW_CMD_FAIL_OTHER;
+	for (i = 0; i < msg->len; i++) {
+		ret = _amd_sdw_xfer_msg(amd_manager, msg, i);
+		if (ret)
+			return ret;
+	}
+	return SDW_CMD_OK;
+}
+
+static u32 amd_sdw_read_ping_status(struct sdw_bus *bus)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	u64 response;
+	u32 slave_stat;
+
+	response = amd_sdw_send_cmd_get_resp(amd_manager, 0, 0);
+	/* slave status from ping response */
+	slave_stat = FIELD_GET(AMD_SDW_MCP_SLAVE_STAT_0_3, response);
+	slave_stat |= FIELD_GET(AMD_SDW_MCP_SLAVE_STAT_4_11, response) << 8;
+	dev_dbg(amd_manager->dev, "slave_stat:0x%x\n", slave_stat);
+	return slave_stat;
+}
+
+static int amd_sdw_compute_params(struct sdw_bus *bus)
+{
+	struct sdw_transport_data t_data = {0};
+	struct sdw_master_runtime *m_rt;
+	struct sdw_port_runtime *p_rt;
+	struct sdw_bus_params *b_params = &bus->params;
+	int port_bo, hstart, hstop, sample_int;
+	unsigned int rate, bps;
+
+	port_bo = 0;
+	hstart = 1;
+	hstop = bus->params.col - 1;
+	t_data.hstop = hstop;
+	t_data.hstart = hstart;
+
+	list_for_each_entry(m_rt, &bus->m_rt_list, bus_node) {
+		rate = m_rt->stream->params.rate;
+		bps = m_rt->stream->params.bps;
+		sample_int = (bus->params.curr_dr_freq / rate);
+		list_for_each_entry(p_rt, &m_rt->port_list, port_node) {
+			port_bo = (p_rt->num * 64) + 1;
+			dev_dbg(bus->dev, "p_rt->num=%d hstart=%d hstop=%d port_bo=%d\n",
+				p_rt->num, hstart, hstop, port_bo);
+			sdw_fill_xport_params(&p_rt->transport_params, p_rt->num,
+					      false, SDW_BLK_GRP_CNT_1, sample_int,
+					      port_bo, port_bo >> 8, hstart, hstop,
+					      SDW_BLK_PKG_PER_PORT, 0x0);
+
+			sdw_fill_port_params(&p_rt->port_params,
+					     p_rt->num, bps,
+					     SDW_PORT_FLOW_MODE_ISOCH,
+					     b_params->m_data_mode);
+			t_data.hstart = hstart;
+			t_data.hstop = hstop;
+			t_data.block_offset = port_bo;
+			t_data.sub_block_offset = 0;
+		}
+		sdw_compute_slave_ports(m_rt, &t_data);
+	}
+	return 0;
+}
+
+static int amd_sdw_port_params(struct sdw_bus *bus, struct sdw_port_params *p_params,
+			       unsigned int bank)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	u32 frame_fmt_reg, dpn_frame_fmt;
+
+	dev_dbg(amd_manager->dev, "p_params->num:0x%x\n", p_params->num);
+	switch (amd_manager->instance) {
+	case ACP_SDW0:
+		frame_fmt_reg = sdw0_manager_dp_reg[p_params->num].frame_fmt_reg;
+		break;
+	case ACP_SDW1:
+		frame_fmt_reg = sdw1_manager_dp_reg[p_params->num].frame_fmt_reg;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dpn_frame_fmt = readl(amd_manager->mmio + frame_fmt_reg);
+	u32p_replace_bits(&dpn_frame_fmt, p_params->flow_mode, AMD_DPN_FRAME_FMT_PFM);
+	u32p_replace_bits(&dpn_frame_fmt, p_params->data_mode, AMD_DPN_FRAME_FMT_PDM);
+	u32p_replace_bits(&dpn_frame_fmt, p_params->bps - 1, AMD_DPN_FRAME_FMT_WORD_LEN);
+	writel(dpn_frame_fmt, amd_manager->mmio + frame_fmt_reg);
+	return 0;
+}
+
+static int amd_sdw_transport_params(struct sdw_bus *bus,
+				    struct sdw_transport_params *params,
+				    enum sdw_reg_bank bank)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	u32 dpn_frame_fmt;
+	u32 dpn_sampleinterval;
+	u32 dpn_hctrl;
+	u32 dpn_offsetctrl;
+	u32 dpn_lanectrl;
+	u32 frame_fmt_reg, sample_int_reg, hctrl_dp0_reg;
+	u32 offset_reg, lane_ctrl_ch_en_reg;
+
+	switch (amd_manager->instance) {
+	case ACP_SDW0:
+		frame_fmt_reg = sdw0_manager_dp_reg[params->port_num].frame_fmt_reg;
+		sample_int_reg = sdw0_manager_dp_reg[params->port_num].sample_int_reg;
+		hctrl_dp0_reg = sdw0_manager_dp_reg[params->port_num].hctrl_dp0_reg;
+		offset_reg = sdw0_manager_dp_reg[params->port_num].offset_reg;
+		lane_ctrl_ch_en_reg = sdw0_manager_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
+		break;
+	case ACP_SDW1:
+		frame_fmt_reg = sdw1_manager_dp_reg[params->port_num].frame_fmt_reg;
+		sample_int_reg = sdw1_manager_dp_reg[params->port_num].sample_int_reg;
+		hctrl_dp0_reg = sdw1_manager_dp_reg[params->port_num].hctrl_dp0_reg;
+		offset_reg = sdw1_manager_dp_reg[params->port_num].offset_reg;
+		lane_ctrl_ch_en_reg = sdw1_manager_dp_reg[params->port_num].lane_ctrl_ch_en_reg;
+		break;
+	default:
+		return -EINVAL;
+	}
+	writel(AMD_SDW_SSP_COUNTER_VAL, amd_manager->mmio + ACP_SW_SSP_COUNTER);
+
+	dpn_frame_fmt = readl(amd_manager->mmio + frame_fmt_reg);
+	u32p_replace_bits(&dpn_frame_fmt, params->blk_pkg_mode, AMD_DPN_FRAME_FMT_BLK_PKG_MODE);
+	u32p_replace_bits(&dpn_frame_fmt, params->blk_grp_ctrl, AMD_DPN_FRAME_FMT_BLK_GRP_CTRL);
+	u32p_replace_bits(&dpn_frame_fmt, SDW_STREAM_PCM, AMD_DPN_FRAME_FMT_PCM_OR_PDM);
+	writel(dpn_frame_fmt, amd_manager->mmio + frame_fmt_reg);
+
+	dpn_sampleinterval = params->sample_interval - 1;
+	writel(dpn_sampleinterval, amd_manager->mmio + sample_int_reg);
+
+	dpn_hctrl = FIELD_PREP(AMD_DPN_HCTRL_HSTOP, params->hstop);
+	dpn_hctrl |= FIELD_PREP(AMD_DPN_HCTRL_HSTART, params->hstart);
+	writel(dpn_hctrl, amd_manager->mmio + hctrl_dp0_reg);
+
+	dpn_offsetctrl = FIELD_PREP(AMD_DPN_OFFSET_CTRL_1, params->offset1);
+	dpn_offsetctrl |= FIELD_PREP(AMD_DPN_OFFSET_CTRL_2, params->offset2);
+	writel(dpn_offsetctrl, amd_manager->mmio + offset_reg);
+
+	/*
+	 * lane_ctrl_ch_en_reg will be used to program lane_ctrl and ch_mask
+	 * parameters.
+	 */
+	dpn_lanectrl = readl(amd_manager->mmio + lane_ctrl_ch_en_reg);
+	u32p_replace_bits(&dpn_lanectrl, params->lane_ctrl, AMD_DPN_CH_EN_LCTRL);
+	writel(dpn_lanectrl, amd_manager->mmio + lane_ctrl_ch_en_reg);
+	return 0;
+}
+
+static int amd_sdw_port_enable(struct sdw_bus *bus,
+			       struct sdw_enable_ch *enable_ch,
+			       unsigned int bank)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	u32 dpn_ch_enable;
+	u32 lane_ctrl_ch_en_reg;
+
+	switch (amd_manager->instance) {
+	case ACP_SDW0:
+		lane_ctrl_ch_en_reg = sdw0_manager_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
+		break;
+	case ACP_SDW1:
+		lane_ctrl_ch_en_reg = sdw1_manager_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * lane_ctrl_ch_en_reg will be used to program lane_ctrl and ch_mask
+	 * parameters.
+	 */
+	dpn_ch_enable = readl(amd_manager->mmio + lane_ctrl_ch_en_reg);
+	u32p_replace_bits(&dpn_ch_enable, enable_ch->ch_mask, AMD_DPN_CH_EN_CHMASK);
+	if (enable_ch->enable)
+		writel(dpn_ch_enable, amd_manager->mmio + lane_ctrl_ch_en_reg);
+	else
+		writel(0, amd_manager->mmio + lane_ctrl_ch_en_reg);
+	return 0;
+}
+
+static int sdw_master_read_amd_prop(struct sdw_bus *bus)
+{
+	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
+	struct fwnode_handle *link;
+	struct sdw_master_prop *prop;
+	u32 quirk_mask = 0;
+	u32 wake_en_mask = 0;
+	u32 power_mode_mask = 0;
+	char name[32];
+
+	prop = &bus->prop;
+	/* Find manager handle */
+	snprintf(name, sizeof(name), "mipi-sdw-link-%d-subproperties", bus->link_id);
+	link = device_get_named_child_node(bus->dev, name);
+	if (!link) {
+		dev_err(bus->dev, "Manager node %s not found\n", name);
+		return -EIO;
+	}
+	fwnode_property_read_u32(link, "amd-sdw-enable", &quirk_mask);
+	if (!(quirk_mask & AMD_SDW_QUIRK_MASK_BUS_ENABLE))
+		prop->hw_disabled = true;
+	prop->quirks = SDW_MASTER_QUIRKS_CLEAR_INITIAL_CLASH |
+		       SDW_MASTER_QUIRKS_CLEAR_INITIAL_PARITY;
+
+	fwnode_property_read_u32(link, "amd-sdw-wakeup-enable", &wake_en_mask);
+	amd_manager->wake_en_mask = wake_en_mask;
+	fwnode_property_read_u32(link, "amd-sdw-power-mode", &power_mode_mask);
+	amd_manager->power_mode_mask = power_mode_mask;
+	return 0;
+}
+
+static int amd_prop_read(struct sdw_bus *bus)
+{
+	sdw_master_read_prop(bus);
+	sdw_master_read_amd_prop(bus);
+	return 0;
+}
+
+static const struct sdw_master_port_ops amd_sdw_port_ops = {
+	.dpn_set_port_params = amd_sdw_port_params,
+	.dpn_set_port_transport_params = amd_sdw_transport_params,
+	.dpn_port_enable_ch = amd_sdw_port_enable,
+};
+
+static const struct sdw_master_ops amd_sdw_ops = {
+	.read_prop = amd_prop_read,
+	.xfer_msg = amd_sdw_xfer_msg,
+	.read_ping_status = amd_sdw_read_ping_status,
+};
+
+static void amd_sdw_probe_work(struct work_struct *work)
+{
+	struct amd_sdw_manager *amd_manager = container_of(work, struct amd_sdw_manager,
+							   probe_work);
+	struct sdw_master_prop *prop;
+	int ret;
+
+	prop = &amd_manager->bus.prop;
+	if (!prop->hw_disabled) {
+		amd_enable_sdw_pads(amd_manager);
+		ret = amd_init_sdw_manager(amd_manager);
+		if (ret)
+			return;
+		amd_enable_sdw_interrupts(amd_manager);
+		ret = amd_enable_sdw_manager(amd_manager);
+		if (ret)
+			return;
+		amd_sdw_set_frameshape(amd_manager);
+	}
+}
+
+static int amd_sdw_manager_probe(struct platform_device *pdev)
+{
+	const struct acp_sdw_pdata *pdata = pdev->dev.platform_data;
+	struct resource *res;
+	struct device *dev = &pdev->dev;
+	struct sdw_master_prop *prop;
+	struct sdw_bus_params *params;
+	struct amd_sdw_manager *amd_manager;
+	int ret;
+
+	amd_manager = devm_kzalloc(dev, sizeof(struct amd_sdw_manager), GFP_KERNEL);
+	if (!amd_manager)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	amd_manager->acp_mmio = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(amd_manager->mmio)) {
+		dev_err(dev, "mmio not found\n");
+		return PTR_ERR(amd_manager->mmio);
+	}
+	amd_manager->instance = pdata->instance;
+	amd_manager->mmio = amd_manager->acp_mmio +
+			    (amd_manager->instance * SDW_MANAGER_REG_OFFSET);
+	amd_manager->acp_sdw_lock = pdata->acp_sdw_lock;
+	amd_manager->cols_index = sdw_find_col_index(AMD_SDW_DEFAULT_COLUMNS);
+	amd_manager->rows_index = sdw_find_row_index(AMD_SDW_DEFAULT_ROWS);
+	amd_manager->dev = dev;
+	amd_manager->bus.ops = &amd_sdw_ops;
+	amd_manager->bus.port_ops = &amd_sdw_port_ops;
+	amd_manager->bus.compute_params = &amd_sdw_compute_params;
+	amd_manager->bus.clk_stop_timeout = 200;
+	amd_manager->bus.link_id = amd_manager->instance;
+
+	switch (amd_manager->instance) {
+	case ACP_SDW0:
+		amd_manager->num_dout_ports = AMD_SDW0_MAX_TX_PORTS;
+		amd_manager->num_din_ports = AMD_SDW0_MAX_RX_PORTS;
+		break;
+	case ACP_SDW1:
+		amd_manager->num_dout_ports = AMD_SDW1_MAX_TX_PORTS;
+		amd_manager->num_din_ports = AMD_SDW1_MAX_RX_PORTS;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	amd_manager->reg_mask = &sdw_manager_reg_mask_array[amd_manager->instance];
+	params = &amd_manager->bus.params;
+	params->max_dr_freq = AMD_SDW_DEFAULT_CLK_FREQ * 2;
+	params->curr_dr_freq = AMD_SDW_DEFAULT_CLK_FREQ * 2;
+	params->col = AMD_SDW_DEFAULT_COLUMNS;
+	params->row = AMD_SDW_DEFAULT_ROWS;
+	prop = &amd_manager->bus.prop;
+	prop->clk_freq = &amd_sdw_freq_tbl[0];
+	prop->mclk_freq = AMD_SDW_BUS_BASE_FREQ;
+
+	ret = sdw_bus_master_add(&amd_manager->bus, dev, dev->fwnode);
+	if (ret) {
+		dev_err(dev, "Failed to register SoundWire manager(%d)\n", ret);
+		return ret;
+	}
+	dev_set_drvdata(dev, amd_manager);
+	INIT_WORK(&amd_manager->probe_work, amd_sdw_probe_work);
+	/*
+	 * Instead of having lengthy probe sequence, use deferred probe.
+	 */
+	schedule_work(&amd_manager->probe_work);
+	return 0;
+}
+
+static int amd_sdw_manager_remove(struct platform_device *pdev)
+{
+	struct amd_sdw_manager *amd_manager = dev_get_drvdata(&pdev->dev);
+
+	cancel_work_sync(&amd_manager->probe_work);
+	amd_disable_sdw_interrupts(amd_manager);
+	sdw_bus_master_delete(&amd_manager->bus);
+	return amd_disable_sdw_manager(amd_manager);
+}
+
+static struct platform_driver amd_sdw_driver = {
+	.probe	= &amd_sdw_manager_probe,
+	.remove = &amd_sdw_manager_remove,
+	.driver = {
+		.name	= "amd_sdw_manager",
+	}
+};
+module_platform_driver(amd_sdw_driver);
+
+MODULE_AUTHOR("Vijendar.Mukunda@amd.com");
+MODULE_DESCRIPTION("AMD SoundWire driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/drivers/soundwire/amd_manager.h b/drivers/soundwire/amd_manager.h
new file mode 100644
index 000000000000..f7106521b500
--- /dev/null
+++ b/drivers/soundwire/amd_manager.h
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#ifndef __AMD_MANAGER_H
+#define __AMD_MANAGER_H
+
+#include <linux/soundwire/sdw_amd.h>
+
+#define SDW_MANAGER_REG_OFFSET				0xc00
+#define AMD_SDW_DEFAULT_ROWS				50
+#define AMD_SDW_DEFAULT_COLUMNS				10
+#define ACP_PAD_PULLDOWN_CTRL				0x0001448
+#define ACP_SW_PAD_KEEPER_EN				0x0001454
+#define ACP_SW0_WAKE_EN					0x0001458
+#define ACP_EXTERNAL_INTR_CNTL0				0x0001a04
+#define ACP_EXTERNAL_INTR_STAT0				0x0001a0c
+#define ACP_EXTERNAL_INTR_CNTL(i)			(ACP_EXTERNAL_INTR_CNTL0 + ((i) * 4))
+#define ACP_EXTERNAL_INTR_STAT(i)			(ACP_EXTERNAL_INTR_STAT0 + ((i) * 4))
+#define ACP_SW_WAKE_EN(i)				(ACP_SW0_WAKE_EN + ((i) * 8))
+
+#define ACP_SW_EN					0x0003000
+#define ACP_SW_EN_STATUS				0x0003004
+#define ACP_SW_FRAMESIZE				0x0003008
+#define ACP_SW_SSP_COUNTER				0x000300c
+#define ACP_SW_AUDIO0_TX_EN				0x0003010
+#define ACP_SW_AUDIO0_TX_EN_STATUS			0x0003014
+#define ACP_SW_AUDIO0_TX_FRAME_FORMAT			0x0003018
+#define ACP_SW_AUDIO0_TX_SAMPLEINTERVAL			0x000301c
+#define ACP_SW_AUDIO0_TX_HCTRL_DP0			0x0003020
+#define ACP_SW_AUDIO0_TX_HCTRL_DP1			0x0003024
+#define ACP_SW_AUDIO0_TX_HCTRL_DP2			0x0003028
+#define ACP_SW_AUDIO0_TX_HCTRL_DP3			0x000302c
+#define ACP_SW_AUDIO0_TX_OFFSET_DP0			0x0003030
+#define ACP_SW_AUDIO0_TX_OFFSET_DP1			0x0003034
+#define ACP_SW_AUDIO0_TX_OFFSET_DP2			0x0003038
+#define ACP_SW_AUDIO0_TX_OFFSET_DP3			0x000303c
+#define ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP0		0x0003040
+#define ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP1		0x0003044
+#define ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP2		0x0003048
+#define ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP3		0x000304c
+#define ACP_SW_AUDIO1_TX_EN				0x0003050
+#define ACP_SW_AUDIO1_TX_EN_STATUS			0x0003054
+#define ACP_SW_AUDIO1_TX_FRAME_FORMAT			0x0003058
+#define ACP_SW_AUDIO1_TX_SAMPLEINTERVAL			0x000305c
+#define ACP_SW_AUDIO1_TX_HCTRL				0x0003060
+#define ACP_SW_AUDIO1_TX_OFFSET				0x0003064
+#define ACP_SW_AUDIO1_TX_CHANNEL_ENABLE_DP0		0x0003068
+#define ACP_SW_AUDIO2_TX_EN				0x000306c
+#define ACP_SW_AUDIO2_TX_EN_STATUS			0x0003070
+#define ACP_SW_AUDIO2_TX_FRAME_FORMAT			0x0003074
+#define ACP_SW_AUDIO2_TX_SAMPLEINTERVAL			0x0003078
+#define ACP_SW_AUDIO2_TX_HCTRL				0x000307c
+#define ACP_SW_AUDIO2_TX_OFFSET				0x0003080
+#define ACP_SW_AUDIO2_TX_CHANNEL_ENABLE_DP0		0x0003084
+#define ACP_SW_AUDIO0_RX_EN				0x0003088
+#define ACP_SW_AUDIO0_RX_EN_STATUS			0x000308c
+#define ACP_SW_AUDIO0_RX_FRAME_FORMAT			0x0003090
+#define ACP_SW_AUDIO0_RX_SAMPLEINTERVAL			0x0003094
+#define ACP_SW_AUDIO0_RX_HCTRL_DP0			0x0003098
+#define ACP_SW_AUDIO0_RX_HCTRL_DP1			0x000309c
+#define ACP_SW_AUDIO0_RX_HCTRL_DP2			0x0003100
+#define ACP_SW_AUDIO0_RX_HCTRL_DP3			0x0003104
+#define ACP_SW_AUDIO0_RX_OFFSET_DP0			0x0003108
+#define ACP_SW_AUDIO0_RX_OFFSET_DP1			0x000310c
+#define ACP_SW_AUDIO0_RX_OFFSET_DP2			0x0003110
+#define ACP_SW_AUDIO0_RX_OFFSET_DP3			0x0003114
+#define ACP_SW_AUDIO0_RX_CHANNEL_ENABLE_DP0		0x0003118
+#define ACP_SW_AUDIO0_RX_CHANNEL_ENABLE_DP1		0x000311c
+#define ACP_SW_AUDIO0_RX_CHANNEL_ENABLE_DP2		0x0003120
+#define ACP_SW_AUDIO0_RX_CHANNEL_ENABLE_DP3		0x0003124
+#define ACP_SW_AUDIO1_RX_EN				0x0003128
+#define ACP_SW_AUDIO1_RX_EN_STATUS			0x000312c
+#define ACP_SW_AUDIO1_RX_FRAME_FORMAT			0x0003130
+#define ACP_SW_AUDIO1_RX_SAMPLEINTERVAL			0x0003134
+#define ACP_SW_AUDIO1_RX_HCTRL				0x0003138
+#define ACP_SW_AUDIO1_RX_OFFSET				0x000313c
+#define ACP_SW_AUDIO1_RX_CHANNEL_ENABLE_DP0		0x0003140
+#define ACP_SW_AUDIO2_RX_EN				0x0003144
+#define ACP_SW_AUDIO2_RX_EN_STATUS			0x0003148
+#define ACP_SW_AUDIO2_RX_FRAME_FORMAT			0x000314c
+#define ACP_SW_AUDIO2_RX_SAMPLEINTERVAL			0x0003150
+#define ACP_SW_AUDIO2_RX_HCTRL				0x0003154
+#define ACP_SW_AUDIO2_RX_OFFSET				0x0003158
+#define ACP_SW_AUDIO2_RX_CHANNEL_ENABLE_DP0		0x000315c
+#define ACP_SW_BPT_PORT_EN				0x0003160
+#define ACP_SW_BPT_PORT_EN_STATUS			0x0003164
+#define ACP_SW_BPT_PORT_FRAME_FORMAT			0x0003168
+#define ACP_SW_BPT_PORT_SAMPLEINTERVAL			0x000316c
+#define ACP_SW_BPT_PORT_HCTRL				0x0003170
+#define ACP_SW_BPT_PORT_OFFSET				0x0003174
+#define ACP_SW_BPT_PORT_CHANNEL_ENABLE			0x0003178
+#define ACP_SW_BPT_PORT_FIRST_BYTE_ADDR			0x000317c
+#define ACP_SW_CLK_RESUME_CTRL				0x0003180
+#define ACP_SW_CLK_RESUME_DELAY_CNTR			0x0003184
+#define ACP_SW_BUS_RESET_CTRL				0x0003188
+#define ACP_SW_PRBS_ERR_STATUS				0x000318c
+#define ACP_SW_IMM_CMD_UPPER_WORD			0x0003230
+#define ACP_SW_IMM_CMD_LOWER_QWORD			0x0003234
+#define ACP_SW_IMM_RESP_UPPER_WORD			0x0003238
+#define ACP_SW_IMM_RESP_LOWER_QWORD			0x000323c
+#define ACP_SW_IMM_CMD_STS				0x0003240
+#define ACP_SW_BRA_BASE_ADDRESS				0x0003244
+#define ACP_SW_BRA_TRANSFER_SIZE			0x0003248
+#define ACP_SW_BRA_DMA_BUSY				0x000324c
+#define ACP_SW_BRA_RESP					0x0003250
+#define ACP_SW_BRA_RESP_FRAME_ADDR			0x0003254
+#define ACP_SW_BRA_CURRENT_TRANSFER_SIZE		0x0003258
+#define ACP_SW_STATE_CHANGE_STATUS_0TO7			0x000325c
+#define ACP_SW_STATE_CHANGE_STATUS_8TO11		0x0003260
+#define ACP_SW_STATE_CHANGE_STATUS_MASK_0TO7		0x0003264
+#define ACP_SW_STATE_CHANGE_STATUS_MASK_8TO11		0x0003268
+#define ACP_SW_CLK_FREQUENCY_CTRL			0x000326c
+#define ACP_SW_ERROR_INTR_MASK				0x0003270
+#define ACP_SW_PHY_TEST_MODE_DATA_OFF			0x0003274
+
+#define ACP_DELAY_US					10
+#define AMD_SDW_TIMEOUT					1000
+#define AMD_SDW_DEFAULT_CLK_FREQ			12000000
+
+#define AMD_SDW_MCP_RESP_ACK				BIT(0)
+#define AMD_SDW_MCP_RESP_NACK				BIT(1)
+#define AMD_SDW_MCP_RESP_RDATA				GENMASK(14, 7)
+
+#define AMD_SDW_MCP_CMD_SSP_TAG				BIT(31)
+#define AMD_SDW_MCP_CMD_COMMAND				GENMASK(14, 12)
+#define AMD_SDW_MCP_CMD_DEV_ADDR			GENMASK(11, 8)
+#define AMD_SDW_MCP_CMD_REG_ADDR_HIGH			GENMASK(7, 0)
+#define AMD_SDW_MCP_CMD_REG_ADDR_LOW			GENMASK(31, 24)
+#define AMD_SDW_MCP_CMD_REG_DATA			GENMASK(14, 7)
+#define AMD_SDW_MCP_SLAVE_STAT_0_3			GENMASK(14, 7)
+#define AMD_SDW_MCP_SLAVE_STAT_4_11			GENMASK_ULL(39, 24)
+#define AMD_SDW_MCP_SLAVE_STATUS_MASK			GENMASK(1, 0)
+#define AMD_SDW_MCP_SLAVE_STATUS_BITS			GENMASK(3, 2)
+#define AMD_SDW_MCP_SLAVE_STATUS_8TO_11			GENMASK_ULL(15, 0)
+#define AMD_SDW_MCP_SLAVE_STATUS_VALID_MASK(x)		BIT(((x) * 4))
+#define AMD_SDW_MCP_SLAVE_STAT_SHIFT_MASK(x)		(((x) * 4) + 1)
+
+#define AMD_SDW_MASTER_SUSPEND_DELAY_MS			2000
+#define AMD_SDW_QUIRK_MASK_BUS_ENABLE			BIT(0)
+
+#define AMD_SDW_IMM_RES_VALID		1
+#define AMD_SDW_IMM_CMD_BUSY		2
+#define AMD_SDW_ENABLE			1
+#define AMD_SDW_DISABLE			0
+#define AMD_SDW_BUS_RESET_CLEAR_REQ	0
+#define AMD_SDW_BUS_RESET_REQ		1
+#define AMD_SDW_BUS_RESET_DONE		2
+#define AMD_SDW_BUS_BASE_FREQ		24000000
+
+#define AMD_SDW0_EXT_INTR_MASK		0x200000
+#define AMD_SDW1_EXT_INTR_MASK		4
+#define AMD_SDW_IRQ_MASK_0TO7		0x77777777
+#define AMD_SDW_IRQ_MASK_8TO11		0x000d7777
+#define AMD_SDW_IRQ_ERROR_MASK		0xff
+#define AMD_SDW_MAX_FREQ_NUM		1
+#define AMD_SDW0_MAX_TX_PORTS		3
+#define AMD_SDW0_MAX_RX_PORTS		3
+#define AMD_SDW1_MAX_TX_PORTS		1
+#define AMD_SDW1_MAX_RX_PORTS		1
+#define AMD_SDW0_MAX_DAI		6
+#define AMD_SDW1_MAX_DAI		2
+#define AMD_SDW_SLAVE_0_ATTACHED	5
+#define AMD_SDW_SSP_COUNTER_VAL		3
+
+#define AMD_DPN_FRAME_FMT_PFM				GENMASK(1, 0)
+#define AMD_DPN_FRAME_FMT_PDM				GENMASK(3, 2)
+#define AMD_DPN_FRAME_FMT_BLK_PKG_MODE			BIT(4)
+#define AMD_DPN_FRAME_FMT_BLK_GRP_CTRL			GENMASK(6, 5)
+#define AMD_DPN_FRAME_FMT_WORD_LEN			GENMASK(12, 7)
+#define AMD_DPN_FRAME_FMT_PCM_OR_PDM			BIT(13)
+#define AMD_DPN_HCTRL_HSTOP				GENMASK(3, 0)
+#define AMD_DPN_HCTRL_HSTART				GENMASK(7, 4)
+#define AMD_DPN_OFFSET_CTRL_1				GENMASK(7, 0)
+#define AMD_DPN_OFFSET_CTRL_2				GENMASK(15, 8)
+#define AMD_DPN_CH_EN_LCTRL				GENMASK(2, 0)
+#define AMD_DPN_CH_EN_CHMASK				GENMASK(10, 3)
+#define AMD_SDW_STAT_MAX_RETRY_COUNT			100
+#define AMD_SDW0_PAD_PULLDOWN_CTRL_ENABLE_MASK		0x7f9f
+#define AMD_SDW1_PAD_PULLDOWN_CTRL_ENABLE_MASK		0x7ffa
+#define AMD_SDW0_PAD_PULLDOWN_CTRL_DISABLE_MASK		0x60
+#define AMD_SDW1_PAD_PULLDOWN_CTRL_DISABLE_MASK		5
+#define AMD_SDW0_PAD_KEEPER_EN_MASK			1
+#define AMD_SDW1_PAD_KEEPER_EN_MASK			0x10
+#define AMD_SDW0_PAD_KEEPER_DISABLE_MASK		0x1e
+#define AMD_SDW1_PAD_KEEPER_DISABLE_MASK		0xf
+
+static u32 amd_sdw_freq_tbl[AMD_SDW_MAX_FREQ_NUM] = {
+	AMD_SDW_DEFAULT_CLK_FREQ,
+};
+
+struct sdw_manager_dp_reg {
+	u32 frame_fmt_reg;
+	u32 sample_int_reg;
+	u32 hctrl_dp0_reg;
+	u32 offset_reg;
+	u32 lane_ctrl_ch_en_reg;
+};
+
+static struct sdw_manager_dp_reg sdw0_manager_dp_reg[AMD_SDW0_MAX_DAI] =  {
+	{ACP_SW_AUDIO0_TX_FRAME_FORMAT, ACP_SW_AUDIO0_TX_SAMPLEINTERVAL, ACP_SW_AUDIO0_TX_HCTRL_DP0,
+	 ACP_SW_AUDIO0_TX_OFFSET_DP0, ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO1_TX_FRAME_FORMAT, ACP_SW_AUDIO1_TX_SAMPLEINTERVAL, ACP_SW_AUDIO1_TX_HCTRL,
+	 ACP_SW_AUDIO1_TX_OFFSET, ACP_SW_AUDIO1_TX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO2_TX_FRAME_FORMAT, ACP_SW_AUDIO2_TX_SAMPLEINTERVAL, ACP_SW_AUDIO2_TX_HCTRL,
+	 ACP_SW_AUDIO2_TX_OFFSET, ACP_SW_AUDIO2_TX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO0_RX_FRAME_FORMAT, ACP_SW_AUDIO0_RX_SAMPLEINTERVAL, ACP_SW_AUDIO0_RX_HCTRL_DP0,
+	 ACP_SW_AUDIO0_RX_OFFSET_DP0, ACP_SW_AUDIO0_RX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO1_RX_FRAME_FORMAT, ACP_SW_AUDIO1_RX_SAMPLEINTERVAL, ACP_SW_AUDIO1_RX_HCTRL,
+	 ACP_SW_AUDIO1_RX_OFFSET, ACP_SW_AUDIO1_RX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO2_RX_FRAME_FORMAT, ACP_SW_AUDIO2_RX_SAMPLEINTERVAL, ACP_SW_AUDIO2_RX_HCTRL,
+	 ACP_SW_AUDIO2_RX_OFFSET, ACP_SW_AUDIO2_RX_CHANNEL_ENABLE_DP0},
+};
+
+static struct sdw_manager_dp_reg sdw1_manager_dp_reg[AMD_SDW1_MAX_DAI] =  {
+	{ACP_SW_AUDIO1_TX_FRAME_FORMAT, ACP_SW_AUDIO1_TX_SAMPLEINTERVAL, ACP_SW_AUDIO1_TX_HCTRL,
+	 ACP_SW_AUDIO1_TX_OFFSET, ACP_SW_AUDIO1_TX_CHANNEL_ENABLE_DP0},
+	{ACP_SW_AUDIO1_RX_FRAME_FORMAT, ACP_SW_AUDIO1_RX_SAMPLEINTERVAL, ACP_SW_AUDIO1_RX_HCTRL,
+	 ACP_SW_AUDIO1_RX_OFFSET, ACP_SW_AUDIO1_RX_CHANNEL_ENABLE_DP0}
+};
+
+static struct sdw_manager_reg_mask sdw_manager_reg_mask_array[2] =  {
+	{
+		AMD_SDW0_PAD_KEEPER_EN_MASK,
+		AMD_SDW0_PAD_PULLDOWN_CTRL_ENABLE_MASK,
+		AMD_SDW0_EXT_INTR_MASK
+	},
+	{
+		AMD_SDW1_PAD_KEEPER_EN_MASK,
+		AMD_SDW1_PAD_PULLDOWN_CTRL_ENABLE_MASK,
+		AMD_SDW1_EXT_INTR_MASK
+	}
+};
+#endif
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
new file mode 100644
index 000000000000..c14a291a40e8
--- /dev/null
+++ b/include/linux/soundwire/sdw_amd.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#ifndef __SDW_AMD_H
+#define __SDW_AMD_H
+
+#include <linux/soundwire/sdw.h>
+
+#define ACP_SDW0	0
+#define ACP_SDW1	1
+
+struct acp_sdw_pdata {
+	u16 instance;
+	/* mutex to protect acp common register access */
+	struct mutex *acp_sdw_lock;
+};
+
+struct sdw_manager_reg_mask {
+	u32 sw_pad_enable_mask;
+	u32 sw_pad_pulldown_mask;
+	u32 acp_sdw_intr_mask;
+};
+
+/**
+ * struct amd_sdw_manager - amd manager driver context
+ * @bus: bus handle
+ * @dev: linux device
+ * @mmio: SoundWire registers mmio base
+ * @acp_mmio: acp registers mmio base
+ * @reg_mask: register mask structure per manager instance
+ * @probe_work: SoundWire manager probe workqueue
+ * @acp_sdw_lock: mutex to protect acp share register access
+ * @num_din_ports: number of input ports
+ * @num_dout_ports: number of output ports
+ * @cols_index: Column index in frame shape
+ * @rows_index: Rows index in frame shape
+ * @instance: SoundWire manager instance
+ * @quirks: SoundWire manager quirks
+ * @wake_en_mask: wake enable mask per SoundWire manager
+ * @power_mode_mask: flag interprets amd SoundWire manager power mode
+ */
+struct amd_sdw_manager {
+	struct sdw_bus bus;
+	struct device *dev;
+
+	void __iomem *mmio;
+	void __iomem *acp_mmio;
+
+	struct sdw_manager_reg_mask *reg_mask;
+	struct work_struct probe_work;
+	/* mutex to protect acp common register access */
+	struct mutex *acp_sdw_lock;
+
+	int num_din_ports;
+	int num_dout_ports;
+
+	int cols_index;
+	int rows_index;
+
+	u32 instance;
+	u32 quirks;
+	u32 wake_en_mask;
+	u32 power_mode_mask;
+};
+#endif
-- 
cgit v1.2.3


From 2b13596f7c9c03b1653b21a440ce672c3d7e5233 Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 21 Mar 2023 10:38:56 +0530
Subject: soundwire: amd: register SoundWire manager dai ops

Register dai ops for SoundWire manager instances.

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/lkml/20230227154801.50319-4-Vijendar.Mukunda@amd.com
Link: https://lore.kernel.org/r/20230321050901.115439-4-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 178 ++++++++++++++++++++++++++++++++++++++
 drivers/soundwire/amd_manager.h   |  18 ++++
 include/linux/soundwire/sdw_amd.h |  18 ++++
 3 files changed, 214 insertions(+)

(limited to 'include')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index f02522d11417..f12dad6c3226 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -551,6 +551,178 @@ static const struct sdw_master_ops amd_sdw_ops = {
 	.read_ping_status = amd_sdw_read_ping_status,
 };
 
+static int amd_sdw_hw_params(struct snd_pcm_substream *substream,
+			     struct snd_pcm_hw_params *params,
+			     struct snd_soc_dai *dai)
+{
+	struct amd_sdw_manager *amd_manager = snd_soc_dai_get_drvdata(dai);
+	struct sdw_amd_dai_runtime *dai_runtime;
+	struct sdw_stream_config sconfig;
+	struct sdw_port_config *pconfig;
+	int ch, dir;
+	int ret;
+
+	dai_runtime = amd_manager->dai_runtime_array[dai->id];
+	if (!dai_runtime)
+		return -EIO;
+
+	ch = params_channels(params);
+	if (substream->stream == SNDRV_PCM_STREAM_CAPTURE)
+		dir = SDW_DATA_DIR_RX;
+	else
+		dir = SDW_DATA_DIR_TX;
+	dev_dbg(amd_manager->dev, "dir:%d dai->id:0x%x\n", dir, dai->id);
+
+	sconfig.direction = dir;
+	sconfig.ch_count = ch;
+	sconfig.frame_rate = params_rate(params);
+	sconfig.type = dai_runtime->stream_type;
+
+	sconfig.bps = snd_pcm_format_width(params_format(params));
+
+	/* Port configuration */
+	pconfig = kzalloc(sizeof(*pconfig), GFP_KERNEL);
+	if (!pconfig) {
+		ret =  -ENOMEM;
+		goto error;
+	}
+
+	pconfig->num = dai->id;
+	pconfig->ch_mask = (1 << ch) - 1;
+	ret = sdw_stream_add_master(&amd_manager->bus, &sconfig,
+				    pconfig, 1, dai_runtime->stream);
+	if (ret)
+		dev_err(amd_manager->dev, "add manager to stream failed:%d\n", ret);
+
+	kfree(pconfig);
+error:
+	return ret;
+}
+
+static int amd_sdw_hw_free(struct snd_pcm_substream *substream, struct snd_soc_dai *dai)
+{
+	struct amd_sdw_manager *amd_manager = snd_soc_dai_get_drvdata(dai);
+	struct sdw_amd_dai_runtime *dai_runtime;
+	int ret;
+
+	dai_runtime = amd_manager->dai_runtime_array[dai->id];
+	if (!dai_runtime)
+		return -EIO;
+
+	ret = sdw_stream_remove_master(&amd_manager->bus, dai_runtime->stream);
+	if (ret < 0)
+		dev_err(dai->dev, "remove manager from stream %s failed: %d\n",
+			dai_runtime->stream->name, ret);
+	return ret;
+}
+
+static int amd_set_sdw_stream(struct snd_soc_dai *dai, void *stream, int direction)
+{
+	struct amd_sdw_manager *amd_manager = snd_soc_dai_get_drvdata(dai);
+	struct sdw_amd_dai_runtime *dai_runtime;
+
+	dai_runtime = amd_manager->dai_runtime_array[dai->id];
+	if (stream) {
+		/* first paranoia check */
+		if (dai_runtime) {
+			dev_err(dai->dev, "dai_runtime already allocated for dai %s\n",	dai->name);
+			return -EINVAL;
+		}
+
+		/* allocate and set dai_runtime info */
+		dai_runtime = kzalloc(sizeof(*dai_runtime), GFP_KERNEL);
+		if (!dai_runtime)
+			return -ENOMEM;
+
+		dai_runtime->stream_type = SDW_STREAM_PCM;
+		dai_runtime->bus = &amd_manager->bus;
+		dai_runtime->stream = stream;
+		amd_manager->dai_runtime_array[dai->id] = dai_runtime;
+	} else {
+		/* second paranoia check */
+		if (!dai_runtime) {
+			dev_err(dai->dev, "dai_runtime not allocated for dai %s\n", dai->name);
+			return -EINVAL;
+		}
+
+		/* for NULL stream we release allocated dai_runtime */
+		kfree(dai_runtime);
+		amd_manager->dai_runtime_array[dai->id] = NULL;
+	}
+	return 0;
+}
+
+static int amd_pcm_set_sdw_stream(struct snd_soc_dai *dai, void *stream, int direction)
+{
+	return amd_set_sdw_stream(dai, stream, direction);
+}
+
+static void *amd_get_sdw_stream(struct snd_soc_dai *dai, int direction)
+{
+	struct amd_sdw_manager *amd_manager = snd_soc_dai_get_drvdata(dai);
+	struct sdw_amd_dai_runtime *dai_runtime;
+
+	dai_runtime = amd_manager->dai_runtime_array[dai->id];
+	if (!dai_runtime)
+		return ERR_PTR(-EINVAL);
+
+	return dai_runtime->stream;
+}
+
+static const struct snd_soc_dai_ops amd_sdw_dai_ops = {
+	.hw_params = amd_sdw_hw_params,
+	.hw_free = amd_sdw_hw_free,
+	.set_stream = amd_pcm_set_sdw_stream,
+	.get_stream = amd_get_sdw_stream,
+};
+
+static const struct snd_soc_component_driver amd_sdw_dai_component = {
+	.name = "soundwire",
+};
+
+static int amd_sdw_register_dais(struct amd_sdw_manager *amd_manager)
+{
+	struct sdw_amd_dai_runtime **dai_runtime_array;
+	struct snd_soc_dai_driver *dais;
+	struct snd_soc_pcm_stream *stream;
+	struct device *dev;
+	int i, num_dais;
+
+	dev = amd_manager->dev;
+	num_dais = amd_manager->num_dout_ports + amd_manager->num_din_ports;
+	dais = devm_kcalloc(dev, num_dais, sizeof(*dais), GFP_KERNEL);
+	if (!dais)
+		return -ENOMEM;
+
+	dai_runtime_array = devm_kcalloc(dev, num_dais,
+					 sizeof(struct sdw_amd_dai_runtime *),
+					 GFP_KERNEL);
+	if (!dai_runtime_array)
+		return -ENOMEM;
+	amd_manager->dai_runtime_array = dai_runtime_array;
+	for (i = 0; i < num_dais; i++) {
+		dais[i].name = devm_kasprintf(dev, GFP_KERNEL, "SDW%d Pin%d", amd_manager->instance,
+					      i);
+		if (!dais[i].name)
+			return -ENOMEM;
+		if (i < amd_manager->num_dout_ports)
+			stream = &dais[i].playback;
+		else
+			stream = &dais[i].capture;
+
+		stream->channels_min = 2;
+		stream->channels_max = 2;
+		stream->rates = SNDRV_PCM_RATE_48000;
+		stream->formats = SNDRV_PCM_FMTBIT_S16_LE;
+
+		dais[i].ops = &amd_sdw_dai_ops;
+		dais[i].id = i;
+	}
+
+	return devm_snd_soc_register_component(dev, &amd_sdw_dai_component,
+					       dais, num_dais);
+}
+
 static void amd_sdw_probe_work(struct work_struct *work)
 {
 	struct amd_sdw_manager *amd_manager = container_of(work, struct amd_sdw_manager,
@@ -636,6 +808,12 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 		dev_err(dev, "Failed to register SoundWire manager(%d)\n", ret);
 		return ret;
 	}
+	ret = amd_sdw_register_dais(amd_manager);
+	if (ret) {
+		dev_err(dev, "CPU DAI registration failed\n");
+		sdw_bus_master_delete(&amd_manager->bus);
+		return ret;
+	}
 	dev_set_drvdata(dev, amd_manager);
 	INIT_WORK(&amd_manager->probe_work, amd_sdw_probe_work);
 	/*
diff --git a/drivers/soundwire/amd_manager.h b/drivers/soundwire/amd_manager.h
index f7106521b500..c92e0dee2cb1 100644
--- a/drivers/soundwire/amd_manager.h
+++ b/drivers/soundwire/amd_manager.h
@@ -198,6 +198,24 @@ struct sdw_manager_dp_reg {
 	u32 lane_ctrl_ch_en_reg;
 };
 
+/*
+ * SDW0 Manager instance registers  6 CPU DAI (3 TX & 3 RX Ports)
+ * whereas SDW1  Manager Instance registers 2 CPU DAI (one TX & one RX port)
+ * Below is the CPU DAI <->Manager port number mapping
+ * i.e SDW0 Pin0 -> port number 0 -> AUDIO0 TX
+ *     SDW0 Pin1 -> Port number 1 -> AUDIO1 TX
+ *     SDW0 Pin2 -> Port number 2 -> AUDIO2 TX
+ *     SDW0 Pin3 -> port number 3 -> AUDIO0 RX
+ *     SDW0 Pin4 -> Port number 4 -> AUDIO1 RX
+ *     SDW0 Pin5 -> Port number 5 -> AUDIO2 RX
+ *  Whereas for SDW1 instance
+ *  SDW1 Pin0 -> port number 0 -> AUDIO1 TX
+ *  SDW1 Pin1 -> Port number 1 -> AUDIO1 RX
+ *  Same mapping should be used for programming DMA controller registers in SoundWire DMA driver.
+ * i.e if AUDIO0 TX channel is selected then we need to use AUDIO0 TX registers for DMA programming
+ * in SoundWire DMA driver.
+ */
+
 static struct sdw_manager_dp_reg sdw0_manager_dp_reg[AMD_SDW0_MAX_DAI] =  {
 	{ACP_SW_AUDIO0_TX_FRAME_FORMAT, ACP_SW_AUDIO0_TX_SAMPLEINTERVAL, ACP_SW_AUDIO0_TX_HCTRL_DP0,
 	 ACP_SW_AUDIO0_TX_OFFSET_DP0, ACP_SW_AUDIO0_TX_CHANNEL_ENABLE_DP0},
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index c14a291a40e8..ac537419301d 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -23,6 +23,21 @@ struct sdw_manager_reg_mask {
 	u32 acp_sdw_intr_mask;
 };
 
+/**
+ * struct sdw_amd_dai_runtime: AMD sdw dai runtime  data
+ *
+ * @name: SoundWire stream name
+ * @stream: stream runtime
+ * @bus: Bus handle
+ * @stream_type: Stream type
+ */
+struct sdw_amd_dai_runtime {
+	char *name;
+	struct sdw_stream_runtime *stream;
+	struct sdw_bus *bus;
+	enum sdw_stream_type stream_type;
+};
+
 /**
  * struct amd_sdw_manager - amd manager driver context
  * @bus: bus handle
@@ -40,6 +55,7 @@ struct sdw_manager_reg_mask {
  * @quirks: SoundWire manager quirks
  * @wake_en_mask: wake enable mask per SoundWire manager
  * @power_mode_mask: flag interprets amd SoundWire manager power mode
+ * @dai_runtime_array: dai runtime array
  */
 struct amd_sdw_manager {
 	struct sdw_bus bus;
@@ -63,5 +79,7 @@ struct amd_sdw_manager {
 	u32 quirks;
 	u32 wake_en_mask;
 	u32 power_mode_mask;
+
+	struct sdw_amd_dai_runtime **dai_runtime_array;
 };
 #endif
-- 
cgit v1.2.3


From 65f93e4096a07abd41acf0d240715bd8c7ef7eeb Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 21 Mar 2023 10:38:58 +0530
Subject: soundwire: amd: add SoundWire manager interrupt handling

Add support for handling SoundWire manager interrupts.

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Signed-off-by: Mastan Katragadda <Mastan.Katragadda@amd.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/lkml/20230227154801.50319-6-Vijendar.Mukunda@amd.com
Link: https://lore.kernel.org/r/20230321050901.115439-6-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 126 ++++++++++++++++++++++++++++++++++++++
 drivers/soundwire/amd_manager.h   |   1 +
 include/linux/soundwire/sdw_amd.h |   7 +++
 3 files changed, 134 insertions(+)

(limited to 'include')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index f12dad6c3226..49b7133cb41f 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -327,6 +327,47 @@ static enum sdw_command_response amd_sdw_xfer_msg(struct sdw_bus *bus, struct sd
 	return SDW_CMD_OK;
 }
 
+static void amd_sdw_fill_slave_status(struct amd_sdw_manager *amd_manager, u16 index, u32 status)
+{
+	switch (status) {
+	case SDW_SLAVE_ATTACHED:
+	case SDW_SLAVE_UNATTACHED:
+	case SDW_SLAVE_ALERT:
+		amd_manager->status[index] = status;
+		break;
+	default:
+		amd_manager->status[index] = SDW_SLAVE_RESERVED;
+		break;
+	}
+}
+
+static void amd_sdw_process_ping_status(u64 response, struct amd_sdw_manager *amd_manager)
+{
+	u64 slave_stat;
+	u32 val;
+	u16 dev_index;
+
+	/* slave status response */
+	slave_stat = FIELD_GET(AMD_SDW_MCP_SLAVE_STAT_0_3, response);
+	slave_stat |= FIELD_GET(AMD_SDW_MCP_SLAVE_STAT_4_11, response) << 8;
+	dev_dbg(amd_manager->dev, "slave_stat:0x%llx\n", slave_stat);
+	for (dev_index = 0; dev_index <= SDW_MAX_DEVICES; ++dev_index) {
+		val = (slave_stat >> (dev_index * 2)) & AMD_SDW_MCP_SLAVE_STATUS_MASK;
+		dev_dbg(amd_manager->dev, "val:0x%x\n", val);
+		amd_sdw_fill_slave_status(amd_manager, dev_index, val);
+	}
+}
+
+static void amd_sdw_read_and_process_ping_status(struct amd_sdw_manager *amd_manager)
+{
+	u64 response;
+
+	mutex_lock(&amd_manager->bus.msg_lock);
+	response = amd_sdw_send_cmd_get_resp(amd_manager, 0, 0);
+	mutex_unlock(&amd_manager->bus.msg_lock);
+	amd_sdw_process_ping_status(response, amd_manager);
+}
+
 static u32 amd_sdw_read_ping_status(struct sdw_bus *bus)
 {
 	struct amd_sdw_manager *amd_manager = to_amd_sdw(bus);
@@ -723,6 +764,89 @@ static int amd_sdw_register_dais(struct amd_sdw_manager *amd_manager)
 					       dais, num_dais);
 }
 
+static void amd_sdw_update_slave_status_work(struct work_struct *work)
+{
+	struct amd_sdw_manager *amd_manager =
+		container_of(work, struct amd_sdw_manager, amd_sdw_work);
+	int retry_count = 0;
+
+	if (amd_manager->status[0] == SDW_SLAVE_ATTACHED) {
+		writel(0, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_MASK_0TO7);
+		writel(0, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_MASK_8TO11);
+	}
+
+update_status:
+	sdw_handle_slave_status(&amd_manager->bus, amd_manager->status);
+	/*
+	 * During the peripheral enumeration sequence, the SoundWire manager interrupts
+	 * are masked. Once the device number programming is done for all peripherals,
+	 * interrupts will be unmasked. Read the peripheral device status from ping command
+	 * and process the response. This sequence will ensure all peripheral devices enumerated
+	 * and initialized properly.
+	 */
+	if (amd_manager->status[0] == SDW_SLAVE_ATTACHED) {
+		if (retry_count++ < SDW_MAX_DEVICES) {
+			writel(AMD_SDW_IRQ_MASK_0TO7, amd_manager->mmio +
+			       ACP_SW_STATE_CHANGE_STATUS_MASK_0TO7);
+			writel(AMD_SDW_IRQ_MASK_8TO11, amd_manager->mmio +
+			       ACP_SW_STATE_CHANGE_STATUS_MASK_8TO11);
+			amd_sdw_read_and_process_ping_status(amd_manager);
+			goto update_status;
+		} else {
+			dev_err_ratelimited(amd_manager->dev,
+					    "Device0 detected after %d iterations\n",
+					    retry_count);
+		}
+	}
+}
+
+static void amd_sdw_update_slave_status(u32 status_change_0to7, u32 status_change_8to11,
+					struct amd_sdw_manager *amd_manager)
+{
+	u64 slave_stat;
+	u32 val;
+	int dev_index;
+
+	if (status_change_0to7 == AMD_SDW_SLAVE_0_ATTACHED)
+		memset(amd_manager->status, 0, sizeof(amd_manager->status));
+	slave_stat = status_change_0to7;
+	slave_stat |= FIELD_GET(AMD_SDW_MCP_SLAVE_STATUS_8TO_11, status_change_8to11) << 32;
+	dev_dbg(amd_manager->dev, "status_change_0to7:0x%x status_change_8to11:0x%x\n",
+		status_change_0to7, status_change_8to11);
+	if (slave_stat) {
+		for (dev_index = 0; dev_index <= SDW_MAX_DEVICES; ++dev_index) {
+			if (slave_stat & AMD_SDW_MCP_SLAVE_STATUS_VALID_MASK(dev_index)) {
+				val = (slave_stat >> AMD_SDW_MCP_SLAVE_STAT_SHIFT_MASK(dev_index)) &
+				      AMD_SDW_MCP_SLAVE_STATUS_MASK;
+				amd_sdw_fill_slave_status(amd_manager, dev_index, val);
+			}
+		}
+	}
+}
+
+static void amd_sdw_irq_thread(struct work_struct *work)
+{
+	struct amd_sdw_manager *amd_manager =
+			container_of(work, struct amd_sdw_manager, amd_sdw_irq_thread);
+	u32 status_change_8to11;
+	u32 status_change_0to7;
+
+	status_change_8to11 = readl(amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_8TO11);
+	status_change_0to7 = readl(amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_0TO7);
+	dev_dbg(amd_manager->dev, "[SDW%d] SDW INT: 0to7=0x%x, 8to11=0x%x\n",
+		amd_manager->instance, status_change_0to7, status_change_8to11);
+	if (status_change_8to11 & AMD_SDW_PREQ_INTR_STAT) {
+		amd_sdw_read_and_process_ping_status(amd_manager);
+	} else {
+		/* Check for the updated status on peripheral device */
+		amd_sdw_update_slave_status(status_change_0to7, status_change_8to11, amd_manager);
+	}
+	if (status_change_8to11 || status_change_0to7)
+		schedule_work(&amd_manager->amd_sdw_work);
+	writel(0x00, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_8TO11);
+	writel(0x00, amd_manager->mmio + ACP_SW_STATE_CHANGE_STATUS_0TO7);
+}
+
 static void amd_sdw_probe_work(struct work_struct *work)
 {
 	struct amd_sdw_manager *amd_manager = container_of(work, struct amd_sdw_manager,
@@ -815,6 +939,8 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 		return ret;
 	}
 	dev_set_drvdata(dev, amd_manager);
+	INIT_WORK(&amd_manager->amd_sdw_irq_thread, amd_sdw_irq_thread);
+	INIT_WORK(&amd_manager->amd_sdw_work, amd_sdw_update_slave_status_work);
 	INIT_WORK(&amd_manager->probe_work, amd_sdw_probe_work);
 	/*
 	 * Instead of having lengthy probe sequence, use deferred probe.
diff --git a/drivers/soundwire/amd_manager.h b/drivers/soundwire/amd_manager.h
index c92e0dee2cb1..fca7d792bf5f 100644
--- a/drivers/soundwire/amd_manager.h
+++ b/drivers/soundwire/amd_manager.h
@@ -185,6 +185,7 @@
 #define AMD_SDW1_PAD_KEEPER_EN_MASK			0x10
 #define AMD_SDW0_PAD_KEEPER_DISABLE_MASK		0x1e
 #define AMD_SDW1_PAD_KEEPER_DISABLE_MASK		0xf
+#define AMD_SDW_PREQ_INTR_STAT				BIT(19)
 
 static u32 amd_sdw_freq_tbl[AMD_SDW_MAX_FREQ_NUM] = {
 	AMD_SDW_DEFAULT_CLK_FREQ,
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index ac537419301d..df60bc0de6fc 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -45,8 +45,11 @@ struct sdw_amd_dai_runtime {
  * @mmio: SoundWire registers mmio base
  * @acp_mmio: acp registers mmio base
  * @reg_mask: register mask structure per manager instance
+ * @amd_sdw_irq_thread: SoundWire manager irq workqueue
+ * @amd_sdw_work: peripheral status work queue
  * @probe_work: SoundWire manager probe workqueue
  * @acp_sdw_lock: mutex to protect acp share register access
+ * @status: peripheral devices status array
  * @num_din_ports: number of input ports
  * @num_dout_ports: number of output ports
  * @cols_index: Column index in frame shape
@@ -65,10 +68,14 @@ struct amd_sdw_manager {
 	void __iomem *acp_mmio;
 
 	struct sdw_manager_reg_mask *reg_mask;
+	struct work_struct amd_sdw_irq_thread;
+	struct work_struct amd_sdw_work;
 	struct work_struct probe_work;
 	/* mutex to protect acp common register access */
 	struct mutex *acp_sdw_lock;
 
+	enum sdw_slave_status status[SDW_MAX_DEVICES + 1];
+
 	int num_din_ports;
 	int num_dout_ports;
 
-- 
cgit v1.2.3


From 81ff58ff71ad9dcddf5caffcf912cde6589d07bd Mon Sep 17 00:00:00 2001
From: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Date: Tue, 21 Mar 2023 10:38:59 +0530
Subject: soundwire: amd: add runtime pm ops for AMD SoundWire manager driver

Add support for runtime pm ops for AMD SoundWire manager driver.

Signed-off-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Signed-off-by: Mastan Katragadda <Mastan.Katragadda@amd.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/lkml/20230227154801.50319-7-Vijendar.Mukunda@amd.com
Link: https://lore.kernel.org/r/20230321050901.115439-7-Vijendar.Mukunda@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 140 ++++++++++++++++++++++++++++++++++++++
 drivers/soundwire/amd_manager.h   |   3 +
 include/linux/soundwire/sdw_amd.h |  17 +++++
 3 files changed, 160 insertions(+)

(limited to 'include')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index 49b7133cb41f..6172b3cb84d4 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/soundwire/sdw.h>
 #include <linux/soundwire/sdw_registers.h>
+#include <linux/pm_runtime.h>
 #include <linux/wait.h>
 #include <sound/pcm_params.h>
 #include <sound/soc.h>
@@ -133,6 +134,12 @@ static void amd_disable_sdw_interrupts(struct amd_sdw_manager *amd_manager)
 	writel(0x00, amd_manager->mmio + ACP_SW_ERROR_INTR_MASK);
 }
 
+static int amd_deinit_sdw_manager(struct amd_sdw_manager *amd_manager)
+{
+	amd_disable_sdw_interrupts(amd_manager);
+	return amd_disable_sdw_manager(amd_manager);
+}
+
 static void amd_sdw_set_frameshape(struct amd_sdw_manager *amd_manager)
 {
 	u32 frame_size;
@@ -866,6 +873,12 @@ static void amd_sdw_probe_work(struct work_struct *work)
 			return;
 		amd_sdw_set_frameshape(amd_manager);
 	}
+	/* Enable runtime PM */
+	pm_runtime_set_autosuspend_delay(amd_manager->dev, AMD_SDW_MASTER_SUSPEND_DELAY_MS);
+	pm_runtime_use_autosuspend(amd_manager->dev);
+	pm_runtime_mark_last_busy(amd_manager->dev);
+	pm_runtime_set_active(amd_manager->dev);
+	pm_runtime_enable(amd_manager->dev);
 }
 
 static int amd_sdw_manager_probe(struct platform_device *pdev)
@@ -953,17 +966,144 @@ static int amd_sdw_manager_remove(struct platform_device *pdev)
 {
 	struct amd_sdw_manager *amd_manager = dev_get_drvdata(&pdev->dev);
 
+	pm_runtime_disable(&pdev->dev);
 	cancel_work_sync(&amd_manager->probe_work);
 	amd_disable_sdw_interrupts(amd_manager);
 	sdw_bus_master_delete(&amd_manager->bus);
 	return amd_disable_sdw_manager(amd_manager);
 }
 
+static int amd_sdw_clock_stop(struct amd_sdw_manager *amd_manager)
+{
+	u32 val;
+	int ret;
+
+	ret = sdw_bus_prep_clk_stop(&amd_manager->bus);
+	if (ret < 0 && ret != -ENODATA) {
+		dev_err(amd_manager->dev, "prepare clock stop failed %d", ret);
+		return 0;
+	}
+	ret = sdw_bus_clk_stop(&amd_manager->bus);
+	if (ret < 0 && ret != -ENODATA) {
+		dev_err(amd_manager->dev, "bus clock stop failed %d", ret);
+		return 0;
+	}
+
+	ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL, val,
+				 (val & AMD_SDW_CLK_STOP_DONE), ACP_DELAY_US, AMD_SDW_TIMEOUT);
+	if (ret) {
+		dev_err(amd_manager->dev, "SDW%x clock stop failed\n", amd_manager->instance);
+		return 0;
+	}
+
+	amd_manager->clk_stopped = true;
+	if (amd_manager->wake_en_mask)
+		writel(0x01, amd_manager->acp_mmio + ACP_SW_WAKE_EN(amd_manager->instance));
+
+	dev_dbg(amd_manager->dev, "SDW%x clock stop successful\n", amd_manager->instance);
+	return 0;
+}
+
+static int amd_sdw_clock_stop_exit(struct amd_sdw_manager *amd_manager)
+{
+	int ret;
+	u32 val;
+
+	if (amd_manager->clk_stopped) {
+		val = readl(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+		val |= AMD_SDW_CLK_RESUME_REQ;
+		writel(val, amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+		ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL, val,
+					 (val & AMD_SDW_CLK_RESUME_DONE), ACP_DELAY_US,
+					 AMD_SDW_TIMEOUT);
+		if (val & AMD_SDW_CLK_RESUME_DONE) {
+			writel(0, amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+			ret = sdw_bus_exit_clk_stop(&amd_manager->bus);
+			if (ret < 0)
+				dev_err(amd_manager->dev, "bus failed to exit clock stop %d\n",
+					ret);
+			amd_manager->clk_stopped = false;
+		}
+	}
+	if (amd_manager->clk_stopped) {
+		dev_err(amd_manager->dev, "SDW%x clock stop exit failed\n", amd_manager->instance);
+		return 0;
+	}
+	dev_dbg(amd_manager->dev, "SDW%x clock stop exit successful\n", amd_manager->instance);
+	return 0;
+}
+
+static int __maybe_unused amd_suspend_runtime(struct device *dev)
+{
+	struct amd_sdw_manager *amd_manager = dev_get_drvdata(dev);
+	struct sdw_bus *bus = &amd_manager->bus;
+	int ret;
+
+	if (bus->prop.hw_disabled) {
+		dev_dbg(bus->dev, "SoundWire manager %d is disabled,\n",
+			bus->link_id);
+		return 0;
+	}
+	if (amd_manager->power_mode_mask & AMD_SDW_CLK_STOP_MODE) {
+		return amd_sdw_clock_stop(amd_manager);
+	} else if (amd_manager->power_mode_mask & AMD_SDW_POWER_OFF_MODE) {
+		ret = amd_sdw_clock_stop(amd_manager);
+		if (ret)
+			return ret;
+		return amd_deinit_sdw_manager(amd_manager);
+	}
+	return 0;
+}
+
+static int __maybe_unused amd_resume_runtime(struct device *dev)
+{
+	struct amd_sdw_manager *amd_manager = dev_get_drvdata(dev);
+	struct sdw_bus *bus = &amd_manager->bus;
+	int ret;
+	u32 val;
+
+	if (bus->prop.hw_disabled) {
+		dev_dbg(bus->dev, "SoundWire manager %d is disabled, ignoring\n",
+			bus->link_id);
+		return 0;
+	}
+
+	if (amd_manager->power_mode_mask & AMD_SDW_CLK_STOP_MODE) {
+		return amd_sdw_clock_stop_exit(amd_manager);
+	} else if (amd_manager->power_mode_mask & AMD_SDW_POWER_OFF_MODE) {
+		val = readl(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+		if (val) {
+			val |= AMD_SDW_CLK_RESUME_REQ;
+			writel(val, amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+			ret = readl_poll_timeout(amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL, val,
+						 (val & AMD_SDW_CLK_RESUME_DONE), ACP_DELAY_US,
+						 AMD_SDW_TIMEOUT);
+			if (val & AMD_SDW_CLK_RESUME_DONE) {
+				writel(0, amd_manager->mmio + ACP_SW_CLK_RESUME_CTRL);
+				amd_manager->clk_stopped = false;
+			}
+		}
+		sdw_clear_slave_status(bus, SDW_UNATTACH_REQUEST_MASTER_RESET);
+		amd_init_sdw_manager(amd_manager);
+		amd_enable_sdw_interrupts(amd_manager);
+		ret = amd_enable_sdw_manager(amd_manager);
+		if (ret)
+			return ret;
+		amd_sdw_set_frameshape(amd_manager);
+	}
+	return 0;
+}
+
+static const struct dev_pm_ops amd_pm = {
+	SET_RUNTIME_PM_OPS(amd_suspend_runtime, amd_resume_runtime, NULL)
+};
+
 static struct platform_driver amd_sdw_driver = {
 	.probe	= &amd_sdw_manager_probe,
 	.remove = &amd_sdw_manager_remove,
 	.driver = {
 		.name	= "amd_sdw_manager",
+		.pm = &amd_pm,
 	}
 };
 module_platform_driver(amd_sdw_driver);
diff --git a/drivers/soundwire/amd_manager.h b/drivers/soundwire/amd_manager.h
index fca7d792bf5f..b101f4536230 100644
--- a/drivers/soundwire/amd_manager.h
+++ b/drivers/soundwire/amd_manager.h
@@ -186,6 +186,9 @@
 #define AMD_SDW0_PAD_KEEPER_DISABLE_MASK		0x1e
 #define AMD_SDW1_PAD_KEEPER_DISABLE_MASK		0xf
 #define AMD_SDW_PREQ_INTR_STAT				BIT(19)
+#define AMD_SDW_CLK_STOP_DONE				1
+#define AMD_SDW_CLK_RESUME_REQ				2
+#define AMD_SDW_CLK_RESUME_DONE				3
 
 static u32 amd_sdw_freq_tbl[AMD_SDW_MAX_FREQ_NUM] = {
 	AMD_SDW_DEFAULT_CLK_FREQ,
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index df60bc0de6fc..ceecad74aef9 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -8,6 +8,21 @@
 
 #include <linux/soundwire/sdw.h>
 
+/* AMD pm_runtime quirk definitions */
+
+/*
+ * Force the clock to stop(ClockStopMode0) when suspend callback
+ * is invoked.
+ */
+#define AMD_SDW_CLK_STOP_MODE		1
+
+/*
+ * Stop the bus when runtime suspend/system level suspend callback
+ * is invoked. If set, a complete bus reset and re-enumeration will
+ * be performed when the bus restarts. In-band wake interrupts are
+ * not supported in this mode.
+ */
+#define AMD_SDW_POWER_OFF_MODE		2
 #define ACP_SDW0	0
 #define ACP_SDW1	1
 
@@ -57,6 +72,7 @@ struct sdw_amd_dai_runtime {
  * @instance: SoundWire manager instance
  * @quirks: SoundWire manager quirks
  * @wake_en_mask: wake enable mask per SoundWire manager
+ * @clk_stopped: flag set to true when clock is stopped
  * @power_mode_mask: flag interprets amd SoundWire manager power mode
  * @dai_runtime_array: dai runtime array
  */
@@ -86,6 +102,7 @@ struct amd_sdw_manager {
 	u32 quirks;
 	u32 wake_en_mask;
 	u32 power_mode_mask;
+	bool clk_stopped;
 
 	struct sdw_amd_dai_runtime **dai_runtime_array;
 };
-- 
cgit v1.2.3


From 13186dae182ab1a2a52a53424672f49cf3e81f9b Mon Sep 17 00:00:00 2001
From: Benjamin Bara <benjamin.bara@skidata.com>
Date: Wed, 5 Apr 2023 19:14:34 +0200
Subject: regulator: da9063: add voltage monitoring registers

Add the definitions for the registers responsible for voltage
monitoring. Add a voltage monitor enable bitfield per regulator.

Reviewed-by: Matti Vaittinen <mazziesaccount@gmail.com>
Reviewed-by: Adam Ward <DLG-Adam.Ward.opensource@dm.renesas.com>
Signed-off-by: Benjamin Bara <benjamin.bara@skidata.com>
Link: https://lore.kernel.org/r/20230403-da9063-disable-unused-v3-1-cc4dc698864c@skidata.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/da9063-regulator.c | 29 +++++++++++++++++++++++++++++
 include/linux/mfd/da9063/registers.h | 23 +++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c
index e092e4df86ab..7d204b1d6b3f 100644
--- a/drivers/regulator/da9063-regulator.c
+++ b/drivers/regulator/da9063-regulator.c
@@ -83,6 +83,9 @@ struct da9063_regulator_info {
 
 	/* DA9063 event detection bit */
 	struct reg_field oc_event;
+
+	/* DA9063 voltage monitor bit */
+	struct reg_field vmon;
 };
 
 /* Macros for LDO */
@@ -148,6 +151,7 @@ struct da9063_regulator {
 	struct regmap_field			*suspend;
 	struct regmap_field			*sleep;
 	struct regmap_field			*suspend_sleep;
+	struct regmap_field			*vmon;
 };
 
 /* Encapsulates all information for the regulators driver */
@@ -581,36 +585,42 @@ static const struct da9063_regulator_info da9063_regulator_info[] = {
 			    da9063_buck_a_limits,
 			    DA9063_REG_BUCK_ILIM_C, DA9063_BCORE1_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BCORE1),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BCORE1_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BCORE2, 300, 10, 1570,
 			    da9063_buck_a_limits,
 			    DA9063_REG_BUCK_ILIM_C, DA9063_BCORE2_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BCORE2),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BCORE2_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BPRO, 530, 10, 1800,
 			    da9063_buck_a_limits,
 			    DA9063_REG_BUCK_ILIM_B, DA9063_BPRO_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BPRO),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BPRO_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BMEM, 800, 20, 3340,
 			    da9063_buck_b_limits,
 			    DA9063_REG_BUCK_ILIM_A, DA9063_BMEM_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BMEM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BMEM_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BIO, 800, 20, 3340,
 			    da9063_buck_b_limits,
 			    DA9063_REG_BUCK_ILIM_A, DA9063_BIO_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BIO),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BIO_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BPERI, 800, 20, 3340,
 			    da9063_buck_b_limits,
 			    DA9063_REG_BUCK_ILIM_B, DA9063_BPERI_ILIM_MASK),
 		DA9063_BUCK_COMMON_FIELDS(BPERI),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BPERI_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BCORES_MERGED, 300, 10, 1570,
@@ -618,6 +628,7 @@ static const struct da9063_regulator_info da9063_regulator_info[] = {
 			    DA9063_REG_BUCK_ILIM_C, DA9063_BCORE1_ILIM_MASK),
 		/* BCORES_MERGED uses the same register fields as BCORE1 */
 		DA9063_BUCK_COMMON_FIELDS(BCORE1),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BCORE1_MON_EN),
 	},
 	{
 		DA9063_BUCK(DA9063, BMEM_BIO_MERGED, 800, 20, 3340,
@@ -625,47 +636,59 @@ static const struct da9063_regulator_info da9063_regulator_info[] = {
 			    DA9063_REG_BUCK_ILIM_A, DA9063_BMEM_ILIM_MASK),
 		/* BMEM_BIO_MERGED uses the same register fields as BMEM */
 		DA9063_BUCK_COMMON_FIELDS(BMEM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_4, DA9063_BMEM_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO3, 900, 20, 3440),
 		.oc_event = BFIELD(DA9063_REG_STATUS_D, DA9063_LDO3_LIM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO3_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO7, 900, 50, 3600),
 		.oc_event = BFIELD(DA9063_REG_STATUS_D, DA9063_LDO7_LIM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO7_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO8, 900, 50, 3600),
 		.oc_event = BFIELD(DA9063_REG_STATUS_D, DA9063_LDO8_LIM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO8_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO9, 950, 50, 3600),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_3, DA9063_LDO9_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO11, 900, 50, 3600),
 		.oc_event = BFIELD(DA9063_REG_STATUS_D, DA9063_LDO11_LIM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_3, DA9063_LDO11_MON_EN),
 	},
 
 	/* The following LDOs are present only on DA9063, not on DA9063L */
 	{
 		DA9063_LDO(DA9063, LDO1, 600, 20, 1860),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO1_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO2, 600, 20, 1860),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO2_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO4, 900, 20, 3440),
 		.oc_event = BFIELD(DA9063_REG_STATUS_D, DA9063_LDO4_LIM),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO4_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO5, 900, 50, 3600),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO5_MON_EN),
 	},
 	{
 		DA9063_LDO(DA9063, LDO6, 900, 50, 3600),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_2, DA9063_LDO6_MON_EN),
 	},
 
 	{
 		DA9063_LDO(DA9063, LDO10, 900, 50, 3600),
+		.vmon = BFIELD(DA9063_BB_REG_MON_REG_3, DA9063_LDO10_MON_EN),
 	},
 };
 
@@ -932,6 +955,12 @@ static int da9063_regulator_probe(struct platform_device *pdev)
 			if (IS_ERR(regl->suspend_sleep))
 				return PTR_ERR(regl->suspend_sleep);
 		}
+		if (regl->info->vmon.reg) {
+			regl->vmon = devm_regmap_field_alloc(&pdev->dev,
+				da9063->regmap, regl->info->vmon);
+			if (IS_ERR(regl->vmon))
+				return PTR_ERR(regl->vmon);
+		}
 
 		/* Register regulator */
 		memset(&config, 0, sizeof(config));
diff --git a/include/linux/mfd/da9063/registers.h b/include/linux/mfd/da9063/registers.h
index 6e0f66a2e727..7b8364bd08a0 100644
--- a/include/linux/mfd/da9063/registers.h
+++ b/include/linux/mfd/da9063/registers.h
@@ -1040,6 +1040,29 @@
 /* DA9063_REG_CONFIG_J (addr=0x10F) */
 #define DA9063_TWOWIRE_TO			0x40
 
+/* DA9063_REG_MON_REG_2 (addr=0x115) */
+#define DA9063_LDO1_MON_EN			0x01
+#define DA9063_LDO2_MON_EN			0x02
+#define DA9063_LDO3_MON_EN			0x04
+#define DA9063_LDO4_MON_EN			0x08
+#define DA9063_LDO5_MON_EN			0x10
+#define DA9063_LDO6_MON_EN			0x20
+#define DA9063_LDO7_MON_EN			0x40
+#define DA9063_LDO8_MON_EN			0x80
+
+/* DA9063_REG_MON_REG_3 (addr=0x116) */
+#define DA9063_LDO9_MON_EN			0x01
+#define DA9063_LDO10_MON_EN			0x02
+#define DA9063_LDO11_MON_EN			0x04
+
+/* DA9063_REG_MON_REG_4 (addr=0x117) */
+#define DA9063_BCORE1_MON_EN			0x04
+#define DA9063_BCORE2_MON_EN			0x08
+#define DA9063_BPRO_MON_EN			0x10
+#define DA9063_BIO_MON_EN			0x20
+#define DA9063_BMEM_MON_EN			0x40
+#define DA9063_BPERI_MON_EN			0x80
+
 /* DA9063_REG_MON_REG_5 (addr=0x116) */
 #define DA9063_MON_A8_IDX_MASK			0x07
 #define		DA9063_MON_A8_IDX_NONE		0x00
-- 
cgit v1.2.3


From 7d6d2dd326a8a8d32091e9748f3428dd3be68367 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Wed, 5 Apr 2023 22:07:26 +0200
Subject: mmc: sdio: add Realtek SDIO vendor ID and various wifi device IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the SDIO vendor ID for Realtek and some device IDs extracted from
their GPL vendor driver. This will be useful in the future when the
rtw88 driver gains support for these chips.

Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Reviewed-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20230405200729.632435-7-martin.blumenstingl@googlemail.com
---
 include/linux/mmc/sdio_ids.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 0e4ef9c5127a..66f503ed2448 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -112,6 +112,15 @@
 #define SDIO_VENDOR_ID_MICROCHIP_WILC		0x0296
 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000	0x5347
 
+#define SDIO_VENDOR_ID_REALTEK			0x024c
+#define SDIO_DEVICE_ID_REALTEK_RTW8723BS	0xb723
+#define SDIO_DEVICE_ID_REALTEK_RTW8821BS	0xb821
+#define SDIO_DEVICE_ID_REALTEK_RTW8822BS	0xb822
+#define SDIO_DEVICE_ID_REALTEK_RTW8821CS	0xc821
+#define SDIO_DEVICE_ID_REALTEK_RTW8822CS	0xc822
+#define SDIO_DEVICE_ID_REALTEK_RTW8723DS	0xd723
+#define SDIO_DEVICE_ID_REALTEK_RTW8821DS	0xd821
+
 #define SDIO_VENDOR_ID_SIANO			0x039a
 #define SDIO_DEVICE_ID_SIANO_NOVA_B0		0x0201
 #define SDIO_DEVICE_ID_SIANO_NICE		0x0202
-- 
cgit v1.2.3


From 59322d35179987e85b593e504fd334de8683c835 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 11 Apr 2023 16:25:28 +0100
Subject: ASoC: cs35l56: Re-patch firmware after system suspend

Check during cs35l56_system_resume() whether the firmware patch must
be applied again.

The FIRMWARE_MISSING flag in the PROTECTION_STATUS register indicates
whether the firmware has been patched.

In non-secure mode the FIRMWARE_MISSING flag is cleared at the end of
dsp_work(). If it is set after system-resume we know that dsp_work()
must be run again.

In secure mode the pre-OS loader will have done the secure patching
and cleared the FIRMWARE_MISSING flag. So this flag does not tell us
whether firmware memory was lost. But the driver could only be
downloading non-secure tunings, which is always safe to do.

If the driver has control of RESET we will have asserted it during
suspend so the firmware patch will have been lost. The driver would only
have control of RESET in non-secure mode.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/168122674550.26.8545058503709956172@mailman-core.alsa-project.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h        |  4 +++
 sound/soc/codecs/cs35l56-sdw.c | 12 +++++++-
 sound/soc/codecs/cs35l56.c     | 67 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 5f8ea2dfaa21..b3300bce74f4 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -95,6 +95,7 @@
 #define CS35L56_MAIN_RENDER_USER_MUTE			0x3400024
 #define CS35L56_MAIN_RENDER_USER_VOLUME			0x340002C
 #define CS35L56_MAIN_POSTURE_NUMBER			0x3400094
+#define CS35L56_PROTECTION_STATUS			0x34000D8
 #define CS35L56_TRANSDUCER_ACTUAL_PS			0x3400150
 #define CS35L56_DSP1_YMEM_UNPACKED24_6141		0x3405FF4
 #define CS35L56_DSP1_PMEM_0				0x3800000
@@ -216,6 +217,9 @@
 #define CS35L56_MAIN_POSTURE_MAX			255
 #define CS35L56_MAIN_POSTURE_MASK			CS35L56_MAIN_POSTURE_MAX
 
+/* CS35L56_PROTECTION_STATUS */
+#define CS35L56_FIRMWARE_MISSING			BIT(0)
+
 /* Software Values */
 #define CS35L56_HALO_STATE_SHUTDOWN			1
 #define CS35L56_HALO_STATE_BOOT_DONE			2
diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c
index 947d4e5f4dc9..e759347423cf 100644
--- a/sound/soc/codecs/cs35l56-sdw.c
+++ b/sound/soc/codecs/cs35l56-sdw.c
@@ -473,6 +473,16 @@ static int __maybe_unused cs35l56_sdw_system_suspend(struct device *dev)
 	return cs35l56_system_suspend(dev);
 }
 
+static int __maybe_unused cs35l56_sdw_system_resume(struct device *dev)
+{
+	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
+
+	cs35l56->sdw_irq_no_unmask = false;
+	/* runtime_resume re-enables the interrupt */
+
+	return cs35l56_system_resume(dev);
+}
+
 static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_device_id *id)
 {
 	struct device *dev = &peripheral->dev;
@@ -522,7 +532,7 @@ static int cs35l56_sdw_remove(struct sdw_slave *peripheral)
 
 static const struct dev_pm_ops cs35l56_sdw_pm = {
 	SET_RUNTIME_PM_OPS(cs35l56_sdw_runtime_suspend, cs35l56_sdw_runtime_resume, NULL)
-	SYSTEM_SLEEP_PM_OPS(cs35l56_sdw_system_suspend, cs35l56_system_resume)
+	SYSTEM_SLEEP_PM_OPS(cs35l56_sdw_system_suspend, cs35l56_sdw_system_resume)
 	LATE_SYSTEM_SLEEP_PM_OPS(cs35l56_system_suspend_late, cs35l56_system_resume_early)
 	/* NOIRQ stage not needed, SoundWire doesn't use a hard IRQ */
 };
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index eb85c27ab087..18e341744839 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -946,6 +946,7 @@ static void cs35l56_dsp_work(struct work_struct *work)
 		goto err_unlock;
 	}
 
+	regmap_clear_bits(cs35l56->regmap, CS35L56_PROTECTION_STATUS, CS35L56_FIRMWARE_MISSING);
 	cs35l56->fw_patched = true;
 
 err_unlock:
@@ -1026,6 +1027,8 @@ static const struct snd_soc_component_driver soc_component_dev_cs35l56 = {
 	.num_controls = ARRAY_SIZE(cs35l56_controls),
 
 	.set_bias_level = cs35l56_set_bias_level,
+
+	.suspend_bias_off = 1, /* see cs35l56_system_resume() */
 };
 
 static const struct reg_sequence cs35l56_hibernate_seq[] = {
@@ -1156,6 +1159,47 @@ err:
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_runtime_resume_common, SND_SOC_CS35L56_CORE);
 
+static int cs35l56_is_fw_reload_needed(struct cs35l56_private *cs35l56)
+{
+	unsigned int val;
+	int ret;
+
+	/* Nothing to re-patch if we haven't done any patching yet. */
+	if (!cs35l56->fw_patched)
+		return false;
+
+	/*
+	 * If we have control of RESET we will have asserted it so the firmware
+	 * will need re-patching.
+	 */
+	if (cs35l56->reset_gpio)
+		return true;
+
+	/*
+	 * In secure mode FIRMWARE_MISSING is cleared by the BIOS loader so
+	 * can't be used here to test for memory retention.
+	 * Assume that tuning must be re-loaded.
+	 */
+	if (cs35l56->secured)
+		return true;
+
+	ret = pm_runtime_resume_and_get(cs35l56->dev);
+	if (ret) {
+		dev_err(cs35l56->dev, "Failed to runtime_get: %d\n", ret);
+		return ret;
+	}
+
+	ret = regmap_read(cs35l56->regmap, CS35L56_PROTECTION_STATUS, &val);
+	if (ret)
+		dev_err(cs35l56->dev, "Failed to read PROTECTION_STATUS: %d\n", ret);
+	else
+		ret = !!(val & CS35L56_FIRMWARE_MISSING);
+
+	pm_runtime_put_autosuspend(cs35l56->dev);
+
+	return ret;
+}
+
 int cs35l56_system_suspend(struct device *dev)
 {
 	struct cs35l56_private *cs35l56 = dev_get_drvdata(dev);
@@ -1273,7 +1317,28 @@ int cs35l56_system_resume(struct device *dev)
 	if (cs35l56->irq)
 		enable_irq(cs35l56->irq);
 
-	return ret;
+	if (ret)
+		return ret;
+
+	/* Firmware won't have been loaded if the component hasn't probed */
+	if (!cs35l56->component)
+		return 0;
+
+	ret = cs35l56_is_fw_reload_needed(cs35l56);
+	dev_dbg(cs35l56->dev, "fw_reload_needed: %d\n", ret);
+	if (ret < 1)
+		return ret;
+
+	cs35l56->fw_patched = false;
+	init_completion(&cs35l56->dsp_ready_completion);
+	queue_work(cs35l56->dsp_wq, &cs35l56->dsp_work);
+
+	/*
+	 * suspend_bias_off ensures we are now in BIAS_OFF so there will be
+	 * a BIAS_OFF->BIAS_STANDBY transition to complete dsp patching.
+	 */
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(cs35l56_system_resume);
 
-- 
cgit v1.2.3


From 244da66cda359227d80ccb41dbcb99da40eae186 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:30 -0700
Subject: dmaengine: idxd: setup event log configuration

Add setup of event log feature for supported device. Event log addresses
error reporting that was lacking in gen 1 DSA devices where a second error
event does not get reported when a first event is pending software
handling. The event log allows a circular buffer that the device can push
error events to. It is up to the user to create a large enough event log
ring in order to capture the expected events. The evl size can be set in
the device sysfs attribute. By default 64 entries are supported as minimal
when event log is enabled.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-4-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/device.c    | 89 +++++++++++++++++++++++++++++++++++++++++++-
 drivers/dma/idxd/idxd.h      | 19 ++++++++++
 drivers/dma/idxd/init.c      |  1 +
 drivers/dma/idxd/registers.h | 72 ++++++++++++++++++++++++++++++++++-
 drivers/dma/idxd/sysfs.c     |  3 +-
 include/uapi/linux/idxd.h    |  1 +
 6 files changed, 181 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 5f321f3b4242..230fe9bb56ae 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -752,6 +752,83 @@ void idxd_device_clear_state(struct idxd_device *idxd)
 	spin_unlock(&idxd->dev_lock);
 }
 
+static int idxd_device_evl_setup(struct idxd_device *idxd)
+{
+	union gencfg_reg gencfg;
+	union evlcfg_reg evlcfg;
+	union genctrl_reg genctrl;
+	struct device *dev = &idxd->pdev->dev;
+	void *addr;
+	dma_addr_t dma_addr;
+	int size;
+	struct idxd_evl *evl = idxd->evl;
+
+	if (!evl)
+		return 0;
+
+	size = evl_size(idxd);
+	/*
+	 * Address needs to be page aligned. However, dma_alloc_coherent() provides
+	 * at minimal page size aligned address. No manual alignment required.
+	 */
+	addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	memset(addr, 0, size);
+
+	spin_lock(&evl->lock);
+	evl->log = addr;
+	evl->dma = dma_addr;
+	evl->log_size = size;
+
+	memset(&evlcfg, 0, sizeof(evlcfg));
+	evlcfg.bits[0] = dma_addr & GENMASK(63, 12);
+	evlcfg.size = evl->size;
+
+	iowrite64(evlcfg.bits[0], idxd->reg_base + IDXD_EVLCFG_OFFSET);
+	iowrite64(evlcfg.bits[1], idxd->reg_base + IDXD_EVLCFG_OFFSET + 8);
+
+	genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET);
+	genctrl.evl_int_en = 1;
+	iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET);
+
+	gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET);
+	gencfg.evl_en = 1;
+	iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET);
+
+	spin_unlock(&evl->lock);
+	return 0;
+}
+
+static void idxd_device_evl_free(struct idxd_device *idxd)
+{
+	union gencfg_reg gencfg;
+	union genctrl_reg genctrl;
+	struct device *dev = &idxd->pdev->dev;
+	struct idxd_evl *evl = idxd->evl;
+
+	gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET);
+	if (!gencfg.evl_en)
+		return;
+
+	spin_lock(&evl->lock);
+	gencfg.evl_en = 0;
+	iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET);
+
+	genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET);
+	genctrl.evl_int_en = 0;
+	iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET);
+
+	iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET);
+	iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8);
+
+	dma_free_coherent(dev, evl->log_size, evl->log, evl->dma);
+	evl->log = NULL;
+	evl->size = IDXD_EVL_SIZE_MIN;
+	spin_unlock(&evl->lock);
+}
+
 static void idxd_group_config_write(struct idxd_group *group)
 {
 	struct idxd_device *idxd = group->idxd;
@@ -1451,15 +1528,24 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev)
 	if (rc < 0)
 		return -ENXIO;
 
+	rc = idxd_device_evl_setup(idxd);
+	if (rc < 0) {
+		idxd->cmd_status = IDXD_SCMD_DEV_EVL_ERR;
+		return rc;
+	}
+
 	/* Start device */
 	rc = idxd_device_enable(idxd);
-	if (rc < 0)
+	if (rc < 0) {
+		idxd_device_evl_free(idxd);
 		return rc;
+	}
 
 	/* Setup DMA device without channels */
 	rc = idxd_register_dma_device(idxd);
 	if (rc < 0) {
 		idxd_device_disable(idxd);
+		idxd_device_evl_free(idxd);
 		idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR;
 		return rc;
 	}
@@ -1488,6 +1574,7 @@ void idxd_device_drv_remove(struct idxd_dev *idxd_dev)
 	idxd_device_disable(idxd);
 	if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
 		idxd_device_reset(idxd);
+	idxd_device_evl_free(idxd);
 }
 
 static enum idxd_dev_type dev_types[] = {
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 2a71273f1822..c74681f02b18 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -262,7 +262,15 @@ struct idxd_driver_data {
 };
 
 struct idxd_evl {
+	/* Lock to protect event log access. */
+	spinlock_t lock;
+	void *log;
+	dma_addr_t dma;
+	/* Total size of event log = number of entries * entry size. */
+	unsigned int log_size;
+	/* The number of entries in the event log. */
 	u16 size;
+	u16 head;
 };
 
 struct idxd_device {
@@ -324,6 +332,17 @@ struct idxd_device {
 	struct idxd_evl *evl;
 };
 
+static inline unsigned int evl_ent_size(struct idxd_device *idxd)
+{
+	return idxd->hw.gen_cap.evl_support ?
+	       (32 * (1 << idxd->hw.gen_cap.evl_support)) : 0;
+}
+
+static inline unsigned int evl_size(struct idxd_device *idxd)
+{
+	return idxd->evl->size * evl_ent_size(idxd);
+}
+
 /* IDXD software descriptor */
 struct idxd_desc {
 	union {
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 525de59df82a..f44719a11c95 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -343,6 +343,7 @@ static int idxd_init_evl(struct idxd_device *idxd)
 	if (!evl)
 		return -ENOMEM;
 
+	spin_lock_init(&evl->lock);
 	evl->size = IDXD_EVL_SIZE_MIN;
 	idxd->evl = evl;
 	return 0;
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index ea3a499a3c3c..11bb97cf7481 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -3,6 +3,8 @@
 #ifndef _IDXD_REGISTERS_H_
 #define _IDXD_REGISTERS_H_
 
+#include <uapi/linux/idxd.h>
+
 /* PCI Config */
 #define PCI_DEVICE_ID_INTEL_DSA_SPR0	0x0b25
 #define PCI_DEVICE_ID_INTEL_IAX_SPR0	0x0cfe
@@ -119,7 +121,8 @@ union gencfg_reg {
 		u32 rdbuf_limit:8;
 		u32 rsvd:4;
 		u32 user_int_en:1;
-		u32 rsvd2:19;
+		u32 evl_en:1;
+		u32 rsvd2:18;
 	};
 	u32 bits;
 } __packed;
@@ -129,7 +132,8 @@ union genctrl_reg {
 	struct {
 		u32 softerr_int_en:1;
 		u32 halt_int_en:1;
-		u32 rsvd:30;
+		u32 evl_int_en:1;
+		u32 rsvd:29;
 	};
 	u32 bits;
 } __packed;
@@ -299,6 +303,21 @@ union iaa_cap_reg {
 
 #define IDXD_IAACAP_OFFSET	0x180
 
+#define IDXD_EVLCFG_OFFSET	0xe0
+union evlcfg_reg {
+	struct {
+		u64 pasid_en:1;
+		u64 priv:1;
+		u64 rsvd:10;
+		u64 base_addr:52;
+
+		u64 size:16;
+		u64 pasid:20;
+		u64 rsvd2:28;
+	};
+	u64 bits[2];
+} __packed;
+
 #define IDXD_EVL_SIZE_MIN	0x0040
 #define IDXD_EVL_SIZE_MAX	0xffff
 
@@ -539,4 +558,53 @@ union filter_cfg {
 	u64 val;
 } __packed;
 
+struct __evl_entry {
+	u64 rsvd:2;
+	u64 desc_valid:1;
+	u64 wq_idx_valid:1;
+	u64 batch:1;
+	u64 fault_rw:1;
+	u64 priv:1;
+	u64 err_info_valid:1;
+	u64 error:8;
+	u64 wq_idx:8;
+	u64 batch_id:8;
+	u64 operation:8;
+	u64 pasid:20;
+	u64 rsvd2:4;
+
+	u16 batch_idx;
+	u16 rsvd3;
+	union {
+		/* Invalid Flags 0x11 */
+		u32 invalid_flags;
+		/* Invalid Int Handle 0x19 */
+		/* Page fault 0x1a */
+		/* Page fault 0x06, 0x1f, only operand_id */
+		/* Page fault before drain or in batch, 0x26, 0x27 */
+		struct {
+			u16 int_handle;
+			u16 rci:1;
+			u16 ims:1;
+			u16 rcr:1;
+			u16 first_err_in_batch:1;
+			u16 rsvd4_2:9;
+			u16 operand_id:3;
+		};
+	};
+	u64 fault_addr;
+	u64 rsvd5;
+} __packed;
+
+struct dsa_evl_entry {
+	struct __evl_entry e;
+	struct dsa_completion_record cr;
+} __packed;
+
+struct iax_evl_entry {
+	struct __evl_entry e;
+	u64 rsvd[4];
+	struct iax_completion_record cr;
+} __packed;
+
 #endif
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 85644e5bde83..163fdfaa5022 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1605,7 +1605,8 @@ static ssize_t event_log_size_store(struct device *dev,
 	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
 		return -EPERM;
 
-	if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX)
+	if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX ||
+	    (val * evl_ent_size(idxd) > ULONG_MAX - idxd->evl->dma))
 		return -EINVAL;
 
 	idxd->evl->size = val;
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index fc47635b57dc..5d05bf12f2bd 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -30,6 +30,7 @@ enum idxd_scmd_stat {
 	IDXD_SCMD_WQ_NO_PRIV = 0x800f0000,
 	IDXD_SCMD_WQ_IRQ_ERR = 0x80100000,
 	IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000,
+	IDXD_SCMD_DEV_EVL_ERR = 0x80120000,
 };
 
 #define IDXD_SCMD_SOFTERR_MASK	0x80000000
-- 
cgit v1.2.3


From 2f431ba908d2ef05da478d10925207728f1ff483 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:31 -0700
Subject: dmaengine: idxd: add interrupt handling for event log

An event log interrupt is raised in the misc interrupt INTCAUSE register
when an event is written by the hardware. Add basic event log processing
support to the interrupt handler. The event log is a ring where the
hardware owns the tail and the software owns the head. The hardware will
advance the tail index when an additional event has been pushed to memory.
The software will process the log entry and then advances the head. The
log is full when (tail + 1) % log_size = head. The hardware will stop
writing when the log is full. The user is expected to create a log size
large enough to handle all the expected events.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-5-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/irq.c       | 48 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/idxd/registers.h | 19 ++++++++++++++++++
 include/uapi/linux/idxd.h    |  1 +
 3 files changed, 68 insertions(+)

(limited to 'include')

diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 0d639303b515..52b8b7d9db22 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -217,6 +217,49 @@ static void idxd_int_handle_revoke(struct work_struct *work)
 	kfree(revoke);
 }
 
+static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head)
+{
+	struct device *dev = &idxd->pdev->dev;
+	u8 status;
+
+	status = DSA_COMP_STATUS(entry_head->error);
+	dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n",
+			     status, entry_head->operation, entry_head->fault_addr);
+}
+
+static void process_evl_entries(struct idxd_device *idxd)
+{
+	union evl_status_reg evl_status;
+	unsigned int h, t;
+	struct idxd_evl *evl = idxd->evl;
+	struct __evl_entry *entry_head;
+	unsigned int ent_size = evl_ent_size(idxd);
+	u32 size;
+
+	evl_status.bits = 0;
+	evl_status.int_pending = 1;
+
+	spin_lock(&evl->lock);
+	/* Clear interrupt pending bit */
+	iowrite32(evl_status.bits_upper32,
+		  idxd->reg_base + IDXD_EVLSTATUS_OFFSET + sizeof(u32));
+	h = evl->head;
+	evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
+	t = evl_status.tail;
+	size = idxd->evl->size;
+
+	while (h != t) {
+		entry_head = (struct __evl_entry *)(evl->log + (h * ent_size));
+		process_evl_entry(idxd, entry_head);
+		h = (h + 1) % size;
+	}
+
+	evl->head = h;
+	evl_status.head = h;
+	iowrite32(evl_status.bits_lower32, idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
+	spin_unlock(&evl->lock);
+}
+
 irqreturn_t idxd_misc_thread(int vec, void *data)
 {
 	struct idxd_irq_entry *irq_entry = data;
@@ -304,6 +347,11 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 		perfmon_counter_overflow(idxd);
 	}
 
+	if (cause & IDXD_INTC_EVL) {
+		val |= IDXD_INTC_EVL;
+		process_evl_entries(idxd);
+	}
+
 	val ^= cause;
 	if (val)
 		dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n",
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 11bb97cf7481..148db94f9373 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -168,6 +168,7 @@ enum idxd_device_reset_type {
 #define IDXD_INTC_OCCUPY			0x04
 #define IDXD_INTC_PERFMON_OVFL		0x08
 #define IDXD_INTC_HALT_STATE		0x10
+#define IDXD_INTC_EVL			0x20
 #define IDXD_INTC_INT_HANDLE_REVOKED	0x80000000
 
 #define IDXD_CMD_OFFSET			0xa0
@@ -558,6 +559,24 @@ union filter_cfg {
 	u64 val;
 } __packed;
 
+#define IDXD_EVLSTATUS_OFFSET		0xf0
+
+union evl_status_reg {
+	struct {
+		u32 head:16;
+		u32 rsvd:16;
+		u32 tail:16;
+		u32 rsvd2:14;
+		u32 int_pending:1;
+		u32 rsvd3:1;
+	};
+	struct {
+		u32 bits_lower32;
+		u32 bits_upper32;
+	};
+	u64 bits;
+} __packed;
+
 struct __evl_entry {
 	u64 rsvd:2;
 	u64 desc_valid:1;
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 5d05bf12f2bd..0bc8eea18586 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -170,6 +170,7 @@ enum iax_completion_status {
 
 #define DSA_COMP_STATUS_MASK		0x7f
 #define DSA_COMP_STATUS_WRITE		0x80
+#define DSA_COMP_STATUS(status)		((status) & DSA_COMP_STATUS_MASK)
 
 struct dsa_hw_desc {
 	uint32_t	pasid:20;
-- 
cgit v1.2.3


From 5fbe6503b52f5665560584f62adab5db70ac910e Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:32 -0700
Subject: dmanegine: idxd: add debugfs for event log dump

Add debugfs entry to dump the content of the event log for debugging. The
function will dump all non-zero entries in the event log. It will note
which entries are processed and which entries are still pending processing
at the time of the dump. The entries may not always be in chronological
order due to the log is a circular buffer.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-6-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/Makefile  |   2 +-
 drivers/dma/idxd/debugfs.c | 138 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/idxd/idxd.h    |   9 +++
 drivers/dma/idxd/init.c    |  12 ++++
 include/uapi/linux/idxd.h  |   6 +-
 5 files changed, 164 insertions(+), 3 deletions(-)
 create mode 100644 drivers/dma/idxd/debugfs.c

(limited to 'include')

diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile
index a1e9f2b3a37c..dc096839ac63 100644
--- a/drivers/dma/idxd/Makefile
+++ b/drivers/dma/idxd/Makefile
@@ -1,7 +1,7 @@
 ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD
 
 obj-$(CONFIG_INTEL_IDXD) += idxd.o
-idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o
+idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o debugfs.o
 
 idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o
 
diff --git a/drivers/dma/idxd/debugfs.c b/drivers/dma/idxd/debugfs.c
new file mode 100644
index 000000000000..9cfbd9b14c4c
--- /dev/null
+++ b/drivers/dma/idxd/debugfs.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <uapi/linux/idxd.h>
+#include "idxd.h"
+#include "registers.h"
+
+static struct dentry *idxd_debugfs_dir;
+
+static void dump_event_entry(struct idxd_device *idxd, struct seq_file *s,
+			     u16 index, int *count, bool processed)
+{
+	struct idxd_evl *evl = idxd->evl;
+	struct dsa_evl_entry *entry;
+	struct dsa_completion_record *cr;
+	u64 *raw;
+	int i;
+	int evl_strides = evl_ent_size(idxd) / sizeof(u64);
+
+	entry = (struct dsa_evl_entry *)evl->log + index;
+
+	if (!entry->e.desc_valid)
+		return;
+
+	seq_printf(s, "Event Log entry %d (real index %u) processed: %u\n",
+		   *count, index, processed);
+
+	seq_printf(s, "desc valid %u wq idx valid %u\n"
+		   "batch %u fault rw %u priv %u error 0x%x\n"
+		   "wq idx %u op %#x pasid %u batch idx %u\n"
+		   "fault addr %#llx\n",
+		   entry->e.desc_valid, entry->e.wq_idx_valid,
+		   entry->e.batch, entry->e.fault_rw, entry->e.priv,
+		   entry->e.error, entry->e.wq_idx, entry->e.operation,
+		   entry->e.pasid, entry->e.batch_idx, entry->e.fault_addr);
+
+	cr = &entry->cr;
+	seq_printf(s, "status %#x result %#x fault_info %#x bytes_completed %u\n"
+		   "fault addr %#llx inv flags %#x\n\n",
+		   cr->status, cr->result, cr->fault_info, cr->bytes_completed,
+		   cr->fault_addr, cr->invalid_flags);
+
+	raw = (u64 *)entry;
+
+	for (i = 0; i < evl_strides; i++)
+		seq_printf(s, "entry[%d] = %#llx\n", i, raw[i]);
+
+	seq_puts(s, "\n");
+	*count += 1;
+}
+
+static int debugfs_evl_show(struct seq_file *s, void *d)
+{
+	struct idxd_device *idxd = s->private;
+	struct idxd_evl *evl = idxd->evl;
+	union evl_status_reg evl_status;
+	u16 h, t, evl_size, i;
+	int count = 0;
+	bool processed = true;
+
+	if (!evl || !evl->log)
+		return 0;
+
+	spin_lock(&evl->lock);
+
+	h = evl->head;
+	evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
+	t = evl_status.tail;
+	evl_size = evl->size;
+
+	seq_printf(s, "Event Log head %u tail %u interrupt pending %u\n\n",
+		   evl_status.head, evl_status.tail, evl_status.int_pending);
+
+	i = t;
+	while (1) {
+		i = (i + 1) % evl_size;
+		if (i == t)
+			break;
+
+		if (processed && i == h)
+			processed = false;
+		dump_event_entry(idxd, s, i, &count, processed);
+	}
+
+	spin_unlock(&evl->lock);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_evl);
+
+int idxd_device_init_debugfs(struct idxd_device *idxd)
+{
+	if (IS_ERR_OR_NULL(idxd_debugfs_dir))
+		return 0;
+
+	idxd->dbgfs_dir = debugfs_create_dir(dev_name(idxd_confdev(idxd)), idxd_debugfs_dir);
+	if (IS_ERR(idxd->dbgfs_dir))
+		return PTR_ERR(idxd->dbgfs_dir);
+
+	if (idxd->evl) {
+		idxd->dbgfs_evl_file = debugfs_create_file("event_log", 0400,
+							   idxd->dbgfs_dir, idxd,
+							   &debugfs_evl_fops);
+		if (IS_ERR(idxd->dbgfs_evl_file)) {
+			debugfs_remove_recursive(idxd->dbgfs_dir);
+			idxd->dbgfs_dir = NULL;
+			return PTR_ERR(idxd->dbgfs_evl_file);
+		}
+	}
+
+	return 0;
+}
+
+void idxd_device_remove_debugfs(struct idxd_device *idxd)
+{
+	debugfs_remove_recursive(idxd->dbgfs_dir);
+}
+
+int idxd_init_debugfs(void)
+{
+	if (!debugfs_initialized())
+		return 0;
+
+	idxd_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	if (IS_ERR(idxd_debugfs_dir))
+		return  PTR_ERR(idxd_debugfs_dir);
+	return 0;
+}
+
+void idxd_remove_debugfs(void)
+{
+	debugfs_remove_recursive(idxd_debugfs_dir);
+}
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index c74681f02b18..b923b90b7299 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -330,6 +330,9 @@ struct idxd_device {
 
 	unsigned long *opcap_bmap;
 	struct idxd_evl *evl;
+
+	struct dentry *dbgfs_dir;
+	struct dentry *dbgfs_evl_file;
 };
 
 static inline unsigned int evl_ent_size(struct idxd_device *idxd)
@@ -704,4 +707,10 @@ static inline void perfmon_init(void) {}
 static inline void perfmon_exit(void) {}
 #endif
 
+/* debugfs */
+int idxd_device_init_debugfs(struct idxd_device *idxd);
+void idxd_device_remove_debugfs(struct idxd_device *idxd);
+int idxd_init_debugfs(void);
+void idxd_remove_debugfs(void);
+
 #endif
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index f44719a11c95..e21c4b7a1d32 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -669,6 +669,10 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_dev_register;
 	}
 
+	rc = idxd_device_init_debugfs(idxd);
+	if (rc)
+		dev_warn(dev, "IDXD debugfs failed to setup\n");
+
 	dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n",
 		 idxd->hw.version);
 
@@ -731,6 +735,7 @@ static void idxd_remove(struct pci_dev *pdev)
 	idxd_shutdown(pdev);
 	if (device_pasid_enabled(idxd))
 		idxd_disable_system_pasid(idxd);
+	idxd_device_remove_debugfs(idxd);
 
 	irq_entry = idxd_get_ie(idxd, 0);
 	free_irq(irq_entry->vector, irq_entry);
@@ -788,6 +793,10 @@ static int __init idxd_init_module(void)
 	if (err)
 		goto err_cdev_register;
 
+	err = idxd_init_debugfs();
+	if (err)
+		goto err_debugfs;
+
 	err = pci_register_driver(&idxd_pci_driver);
 	if (err)
 		goto err_pci_register;
@@ -795,6 +804,8 @@ static int __init idxd_init_module(void)
 	return 0;
 
 err_pci_register:
+	idxd_remove_debugfs();
+err_debugfs:
 	idxd_cdev_remove();
 err_cdev_register:
 	idxd_driver_unregister(&idxd_user_drv);
@@ -815,5 +826,6 @@ static void __exit idxd_exit_module(void)
 	pci_unregister_driver(&idxd_pci_driver);
 	idxd_cdev_remove();
 	perfmon_exit();
+	idxd_remove_debugfs();
 }
 module_exit(idxd_exit_module);
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 0bc8eea18586..e86199d09a91 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -311,7 +311,8 @@ struct dsa_completion_record {
 		uint8_t		result;
 		uint8_t		dif_status;
 	};
-	uint16_t		rsvd;
+	uint8_t			fault_info;
+	uint8_t			rsvd;
 	uint32_t		bytes_completed;
 	uint64_t		fault_addr;
 	union {
@@ -368,7 +369,8 @@ struct dsa_raw_completion_record {
 struct iax_completion_record {
 	volatile uint8_t        status;
 	uint8_t                 error_code;
-	uint16_t                rsvd;
+	uint8_t			fault_info;
+	uint8_t			rsvd;
 	uint32_t                bytes_completed;
 	uint64_t                fault_addr;
 	uint32_t                invalid_flags;
-- 
cgit v1.2.3


From c40bd7d9737bdcfb02d42765bc6c59b338151123 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:36 -0700
Subject: dmaengine: idxd: process user page faults for completion record

DSA supports page fault handling through PRS. However, the DMA engine
that's processing the descriptor is blocked until the PRS response is
received. Other workqueues sharing the engine are also blocked.
Page fault handing by the driver with PRS disabled can be used to
mitigate the stalling.

With PRS disabled while ATS remain enabled, DSA handles page faults on
a completion record by reporting an event in the event log. In this
instance, the descriptor is completed and the event log contains the
completion record address and the contents of the completion record. Add
support to the event log handling code to fault in the completion record
and copy the content of the completion record to user memory.

A bitmap is introduced to keep track of discarded event log entries. When
the user process initiates ->release() of the char device, it no longer is
interested in any remaining event log entries tied to the relevant wq and
PASID. The driver will mark the event log entry index in the bitmap. Upon
encountering the entries during processing, the event log handler will just
clear the bitmap bit and skip the entry rather than attempt to process the
event log entry.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-10-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/cdev.c   | 30 ++++++++++++++++
 drivers/dma/idxd/device.c | 22 ++++++++++--
 drivers/dma/idxd/idxd.h   |  2 ++
 drivers/dma/idxd/init.c   |  2 ++
 drivers/dma/idxd/irq.c    | 87 ++++++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/idxd.h |  1 +
 6 files changed, 137 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 8b8a0a0fb054..0a51c33198f6 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -164,6 +164,35 @@ failed:
 	return rc;
 }
 
+static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid)
+{
+	struct idxd_device *idxd = wq->idxd;
+	struct idxd_evl *evl = idxd->evl;
+	union evl_status_reg status;
+	u16 h, t, size;
+	int ent_size = evl_ent_size(idxd);
+	struct __evl_entry *entry_head;
+
+	if (!evl)
+		return;
+
+	spin_lock(&evl->lock);
+	status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
+	t = status.tail;
+	h = evl->head;
+	size = evl->size;
+
+	while (h != t) {
+		entry_head = (struct __evl_entry *)(evl->log + (h * ent_size));
+		if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id)
+			set_bit(h, evl->bmap);
+		h = (h + 1) % size;
+	}
+	spin_unlock(&evl->lock);
+
+	drain_workqueue(wq->wq);
+}
+
 static int idxd_cdev_release(struct inode *node, struct file *filep)
 {
 	struct idxd_user_context *ctx = filep->private_data;
@@ -190,6 +219,7 @@ static int idxd_cdev_release(struct inode *node, struct file *filep)
 	}
 
 	if (ctx->sva) {
+		idxd_cdev_evl_drain_pasid(wq, ctx->pasid);
 		iommu_sva_unbind_device(ctx->sva);
 		idxd_xa_pasid_remove(ctx);
 	}
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 230fe9bb56ae..fd97b2b58734 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -762,18 +762,29 @@ static int idxd_device_evl_setup(struct idxd_device *idxd)
 	dma_addr_t dma_addr;
 	int size;
 	struct idxd_evl *evl = idxd->evl;
+	unsigned long *bmap;
+	int rc;
 
 	if (!evl)
 		return 0;
 
 	size = evl_size(idxd);
+
+	bmap = bitmap_zalloc(size, GFP_KERNEL);
+	if (!bmap) {
+		rc = -ENOMEM;
+		goto err_bmap;
+	}
+
 	/*
 	 * Address needs to be page aligned. However, dma_alloc_coherent() provides
 	 * at minimal page size aligned address. No manual alignment required.
 	 */
 	addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL);
-	if (!addr)
-		return -ENOMEM;
+	if (!addr) {
+		rc = -ENOMEM;
+		goto err_alloc;
+	}
 
 	memset(addr, 0, size);
 
@@ -781,6 +792,7 @@ static int idxd_device_evl_setup(struct idxd_device *idxd)
 	evl->log = addr;
 	evl->dma = dma_addr;
 	evl->log_size = size;
+	evl->bmap = bmap;
 
 	memset(&evlcfg, 0, sizeof(evlcfg));
 	evlcfg.bits[0] = dma_addr & GENMASK(63, 12);
@@ -799,6 +811,11 @@ static int idxd_device_evl_setup(struct idxd_device *idxd)
 
 	spin_unlock(&evl->lock);
 	return 0;
+
+err_alloc:
+	bitmap_free(bmap);
+err_bmap:
+	return rc;
 }
 
 static void idxd_device_evl_free(struct idxd_device *idxd)
@@ -824,6 +841,7 @@ static void idxd_device_evl_free(struct idxd_device *idxd)
 	iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8);
 
 	dma_free_coherent(dev, evl->log_size, evl->log, evl->dma);
+	bitmap_free(evl->bmap);
 	evl->log = NULL;
 	evl->size = IDXD_EVL_SIZE_MIN;
 	spin_unlock(&evl->lock);
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index b3f9a12adce2..3963c83165a6 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -264,6 +264,7 @@ struct idxd_driver_data {
 	struct device_type *dev_type;
 	int compl_size;
 	int align;
+	int evl_cr_off;
 };
 
 struct idxd_evl {
@@ -276,6 +277,7 @@ struct idxd_evl {
 	/* The number of entries in the event log. */
 	u16 size;
 	u16 head;
+	unsigned long *bmap;
 };
 
 struct idxd_evl_fault {
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index e6faff58733d..18344593b83f 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -46,6 +46,7 @@ static struct idxd_driver_data idxd_driver_data[] = {
 		.compl_size = sizeof(struct dsa_completion_record),
 		.align = 32,
 		.dev_type = &dsa_device_type,
+		.evl_cr_off = offsetof(struct dsa_evl_entry, cr),
 	},
 	[IDXD_TYPE_IAX] = {
 		.name_prefix = "iax",
@@ -53,6 +54,7 @@ static struct idxd_driver_data idxd_driver_data[] = {
 		.compl_size = sizeof(struct iax_completion_record),
 		.align = 64,
 		.dev_type = &iax_device_type,
+		.evl_cr_off = offsetof(struct iax_evl_entry, cr),
 	},
 };
 
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 52b8b7d9db22..96983975f974 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -7,6 +7,8 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/dmaengine.h>
 #include <linux/delay.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
 #include <uapi/linux/idxd.h>
 #include "../dmaengine.h"
 #include "idxd.h"
@@ -217,14 +219,89 @@ static void idxd_int_handle_revoke(struct work_struct *work)
 	kfree(revoke);
 }
 
-static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head)
+static void idxd_evl_fault_work(struct work_struct *work)
+{
+	struct idxd_evl_fault *fault = container_of(work, struct idxd_evl_fault, work);
+	struct idxd_wq *wq = fault->wq;
+	struct idxd_device *idxd = wq->idxd;
+	struct device *dev = &idxd->pdev->dev;
+	struct __evl_entry *entry_head = fault->entry;
+	void *cr = (void *)entry_head + idxd->data->evl_cr_off;
+	int cr_size = idxd->data->compl_size, copied;
+
+	switch (fault->status) {
+	case DSA_COMP_CRA_XLAT:
+	case DSA_COMP_DRAIN_EVL:
+		/*
+		 * Copy completion record to fault_addr in user address space
+		 * that is found by wq and PASID.
+		 */
+		copied = idxd_copy_cr(wq, entry_head->pasid,
+				      entry_head->fault_addr,
+				      cr, cr_size);
+		/*
+		 * The task that triggered the page fault is unknown currently
+		 * because multiple threads may share the user address
+		 * space or the task exits already before this fault.
+		 * So if the copy fails, SIGSEGV can not be sent to the task.
+		 * Just print an error for the failure. The user application
+		 * waiting for the completion record will time out on this
+		 * failure.
+		 */
+		if (copied != cr_size) {
+			dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n",
+					    cr_size, copied);
+		}
+		break;
+	default:
+		dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n",
+				    DSA_COMP_STATUS(entry_head->error));
+		break;
+	}
+
+	kmem_cache_free(idxd->evl_cache, fault);
+}
+
+static void process_evl_entry(struct idxd_device *idxd,
+			      struct __evl_entry *entry_head, unsigned int index)
 {
 	struct device *dev = &idxd->pdev->dev;
+	struct idxd_evl *evl = idxd->evl;
 	u8 status;
 
-	status = DSA_COMP_STATUS(entry_head->error);
-	dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n",
-			     status, entry_head->operation, entry_head->fault_addr);
+	if (test_bit(index, evl->bmap)) {
+		clear_bit(index, evl->bmap);
+	} else {
+		status = DSA_COMP_STATUS(entry_head->error);
+
+		if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) {
+			struct idxd_evl_fault *fault;
+			int ent_size = evl_ent_size(idxd);
+
+			if (entry_head->rci)
+				dev_dbg(dev, "Completion Int Req set, ignoring!\n");
+
+			if (!entry_head->rcr && status == DSA_COMP_DRAIN_EVL)
+				return;
+
+			fault = kmem_cache_alloc(idxd->evl_cache, GFP_ATOMIC);
+			if (fault) {
+				struct idxd_wq *wq = idxd->wqs[entry_head->wq_idx];
+
+				fault->wq = wq;
+				fault->status = status;
+				memcpy(&fault->entry, entry_head, ent_size);
+				INIT_WORK(&fault->work, idxd_evl_fault_work);
+				queue_work(wq->wq, &fault->work);
+			} else {
+				dev_warn(dev, "Failed to service fault work.\n");
+			}
+		} else {
+			dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n",
+					     status, entry_head->operation,
+					     entry_head->fault_addr);
+		}
+	}
 }
 
 static void process_evl_entries(struct idxd_device *idxd)
@@ -250,7 +327,7 @@ static void process_evl_entries(struct idxd_device *idxd)
 
 	while (h != t) {
 		entry_head = (struct __evl_entry *)(evl->log + (h * ent_size));
-		process_evl_entry(idxd, entry_head);
+		process_evl_entry(idxd, entry_head, h);
 		h = (h + 1) % size;
 	}
 
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index e86199d09a91..4b584d5afd87 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -135,6 +135,7 @@ enum dsa_completion_status {
 	DSA_COMP_HW_ERR1,
 	DSA_COMP_HW_ERR_DRB,
 	DSA_COMP_TRANSLATION_FAIL,
+	DSA_COMP_DRAIN_EVL = 0x26,
 };
 
 enum iax_completion_status {
-- 
cgit v1.2.3


From 6926987185a3ae92c31b99ce1bfdfb04e95057c0 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:37 -0700
Subject: dmaengine: idxd: add descs_completed field for completion record

The descs_completed field for a completion record is part of a batch
descriptor completion record. It takes the same location as bytes_completed
in a normal descriptor field. Add to expose to user.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-11-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 4b584d5afd87..76ad71bf751e 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -314,7 +314,10 @@ struct dsa_completion_record {
 	};
 	uint8_t			fault_info;
 	uint8_t			rsvd;
-	uint32_t		bytes_completed;
+	union {
+		uint32_t		bytes_completed;
+		uint32_t		descs_completed;
+	};
 	uint64_t		fault_addr;
 	union {
 		/* common record */
-- 
cgit v1.2.3


From 2442b7473ad03671378d2d95651bd6bbe09a0943 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 7 Apr 2023 13:31:38 -0700
Subject: dmaengine: idxd: process batch descriptor completion record faults

Add event log processing for faulting of user batch descriptor completion
record.

When encountering an event log entry for a page fault on a completion
record, the driver is expected to do the following:
1. If the "first error in batch" bit in event log entry error info is
set, discard any previously recorded errors associated with the
"batch identifier".
2. Fix the page fault according to the fault address in the event log. If
successful, write the completion record to the fault address in user space.
3. If an error is encountered while writing the completion record and it is
associated to a descriptor in the batch, the driver associates the error
with the batch identifier of the event log entry and tracks it until the
event log entry for the corresponding batch desc is encountered.

While processing an event log entry for a batch descriptor with error
indicating that one or more descs in the batch had event log entries,
the driver will do the following before writing the batch completion
record:
1. If the status field of the completion record is 0x1, the driver will
change it to error code 0x5 (one or more operations in batch completed
with status not successful) and changes the result field to 1.
2. If the status is error code 0x6 (page fault on batch descriptor list
address), change the result field to 1.
3. If status is any other value, the completion record is not changed.
4. Clear the recorded error in preparation for next batch with same batch
identifier.

The result field is for user software to determine whether to set the
"Batch Error" flag bit in the descriptor for continuation of partial
batch descriptor completion. See DSA spec 2.0 for additional information.

If no error has been recorded for the batch, the batch completion record is
written to user space as is.

Tested-by: Tony Zhu <tony.zhu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20230407203143.2189681-12-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/idxd.h      |  3 ++
 drivers/dma/idxd/init.c      |  4 ++
 drivers/dma/idxd/irq.c       | 91 ++++++++++++++++++++++++++++++++------------
 drivers/dma/idxd/registers.h |  4 +-
 include/uapi/linux/idxd.h    |  1 +
 5 files changed, 78 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 3963c83165a6..4c4baa80c731 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -265,6 +265,8 @@ struct idxd_driver_data {
 	int compl_size;
 	int align;
 	int evl_cr_off;
+	int cr_status_off;
+	int cr_result_off;
 };
 
 struct idxd_evl {
@@ -278,6 +280,7 @@ struct idxd_evl {
 	u16 size;
 	u16 head;
 	unsigned long *bmap;
+	bool batch_fail[IDXD_MAX_BATCH_IDENT];
 };
 
 struct idxd_evl_fault {
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 18344593b83f..92f60d364392 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -47,6 +47,8 @@ static struct idxd_driver_data idxd_driver_data[] = {
 		.align = 32,
 		.dev_type = &dsa_device_type,
 		.evl_cr_off = offsetof(struct dsa_evl_entry, cr),
+		.cr_status_off = offsetof(struct dsa_completion_record, status),
+		.cr_result_off = offsetof(struct dsa_completion_record, result),
 	},
 	[IDXD_TYPE_IAX] = {
 		.name_prefix = "iax",
@@ -55,6 +57,8 @@ static struct idxd_driver_data idxd_driver_data[] = {
 		.align = 64,
 		.dev_type = &iax_device_type,
 		.evl_cr_off = offsetof(struct iax_evl_entry, cr),
+		.cr_status_off = offsetof(struct iax_completion_record, status),
+		.cr_result_off = offsetof(struct iax_completion_record, error_code),
 	},
 };
 
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 96983975f974..c660d63a3eb8 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -225,37 +225,79 @@ static void idxd_evl_fault_work(struct work_struct *work)
 	struct idxd_wq *wq = fault->wq;
 	struct idxd_device *idxd = wq->idxd;
 	struct device *dev = &idxd->pdev->dev;
+	struct idxd_evl *evl = idxd->evl;
 	struct __evl_entry *entry_head = fault->entry;
 	void *cr = (void *)entry_head + idxd->data->evl_cr_off;
-	int cr_size = idxd->data->compl_size, copied;
+	int cr_size = idxd->data->compl_size;
+	u8 *status = (u8 *)cr + idxd->data->cr_status_off;
+	u8 *result = (u8 *)cr + idxd->data->cr_result_off;
+	int copied, copy_size;
+	bool *bf;
 
 	switch (fault->status) {
 	case DSA_COMP_CRA_XLAT:
-	case DSA_COMP_DRAIN_EVL:
-		/*
-		 * Copy completion record to fault_addr in user address space
-		 * that is found by wq and PASID.
-		 */
-		copied = idxd_copy_cr(wq, entry_head->pasid,
-				      entry_head->fault_addr,
-				      cr, cr_size);
-		/*
-		 * The task that triggered the page fault is unknown currently
-		 * because multiple threads may share the user address
-		 * space or the task exits already before this fault.
-		 * So if the copy fails, SIGSEGV can not be sent to the task.
-		 * Just print an error for the failure. The user application
-		 * waiting for the completion record will time out on this
-		 * failure.
-		 */
-		if (copied != cr_size) {
-			dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n",
-					    cr_size, copied);
+		if (entry_head->batch && entry_head->first_err_in_batch)
+			evl->batch_fail[entry_head->batch_id] = false;
+
+		copy_size = cr_size;
+		break;
+	case DSA_COMP_BATCH_EVL_ERR:
+		bf = &evl->batch_fail[entry_head->batch_id];
+
+		copy_size = entry_head->rcr || *bf ? cr_size : 0;
+		if (*bf) {
+			if (*status == DSA_COMP_SUCCESS)
+				*status = DSA_COMP_BATCH_FAIL;
+			*result = 1;
+			*bf = false;
 		}
 		break;
+	case DSA_COMP_DRAIN_EVL:
+		copy_size = cr_size;
+		break;
 	default:
-		dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n",
-				    DSA_COMP_STATUS(entry_head->error));
+		copy_size = 0;
+		dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", fault->status);
+		break;
+	}
+
+	if (copy_size == 0)
+		return;
+
+	/*
+	 * Copy completion record to fault_addr in user address space
+	 * that is found by wq and PASID.
+	 */
+	copied = idxd_copy_cr(wq, entry_head->pasid, entry_head->fault_addr,
+			      cr, copy_size);
+	/*
+	 * The task that triggered the page fault is unknown currently
+	 * because multiple threads may share the user address
+	 * space or the task exits already before this fault.
+	 * So if the copy fails, SIGSEGV can not be sent to the task.
+	 * Just print an error for the failure. The user application
+	 * waiting for the completion record will time out on this
+	 * failure.
+	 */
+	switch (fault->status) {
+	case DSA_COMP_CRA_XLAT:
+		if (copied != copy_size) {
+			dev_dbg_ratelimited(dev, "Failed to write to completion record: (%d:%d)\n",
+					    copy_size, copied);
+			if (entry_head->batch)
+				evl->batch_fail[entry_head->batch_id] = true;
+		}
+		break;
+	case DSA_COMP_BATCH_EVL_ERR:
+		if (copied != copy_size) {
+			dev_dbg_ratelimited(dev, "Failed to write to batch completion record: (%d:%d)\n",
+					    copy_size, copied);
+		}
+		break;
+	case DSA_COMP_DRAIN_EVL:
+		if (copied != copy_size)
+			dev_dbg_ratelimited(dev, "Failed to write to drain completion record: (%d:%d)\n",
+					    copy_size, copied);
 		break;
 	}
 
@@ -274,7 +316,8 @@ static void process_evl_entry(struct idxd_device *idxd,
 	} else {
 		status = DSA_COMP_STATUS(entry_head->error);
 
-		if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) {
+		if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL ||
+		    status == DSA_COMP_BATCH_EVL_ERR) {
 			struct idxd_evl_fault *fault;
 			int ent_size = evl_ent_size(idxd);
 
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 148db94f9373..9f3959d001b6 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -35,7 +35,7 @@ union gen_cap_reg {
 		u64 drain_readback:1;
 		u64 rsvd2:3;
 		u64 evl_support:2;
-		u64 rsvd4:1;
+		u64 batch_continuation:1;
 		u64 max_xfer_shift:5;
 		u64 max_batch_shift:4;
 		u64 max_ims_mult:6;
@@ -577,6 +577,8 @@ union evl_status_reg {
 	u64 bits;
 } __packed;
 
+#define IDXD_MAX_BATCH_IDENT	256
+
 struct __evl_entry {
 	u64 rsvd:2;
 	u64 desc_valid:1;
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 76ad71bf751e..606b52e88ce3 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -136,6 +136,7 @@ enum dsa_completion_status {
 	DSA_COMP_HW_ERR_DRB,
 	DSA_COMP_TRANSLATION_FAIL,
 	DSA_COMP_DRAIN_EVL = 0x26,
+	DSA_COMP_BATCH_EVL_ERR,
 };
 
 enum iax_completion_status {
-- 
cgit v1.2.3


From 528407b1e0ea51260fff2cc8b669c632a65d7a09 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 11 Apr 2023 12:06:05 +0100
Subject: io_uring/rsrc: consolidate node caching

We store one pre-allocated rsrc node in ->rsrc_backup_node, merge it
with ->rsrc_node_cache.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6d5410e51ccd29be7a716be045b51d6b371baef6.1681210788.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/alloc_cache.h         |  5 +++++
 io_uring/io_uring.c            |  2 --
 io_uring/rsrc.c                | 20 +++++++++++---------
 4 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fa621a508a01..40cab420b1bd 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -326,7 +326,6 @@ struct io_ring_ctx {
 	struct io_restriction		restrictions;
 
 	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_node		*rsrc_backup_node;
 	struct io_mapped_ubuf		*dummy_ubuf;
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 851a527afb5e..241245cb54a6 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -23,6 +23,11 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 	return false;
 }
 
+static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
+{
+	return !cache->list.next;
+}
+
 static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
 {
 	if (cache->list.next) {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b171c26d331d..075bae8a2bb1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2852,8 +2852,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	/* there are no registered resources left, nobody uses it */
 	if (ctx->rsrc_node)
 		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
-	if (ctx->rsrc_backup_node)
-		io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
 
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 24e4e2109549..73f9e10d9bf0 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -230,7 +230,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill)
 	__must_hold(&ctx->uring_lock)
 {
-	WARN_ON_ONCE(!ctx->rsrc_backup_node);
+	WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache));
 	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
 
 	if (data_to_kill) {
@@ -245,18 +245,20 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 		ctx->rsrc_node = NULL;
 	}
 
-	if (!ctx->rsrc_node) {
-		ctx->rsrc_node = ctx->rsrc_backup_node;
-		ctx->rsrc_backup_node = NULL;
-	}
+	if (!ctx->rsrc_node)
+		ctx->rsrc_node = io_rsrc_node_alloc(ctx);
 }
 
 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 {
-	if (ctx->rsrc_backup_node)
-		return 0;
-	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
-	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
+	if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) {
+		struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
+
+		if (!node)
+			return -ENOMEM;
+		io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache);
+	}
+	return 0;
 }
 
 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
-- 
cgit v1.2.3


From ac931d4cdec3df8b6eac3bc40a6871123021f078 Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Fri, 7 Apr 2023 15:38:53 +0200
Subject: ipip,ip_tunnel,sit: Add FOU support for externally controlled ipip
 devices

Today ipip devices in collect-metadata mode don't allow for sending FOU
or GUE encapsulated packets. This patch lifts the restriction by adding
a struct ip_tunnel_encap to the tunnel metadata.

On the egress path, the members of this struct can be set by the
bpf_skb_set_fou_encap kfunc via a BPF tc-hook. Instead of dropping packets
wishing to use additional UDP encapsulation, ip_md_tunnel_xmit now
evaluates the contents of this struct and adds the corresponding FOU or
GUE header. Furthermore, it is making sure that additional header bytes
are taken into account for PMTU discovery.

On the ingress path, an ipip device in collect-metadata mode will fill this
struct and a BPF tc-hook can obtain the information via a call to the
bpf_skb_get_fou_encap kfunc.

The minor change to ip_tunnel_encap, which now takes a pointer to
struct ip_tunnel_encap instead of struct ip_tunnel, allows us to control
FOU encap type and parameters on a per packet-level.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Link: https://lore.kernel.org/r/cfea47de655d0f870248abf725932f851b53960a.1680874078.git.cehrig@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/ip_tunnels.h | 28 +++++++++++++++-------------
 net/ipv4/ip_tunnel.c     | 22 ++++++++++++++++++++--
 net/ipv4/ipip.c          |  1 +
 net/ipv6/sit.c           |  2 +-
 4 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fca357679816..7912f53caae0 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -57,6 +57,13 @@ struct ip_tunnel_key {
 	__u8			flow_flags;
 };
 
+struct ip_tunnel_encap {
+	u16			type;
+	u16			flags;
+	__be16			sport;
+	__be16			dport;
+};
+
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
 #define IP_TUNNEL_INFO_IPV6	0x02	/* key contains IPv6 addresses */
@@ -66,9 +73,9 @@ struct ip_tunnel_key {
 #define IP_TUNNEL_OPTS_MAX					\
 	GENMASK((sizeof_field(struct ip_tunnel_info,		\
 			      options_len) * BITS_PER_BYTE) - 1, 0)
-
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
+	struct ip_tunnel_encap	encap;
 #ifdef CONFIG_DST_CACHE
 	struct dst_cache	dst_cache;
 #endif
@@ -86,13 +93,6 @@ struct ip_tunnel_6rd_parm {
 };
 #endif
 
-struct ip_tunnel_encap {
-	u16			type;
-	u16			flags;
-	__be16			sport;
-	__be16			dport;
-};
-
 struct ip_tunnel_prl_entry {
 	struct ip_tunnel_prl_entry __rcu *next;
 	__be32				addr;
@@ -293,6 +293,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 				   __be32 remote, __be32 local,
 				   __be32 key);
 
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info);
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 		  bool log_ecn_error);
@@ -371,22 +372,23 @@ static inline int ip_encap_hlen(struct ip_tunnel_encap *e)
 	return hlen;
 }
 
-static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+static inline int ip_tunnel_encap(struct sk_buff *skb,
+				  struct ip_tunnel_encap *e,
 				  u8 *protocol, struct flowi4 *fl4)
 {
 	const struct ip_tunnel_encap_ops *ops;
 	int ret = -EINVAL;
 
-	if (t->encap.type == TUNNEL_ENCAP_NONE)
+	if (e->type == TUNNEL_ENCAP_NONE)
 		return 0;
 
-	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+	if (e->type >= MAX_IPTUN_ENCAP_OPS)
 		return -EINVAL;
 
 	rcu_read_lock();
-	ops = rcu_dereference(iptun_encaps[t->encap.type]);
+	ops = rcu_dereference(iptun_encaps[e->type]);
 	if (likely(ops && ops->build_header))
-		ret = ops->build_header(skb, &t->encap, protocol, fl4);
+		ret = ops->build_header(skb, e, protocol, fl4);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index de90b09dfe78..add437f710fc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -359,6 +359,20 @@ err_dev_set_mtu:
 	return ERR_PTR(err);
 }
 
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *udph;
+
+	if (iph->protocol != IPPROTO_UDP)
+		return;
+
+	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
+	info->encap.sport = udph->source;
+	info->encap.dport = udph->dest;
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 		  bool log_ecn_error)
@@ -572,7 +586,11 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
 			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
 			    key->flow_flags);
-	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+
+	if (!tunnel_hlen)
+		tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
 		goto tx_error;
 
 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
@@ -732,7 +750,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			    dev_net(dev), tunnel->parms.link,
 			    tunnel->fwmark, skb_get_hash(skb), 0);
 
-	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
 		goto tx_error;
 
 	if (connected && md) {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index abea77759b7e..27b8f83c6ea2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -241,6 +241,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
 			if (!tun_dst)
 				return 0;
+			ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
 		}
 		skb_reset_mac_header(skb);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 70d81bba5093..063560e2cb1a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1024,7 +1024,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		ttl = iph6->hop_limit;
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
-	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
+	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
 		ip_rt_put(rt);
 		goto tx_error;
 	}
-- 
cgit v1.2.3


From c50e96099edb134bf107fafc02715fbc4aa2277f Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Fri, 7 Apr 2023 15:38:54 +0200
Subject: bpf,fou: Add bpf_skb_{set,get}_fou_encap kfuncs

Add two new kfuncs that allow a BPF tc-hook, installed on an ipip
device in collect-metadata mode, to control FOU encap parameters on a
per-packet level. The set of kfuncs is registered with the fou module.

The bpf_skb_set_fou_encap kfunc is supposed to be used in tandem and after
a successful call to the bpf_skb_set_tunnel_key bpf-helper. UDP source and
destination ports can be controlled by passing a struct bpf_fou_encap. A
source port of zero will auto-assign a source port. enum bpf_fou_encap_type
is used to specify if the egress path should FOU or GUE encap the packet.

On the ingress path bpf_skb_get_fou_encap can be used to read UDP source
and destination ports from the receiver's point of view and allows for
packet multiplexing across different destination ports within a single
BPF program and ipip device.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Link: https://lore.kernel.org/r/e17c94a646b63e78ce0dbf3f04b2c33dc948a32d.1680874078.git.cehrig@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/fou.h   |   2 +
 net/ipv4/Makefile   |   2 +-
 net/ipv4/fou_bpf.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/fou_core.c |   5 +++
 4 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/fou_bpf.c

(limited to 'include')

diff --git a/include/net/fou.h b/include/net/fou.h
index 80f56e275b08..824eb4b231fd 100644
--- a/include/net/fou.h
+++ b/include/net/fou.h
@@ -17,4 +17,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		       u8 *protocol, __be16 *sport, int type);
 
+int register_fou_bpf(void);
+
 #endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 880277c9fd07..b18ba8ef93ad 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
 gre-y := gre_demux.o
-fou-y := fou_core.o fou_nl.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
 obj-$(CONFIG_NET_FOU) += fou.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 000000000000..3760a14b6b57
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+struct bpf_fou_encap {
+	__be16 sport;
+	__be16 dport;
+};
+
+enum bpf_fou_encap_type {
+	FOU_BPF_ENCAP_FOU,
+	FOU_BPF_ENCAP_GUE,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in BTF");
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * 		dst ports. If sport is set to 0 the kernel will auto-assign a
+ * 		port. This is similar to using `encap-sport auto`.
+ * 		Cannot be NULL
+ * @type	Encapsulation type for the packet. Their definitions are
+ * 		specified in `enum bpf_fou_encap_type`
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap, int type)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!encap))
+		return -EINVAL;
+
+	if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+		return -EINVAL;
+
+	switch (type) {
+	case FOU_BPF_ENCAP_FOU:
+		info->encap.type = TUNNEL_ENCAP_FOU;
+		break;
+	case FOU_BPF_ENCAP_GUE:
+		info->encap.type = TUNNEL_ENCAP_GUE;
+		break;
+	default:
+		info->encap.type = TUNNEL_ENCAP_NONE;
+	}
+
+	if (info->key.tun_flags & TUNNEL_CSUM)
+		info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+	info->encap.sport = encap->sport;
+	info->encap.dport = encap->dport;
+
+	return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a struct bpf_fou_encap storing UDP source and
+ * 		destination port. Cannot be NULL
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info))
+		return -EINVAL;
+
+	encap->sport = info->encap.sport;
+	encap->dport = info->encap.dport;
+
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(fou_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_SET8_END(fou_kfunc_set)
+
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &fou_kfunc_set,
+};
+
+int register_fou_bpf(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					 &fou_bpf_kfunc_set);
+}
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index cafec9b4eee0..0c41076e31ed 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -1236,10 +1236,15 @@ static int __init fou_init(void)
 	if (ret < 0)
 		goto unregister;
 
+	ret = register_fou_bpf();
+	if (ret < 0)
+		goto kfunc_failed;
+
 	ret = ip_tunnel_encap_add_fou_ops();
 	if (ret == 0)
 		return 0;
 
+kfunc_failed:
 	genl_unregister_family(&fou_nl_family);
 unregister:
 	unregister_pernet_device(&fou_net_ops);
-- 
cgit v1.2.3


From 59d3efd27c11c59b32291e5ebc307bed2edb65ee Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Tue, 11 Apr 2023 09:43:19 +0200
Subject: rtnetlink: Restore RTM_NEW/DELLINK notification behavior

The commits referenced below allows userspace to use the NLM_F_ECHO flag
for RTM_NEW/DELLINK operations to receive unicast notifications for the
affected link. Prior to these changes, applications may have relied on
multicast notifications to learn the same information without specifying
the NLM_F_ECHO flag.

For such applications, the mentioned commits changed the behavior for
requests not using NLM_F_ECHO. Multicast notifications are still received,
but now use the portid of the requester and the sequence number of the
request instead of zero values used previously. For the application, this
message may be unexpected and likely handled as a response to the
NLM_F_ACKed request, especially if it uses the same socket to handle
requests and notifications.

To fix existing applications relying on the old notification behavior,
set the portid and sequence number in the notification only if the
request included the NLM_F_ECHO flag. This restores the old behavior
for applications not using it, but allows unicasted notifications for
others.

Fixes: f3a63cce1b4f ("rtnetlink: Honour NLM_F_ECHO flag in rtnl_delete_link")
Fixes: d88e136cab37 ("rtnetlink: Honour NLM_F_ECHO flag in rtnl_newlink_create")
Signed-off-by: Martin Willi <martin@strongswan.org>
Acked-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://lore.kernel.org/r/20230411074319.24133-1-martin@strongswan.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h |  3 ++-
 net/core/dev.c            |  2 +-
 net/core/rtnetlink.c      | 11 +++++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 92ad75549e9c..b6e6378dcbbd 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -25,7 +25,8 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
 				       gfp_t flags, int *new_nsid,
-				       int new_ifindex, u32 portid, u32 seq);
+				       int new_ifindex, u32 portid,
+				       const struct nlmsghdr *nlh);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags, u32 portid, const struct nlmsghdr *nlh);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 48067321c0db..1488f700bf81 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10847,7 +10847,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 						     GFP_KERNEL, NULL, 0,
-						     portid, nlmsg_seq(nlh));
+						     portid, nlh);
 
 		/*
 		 *	Flush the unicast and multicast chains
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5d8eb57867a9..6e44e92ebdf5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3972,16 +3972,23 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
 				       u32 event, gfp_t flags, int *new_nsid,
-				       int new_ifindex, u32 portid, u32 seq)
+				       int new_ifindex, u32 portid,
+				       const struct nlmsghdr *nlh)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
+	u32 seq = 0;
 
 	skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
 	if (skb == NULL)
 		goto errout;
 
+	if (nlmsg_report(nlh))
+		seq = nlmsg_seq(nlh);
+	else
+		portid = 0;
+
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, portid, seq, change, 0, 0, event,
 			       new_nsid, new_ifindex, -1, flags);
@@ -4017,7 +4024,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 		return;
 
 	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
-				     new_ifindex, portid, nlmsg_seq(nlh));
+				     new_ifindex, portid, nlh);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags, portid, nlh);
 }
-- 
cgit v1.2.3


From edde9e70bb48301c853299f1bd7c0aa0745a38ea Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 22 Mar 2023 14:37:03 +0200
Subject: blk-mq-rdma: remove queue mapping helper for rdma devices

No rdma device exposes its irq vectors affinity today. So the only
mapping that we have left, is the default blk_mq_map_queues, which
we fallback to anyways. Also fixup the only consumer of this helper
(nvme-rdma).

Remove this now dead code.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/Kconfig               |  5 -----
 block/Makefile              |  1 -
 block/blk-mq-rdma.c         | 44 --------------------------------------------
 drivers/nvme/host/rdma.c    |  7 ++-----
 include/linux/blk-mq-rdma.h | 11 -----------
 5 files changed, 2 insertions(+), 66 deletions(-)
 delete mode 100644 block/blk-mq-rdma.c
 delete mode 100644 include/linux/blk-mq-rdma.h

(limited to 'include')

diff --git a/block/Kconfig b/block/Kconfig
index 5d9d9c84d516..0b7c13f9a089 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -215,11 +215,6 @@ config BLK_MQ_VIRTIO
 	depends on VIRTIO
 	default y
 
-config BLK_MQ_RDMA
-	bool
-	depends on INFINIBAND
-	default y
-
 config BLK_PM
 	def_bool PM
 
diff --git a/block/Makefile b/block/Makefile
index 4e01bb71ad6e..b31b05390749 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY_T10)	+= t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
 obj-$(CONFIG_BLK_MQ_VIRTIO)	+= blk-mq-virtio.o
-obj-$(CONFIG_BLK_MQ_RDMA)	+= blk-mq-rdma.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
deleted file mode 100644
index 29c1f4d6eb04..000000000000
--- a/block/blk-mq-rdma.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 Sagi Grimberg.
- */
-#include <linux/blk-mq.h>
-#include <linux/blk-mq-rdma.h>
-#include <rdma/ib_verbs.h>
-
-/**
- * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device
- * @map:	CPU to hardware queue map.
- * @dev:	rdma device to provide a mapping for.
- * @first_vec:	first interrupt vectors to use for queues (usually 0)
- *
- * This function assumes the rdma device @dev has at least as many available
- * interrupt vetors as @set has queues.  It will then query it's affinity mask
- * and built queue mapping that maps a queue to the CPUs that have irq affinity
- * for the corresponding vector.
- *
- * In case either the driver passed a @dev with less vectors than
- * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
- * vector, we fallback to the naive mapping.
- */
-void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
-		struct ib_device *dev, int first_vec)
-{
-	const struct cpumask *mask;
-	unsigned int queue, cpu;
-
-	for (queue = 0; queue < map->nr_queues; queue++) {
-		mask = ib_get_vector_affinity(dev, first_vec + queue);
-		if (!mask)
-			goto fallback;
-
-		for_each_cpu(cpu, mask)
-			map->mq_map[cpu] = map->queue_offset + queue;
-	}
-
-	return;
-
-fallback:
-	blk_mq_map_queues(map);
-}
-EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4767ae19178c..0eb79696fb73 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
-#include <linux/blk-mq-rdma.h>
 #include <linux/blk-integrity.h>
 #include <linux/types.h>
 #include <linux/list.h>
@@ -2159,10 +2158,8 @@ static void nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 			ctrl->io_queues[HCTX_TYPE_DEFAULT];
 		set->map[HCTX_TYPE_READ].queue_offset = 0;
 	}
-	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
-			ctrl->device->dev, 0);
-	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
-			ctrl->device->dev, 0);
+	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
 
 	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
 		/* map dedicated poll queues only if we have queues left */
diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h
deleted file mode 100644
index 53b58c610e76..000000000000
--- a/include/linux/blk-mq-rdma.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_BLK_MQ_RDMA_H
-#define _LINUX_BLK_MQ_RDMA_H
-
-struct blk_mq_tag_set;
-struct ib_device;
-
-void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
-		struct ib_device *dev, int first_vec);
-
-#endif /* _LINUX_BLK_MQ_RDMA_H */
-- 
cgit v1.2.3


From 6539cffa94957241c096099a57d05fa4d8c7db8a Mon Sep 17 00:00:00 2001
From: Radu Rendec <rrendec@redhat.com>
Date: Wed, 12 Apr 2023 14:57:57 -0400
Subject: cacheinfo: Add arch specific early level initializer

This patch gives architecture specific code the ability to initialize
the cache level and allocate cacheinfo memory early, when cache level
initialization runs on the primary CPU for all possible CPUs.

This is part of a patch series that attempts to further the work in
commit 5944ce092b97 ("arch_topology: Build cacheinfo from primary CPU").
Previously, in the absence of any DT/ACPI cache info, architecture
specific cache detection and info allocation for secondary CPUs would
happen in non-preemptible context during early CPU initialization and
trigger a "BUG: sleeping function called from invalid context" splat on
an RT kernel.

More specifically, this patch adds the early_cache_level() function,
which is called by fetch_cache_info() as a fallback when the number of
cache leaves cannot be extracted from DT/ACPI. In the default generic
(weak) implementation, this new function returns -ENOENT, which
preserves the original behavior for architectures that do not implement
the function.

Since early detection can get the number of cache leaves wrong in some
cases*, additional logic is added to still call init_cache_level() later
on the secondary CPU, therefore giving the architecture specific code an
opportunity to go back and fix the initial guess. Again, the original
behavior is preserved for architectures that do not implement the new
function.

* For example, on arm64, CLIDR_EL1 detection works only when it runs on
  the current CPU. In other words, a CPU cannot detect the cache depth
  for any other CPU than itself.

Signed-off-by: Radu Rendec <rrendec@redhat.com>
Reviewed-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://lore.kernel.org/r/20230412185759.755408-2-rrendec@redhat.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/cacheinfo.c  | 75 +++++++++++++++++++++++++++++++++--------------
 include/linux/cacheinfo.h |  2 ++
 2 files changed, 55 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index f6573c335f4c..6e0a44bad305 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -398,6 +398,11 @@ static void free_cache_attributes(unsigned int cpu)
 	cache_shared_cpu_map_remove(cpu);
 }
 
+int __weak early_cache_level(unsigned int cpu)
+{
+	return -ENOENT;
+}
+
 int __weak init_cache_level(unsigned int cpu)
 {
 	return -ENOENT;
@@ -423,56 +428,82 @@ int allocate_cache_info(int cpu)
 
 int fetch_cache_info(unsigned int cpu)
 {
-	struct cpu_cacheinfo *this_cpu_ci;
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
 	unsigned int levels = 0, split_levels = 0;
 	int ret;
 
 	if (acpi_disabled) {
 		ret = init_of_cache_level(cpu);
-		if (ret < 0)
-			return ret;
 	} else {
 		ret = acpi_get_cache_info(cpu, &levels, &split_levels);
-		if (ret < 0)
+		if (!ret) {
+			this_cpu_ci->num_levels = levels;
+			/*
+			 * This assumes that:
+			 * - there cannot be any split caches (data/instruction)
+			 *   above a unified cache
+			 * - data/instruction caches come by pair
+			 */
+			this_cpu_ci->num_leaves = levels + split_levels;
+		}
+	}
+
+	if (ret || !cache_leaves(cpu)) {
+		ret = early_cache_level(cpu);
+		if (ret)
 			return ret;
 
-		this_cpu_ci = get_cpu_cacheinfo(cpu);
-		this_cpu_ci->num_levels = levels;
-		/*
-		 * This assumes that:
-		 * - there cannot be any split caches (data/instruction)
-		 *   above a unified cache
-		 * - data/instruction caches come by pair
-		 */
-		this_cpu_ci->num_leaves = levels + split_levels;
+		if (!cache_leaves(cpu))
+			return -ENOENT;
+
+		this_cpu_ci->early_ci_levels = true;
 	}
-	if (!cache_leaves(cpu))
-		return -ENOENT;
 
 	return allocate_cache_info(cpu);
 }
 
-int detect_cache_attributes(unsigned int cpu)
+static inline int init_level_allocate_ci(unsigned int cpu)
 {
-	int ret;
+	unsigned int early_leaves = cache_leaves(cpu);
 
 	/* Since early initialization/allocation of the cacheinfo is allowed
 	 * via fetch_cache_info() and this also gets called as CPU hotplug
 	 * callbacks via cacheinfo_cpu_online, the init/alloc can be skipped
 	 * as it will happen only once (the cacheinfo memory is never freed).
-	 * Just populate the cacheinfo.
+	 * Just populate the cacheinfo. However, if the cacheinfo has been
+	 * allocated early through the arch-specific early_cache_level() call,
+	 * there is a chance the info is wrong (this can happen on arm64). In
+	 * that case, call init_cache_level() anyway to give the arch-specific
+	 * code a chance to make things right.
 	 */
-	if (per_cpu_cacheinfo(cpu))
-		goto populate_leaves;
+	if (per_cpu_cacheinfo(cpu) && !ci_cacheinfo(cpu)->early_ci_levels)
+		return 0;
 
 	if (init_cache_level(cpu) || !cache_leaves(cpu))
 		return -ENOENT;
 
-	ret = allocate_cache_info(cpu);
+	/*
+	 * Now that we have properly initialized the cache level info, make
+	 * sure we don't try to do that again the next time we are called
+	 * (e.g. as CPU hotplug callbacks).
+	 */
+	ci_cacheinfo(cpu)->early_ci_levels = false;
+
+	if (cache_leaves(cpu) <= early_leaves)
+		return 0;
+
+	kfree(per_cpu_cacheinfo(cpu));
+	return allocate_cache_info(cpu);
+}
+
+int detect_cache_attributes(unsigned int cpu)
+{
+	int ret;
+
+	ret = init_level_allocate_ci(cpu);
 	if (ret)
 		return ret;
 
-populate_leaves:
 	/*
 	 * populate_cache_leaves() may completely setup the cache leaves and
 	 * shared_cpu_map or it may leave it partially setup.
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 908e19d17f49..6147b2672555 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -76,9 +76,11 @@ struct cpu_cacheinfo {
 	unsigned int num_levels;
 	unsigned int num_leaves;
 	bool cpu_map_populated;
+	bool early_ci_levels;
 };
 
 struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu);
+int early_cache_level(unsigned int cpu);
 int init_cache_level(unsigned int cpu);
 int init_of_cache_level(unsigned int cpu);
 int populate_cache_leaves(unsigned int cpu);
-- 
cgit v1.2.3


From 79d9622d622d49d6c14c51edec2059b8591e7975 Mon Sep 17 00:00:00 2001
From: Patrik Dahlström <risca@dalakolonin.se>
Date: Sat, 8 Apr 2023 13:48:19 +0200
Subject: iio: adc: palmas: remove adc_wakeupX_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It does not seem to be used by anyone and later patches in this series
are made simpler by first removing this. There is now a lot of dead code
that cannot be reached, until later patches revive it. Arguably, this is
preferred over removing the code only to add it again.

Signed-off-by: Patrik Dahlström <risca@dalakolonin.se>
Link: https://lore.kernel.org/r/20230408114825.824505-4-risca@dalakolonin.se
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/palmas_gpadc.c | 50 +++++-------------------------------------
 include/linux/mfd/palmas.h     |  8 -------
 2 files changed, 6 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/palmas_gpadc.c b/drivers/iio/adc/palmas_gpadc.c
index 2921186458e0..03af6cd73ec8 100644
--- a/drivers/iio/adc/palmas_gpadc.c
+++ b/drivers/iio/adc/palmas_gpadc.c
@@ -76,6 +76,12 @@ static struct palmas_gpadc_info palmas_gpadc_info[] = {
 	PALMAS_ADC_INFO(IN15, 0, 0, 0, 0, INVALID, INVALID, true),
 };
 
+struct palmas_adc_wakeup_property {
+	int adc_channel_number;
+	int adc_high_threshold;
+	int adc_low_threshold;
+};
+
 /*
  * struct palmas_gpadc - the palmas_gpadc structure
  * @ch0_current:	channel 0 current source setting
@@ -493,11 +499,6 @@ static int palmas_gpadc_get_adc_dt_data(struct platform_device *pdev,
 	return 0;
 }
 
-static void palmas_disable_wakeup(void *dev)
-{
-	device_wakeup_disable(dev);
-}
-
 static int palmas_gpadc_probe(struct platform_device *pdev)
 {
 	struct palmas_gpadc *adc;
@@ -548,36 +549,6 @@ static int palmas_gpadc_probe(struct platform_device *pdev)
 		return dev_err_probe(adc->dev, ret,
 				     "request irq %d failed\n", adc->irq);
 
-	if (gpadc_pdata->adc_wakeup1_data) {
-		memcpy(&adc->wakeup1_data, gpadc_pdata->adc_wakeup1_data,
-			sizeof(adc->wakeup1_data));
-		adc->wakeup1_enable = true;
-		adc->irq_auto_0 =  platform_get_irq(pdev, 1);
-		ret = devm_request_threaded_irq(&pdev->dev, adc->irq_auto_0,
-						NULL, palmas_gpadc_irq_auto,
-						IRQF_ONESHOT,
-						"palmas-adc-auto-0", adc);
-		if (ret < 0)
-			return dev_err_probe(adc->dev, ret,
-					     "request auto0 irq %d failed\n",
-					     adc->irq_auto_0);
-	}
-
-	if (gpadc_pdata->adc_wakeup2_data) {
-		memcpy(&adc->wakeup2_data, gpadc_pdata->adc_wakeup2_data,
-				sizeof(adc->wakeup2_data));
-		adc->wakeup2_enable = true;
-		adc->irq_auto_1 =  platform_get_irq(pdev, 2);
-		ret = devm_request_threaded_irq(&pdev->dev, adc->irq_auto_1,
-						NULL, palmas_gpadc_irq_auto,
-						IRQF_ONESHOT,
-						"palmas-adc-auto-1", adc);
-		if (ret < 0)
-			return dev_err_probe(adc->dev, ret,
-					     "request auto1 irq %d failed\n",
-					     adc->irq_auto_1);
-	}
-
 	/* set the current source 0 (value 0/5/15/20 uA => 0..3) */
 	if (gpadc_pdata->ch0_current <= 1)
 		adc->ch0_current = PALMAS_ADC_CH0_CURRENT_SRC_0;
@@ -617,15 +588,6 @@ static int palmas_gpadc_probe(struct platform_device *pdev)
 			palmas_gpadc_calibrate(adc, i);
 	}
 
-	if (adc->wakeup1_enable || adc->wakeup2_enable) {
-		device_wakeup_enable(&pdev->dev);
-		ret = devm_add_action_or_reset(&pdev->dev,
-					       palmas_disable_wakeup,
-					       &pdev->dev);
-		if (ret)
-			return ret;
-	}
-
 	return 0;
 }
 
diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h
index 117d02708439..eda1ffd99c1a 100644
--- a/include/linux/mfd/palmas.h
+++ b/include/linux/mfd/palmas.h
@@ -128,12 +128,6 @@ struct palmas_pmic_driver_data {
 			    struct regulator_config config);
 };
 
-struct palmas_adc_wakeup_property {
-	int adc_channel_number;
-	int adc_high_threshold;
-	int adc_low_threshold;
-};
-
 struct palmas_gpadc_platform_data {
 	/* Channel 3 current source is only enabled during conversion */
 	int ch3_current;	/* 0: off; 1: 10uA; 2: 400uA; 3: 800 uA */
@@ -152,8 +146,6 @@ struct palmas_gpadc_platform_data {
 	int start_polarity;
 
 	int auto_conversion_period_ms;
-	struct palmas_adc_wakeup_property *adc_wakeup1_data;
-	struct palmas_adc_wakeup_property *adc_wakeup2_data;
 };
 
 struct palmas_reg_init {
-- 
cgit v1.2.3


From 33719b57f52e5b761234373f98f55f4e036d61c9 Mon Sep 17 00:00:00 2001
From: Andrew Halaney <ahalaney@redhat.com>
Date: Tue, 11 Apr 2023 15:04:06 -0500
Subject: net: stmmac: dwmac4: Allow platforms to specify some DMA/MTL offsets

Some platforms have dwmac4 implementations that have a different
address space layout than the default, resulting in the need to define
their own DMA/MTL offsets.

Extend the functions to allow a platform driver to indicate what its
addresses are, overriding the defaults.

Signed-off-by: Andrew Halaney <ahalaney@redhat.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Brian Masney <bmasney@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 101 +++++++++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |  36 +++--
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  | 157 +++++++++++++---------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h  |  54 +++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c  |  67 +++++----
 include/linux/stmmac.h                            |  19 +++
 6 files changed, 293 insertions(+), 141 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index ccd49346d3b3..4538f334df57 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -336,14 +336,25 @@ enum power_event {
 
 #define MTL_CHAN_BASE_ADDR		0x00000d00
 #define MTL_CHAN_BASE_OFFSET		0x40
-#define MTL_CHANX_BASE_ADDR(x)		(MTL_CHAN_BASE_ADDR + \
-					(x * MTL_CHAN_BASE_OFFSET))
 
-#define MTL_CHAN_TX_OP_MODE(x)		MTL_CHANX_BASE_ADDR(x)
-#define MTL_CHAN_TX_DEBUG(x)		(MTL_CHANX_BASE_ADDR(x) + 0x8)
-#define MTL_CHAN_INT_CTRL(x)		(MTL_CHANX_BASE_ADDR(x) + 0x2c)
-#define MTL_CHAN_RX_OP_MODE(x)		(MTL_CHANX_BASE_ADDR(x) + 0x30)
-#define MTL_CHAN_RX_DEBUG(x)		(MTL_CHANX_BASE_ADDR(x) + 0x38)
+static inline u32 mtl_chanx_base_addr(const struct dwmac4_addrs *addrs,
+				      const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_chan + (x * addrs->mtl_chan_offset);
+	else
+		addr = MTL_CHAN_BASE_ADDR + (x * MTL_CHAN_BASE_OFFSET);
+
+	return addr;
+}
+
+#define MTL_CHAN_TX_OP_MODE(addrs, x)	mtl_chanx_base_addr(addrs, x)
+#define MTL_CHAN_TX_DEBUG(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x8)
+#define MTL_CHAN_INT_CTRL(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x2c)
+#define MTL_CHAN_RX_OP_MODE(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x30)
+#define MTL_CHAN_RX_DEBUG(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x38)
 
 #define MTL_OP_MODE_RSF			BIT(5)
 #define MTL_OP_MODE_TXQEN_MASK		GENMASK(3, 2)
@@ -388,8 +399,19 @@ enum power_event {
 /* MTL ETS Control register */
 #define MTL_ETS_CTRL_BASE_ADDR		0x00000d10
 #define MTL_ETS_CTRL_BASE_OFFSET	0x40
-#define MTL_ETSX_CTRL_BASE_ADDR(x)	(MTL_ETS_CTRL_BASE_ADDR + \
-					((x) * MTL_ETS_CTRL_BASE_OFFSET))
+
+static inline u32 mtl_etsx_ctrl_base_addr(const struct dwmac4_addrs *addrs,
+					  const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_ets_ctrl + (x * addrs->mtl_ets_ctrl_offset);
+	else
+		addr = MTL_ETS_CTRL_BASE_ADDR + (x * MTL_ETS_CTRL_BASE_OFFSET);
+
+	return addr;
+}
 
 #define MTL_ETS_CTRL_CC			BIT(3)
 #define MTL_ETS_CTRL_AVALG		BIT(2)
@@ -397,31 +419,76 @@ enum power_event {
 /* MTL Queue Quantum Weight */
 #define MTL_TXQ_WEIGHT_BASE_ADDR	0x00000d18
 #define MTL_TXQ_WEIGHT_BASE_OFFSET	0x40
-#define MTL_TXQX_WEIGHT_BASE_ADDR(x)	(MTL_TXQ_WEIGHT_BASE_ADDR + \
-					((x) * MTL_TXQ_WEIGHT_BASE_OFFSET))
+
+static inline u32 mtl_txqx_weight_base_addr(const struct dwmac4_addrs *addrs,
+					    const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_txq_weight + (x * addrs->mtl_txq_weight_offset);
+	else
+		addr = MTL_TXQ_WEIGHT_BASE_ADDR + (x * MTL_TXQ_WEIGHT_BASE_OFFSET);
+
+	return addr;
+}
+
 #define MTL_TXQ_WEIGHT_ISCQW_MASK	GENMASK(20, 0)
 
 /* MTL sendSlopeCredit register */
 #define MTL_SEND_SLP_CRED_BASE_ADDR	0x00000d1c
 #define MTL_SEND_SLP_CRED_OFFSET	0x40
-#define MTL_SEND_SLP_CREDX_BASE_ADDR(x)	(MTL_SEND_SLP_CRED_BASE_ADDR + \
-					((x) * MTL_SEND_SLP_CRED_OFFSET))
+
+static inline u32 mtl_send_slp_credx_base_addr(const struct dwmac4_addrs *addrs,
+					       const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_send_slp_cred + (x * addrs->mtl_send_slp_cred_offset);
+	else
+		addr = MTL_SEND_SLP_CRED_BASE_ADDR + (x * MTL_SEND_SLP_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_SEND_SLP_CRED_SSC_MASK	GENMASK(13, 0)
 
 /* MTL hiCredit register */
 #define MTL_HIGH_CRED_BASE_ADDR		0x00000d20
 #define MTL_HIGH_CRED_OFFSET		0x40
-#define MTL_HIGH_CREDX_BASE_ADDR(x)	(MTL_HIGH_CRED_BASE_ADDR + \
-					((x) * MTL_HIGH_CRED_OFFSET))
+
+static inline u32 mtl_high_credx_base_addr(const struct dwmac4_addrs *addrs,
+					   const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_high_cred + (x * addrs->mtl_high_cred_offset);
+	else
+		addr = MTL_HIGH_CRED_BASE_ADDR + (x * MTL_HIGH_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_HIGH_CRED_HC_MASK		GENMASK(28, 0)
 
 /* MTL loCredit register */
 #define MTL_LOW_CRED_BASE_ADDR		0x00000d24
 #define MTL_LOW_CRED_OFFSET		0x40
-#define MTL_LOW_CREDX_BASE_ADDR(x)	(MTL_LOW_CRED_BASE_ADDR + \
-					((x) * MTL_LOW_CRED_OFFSET))
+
+static inline u32 mtl_low_credx_base_addr(const struct dwmac4_addrs *addrs,
+					  const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_low_cred + (x * addrs->mtl_low_cred_offset);
+	else
+		addr = MTL_LOW_CRED_BASE_ADDR + (x * MTL_LOW_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_HIGH_CRED_LC_MASK		GENMASK(28, 0)
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index f44180519638..afaec3fb9ab6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -202,12 +202,14 @@ static void dwmac4_set_mtl_tx_queue_weight(struct stmmac_priv *priv,
 					   struct mac_device_info *hw,
 					   u32 weight, u32 queue)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
-	u32 value = readl(ioaddr + MTL_TXQX_WEIGHT_BASE_ADDR(queue));
+	u32 value = readl(ioaddr + mtl_txqx_weight_base_addr(dwmac4_addrs,
+							     queue));
 
 	value &= ~MTL_TXQ_WEIGHT_ISCQW_MASK;
 	value |= weight & MTL_TXQ_WEIGHT_ISCQW_MASK;
-	writel(value, ioaddr + MTL_TXQX_WEIGHT_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_txqx_weight_base_addr(dwmac4_addrs, queue));
 }
 
 static void dwmac4_map_mtl_dma(struct mac_device_info *hw, u32 queue, u32 chan)
@@ -233,6 +235,7 @@ static void dwmac4_config_cbs(struct stmmac_priv *priv,
 			      u32 send_slope, u32 idle_slope,
 			      u32 high_credit, u32 low_credit, u32 queue)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value;
 
@@ -243,31 +246,33 @@ static void dwmac4_config_cbs(struct stmmac_priv *priv,
 	pr_debug("\tlow_credit: 0x%08x\n", low_credit);
 
 	/* enable AV algorithm */
-	value = readl(ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_etsx_ctrl_base_addr(dwmac4_addrs, queue));
 	value |= MTL_ETS_CTRL_AVALG;
 	value |= MTL_ETS_CTRL_CC;
-	writel(value, ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_etsx_ctrl_base_addr(dwmac4_addrs, queue));
 
 	/* configure send slope */
-	value = readl(ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_send_slp_credx_base_addr(dwmac4_addrs,
+							    queue));
 	value &= ~MTL_SEND_SLP_CRED_SSC_MASK;
 	value |= send_slope & MTL_SEND_SLP_CRED_SSC_MASK;
-	writel(value, ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_send_slp_credx_base_addr(dwmac4_addrs,
+							    queue));
 
 	/* configure idle slope (same register as tx weight) */
 	dwmac4_set_mtl_tx_queue_weight(priv, hw, idle_slope, queue);
 
 	/* configure high credit */
-	value = readl(ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_high_credx_base_addr(dwmac4_addrs, queue));
 	value &= ~MTL_HIGH_CRED_HC_MASK;
 	value |= high_credit & MTL_HIGH_CRED_HC_MASK;
-	writel(value, ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_high_credx_base_addr(dwmac4_addrs, queue));
 
 	/* configure high credit */
-	value = readl(ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_low_credx_base_addr(dwmac4_addrs, queue));
 	value &= ~MTL_HIGH_CRED_LC_MASK;
 	value |= low_credit & MTL_HIGH_CRED_LC_MASK;
-	writel(value, ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_low_credx_base_addr(dwmac4_addrs, queue));
 }
 
 static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space)
@@ -764,6 +769,7 @@ static void dwmac4_phystatus(void __iomem *ioaddr, struct stmmac_extra_stats *x)
 static int dwmac4_irq_mtl_status(struct stmmac_priv *priv,
 				 struct mac_device_info *hw, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
 	u32 mtl_int_qx_status;
 	int ret = 0;
@@ -773,12 +779,13 @@ static int dwmac4_irq_mtl_status(struct stmmac_priv *priv,
 	/* Check MTL Interrupt */
 	if (mtl_int_qx_status & MTL_INT_QX(chan)) {
 		/* read Queue x Interrupt status */
-		u32 status = readl(ioaddr + MTL_CHAN_INT_CTRL(chan));
+		u32 status = readl(ioaddr + MTL_CHAN_INT_CTRL(dwmac4_addrs,
+							      chan));
 
 		if (status & MTL_RX_OVERFLOW_INT) {
 			/*  clear Interrupt */
 			writel(status | MTL_RX_OVERFLOW_INT,
-			       ioaddr + MTL_CHAN_INT_CTRL(chan));
+			       ioaddr + MTL_CHAN_INT_CTRL(dwmac4_addrs, chan));
 			ret = CORE_IRQ_MTL_RX_OVERFLOW;
 		}
 	}
@@ -840,11 +847,12 @@ static void dwmac4_debug(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 struct stmmac_extra_stats *x,
 			 u32 rx_queues, u32 tx_queues)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 queue;
 
 	for (queue = 0; queue < tx_queues; queue++) {
-		value = readl(ioaddr + MTL_CHAN_TX_DEBUG(queue));
+		value = readl(ioaddr + MTL_CHAN_TX_DEBUG(dwmac4_addrs, queue));
 
 		if (value & MTL_DEBUG_TXSTSFSTS)
 			x->mtl_tx_status_fifo_full++;
@@ -869,7 +877,7 @@ static void dwmac4_debug(struct stmmac_priv *priv, void __iomem *ioaddr,
 	}
 
 	for (queue = 0; queue < rx_queues; queue++) {
-		value = readl(ioaddr + MTL_CHAN_RX_DEBUG(queue));
+		value = readl(ioaddr + MTL_CHAN_RX_DEBUG(dwmac4_addrs, queue));
 
 		if (value & MTL_DEBUG_RXFSTS_MASK) {
 			u32 rxfsts = (value & MTL_DEBUG_RXFSTS_MASK)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 6f255d12f60f..84d3a8551b03 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -13,6 +13,7 @@
 #include <linux/io.h>
 #include "dwmac4.h"
 #include "dwmac4_dma.h"
+#include "stmmac.h"
 
 static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 {
@@ -73,18 +74,20 @@ static void dwmac4_dma_init_rx_chan(struct stmmac_priv *priv,
 				    struct stmmac_dma_cfg *dma_cfg,
 				    dma_addr_t dma_rx_phy, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
-	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && likely(dma_cfg->eame))
 		writel(upper_32_bits(dma_rx_phy),
-		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(chan));
+		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(dwmac4_addrs, chan));
 
-	writel(lower_32_bits(dma_rx_phy), ioaddr + DMA_CHAN_RX_BASE_ADDR(chan));
+	writel(lower_32_bits(dma_rx_phy),
+	       ioaddr + DMA_CHAN_RX_BASE_ADDR(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init_tx_chan(struct stmmac_priv *priv,
@@ -92,57 +95,61 @@ static void dwmac4_dma_init_tx_chan(struct stmmac_priv *priv,
 				    struct stmmac_dma_cfg *dma_cfg,
 				    dma_addr_t dma_tx_phy, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
 
-	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT);
 
 	/* Enable OSP to get best performance */
 	value |= DMA_CONTROL_OSP;
 
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && likely(dma_cfg->eame))
 		writel(upper_32_bits(dma_tx_phy),
-		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(chan));
+		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(dwmac4_addrs, chan));
 
-	writel(lower_32_bits(dma_tx_phy), ioaddr + DMA_CHAN_TX_BASE_ADDR(chan));
+	writel(lower_32_bits(dma_tx_phy),
+	       ioaddr + DMA_CHAN_TX_BASE_ADDR(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init_channel(struct stmmac_priv *priv,
 				    void __iomem *ioaddr,
 				    struct stmmac_dma_cfg *dma_cfg, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	/* common channel control register config */
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (dma_cfg->pblx8)
 		value = value | DMA_BUS_MODE_PBL;
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_CHAN_INTR_DEFAULT_MASK,
-	       ioaddr + DMA_CHAN_INTR_ENA(chan));
+	       ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 static void dwmac410_dma_init_channel(struct stmmac_priv *priv,
 				      void __iomem *ioaddr,
 				      struct stmmac_dma_cfg *dma_cfg, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	/* common channel control register config */
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (dma_cfg->pblx8)
 		value = value | DMA_BUS_MODE_PBL;
 
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_CHAN_INTR_DEFAULT_MASK_4_10,
-	       ioaddr + DMA_CHAN_INTR_ENA(chan));
+	       ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init(void __iomem *ioaddr,
@@ -184,40 +191,46 @@ static void _dwmac4_dump_dma_regs(struct stmmac_priv *priv,
 				  void __iomem *ioaddr, u32 channel,
 				  u32 *reg_space)
 {
-	reg_space[DMA_CHAN_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CONTROL(channel));
-	reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
-	reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
-	reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel));
-	reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
-	reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel));
-	reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel));
-	reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel));
-	reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel));
-	reg_space[DMA_CHAN_INTR_ENA(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_INTR_ENA(channel));
-	reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel));
-	reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel));
-	reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel));
-	reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel));
-	reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel));
-	reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel));
-	reg_space[DMA_CHAN_STATUS(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_STATUS(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	const struct dwmac4_addrs *default_addrs = NULL;
+
+	/* Purposely save the registers in the "normal" layout, regardless of
+	 * platform modifications, to keep reg_space size constant
+	 */
+	reg_space[DMA_CHAN_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_BASE_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_BASE_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_END_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_END_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_END_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_END_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_RING_LEN(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_RING_LEN(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_RING_LEN(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_RING_LEN(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_INTR_ENA(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_WATCHDOG(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_WATCHDOG(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_SLOT_CTRL_STATUS(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_TX_DESC(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_TX_DESC(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_RX_DESC(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_RX_DESC(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_STATUS(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, channel));
 }
 
 static void dwmac4_dump_dma_regs(struct stmmac_priv *priv, void __iomem *ioaddr,
@@ -232,17 +245,20 @@ static void dwmac4_dump_dma_regs(struct stmmac_priv *priv, void __iomem *ioaddr,
 static void dwmac4_rx_watchdog(struct stmmac_priv *priv, void __iomem *ioaddr,
 			       u32 riwt, u32 queue)
 {
-	writel(riwt, ioaddr + DMA_CHAN_RX_WATCHDOG(queue));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(riwt, ioaddr + DMA_CHAN_RX_WATCHDOG(dwmac4_addrs, queue));
 }
 
 static void dwmac4_dma_rx_chan_op_mode(struct stmmac_priv *priv,
 				       void __iomem *ioaddr, int mode,
 				       u32 channel, int fifosz, u8 qmode)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	unsigned int rqs = fifosz / 256 - 1;
 	u32 mtl_rx_op;
 
-	mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(channel));
+	mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(dwmac4_addrs, channel));
 
 	if (mode == SF_DMA_MODE) {
 		pr_debug("GMAC: enable RX store and forward mode\n");
@@ -300,14 +316,16 @@ static void dwmac4_dma_rx_chan_op_mode(struct stmmac_priv *priv,
 		mtl_rx_op |= rfa << MTL_OP_MODE_RFA_SHIFT;
 	}
 
-	writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(channel));
+	writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static void dwmac4_dma_tx_chan_op_mode(struct stmmac_priv *priv,
 				       void __iomem *ioaddr, int mode,
 				       u32 channel, int fifosz, u8 qmode)
 {
-	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(dwmac4_addrs,
+							   channel));
 	unsigned int tqs = fifosz / 256 - 1;
 
 	if (mode == SF_DMA_MODE) {
@@ -353,7 +371,7 @@ static void dwmac4_dma_tx_chan_op_mode(struct stmmac_priv *priv,
 	mtl_tx_op &= ~MTL_OP_MODE_TQS_MASK;
 	mtl_tx_op |= tqs << MTL_OP_MODE_TQS_SHIFT;
 
-	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(channel));
+	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static int dwmac4_get_hw_feature(void __iomem *ioaddr,
@@ -454,25 +472,28 @@ static int dwmac4_get_hw_feature(void __iomem *ioaddr,
 static void dwmac4_enable_tso(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      bool en, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	if (en) {
 		/* enable TSO */
-		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 		writel(value | DMA_CONTROL_TSE,
-		       ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		       ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	} else {
 		/* enable TSO */
-		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 		writel(value & ~DMA_CONTROL_TSE,
-		       ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		       ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	}
 }
 
 static void dwmac4_qmode(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 channel, u8 qmode)
 {
-	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(dwmac4_addrs,
+							   channel));
 
 	mtl_tx_op &= ~MTL_OP_MODE_TXQEN_MASK;
 	if (qmode != MTL_QUEUE_AVB)
@@ -480,50 +501,54 @@ static void dwmac4_qmode(struct stmmac_priv *priv, void __iomem *ioaddr,
 	else
 		mtl_tx_op |= MTL_OP_MODE_TXQEN_AV;
 
-	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(channel));
+	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static void dwmac4_set_bfsize(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      int bfsize, u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_RBSZ_MASK;
 	value |= (bfsize << DMA_RBSZ_SHIFT) & DMA_RBSZ_MASK;
 
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 }
 
 static void dwmac4_enable_sph(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      bool en, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value = readl(ioaddr + GMAC_EXT_CONFIG);
 
 	value &= ~GMAC_CONFIG_HDSMS;
 	value |= GMAC_CONFIG_HDSMS_256; /* Segment max 256 bytes */
 	writel(value, ioaddr + GMAC_EXT_CONFIG);
 
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (en)
 		value |= DMA_CONTROL_SPH;
 	else
 		value &= ~DMA_CONTROL_SPH;
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 }
 
 static int dwmac4_enable_tbs(struct stmmac_priv *priv, void __iomem *ioaddr,
 			     bool en, u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	if (en)
 		value |= DMA_CONTROL_EDSE;
 	else
 		value &= ~DMA_CONTROL_EDSE;
 
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
-	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan)) & DMA_CONTROL_EDSE;
+	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs,
+						   chan)) & DMA_CONTROL_EDSE;
 	if (en && !value)
 		return -EIO;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
index 740c3bc8d9a0..358e7dcb6a9a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
@@ -95,29 +95,41 @@
 /* Following DMA defines are chanels oriented */
 #define DMA_CHAN_BASE_ADDR		0x00001100
 #define DMA_CHAN_BASE_OFFSET		0x80
-#define DMA_CHANX_BASE_ADDR(x)		(DMA_CHAN_BASE_ADDR + \
-					(x * DMA_CHAN_BASE_OFFSET))
+
+static inline u32 dma_chanx_base_addr(const struct dwmac4_addrs *addrs,
+				      const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->dma_chan + (x * addrs->dma_chan_offset);
+	else
+		addr = DMA_CHAN_BASE_ADDR + (x * DMA_CHAN_BASE_OFFSET);
+
+	return addr;
+}
+
 #define DMA_CHAN_REG_NUMBER		17
 
-#define DMA_CHAN_CONTROL(x)		DMA_CHANX_BASE_ADDR(x)
-#define DMA_CHAN_TX_CONTROL(x)		(DMA_CHANX_BASE_ADDR(x) + 0x4)
-#define DMA_CHAN_RX_CONTROL(x)		(DMA_CHANX_BASE_ADDR(x) + 0x8)
-#define DMA_CHAN_TX_BASE_ADDR_HI(x)	(DMA_CHANX_BASE_ADDR(x) + 0x10)
-#define DMA_CHAN_TX_BASE_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x14)
-#define DMA_CHAN_RX_BASE_ADDR_HI(x)	(DMA_CHANX_BASE_ADDR(x) + 0x18)
-#define DMA_CHAN_RX_BASE_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x1c)
-#define DMA_CHAN_TX_END_ADDR(x)		(DMA_CHANX_BASE_ADDR(x) + 0x20)
-#define DMA_CHAN_RX_END_ADDR(x)		(DMA_CHANX_BASE_ADDR(x) + 0x28)
-#define DMA_CHAN_TX_RING_LEN(x)		(DMA_CHANX_BASE_ADDR(x) + 0x2c)
-#define DMA_CHAN_RX_RING_LEN(x)		(DMA_CHANX_BASE_ADDR(x) + 0x30)
-#define DMA_CHAN_INTR_ENA(x)		(DMA_CHANX_BASE_ADDR(x) + 0x34)
-#define DMA_CHAN_RX_WATCHDOG(x)		(DMA_CHANX_BASE_ADDR(x) + 0x38)
-#define DMA_CHAN_SLOT_CTRL_STATUS(x)	(DMA_CHANX_BASE_ADDR(x) + 0x3c)
-#define DMA_CHAN_CUR_TX_DESC(x)		(DMA_CHANX_BASE_ADDR(x) + 0x44)
-#define DMA_CHAN_CUR_RX_DESC(x)		(DMA_CHANX_BASE_ADDR(x) + 0x4c)
-#define DMA_CHAN_CUR_TX_BUF_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x54)
-#define DMA_CHAN_CUR_RX_BUF_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x5c)
-#define DMA_CHAN_STATUS(x)		(DMA_CHANX_BASE_ADDR(x) + 0x60)
+#define DMA_CHAN_CONTROL(addrs, x)	dma_chanx_base_addr(addrs, x)
+#define DMA_CHAN_TX_CONTROL(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x4)
+#define DMA_CHAN_RX_CONTROL(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x8)
+#define DMA_CHAN_TX_BASE_ADDR_HI(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x10)
+#define DMA_CHAN_TX_BASE_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x14)
+#define DMA_CHAN_RX_BASE_ADDR_HI(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x18)
+#define DMA_CHAN_RX_BASE_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x1c)
+#define DMA_CHAN_TX_END_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x20)
+#define DMA_CHAN_RX_END_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x28)
+#define DMA_CHAN_TX_RING_LEN(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x2c)
+#define DMA_CHAN_RX_RING_LEN(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x30)
+#define DMA_CHAN_INTR_ENA(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x34)
+#define DMA_CHAN_RX_WATCHDOG(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x38)
+#define DMA_CHAN_SLOT_CTRL_STATUS(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x3c)
+#define DMA_CHAN_CUR_TX_DESC(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x44)
+#define DMA_CHAN_CUR_RX_DESC(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x4c)
+#define DMA_CHAN_CUR_TX_BUF_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x54)
+#define DMA_CHAN_CUR_RX_BUF_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x5c)
+#define DMA_CHAN_STATUS(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x60)
 
 /* DMA Control X */
 #define DMA_CONTROL_SPH			BIT(24)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
index 5e9c495aa03e..df41eac54058 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
@@ -11,6 +11,7 @@
 #include "common.h"
 #include "dwmac4_dma.h"
 #include "dwmac4.h"
+#include "stmmac.h"
 
 int dwmac4_dma_reset(void __iomem *ioaddr)
 {
@@ -28,22 +29,27 @@ int dwmac4_dma_reset(void __iomem *ioaddr)
 void dwmac4_set_rx_tail_ptr(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 tail_ptr, u32 chan)
 {
-	writel(tail_ptr, ioaddr + DMA_CHAN_RX_END_ADDR(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(tail_ptr, ioaddr + DMA_CHAN_RX_END_ADDR(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_tx_tail_ptr(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 tail_ptr, u32 chan)
 {
-	writel(tail_ptr, ioaddr + DMA_CHAN_TX_END_ADDR(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(tail_ptr, ioaddr + DMA_CHAN_TX_END_ADDR(dwmac4_addrs, chan));
 }
 
 void dwmac4_dma_start_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value |= DMA_CONTROL_ST;
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value = readl(ioaddr + GMAC_CONFIG);
 	value |= GMAC_CONFIG_TE;
@@ -53,20 +59,24 @@ void dwmac4_dma_start_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 void dwmac4_dma_stop_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_CONTROL_ST;
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 }
 
 void dwmac4_dma_start_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value |= DMA_CONTROL_SR;
 
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value = readl(ioaddr + GMAC_CONFIG);
 	value |= GMAC_CONFIG_RE;
@@ -76,81 +86,91 @@ void dwmac4_dma_start_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 void dwmac4_dma_stop_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_CONTROL_SR;
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_tx_ring_len(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 len, u32 chan)
 {
-	writel(len, ioaddr + DMA_CHAN_TX_RING_LEN(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(len, ioaddr + DMA_CHAN_TX_RING_LEN(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_rx_ring_len(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 len, u32 chan)
 {
-	writel(len, ioaddr + DMA_CHAN_RX_RING_LEN(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(len, ioaddr + DMA_CHAN_RX_RING_LEN(dwmac4_addrs, chan));
 }
 
 void dwmac4_enable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			   u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value |= DMA_CHAN_INTR_DEFAULT_RX;
 	if (tx)
 		value |= DMA_CHAN_INTR_DEFAULT_TX;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac410_enable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			     u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value |= DMA_CHAN_INTR_DEFAULT_RX_4_10;
 	if (tx)
 		value |= DMA_CHAN_INTR_DEFAULT_TX_4_10;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac4_disable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_RX;
 	if (tx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_TX;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac410_disable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_RX_4_10;
 	if (tx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_TX_4_10;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 struct stmmac_extra_stats *x, u32 chan, u32 dir)
 {
-	u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(chan));
-	u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan));
+	u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 	int ret = 0;
 
 	if (dir == DMA_DIR_RX)
@@ -195,7 +215,8 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr,
 	if (unlikely(intr_status & DMA_CHAN_STATUS_ERI))
 		x->rx_early_irq++;
 
-	writel(intr_status & intr_en, ioaddr + DMA_CHAN_STATUS(chan));
+	writel(intr_status & intr_en,
+	       ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan));
 	return ret;
 }
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dafa001e9e7a..225751a8fd8e 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -186,6 +186,24 @@ struct stmmac_safety_feature_cfg {
 	u32 tmouten;
 };
 
+/* Addresses that may be customized by a platform */
+struct dwmac4_addrs {
+	u32 dma_chan;
+	u32 dma_chan_offset;
+	u32 mtl_chan;
+	u32 mtl_chan_offset;
+	u32 mtl_ets_ctrl;
+	u32 mtl_ets_ctrl_offset;
+	u32 mtl_txq_weight;
+	u32 mtl_txq_weight_offset;
+	u32 mtl_send_slp_cred;
+	u32 mtl_send_slp_cred_offset;
+	u32 mtl_high_cred;
+	u32 mtl_high_cred_offset;
+	u32 mtl_low_cred;
+	u32 mtl_low_cred_offset;
+};
+
 struct plat_stmmacenet_data {
 	int bus_id;
 	int phy_addr;
@@ -274,5 +292,6 @@ struct plat_stmmacenet_data {
 	bool use_phy_wol;
 	bool sph_disable;
 	bool serdes_up_after_phy_linkup;
+	const struct dwmac4_addrs *dwmac4_addrs;
 };
 #endif
-- 
cgit v1.2.3


From 0af4d704ba8e5ee632b6e65015ffe4d229c1a9a9 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 12 Apr 2023 13:56:36 +0200
Subject: pwm: Delete deprecated functions pwm_request() and pwm_free()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 5a7fbe452ad9 ("backlight: pwm_bl: Drop support for legacy PWM
probing") the last user of pwm_request() and pwm_free() is gone. So remove
these functions that were deprecated over 10 years ago in commit
8138d2ddbcca ("pwm: Add table-based lookup for static mappings").

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
[thierry.reding@gmail.com: clean up a bit after removal]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/pwm.rst | 13 ++++-----
 drivers/pwm/core.c               | 58 ++--------------------------------------
 include/linux/pwm.h              | 13 ---------
 3 files changed, 7 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index 8c71a2055d27..3fdc95f7a1d1 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -35,12 +35,9 @@ consumers to providers, as given in the following example::
 Using PWMs
 ----------
 
-Legacy users can request a PWM device using pwm_request() and free it
-after usage with pwm_free().
-
-New users should use the pwm_get() function and pass to it the consumer
-device or a consumer name. pwm_put() is used to free the PWM device. Managed
-variants of the getter, devm_pwm_get() and devm_fwnode_pwm_get(), also exist.
+Consumers use the pwm_get() function and pass to it the consumer device or a
+consumer name. pwm_put() is used to free the PWM device. Managed variants of
+the getter, devm_pwm_get() and devm_fwnode_pwm_get(), also exist.
 
 After being requested, a PWM has to be configured using::
 
@@ -165,8 +162,8 @@ consumers should implement it as described in the "Using PWMs" section.
 Locking
 -------
 
-The PWM core list manipulations are protected by a mutex, so pwm_request()
-and pwm_free() may not be called from an atomic context. Currently the
+The PWM core list manipulations are protected by a mutex, so pwm_get()
+and pwm_put() may not be called from an atomic context. Currently the
 PWM core does not enforce any locking to pwm_enable(), pwm_disable() and
 pwm_config(), so the calling context is currently driver specific. This
 is an issue derived from the former barebone API and should be fixed soon.
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 474725714a05..9ce85c6157e4 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -35,11 +35,6 @@ static LIST_HEAD(pwm_chips);
 static DECLARE_BITMAP(allocated_pwms, MAX_PWMS);
 static RADIX_TREE(pwm_tree, GFP_KERNEL);
 
-static struct pwm_device *pwm_to_device(unsigned int pwm)
-{
-	return radix_tree_lookup(&pwm_tree, pwm);
-}
-
 /* Called with pwm_lock held */
 static int alloc_pwms(unsigned int count)
 {
@@ -369,43 +364,6 @@ int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip)
 }
 EXPORT_SYMBOL_GPL(devm_pwmchip_add);
 
-/**
- * pwm_request() - request a PWM device
- * @pwm: global PWM device index
- * @label: PWM device label
- *
- * This function is deprecated, use pwm_get() instead.
- *
- * Returns: A pointer to a PWM device or an ERR_PTR()-encoded error code on
- * failure.
- */
-struct pwm_device *pwm_request(int pwm, const char *label)
-{
-	struct pwm_device *dev;
-	int err;
-
-	if (pwm < 0 || pwm >= MAX_PWMS)
-		return ERR_PTR(-EINVAL);
-
-	mutex_lock(&pwm_lock);
-
-	dev = pwm_to_device(pwm);
-	if (!dev) {
-		dev = ERR_PTR(-EPROBE_DEFER);
-		goto out;
-	}
-
-	err = pwm_device_request(dev, label);
-	if (err < 0)
-		dev = ERR_PTR(err);
-
-out:
-	mutex_unlock(&pwm_lock);
-
-	return dev;
-}
-EXPORT_SYMBOL_GPL(pwm_request);
-
 /**
  * pwm_request_from_chip() - request a PWM device relative to a PWM chip
  * @chip: PWM chip
@@ -438,18 +396,6 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 }
 EXPORT_SYMBOL_GPL(pwm_request_from_chip);
 
-/**
- * pwm_free() - free a PWM device
- * @pwm: PWM device
- *
- * This function is deprecated, use pwm_put() instead.
- */
-void pwm_free(struct pwm_device *pwm)
-{
-	pwm_put(pwm);
-}
-EXPORT_SYMBOL_GPL(pwm_free);
-
 static void pwm_apply_state_debug(struct pwm_device *pwm,
 				  const struct pwm_state *state)
 {
@@ -790,7 +736,7 @@ static struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
 	dl = pwm_device_link_add(dev, pwm);
 	if (IS_ERR(dl)) {
 		/* of_xlate ended up calling pwm_request_from_chip() */
-		pwm_free(pwm);
+		pwm_put(pwm);
 		pwm = ERR_CAST(dl);
 		goto put;
 	}
@@ -1014,7 +960,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 
 	dl = pwm_device_link_add(dev, pwm);
 	if (IS_ERR(dl)) {
-		pwm_free(pwm);
+		pwm_put(pwm);
 		return ERR_CAST(dl);
 	}
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 7b7b93b6fb81..04ae1d9073a7 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -309,8 +309,6 @@ struct pwm_chip {
 
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
-struct pwm_device *pwm_request(int pwm_id, const char *label);
-void pwm_free(struct pwm_device *pwm);
 int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
@@ -410,17 +408,6 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
 #else
-static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
-{
-	might_sleep();
-	return ERR_PTR(-ENODEV);
-}
-
-static inline void pwm_free(struct pwm_device *pwm)
-{
-	might_sleep();
-}
-
 static inline int pwm_apply_state(struct pwm_device *pwm,
 				  const struct pwm_state *state)
 {
-- 
cgit v1.2.3


From 4668c7a2940d134bea50058e138591b97485c5da Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 27 Mar 2023 23:37:32 +0900
Subject: fault-inject: allow configuration via configfs

This provides a helper function to allow configuration of fault-injection
for configfs-based drivers.

The config items created by this function have the same interface as the
one created under debugfs by fault_create_debugfs_attr().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Link: https://lore.kernel.org/r/20230327143733.14599-2-akinobu.mita@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fault-inject.h |  22 +++++
 lib/Kconfig.debug            |  13 ++-
 lib/fault-inject.c           | 191 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 225 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 444236dadcf0..481abf530b3c 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -6,6 +6,7 @@
 
 #include <linux/types.h>
 #include <linux/debugfs.h>
+#include <linux/configfs.h>
 #include <linux/ratelimit.h>
 #include <linux/atomic.h>
 
@@ -65,6 +66,27 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
+#ifdef CONFIG_FAULT_INJECTION_CONFIGFS
+
+struct fault_config {
+	struct fault_attr attr;
+	struct config_group group;
+};
+
+void fault_config_init(struct fault_config *config, const char *name);
+
+#else /* CONFIG_FAULT_INJECTION_CONFIGFS */
+
+struct fault_config {
+};
+
+static inline void fault_config_init(struct fault_config *config,
+			const char *name)
+{
+}
+
+#endif /* CONFIG_FAULT_INJECTION_CONFIGFS */
+
 #endif /* CONFIG_FAULT_INJECTION */
 
 struct kmem_cache;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c8b379e2e9ad..e700b29d7756 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1958,9 +1958,20 @@ config FAIL_SUNRPC
 	  Provide fault-injection capability for SunRPC and
 	  its consumers.
 
+config FAULT_INJECTION_CONFIGFS
+	bool "Configfs interface for fault-injection capabilities"
+	depends on FAULT_INJECTION && CONFIGFS_FS
+	help
+	  This option allows configfs-based drivers to dynamically configure
+	  fault-injection via configfs.  Each parameter for driver-specific
+	  fault-injection can be made visible as a configfs attribute in a
+	  configfs group.
+
+
 config FAULT_INJECTION_STACKTRACE_FILTER
 	bool "stacktrace filter for fault-injection capabilities"
-	depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
+	depends on FAULT_INJECTION
+	depends on (FAULT_INJECTION_DEBUG_FS || FAULT_INJECTION_CONFIGFS) && STACKTRACE_SUPPORT
 	select STACKTRACE
 	depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86
 	help
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 6cff320c4eb4..d608f9b48c10 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -244,3 +244,194 @@ struct dentry *fault_create_debugfs_attr(const char *name,
 EXPORT_SYMBOL_GPL(fault_create_debugfs_attr);
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#ifdef CONFIG_FAULT_INJECTION_CONFIGFS
+
+/* These configfs attribute utilities are copied from drivers/block/null_blk/main.c */
+
+static ssize_t fault_uint_attr_show(unsigned int val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t fault_ulong_attr_show(unsigned long val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%lu\n", val);
+}
+
+static ssize_t fault_bool_attr_show(bool val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t fault_atomic_t_attr_show(atomic_t val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%d\n", atomic_read(&val));
+}
+
+static ssize_t fault_uint_attr_store(unsigned int *val, const char *page, size_t count)
+{
+	unsigned int tmp;
+	int result;
+
+	result = kstrtouint(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+static ssize_t fault_ulong_attr_store(unsigned long *val, const char *page, size_t count)
+{
+	int result;
+	unsigned long tmp;
+
+	result = kstrtoul(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+static ssize_t fault_bool_attr_store(bool *val, const char *page, size_t count)
+{
+	bool tmp;
+	int result;
+
+	result = kstrtobool(page, &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+static ssize_t fault_atomic_t_attr_store(atomic_t *val, const char *page, size_t count)
+{
+	int tmp;
+	int result;
+
+	result = kstrtoint(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	atomic_set(val, tmp);
+	return count;
+}
+
+#define CONFIGFS_ATTR_NAMED(_pfx, _name, _attr_name)	\
+static struct configfs_attribute _pfx##attr_##_name = {	\
+	.ca_name	= _attr_name,			\
+	.ca_mode	= 0644,				\
+	.ca_owner	= THIS_MODULE,			\
+	.show		= _pfx##_name##_show,		\
+	.store		= _pfx##_name##_store,		\
+}
+
+static struct fault_config *to_fault_config(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct fault_config, group);
+}
+
+#define FAULT_CONFIGFS_ATTR_NAMED(NAME, ATTR_NAME, MEMBER, TYPE)				\
+static ssize_t fault_##NAME##_show(struct config_item *item, char *page)			\
+{												\
+	return fault_##TYPE##_attr_show(to_fault_config(item)->attr.MEMBER, page);		\
+}												\
+static ssize_t fault_##NAME##_store(struct config_item *item, const char *page, size_t count)	\
+{												\
+	struct fault_config *config = to_fault_config(item);					\
+	return fault_##TYPE##_attr_store(&config->attr.MEMBER, page, count);			\
+}												\
+CONFIGFS_ATTR_NAMED(fault_, NAME, ATTR_NAME)
+
+#define FAULT_CONFIGFS_ATTR(NAME, TYPE)	\
+	FAULT_CONFIGFS_ATTR_NAMED(NAME, __stringify(NAME), NAME, TYPE)
+
+FAULT_CONFIGFS_ATTR(probability, ulong);
+FAULT_CONFIGFS_ATTR(interval, ulong);
+FAULT_CONFIGFS_ATTR(times, atomic_t);
+FAULT_CONFIGFS_ATTR(space, atomic_t);
+FAULT_CONFIGFS_ATTR(verbose, ulong);
+FAULT_CONFIGFS_ATTR_NAMED(ratelimit_interval, "verbose_ratelimit_interval_ms",
+		ratelimit_state.interval, uint);
+FAULT_CONFIGFS_ATTR_NAMED(ratelimit_burst, "verbose_ratelimit_burst",
+		ratelimit_state.burst, uint);
+FAULT_CONFIGFS_ATTR_NAMED(task_filter, "task-filter", task_filter, bool);
+
+#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
+
+static ssize_t fault_stacktrace_depth_show(struct config_item *item, char *page)
+{
+	return fault_ulong_attr_show(to_fault_config(item)->attr.stacktrace_depth, page);
+}
+
+static ssize_t fault_stacktrace_depth_store(struct config_item *item, const char *page,
+		size_t count)
+{
+	int result;
+	unsigned long tmp;
+
+	result = kstrtoul(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	to_fault_config(item)->attr.stacktrace_depth =
+		min_t(unsigned long, tmp, MAX_STACK_TRACE_DEPTH);
+
+	return count;
+}
+
+CONFIGFS_ATTR_NAMED(fault_, stacktrace_depth, "stacktrace-depth");
+
+static ssize_t fault_xul_attr_show(unsigned long val, char *page)
+{
+	return snprintf(page, PAGE_SIZE,
+			sizeof(val) == sizeof(u32) ? "0x%08lx\n" : "0x%016lx\n", val);
+}
+
+static ssize_t fault_xul_attr_store(unsigned long *val, const char *page, size_t count)
+{
+	return fault_ulong_attr_store(val, page, count);
+}
+
+FAULT_CONFIGFS_ATTR_NAMED(require_start, "require-start", require_start, xul);
+FAULT_CONFIGFS_ATTR_NAMED(require_end, "require-end", require_end, xul);
+FAULT_CONFIGFS_ATTR_NAMED(reject_start, "reject-start", reject_start, xul);
+FAULT_CONFIGFS_ATTR_NAMED(reject_end, "reject-end", reject_end, xul);
+
+#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
+
+static struct configfs_attribute *fault_config_attrs[] = {
+	&fault_attr_probability,
+	&fault_attr_interval,
+	&fault_attr_times,
+	&fault_attr_space,
+	&fault_attr_verbose,
+	&fault_attr_ratelimit_interval,
+	&fault_attr_ratelimit_burst,
+	&fault_attr_task_filter,
+#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
+	&fault_attr_stacktrace_depth,
+	&fault_attr_require_start,
+	&fault_attr_require_end,
+	&fault_attr_reject_start,
+	&fault_attr_reject_end,
+#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
+	NULL,
+};
+
+static const struct config_item_type fault_config_type = {
+	.ct_attrs	= fault_config_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+void fault_config_init(struct fault_config *config, const char *name)
+{
+	config_group_init_type_name(&config->group, name, &fault_config_type);
+}
+EXPORT_SYMBOL_GPL(fault_config_init);
+
+#endif /* CONFIG_FAULT_INJECTION_CONFIGFS */
-- 
cgit v1.2.3


From 4fdeb847130229dc94befa241461669c7359776b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:07 +0200
Subject: wifi: ieee80211: clean up public action codes

WLAN_PUBLIC_ACTION_FTM_RESPONSE is duplicated with
WLAN_PUB_ACTION_FTM, but that might better be called
WLAN_PUB_ACTION_FTM_RESPONSE; clean up here.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 2463bdd2a382..0583b2b0ce1f 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -9,7 +9,7 @@
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
- * Copyright (c) 2018 - 2022 Intel Corporation
+ * Copyright (c) 2018 - 2023 Intel Corporation
  */
 
 #ifndef LINUX_IEEE80211_H
@@ -3557,11 +3557,6 @@ enum ieee80211_unprotected_wnm_actioncode {
 	WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
 };
 
-/* Public action codes */
-enum ieee80211_public_actioncode {
-	WLAN_PUBLIC_ACTION_FTM_RESPONSE = 33,
-};
-
 /* Security key length */
 enum ieee80211_key_len {
 	WLAN_KEY_LEN_WEP40 = 5,
@@ -3653,7 +3648,7 @@ enum ieee80211_pub_actioncode {
 	WLAN_PUB_ACTION_NETWORK_CHANNEL_CONTROL = 30,
 	WLAN_PUB_ACTION_WHITE_SPACE_MAP_ANN = 31,
 	WLAN_PUB_ACTION_FTM_REQUEST = 32,
-	WLAN_PUB_ACTION_FTM = 33,
+	WLAN_PUB_ACTION_FTM_RESPONSE = 33,
 	WLAN_PUB_ACTION_FILS_DISCOVERY = 34,
 };
 
@@ -4383,7 +4378,7 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb)
 		return false;
 
 	if (mgmt->u.action.u.ftm.action_code ==
-		WLAN_PUBLIC_ACTION_FTM_RESPONSE &&
+		WLAN_PUB_ACTION_FTM_RESPONSE &&
 	    skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm))
 		return true;
 
-- 
cgit v1.2.3


From 2c9abe653bc5134eeab411c46dde008d8a1c37b0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:08 +0200
Subject: wifi: ieee80211: correctly mark FTM frames non-bufferable

The checks of whether or not a frame is bufferable were not
taking into account that some action frames aren't, such as
FTM. Check this, which requires some changes to the function
ieee80211_is_bufferable_mmpdu() since we need the whole skb
for the checks now.

Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c |  7 ++--
 drivers/net/wireless/mediatek/mt76/tx.c     |  2 +-
 include/linux/ieee80211.h                   | 52 +++++++++++++++++++++--------
 net/mac80211/tx.c                           |  4 +--
 4 files changed, 45 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 51f21cbf7a20..478442e16d43 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -604,8 +604,9 @@ static void iwl_mvm_skb_prepare_status(struct sk_buff *skb,
 static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
 				      struct iwl_mvm_vif_link_info *link,
 				      struct ieee80211_tx_info *info,
-				      struct ieee80211_hdr *hdr)
+				      struct sk_buff *skb)
 {
+	struct ieee80211_hdr *hdr = (void *)skb->data;
 	__le16 fc = hdr->frame_control;
 
 	switch (info->control.vif->type) {
@@ -622,7 +623,7 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
 		 * reason 7 ("Class 3 frame received from nonassociated STA").
 		 */
 		if (ieee80211_is_mgmt(fc) &&
-		    (!ieee80211_is_bufferable_mmpdu(fc) ||
+		    (!ieee80211_is_bufferable_mmpdu(skb) ||
 		     ieee80211_is_deauth(fc) || ieee80211_is_disassoc(fc)))
 			return link->mgmt_queue;
 
@@ -755,7 +756,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
 				sta_id = link->mcast_sta.sta_id;
 
 			queue = iwl_mvm_get_ctrl_vif_queue(mvm, link, &info,
-							   hdr);
+							   skb);
 		} else if (info.control.vif->type == NL80211_IFTYPE_MONITOR) {
 			queue = mvm->snif_queue;
 			sta_id = mvm->snif_sta.sta_id;
diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 1f309d05380a..3ad9742364ba 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -330,7 +330,7 @@ mt76_tx(struct mt76_phy *phy, struct ieee80211_sta *sta,
 	if ((dev->drv->drv_flags & MT_DRV_HW_MGMT_TXQ) &&
 	    !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
 	    !ieee80211_is_data(hdr->frame_control) &&
-	    !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) {
+	    !ieee80211_is_bufferable_mmpdu(skb)) {
 		qid = MT_TXQ_PSD;
 	}
 
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0583b2b0ce1f..c4cf296e7eaf 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -782,20 +782,6 @@ static inline bool ieee80211_is_any_nullfunc(__le16 fc)
 	return (ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc));
 }
 
-/**
- * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU
- * @fc: frame control field in little-endian byteorder
- */
-static inline bool ieee80211_is_bufferable_mmpdu(__le16 fc)
-{
-	/* IEEE 802.11-2012, definition of "bufferable management frame";
-	 * note that this ignores the IBSS special case. */
-	return ieee80211_is_mgmt(fc) &&
-	       (ieee80211_is_action(fc) ||
-		ieee80211_is_disassoc(fc) ||
-		ieee80211_is_deauth(fc));
-}
-
 /**
  * ieee80211_is_first_frag - check if IEEE80211_SCTL_FRAG is not set
  * @seq_ctrl: frame sequence control bytes in little-endian byteorder
@@ -4132,6 +4118,44 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr)
 		return hdr->addr1;
 }
 
+/**
+ * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU
+ * @skb: the skb to check, starting with the 802.11 header
+ */
+static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+	__le16 fc = mgmt->frame_control;
+
+	/*
+	 * IEEE 802.11 REVme D2.0 definition of bufferable MMPDU;
+	 * note that this ignores the IBSS special case.
+	 */
+	if (!ieee80211_is_mgmt(fc))
+		return false;
+
+	if (ieee80211_is_disassoc(fc) || ieee80211_is_deauth(fc))
+		return true;
+
+	if (!ieee80211_is_action(fc))
+		return false;
+
+	if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code))
+		return true;
+
+	/* action frame - additionally check for non-bufferable FTM */
+
+	if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC &&
+	    mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION)
+		return true;
+
+	if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST ||
+	    mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE)
+		return false;
+
+	return true;
+}
+
 /**
  * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame
  * @hdr: the frame (buffer must include at least the first octet of payload)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index dfe6b9c9b29e..1a3327407552 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -488,7 +488,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 		int ac = skb_get_queue_mapping(tx->skb);
 
 		if (ieee80211_is_mgmt(hdr->frame_control) &&
-		    !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) {
+		    !ieee80211_is_bufferable_mmpdu(tx->skb)) {
 			info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
 			return TX_CONTINUE;
 		}
@@ -1323,7 +1323,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
 	if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
 	    unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
 		if ((!ieee80211_is_mgmt(hdr->frame_control) ||
-		     ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
+		     ieee80211_is_bufferable_mmpdu(skb) ||
 		     vif->type == NL80211_IFTYPE_STATION) &&
 		    sta && sta->uploaded) {
 			/*
-- 
cgit v1.2.3


From d00800a289c9349bb659a698cbd7bc04521dc927 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:17 +0200
Subject: wifi: mac80211: add flush_sta method

Some drivers like iwlwifi might have per-STA queues, so we
may want to flush/drop just those queues rather than all
when removing a station. Add a separate method for that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    |  6 ++++++
 net/mac80211/driver-ops.h | 15 +++++++++++++++
 net/mac80211/sta_info.c   |  8 ++++++--
 net/mac80211/trace.h      |  7 +++++++
 4 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 679421d37a42..a8dadbd83d95 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3950,6 +3950,10 @@ struct ieee80211_prep_tx_info {
  *	Note that vif can be NULL.
  *	The callback can sleep.
  *
+ * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for
+ *	the given station, as it's about to be removed.
+ *	The callback can sleep.
+ *
  * @channel_switch: Drivers that need (or want) to offload the channel
  *	switch operation for CSAs received from the AP may implement this
  *	callback. They must then call ieee80211_chswitch_done() to indicate
@@ -4415,6 +4419,8 @@ struct ieee80211_ops {
 #endif
 	void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		      u32 queues, bool drop);
+	void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			  struct ieee80211_sta *sta);
 	void (*channel_switch)(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_channel_switch *ch_switch);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 0bf208f5bbc5..45d3e53c7383 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -649,6 +649,21 @@ static inline void drv_flush(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
+static inline void drv_flush_sta(struct ieee80211_local *local,
+				 struct ieee80211_sub_if_data *sdata,
+				 struct sta_info *sta)
+{
+	might_sleep();
+
+	if (sdata && !check_sdata_in_driver(sdata))
+		return;
+
+	trace_drv_flush_sta(local, sdata, &sta->sta);
+	if (local->ops->flush_sta)
+		local->ops->flush_sta(&local->hw, &sdata->vif, &sta->sta);
+	trace_drv_return_void(local);
+}
+
 static inline void drv_channel_switch(struct ieee80211_local *local,
 				      struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_channel_switch *ch_switch)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index ce7c3b997269..1400512e0dde 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1299,8 +1299,12 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 	 * frames sitting on hardware queues might be sent out without
 	 * any encryption at all.
 	 */
-	if (local->ops->set_key)
-		ieee80211_flush_queues(local, sta->sdata, false);
+	if (local->ops->set_key) {
+		if (local->ops->flush_sta)
+			drv_flush_sta(local, sta->sdata, sta);
+		else
+			ieee80211_flush_queues(local, sta->sdata, false);
+	}
 
 	/* now keys can no longer be reached */
 	ieee80211_free_sta_keys(local, sta);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index e0ccf5fe708a..de5d69f21306 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1177,6 +1177,13 @@ TRACE_EVENT(drv_flush,
 	)
 );
 
+DEFINE_EVENT(sta_event, drv_flush_sta,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta),
+	TP_ARGS(local, sdata, sta)
+);
+
 TRACE_EVENT(drv_channel_switch,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From 0cd917a4a8ace70ff9082d797c899f6bf10de910 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 12 Apr 2023 21:48:40 +0200
Subject: xdp: rss hash types representation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The RSS hash type specifies what portion of packet data NIC hardware used
when calculating RSS hash value. The RSS types are focused on Internet
traffic protocols at OSI layers L3 and L4. L2 (e.g. ARP) often get hash
value zero and no RSS type. For L3 focused on IPv4 vs. IPv6, and L4
primarily TCP vs UDP, but some hardware supports SCTP.

Hardware RSS types are differently encoded for each hardware NIC. Most
hardware represent RSS hash type as a number. Determining L3 vs L4 often
requires a mapping table as there often isn't a pattern or sorting
according to ISO layer.

The patch introduce a XDP RSS hash type (enum xdp_rss_hash_type) that
contains both BITs for the L3/L4 types, and combinations to be used by
drivers for their mapping tables. The enum xdp_rss_type_bits get exposed
to BPF via BTF, and it is up to the BPF-programmer to match using these
defines.

This proposal change the kfunc API bpf_xdp_metadata_rx_hash() adding
a pointer value argument for provide the RSS hash type.
Change signature for all xmo_rx_hash calls in drivers to make it compile.

The RSS type implementations for each driver comes as separate patches.

Fixes: 3d76a4d3d4e5 ("bpf: XDP metadata RX kfuncs")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/168132892042.340624.582563003880565460.stgit@firesoul
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c       |  3 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h     |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c |  3 +-
 drivers/net/veth.c                               |  3 +-
 include/linux/netdevice.h                        |  3 +-
 include/net/xdp.h                                | 45 ++++++++++++++++++++++++
 net/core/xdp.c                                   | 10 +++++-
 7 files changed, 64 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 4b5e459b6d49..73d10aa4c503 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -681,7 +681,8 @@ int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
 	return 0;
 }
 
-int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
+			enum xdp_rss_hash_type *rss_type)
 {
 	struct mlx4_en_xdp_buff *_ctx = (void *)ctx;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 544e09b97483..4ac4d883047b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -798,7 +798,8 @@ int mlx4_en_netdev_event(struct notifier_block *this,
 
 struct xdp_md;
 int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp);
-int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash);
+int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
+			enum xdp_rss_hash_type *rss_type);
 
 /*
  * Functions for time stamping
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index c5dae48b7932..efe609f8e3aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -169,7 +169,8 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
 	return 0;
 }
 
-static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
+			     enum xdp_rss_hash_type *rss_type)
 {
 	const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index c1178915496d..424e8876a16b 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1648,7 +1648,8 @@ static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
 	return 0;
 }
 
-static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
+			    enum xdp_rss_hash_type *rss_type)
 {
 	struct veth_xdp_buff *_ctx = (void *)ctx;
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 470085b121d3..c35f04f636f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1624,7 +1624,8 @@ struct net_device_ops {
 
 struct xdp_metadata_ops {
 	int	(*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp);
-	int	(*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash);
+	int	(*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash,
+			       enum xdp_rss_hash_type *rss_type);
 };
 
 /**
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 41c57b8b1671..a76c4ea203ea 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -8,6 +8,7 @@
 
 #include <linux/skbuff.h> /* skb_shared_info */
 #include <uapi/linux/netdev.h>
+#include <linux/bitfield.h>
 
 /**
  * DOC: XDP RX-queue information
@@ -425,6 +426,50 @@ XDP_METADATA_KFUNC_xxx
 MAX_XDP_METADATA_KFUNC,
 };
 
+enum xdp_rss_hash_type {
+	/* First part: Individual bits for L3/L4 types */
+	XDP_RSS_L3_IPV4		= BIT(0),
+	XDP_RSS_L3_IPV6		= BIT(1),
+
+	/* The fixed (L3) IPv4 and IPv6 headers can both be followed by
+	 * variable/dynamic headers, IPv4 called Options and IPv6 called
+	 * Extension Headers. HW RSS type can contain this info.
+	 */
+	XDP_RSS_L3_DYNHDR	= BIT(2),
+
+	/* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in
+	 * addition to the protocol specific bit.  This ease interaction with
+	 * SKBs and avoids reserving a fixed mask for future L4 protocol bits.
+	 */
+	XDP_RSS_L4		= BIT(3), /* L4 based hash, proto can be unknown */
+	XDP_RSS_L4_TCP		= BIT(4),
+	XDP_RSS_L4_UDP		= BIT(5),
+	XDP_RSS_L4_SCTP		= BIT(6),
+	XDP_RSS_L4_IPSEC	= BIT(7), /* L4 based hash include IPSEC SPI */
+
+	/* Second part: RSS hash type combinations used for driver HW mapping */
+	XDP_RSS_TYPE_NONE            = 0,
+	XDP_RSS_TYPE_L2              = XDP_RSS_TYPE_NONE,
+
+	XDP_RSS_TYPE_L3_IPV4         = XDP_RSS_L3_IPV4,
+	XDP_RSS_TYPE_L3_IPV6         = XDP_RSS_L3_IPV6,
+	XDP_RSS_TYPE_L3_IPV4_OPT     = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR,
+	XDP_RSS_TYPE_L3_IPV6_EX      = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR,
+
+	XDP_RSS_TYPE_L4_ANY          = XDP_RSS_L4,
+	XDP_RSS_TYPE_L4_IPV4_TCP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
+	XDP_RSS_TYPE_L4_IPV4_UDP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
+	XDP_RSS_TYPE_L4_IPV4_SCTP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
+
+	XDP_RSS_TYPE_L4_IPV6_TCP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
+	XDP_RSS_TYPE_L4_IPV6_UDP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
+	XDP_RSS_TYPE_L4_IPV6_SCTP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
+
+	XDP_RSS_TYPE_L4_IPV6_TCP_EX  = XDP_RSS_TYPE_L4_IPV6_TCP  | XDP_RSS_L3_DYNHDR,
+	XDP_RSS_TYPE_L4_IPV6_UDP_EX  = XDP_RSS_TYPE_L4_IPV6_UDP  | XDP_RSS_L3_DYNHDR,
+	XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR,
+};
+
 #ifdef CONFIG_NET
 u32 bpf_xdp_metadata_kfunc_id(int id);
 bool bpf_dev_bound_kfunc_id(u32 btf_id);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 528d4b37983d..fb85aca81961 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -734,13 +734,21 @@ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *tim
  * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash.
  * @ctx: XDP context pointer.
  * @hash: Return value pointer.
+ * @rss_type: Return value pointer for RSS type.
+ *
+ * The RSS hash type (@rss_type) specifies what portion of packet headers NIC
+ * hardware used when calculating RSS hash value.  The RSS type can be decoded
+ * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits
+ * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types*
+ * ``XDP_RSS_TYPE_L*``.
  *
  * Return:
  * * Returns 0 on success or ``-errno`` on error.
  * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc
  * * ``-ENODATA``    : means no RX-hash available for this frame
  */
-__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
+					 enum xdp_rss_hash_type *rss_type)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From 67f245c2ec0af17d7a90c78910e28bc8b206297c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 12 Apr 2023 21:48:45 +0200
Subject: mlx5: bpf_xdp_metadata_rx_hash add xdp rss hash type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update API for bpf_xdp_metadata_rx_hash() with arg for xdp rss hash type
via mapping table.

The mlx5 hardware can also identify and RSS hash IPSEC.  This indicate
hash includes SPI (Security Parameters Index) as part of IPSEC hash.

Extend xdp core enum xdp_rss_hash_type with IPSEC hash type.

Fixes: bc8d405b1ba9 ("net/mlx5e: Support RX XDP metadata")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/168132892548.340624.11185734579430124869.stgit@firesoul
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 60 +++++++++++++++++++++++-
 include/linux/mlx5/device.h                      | 14 +++++-
 include/net/xdp.h                                |  2 +
 3 files changed, 73 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index efe609f8e3aa..d9d3b9e1f15a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -34,6 +34,7 @@
 #include <net/xdp_sock_drv.h>
 #include "en/xdp.h"
 #include "en/params.h"
+#include <linux/bitfield.h>
 
 int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk)
 {
@@ -169,15 +170,72 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
 	return 0;
 }
 
+/* Mapping HW RSS Type bits CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 into 4-bits*/
+#define RSS_TYPE_MAX_TABLE	16 /* 4-bits max 16 entries */
+#define RSS_L4		GENMASK(1, 0)
+#define RSS_L3		GENMASK(3, 2) /* Same as CQE_RSS_HTYPE_IP */
+
+/* Valid combinations of CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 sorted numerical */
+enum mlx5_rss_hash_type {
+	RSS_TYPE_NO_HASH	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IP_NONE) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)),
+	RSS_TYPE_L3_IPV4	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)),
+	RSS_TYPE_L4_IPV4_TCP	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)),
+	RSS_TYPE_L4_IPV4_UDP	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)),
+	RSS_TYPE_L4_IPV4_IPSEC	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)),
+	RSS_TYPE_L3_IPV6	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)),
+	RSS_TYPE_L4_IPV6_TCP	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)),
+	RSS_TYPE_L4_IPV6_UDP	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)),
+	RSS_TYPE_L4_IPV6_IPSEC	= (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) |
+				   FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)),
+};
+
+/* Invalid combinations will simply return zero, allows no boundary checks */
+static const enum xdp_rss_hash_type mlx5_xdp_rss_type[RSS_TYPE_MAX_TABLE] = {
+	[RSS_TYPE_NO_HASH]	 = XDP_RSS_TYPE_NONE,
+	[1]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[2]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[3]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[RSS_TYPE_L3_IPV4]	 = XDP_RSS_TYPE_L3_IPV4,
+	[RSS_TYPE_L4_IPV4_TCP]	 = XDP_RSS_TYPE_L4_IPV4_TCP,
+	[RSS_TYPE_L4_IPV4_UDP]	 = XDP_RSS_TYPE_L4_IPV4_UDP,
+	[RSS_TYPE_L4_IPV4_IPSEC] = XDP_RSS_TYPE_L4_IPV4_IPSEC,
+	[RSS_TYPE_L3_IPV6]	 = XDP_RSS_TYPE_L3_IPV6,
+	[RSS_TYPE_L4_IPV6_TCP]	 = XDP_RSS_TYPE_L4_IPV6_TCP,
+	[RSS_TYPE_L4_IPV6_UDP]   = XDP_RSS_TYPE_L4_IPV6_UDP,
+	[RSS_TYPE_L4_IPV6_IPSEC] = XDP_RSS_TYPE_L4_IPV6_IPSEC,
+	[12]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[13]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[14]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+	[15]			 = XDP_RSS_TYPE_NONE, /* Implicit zero */
+};
+
 static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
 			     enum xdp_rss_hash_type *rss_type)
 {
 	const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
+	const struct mlx5_cqe64 *cqe = _ctx->cqe;
+	u32 hash_type, l4_type, ip_type, lookup;
 
 	if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXHASH)))
 		return -ENODATA;
 
-	*hash = be32_to_cpu(_ctx->cqe->rss_hash_result);
+	*hash = be32_to_cpu(cqe->rss_hash_result);
+
+	hash_type = cqe->rss_hash_type;
+	BUILD_BUG_ON(CQE_RSS_HTYPE_IP != RSS_L3); /* same mask */
+	ip_type = hash_type & CQE_RSS_HTYPE_IP;
+	l4_type = FIELD_GET(CQE_RSS_HTYPE_L4, hash_type);
+	lookup = ip_type | l4_type;
+	*rss_type = mlx5_xdp_rss_type[lookup];
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 71b06ebad402..1db19a9d26e3 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -36,6 +36,7 @@
 #include <linux/types.h>
 #include <rdma/ib_verbs.h>
 #include <linux/mlx5/mlx5_ifc.h>
+#include <linux/bitfield.h>
 
 #if defined(__LITTLE_ENDIAN)
 #define MLX5_SET_HOST_ENDIANNESS	0
@@ -980,14 +981,23 @@ enum {
 };
 
 enum {
-	CQE_RSS_HTYPE_IP	= 0x3 << 2,
+	CQE_RSS_HTYPE_IP	= GENMASK(3, 2),
 	/* cqe->rss_hash_type[3:2] - IP destination selected for hash
 	 * (00 = none,  01 = IPv4, 10 = IPv6, 11 = Reserved)
 	 */
-	CQE_RSS_HTYPE_L4	= 0x3 << 6,
+	CQE_RSS_IP_NONE		= 0x0,
+	CQE_RSS_IPV4		= 0x1,
+	CQE_RSS_IPV6		= 0x2,
+	CQE_RSS_RESERVED	= 0x3,
+
+	CQE_RSS_HTYPE_L4	= GENMASK(7, 6),
 	/* cqe->rss_hash_type[7:6] - L4 destination selected for hash
 	 * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI
 	 */
+	CQE_RSS_L4_NONE		= 0x0,
+	CQE_RSS_L4_TCP		= 0x1,
+	CQE_RSS_L4_UDP		= 0x2,
+	CQE_RSS_L4_IPSEC	= 0x3,
 };
 
 enum {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a76c4ea203ea..76aa748e7923 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -460,10 +460,12 @@ enum xdp_rss_hash_type {
 	XDP_RSS_TYPE_L4_IPV4_TCP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
 	XDP_RSS_TYPE_L4_IPV4_UDP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
 	XDP_RSS_TYPE_L4_IPV4_SCTP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
+	XDP_RSS_TYPE_L4_IPV4_IPSEC   = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
 
 	XDP_RSS_TYPE_L4_IPV6_TCP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
 	XDP_RSS_TYPE_L4_IPV6_UDP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
 	XDP_RSS_TYPE_L4_IPV6_SCTP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
+	XDP_RSS_TYPE_L4_IPV6_IPSEC   = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
 
 	XDP_RSS_TYPE_L4_IPV6_TCP_EX  = XDP_RSS_TYPE_L4_IPV6_TCP  | XDP_RSS_L3_DYNHDR,
 	XDP_RSS_TYPE_L4_IPV6_UDP_EX  = XDP_RSS_TYPE_L4_IPV6_UDP  | XDP_RSS_L3_DYNHDR,
-- 
cgit v1.2.3


From 8cbc82f3ec0d58961cf9c1e5d99e56741f4bf134 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 20 Mar 2023 15:40:10 +0800
Subject: mm: memory-failure: Move memory failure sysctls to its own file

The sysctl_memory_failure_early_kill and memory_failure_recovery
are only used in memory-failure.c, move them to its own file.

Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
[mcgrof: fix by adding empty ctl entry, this caused a crash]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/mm.h  |  2 --
 kernel/sysctl.c     | 20 --------------------
 mm/memory-failure.c | 36 ++++++++++++++++++++++++++++++++++--
 3 files changed, 34 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..98da268b834a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3442,8 +3442,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
-extern int sysctl_memory_failure_early_kill;
-extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ce0297acf97c..0a8a3c9c82e4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2350,26 +2350,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 		.extra1		= SYSCTL_ZERO,
 	},
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-	{
-		.procname	= "memory_failure_early_kill",
-		.data		= &sysctl_memory_failure_early_kill,
-		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-	{
-		.procname	= "memory_failure_recovery",
-		.data		= &sysctl_memory_failure_recovery,
-		.maxlen		= sizeof(sysctl_memory_failure_recovery),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
 #endif
 	{
 		.procname	= "user_reserve_kbytes",
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fae9baf3be16..10e60b6b2447 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -62,13 +62,14 @@
 #include <linux/page-isolation.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
+#include <linux/sysctl.h>
 #include "swap.h"
 #include "internal.h"
 #include "ras/ras_event.h"
 
-int sysctl_memory_failure_early_kill __read_mostly = 0;
+static int sysctl_memory_failure_early_kill __read_mostly;
 
-int sysctl_memory_failure_recovery __read_mostly = 1;
+static int sysctl_memory_failure_recovery __read_mostly = 1;
 
 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
@@ -122,6 +123,37 @@ const struct attribute_group memory_failure_attr_group = {
 	.attrs = memory_failure_attr,
 };
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table memory_failure_table[] = {
+	{
+		.procname	= "memory_failure_early_kill",
+		.data		= &sysctl_memory_failure_early_kill,
+		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "memory_failure_recovery",
+		.data		= &sysctl_memory_failure_recovery,
+		.maxlen		= sizeof(sysctl_memory_failure_recovery),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static int __init memory_failure_sysctl_init(void)
+{
+	register_sysctl_init("vm", memory_failure_table);
+	return 0;
+}
+late_initcall(memory_failure_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
 /*
  * Return values:
  *   1:   the page is dissolved (if needed) and taken off from buddy,
-- 
cgit v1.2.3


From 48fe8ab8d5a39c7bc49cb41d0ad92c75f48a9550 Mon Sep 17 00:00:00 2001
From: Minghao Chi <chi.minghao@zte.com.cn>
Date: Tue, 28 Mar 2023 14:46:28 +0800
Subject: mm: compaction: move compaction sysctl to its own file

This moves all compaction sysctls to its own file.

Move sysctl to where the functionality truly belongs to improve
readability, reduce merge conflicts, and facilitate maintenance.

I use x86_defconfig and linux-next-20230327 branch
$ make defconfig;make all -jn
CONFIG_COMPACTION=y

add/remove: 1/0 grow/shrink: 1/1 up/down: 350/-256 (94)
Function                                     old     new   delta
vm_compaction                                  -     320    +320
kcompactd_init                               180     210     +30
vm_table                                    2112    1856    -256
Total: Before=21119987, After=21120081, chg +0.00%

Despite the addition of 94 bytes the patch still seems a worthwile
cleanup.

Link: https://lore.kernel.org/lkml/067f7347-ba10-5405-920c-0f5f985c84f4@suse.cz/
Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/compaction.h |  7 ----
 kernel/sysctl.c            | 59 --------------------------------
 mm/compaction.c            | 84 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 72 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 52a9ff65faee..a6e512cfb670 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -81,13 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
 }
 
 #ifdef CONFIG_COMPACTION
-extern unsigned int sysctl_compaction_proactiveness;
-extern int sysctl_compaction_handler(struct ctl_table *table, int write,
-			void *buffer, size_t *length, loff_t *ppos);
-extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
-		int write, void *buffer, size_t *length, loff_t *ppos);
-extern int sysctl_extfrag_threshold;
-extern int sysctl_compact_unevictable_allowed;
 
 extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
 extern int fragmentation_index(struct zone *zone, unsigned int order);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0a8a3c9c82e4..bfe53e835524 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -42,7 +42,6 @@
 #include <linux/highuid.h>
 #include <linux/writeback.h>
 #include <linux/ratelimit.h>
-#include <linux/compaction.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
 #include <linux/key.h>
@@ -746,27 +745,6 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer,
 	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
 }
 
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
-		int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret, old;
-
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
-		return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
-	old = *(int *)table->data;
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (ret)
-		return ret;
-	if (old != *(int *)table->data)
-		pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
-			     table->procname, current->comm,
-			     task_pid_nr(current));
-	return ret;
-}
-#endif
-
 /**
  * proc_douintvec - read a vector of unsigned integers
  * @table: the sysctl table
@@ -2157,43 +2135,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_FOUR,
 	},
-#ifdef CONFIG_COMPACTION
-	{
-		.procname	= "compact_memory",
-		.data		= NULL,
-		.maxlen		= sizeof(int),
-		.mode		= 0200,
-		.proc_handler	= sysctl_compaction_handler,
-	},
-	{
-		.procname	= "compaction_proactiveness",
-		.data		= &sysctl_compaction_proactiveness,
-		.maxlen		= sizeof(sysctl_compaction_proactiveness),
-		.mode		= 0644,
-		.proc_handler	= compaction_proactiveness_sysctl_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE_HUNDRED,
-	},
-	{
-		.procname	= "extfrag_threshold",
-		.data		= &sysctl_extfrag_threshold,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE_THOUSAND,
-	},
-	{
-		.procname	= "compact_unevictable_allowed",
-		.data		= &sysctl_compact_unevictable_allowed,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax_warn_RT_change,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-
-#endif /* CONFIG_COMPACTION */
 	{
 		.procname	= "min_free_kbytes",
 		.data		= &min_free_kbytes,
diff --git a/mm/compaction.c b/mm/compaction.c
index 5a9501e0ae01..f2ddfb1140e2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1716,7 +1716,14 @@ typedef enum {
  * Allow userspace to control policy on scanning the unevictable LRU for
  * compactable pages.
  */
-int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
+static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+static int sysctl_extfrag_threshold = 500;
 
 static inline void
 update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -2572,8 +2579,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 	return ret;
 }
 
-int sysctl_extfrag_threshold = 500;
-
 /**
  * try_to_compact_pages - Direct compact to satisfy a high-order allocation
  * @gfp_mask: The GFP mask of the current allocation
@@ -2730,14 +2735,7 @@ static void compact_nodes(void)
 		compact_node(nid);
 }
 
-/*
- * Tunable for proactive compaction. It determines how
- * aggressively the kernel should compact memory in the
- * background. It takes values in the range [0, 100].
- */
-unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
-
-int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos)
 {
 	int rc, nid;
@@ -2767,7 +2765,7 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
  */
-int sysctl_compaction_handler(struct ctl_table *table, int write,
+static int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos)
 {
 	if (write)
@@ -3063,6 +3061,65 @@ static int kcompactd_cpu_online(unsigned int cpu)
 	return 0;
 }
 
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+		int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, old;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+		return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	old = *(int *)table->data;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+	if (old != *(int *)table->data)
+		pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+			     table->procname, current->comm,
+			     task_pid_nr(current));
+	return ret;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_compaction[] = {
+	{
+		.procname	= "compact_memory",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0200,
+		.proc_handler	= sysctl_compaction_handler,
+	},
+	{
+		.procname	= "compaction_proactiveness",
+		.data		= &sysctl_compaction_proactiveness,
+		.maxlen		= sizeof(sysctl_compaction_proactiveness),
+		.mode		= 0644,
+		.proc_handler	= compaction_proactiveness_sysctl_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname	= "extfrag_threshold",
+		.data		= &sysctl_extfrag_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_THOUSAND,
+	},
+	{
+		.procname	= "compact_unevictable_allowed",
+		.data		= &sysctl_compact_unevictable_allowed,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax_warn_RT_change,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+#endif
+
 static int __init kcompactd_init(void)
 {
 	int nid;
@@ -3078,6 +3135,9 @@ static int __init kcompactd_init(void)
 
 	for_each_node_state(nid, N_MEMORY)
 		kcompactd_run(nid);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("vm", vm_compaction);
+#endif
 	return 0;
 }
 subsys_initcall(kcompactd_init)
-- 
cgit v1.2.3


From d1aae06630230daf747ef5bc291c19ea7f046129 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Thu, 13 Apr 2023 13:55:28 -0700
Subject: clk: starfive: Avoid casting iomem pointers

Let's use a wrapper struct for the auxiliary_device made in
jh7110_reset_controller_register() so that we can stop casting iomem
pointers. The casts trip up tools like sparse, and make for some awkward
casts that are largely unnecessary. While we're here, change the
allocation from devm and actually free the auxiliary_device memory in
the release function. This avoids any use after free problems where the
parent device driver is unbound from the device but the
auxiliuary_device is still in use accessing devm freed memory.

Cc: Tommaso Merciai <tomm.merciai@gmail.com>
Cc: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Cc: Hal Feng <hal.feng@starfivetech.com>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Xingyu Wu <xingyu.wu@starfivetech.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Fixes: edab7204afe5 ("clk: starfive: Add StarFive JH7110 system clock driver")
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20230413205528.4044216-1-sboyd@kernel.org
---
 drivers/clk/starfive/clk-starfive-jh7110-sys.c | 15 ++++++++++++---
 drivers/reset/starfive/reset-starfive-jh7110.c |  9 ++++++---
 include/soc/starfive/reset-starfive-jh71x0.h   | 17 +++++++++++++++++
 3 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 include/soc/starfive/reset-starfive-jh71x0.h

(limited to 'include')

diff --git a/drivers/clk/starfive/clk-starfive-jh7110-sys.c b/drivers/clk/starfive/clk-starfive-jh7110-sys.c
index 5ec210644e1d..851b93d0f371 100644
--- a/drivers/clk/starfive/clk-starfive-jh7110-sys.c
+++ b/drivers/clk/starfive/clk-starfive-jh7110-sys.c
@@ -11,6 +11,9 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <soc/starfive/reset-starfive-jh71x0.h>
 
 #include <dt-bindings/clock/starfive,jh7110-crg.h>
 
@@ -335,26 +338,32 @@ static void jh7110_reset_unregister_adev(void *_adev)
 	struct auxiliary_device *adev = _adev;
 
 	auxiliary_device_delete(adev);
+	auxiliary_device_uninit(adev);
 }
 
 static void jh7110_reset_adev_release(struct device *dev)
 {
 	struct auxiliary_device *adev = to_auxiliary_dev(dev);
+	struct jh71x0_reset_adev *rdev = to_jh71x0_reset_adev(adev);
 
-	auxiliary_device_uninit(adev);
+	kfree(rdev);
 }
 
 int jh7110_reset_controller_register(struct jh71x0_clk_priv *priv,
 				     const char *adev_name,
 				     u32 adev_id)
 {
+	struct jh71x0_reset_adev *rdev;
 	struct auxiliary_device *adev;
 	int ret;
 
-	adev = devm_kzalloc(priv->dev, sizeof(*adev), GFP_KERNEL);
-	if (!adev)
+	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
+	if (!rdev)
 		return -ENOMEM;
 
+	rdev->base = priv->base;
+
+	adev = &rdev->adev;
 	adev->name = adev_name;
 	adev->dev.parent = priv->dev;
 	adev->dev.release = jh7110_reset_adev_release;
diff --git a/drivers/reset/starfive/reset-starfive-jh7110.c b/drivers/reset/starfive/reset-starfive-jh7110.c
index c1b3a490d951..2d26ae95c8cc 100644
--- a/drivers/reset/starfive/reset-starfive-jh7110.c
+++ b/drivers/reset/starfive/reset-starfive-jh7110.c
@@ -7,6 +7,8 @@
 
 #include <linux/auxiliary_bus.h>
 
+#include <soc/starfive/reset-starfive-jh71x0.h>
+
 #include "reset-starfive-jh71x0.h"
 
 #include <dt-bindings/reset/starfive,jh7110-crg.h>
@@ -33,14 +35,15 @@ static int jh7110_reset_probe(struct auxiliary_device *adev,
 			      const struct auxiliary_device_id *id)
 {
 	struct jh7110_reset_info *info = (struct jh7110_reset_info *)(id->driver_data);
-	void __iomem **base = (void __iomem **)dev_get_drvdata(adev->dev.parent);
+	struct jh71x0_reset_adev *rdev = to_jh71x0_reset_adev(adev);
+	void __iomem *base = rdev->base;
 
 	if (!info || !base)
 		return -ENODEV;
 
 	return reset_starfive_jh71x0_register(&adev->dev, adev->dev.parent->of_node,
-					      *base + info->assert_offset,
-					      *base + info->status_offset,
+					      base + info->assert_offset,
+					      base + info->status_offset,
 					      NULL,
 					      info->nr_resets,
 					      NULL);
diff --git a/include/soc/starfive/reset-starfive-jh71x0.h b/include/soc/starfive/reset-starfive-jh71x0.h
new file mode 100644
index 000000000000..47b486ececc5
--- /dev/null
+++ b/include/soc/starfive/reset-starfive-jh71x0.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SOC_STARFIVE_RESET_JH71X0_H
+#define __SOC_STARFIVE_RESET_JH71X0_H
+
+#include <linux/auxiliary_bus.h>
+#include <linux/compiler_types.h>
+#include <linux/container_of.h>
+
+struct jh71x0_reset_adev {
+	void __iomem *base;
+	struct auxiliary_device adev;
+};
+
+#define to_jh71x0_reset_adev(_adev) \
+	container_of((_adev), struct jh71x0_reset_adev, adev)
+
+#endif
-- 
cgit v1.2.3


From 82174a0a9c5cf36a3e421fc1d2c86998d8023f61 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:52:00 -0500
Subject: of: Move of_device_get_match_data() declaration

of_device.h mostly defines functions for bus drivers whereas
of_device_get_match_data() is used by drivers. Let's move it to of.h.

Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-3-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h        | 6 ++++++
 include/linux/of_device.h | 7 -------
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/of.h b/include/linux/of.h
index 36cf94596eba..6dc47d4c4612 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -371,6 +371,7 @@ extern int of_n_addr_cells(struct device_node *np);
 extern int of_n_size_cells(struct device_node *np);
 extern const struct of_device_id *of_match_node(
 	const struct of_device_id *matches, const struct device_node *node);
+extern const void *of_device_get_match_data(const struct device *dev);
 extern int of_modalias_node(struct device_node *node, char *modalias, int len);
 extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args);
 extern int __of_parse_phandle_with_args(const struct device_node *np,
@@ -852,6 +853,11 @@ static inline phys_addr_t of_dma_get_max_cpu_address(struct device_node *np)
 	return PHYS_ADDR_MAX;
 }
 
+static inline const void *of_device_get_match_data(const struct device *dev)
+{
+	return NULL;
+}
+
 #define of_match_ptr(_ptr)	NULL
 #define of_match_node(_matches, _node)	NULL
 #endif /* CONFIG_OF */
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index e4aa61cb2bd0..a54df77370fa 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -26,8 +26,6 @@ static inline int of_driver_match_device(struct device *dev,
 	return of_match_device(drv->of_match_table, dev) != NULL;
 }
 
-extern const void *of_device_get_match_data(const struct device *dev);
-
 extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len);
 extern int of_device_request_module(struct device *dev);
 
@@ -63,11 +61,6 @@ static inline int of_driver_match_device(struct device *dev,
 static inline void of_device_uevent(const struct device *dev,
 			struct kobj_uevent_env *env) { }
 
-static inline const void *of_device_get_match_data(const struct device *dev)
-{
-	return NULL;
-}
-
 static inline int of_device_modalias(struct device *dev,
 				     char *str, ssize_t len)
 {
-- 
cgit v1.2.3


From b58fa269c59d0f401ca569c661c6d7f3e1a20eeb Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:52:01 -0500
Subject: of: Move CPU node related functions to their own file

drivers/of/base.c is quite long and we've accumulated a number of CPU
node functions. Let's move them to a new file, cpu.c, along with the
lone of_cpu_device_node_get() in of_device.h. Moving the declaration has
no effect yet as of.h is included by of_device.h. This serves as
preparation to disentangle the includes in of_device.h and
of_platform.h.

Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-4-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/Makefile       |   2 +-
 drivers/of/base.c         | 187 -----------------------------------------
 drivers/of/cpu.c          | 210 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h        |  19 +++--
 include/linux/of_device.h |  14 ----
 5 files changed, 223 insertions(+), 209 deletions(-)
 create mode 100644 drivers/of/cpu.c

(limited to 'include')

diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index e0360a44306e..10f704592561 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y = base.o device.o platform.o property.o
+obj-y = base.o cpu.o device.o platform.o property.o
 obj-$(CONFIG_OF_KOBJ) += kobj.o
 obj-$(CONFIG_OF_DYNAMIC) += dynamic.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ac6fde53342f..7f1720af813c 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -286,193 +286,6 @@ const void *of_get_property(const struct device_node *np, const char *name,
 }
 EXPORT_SYMBOL(of_get_property);
 
-/**
- * of_get_cpu_hwid - Get the hardware ID from a CPU device node
- *
- * @cpun: CPU number(logical index) for which device node is required
- * @thread: The local thread number to get the hardware ID for.
- *
- * Return: The hardware ID for the CPU node or ~0ULL if not found.
- */
-u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread)
-{
-	const __be32 *cell;
-	int ac, len;
-
-	ac = of_n_addr_cells(cpun);
-	cell = of_get_property(cpun, "reg", &len);
-	if (!cell || !ac || ((sizeof(*cell) * ac * (thread + 1)) > len))
-		return ~0ULL;
-
-	cell += ac * thread;
-	return of_read_number(cell, ac);
-}
-
-/*
- * arch_match_cpu_phys_id - Match the given logical CPU and physical id
- *
- * @cpu: logical cpu index of a core/thread
- * @phys_id: physical identifier of a core/thread
- *
- * CPU logical to physical index mapping is architecture specific.
- * However this __weak function provides a default match of physical
- * id to logical cpu index. phys_id provided here is usually values read
- * from the device tree which must match the hardware internal registers.
- *
- * Returns true if the physical identifier and the logical cpu index
- * correspond to the same core/thread, false otherwise.
- */
-bool __weak arch_match_cpu_phys_id(int cpu, u64 phys_id)
-{
-	return (u32)phys_id == cpu;
-}
-
-/*
- * Checks if the given "prop_name" property holds the physical id of the
- * core/thread corresponding to the logical cpu 'cpu'. If 'thread' is not
- * NULL, local thread number within the core is returned in it.
- */
-static bool __of_find_n_match_cpu_property(struct device_node *cpun,
-			const char *prop_name, int cpu, unsigned int *thread)
-{
-	const __be32 *cell;
-	int ac, prop_len, tid;
-	u64 hwid;
-
-	ac = of_n_addr_cells(cpun);
-	cell = of_get_property(cpun, prop_name, &prop_len);
-	if (!cell && !ac && arch_match_cpu_phys_id(cpu, 0))
-		return true;
-	if (!cell || !ac)
-		return false;
-	prop_len /= sizeof(*cell) * ac;
-	for (tid = 0; tid < prop_len; tid++) {
-		hwid = of_read_number(cell, ac);
-		if (arch_match_cpu_phys_id(cpu, hwid)) {
-			if (thread)
-				*thread = tid;
-			return true;
-		}
-		cell += ac;
-	}
-	return false;
-}
-
-/*
- * arch_find_n_match_cpu_physical_id - See if the given device node is
- * for the cpu corresponding to logical cpu 'cpu'.  Return true if so,
- * else false.  If 'thread' is non-NULL, the local thread number within the
- * core is returned in it.
- */
-bool __weak arch_find_n_match_cpu_physical_id(struct device_node *cpun,
-					      int cpu, unsigned int *thread)
-{
-	/* Check for non-standard "ibm,ppc-interrupt-server#s" property
-	 * for thread ids on PowerPC. If it doesn't exist fallback to
-	 * standard "reg" property.
-	 */
-	if (IS_ENABLED(CONFIG_PPC) &&
-	    __of_find_n_match_cpu_property(cpun,
-					   "ibm,ppc-interrupt-server#s",
-					   cpu, thread))
-		return true;
-
-	return __of_find_n_match_cpu_property(cpun, "reg", cpu, thread);
-}
-
-/**
- * of_get_cpu_node - Get device node associated with the given logical CPU
- *
- * @cpu: CPU number(logical index) for which device node is required
- * @thread: if not NULL, local thread number within the physical core is
- *          returned
- *
- * The main purpose of this function is to retrieve the device node for the
- * given logical CPU index. It should be used to initialize the of_node in
- * cpu device. Once of_node in cpu device is populated, all the further
- * references can use that instead.
- *
- * CPU logical to physical index mapping is architecture specific and is built
- * before booting secondary cores. This function uses arch_match_cpu_phys_id
- * which can be overridden by architecture specific implementation.
- *
- * Return: A node pointer for the logical cpu with refcount incremented, use
- * of_node_put() on it when done. Returns NULL if not found.
- */
-struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
-{
-	struct device_node *cpun;
-
-	for_each_of_cpu_node(cpun) {
-		if (arch_find_n_match_cpu_physical_id(cpun, cpu, thread))
-			return cpun;
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_cpu_node);
-
-/**
- * of_cpu_node_to_id: Get the logical CPU number for a given device_node
- *
- * @cpu_node: Pointer to the device_node for CPU.
- *
- * Return: The logical CPU number of the given CPU device_node or -ENODEV if the
- * CPU is not found.
- */
-int of_cpu_node_to_id(struct device_node *cpu_node)
-{
-	int cpu;
-	bool found = false;
-	struct device_node *np;
-
-	for_each_possible_cpu(cpu) {
-		np = of_cpu_device_node_get(cpu);
-		found = (cpu_node == np);
-		of_node_put(np);
-		if (found)
-			return cpu;
-	}
-
-	return -ENODEV;
-}
-EXPORT_SYMBOL(of_cpu_node_to_id);
-
-/**
- * of_get_cpu_state_node - Get CPU's idle state node at the given index
- *
- * @cpu_node: The device node for the CPU
- * @index: The index in the list of the idle states
- *
- * Two generic methods can be used to describe a CPU's idle states, either via
- * a flattened description through the "cpu-idle-states" binding or via the
- * hierarchical layout, using the "power-domains" and the "domain-idle-states"
- * bindings. This function check for both and returns the idle state node for
- * the requested index.
- *
- * Return: An idle state node if found at @index. The refcount is incremented
- * for it, so call of_node_put() on it when done. Returns NULL if not found.
- */
-struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
-					  int index)
-{
-	struct of_phandle_args args;
-	int err;
-
-	err = of_parse_phandle_with_args(cpu_node, "power-domains",
-					"#power-domain-cells", 0, &args);
-	if (!err) {
-		struct device_node *state_node =
-			of_parse_phandle(args.np, "domain-idle-states", index);
-
-		of_node_put(args.np);
-		if (state_node)
-			return state_node;
-	}
-
-	return of_parse_phandle(cpu_node, "cpu-idle-states", index);
-}
-EXPORT_SYMBOL(of_get_cpu_state_node);
-
 /**
  * __of_device_is_compatible() - Check if the node matches given constraints
  * @device: pointer to node
diff --git a/drivers/of/cpu.c b/drivers/of/cpu.c
new file mode 100644
index 000000000000..d17b2f851082
--- /dev/null
+++ b/drivers/of/cpu.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+
+/**
+ * of_get_cpu_hwid - Get the hardware ID from a CPU device node
+ *
+ * @cpun: CPU number(logical index) for which device node is required
+ * @thread: The local thread number to get the hardware ID for.
+ *
+ * Return: The hardware ID for the CPU node or ~0ULL if not found.
+ */
+u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread)
+{
+	const __be32 *cell;
+	int ac, len;
+
+	ac = of_n_addr_cells(cpun);
+	cell = of_get_property(cpun, "reg", &len);
+	if (!cell || !ac || ((sizeof(*cell) * ac * (thread + 1)) > len))
+		return ~0ULL;
+
+	cell += ac * thread;
+	return of_read_number(cell, ac);
+}
+
+/*
+ * arch_match_cpu_phys_id - Match the given logical CPU and physical id
+ *
+ * @cpu: logical cpu index of a core/thread
+ * @phys_id: physical identifier of a core/thread
+ *
+ * CPU logical to physical index mapping is architecture specific.
+ * However this __weak function provides a default match of physical
+ * id to logical cpu index. phys_id provided here is usually values read
+ * from the device tree which must match the hardware internal registers.
+ *
+ * Returns true if the physical identifier and the logical cpu index
+ * correspond to the same core/thread, false otherwise.
+ */
+bool __weak arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+	return (u32)phys_id == cpu;
+}
+
+/*
+ * Checks if the given "prop_name" property holds the physical id of the
+ * core/thread corresponding to the logical cpu 'cpu'. If 'thread' is not
+ * NULL, local thread number within the core is returned in it.
+ */
+static bool __of_find_n_match_cpu_property(struct device_node *cpun,
+			const char *prop_name, int cpu, unsigned int *thread)
+{
+	const __be32 *cell;
+	int ac, prop_len, tid;
+	u64 hwid;
+
+	ac = of_n_addr_cells(cpun);
+	cell = of_get_property(cpun, prop_name, &prop_len);
+	if (!cell && !ac && arch_match_cpu_phys_id(cpu, 0))
+		return true;
+	if (!cell || !ac)
+		return false;
+	prop_len /= sizeof(*cell) * ac;
+	for (tid = 0; tid < prop_len; tid++) {
+		hwid = of_read_number(cell, ac);
+		if (arch_match_cpu_phys_id(cpu, hwid)) {
+			if (thread)
+				*thread = tid;
+			return true;
+		}
+		cell += ac;
+	}
+	return false;
+}
+
+/*
+ * arch_find_n_match_cpu_physical_id - See if the given device node is
+ * for the cpu corresponding to logical cpu 'cpu'.  Return true if so,
+ * else false.  If 'thread' is non-NULL, the local thread number within the
+ * core is returned in it.
+ */
+bool __weak arch_find_n_match_cpu_physical_id(struct device_node *cpun,
+					      int cpu, unsigned int *thread)
+{
+	/* Check for non-standard "ibm,ppc-interrupt-server#s" property
+	 * for thread ids on PowerPC. If it doesn't exist fallback to
+	 * standard "reg" property.
+	 */
+	if (IS_ENABLED(CONFIG_PPC) &&
+	    __of_find_n_match_cpu_property(cpun,
+					   "ibm,ppc-interrupt-server#s",
+					   cpu, thread))
+		return true;
+
+	return __of_find_n_match_cpu_property(cpun, "reg", cpu, thread);
+}
+
+/**
+ * of_get_cpu_node - Get device node associated with the given logical CPU
+ *
+ * @cpu: CPU number(logical index) for which device node is required
+ * @thread: if not NULL, local thread number within the physical core is
+ *          returned
+ *
+ * The main purpose of this function is to retrieve the device node for the
+ * given logical CPU index. It should be used to initialize the of_node in
+ * cpu device. Once of_node in cpu device is populated, all the further
+ * references can use that instead.
+ *
+ * CPU logical to physical index mapping is architecture specific and is built
+ * before booting secondary cores. This function uses arch_match_cpu_phys_id
+ * which can be overridden by architecture specific implementation.
+ *
+ * Return: A node pointer for the logical cpu with refcount incremented, use
+ * of_node_put() on it when done. Returns NULL if not found.
+ */
+struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
+{
+	struct device_node *cpun;
+
+	for_each_of_cpu_node(cpun) {
+		if (arch_find_n_match_cpu_physical_id(cpun, cpu, thread))
+			return cpun;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(of_get_cpu_node);
+
+/**
+ * of_cpu_device_node_get: Get the CPU device_node for a given logical CPU number
+ *
+ * @cpu: The logical CPU number
+ *
+ * Return: Pointer to the device_node for CPU with its reference count
+ * incremented of the given logical CPU number or NULL if the CPU device_node
+ * is not found.
+ */
+struct device_node *of_cpu_device_node_get(int cpu)
+{
+	struct device *cpu_dev;
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return of_get_cpu_node(cpu, NULL);
+	return of_node_get(cpu_dev->of_node);
+}
+EXPORT_SYMBOL(of_cpu_device_node_get);
+
+/**
+ * of_cpu_node_to_id: Get the logical CPU number for a given device_node
+ *
+ * @cpu_node: Pointer to the device_node for CPU.
+ *
+ * Return: The logical CPU number of the given CPU device_node or -ENODEV if the
+ * CPU is not found.
+ */
+int of_cpu_node_to_id(struct device_node *cpu_node)
+{
+	int cpu;
+	bool found = false;
+	struct device_node *np;
+
+	for_each_possible_cpu(cpu) {
+		np = of_cpu_device_node_get(cpu);
+		found = (cpu_node == np);
+		of_node_put(np);
+		if (found)
+			return cpu;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(of_cpu_node_to_id);
+
+/**
+ * of_get_cpu_state_node - Get CPU's idle state node at the given index
+ *
+ * @cpu_node: The device node for the CPU
+ * @index: The index in the list of the idle states
+ *
+ * Two generic methods can be used to describe a CPU's idle states, either via
+ * a flattened description through the "cpu-idle-states" binding or via the
+ * hierarchical layout, using the "power-domains" and the "domain-idle-states"
+ * bindings. This function check for both and returns the idle state node for
+ * the requested index.
+ *
+ * Return: An idle state node if found at @index. The refcount is incremented
+ * for it, so call of_node_put() on it when done. Returns NULL if not found.
+ */
+struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
+					  int index)
+{
+	struct of_phandle_args args;
+	int err;
+
+	err = of_parse_phandle_with_args(cpu_node, "power-domains",
+					"#power-domain-cells", 0, &args);
+	if (!err) {
+		struct device_node *state_node =
+			of_parse_phandle(args.np, "domain-idle-states", index);
+
+		of_node_put(args.np);
+		if (state_node)
+			return state_node;
+	}
+
+	return of_parse_phandle(cpu_node, "cpu-idle-states", index);
+}
+EXPORT_SYMBOL(of_get_cpu_state_node);
diff --git a/include/linux/of.h b/include/linux/of.h
index 6dc47d4c4612..ad3614537054 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -359,6 +359,8 @@ extern const void *of_get_property(const struct device_node *node,
 				const char *name,
 				int *lenp);
 extern struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
+extern struct device_node *of_cpu_device_node_get(int cpu);
+extern int of_cpu_node_to_id(struct device_node *np);
 extern struct device_node *of_get_next_cpu_node(struct device_node *prev);
 extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
 						 int index);
@@ -438,8 +440,6 @@ const char *of_prop_next_string(struct property *prop, const char *cur);
 
 bool of_console_check(struct device_node *dn, char *name, int index);
 
-extern int of_cpu_node_to_id(struct device_node *np);
-
 int of_map_id(struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
 	       struct device_node **target, u32 *id_out);
@@ -634,6 +634,16 @@ static inline struct device_node *of_get_cpu_node(int cpu,
 	return NULL;
 }
 
+static inline struct device_node *of_cpu_device_node_get(int cpu)
+{
+	return NULL;
+}
+
+static inline int of_cpu_node_to_id(struct device_node *np)
+{
+	return -ENODEV;
+}
+
 static inline struct device_node *of_get_next_cpu_node(struct device_node *prev)
 {
 	return NULL;
@@ -836,11 +846,6 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 {
 }
 
-static inline int of_cpu_node_to_id(struct device_node *np)
-{
-	return -ENODEV;
-}
-
 static inline int of_map_id(struct device_node *np, u32 id,
 			     const char *map_name, const char *map_mask_name,
 			     struct device_node **target, u32 *id_out)
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index a54df77370fa..fbe342fb729f 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -32,15 +32,6 @@ extern int of_device_request_module(struct device *dev);
 extern void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env);
 extern int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env);
 
-static inline struct device_node *of_cpu_device_node_get(int cpu)
-{
-	struct device *cpu_dev;
-	cpu_dev = get_cpu_device(cpu);
-	if (!cpu_dev)
-		return of_get_cpu_node(cpu, NULL);
-	return of_node_get(cpu_dev->of_node);
-}
-
 int of_dma_configure_id(struct device *dev,
 		     struct device_node *np,
 		     bool force_dma, const u32 *id);
@@ -84,11 +75,6 @@ static inline const struct of_device_id *of_match_device(
 	return NULL;
 }
 
-static inline struct device_node *of_cpu_device_node_get(int cpu)
-{
-	return NULL;
-}
-
 static inline int of_dma_configure_id(struct device *dev,
 				      struct device_node *np,
 				      bool force_dma,
-- 
cgit v1.2.3


From 2e8fff668dc14e758ae80a898832eb9b6d60c463 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:52:02 -0500
Subject: of: Drop unnecessary includes in headers

Drop unnecessary includes in DT headers. Some simply aren't needed and
some can be replaced with forward declarations.

Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-5-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h          | 5 ++---
 include/linux/of_device.h   | 3 ++-
 include/linux/of_platform.h | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/of.h b/include/linux/of.h
index ad3614537054..bc2eb39dcf75 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -16,13 +16,10 @@
 #include <linux/errno.h>
 #include <linux/kobject.h>
 #include <linux/mod_devicetable.h>
-#include <linux/topology.h>
-#include <linux/notifier.h>
 #include <linux/property.h>
 #include <linux/list.h>
 
 #include <asm/byteorder.h>
-#include <asm/errno.h>
 
 typedef u32 phandle;
 typedef u32 ihandle;
@@ -1521,6 +1518,8 @@ enum of_reconfig_change {
 	OF_RECONFIG_CHANGE_REMOVE,
 };
 
+struct notifier_block;
+
 #ifdef CONFIG_OF_DYNAMIC
 extern int of_reconfig_notifier_register(struct notifier_block *);
 extern int of_reconfig_notifier_unregister(struct notifier_block *);
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index fbe342fb729f..bafe50150d24 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -7,9 +7,10 @@
 #include <linux/of_platform.h> /* temporary until merge */
 
 #include <linux/of.h>
-#include <linux/mod_devicetable.h>
 
 struct device;
+struct of_device_id;
+struct kobj_uevent_env;
 
 #ifdef CONFIG_OF
 extern const struct of_device_id *of_match_device(
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 8ac5cb933dc3..d8045bcfc35e 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -6,12 +6,13 @@
  *			 <benh@kernel.crashing.org>
  */
 
-#include <linux/device.h>
 #include <linux/mod_devicetable.h>
-#include <linux/pm.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 
+struct device;
+struct of_device_id;
+
 /**
  * struct of_dev_auxdata - lookup table entry for device names & platform_data
  * @compatible: compatible value of node to match against node
-- 
cgit v1.2.3


From 21bb32b155dfe7b40e5e6545f2df53f81f2e69cc Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:52:11 -0500
Subject: cpufreq: Adjust includes to remove of_device.h

Now that of_cpu_device_node_get() is defined in of.h, of_device.h is just
implicitly including other includes, and is no longer needed. Adjust the
include files with what was implicitly included by of_device.h (cpu.h and
of.h) and drop including of_device.h.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-14-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 -
 drivers/cpufreq/kirkwood-cpufreq.c   | 2 +-
 drivers/cpufreq/maple-cpufreq.c      | 2 +-
 drivers/cpufreq/pmac32-cpufreq.c     | 2 +-
 drivers/cpufreq/pmac64-cpufreq.c     | 2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c    | 4 ++--
 drivers/cpufreq/spear-cpufreq.c      | 2 +-
 drivers/cpufreq/tegra124-cpufreq.c   | 1 -
 drivers/cpufreq/tegra20-cpufreq.c    | 2 +-
 include/linux/cpufreq.h              | 1 -
 10 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index e85703651098..f9675e1a8529 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -6,7 +6,6 @@
 
 #include <linux/err.h>
 #include <linux/of.h>
-#include <linux/of_device.h>
 #include <linux/platform_device.h>
 
 #include "cpufreq-dt.h"
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index 70ad8fe1d78b..95588101efbd 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -9,7 +9,7 @@
 #include <linux/module.h>
 #include <linux/clk.h>
 #include <linux/cpufreq.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <asm/proc-fns.h>
diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c
index 28d346062166..f9306410a07f 100644
--- a/drivers/cpufreq/maple-cpufreq.c
+++ b/drivers/cpufreq/maple-cpufreq.c
@@ -23,7 +23,7 @@
 #include <linux/completion.h>
 #include <linux/mutex.h>
 #include <linux/time.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 
 #define DBG(fmt...) pr_debug(fmt)
 
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 4b8ee2014da6..a28716d8fc54 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -23,7 +23,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/hardirq.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 
 #include <asm/machdep.h>
 #include <asm/irq.h>
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index ba9c31d98bd6..2cd2b06849a2 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -21,7 +21,7 @@
 #include <linux/init.h>
 #include <linux/completion.h>
 #include <linux/mutex.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 
 #include <asm/machdep.h>
 #include <asm/irq.h>
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 2f581d2d617d..df165a078d14 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -11,8 +11,8 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/of_address.h>
-#include <linux/of_platform.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
 #include <linux/slab.h>
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index c6fdf019dbde..78b875db6b66 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -18,7 +18,7 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/types.h>
diff --git a/drivers/cpufreq/tegra124-cpufreq.c b/drivers/cpufreq/tegra124-cpufreq.c
index 7a1ea6fdcab6..312ca5ddc6c4 100644
--- a/drivers/cpufreq/tegra124-cpufreq.c
+++ b/drivers/cpufreq/tegra124-cpufreq.c
@@ -11,7 +11,6 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/of_device.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index ab7ac7df9e62..5d1f5f87e46d 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -12,7 +12,7 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
 #include <linux/types.h>
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 65623233ab2f..3ac4a10d4651 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -15,7 +15,6 @@
 #include <linux/kobject.h>
 #include <linux/notifier.h>
 #include <linux/of.h>
-#include <linux/of_device.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
 #include <linux/spinlock.h>
-- 
cgit v1.2.3


From 7e09cb0b84ac5d17086a65ca21479dfec600a232 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Mar 2023 10:52:16 -0500
Subject: of: Drop cpu.h include from of_device.h

Now that all users which had an implicit dependency on cpu.h have been
fixed. the cpu.h include can be dropped from of_device.h

Link: https://lore.kernel.org/r/20230329-dt-cpu-header-cleanups-v1-19-581e2605fe47@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index bafe50150d24..33f0ca348a62 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
-#include <linux/cpu.h>
 #include <linux/platform_device.h>
 #include <linux/of_platform.h> /* temporary until merge */
 
-- 
cgit v1.2.3


From c75a79491835c50ca61938ae1ebd3ba5598f7410 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 28 Mar 2023 15:15:57 -0500
Subject: of/address: Add of_range_to_resource() helper

A few users need to convert a specific "ranges" entry into a struct
resource. Add a helper to similar to of_address_to_resource(). The
existing of_pci_range_to_resource() helper isn't really PCI specific,
so it can be used with the CONFIG_PCI check dropped.

Link: https://lore.kernel.org/r/20230328-dt-address-helpers-v1-2-e2456c3e77ab@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/address.c       | 31 ++++++++++++++++++++++++++++---
 drivers/of/unittest.c      | 16 +++++++++++++++-
 include/linux/of_address.h |  8 ++++++++
 3 files changed, 51 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 4c0b169ef9bf..b79f005834fc 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -229,9 +229,6 @@ int of_pci_range_to_resource(struct of_pci_range *range,
 	res->parent = res->child = res->sibling = NULL;
 	res->name = np->full_name;
 
-	if (!IS_ENABLED(CONFIG_PCI))
-		return -ENOSYS;
-
 	if (res->flags & IORESOURCE_IO) {
 		unsigned long port;
 		err = pci_register_io_range(&np->fwnode, range->cpu_addr,
@@ -263,6 +260,34 @@ invalid_range:
 }
 EXPORT_SYMBOL(of_pci_range_to_resource);
 
+/*
+ * of_range_to_resource - Create a resource from a ranges entry
+ * @np:		device node where the range belongs to
+ * @index:	the 'ranges' index to convert to a resource
+ * @res:	pointer to a valid resource that will be updated to
+ *              reflect the values contained in the range.
+ *
+ * Returns ENOENT if the entry is not found or EINVAL if the range cannot be
+ * converted to resource.
+ */
+int of_range_to_resource(struct device_node *np, int index, struct resource *res)
+{
+	int ret, i = 0;
+	struct of_range_parser parser;
+	struct of_range range;
+
+	ret = of_range_parser_init(&parser, np);
+	if (ret)
+		return ret;
+
+	for_each_of_range(&parser, &range)
+		if (i++ == index)
+			return of_pci_range_to_resource(&range, np, res);
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(of_range_to_resource);
+
 /*
  * ISA bus specific translator
  */
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 1a45df1f354a..945288d5672f 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1013,7 +1013,8 @@ static void __init of_unittest_bus_ranges(void)
 	struct device_node *np;
 	struct of_range range;
 	struct of_range_parser parser;
-	int i = 0;
+	struct resource res;
+	int ret, i = 0;
 
 	np = of_find_node_by_path("/testcase-data/address-tests");
 	if (!np) {
@@ -1026,6 +1027,19 @@ static void __init of_unittest_bus_ranges(void)
 		return;
 	}
 
+	ret = of_range_to_resource(np, 1, &res);
+	unittest(!ret, "of_range_to_resource returned error (%d) node %pOF\n",
+		ret, np);
+	unittest(resource_type(&res) == IORESOURCE_MEM,
+		"of_range_to_resource wrong resource type on node %pOF res=%pR\n",
+		np, &res);
+	unittest(res.start == 0xd0000000,
+		"of_range_to_resource wrong resource start address on node %pOF res=%pR\n",
+		np, &res);
+	unittest(resource_size(&res) == 0x20000000,
+		"of_range_to_resource wrong resource start address on node %pOF res=%pR\n",
+		np, &res);
+
 	/*
 	 * Get the "ranges" from the device tree
 	 */
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 376671594746..1d005439f026 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -68,6 +68,8 @@ extern int of_pci_address_to_resource(struct device_node *dev, int bar,
 extern int of_pci_range_to_resource(struct of_pci_range *range,
 				    struct device_node *np,
 				    struct resource *res);
+extern int of_range_to_resource(struct device_node *np, int index,
+				struct resource *res);
 extern bool of_dma_is_coherent(struct device_node *np);
 #else /* CONFIG_OF_ADDRESS */
 static inline void __iomem *of_io_request_and_map(struct device_node *device,
@@ -120,6 +122,12 @@ static inline int of_pci_range_to_resource(struct of_pci_range *range,
 	return -ENOSYS;
 }
 
+static inline int of_range_to_resource(struct device_node *np, int index,
+				       struct resource *res)
+{
+	return -ENOSYS;
+}
+
 static inline bool of_dma_is_coherent(struct device_node *np)
 {
 	return false;
-- 
cgit v1.2.3


From b50c788a56964a900ebcc817c8a5ad35ddad87b6 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 28 Mar 2023 15:15:59 -0500
Subject: of/address: Add of_range_count() helper

Some users need a count of the number of ranges entries before
iterating over the entries. Typically this is for allocating some data
structure based on the size. Add a helper, of_range_count(), to get the
count. The helper must be called with an struct of_range_parser
initialized by of_range_parser_init().

Link: https://lore.kernel.org/r/20230328-dt-address-helpers-v1-4-e2456c3e77ab@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/unittest.c      |  7 ++++++-
 include/linux/of_address.h | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 29066ecbed47..eaeb58065acc 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1014,7 +1014,7 @@ static void __init of_unittest_bus_ranges(void)
 	struct of_range range;
 	struct of_range_parser parser;
 	struct resource res;
-	int ret, i = 0;
+	int ret, count, i = 0;
 
 	np = of_find_node_by_path("/testcase-data/address-tests");
 	if (!np) {
@@ -1040,6 +1040,11 @@ static void __init of_unittest_bus_ranges(void)
 		"of_range_to_resource wrong resource start address on node %pOF res=%pR\n",
 		np, &res);
 
+	count = of_range_count(&parser);
+	unittest(count == 2,
+		"of_range_count wrong size on node %pOF count=%d\n",
+		np, count);
+
 	/*
 	 * Get the "ranges" from the device tree
 	 */
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 1d005439f026..5292f62c1baa 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -35,6 +35,22 @@ struct of_pci_range {
 	for (; of_pci_range_parser_one(parser, range);)
 #define for_each_of_range for_each_of_pci_range
 
+/*
+ * of_range_count - Get the number of "ranges" or "dma-ranges" entries
+ * @parser:	Parser state initialized by of_range_parser_init()
+ *
+ * Returns the number of entries or 0 if none.
+ *
+ * Note that calling this within or after the for_each_of_range() iterator will
+ * be inaccurate giving the number of entries remaining.
+ */
+static inline int of_range_count(const struct of_range_parser *parser)
+{
+	if (!parser || !parser->node || !parser->range || parser->range == parser->end)
+		return 0;
+	return (parser->end - parser->range) / (parser->na + parser->pna + parser->ns);
+}
+
 /* Translate a DMA address from device space to CPU space */
 extern u64 of_translate_dma_address(struct device_node *dev,
 				    const __be32 *in_addr);
-- 
cgit v1.2.3


From ff61bacd77f258bd2ed145efb69e5449b115d4fe Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 28 Mar 2023 15:16:00 -0500
Subject: of/address: Add of_property_read_reg() helper

Add a helper, of_property_read_reg(), to read "reg" entries untranslated
address and size. This function is intended mainly for cases with an
untranslatable "reg" address (i.e. not MMIO). There's also a few
translatable cases such as address cells containing a bus chip-select
number.

Link: https://lore.kernel.org/r/20230328-dt-address-helpers-v1-5-e2456c3e77ab@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/address.c       | 23 +++++++++++++++++++++++
 drivers/of/unittest.c      | 22 ++++++++++++++++++++++
 include/linux/of_address.h |  7 +++++++
 3 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 8cfae24148e0..fdb15c6fb3b1 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -760,6 +760,29 @@ const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
 }
 EXPORT_SYMBOL(__of_get_address);
 
+/**
+ * of_property_read_reg - Retrieve the specified "reg" entry index without translating
+ * @np: device tree node for which to retrieve "reg" from
+ * @idx: "reg" entry index to read
+ * @addr: return value for the untranslated address
+ * @size: return value for the entry size
+ *
+ * Returns -EINVAL if "reg" is not found. Returns 0 on success with addr and
+ * size values filled in.
+ */
+int of_property_read_reg(struct device_node *np, int idx, u64 *addr, u64 *size)
+{
+	const __be32 *prop = of_get_address(np, idx, size, NULL);
+
+	if (!prop)
+		return -EINVAL;
+
+	*addr = of_read_number(prop, of_n_addr_cells(np));
+
+	return 0;
+}
+EXPORT_SYMBOL(of_property_read_reg);
+
 static int parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node, const char *name)
 {
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index eaeb58065acc..e73ecbef977b 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1134,6 +1134,27 @@ static void __init of_unittest_bus_3cell_ranges(void)
 	of_node_put(np);
 }
 
+static void __init of_unittest_reg(void)
+{
+	struct device_node *np;
+	int ret;
+	u64 addr, size;
+
+	np = of_find_node_by_path("/testcase-data/address-tests/bus@80000000/device@1000");
+	if (!np) {
+		pr_err("missing testcase data\n");
+		return;
+	}
+
+	ret = of_property_read_reg(np, 0, &addr, &size);
+	unittest(!ret, "of_property_read_reg(%pOF) returned error %d\n",
+		np, ret);
+	unittest(addr == 0x1000, "of_property_read_reg(%pOF) untranslated address (%llx) incorrect\n",
+		np, addr);
+
+	of_node_put(np);
+}
+
 static void __init of_unittest_parse_interrupts(void)
 {
 	struct device_node *np;
@@ -3772,6 +3793,7 @@ static int __init of_unittest(void)
 	of_unittest_pci_dma_ranges();
 	of_unittest_bus_ranges();
 	of_unittest_bus_3cell_ranges();
+	of_unittest_reg();
 	of_unittest_match_node();
 	of_unittest_platform_populate();
 	of_unittest_overlay();
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 5292f62c1baa..26a19daf0d09 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -72,6 +72,8 @@ void __iomem *of_io_request_and_map(struct device_node *device,
 extern const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
 				      u64 *size, unsigned int *flags);
 
+int of_property_read_reg(struct device_node *np, int idx, u64 *addr, u64 *size);
+
 extern int of_pci_range_parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node);
 extern int of_pci_dma_range_parser_init(struct of_pci_range_parser *parser,
@@ -106,6 +108,11 @@ static inline const __be32 *__of_get_address(struct device_node *dev, int index,
 	return NULL;
 }
 
+static inline int of_property_read_reg(struct device_node *np, int idx, u64 *addr, u64 *size)
+{
+	return -ENOSYS;
+}
+
 static inline int of_pci_range_parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node)
 {
-- 
cgit v1.2.3


From 987d2e0aaa55de40938435be760aa96428470fd6 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 31 Mar 2023 17:15:52 +0800
Subject: module: Move is_arm_mapping_symbol() to module_symbol.h

In order to avoid duplicated code, move is_arm_mapping_symbol() to
include/linux/module_symbol.h, then remove is_arm_mapping_symbol()
in the other places.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module_symbol.h | 15 +++++++++++++++
 kernel/module/kallsyms.c      | 14 +-------------
 scripts/mod/modpost.c         | 10 +---------
 3 files changed, 17 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/module_symbol.h

(limited to 'include')

diff --git a/include/linux/module_symbol.h b/include/linux/module_symbol.h
new file mode 100644
index 000000000000..9fa4173d0197
--- /dev/null
+++ b/include/linux/module_symbol.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_MODULE_SYMBOL_H
+#define _LINUX_MODULE_SYMBOL_H
+
+/* This ignores the intensely annoying "mapping symbols" found in ELF files. */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+	if (str[0] == '.' && str[1] == 'L')
+		return true;
+	return str[0] == '$' &&
+	       (str[1] == 'a' || str[1] == 'd' || str[1] == 't' || str[1] == 'x')
+	       && (str[2] == '\0' || str[2] == '.');
+}
+
+#endif /* _LINUX_MODULE_SYMBOL_H */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index a9045fe55bcd..5de3207f18af 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/module_symbol.h>
 #include <linux/kallsyms.h>
 #include <linux/buildid.h>
 #include <linux/bsearch.h>
@@ -243,19 +244,6 @@ void init_build_id(struct module *mod, const struct load_info *info)
 }
 #endif
 
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
-	if (str[0] == '.' && str[1] == 'L')
-		return true;
-	return str[0] == '$' &&
-	       (str[1] == 'a' || str[1] == 'd' || str[1] == 't' || str[1] == 'x')
-	       && (str[2] == '\0' || str[2] == '.');
-}
-
 static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
 {
 	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 79a27cc5f0b1..7241db81ffde 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include "modpost.h"
 #include "../../include/linux/license.h"
+#include "../../include/linux/module_symbol.h"
 
 /* Are we using CONFIG_MODVERSIONS? */
 static bool modversions;
@@ -1112,15 +1113,6 @@ static int secref_whitelist(const struct sectioncheck *mismatch,
 	return 1;
 }
 
-static inline int is_arm_mapping_symbol(const char *str)
-{
-	if (str[0] == '.' && str[1] == 'L')
-		return true;
-	return str[0] == '$' &&
-	       (str[1] == 'a' || str[1] == 'd' || str[1] == 't' || str[1] == 'x')
-	       && (str[2] == '\0' || str[2] == '.');
-}
-
 /*
  * If there's no name there, ignore it; likewise, ignore it if it's
  * one of the magic symbols emitted used by current ARM tools.
-- 
cgit v1.2.3


From 0a3bf86092c38f7b72c56c6901c78dd302411307 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 31 Mar 2023 17:15:53 +0800
Subject: module: Ignore L0 and rename is_arm_mapping_symbol()

The L0 symbol is generated when build module on LoongArch, ignore it in
modpost and when looking at module symbols, otherwise we can not see the
expected call trace.

Now is_arm_mapping_symbol() is not only for ARM, in order to reflect the
reality, rename is_arm_mapping_symbol() to is_mapping_symbol().

This is related with commit c17a2538704f ("mksysmap: Fix the mismatch of
'L0' symbols in System.map").

(1) Simple test case

  [loongson@linux hello]$ cat hello.c
  #include <linux/init.h>
  #include <linux/module.h>
  #include <linux/printk.h>

  static void test_func(void)
  {
  	  pr_info("This is a test\n");
	  dump_stack();
  }

  static int __init hello_init(void)
  {
	  pr_warn("Hello, world\n");
	  test_func();

	  return 0;
  }

  static void __exit hello_exit(void)
  {
	  pr_warn("Goodbye\n");
  }

  module_init(hello_init);
  module_exit(hello_exit);
  MODULE_LICENSE("GPL");
  [loongson@linux hello]$ cat Makefile
  obj-m:=hello.o

  ccflags-y += -g -Og

  all:
	  make -C /lib/modules/$(shell uname -r)/build/ M=$(PWD) modules
  clean:
	  make -C /lib/modules/$(shell uname -r)/build/ M=$(PWD) clean

(2) Test environment

system: LoongArch CLFS 5.5
https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/tag/5.0
It needs to update grub to avoid booting error "invalid magic number".

kernel: 6.3-rc1 with loongson3_defconfig + CONFIG_DYNAMIC_FTRACE=y

(3) Test result

Without this patch:

  [root@linux hello]# insmod hello.ko
  [root@linux hello]# dmesg
  ...
  Hello, world
  This is a test
  ...
  Call Trace:
  [<9000000000223728>] show_stack+0x68/0x18c
  [<90000000013374cc>] dump_stack_lvl+0x60/0x88
  [<ffff800002050028>] L0\x01+0x20/0x2c [hello]
  [<ffff800002058028>] L0\x01+0x20/0x30 [hello]
  [<900000000022097c>] do_one_initcall+0x88/0x288
  [<90000000002df890>] do_init_module+0x54/0x200
  [<90000000002e1e18>] __do_sys_finit_module+0xc4/0x114
  [<90000000013382e8>] do_syscall+0x7c/0x94
  [<9000000000221e3c>] handle_syscall+0xbc/0x158

With this patch:

  [root@linux hello]# insmod hello.ko
  [root@linux hello]# dmesg
  ...
  Hello, world
  This is a test
  ...
  Call Trace:
  [<9000000000223728>] show_stack+0x68/0x18c
  [<90000000013374cc>] dump_stack_lvl+0x60/0x88
  [<ffff800002050028>] test_func+0x28/0x34 [hello]
  [<ffff800002058028>] hello_init+0x28/0x38 [hello]
  [<900000000022097c>] do_one_initcall+0x88/0x288
  [<90000000002df890>] do_init_module+0x54/0x200
  [<90000000002e1e18>] __do_sys_finit_module+0xc4/0x114
  [<90000000013382e8>] do_syscall+0x7c/0x94
  [<9000000000221e3c>] handle_syscall+0xbc/0x158

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Tested-by: Youling Tang <tangyouling@loongson.cn> # for LoongArch
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module_symbol.h | 4 +++-
 kernel/module/kallsyms.c      | 2 +-
 scripts/mod/modpost.c         | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/module_symbol.h b/include/linux/module_symbol.h
index 9fa4173d0197..7ace7ba30203 100644
--- a/include/linux/module_symbol.h
+++ b/include/linux/module_symbol.h
@@ -3,10 +3,12 @@
 #define _LINUX_MODULE_SYMBOL_H
 
 /* This ignores the intensely annoying "mapping symbols" found in ELF files. */
-static inline int is_arm_mapping_symbol(const char *str)
+static inline int is_mapping_symbol(const char *str)
 {
 	if (str[0] == '.' && str[1] == 'L')
 		return true;
+	if (str[0] == 'L' && str[1] == '0')
+		return true;
 	return str[0] == '$' &&
 	       (str[1] == 'a' || str[1] == 'd' || str[1] == 't' || str[1] == 'x')
 	       && (str[2] == '\0' || str[2] == '.');
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 5de3207f18af..d8e426a9a0cd 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -289,7 +289,7 @@ static const char *find_kallsyms_symbol(struct module *mod,
 		 * and inserted at a whim.
 		 */
 		if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
-		    is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+		    is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
 			continue;
 
 		if (thisval <= addr && thisval > bestval) {
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 7241db81ffde..5cddf7623945 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1115,7 +1115,7 @@ static int secref_whitelist(const struct sectioncheck *mismatch,
 
 /*
  * If there's no name there, ignore it; likewise, ignore it if it's
- * one of the magic symbols emitted used by current ARM tools.
+ * one of the magic symbols emitted used by current tools.
  *
  * Otherwise if find_symbols_between() returns those symbols, they'll
  * fail the whitelist tests and cause lots of false alarms ... fixable
@@ -1128,7 +1128,7 @@ static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym)
 
 	if (!name || !strlen(name))
 		return 0;
-	return !is_arm_mapping_symbol(name);
+	return !is_mapping_symbol(name);
 }
 
 /**
-- 
cgit v1.2.3


From daa9e76d17570cdd2dbec28244e60e2cb0eafb36 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 11 Apr 2023 14:59:04 +0200
Subject: dt-bindings: clock: qcom: describe the GPUCC clock for SA8775P

Add the compatible for the Qualcomm Graphics Clock control module present
on sa8775p platforms. It matches the generic QCom GPUCC description. Add
device-specific DT bindings defines as well.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230411125910.401075-2-brgl@bgdev.pl
---
 .../devicetree/bindings/clock/qcom,gpucc.yaml      |  2 +
 include/dt-bindings/clock/qcom,sa8775p-gpucc.h     | 50 ++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,sa8775p-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
index db53eb288995..1e3dc9deded9 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
@@ -15,6 +15,7 @@ description: |
 
   See also::
     include/dt-bindings/clock/qcom,gpucc-sdm845.h
+    include/dt-bindings/clock/qcom,gpucc-sa8775p.h
     include/dt-bindings/clock/qcom,gpucc-sc7180.h
     include/dt-bindings/clock/qcom,gpucc-sc7280.h
     include/dt-bindings/clock/qcom,gpucc-sc8280xp.h
@@ -27,6 +28,7 @@ properties:
   compatible:
     enum:
       - qcom,sdm845-gpucc
+      - qcom,sa8775p-gpucc
       - qcom,sc7180-gpucc
       - qcom,sc7280-gpucc
       - qcom,sc8180x-gpucc
diff --git a/include/dt-bindings/clock/qcom,sa8775p-gpucc.h b/include/dt-bindings/clock/qcom,sa8775p-gpucc.h
new file mode 100644
index 000000000000..a5fd784b1ea2
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sa8775p-gpucc.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPUCC_SA8775P_H
+#define _DT_BINDINGS_CLK_QCOM_GPUCC_SA8775P_H
+
+/* GPU_CC clocks */
+#define GPU_CC_PLL0				0
+#define GPU_CC_PLL1				1
+#define GPU_CC_AHB_CLK				2
+#define GPU_CC_CB_CLK				3
+#define GPU_CC_CRC_AHB_CLK			4
+#define GPU_CC_CX_FF_CLK			5
+#define GPU_CC_CX_GMU_CLK			6
+#define GPU_CC_CX_SNOC_DVM_CLK			7
+#define GPU_CC_CXO_AON_CLK			8
+#define GPU_CC_CXO_CLK				9
+#define GPU_CC_DEMET_CLK			10
+#define GPU_CC_DEMET_DIV_CLK_SRC		11
+#define GPU_CC_FF_CLK_SRC			12
+#define GPU_CC_GMU_CLK_SRC			13
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK		14
+#define GPU_CC_HUB_AHB_DIV_CLK_SRC		15
+#define GPU_CC_HUB_AON_CLK			16
+#define GPU_CC_HUB_CLK_SRC			17
+#define GPU_CC_HUB_CX_INT_CLK			18
+#define GPU_CC_HUB_CX_INT_DIV_CLK_SRC		19
+#define GPU_CC_MEMNOC_GFX_CLK			20
+#define GPU_CC_SLEEP_CLK			21
+#define GPU_CC_XO_CLK_SRC			22
+
+/* GPU_CC resets */
+#define GPUCC_GPU_CC_ACD_BCR			0
+#define GPUCC_GPU_CC_CB_BCR			1
+#define GPUCC_GPU_CC_CX_BCR			2
+#define GPUCC_GPU_CC_FAST_HUB_BCR		3
+#define GPUCC_GPU_CC_FF_BCR			4
+#define GPUCC_GPU_CC_GFX3D_AON_BCR		5
+#define GPUCC_GPU_CC_GMU_BCR			6
+#define GPUCC_GPU_CC_GX_BCR			7
+#define GPUCC_GPU_CC_XO_BCR			8
+
+/* GPU_CC power domains */
+#define GPU_CC_CX_GDSC				0
+#define GPU_CC_GX_GDSC				1
+
+#endif /* _DT_BINDINGS_CLK_QCOM_GPUCC_SA8775P_H */
-- 
cgit v1.2.3


From 1cf3bfc60f9836f44da951f58b6ae24680484b35 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 13 Apr 2023 01:06:32 +0200
Subject: bpf: Support 64-bit pointers to kfuncs

test_ksyms_module fails to emit a kfunc call targeting a module on
s390x, because the verifier stores the difference between kfunc
address and __bpf_call_base in bpf_insn.imm, which is s32, and modules
are roughly (1 << 42) bytes away from the kernel on s390x.

Fix by keeping BTF id in bpf_insn.imm for BPF_PSEUDO_KFUNC_CALLs,
and storing the absolute address in bpf_kfunc_desc.

Introduce bpf_jit_supports_far_kfunc_call() in order to limit this new
behavior to the s390x JIT. Otherwise other JITs need to be modified,
which is not desired.

Introduce bpf_get_kfunc_addr() instead of exposing both
find_kfunc_desc() and struct bpf_kfunc_desc.

In addition to sorting kfuncs by imm, also sort them by offset, in
order to handle conflicting imms from different modules. Do this on
all architectures in order to simplify code.

Factor out resolving specialized kfuncs (XPD and dynptr) from
fixup_kfunc_call(). This was required in the first place, because
fixup_kfunc_call() uses find_kfunc_desc(), which returns a const
pointer, so it's not possible to modify kfunc addr without stripping
const, which is not nice. It also removes repetition of code like:

	if (bpf_jit_supports_far_kfunc_call())
		desc->addr = func;
	else
		insn->imm = BPF_CALL_IMM(func);

and separates kfunc_desc_tab fixups from kfunc_call fixups.

Suggested-by: Jiri Olsa <olsajiri@gmail.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230412230632.885985-1-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/s390/net/bpf_jit_comp.c |   5 ++
 include/linux/bpf.h          |  10 ++++
 include/linux/filter.h       |   1 +
 kernel/bpf/core.c            |  11 ++++
 kernel/bpf/verifier.c        | 123 +++++++++++++++++++++++++++++--------------
 5 files changed, 110 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d0846ba818ee..7102e4b674a0 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2001,6 +2001,11 @@ bool bpf_jit_supports_kfunc_call(void)
 	return true;
 }
 
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+	return true;
+}
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *old_addr, void *new_addr)
 {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2c6095bd7d69..88845aadc47d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2295,6 +2295,9 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
 bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 			 const struct bpf_insn *insn);
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		       u16 btf_fd_idx, u8 **func_addr);
+
 struct bpf_core_ctx {
 	struct bpf_verifier_log *log;
 	const struct btf *btf;
@@ -2545,6 +2548,13 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 	return NULL;
 }
 
+static inline int
+bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		   u16 btf_fd_idx, u8 **func_addr)
+{
+	return -ENOTSUPP;
+}
+
 static inline bool unprivileged_ebpf_enabled(void)
 {
 	return false;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5364b0c52c1d..bbce89937fde 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -920,6 +920,7 @@ void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
+bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e2d256c82072..7421487422d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 	s16 off = insn->off;
 	s32 imm = insn->imm;
 	u8 *addr;
+	int err;
 
 	*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
 	if (!*func_addr_fixed) {
@@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			addr = (u8 *)prog->aux->func[off]->bpf_func;
 		else
 			return -EINVAL;
+	} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+		   bpf_jit_supports_far_kfunc_call()) {
+		err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+		if (err)
+			return err;
 	} else {
 		/* Address of a BPF helper call. Since part of the core
 		 * kernel, it's always at a fixed location. __bpf_call_base
@@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e3d5a7a2f428..4aa6d715e655 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -195,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
 static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
 static int ref_set_non_owning(struct bpf_verifier_env *env,
 			      struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+			     u32 func_id, u16 offset, unsigned long *addr);
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
@@ -2374,6 +2376,7 @@ struct bpf_kfunc_desc {
 	u32 func_id;
 	s32 imm;
 	u16 offset;
+	unsigned long addr;
 };
 
 struct bpf_kfunc_btf {
@@ -2383,6 +2386,11 @@ struct bpf_kfunc_btf {
 };
 
 struct bpf_kfunc_desc_tab {
+	/* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+	 * verification. JITs do lookups by bpf_insn, where func_id may not be
+	 * available, therefore at the end of verification do_misc_fixups()
+	 * sorts this by imm and offset.
+	 */
 	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
 	u32 nr_descs;
 };
@@ -2423,6 +2431,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
 		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
 }
 
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		       u16 btf_fd_idx, u8 **func_addr)
+{
+	const struct bpf_kfunc_desc *desc;
+
+	desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+	if (!desc)
+		return -EFAULT;
+
+	*func_addr = (u8 *)desc->addr;
+	return 0;
+}
+
 static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
 					 s16 offset)
 {
@@ -2602,13 +2623,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 			func_name);
 		return -EINVAL;
 	}
+	specialize_kfunc(env, func_id, offset, &addr);
 
-	call_imm = BPF_CALL_IMM(addr);
-	/* Check whether or not the relative offset overflows desc->imm */
-	if ((unsigned long)(s32)call_imm != call_imm) {
-		verbose(env, "address of kernel function %s is out of range\n",
-			func_name);
-		return -EINVAL;
+	if (bpf_jit_supports_far_kfunc_call()) {
+		call_imm = func_id;
+	} else {
+		call_imm = BPF_CALL_IMM(addr);
+		/* Check whether the relative offset overflows desc->imm */
+		if ((unsigned long)(s32)call_imm != call_imm) {
+			verbose(env, "address of kernel function %s is out of range\n",
+				func_name);
+			return -EINVAL;
+		}
 	}
 
 	if (bpf_dev_bound_kfunc_id(func_id)) {
@@ -2621,6 +2647,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 	desc->func_id = func_id;
 	desc->imm = call_imm;
 	desc->offset = offset;
+	desc->addr = addr;
 	err = btf_distill_func_proto(&env->log, desc_btf,
 				     func_proto, func_name,
 				     &desc->func_model);
@@ -2630,19 +2657,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 	return err;
 }
 
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
 {
 	const struct bpf_kfunc_desc *d0 = a;
 	const struct bpf_kfunc_desc *d1 = b;
 
-	if (d0->imm > d1->imm)
-		return 1;
-	else if (d0->imm < d1->imm)
-		return -1;
+	if (d0->imm != d1->imm)
+		return d0->imm < d1->imm ? -1 : 1;
+	if (d0->offset != d1->offset)
+		return d0->offset < d1->offset ? -1 : 1;
 	return 0;
 }
 
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
 {
 	struct bpf_kfunc_desc_tab *tab;
 
@@ -2651,7 +2678,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
 		return;
 
 	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
-	     kfunc_desc_cmp_by_imm, NULL);
+	     kfunc_desc_cmp_by_imm_off, NULL);
 }
 
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -2665,13 +2692,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 {
 	const struct bpf_kfunc_desc desc = {
 		.imm = insn->imm,
+		.offset = insn->off,
 	};
 	const struct bpf_kfunc_desc *res;
 	struct bpf_kfunc_desc_tab *tab;
 
 	tab = prog->aux->kfunc_tab;
 	res = bsearch(&desc, tab->descs, tab->nr_descs,
-		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
 
 	return res ? &res->func_model : NULL;
 }
@@ -17293,11 +17321,45 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+			     u32 func_id, u16 offset, unsigned long *addr)
+{
+	struct bpf_prog *prog = env->prog;
+	bool seen_direct_write;
+	void *xdp_kfunc;
+	bool is_rdonly;
+
+	if (bpf_dev_bound_kfunc_id(func_id)) {
+		xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+		if (xdp_kfunc) {
+			*addr = (unsigned long)xdp_kfunc;
+			return;
+		}
+		/* fallback to default kfunc when not supported by netdev */
+	}
+
+	if (offset)
+		return;
+
+	if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+		seen_direct_write = env->seen_direct_write;
+		is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+		if (is_rdonly)
+			*addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+		/* restore env->seen_direct_write to its original value, since
+		 * may_access_direct_pkt_data mutates it
+		 */
+		env->seen_direct_write = seen_direct_write;
+	}
+}
+
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
 	const struct bpf_kfunc_desc *desc;
-	void *xdp_kfunc;
 
 	if (!insn->imm) {
 		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -17306,18 +17368,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	*cnt = 0;
 
-	if (bpf_dev_bound_kfunc_id(insn->imm)) {
-		xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
-		if (xdp_kfunc) {
-			insn->imm = BPF_CALL_IMM(xdp_kfunc);
-			return 0;
-		}
-
-		/* fallback to default kfunc when not supported by netdev */
-	}
-
-	/* insn->imm has the btf func_id. Replace it with
-	 * an address (relative to __bpf_call_base).
+	/* insn->imm has the btf func_id. Replace it with an offset relative to
+	 * __bpf_call_base, unless the JIT needs to call functions that are
+	 * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
 	 */
 	desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
 	if (!desc) {
@@ -17326,7 +17379,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EFAULT;
 	}
 
-	insn->imm = desc->imm;
+	if (!bpf_jit_supports_far_kfunc_call())
+		insn->imm = BPF_CALL_IMM(desc->addr);
 	if (insn->off)
 		return 0;
 	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
@@ -17351,17 +17405,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
-	} else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
-		bool seen_direct_write = env->seen_direct_write;
-		bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
-
-		if (is_rdonly)
-			insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly);
-
-		/* restore env->seen_direct_write to its original value, since
-		 * may_access_direct_pkt_data mutates it
-		 */
-		env->seen_direct_write = seen_direct_write;
 	}
 	return 0;
 }
@@ -17891,7 +17934,7 @@ patch_call_imm:
 		}
 	}
 
-	sort_kfunc_descs_by_imm(env->prog);
+	sort_kfunc_descs_by_imm_off(env->prog);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 9ecd05794b8da1f6cfca4c3721a3b0fed2e21a82 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 12 Apr 2023 15:47:30 +0300
Subject: net: mscc: ocelot: strengthen type of "u32 reg" in I/O accessors

The "u32 reg" argument that is passed to these functions is not a plain
address, but rather a driver-specific encoding of another enum
ocelot_target target in the upper bits, and an index into the
u32 ocelot->map[target][] array in the lower bits. That encoded value
takes the type "enum ocelot_reg" and is what is passed to these I/O
functions, so let's actually use that to prevent type confusion.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_io.c | 20 +++++++++++---------
 include/soc/mscc/ocelot.h             | 20 +++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c
index 2067382d0ee1..ddb96f32830d 100644
--- a/drivers/net/ethernet/mscc/ocelot_io.c
+++ b/drivers/net/ethernet/mscc/ocelot_io.c
@@ -10,8 +10,8 @@
 
 #include "ocelot.h"
 
-int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
-			  int count)
+int __ocelot_bulk_read_ix(struct ocelot *ocelot, enum ocelot_reg reg,
+			  u32 offset, void *buf, int count)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -23,7 +23,7 @@ int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
 }
 EXPORT_SYMBOL_GPL(__ocelot_bulk_read_ix);
 
-u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset)
+u32 __ocelot_read_ix(struct ocelot *ocelot, enum ocelot_reg reg, u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 	u32 val;
@@ -36,7 +36,8 @@ u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset)
 }
 EXPORT_SYMBOL_GPL(__ocelot_read_ix);
 
-void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset)
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, enum ocelot_reg reg,
+		       u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -47,8 +48,8 @@ void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset)
 }
 EXPORT_SYMBOL_GPL(__ocelot_write_ix);
 
-void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
-		     u32 offset)
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask,
+		     enum ocelot_reg reg, u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -60,7 +61,7 @@ void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
 }
 EXPORT_SYMBOL_GPL(__ocelot_rmw_ix);
 
-u32 ocelot_port_readl(struct ocelot_port *port, u32 reg)
+u32 ocelot_port_readl(struct ocelot_port *port, enum ocelot_reg reg)
 {
 	struct ocelot *ocelot = port->ocelot;
 	u16 target = reg >> TARGET_OFFSET;
@@ -73,7 +74,7 @@ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_readl);
 
-void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
+void ocelot_port_writel(struct ocelot_port *port, u32 val, enum ocelot_reg reg)
 {
 	struct ocelot *ocelot = port->ocelot;
 	u16 target = reg >> TARGET_OFFSET;
@@ -84,7 +85,8 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_writel);
 
-void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg)
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask,
+		      enum ocelot_reg reg)
 {
 	u32 cur = ocelot_port_readl(port, reg);
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index d757b5e26d26..277e6d1f2096 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -940,15 +940,17 @@ struct ocelot_policer {
 	__ocelot_target_write_ix(ocelot, target, val, reg, 0)
 
 /* I/O */
-u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
-void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
-void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg);
-int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
-			  int count);
-u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
-void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
-void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
-		     u32 offset);
+u32 ocelot_port_readl(struct ocelot_port *port, enum ocelot_reg reg);
+void ocelot_port_writel(struct ocelot_port *port, u32 val, enum ocelot_reg reg);
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask,
+		      enum ocelot_reg reg);
+int __ocelot_bulk_read_ix(struct ocelot *ocelot, enum ocelot_reg reg,
+			  u32 offset, void *buf, int count);
+u32 __ocelot_read_ix(struct ocelot *ocelot, enum ocelot_reg reg, u32 offset);
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, enum ocelot_reg reg,
+		       u32 offset);
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask,
+		     enum ocelot_reg reg, u32 offset);
 u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target,
 			    u32 reg, u32 offset);
 void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
-- 
cgit v1.2.3


From d54151aa0f4b5c89561705a00d8a5ebb4230028c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:49 +0300
Subject: net: ethtool: create and export ethtool_dev_mm_supported()

Create a wrapper over __ethtool_dev_mm_supported() which also calls
ethnl_ops_begin() and ethnl_ops_complete(). It can be used by other code
layers, such as tc, to make sure that preemptible TCs are supported
(this is true if an underlying MAC Merge layer exists).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool_netlink.h |  6 ++++++
 net/ethtool/mm.c                | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index 17003b385756..fae0dfb9a9c8 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -39,6 +39,7 @@ void ethtool_aggregate_pause_stats(struct net_device *dev,
 				   struct ethtool_pause_stats *pause_stats);
 void ethtool_aggregate_rmon_stats(struct net_device *dev,
 				  struct ethtool_rmon_stats *rmon_stats);
+bool ethtool_dev_mm_supported(struct net_device *dev);
 
 #else
 static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd)
@@ -112,5 +113,10 @@ ethtool_aggregate_rmon_stats(struct net_device *dev,
 {
 }
 
+static inline bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+	return false;
+}
+
 #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */
 #endif /* _LINUX_ETHTOOL_NETLINK_H_ */
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index fce3cc2734f9..e00d7d5cea7e 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -249,3 +249,26 @@ bool __ethtool_dev_mm_supported(struct net_device *dev)
 
 	return !ret;
 }
+
+bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	bool supported;
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (!ops)
+		return false;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return false;
+
+	supported = __ethtool_dev_mm_supported(dev);
+
+	ethnl_ops_complete(dev);
+
+	return supported;
+}
+EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
-- 
cgit v1.2.3


From c54876cd5961ce0f8e74807f79a6739cd6b35ddf Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:53 +0300
Subject: net/sched: pass netlink extack to mqprio and taprio offload

With the multiplexed ndo_setup_tc() model which lacks a first-class
struct netlink_ext_ack * argument, the only way to pass the netlink
extended ACK message down to the device driver is to embed it within the
offload structure.

Do this for struct tc_mqprio_qopt_offload and struct tc_taprio_qopt_offload.

Since struct tc_taprio_qopt_offload also contains a tc_mqprio_qopt_offload
structure, and since device drivers might effectively reuse their mqprio
implementation for the mqprio portion of taprio, we make taprio set the
extack in both offload structures to point at the same netlink extack
message.

In fact, the taprio handling is a bit more tricky, for 2 reasons.

First is because the offload structure has a longer lifetime than the
extack structure. The driver is supposed to populate the extack
synchronously from ndo_setup_tc() and leave it alone afterwards.
To not have any use-after-free surprises, we zero out the extack pointer
when we leave taprio_enable_offload().

The second reason is because taprio does overwrite the extack message on
ndo_setup_tc() error. We need to switch to the weak form of setting an
extack message, which preserves a potential message set by the driver.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h |  2 ++
 net/sched/sch_mqprio.c  |  5 ++++-
 net/sched/sch_taprio.c  | 12 ++++++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index bb0bd69fb655..b43ed4733455 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -166,6 +166,7 @@ struct tc_mqprio_caps {
 struct tc_mqprio_qopt_offload {
 	/* struct tc_mqprio_qopt must always be the first element */
 	struct tc_mqprio_qopt qopt;
+	struct netlink_ext_ack *extack;
 	u16 mode;
 	u16 shaper;
 	u32 flags;
@@ -193,6 +194,7 @@ struct tc_taprio_sched_entry {
 
 struct tc_taprio_qopt_offload {
 	struct tc_mqprio_qopt_offload mqprio;
+	struct netlink_ext_ack *extack;
 	u8 enable;
 	ktime_t base_time;
 	u64 cycle_time;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index aae4d64dbf3f..67d77495c8fd 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -33,9 +33,12 @@ static int mqprio_enable_offload(struct Qdisc *sch,
 				 const struct tc_mqprio_qopt *qopt,
 				 struct netlink_ext_ack *extack)
 {
-	struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
 	struct mqprio_sched *priv = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt_offload mqprio = {
+		.qopt = *qopt,
+		.extack = extack,
+	};
 	int err, i;
 
 	switch (priv->mode) {
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 1f469861eae3..cbad43019172 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1520,7 +1520,9 @@ static int taprio_enable_offload(struct net_device *dev,
 		return -ENOMEM;
 	}
 	offload->enable = 1;
+	offload->extack = extack;
 	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+	offload->mqprio.extack = extack;
 	taprio_sched_to_offload(dev, sched, offload, &caps);
 
 	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
@@ -1528,14 +1530,20 @@ static int taprio_enable_offload(struct net_device *dev,
 
 	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
 	if (err < 0) {
-		NL_SET_ERR_MSG(extack,
-			       "Device failed to setup taprio offload");
+		NL_SET_ERR_MSG_WEAK(extack,
+				    "Device failed to setup taprio offload");
 		goto done;
 	}
 
 	q->offloaded = true;
 
 done:
+	/* The offload structure may linger around via a reference taken by the
+	 * device driver, so clear up the netlink extack pointer so that the
+	 * driver isn't tempted to dereference data which stopped being valid
+	 */
+	offload->extack = NULL;
+	offload->mqprio.extack = NULL;
 	taprio_offload_free(offload);
 
 	return err;
-- 
cgit v1.2.3


From f62af20bed2d9e824f51cfc97ff01bc261f40e58 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:54 +0300
Subject: net/sched: mqprio: allow per-TC user input of FP adminStatus

IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each
packet priority can be assigned to a "frame preemption status" value of
either "express" or "preemptible". Express priorities are transmitted by
the local device through the eMAC, and preemptible priorities through
the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge
layer).

The FP adminStatus is defined per packet priority, but 802.1Q clause
12.30.1.1.1 framePreemptionAdminStatus also says that:

| Priorities that all map to the same traffic class should be
| constrained to use the same value of preemption status.

It is impossible to ignore the cognitive dissonance in the standard
here, because it practically means that the FP adminStatus only takes
distinct values per traffic class, even though it is defined per
priority.

I can see no valid use case which is prevented by having the kernel take
the FP adminStatus as input per traffic class (what we do here).
In addition, this also enforces the above constraint by construction.
User space network managers which wish to expose FP adminStatus per
priority are free to do so; they must only observe the prio_tc_map of
the netdev (which presumably is also under their control, when
constructing the mqprio netlink attributes).

The reason for configuring frame preemption as a property of the Qdisc
layer is that the information about "preemptible TCs" is closest to the
place which handles the num_tc and prio_tc_map of the netdev. If the
UAPI would have been any other layer, it would be unclear what to do
with the FP information when num_tc collapses to 0. A key assumption is
that only mqprio/taprio change the num_tc and prio_tc_map of the netdev.
Not sure if that's a great assumption to make.

Having FP in tc-mqprio can be seen as an implementation of the use case
defined in 802.1Q Annex S.2 "Preemption used in isolation". There will
be a separate implementation of FP in tc-taprio, for the other use
cases.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h        |   1 +
 include/uapi/linux/pkt_sched.h |  16 ++++++
 net/sched/sch_mqprio.c         | 128 ++++++++++++++++++++++++++++++++++++++++-
 net/sched/sch_mqprio_lib.c     |  14 +++++
 net/sched/sch_mqprio_lib.h     |   2 +
 5 files changed, 160 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b43ed4733455..f436688b6efc 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -172,6 +172,7 @@ struct tc_mqprio_qopt_offload {
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	unsigned long preemptible_tcs;
 };
 
 struct tc_taprio_caps {
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 000eec106856..b8d29be91b62 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -719,6 +719,11 @@ enum {
 
 #define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
 
+enum {
+	TC_FP_EXPRESS = 1,
+	TC_FP_PREEMPTIBLE = 2,
+};
+
 struct tc_mqprio_qopt {
 	__u8	num_tc;
 	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -732,12 +737,23 @@ struct tc_mqprio_qopt {
 #define TC_MQPRIO_F_MIN_RATE		0x4
 #define TC_MQPRIO_F_MAX_RATE		0x8
 
+enum {
+	TCA_MQPRIO_TC_ENTRY_UNSPEC,
+	TCA_MQPRIO_TC_ENTRY_INDEX,		/* u32 */
+	TCA_MQPRIO_TC_ENTRY_FP,			/* u32 */
+
+	/* add new constants above here */
+	__TCA_MQPRIO_TC_ENTRY_CNT,
+	TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
+};
+
 enum {
 	TCA_MQPRIO_UNSPEC,
 	TCA_MQPRIO_MODE,
 	TCA_MQPRIO_SHAPER,
 	TCA_MQPRIO_MIN_RATE64,
 	TCA_MQPRIO_MAX_RATE64,
+	TCA_MQPRIO_TC_ENTRY,
 	__TCA_MQPRIO_MAX,
 };
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 67d77495c8fd..dc5a0ff50b14 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -5,6 +5,7 @@
  * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
  */
 
+#include <linux/ethtool_netlink.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -27,6 +28,7 @@ struct mqprio_sched {
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	u32 fp[TC_QOPT_MAX_QUEUE];
 };
 
 static int mqprio_enable_offload(struct Qdisc *sch,
@@ -63,6 +65,8 @@ static int mqprio_enable_offload(struct Qdisc *sch,
 		return -EINVAL;
 	}
 
+	mqprio_fp_to_offload(priv->fp, &mqprio);
+
 	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
 					    &mqprio);
 	if (err)
@@ -145,13 +149,95 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
 	return 0;
 }
 
+static const struct
+nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+	[TCA_MQPRIO_TC_ENTRY_INDEX]	= NLA_POLICY_MAX(NLA_U32,
+							 TC_QOPT_MAX_QUEUE),
+	[TCA_MQPRIO_TC_ENTRY_FP]	= NLA_POLICY_RANGE(NLA_U32,
+							   TC_FP_EXPRESS,
+							   TC_FP_PREEMPTIBLE),
+};
+
 static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
 	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
 	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
+	[TCA_MQPRIO_TC_ENTRY]	= { .type = NLA_NESTED },
 };
 
+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+				 struct nlattr *opt,
+				 unsigned long *seen_tcs,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+	int err, tc;
+
+	err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+			       mqprio_tc_entry_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+		NL_SET_ERR_MSG(extack, "TC entry index missing");
+		return -EINVAL;
+	}
+
+	tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+	if (*seen_tcs & BIT(tc)) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+				    "Duplicate tc entry");
+		return -EINVAL;
+	}
+
+	*seen_tcs |= BIT(tc);
+
+	if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+		fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
+
+	return 0;
+}
+
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+				   int nlattr_opt_len,
+				   struct netlink_ext_ack *extack)
+{
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	bool have_preemption = false;
+	unsigned long seen_tcs = 0;
+	u32 fp[TC_QOPT_MAX_QUEUE];
+	struct nlattr *n;
+	int tc, rem;
+	int err = 0;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		fp[tc] = priv->fp[tc];
+
+	nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
+		if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
+			continue;
+
+		err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+		if (err)
+			goto out;
+	}
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		priv->fp[tc] = fp[tc];
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			have_preemption = true;
+	}
+
+	if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+		NL_SET_ERR_MSG(extack, "Device does not support preemption");
+		return -EOPNOTSUPP;
+	}
+out:
+	return err;
+}
+
 /* Parse the other netlink attributes that represent the payload of
  * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
  */
@@ -234,6 +320,13 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
 		priv->flags |= TC_MQPRIO_F_MAX_RATE;
 	}
 
+	if (tb[TCA_MQPRIO_TC_ENTRY]) {
+		err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+					      extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -247,7 +340,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
 	int i, err = -EOPNOTSUPP;
 	struct tc_mqprio_qopt *qopt = NULL;
 	struct tc_mqprio_caps caps;
-	int len;
+	int len, tc;
 
 	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
 	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -265,6 +358,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		priv->fp[tc] = TC_FP_EXPRESS;
+
 	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
 				 &caps, sizeof(caps));
 
@@ -415,6 +511,33 @@ nla_put_failure:
 	return -1;
 }
 
+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+				  struct sk_buff *skb)
+{
+	struct nlattr *n;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+		if (!n)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, n);
+	}
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, n);
+	return -EMSGSIZE;
+}
+
 static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct net_device *dev = qdisc_dev(sch);
@@ -465,6 +588,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    (dump_rates(priv, &opt, skb) != 0))
 		goto nla_put_failure;
 
+	if (mqprio_dump_tc_entries(priv, skb))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, nla);
 nla_put_failure:
 	nlmsg_trim(skb, nla);
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
index c58a533b8ec5..83b3793c4012 100644
--- a/net/sched/sch_mqprio_lib.c
+++ b/net/sched/sch_mqprio_lib.c
@@ -114,4 +114,18 @@ void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt
 }
 EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
 
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio)
+{
+	unsigned long preemptible_tcs = 0;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			preemptible_tcs |= BIT(tc);
+
+	mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
 MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
index 63f725ab8761..079f597072e3 100644
--- a/net/sched/sch_mqprio_lib.h
+++ b/net/sched/sch_mqprio_lib.h
@@ -14,5 +14,7 @@ int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
 			 struct netlink_ext_ack *extack);
 void mqprio_qopt_reconstruct(struct net_device *dev,
 			     struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio);
 
 #endif
-- 
cgit v1.2.3


From a721c3e54b80e45cd9202e7fca29ef018bed9069 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:55 +0300
Subject: net/sched: taprio: allow per-TC user input of FP adminStatus

This is a duplication of the FP adminStatus logic introduced for
tc-mqprio. Offloading is done through the tc_mqprio_qopt_offload
structure embedded within tc_taprio_qopt_offload. So practically, if a
device driver is written to treat the mqprio portion of taprio just like
standalone mqprio, it gets unified handling of frame preemption.

I would have reused more code with taprio, but this is mostly netlink
attribute parsing, which is hard to transform into generic code without
having something that stinks as a result. We have the same variables
with the same semantics, just different nlattr type values
(TCA_MQPRIO_TC_ENTRY=5 vs TCA_TAPRIO_ATTR_TC_ENTRY=12;
TCA_MQPRIO_TC_ENTRY_FP=2 vs TCA_TAPRIO_TC_ENTRY_FP=3, etc) and
consequently, different policies for the nest.

Every time nla_parse_nested() is called, an on-stack table "tb" of
nlattr pointers is allocated statically, up to the maximum understood
nlattr type. That array size is hardcoded as a constant, but when
transforming this into a common parsing function, it would become either
a VLA (which the Linux kernel rightfully doesn't like) or a call to the
allocator.

Having FP adminStatus in tc-taprio can be seen as addressing the 802.1Q
Annex S.3 "Scheduling and preemption used in combination, no HOLD/RELEASE"
and S.4 "Scheduling and preemption used in combination with HOLD/RELEASE"
use cases. HOLD and RELEASE events are emitted towards the underlying
MAC Merge layer when the schedule hits a Set-And-Hold-MAC or a
Set-And-Release-MAC gate operation. So within the tc-taprio UAPI space,
one can distinguish between the 2 use cases by choosing whether to use
the TC_TAPRIO_CMD_SET_AND_HOLD and TC_TAPRIO_CMD_SET_AND_RELEASE gate
operations within the schedule, or just TC_TAPRIO_CMD_SET_GATES.

A small part of the change is dedicated to refactoring the max_sdu
nlattr parsing to put all logic under the "if" that tests for presence
of that nlattr.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_taprio.c         | 65 +++++++++++++++++++++++++++++++++---------
 2 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index b8d29be91b62..51a7addc56c6 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1252,6 +1252,7 @@ enum {
 	TCA_TAPRIO_TC_ENTRY_UNSPEC,
 	TCA_TAPRIO_TC_ENTRY_INDEX,		/* u32 */
 	TCA_TAPRIO_TC_ENTRY_MAX_SDU,		/* u32 */
+	TCA_TAPRIO_TC_ENTRY_FP,			/* u32 */
 
 	/* add new constants above here */
 	__TCA_TAPRIO_TC_ENTRY_CNT,
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index cbad43019172..76db9a10ef50 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -96,6 +97,7 @@ struct taprio_sched {
 	struct list_head taprio_list;
 	int cur_txq[TC_MAX_QUEUE];
 	u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
+	u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
 	u32 txtime_delay;
 };
 
@@ -1002,6 +1004,9 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
 static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
 	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = { .type = NLA_U32 },
 	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
+	[TCA_TAPRIO_TC_ENTRY_FP]	   = NLA_POLICY_RANGE(NLA_U32,
+							      TC_FP_EXPRESS,
+							      TC_FP_PREEMPTIBLE),
 };
 
 static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
@@ -1524,6 +1529,7 @@ static int taprio_enable_offload(struct net_device *dev,
 	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
 	offload->mqprio.extack = extack;
 	taprio_sched_to_offload(dev, sched, offload, &caps);
+	mqprio_fp_to_offload(q->fp, &offload->mqprio);
 
 	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
 		offload->max_sdu[tc] = q->max_sdu[tc];
@@ -1671,13 +1677,14 @@ out:
 static int taprio_parse_tc_entry(struct Qdisc *sch,
 				 struct nlattr *opt,
 				 u32 max_sdu[TC_QOPT_MAX_QUEUE],
+				 u32 fp[TC_QOPT_MAX_QUEUE],
 				 unsigned long *seen_tcs,
 				 struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
 	struct net_device *dev = qdisc_dev(sch);
-	u32 val = 0;
 	int err, tc;
+	u32 val;
 
 	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
 			       taprio_tc_policy, extack);
@@ -1702,15 +1709,18 @@ static int taprio_parse_tc_entry(struct Qdisc *sch,
 
 	*seen_tcs |= BIT(tc);
 
-	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
+	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
 		val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
+		if (val > dev->max_mtu) {
+			NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
+			return -ERANGE;
+		}
 
-	if (val > dev->max_mtu) {
-		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
-		return -ERANGE;
+		max_sdu[tc] = val;
 	}
 
-	max_sdu[tc] = val;
+	if (tb[TCA_TAPRIO_TC_ENTRY_FP])
+		fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
 
 	return 0;
 }
@@ -1720,29 +1730,51 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
 				   struct netlink_ext_ack *extack)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 	u32 max_sdu[TC_QOPT_MAX_QUEUE];
+	bool have_preemption = false;
 	unsigned long seen_tcs = 0;
+	u32 fp[TC_QOPT_MAX_QUEUE];
 	struct nlattr *n;
 	int tc, rem;
 	int err = 0;
 
-	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
 		max_sdu[tc] = q->max_sdu[tc];
+		fp[tc] = q->fp[tc];
+	}
 
 	nla_for_each_nested(n, opt, rem) {
 		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
 			continue;
 
-		err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+		err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
 					    extack);
 		if (err)
-			goto out;
+			return err;
 	}
 
-	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
 		q->max_sdu[tc] = max_sdu[tc];
+		q->fp[tc] = fp[tc];
+		if (fp[tc] != TC_FP_EXPRESS)
+			have_preemption = true;
+	}
+
+	if (have_preemption) {
+		if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+			NL_SET_ERR_MSG(extack,
+				       "Preemption only supported with full offload");
+			return -EOPNOTSUPP;
+		}
+
+		if (!ethtool_dev_mm_supported(dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "Device does not support preemption");
+			return -EOPNOTSUPP;
+		}
+	}
 
-out:
 	return err;
 }
 
@@ -2023,7 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
-	int i;
+	int i, tc;
 
 	spin_lock_init(&q->current_entry_lock);
 
@@ -2080,6 +2112,9 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
 		q->qdiscs[i] = qdisc;
 	}
 
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		q->fp[tc] = TC_FP_EXPRESS;
+
 	taprio_detect_broken_mqprio(q);
 
 	return taprio_change(sch, opt, extack);
@@ -2223,6 +2258,7 @@ error_nest:
 }
 
 static int taprio_dump_tc_entries(struct sk_buff *skb,
+				  struct taprio_sched *q,
 				  struct sched_gate_list *sched)
 {
 	struct nlattr *n;
@@ -2240,6 +2276,9 @@ static int taprio_dump_tc_entries(struct sk_buff *skb,
 				sched->max_sdu[tc]))
 			goto nla_put_failure;
 
+		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc]))
+			goto nla_put_failure;
+
 		nla_nest_end(skb, n);
 	}
 
@@ -2281,7 +2320,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;
 
-	if (oper && taprio_dump_tc_entries(skb, oper))
+	if (oper && taprio_dump_tc_entries(skb, q, oper))
 		goto options_error;
 
 	if (oper && dump_schedule(skb, oper))
-- 
cgit v1.2.3


From a2917b23497e4205db32271e4e06e142a9f8a6aa Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:01 -0700
Subject: net: mana: Refactor RX buffer allocation code to prepare for various
 MTU

Move out common buffer allocation code from mana_process_rx_cqe() and
mana_alloc_rx_wqe() to helper functions.
Refactor related variables so they can be changed in one place, and buffer
sizes are in sync.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 154 +++++++++++++++-----------
 include/net/mana/mana.h                       |   6 +-
 2 files changed, 91 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 112c642dc89b..911954ff84ee 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1282,14 +1282,64 @@ drop_xdp:
 	u64_stats_update_end(&rx_stats->syncp);
 
 drop:
-	WARN_ON_ONCE(rxq->xdp_save_page);
-	rxq->xdp_save_page = virt_to_page(buf_va);
+	WARN_ON_ONCE(rxq->xdp_save_va);
+	/* Save for reuse */
+	rxq->xdp_save_va = buf_va;
 
 	++ndev->stats.rx_dropped;
 
 	return;
 }
 
+static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
+			     dma_addr_t *da, bool is_napi)
+{
+	struct page *page;
+	void *va;
+
+	/* Reuse XDP dropped page if available */
+	if (rxq->xdp_save_va) {
+		va = rxq->xdp_save_va;
+		rxq->xdp_save_va = NULL;
+	} else {
+		page = dev_alloc_page();
+		if (!page)
+			return NULL;
+
+		va = page_to_virt(page);
+	}
+
+	*da = dma_map_single(dev, va + XDP_PACKET_HEADROOM, rxq->datasize,
+			     DMA_FROM_DEVICE);
+
+	if (dma_mapping_error(dev, *da)) {
+		put_page(virt_to_head_page(va));
+		return NULL;
+	}
+
+	return va;
+}
+
+/* Allocate frag for rx buffer, and save the old buf */
+static void mana_refill_rxoob(struct device *dev, struct mana_rxq *rxq,
+			      struct mana_recv_buf_oob *rxoob, void **old_buf)
+{
+	dma_addr_t da;
+	void *va;
+
+	va = mana_get_rxfrag(rxq, dev, &da, true);
+
+	if (!va)
+		return;
+
+	dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
+			 DMA_FROM_DEVICE);
+	*old_buf = rxoob->buf_va;
+
+	rxoob->buf_va = va;
+	rxoob->sgl[0].address = da;
+}
+
 static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 				struct gdma_comp *cqe)
 {
@@ -1299,10 +1349,8 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct mana_recv_buf_oob *rxbuf_oob;
 	struct mana_port_context *apc;
 	struct device *dev = gc->dev;
-	void *new_buf, *old_buf;
-	struct page *new_page;
+	void *old_buf = NULL;
 	u32 curr, pktlen;
-	dma_addr_t da;
 
 	apc = netdev_priv(ndev);
 
@@ -1345,40 +1393,11 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	rxbuf_oob = &rxq->rx_oobs[curr];
 	WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
 
-	/* Reuse XDP dropped page if available */
-	if (rxq->xdp_save_page) {
-		new_page = rxq->xdp_save_page;
-		rxq->xdp_save_page = NULL;
-	} else {
-		new_page = alloc_page(GFP_ATOMIC);
-	}
-
-	if (new_page) {
-		da = dma_map_page(dev, new_page, XDP_PACKET_HEADROOM, rxq->datasize,
-				  DMA_FROM_DEVICE);
-
-		if (dma_mapping_error(dev, da)) {
-			__free_page(new_page);
-			new_page = NULL;
-		}
-	}
-
-	new_buf = new_page ? page_to_virt(new_page) : NULL;
-
-	if (new_buf) {
-		dma_unmap_page(dev, rxbuf_oob->buf_dma_addr, rxq->datasize,
-			       DMA_FROM_DEVICE);
-
-		old_buf = rxbuf_oob->buf_va;
-
-		/* refresh the rxbuf_oob with the new page */
-		rxbuf_oob->buf_va = new_buf;
-		rxbuf_oob->buf_dma_addr = da;
-		rxbuf_oob->sgl[0].address = rxbuf_oob->buf_dma_addr;
-	} else {
-		old_buf = NULL; /* drop the packet if no memory */
-	}
+	mana_refill_rxoob(dev, rxq, rxbuf_oob, &old_buf);
 
+	/* Unsuccessful refill will have old_buf == NULL.
+	 * In this case, mana_rx_skb() will drop the packet.
+	 */
 	mana_rx_skb(old_buf, oob, rxq);
 
 drop:
@@ -1659,8 +1678,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 
 	mana_deinit_cq(apc, &rxq->rx_cq);
 
-	if (rxq->xdp_save_page)
-		__free_page(rxq->xdp_save_page);
+	if (rxq->xdp_save_va)
+		put_page(virt_to_head_page(rxq->xdp_save_va));
 
 	for (i = 0; i < rxq->num_rx_buf; i++) {
 		rx_oob = &rxq->rx_oobs[i];
@@ -1668,10 +1687,10 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 		if (!rx_oob->buf_va)
 			continue;
 
-		dma_unmap_page(dev, rx_oob->buf_dma_addr, rxq->datasize,
-			       DMA_FROM_DEVICE);
+		dma_unmap_single(dev, rx_oob->sgl[0].address,
+				 rx_oob->sgl[0].size, DMA_FROM_DEVICE);
 
-		free_page((unsigned long)rx_oob->buf_va);
+		put_page(virt_to_head_page(rx_oob->buf_va));
 		rx_oob->buf_va = NULL;
 	}
 
@@ -1681,6 +1700,26 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 	kfree(rxq);
 }
 
+static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
+			    struct mana_rxq *rxq, struct device *dev)
+{
+	dma_addr_t da;
+	void *va;
+
+	va = mana_get_rxfrag(rxq, dev, &da, false);
+
+	if (!va)
+		return -ENOMEM;
+
+	rx_oob->buf_va = va;
+
+	rx_oob->sgl[0].address = da;
+	rx_oob->sgl[0].size = rxq->datasize;
+	rx_oob->sgl[0].mem_key = mem_key;
+
+	return 0;
+}
+
 #define MANA_WQE_HEADER_SIZE 16
 #define MANA_WQE_SGE_SIZE 16
 
@@ -1690,9 +1729,8 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
 	struct mana_recv_buf_oob *rx_oob;
 	struct device *dev = gc->dev;
-	struct page *page;
-	dma_addr_t da;
 	u32 buf_idx;
+	int ret;
 
 	WARN_ON(rxq->datasize == 0 || rxq->datasize > PAGE_SIZE);
 
@@ -1703,25 +1741,12 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 		rx_oob = &rxq->rx_oobs[buf_idx];
 		memset(rx_oob, 0, sizeof(*rx_oob));
 
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-
-		da = dma_map_page(dev, page, XDP_PACKET_HEADROOM, rxq->datasize,
-				  DMA_FROM_DEVICE);
-
-		if (dma_mapping_error(dev, da)) {
-			__free_page(page);
-			return -ENOMEM;
-		}
-
-		rx_oob->buf_va = page_to_virt(page);
-		rx_oob->buf_dma_addr = da;
-
 		rx_oob->num_sge = 1;
-		rx_oob->sgl[0].address = rx_oob->buf_dma_addr;
-		rx_oob->sgl[0].size = rxq->datasize;
-		rx_oob->sgl[0].mem_key = apc->ac->gdma_dev->gpa_mkey;
+
+		ret = mana_fill_rx_oob(rx_oob, apc->ac->gdma_dev->gpa_mkey, rxq,
+				       dev);
+		if (ret)
+			return ret;
 
 		rx_oob->wqe_req.sgl = rx_oob->sgl;
 		rx_oob->wqe_req.num_sge = rx_oob->num_sge;
@@ -1780,9 +1805,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->ndev = ndev;
 	rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
 	rxq->rxq_idx = rxq_idx;
-	rxq->datasize = ALIGN(MAX_FRAME_SIZE, 64);
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
+	rxq->datasize = ALIGN(ETH_FRAME_LEN, 64);
+
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
 		goto out;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index bb11a6535d80..037bcabf6b98 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -36,9 +36,6 @@ enum TRI_STATE {
 
 #define COMP_ENTRY_SIZE 64
 
-#define ADAPTER_MTU_SIZE 1500
-#define MAX_FRAME_SIZE (ADAPTER_MTU_SIZE + 14)
-
 #define RX_BUFFERS_PER_QUEUE 512
 
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
@@ -282,7 +279,6 @@ struct mana_recv_buf_oob {
 	struct gdma_wqe_request wqe_req;
 
 	void *buf_va;
-	dma_addr_t buf_dma_addr;
 
 	/* SGL of the buffer going to be sent has part of the work request. */
 	u32 num_sge;
@@ -322,7 +318,7 @@ struct mana_rxq {
 
 	struct bpf_prog __rcu *bpf_prog;
 	struct xdp_rxq_info xdp_rxq;
-	struct page *xdp_save_page;
+	void *xdp_save_va; /* for reusing */
 	bool xdp_flush;
 	int xdp_rc; /* XDP redirect return code */
 
-- 
cgit v1.2.3


From 2fbbd712baf1c60996554326728bbdbef5616e12 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:02 -0700
Subject: net: mana: Enable RX path to handle various MTU sizes

Update RX data path to allocate and use RX queue DMA buffers with
proper size based on potentially various MTU sizes.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 38 ++++++++++++++++++++-------
 include/net/mana/mana.h                       |  7 +++++
 2 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 911954ff84ee..8e7fa6e9c3b5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1185,10 +1185,10 @@ static void mana_post_pkt_rxq(struct mana_rxq *rxq)
 	WARN_ON_ONCE(recv_buf_oob->wqe_inf.wqe_size_in_bu != 1);
 }
 
-static struct sk_buff *mana_build_skb(void *buf_va, uint pkt_len,
-				      struct xdp_buff *xdp)
+static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
+				      uint pkt_len, struct xdp_buff *xdp)
 {
-	struct sk_buff *skb = napi_build_skb(buf_va, PAGE_SIZE);
+	struct sk_buff *skb = napi_build_skb(buf_va, rxq->alloc_size);
 
 	if (!skb)
 		return NULL;
@@ -1196,11 +1196,12 @@ static struct sk_buff *mana_build_skb(void *buf_va, uint pkt_len,
 	if (xdp->data_hard_start) {
 		skb_reserve(skb, xdp->data - xdp->data_hard_start);
 		skb_put(skb, xdp->data_end - xdp->data);
-	} else {
-		skb_reserve(skb, XDP_PACKET_HEADROOM);
-		skb_put(skb, pkt_len);
+		return skb;
 	}
 
+	skb_reserve(skb, rxq->headroom);
+	skb_put(skb, pkt_len);
+
 	return skb;
 }
 
@@ -1233,7 +1234,7 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 	if (act != XDP_PASS && act != XDP_TX)
 		goto drop_xdp;
 
-	skb = mana_build_skb(buf_va, pkt_len, &xdp);
+	skb = mana_build_skb(rxq, buf_va, pkt_len, &xdp);
 
 	if (!skb)
 		goto drop;
@@ -1301,6 +1302,14 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 	if (rxq->xdp_save_va) {
 		va = rxq->xdp_save_va;
 		rxq->xdp_save_va = NULL;
+	} else if (rxq->alloc_size > PAGE_SIZE) {
+		if (is_napi)
+			va = napi_alloc_frag(rxq->alloc_size);
+		else
+			va = netdev_alloc_frag(rxq->alloc_size);
+
+		if (!va)
+			return NULL;
 	} else {
 		page = dev_alloc_page();
 		if (!page)
@@ -1309,7 +1318,7 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 		va = page_to_virt(page);
 	}
 
-	*da = dma_map_single(dev, va + XDP_PACKET_HEADROOM, rxq->datasize,
+	*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
 			     DMA_FROM_DEVICE);
 
 	if (dma_mapping_error(dev, *da)) {
@@ -1732,7 +1741,7 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 	u32 buf_idx;
 	int ret;
 
-	WARN_ON(rxq->datasize == 0 || rxq->datasize > PAGE_SIZE);
+	WARN_ON(rxq->datasize == 0);
 
 	*rxq_size = 0;
 	*cq_size = 0;
@@ -1788,6 +1797,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_obj_spec wq_spec;
 	struct mana_obj_spec cq_spec;
+	unsigned int mtu = ndev->mtu;
 	struct gdma_queue_spec spec;
 	struct mana_cq *cq = NULL;
 	struct gdma_context *gc;
@@ -1807,7 +1817,15 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->rxq_idx = rxq_idx;
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
-	rxq->datasize = ALIGN(ETH_FRAME_LEN, 64);
+	rxq->datasize = ALIGN(mtu + ETH_HLEN, 64);
+
+	if (mtu > MANA_XDP_MTU_MAX) {
+		rxq->alloc_size = mtu + MANA_RXBUF_PAD;
+		rxq->headroom = 0;
+	} else {
+		rxq->alloc_size = mtu + MANA_RXBUF_PAD + XDP_PACKET_HEADROOM;
+		rxq->headroom = XDP_PACKET_HEADROOM;
+	}
 
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 037bcabf6b98..fee99d704281 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -291,6 +291,11 @@ struct mana_recv_buf_oob {
 	struct gdma_posted_wqe_info wqe_inf;
 };
 
+#define MANA_RXBUF_PAD (SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) \
+			+ ETH_HLEN)
+
+#define MANA_XDP_MTU_MAX (PAGE_SIZE - MANA_RXBUF_PAD - XDP_PACKET_HEADROOM)
+
 struct mana_rxq {
 	struct gdma_queue *gdma_rq;
 	/* Cache the gdma receive queue id */
@@ -300,6 +305,8 @@ struct mana_rxq {
 	u32 rxq_idx;
 
 	u32 datasize;
+	u32 alloc_size;
+	u32 headroom;
 
 	mana_handle_t rxobj;
 
-- 
cgit v1.2.3


From 80f6215b450eb8e92d8b1f117abf5ecf867f963e Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:03 -0700
Subject: net: mana: Add support for jumbo frame

During probe, get the hardware-allowed max MTU by querying the device
configuration. Users can select MTU up to the device limit.
When XDP is in use, limit MTU settings so the buffer size is within
one page. And, when MTU is set to a too large value, XDP is not allowed
to run.
Also, to prevent changing MTU fails, and leaves the NIC in a bad state,
pre-allocate all buffers before starting the change. So in low memory
condition, it will return error, without affecting the NIC.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_bpf.c |  22 +--
 drivers/net/ethernet/microsoft/mana/mana_en.c  | 217 +++++++++++++++++++++++--
 include/net/mana/gdma.h                        |   4 +
 include/net/mana/mana.h                        |  14 ++
 4 files changed, 233 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
index 3caea631229c..23b1521c0df9 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
@@ -133,12 +133,6 @@ out:
 	return act;
 }
 
-static unsigned int mana_xdp_fraglen(unsigned int len)
-{
-	return SKB_DATA_ALIGN(len) +
-	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-}
-
 struct bpf_prog *mana_xdp_get(struct mana_port_context *apc)
 {
 	ASSERT_RTNL();
@@ -179,17 +173,18 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	struct bpf_prog *old_prog;
-	int buf_max;
+	struct gdma_context *gc;
+
+	gc = apc->ac->gdma_dev->gdma_context;
 
 	old_prog = mana_xdp_get(apc);
 
 	if (!old_prog && !prog)
 		return 0;
 
-	buf_max = XDP_PACKET_HEADROOM + mana_xdp_fraglen(ndev->mtu + ETH_HLEN);
-	if (prog && buf_max > PAGE_SIZE) {
-		netdev_err(ndev, "XDP: mtu:%u too large, buf_max:%u\n",
-			   ndev->mtu, buf_max);
+	if (prog && ndev->mtu > MANA_XDP_MTU_MAX) {
+		netdev_err(ndev, "XDP: mtu:%u too large, mtu_max:%lu\n",
+			   ndev->mtu, MANA_XDP_MTU_MAX);
 		NL_SET_ERR_MSG_MOD(extack, "XDP: mtu too large");
 
 		return -EOPNOTSUPP;
@@ -206,6 +201,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
 	if (apc->port_is_up)
 		mana_chn_setxdp(apc, prog);
 
+	if (prog)
+		ndev->max_mtu = MANA_XDP_MTU_MAX;
+	else
+		ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 8e7fa6e9c3b5..cabecbfa1102 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -427,6 +427,192 @@ static u16 mana_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	return txq;
 }
 
+/* Release pre-allocated RX buffers */
+static void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
+{
+	struct device *dev;
+	int i;
+
+	dev = mpc->ac->gdma_dev->gdma_context->dev;
+
+	if (!mpc->rxbufs_pre)
+		goto out1;
+
+	if (!mpc->das_pre)
+		goto out2;
+
+	while (mpc->rxbpre_total) {
+		i = --mpc->rxbpre_total;
+		dma_unmap_single(dev, mpc->das_pre[i], mpc->rxbpre_datasize,
+				 DMA_FROM_DEVICE);
+		put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
+	}
+
+	kfree(mpc->das_pre);
+	mpc->das_pre = NULL;
+
+out2:
+	kfree(mpc->rxbufs_pre);
+	mpc->rxbufs_pre = NULL;
+
+out1:
+	mpc->rxbpre_datasize = 0;
+	mpc->rxbpre_alloc_size = 0;
+	mpc->rxbpre_headroom = 0;
+}
+
+/* Get a buffer from the pre-allocated RX buffers */
+static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
+{
+	struct net_device *ndev = rxq->ndev;
+	struct mana_port_context *mpc;
+	void *va;
+
+	mpc = netdev_priv(ndev);
+
+	if (!mpc->rxbufs_pre || !mpc->das_pre || !mpc->rxbpre_total) {
+		netdev_err(ndev, "No RX pre-allocated bufs\n");
+		return NULL;
+	}
+
+	/* Check sizes to catch unexpected coding error */
+	if (mpc->rxbpre_datasize != rxq->datasize) {
+		netdev_err(ndev, "rxbpre_datasize mismatch: %u: %u\n",
+			   mpc->rxbpre_datasize, rxq->datasize);
+		return NULL;
+	}
+
+	if (mpc->rxbpre_alloc_size != rxq->alloc_size) {
+		netdev_err(ndev, "rxbpre_alloc_size mismatch: %u: %u\n",
+			   mpc->rxbpre_alloc_size, rxq->alloc_size);
+		return NULL;
+	}
+
+	if (mpc->rxbpre_headroom != rxq->headroom) {
+		netdev_err(ndev, "rxbpre_headroom mismatch: %u: %u\n",
+			   mpc->rxbpre_headroom, rxq->headroom);
+		return NULL;
+	}
+
+	mpc->rxbpre_total--;
+
+	*da = mpc->das_pre[mpc->rxbpre_total];
+	va = mpc->rxbufs_pre[mpc->rxbpre_total];
+	mpc->rxbufs_pre[mpc->rxbpre_total] = NULL;
+
+	/* Deallocate the array after all buffers are gone */
+	if (!mpc->rxbpre_total)
+		mana_pre_dealloc_rxbufs(mpc);
+
+	return va;
+}
+
+/* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
+static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size,
+			       u32 *headroom)
+{
+	if (mtu > MANA_XDP_MTU_MAX)
+		*headroom = 0; /* no support for XDP */
+	else
+		*headroom = XDP_PACKET_HEADROOM;
+
+	*alloc_size = mtu + MANA_RXBUF_PAD + *headroom;
+
+	*datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN);
+}
+
+static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu)
+{
+	struct device *dev;
+	struct page *page;
+	dma_addr_t da;
+	int num_rxb;
+	void *va;
+	int i;
+
+	mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize,
+			   &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom);
+
+	dev = mpc->ac->gdma_dev->gdma_context->dev;
+
+	num_rxb = mpc->num_queues * RX_BUFFERS_PER_QUEUE;
+
+	WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
+	mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
+	if (!mpc->rxbufs_pre)
+		goto error;
+
+	mpc->das_pre = kmalloc_array(num_rxb, sizeof(dma_addr_t), GFP_KERNEL);
+	if (!mpc->das_pre)
+		goto error;
+
+	mpc->rxbpre_total = 0;
+
+	for (i = 0; i < num_rxb; i++) {
+		if (mpc->rxbpre_alloc_size > PAGE_SIZE) {
+			va = netdev_alloc_frag(mpc->rxbpre_alloc_size);
+			if (!va)
+				goto error;
+		} else {
+			page = dev_alloc_page();
+			if (!page)
+				goto error;
+
+			va = page_to_virt(page);
+		}
+
+		da = dma_map_single(dev, va + mpc->rxbpre_headroom,
+				    mpc->rxbpre_datasize, DMA_FROM_DEVICE);
+
+		if (dma_mapping_error(dev, da)) {
+			put_page(virt_to_head_page(va));
+			goto error;
+		}
+
+		mpc->rxbufs_pre[i] = va;
+		mpc->das_pre[i] = da;
+		mpc->rxbpre_total = i + 1;
+	}
+
+	return 0;
+
+error:
+	mana_pre_dealloc_rxbufs(mpc);
+	return -ENOMEM;
+}
+
+static int mana_change_mtu(struct net_device *ndev, int new_mtu)
+{
+	struct mana_port_context *mpc = netdev_priv(ndev);
+	unsigned int old_mtu = ndev->mtu;
+	int err;
+
+	/* Pre-allocate buffers to prevent failure in mana_attach later */
+	err = mana_pre_alloc_rxbufs(mpc, new_mtu);
+	if (err) {
+		netdev_err(ndev, "Insufficient memory for new MTU\n");
+		return err;
+	}
+
+	err = mana_detach(ndev, false);
+	if (err) {
+		netdev_err(ndev, "mana_detach failed: %d\n", err);
+		goto out;
+	}
+
+	ndev->mtu = new_mtu;
+
+	err = mana_attach(ndev);
+	if (err) {
+		netdev_err(ndev, "mana_attach failed: %d\n", err);
+		ndev->mtu = old_mtu;
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(mpc);
+	return err;
+}
+
 static const struct net_device_ops mana_devops = {
 	.ndo_open		= mana_open,
 	.ndo_stop		= mana_close,
@@ -436,6 +622,7 @@ static const struct net_device_ops mana_devops = {
 	.ndo_get_stats64	= mana_get_stats64,
 	.ndo_bpf		= mana_bpf,
 	.ndo_xdp_xmit		= mana_xdp_xmit,
+	.ndo_change_mtu		= mana_change_mtu,
 };
 
 static void mana_cleanup_port_context(struct mana_port_context *apc)
@@ -625,6 +812,9 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 
 	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG,
 			     sizeof(req), sizeof(resp));
+
+	req.hdr.resp.msg_version = GDMA_MESSAGE_V2;
+
 	req.proto_major_ver = proto_major_ver;
 	req.proto_minor_ver = proto_minor_ver;
 	req.proto_micro_ver = proto_micro_ver;
@@ -647,6 +837,11 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 
 	*max_num_vports = resp.max_num_vports;
 
+	if (resp.hdr.response.msg_version == GDMA_MESSAGE_V2)
+		gc->adapter_mtu = resp.adapter_mtu;
+	else
+		gc->adapter_mtu = ETH_FRAME_LEN;
+
 	return 0;
 }
 
@@ -1712,10 +1907,14 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
 			    struct mana_rxq *rxq, struct device *dev)
 {
+	struct mana_port_context *mpc = netdev_priv(rxq->ndev);
 	dma_addr_t da;
 	void *va;
 
-	va = mana_get_rxfrag(rxq, dev, &da, false);
+	if (mpc->rxbufs_pre)
+		va = mana_get_rxbuf_pre(rxq, &da);
+	else
+		va = mana_get_rxfrag(rxq, dev, &da, false);
 
 	if (!va)
 		return -ENOMEM;
@@ -1797,7 +1996,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_obj_spec wq_spec;
 	struct mana_obj_spec cq_spec;
-	unsigned int mtu = ndev->mtu;
 	struct gdma_queue_spec spec;
 	struct mana_cq *cq = NULL;
 	struct gdma_context *gc;
@@ -1817,15 +2015,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->rxq_idx = rxq_idx;
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
-	rxq->datasize = ALIGN(mtu + ETH_HLEN, 64);
-
-	if (mtu > MANA_XDP_MTU_MAX) {
-		rxq->alloc_size = mtu + MANA_RXBUF_PAD;
-		rxq->headroom = 0;
-	} else {
-		rxq->alloc_size = mtu + MANA_RXBUF_PAD + XDP_PACKET_HEADROOM;
-		rxq->headroom = XDP_PACKET_HEADROOM;
-	}
+	mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
+			   &rxq->headroom);
 
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
@@ -2238,8 +2429,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	ndev->netdev_ops = &mana_devops;
 	ndev->ethtool_ops = &mana_ethtool_ops;
 	ndev->mtu = ETH_DATA_LEN;
-	ndev->max_mtu = ndev->mtu;
-	ndev->min_mtu = ndev->mtu;
+	ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
+	ndev->min_mtu = ETH_MIN_MTU;
 	ndev->needed_headroom = MANA_HEADROOM;
 	ndev->dev_port = port_idx;
 	SET_NETDEV_DEV(ndev, gc->dev);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 56189e4252da..96c120160f15 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -145,6 +145,7 @@ struct gdma_general_req {
 }; /* HW DATA */
 
 #define GDMA_MESSAGE_V1 1
+#define GDMA_MESSAGE_V2 2
 
 struct gdma_general_resp {
 	struct gdma_resp_hdr hdr;
@@ -354,6 +355,9 @@ struct gdma_context {
 	struct gdma_resource	msix_resource;
 	struct gdma_irq_context	*irq_contexts;
 
+	/* L2 MTU */
+	u16 adapter_mtu;
+
 	/* This maps a CQ index to the queue structure. */
 	unsigned int		max_num_cqs;
 	struct gdma_queue	**cq_table;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index fee99d704281..cd386aa7c7cc 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -37,6 +37,7 @@ enum TRI_STATE {
 #define COMP_ENTRY_SIZE 64
 
 #define RX_BUFFERS_PER_QUEUE 512
+#define MANA_RX_DATA_ALIGN 64
 
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
 
@@ -390,6 +391,14 @@ struct mana_port_context {
 	/* This points to an array of num_queues of RQ pointers. */
 	struct mana_rxq **rxqs;
 
+	/* pre-allocated rx buffer array */
+	void **rxbufs_pre;
+	dma_addr_t *das_pre;
+	int rxbpre_total;
+	u32 rxbpre_datasize;
+	u32 rxbpre_alloc_size;
+	u32 rxbpre_headroom;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
@@ -489,6 +498,11 @@ struct mana_query_device_cfg_resp {
 	u16 max_num_vports;
 	u16 reserved;
 	u32 max_num_eqs;
+
+	/* response v2: */
+	u16 adapter_mtu;
+	u16 reserved2;
+	u32 reserved3;
 }; /* HW DATA */
 
 /* Query vPort Configuration */
-- 
cgit v1.2.3


From ef9f643a9f8b62bcbcc51f0e0af8599adc2e17ed Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Fri, 14 Apr 2023 10:14:52 +0200
Subject: cacheinfo: Add use_arch[|_cache]_info field/function

The cache information can be extracted from either a Device
Tree (DT), the PPTT ACPI table, or arch registers (clidr_el1
for arm64).

The clidr_el1 register is used only if DT/ACPI information is not
available. It does not states how caches are shared among CPUs.

Add a use_arch_cache_info field/function to identify when the
DT/ACPI doesn't provide cache information. Use this information
to assume L1 caches are privates and L2 and higher are shared among
all CPUs.

Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://lore.kernel.org/r/20230414081453.244787-5-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/cacheinfo.c  | 12 ++++++++++--
 include/linux/cacheinfo.h |  6 ++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index f16e5a82f0f3..45e36721bc24 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -28,6 +28,9 @@ static DEFINE_PER_CPU(struct cpu_cacheinfo, ci_cpu_cacheinfo);
 #define per_cpu_cacheinfo_idx(cpu, idx)		\
 				(per_cpu_cacheinfo(cpu) + (idx))
 
+/* Set if no cache information is found in DT/ACPI. */
+static bool use_arch_info;
+
 struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu)
 {
 	return ci_cacheinfo(cpu);
@@ -40,7 +43,8 @@ static inline bool cache_leaves_are_shared(struct cacheinfo *this_leaf,
 	 * For non DT/ACPI systems, assume unique level 1 caches,
 	 * system-wide shared caches for all other levels.
 	 */
-	if (!(IS_ENABLED(CONFIG_OF) || IS_ENABLED(CONFIG_ACPI)))
+	if (!(IS_ENABLED(CONFIG_OF) || IS_ENABLED(CONFIG_ACPI)) ||
+	    use_arch_info)
 		return (this_leaf->level != 1) && (sib_leaf->level != 1);
 
 	if ((sib_leaf->attributes & CACHE_ID) &&
@@ -343,6 +347,10 @@ static int cache_setup_properties(unsigned int cpu)
 	else if (!acpi_disabled)
 		ret = cache_setup_acpi(cpu);
 
+	// Assume there is no cache information available in DT/ACPI from now.
+	if (ret && use_arch_cache_info())
+		use_arch_info = true;
+
 	return ret;
 }
 
@@ -361,7 +369,7 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
 	 * to update the shared cpu_map if the cache attributes were
 	 * populated early before all the cpus are brought online
 	 */
-	if (!last_level_cache_is_valid(cpu)) {
+	if (!last_level_cache_is_valid(cpu) && !use_arch_info) {
 		ret = cache_setup_properties(cpu);
 		if (ret)
 			return ret;
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 6147b2672555..a5cfd44fab45 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -131,4 +131,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level)
 	return -1;
 }
 
+#ifdef CONFIG_ARM64
+#define use_arch_cache_info()	(true)
+#else
+#define use_arch_cache_info()	(false)
+#endif
+
 #endif /* _LINUX_CACHEINFO_H */
-- 
cgit v1.2.3


From 01f727cdc4dbecd36c6722977ff9535f16c11751 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Apr 2023 12:48:42 +0800
Subject: crypto: api - Move low-level functions into algapi.h

A number of low-level functions were exposed in crypto.h.  Move
them into algapi.h (and internal.h).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/internal.h       |  2 ++
 crypto/tcrypt.c         | 11 +++++++----
 include/crypto/algapi.h | 14 ++++++++++++++
 include/linux/crypto.h  | 30 +++---------------------------
 4 files changed, 26 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/crypto/internal.h b/crypto/internal.h
index 932f0aafddc3..f84dfe6491e5 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -47,6 +47,8 @@ extern struct list_head crypto_alg_list;
 extern struct rw_semaphore crypto_alg_sem;
 extern struct blocking_notifier_head crypto_chain;
 
+int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
+
 #ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
 static inline bool crypto_boot_test_finished(void)
 {
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 6521feec7756..202ca1a3105d 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -25,14 +25,17 @@
 #include <linux/err.h>
 #include <linux/fips.h>
 #include <linux/init.h>
-#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/moduleparam.h>
-#include <linux/jiffies.h>
 #include <linux/timex.h>
-#include <linux/interrupt.h>
+
+#include "internal.h"
 #include "tcrypt.h"
 
 /*
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index bbf8c43c3320..016d5a302b84 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -49,6 +49,7 @@ struct crypto_instance;
 struct module;
 struct notifier_block;
 struct rtattr;
+struct scatterlist;
 struct seq_file;
 struct sk_buff;
 
@@ -132,6 +133,14 @@ struct crypto_attr_type {
 	u32 mask;
 };
 
+/*
+ * Algorithm registration interface.
+ */
+int crypto_register_alg(struct crypto_alg *alg);
+void crypto_unregister_alg(struct crypto_alg *alg);
+int crypto_register_algs(struct crypto_alg *algs, int count);
+void crypto_unregister_algs(struct crypto_alg *algs, int count);
+
 void crypto_mod_put(struct crypto_alg *alg);
 
 int crypto_register_template(struct crypto_template *tmpl);
@@ -263,4 +272,9 @@ static inline void crypto_request_complete(struct crypto_async_request *req,
 	req->complete(req->data, err);
 }
 
+static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
+{
+	return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK;
+}
+
 #endif	/* _CRYPTO_ALGAPI_H */
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index d57597ebef6e..fdfa3e8eda43 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -12,13 +12,10 @@
 #ifndef _LINUX_CRYPTO_H
 #define _LINUX_CRYPTO_H
 
-#include <linux/atomic.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/bug.h>
+#include <linux/completion.h>
 #include <linux/refcount.h>
 #include <linux/slab.h>
-#include <linux/completion.h>
+#include <linux/types.h>
 
 /*
  * Algorithm masks and types.
@@ -158,10 +155,9 @@
 
 #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))
 
-struct scatterlist;
-struct crypto_async_request;
 struct crypto_tfm;
 struct crypto_type;
+struct module;
 
 typedef void (*crypto_completion_t)(void *req, int err);
 
@@ -411,14 +407,6 @@ static inline void crypto_init_wait(struct crypto_wait *wait)
 	init_completion(&wait->completion);
 }
 
-/*
- * Algorithm registration interface.
- */
-int crypto_register_alg(struct crypto_alg *alg);
-void crypto_unregister_alg(struct crypto_alg *alg);
-int crypto_register_algs(struct crypto_alg *algs, int count);
-void crypto_unregister_algs(struct crypto_alg *algs, int count);
-
 /*
  * Algorithm query interface.
  */
@@ -459,8 +447,6 @@ static inline void crypto_free_tfm(struct crypto_tfm *tfm)
 	return crypto_destroy_tfm(tfm, tfm);
 }
 
-int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
-
 /*
  * Transform helpers which query the underlying algorithm.
  */
@@ -474,16 +460,6 @@ static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
 	return tfm->__crt_alg->cra_driver_name;
 }
 
-static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm)
-{
-	return tfm->__crt_alg->cra_priority;
-}
-
-static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
-{
-	return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK;
-}
-
 static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
 {
 	return tfm->__crt_alg->cra_blocksize;
-- 
cgit v1.2.3


From a19c61b06585f71c4dc1303fe6a3af79dfe33678 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 3 Apr 2023 13:32:12 -0500
Subject: crypto: ccp - Return doorbell status code as an argument

If the doorbell failed to ring we return -EIO, but the caller can't
determine why it failed.  Pass the reason for the failure in an
argument for caller to investigate.

Suggested-by: Mark Hasemeyer <markhas@chromium.org>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Mark Hasemeyer <markhas@chromium.org>
Tested-by: Mark Hasemeyer <markhas@chromium.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccp/platform-access.c | 4 +++-
 include/linux/psp-platform-access.h  | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/ccp/platform-access.c b/drivers/crypto/ccp/platform-access.c
index 1cc154a1c6ab..48f59ae91692 100644
--- a/drivers/crypto/ccp/platform-access.c
+++ b/drivers/crypto/ccp/platform-access.c
@@ -132,7 +132,7 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(psp_send_platform_access_msg);
 
-int psp_ring_platform_doorbell(int msg)
+int psp_ring_platform_doorbell(int msg, u32 *result)
 {
 	struct psp_device *psp = psp_get_master_device();
 	struct psp_platform_access_device *pa_dev;
@@ -164,6 +164,8 @@ int psp_ring_platform_doorbell(int msg)
 
 	val = FIELD_GET(PSP_CMDRESP_STS, ioread32(cmd));
 	if (val) {
+		if (result)
+			*result = val;
 		ret = -EIO;
 		goto unlock;
 	}
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
index 89df4549fada..1b661341d8f3 100644
--- a/include/linux/psp-platform-access.h
+++ b/include/linux/psp-platform-access.h
@@ -45,9 +45,9 @@ int psp_send_platform_access_msg(enum psp_platform_access_msg, struct psp_reques
  *  -%EBUSY:     mailbox in recovery or in use
  *  -%ENODEV:    driver not bound with PSP device
  *  -%ETIMEDOUT: request timed out
- *  -%EIO:       unknown error (see kernel log)
+ *  -%EIO:       error will be stored in result argument
  */
-int psp_ring_platform_doorbell(int msg);
+int psp_ring_platform_doorbell(int msg, u32 *result);
 
 /**
  * psp_check_platform_access_status() - Checks whether platform features is ready
-- 
cgit v1.2.3


From e223864f8257afde5e23eca4c006a0d69581a7a2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 12 Apr 2023 11:10:45 -0300
Subject: iommu: Make iommu_release_device() static

This is not called outside the core code, and indeed cannot be called
correctly outside the bus notifier. Make it static.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/0-v1-c3da18124d2d+56-rm_iommu_release_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 3 ++-
 include/linux/iommu.h | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7abee83610b6..435fc902df19 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -87,6 +87,7 @@ static const char * const iommu_group_resv_type_string[] = {
 
 static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data);
+static void iommu_release_device(struct device *dev);
 static int iommu_alloc_default_domain(struct iommu_group *group,
 				      struct device *dev);
 static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
@@ -493,7 +494,7 @@ static void __iommu_group_release_device(struct iommu_group *group,
 	kobject_put(group->devices_kobj);
 }
 
-void iommu_release_device(struct device *dev)
+static void iommu_release_device(struct device *dev)
 {
 	struct iommu_group *group = dev->iommu_group;
 	struct group_device *device;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 54f535ff9868..c892e06f8357 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -699,7 +699,6 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv)
 }
 
 int iommu_probe_device(struct device *dev);
-void iommu_release_device(struct device *dev);
 
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f);
-- 
cgit v1.2.3


From f7f9c054a227ad4922070d748b1f4fc4b5657329 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 12 Apr 2023 11:11:58 -0300
Subject: iommu: Remove iommu_group_get_by_id()

This is never called.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/0-v1-60bbc66d7e92+24-rm_iommu_get_by_id_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 29 -----------------------------
 include/linux/iommu.h |  6 ------
 2 files changed, 35 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 435fc902df19..f35058f1d68e 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -882,35 +882,6 @@ struct iommu_group *iommu_group_alloc(void)
 }
 EXPORT_SYMBOL_GPL(iommu_group_alloc);
 
-struct iommu_group *iommu_group_get_by_id(int id)
-{
-	struct kobject *group_kobj;
-	struct iommu_group *group;
-	const char *name;
-
-	if (!iommu_group_kset)
-		return NULL;
-
-	name = kasprintf(GFP_KERNEL, "%d", id);
-	if (!name)
-		return NULL;
-
-	group_kobj = kset_find_obj(iommu_group_kset, name);
-	kfree(name);
-
-	if (!group_kobj)
-		return NULL;
-
-	group = container_of(group_kobj, struct iommu_group, kobj);
-	BUG_ON(group->id != id);
-
-	kobject_get(group->devices_kobj);
-	kobject_put(&group->kobj);
-
-	return group;
-}
-EXPORT_SYMBOL_GPL(iommu_group_get_by_id);
-
 /**
  * iommu_group_get_iommudata - retrieve iommu_data registered for a group
  * @group: the group
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c892e06f8357..7dbdd13d7ce0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -460,7 +460,6 @@ extern bool iommu_present(struct bus_type *bus);
 extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_group_has_isolated_msi(struct iommu_group *group);
 extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
-extern struct iommu_group *iommu_group_get_by_id(int id);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
 			       struct device *dev);
@@ -746,11 +745,6 @@ static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 	return NULL;
 }
 
-static inline struct iommu_group *iommu_group_get_by_id(int id)
-{
-	return NULL;
-}
-
 static inline void iommu_domain_free(struct iommu_domain *domain)
 {
 }
-- 
cgit v1.2.3


From cc4cffc3c142d57df48c07851862444e1d33bdaa Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Fri, 7 Apr 2023 22:37:52 +0200
Subject: wifi: brcmfmac: add Cypress 43439 SDIO ids

Add SDIO ids for use with the muRata 1YN (Cypress CYW43439).
The odd thing about this is that the previous 1YN populated
on M.2 card for evaluation purposes had BRCM SDIO vendor ID,
while the chip populated on real hardware has a Cypress one.
The device ID also differs between the two devices. But they
are both 43439 otherwise, so add the IDs for both.

On-device 1YN (43439), the new one, chip label reads "1YN":
```
/sys/.../mmc_host/mmc2/mmc2:0001 # cat vendor device
0x04b4
0xbd3d
```

EA M.2 evaluation board 1YN (43439), the old one, chip label reads "1YN ES1.4":
```
/sys/.../mmc_host/mmc0/mmc0:0001/# cat vendor device
0x02d0
0xa9a6
```

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20230407203752.128539-1-marex@denx.de
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 9 ++++++++-
 include/linux/mmc/sdio_ids.h                              | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index b7c918f241c9..f5dc3bb11b64 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -965,6 +965,12 @@ out:
 		.driver_data = BRCMF_FWVENDOR_ ## fw_vend \
 	}
 
+#define CYW_SDIO_DEVICE(dev_id, fw_vend) \
+	{ \
+		SDIO_DEVICE(SDIO_VENDOR_ID_CYPRESS, dev_id), \
+		.driver_data = BRCMF_FWVENDOR_ ## fw_vend \
+	}
+
 /* devices we support, null terminated */
 static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43143, WCC),
@@ -979,6 +985,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430, WCC),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43439, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43455, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354, WCC),
@@ -986,9 +993,9 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012, CYW),
-	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359, CYW),
+	CYW_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW),
 	{ /* end: all zeroes */ }
 };
 MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids);
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 66f503ed2448..c653accdc7fd 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -74,10 +74,13 @@
 #define SDIO_DEVICE_ID_BROADCOM_43362		0xa962
 #define SDIO_DEVICE_ID_BROADCOM_43364		0xa9a4
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
-#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439	0xa9af
+#define SDIO_DEVICE_ID_BROADCOM_43439		0xa9af
 #define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752	0xaae8
 
+#define SDIO_VENDOR_ID_CYPRESS			0x04b4
+#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439	0xbd3d
+
 #define SDIO_VENDOR_ID_MARVELL			0x02df
 #define SDIO_DEVICE_ID_MARVELL_LIBERTAS		0x9103
 #define SDIO_DEVICE_ID_MARVELL_8688_WLAN	0x9104
-- 
cgit v1.2.3


From e0b081d17a9f4e5c0cbb0e5fbeb1abe3de0f7e4e Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 10:24:07 -0700
Subject: sched: Fix KCSAN noinstr violation

With KCSAN enabled, end_of_stack() can get out-of-lined.  Force it
inline.

Fixes the following warnings:

  vmlinux.o: warning: objtool: check_stackleak_irqoff+0x2b: call to end_of_stack() leaves .noinstr.text section

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/cc1b4d73d3a428a00d206242a68fdf99a934ca7b.1681320026.git.jpoimboe@kernel.org
---
 include/linux/sched/task_stack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 5e799a47431e..f158b025c175 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -23,7 +23,7 @@ static __always_inline void *task_stack_page(const struct task_struct *task)
 
 #define setup_thread_stack(new,old)	do { } while(0)
 
-static inline unsigned long *end_of_stack(const struct task_struct *task)
+static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
 {
 #ifdef CONFIG_STACK_GROWSUP
 	return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
-- 
cgit v1.2.3


From e8deb00c0c4808654d1bf96a8f79cf1deb59b631 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 10:24:06 -0700
Subject: context_tracking: Fix KCSAN noinstr violation

With KCSAN enabled, even empty inline stubs can be out-of-lined.

Force the context_tracking_guest_exit() stub inline.

Fixes the following warnings:

  vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1be: call to context_tracking_guest_exit() leaves .noinstr.text section
  vmlinux.o: warning: objtool: svm_vcpu_enter_exit+0x85: call to context_tracking_guest_exit() leaves .noinstr.text section

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/dc93f45abdec90c171108b4b590b7fff5790963c.1681320026.git.jpoimboe@kernel.org
---
 include/linux/context_tracking.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index d4afa8508a80..5ae3abd767b4 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -97,7 +97,7 @@ static inline int exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
 static inline int ct_state(void) { return -1; }
 static __always_inline bool context_tracking_guest_enter(void) { return false; }
-static inline void context_tracking_guest_exit(void) { }
+static __always_inline void context_tracking_guest_exit(void) { }
 #define CT_WARN_ON(cond) do { } while (0)
 #endif /* !CONFIG_CONTEXT_TRACKING_USER */
 
-- 
cgit v1.2.3


From 9ea7e6b62c2bd2f7bbfc3f10099df803002dd33b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 16:49:31 -0700
Subject: init: Mark [arch_call_]rest_init() __noreturn

In preparation for improving objtool's handling of weak noreturn
functions, mark start_kernel(), arch_call_rest_init(), and rest_init()
__noreturn.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/7194ed8a989a85b98d92e62df660f4a90435a723.1681342859.git.jpoimboe@kernel.org
---
 arch/s390/kernel/setup.c     | 2 +-
 include/linux/start_kernel.h | 4 ++--
 init/main.c                  | 4 ++--
 tools/objtool/check.c        | 2 ++
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 8ec5cdf9dadc..4259b6c50516 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -396,7 +396,7 @@ int __init arch_early_irq_init(void)
 	return 0;
 }
 
-void __init arch_call_rest_init(void)
+void __init __noreturn arch_call_rest_init(void)
 {
 	unsigned long stack;
 
diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h
index 8b369a41c03c..864921e54c92 100644
--- a/include/linux/start_kernel.h
+++ b/include/linux/start_kernel.h
@@ -9,7 +9,7 @@
    up something else. */
 
 extern asmlinkage void __init start_kernel(void);
-extern void __init arch_call_rest_init(void);
-extern void __ref rest_init(void);
+extern void __init __noreturn arch_call_rest_init(void);
+extern void __ref __noreturn rest_init(void);
 
 #endif /* _LINUX_START_KERNEL_H */
diff --git a/init/main.c b/init/main.c
index 4425d1783d5c..161ed956d738 100644
--- a/init/main.c
+++ b/init/main.c
@@ -683,7 +683,7 @@ static void __init setup_command_line(char *command_line)
 
 static __initdata DECLARE_COMPLETION(kthreadd_done);
 
-noinline void __ref rest_init(void)
+noinline void __ref __noreturn rest_init(void)
 {
 	struct task_struct *tsk;
 	int pid;
@@ -889,7 +889,7 @@ static int __init early_randomize_kstack_offset(char *buf)
 early_param("randomize_kstack_offset", early_randomize_kstack_offset);
 #endif
 
-void __init __weak arch_call_rest_init(void)
+void __init __weak __noreturn arch_call_rest_init(void)
 {
 	rest_init();
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index df634dafefc4..3d7227f0ea2a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"__reiserfs_panic",
 		"__stack_chk_fail",
 		"__ubsan_handle_builtin_unreachable",
+		"arch_call_rest_init",
 		"arch_cpu_idle_dead",
 		"cpu_bringup_and_idle",
 		"cpu_startup_entry",
@@ -217,6 +218,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"machine_real_restart",
 		"make_task_dead",
 		"panic",
+		"rest_init",
 		"rewind_stack_and_make_dead",
 		"sev_es_terminate",
 		"snp_abort",
-- 
cgit v1.2.3


From 25a6917ca63ad4470bf88535c56f0dec72b570fe Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 16:49:32 -0700
Subject: init: Mark start_kernel() __noreturn

Now that arch_call_rest_init() is __noreturn, mark its caller
start_kernel() __noreturn.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/7069acf026a195f26a88061227fba5a3b0337b9a.1681342859.git.jpoimboe@kernel.org
---
 include/linux/start_kernel.h | 2 +-
 init/main.c                  | 2 +-
 tools/objtool/check.c        | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h
index 864921e54c92..a9806a44a605 100644
--- a/include/linux/start_kernel.h
+++ b/include/linux/start_kernel.h
@@ -8,7 +8,7 @@
 /* Define the prototype for start_kernel here, rather than cluttering
    up something else. */
 
-extern asmlinkage void __init start_kernel(void);
+extern asmlinkage void __init __noreturn start_kernel(void);
 extern void __init __noreturn arch_call_rest_init(void);
 extern void __ref __noreturn rest_init(void);
 
diff --git a/init/main.c b/init/main.c
index 161ed956d738..65aab4e9bb49 100644
--- a/init/main.c
+++ b/init/main.c
@@ -937,7 +937,7 @@ static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
-asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
+asmlinkage __visible void __init __no_sanitize_address __noreturn start_kernel(void)
 {
 	char *command_line;
 	char *after_dashes;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3d7227f0ea2a..9aaad9dc6c20 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -222,6 +222,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"rewind_stack_and_make_dead",
 		"sev_es_terminate",
 		"snp_abort",
+		"start_kernel",
 		"stop_this_cpu",
 		"usercopy_abort",
 		"xen_cpu_bringup_again",
-- 
cgit v1.2.3


From 7412a60decec2a6744cf773280ff17a0f89e8395 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 16:49:35 -0700
Subject: cpu: Mark panic_smp_self_stop() __noreturn

In preparation for improving objtool's handling of weak noreturn
functions, mark panic_smp_self_stop() __noreturn.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/92d76ab5c8bf660f04fdcd3da1084519212de248.1681342859.git.jpoimboe@kernel.org
---
 arch/arm/kernel/smp.c          | 2 +-
 arch/arm64/include/asm/smp.h   | 1 -
 arch/arm64/kernel/smp.c        | 2 +-
 arch/powerpc/kernel/setup_64.c | 2 +-
 include/linux/smp.h            | 2 +-
 kernel/panic.c                 | 2 +-
 tools/objtool/check.c          | 1 +
 7 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index d6be4507d22d..f4a4ac028b6b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -779,7 +779,7 @@ void smp_send_stop(void)
  * kdump fails. So split out the panic_smp_self_stop() and add
  * set_cpu_online(smp_processor_id(), false).
  */
-void panic_smp_self_stop(void)
+void __noreturn panic_smp_self_stop(void)
 {
 	pr_debug("CPU %u will stop doing anything useful since another CPU has paniced\n",
 	         smp_processor_id());
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 07f4ea1490f4..f2d26235bfb4 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -143,7 +143,6 @@ bool cpus_are_stuck_in_kernel(void);
 
 extern void crash_smp_send_stop(void);
 extern bool smp_crash_stop_failed(void);
-extern void panic_smp_self_stop(void);
 
 #endif /* ifndef __ASSEMBLY__ */
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 07d156fddb5f..05fe797e4203 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -830,7 +830,7 @@ static void __noreturn local_cpu_stop(void)
  * that cpu_online_mask gets correctly updated and smp_send_stop() can skip
  * CPUs that have already stopped themselves.
  */
-void panic_smp_self_stop(void)
+void __noreturn panic_smp_self_stop(void)
 {
 	local_cpu_stop();
 }
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b2e0d3ce4261..246201d0d879 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -480,7 +480,7 @@ void early_setup_secondary(void)
 
 #endif /* CONFIG_SMP */
 
-void panic_smp_self_stop(void)
+void __noreturn panic_smp_self_stop(void)
 {
 	hard_irq_disable();
 	spin_begin();
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a80ab58ae3f1..2a737b39cf0a 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -59,7 +59,7 @@ int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
  * Cpus stopping functions in panic. All have default weak definitions.
  * Architecture-dependent code may override them.
  */
-void panic_smp_self_stop(void);
+void __noreturn panic_smp_self_stop(void);
 void nmi_panic_self_stop(struct pt_regs *regs);
 void crash_smp_send_stop(void);
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 5cfea8302d23..5e4982db8dc9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -141,7 +141,7 @@ EXPORT_SYMBOL(panic_blink);
 /*
  * Stop ourself in panic -- architecture code may override this
  */
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
 {
 	while (1)
 		cpu_relax();
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index a436bc1e63d9..4b52ed6bff66 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -218,6 +218,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"machine_real_restart",
 		"make_task_dead",
 		"panic",
+		"panic_smp_self_stop",
 		"rest_init",
 		"rewind_stack_and_make_dead",
 		"sev_es_terminate",
-- 
cgit v1.2.3


From 27dea14c7f05106f39850a9239874cd38703b405 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Apr 2023 16:49:36 -0700
Subject: cpu: Mark nmi_panic_self_stop() __noreturn

In preparation for improving objtool's handling of weak noreturn
functions, mark nmi_panic_self_stop() __noreturn.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/316fc6dfab5a8c4e024c7185484a1ee5fb0afb79.1681342859.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/reboot.h | 1 -
 arch/x86/kernel/reboot.c      | 2 +-
 include/linux/smp.h           | 2 +-
 kernel/panic.c                | 2 +-
 tools/objtool/check.c         | 1 +
 5 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index bc5b4d788c08..9177b4354c3f 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -28,7 +28,6 @@ void __noreturn machine_real_restart(unsigned int type);
 void cpu_emergency_disable_virtualization(void);
 
 typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
-void nmi_panic_self_stop(struct pt_regs *regs);
 void nmi_shootdown_cpus(nmi_shootdown_cb callback);
 void run_crash_ipi_callback(struct pt_regs *regs);
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d03c551defcc..3adbe97015c1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -920,7 +920,7 @@ void run_crash_ipi_callback(struct pt_regs *regs)
 }
 
 /* Override the weak function in kernel/panic.c */
-void nmi_panic_self_stop(struct pt_regs *regs)
+void __noreturn nmi_panic_self_stop(struct pt_regs *regs)
 {
 	while (1) {
 		/* If no CPU is preparing crash dump, we simply loop here. */
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 2a737b39cf0a..7b93504eed26 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -60,7 +60,7 @@ int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
  * Architecture-dependent code may override them.
  */
 void __noreturn panic_smp_self_stop(void);
-void nmi_panic_self_stop(struct pt_regs *regs);
+void __noreturn nmi_panic_self_stop(struct pt_regs *regs);
 void crash_smp_send_stop(void);
 
 /*
diff --git a/kernel/panic.c b/kernel/panic.c
index 5e4982db8dc9..886d2ebd0a0d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -151,7 +151,7 @@ void __weak __noreturn panic_smp_self_stop(void)
  * Stop ourselves in NMI context if another CPU has already panicked. Arch code
  * may override this to prepare for crash dumping, e.g. save regs info.
  */
-void __weak nmi_panic_self_stop(struct pt_regs *regs)
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
 {
 	panic_smp_self_stop();
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4b52ed6bff66..8d073bfd7d7f 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -217,6 +217,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"lbug_with_loc",
 		"machine_real_restart",
 		"make_task_dead",
+		"nmi_panic_self_stop",
 		"panic",
 		"panic_smp_self_stop",
 		"rest_init",
-- 
cgit v1.2.3


From 13f6facf3faeed34ca381aef4c9b153c7aed3972 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Fri, 14 Apr 2023 12:07:27 -0400
Subject: dm: allow targets to require splitting WRITE_ZEROES and SECURE_ERASE

Introduce max_write_zeroes_granularity and
max_secure_erase_granularity flags in the dm_target struct.

If a target sets these then DM core will split IO of these operation
types accordingly (in terms of max_write_zeroes_sectors and
max_secure_erase_sectors respectively).

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm.c               | 10 ++++++----
 include/linux/device-mapper.h | 16 ++++++++++++++--
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 20c6b72a0245..244ebb8c316b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1614,21 +1614,23 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
 {
 	unsigned int num_bios = 0;
 	unsigned int max_granularity = 0;
+	struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
 
 	switch (bio_op(ci->bio)) {
 	case REQ_OP_DISCARD:
 		num_bios = ti->num_discard_bios;
-		if (ti->max_discard_granularity) {
-			struct queue_limits *limits =
-				dm_get_queue_limits(ti->table->md);
+		if (ti->max_discard_granularity)
 			max_granularity = limits->max_discard_sectors;
-		}
 		break;
 	case REQ_OP_SECURE_ERASE:
 		num_bios = ti->num_secure_erase_bios;
+		if (ti->max_secure_erase_granularity)
+			max_granularity = limits->max_secure_erase_sectors;
 		break;
 	case REQ_OP_WRITE_ZEROES:
 		num_bios = ti->num_write_zeroes_bios;
+		if (ti->max_write_zeroes_granularity)
+			max_granularity = limits->max_write_zeroes_sectors;
 		break;
 	default:
 		break;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index f2d9afb8a7e9..983f1f0402b5 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -359,11 +359,23 @@ struct dm_target {
 	bool discards_supported:1;
 
 	/*
-	 * Set if this target requires that discards be split on both
-	 * 'discard_granularity' and 'max_discard_sectors' boundaries.
+	 * Set if this target requires that discards be split on
+	 * 'max_discard_sectors' boundaries.
 	 */
 	bool max_discard_granularity:1;
 
+	/*
+	 * Set if this target requires that secure_erases be split on
+	 * 'max_secure_erase_sectors' boundaries.
+	 */
+	bool max_secure_erase_granularity:1;
+
+	/*
+	 * Set if this target requires that write_zeroes be split on
+	 * 'max_write_zeroes_sectors' boundaries.
+	 */
+	bool max_write_zeroes_granularity:1;
+
 	/*
 	 * Set if we need to limit the number of in-flight bios when swapping.
 	 */
-- 
cgit v1.2.3


From f7995089c508a5a11b0491c7b348d5c07217a4e8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Fri, 14 Apr 2023 12:32:20 -0400
Subject: dm: unexport dm_get_queue_limits()

There are no dm_get_queue_limits() callers outside of DM core and
there shouldn't be.

Also, remove its BUG_ON(!atomic_read(&md->holders)) to micro-optimize
__process_abnormal_io().

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm.c               | 20 +++++++++-----------
 include/linux/device-mapper.h |  2 --
 2 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 244ebb8c316b..3b694ba3a106 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1072,6 +1072,15 @@ static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
 	__dm_io_dec_pending(io);
 }
 
+/*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'. But _not_ imposing verification to avoid atomic_read(),
+ */
+static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+	return &md->queue->limits;
+}
+
 void disable_discard(struct mapped_device *md)
 {
 	struct queue_limits *limits = dm_get_queue_limits(md);
@@ -2311,17 +2320,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 	return md->immutable_target_type;
 }
 
-/*
- * The queue_limits are only valid as long as you have a reference
- * count on 'md'.
- */
-struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
-{
-	BUG_ON(!atomic_read(&md->holders));
-	return &md->queue->limits;
-}
-EXPORT_SYMBOL_GPL(dm_get_queue_limits);
-
 /*
  * Setup the DM device's queue based on md's type
  */
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 983f1f0402b5..a52d2b9a6846 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -530,8 +530,6 @@ int __init dm_early_create(struct dm_ioctl *dmi,
 			   struct dm_target_spec **spec_array,
 			   char **target_params_array);
 
-struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
-
 /*
  * Geometry functions.
  */
-- 
cgit v1.2.3


From 38e4614c27210e63aafad1bb2e4a427515e46d71 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Wed, 29 Mar 2023 14:06:03 +0200
Subject: vfio: correct kdoc for ops structures

Address minor omissions from kdoc for ops structures flagged by check-kdoc:

  ./scripts/kernel-doc -Werror -none include/linux/vfio.h

  include/linux/vfio.h:114: warning: Function parameter or member 'name' not described in 'vfio_device_ops'
  include/linux/vfio.h:143: warning: Cannot understand  * @migration_set_state: Optional callback to change the migration state for
   on line 143 - I thought it was a doc line
  include/linux/vfio.h:168: warning: Cannot understand  * @log_start: Optional callback to ask the device start DMA logging.
   on line 168 - I thought it was a doc line

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230329120603.468031-1-horms@kernel.org
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 93134b023968..cb46050045c0 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -68,6 +68,7 @@ struct vfio_device {
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
  *
+ * @name: Name of the device driver.
  * @init: initialize private fields in device structure
  * @release: Reclaim private fields in device structure
  * @bind_iommufd: Called when binding the device to an iommufd
@@ -140,6 +141,8 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 #endif
 
 /**
+ * struct vfio_migration_ops - VFIO bus device driver migration callbacks
+ *
  * @migration_set_state: Optional callback to change the migration state for
  *         devices that support migration. It's mandatory for
  *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
@@ -165,6 +168,8 @@ struct vfio_migration_ops {
 };
 
 /**
+ * struct vfio_log_ops - VFIO bus device driver logging callbacks
+ *
  * @log_start: Optional callback to ask the device start DMA logging.
  * @log_stop: Optional callback to ask the device stop DMA logging.
  * @log_read_and_clear: Optional callback to ask the device read
-- 
cgit v1.2.3


From 8c48eea3adf3119e0a3fc57bd31f6966f26ee784 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 12 Apr 2023 21:26:04 -0700
Subject: page_pool: allow caching from safely localized NAPI

Recent patches to mlx5 mentioned a regression when moving from
driver local page pool to only using the generic page pool code.
Page pool has two recycling paths (1) direct one, which runs in
safe NAPI context (basically consumer context, so producing
can be lockless); and (2) via a ptr_ring, which takes a spin
lock because the freeing can happen from any CPU; producer
and consumer may run concurrently.

Since the page pool code was added, Eric introduced a revised version
of deferred skb freeing. TCP skbs are now usually returned to the CPU
which allocated them, and freed in softirq context. This places the
freeing (producing of pages back to the pool) enticingly close to
the allocation (consumer).

If we can prove that we're freeing in the same softirq context in which
the consumer NAPI will run - lockless use of the cache is perfectly fine,
no need for the lock.

Let drivers link the page pool to a NAPI instance. If the NAPI instance
is scheduled on the same CPU on which we're freeing - place the pages
in the direct cache.

With that and patched bnxt (XDP enabled to engage the page pool, sigh,
bnxt really needs page pool work :() I see a 2.6% perf boost with
a TCP stream test (app on a different physical core than softirq).

The CPU use of relevant functions decreases as expected:

  page_pool_refill_alloc_cache   1.17% -> 0%
  _raw_spin_lock                 2.41% -> 0.98%

Only consider lockless path to be safe when NAPI is scheduled
- in practice this should cover majority if not all of steady state
workloads. It's usually the NAPI kicking in that causes the skb flush.

The main case we'll miss out on is when application runs on the same
CPU as NAPI. In that case we don't use the deferred skb free path.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/page_pool.rst |  1 +
 include/linux/netdevice.h              |  3 +++
 include/linux/skbuff.h                 | 20 +++++++++++++-------
 include/net/page_pool.h                |  3 ++-
 net/core/dev.c                         |  3 +++
 net/core/page_pool.c                   | 15 +++++++++++++--
 net/core/skbuff.c                      |  4 ++--
 7 files changed, 37 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
index 30f1344e7cca..873efd97f822 100644
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
     pp_params.pool_size = DESC_NUM;
     pp_params.nid = NUMA_NO_NODE;
     pp_params.dev = priv->dev;
+    pp_params.napi = napi; /* only if locking is tied to NAPI */
     pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
     page_pool = page_pool_create(&pp_params);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 96d27d558b0c..203c0df2046c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
 	unsigned long		gro_bitmask;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
+	/* CPU actively polling if netpoll is configured */
 	int			poll_owner;
 #endif
+	/* CPU on which NAPI has been scheduled for processing */
+	int			list_owner;
 	struct net_device	*dev;
 	struct gro_list		gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..a823ec3aa326 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && page_pool_return_skb_page(page, napi_safe))
+		return;
+#endif
+	put_page(page);
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && page_pool_return_skb_page(page))
-		return;
-#endif
-	put_page(page);
+	napi_frag_unref(frag, recycle, false);
 }
 
 /**
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ddfa0b328677..91b808dade82 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
 	unsigned int	pool_size;
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c7f13742b56c..8aea68275172 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	}
 
 	list_add_tail(&napi->poll_list, &sd->poll_list);
+	WRITE_ONCE(napi->list_owner, smp_processor_id());
 	/* If not called from net_rx_action()
 	 * we have to raise NET_RX_SOFTIRQ.
 	 */
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WRITE_ONCE(n->list_owner, -1);
 
 	val = READ_ONCE(n->state);
 	do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	napi->list_owner = -1;
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..2f6bf422ed30 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
 
 #include <trace/events/page_pool.h>
 
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 {
+	struct napi_struct *napi;
 	struct page_pool *pp;
+	bool allow_direct;
 
 	page = compound_head(page);
 
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
 
 	pp = page->pp;
 
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 */
+	napi = pp->p.napi;
+	allow_direct = napi_safe && napi &&
+		READ_ONCE(napi->list_owner) == smp_processor_id();
+
 	/* Driver set this to memory recycling info. Reset it on recycle.
 	 * This will *not* work for NIC using a split-page memory model.
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
 	 */
-	page_pool_put_full_page(pp, page, false);
+	page_pool_put_full_page(pp, page, allow_direct);
 
 	return true;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2b5a98c5cb49..ef81452759be 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -843,7 +843,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return page_pool_return_skb_page(virt_to_page(data));
+	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
 }
 
 static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -889,7 +889,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
 
 free_head:
 	if (shinfo->frag_list)
-- 
cgit v1.2.3


From acdb15732869a429081b347d2ec40a425ea0b47f Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 1 Mar 2023 12:21:38 +0000
Subject: media: saa7146: drop 'dev' and 'resources' from struct saa7146_fh

Instead use vv->resources and video_drvdata(file) to
obtain this information.

This prepares for the vb2 conversion later when saa7146_fh is
dropped completely.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/saa7146/saa7146_fops.c  | 31 +++++++-------
 drivers/media/common/saa7146/saa7146_vbi.c   | 21 +++++-----
 drivers/media/common/saa7146/saa7146_video.c | 60 +++++++++++++---------------
 drivers/media/pci/saa7146/hexium_gemini.c    |  4 +-
 drivers/media/pci/saa7146/hexium_orion.c     |  4 +-
 drivers/media/pci/saa7146/mxb.c              | 22 +++++-----
 drivers/media/pci/ttpci/budget-av.c          |  4 +-
 drivers/staging/media/av7110/av7110_v4l.c    | 35 ++++++++--------
 include/media/drv-intf/saa7146_vv.h          |  7 +---
 9 files changed, 86 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index 90de44315304..faebe61a9408 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -7,12 +7,11 @@
 /****************************************************************************/
 /* resource management functions, shamelessly stolen from saa7134 driver */
 
-int saa7146_res_get(struct saa7146_fh *fh, unsigned int bit)
+int saa7146_res_get(struct saa7146_dev *dev, unsigned int bit)
 {
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 
-	if (fh->resources & bit) {
+	if (vv->resources & bit) {
 		DEB_D("already allocated! want: 0x%02x, cur:0x%02x\n",
 		      bit, vv->resources);
 		/* have it already allocated */
@@ -27,20 +26,17 @@ int saa7146_res_get(struct saa7146_fh *fh, unsigned int bit)
 		return 0;
 	}
 	/* it's free, grab it */
-	fh->resources |= bit;
 	vv->resources |= bit;
 	DEB_D("res: get 0x%02x, cur:0x%02x\n", bit, vv->resources);
 	return 1;
 }
 
-void saa7146_res_free(struct saa7146_fh *fh, unsigned int bits)
+void saa7146_res_free(struct saa7146_dev *dev, unsigned int bits)
 {
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 
-	BUG_ON((fh->resources & bits) != bits);
+	WARN_ON((vv->resources & bits) != bits);
 
-	fh->resources &= ~bits;
 	vv->resources &= ~bits;
 	DEB_D("res: put 0x%02x, cur:0x%02x\n", bits, vv->resources);
 }
@@ -221,7 +217,6 @@ static int fops_open(struct file *file)
 	v4l2_fh_init(&fh->fh, vdev);
 
 	file->private_data = &fh->fh;
-	fh->dev = dev;
 
 	if (vdev->vfl_type == VFL_TYPE_VBI) {
 		DEB_S("initializing vbi...\n");
@@ -257,8 +252,8 @@ out:
 static int fops_release(struct file *file)
 {
 	struct video_device *vdev = video_devdata(file);
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh  *fh  = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
 
 	DEB_EE("file:%p\n", file);
 
@@ -287,6 +282,7 @@ static int fops_release(struct file *file)
 static int fops_mmap(struct file *file, struct vm_area_struct * vma)
 {
 	struct video_device *vdev = video_devdata(file);
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = file->private_data;
 	struct videobuf_queue *q;
 	int res;
@@ -301,7 +297,7 @@ static int fops_mmap(struct file *file, struct vm_area_struct * vma)
 	case VFL_TYPE_VBI: {
 		DEB_EE("V4L2_BUF_TYPE_VBI_CAPTURE: file:%p, vma:%p\n",
 		       file, vma);
-		if (fh->dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
+		if (dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
 			return -ENODEV;
 		q = &fh->vbi_q;
 		break;
@@ -320,6 +316,7 @@ static int fops_mmap(struct file *file, struct vm_area_struct * vma)
 static __poll_t __fops_poll(struct file *file, struct poll_table_struct *wait)
 {
 	struct video_device *vdev = video_devdata(file);
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = file->private_data;
 	struct videobuf_buffer *buf = NULL;
 	struct videobuf_queue *q;
@@ -328,7 +325,7 @@ static __poll_t __fops_poll(struct file *file, struct poll_table_struct *wait)
 	DEB_EE("file:%p, poll:%p\n", file, wait);
 
 	if (vdev->vfl_type == VFL_TYPE_VBI) {
-		if (fh->dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
+		if (dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
 			return res | EPOLLOUT | EPOLLWRNORM;
 		if( 0 == fh->vbi_q.streaming )
 			return res | videobuf_poll_stream(file, &fh->vbi_q, wait);
@@ -370,7 +367,7 @@ static __poll_t fops_poll(struct file *file, struct poll_table_struct *wait)
 static ssize_t fops_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
 {
 	struct video_device *vdev = video_devdata(file);
-	struct saa7146_fh *fh = file->private_data;
+	struct saa7146_dev *dev = video_drvdata(file);
 	int ret;
 
 	switch (vdev->vfl_type) {
@@ -385,7 +382,7 @@ static ssize_t fops_read(struct file *file, char __user *data, size_t count, lof
 		DEB_EE("V4L2_BUF_TYPE_VBI_CAPTURE: file:%p, data:%p, count:%lu\n",
 		       file, data, (unsigned long)count);
 */
-		if (fh->dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE) {
+		if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE) {
 			if (mutex_lock_interruptible(vdev->lock))
 				return -ERESTARTSYS;
 			ret = saa7146_vbi_uops.read(file, data, count, ppos);
@@ -401,17 +398,17 @@ static ssize_t fops_read(struct file *file, char __user *data, size_t count, lof
 static ssize_t fops_write(struct file *file, const char __user *data, size_t count, loff_t *ppos)
 {
 	struct video_device *vdev = video_devdata(file);
-	struct saa7146_fh *fh = file->private_data;
+	struct saa7146_dev *dev = video_drvdata(file);
 	int ret;
 
 	switch (vdev->vfl_type) {
 	case VFL_TYPE_VIDEO:
 		return -EINVAL;
 	case VFL_TYPE_VBI:
-		if (fh->dev->ext_vv_data->vbi_fops.write) {
+		if (dev->ext_vv_data->vbi_fops.write) {
 			if (mutex_lock_interruptible(vdev->lock))
 				return -ERESTARTSYS;
-			ret = fh->dev->ext_vv_data->vbi_fops.write(file, data, count, ppos);
+			ret = dev->ext_vv_data->vbi_fops.write(file, data, count, ppos);
 			mutex_unlock(vdev->lock);
 			return ret;
 		}
diff --git a/drivers/media/common/saa7146/saa7146_vbi.c b/drivers/media/common/saa7146/saa7146_vbi.c
index bd442b984423..1a6fb0f381b7 100644
--- a/drivers/media/common/saa7146/saa7146_vbi.c
+++ b/drivers/media/common/saa7146/saa7146_vbi.c
@@ -219,8 +219,7 @@ static int buffer_activate(struct saa7146_dev *dev,
 static int buffer_prepare(struct videobuf_queue *q, struct videobuf_buffer *vb,enum v4l2_field field)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 
 	int err = 0;
@@ -289,8 +288,7 @@ static int buffer_setup(struct videobuf_queue *q, unsigned int *count, unsigned
 static void buffer_queue(struct videobuf_queue *q, struct videobuf_buffer *vb)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 
@@ -301,8 +299,7 @@ static void buffer_queue(struct videobuf_queue *q, struct videobuf_buffer *vb)
 static void buffer_release(struct videobuf_queue *q, struct videobuf_buffer *vb)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh   = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 
 	DEB_VBI("vb:%p\n", vb);
@@ -320,7 +317,7 @@ static const struct videobuf_queue_ops vbi_qops = {
 
 static void vbi_stop(struct saa7146_fh *fh, struct file *file)
 {
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	unsigned long flags;
 	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
@@ -353,8 +350,8 @@ static void vbi_read_timeout(struct timer_list *t)
 {
 	struct saa7146_vv *vv = from_timer(vv, t, vbi_read_timeout);
 	struct file *file = vv->vbi_read_timeout_file;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
 
 	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
 
@@ -376,14 +373,14 @@ static void vbi_init(struct saa7146_dev *dev, struct saa7146_vv *vv)
 static int vbi_open(struct saa7146_dev *dev, struct file *file)
 {
 	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_vv *vv = fh->dev->vv_data;
+	struct saa7146_vv *vv = dev->vv_data;
 
 	u32 arbtr_ctrl	= saa7146_read(dev, PCI_BT_V1);
 	int ret = 0;
 
 	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
 
-	ret = saa7146_res_get(fh, RESOURCE_DMA3_BRS);
+	ret = saa7146_res_get(dev, RESOURCE_DMA3_BRS);
 	if (0 == ret) {
 		DEB_S("cannot get vbi RESOURCE_DMA3_BRS resource\n");
 		return -EBUSY;
@@ -431,7 +428,7 @@ static void vbi_close(struct saa7146_dev *dev, struct file *file)
 	if( fh == vv->vbi_streaming ) {
 		vbi_stop(fh, file);
 	}
-	saa7146_res_free(fh, RESOURCE_DMA3_BRS);
+	saa7146_res_free(dev, RESOURCE_DMA3_BRS);
 }
 
 static void vbi_irq_done(struct saa7146_dev *dev, unsigned long status)
@@ -456,7 +453,7 @@ static void vbi_irq_done(struct saa7146_dev *dev, unsigned long status)
 static ssize_t vbi_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
 {
 	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	ssize_t ret = 0;
 
diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c
index 27c97218ee53..58f39cf64a1c 100644
--- a/drivers/media/common/saa7146/saa7146_video.c
+++ b/drivers/media/common/saa7146/saa7146_video.c
@@ -211,9 +211,8 @@ static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *bu
 /********************************************************************************/
 /* file operations */
 
-static int video_begin(struct saa7146_fh *fh)
+static int video_begin(struct saa7146_dev *dev, struct saa7146_fh *fh)
 {
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_format *fmt = NULL;
 	unsigned int resource;
@@ -241,7 +240,7 @@ static int video_begin(struct saa7146_fh *fh)
 		resource = RESOURCE_DMA1_HPS;
 	}
 
-	ret = saa7146_res_get(fh, resource);
+	ret = saa7146_res_get(dev, resource);
 	if (0 == ret) {
 		DEB_S("cannot get capture resource %d\n", resource);
 		return -EBUSY;
@@ -259,9 +258,8 @@ static int video_begin(struct saa7146_fh *fh)
 	return 0;
 }
 
-static int video_end(struct saa7146_fh *fh, struct file *file)
+static int video_end(struct saa7146_dev *dev, struct saa7146_fh *fh)
 {
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_dmaqueue *q = &vv->video_dmaq;
 	struct saa7146_format *fmt = NULL;
@@ -311,14 +309,14 @@ static int video_end(struct saa7146_fh *fh, struct file *file)
 	vv->video_fh = NULL;
 	vv->video_status = 0;
 
-	saa7146_res_free(fh, resource);
+	saa7146_res_free(dev, resource);
 
 	return 0;
 }
 
 static int vidioc_querycap(struct file *file, void *fh, struct v4l2_capability *cap)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 
 	strscpy((char *)cap->driver, "saa7146 v4l2", sizeof(cap->driver));
 	strscpy((char *)cap->card, dev->ext->name, sizeof(cap->card));
@@ -391,7 +389,7 @@ int saa7146_s_ctrl(struct v4l2_ctrl *ctrl)
 static int vidioc_g_parm(struct file *file, void *fh,
 		struct v4l2_streamparm *parm)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 
 	if (parm->type != V4L2_BUF_TYPE_VIDEO_CAPTURE)
@@ -404,7 +402,7 @@ static int vidioc_g_parm(struct file *file, void *fh,
 
 static int vidioc_g_fmt_vid_cap(struct file *file, void *fh, struct v4l2_format *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 
 	f->fmt.pix = vv->video_fmt;
@@ -413,7 +411,7 @@ static int vidioc_g_fmt_vid_cap(struct file *file, void *fh, struct v4l2_format
 
 static int vidioc_g_fmt_vbi_cap(struct file *file, void *fh, struct v4l2_format *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 
 	f->fmt.vbi = vv->vbi_fmt;
@@ -422,7 +420,7 @@ static int vidioc_g_fmt_vbi_cap(struct file *file, void *fh, struct v4l2_format
 
 static int vidioc_try_fmt_vid_cap(struct file *file, void *fh, struct v4l2_format *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_format *fmt;
 	enum v4l2_field field;
@@ -487,8 +485,8 @@ static int vidioc_try_fmt_vid_cap(struct file *file, void *fh, struct v4l2_forma
 
 static int vidioc_s_fmt_vid_cap(struct file *file, void *__fh, struct v4l2_format *f)
 {
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = __fh;
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	int err;
 
@@ -508,7 +506,7 @@ static int vidioc_s_fmt_vid_cap(struct file *file, void *__fh, struct v4l2_forma
 
 static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 
 	*norm = vv->standard->id;
@@ -535,7 +533,7 @@ static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
 
 static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	int found = 0;
 	int i;
@@ -612,12 +610,13 @@ static int vidioc_dqbuf(struct file *file, void *__fh, struct v4l2_buffer *buf)
 
 static int vidioc_streamon(struct file *file, void *__fh, enum v4l2_buf_type type)
 {
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = __fh;
 	int err;
 
 	DEB_D("VIDIOC_STREAMON, type:%d\n", type);
 
-	err = video_begin(fh);
+	err = video_begin(dev, fh);
 	if (err)
 		return err;
 	if (type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
@@ -629,8 +628,8 @@ static int vidioc_streamon(struct file *file, void *__fh, enum v4l2_buf_type typ
 
 static int vidioc_streamoff(struct file *file, void *__fh, enum v4l2_buf_type type)
 {
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = __fh;
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	int err;
 
@@ -656,9 +655,9 @@ static int vidioc_streamoff(struct file *file, void *__fh, enum v4l2_buf_type ty
 		err = videobuf_streamoff(&fh->vbi_q);
 	if (0 != err) {
 		DEB_D("warning: videobuf_streamoff() failed\n");
-		video_end(fh, file);
+		video_end(dev, fh);
 	} else {
-		err = video_end(fh, file);
+		err = video_end(dev, fh);
 	}
 	return err;
 }
@@ -727,8 +726,7 @@ static int buffer_prepare(struct videobuf_queue *q,
 			  struct videobuf_buffer *vb, enum v4l2_field field)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 	int size,err = 0;
@@ -808,8 +806,8 @@ static int buffer_prepare(struct videobuf_queue *q,
 static int buffer_setup(struct videobuf_queue *q, unsigned int *count, unsigned int *size)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_vv *vv = fh->dev->vv_data;
+	struct saa7146_dev *dev = video_drvdata(file);
+	struct saa7146_vv *vv = dev->vv_data;
 
 	if (0 == *count || *count > MAX_SAA7146_CAPTURE_BUFFERS)
 		*count = MAX_SAA7146_CAPTURE_BUFFERS;
@@ -829,20 +827,18 @@ static int buffer_setup(struct videobuf_queue *q, unsigned int *count, unsigned
 static void buffer_queue(struct videobuf_queue *q, struct videobuf_buffer *vb)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 
 	DEB_CAP("vbuf:%p\n", vb);
-	saa7146_buffer_queue(fh->dev, &vv->video_dmaq, buf);
+	saa7146_buffer_queue(dev, &vv->video_dmaq, buf);
 }
 
 static void buffer_release(struct videobuf_queue *q, struct videobuf_buffer *vb)
 {
 	struct file *file = q->priv_data;
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
 
 	DEB_CAP("vbuf:%p\n", vb);
@@ -900,7 +896,7 @@ static void video_close(struct saa7146_dev *dev, struct file *file)
 	struct videobuf_queue *q = &fh->video_q;
 
 	if (IS_CAPTURE_ACTIVE(fh) != 0)
-		video_end(fh, file);
+		video_end(dev, fh);
 
 	videobuf_stop(q);
 	/* hmm, why is this function declared void? */
@@ -926,8 +922,8 @@ static void video_irq_done(struct saa7146_dev *dev, unsigned long st)
 
 static ssize_t video_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
 {
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
 	struct saa7146_vv *vv = dev->vv_data;
 	ssize_t ret = 0;
 
@@ -943,7 +939,7 @@ static ssize_t video_read(struct file *file, char __user *data, size_t count, lo
 		return -EBUSY;
 	}
 
-	ret = video_begin(fh);
+	ret = video_begin(dev, fh);
 	if( 0 != ret) {
 		goto out;
 	}
@@ -951,9 +947,9 @@ static ssize_t video_read(struct file *file, char __user *data, size_t count, lo
 	ret = videobuf_read_one(&fh->video_q , data, count, ppos,
 				file->f_flags & O_NONBLOCK);
 	if (ret != 0) {
-		video_end(fh, file);
+		video_end(dev, fh);
 	} else {
-		ret = video_end(fh, file);
+		ret = video_end(dev, fh);
 	}
 out:
 	return ret;
diff --git a/drivers/media/pci/saa7146/hexium_gemini.c b/drivers/media/pci/saa7146/hexium_gemini.c
index 3947701cd6c7..7dead0dfcc9f 100644
--- a/drivers/media/pci/saa7146/hexium_gemini.c
+++ b/drivers/media/pci/saa7146/hexium_gemini.c
@@ -215,7 +215,7 @@ static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 
 static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct hexium *hexium = (struct hexium *) dev->ext_priv;
 
 	*input = hexium->cur_input;
@@ -226,7 +226,7 @@ static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 
 static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct hexium *hexium = (struct hexium *) dev->ext_priv;
 
 	DEB_EE("VIDIOC_S_INPUT %d\n", input);
diff --git a/drivers/media/pci/saa7146/hexium_orion.c b/drivers/media/pci/saa7146/hexium_orion.c
index 6207f0861bb0..bececbe79f32 100644
--- a/drivers/media/pci/saa7146/hexium_orion.c
+++ b/drivers/media/pci/saa7146/hexium_orion.c
@@ -326,7 +326,7 @@ static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 
 static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct hexium *hexium = (struct hexium *) dev->ext_priv;
 
 	*input = hexium->cur_input;
@@ -337,7 +337,7 @@ static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 
 static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct hexium *hexium = (struct hexium *) dev->ext_priv;
 
 	if (input >= HEXIUM_INPUTS)
diff --git a/drivers/media/pci/saa7146/mxb.c b/drivers/media/pci/saa7146/mxb.c
index 7ded8f5b05cb..f518ad8c92ed 100644
--- a/drivers/media/pci/saa7146/mxb.c
+++ b/drivers/media/pci/saa7146/mxb.c
@@ -456,7 +456,7 @@ static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 
 static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 	*i = mxb->cur_input;
 
@@ -466,7 +466,7 @@ static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
 
 static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 	int err = 0;
 	int i = 0;
@@ -528,7 +528,7 @@ static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 
 static int vidioc_g_tuner(struct file *file, void *fh, struct v4l2_tuner *t)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 
 	if (t->index) {
@@ -550,7 +550,7 @@ static int vidioc_g_tuner(struct file *file, void *fh, struct v4l2_tuner *t)
 
 static int vidioc_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *t)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 
 	if (t->index) {
@@ -565,14 +565,14 @@ static int vidioc_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *
 
 static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 
 	return call_all(dev, video, querystd, norm);
 }
 
 static int vidioc_g_frequency(struct file *file, void *fh, struct v4l2_frequency *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 
 	if (f->tuner)
@@ -585,7 +585,7 @@ static int vidioc_g_frequency(struct file *file, void *fh, struct v4l2_frequency
 
 static int vidioc_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 	struct saa7146_vv *vv = dev->vv_data;
 
@@ -626,7 +626,7 @@ static int vidioc_enumaudio(struct file *file, void *fh, struct v4l2_audio *a)
 
 static int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *a)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 
 	DEB_EE("VIDIOC_G_AUDIO\n");
@@ -636,7 +636,7 @@ static int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *a)
 
 static int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *a)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
 
 	DEB_D("VIDIOC_S_AUDIO %d\n", a->index);
@@ -656,7 +656,7 @@ static int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int vidioc_g_register(struct file *file, void *fh, struct v4l2_dbg_register *reg)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 
 	if (reg->reg > pci_resource_len(dev->pci, 0) - 4)
 		return -EINVAL;
@@ -667,7 +667,7 @@ static int vidioc_g_register(struct file *file, void *fh, struct v4l2_dbg_regist
 
 static int vidioc_s_register(struct file *file, void *fh, const struct v4l2_dbg_register *reg)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 
 	if (reg->reg > pci_resource_len(dev->pci, 0) - 4)
 		return -EINVAL;
diff --git a/drivers/media/pci/ttpci/budget-av.c b/drivers/media/pci/ttpci/budget-av.c
index 84068f4d4e36..824529f3c74b 100644
--- a/drivers/media/pci/ttpci/budget-av.c
+++ b/drivers/media/pci/ttpci/budget-av.c
@@ -1411,7 +1411,7 @@ static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 
 static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct budget_av *budget_av = (struct budget_av *)dev->ext_priv;
 
 	*i = budget_av->cur_input;
@@ -1422,7 +1422,7 @@ static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
 
 static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct budget_av *budget_av = (struct budget_av *)dev->ext_priv;
 
 	dprintk(1, "VIDIOC_S_INPUT %d\n", input);
diff --git a/drivers/staging/media/av7110/av7110_v4l.c b/drivers/staging/media/av7110/av7110_v4l.c
index 374f78b84c04..3ab930cd8a27 100644
--- a/drivers/staging/media/av7110/av7110_v4l.c
+++ b/drivers/staging/media/av7110/av7110_v4l.c
@@ -213,9 +213,8 @@ static const struct v4l2_audio msp3400_v4l2_audio = {
 	.capability = V4L2_AUDCAP_STEREO
 };
 
-static int av7110_dvb_c_switch(struct saa7146_fh *fh)
+static int av7110_dvb_c_switch(struct saa7146_dev *dev)
 {
-	struct saa7146_dev *dev = fh->dev;
 	struct av7110 *av7110 = (struct av7110*)dev->ext_priv;
 	u16 adswitch;
 	int source, sync;
@@ -295,7 +294,7 @@ static int av7110_dvb_c_switch(struct saa7146_fh *fh)
 
 static int vidioc_g_tuner(struct file *file, void *fh, struct v4l2_tuner *t)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 	u16 stereo_det;
 	s8 stereo;
@@ -339,7 +338,7 @@ static int vidioc_g_tuner(struct file *file, void *fh, struct v4l2_tuner *t)
 
 static int vidioc_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *t)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 	u16 fm_matrix, src;
 	dprintk(2, "VIDIOC_S_TUNER: %d\n", t->index);
@@ -383,7 +382,7 @@ static int vidioc_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *
 
 static int vidioc_g_frequency(struct file *file, void *fh, struct v4l2_frequency *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_G_FREQ: freq:0x%08x\n", f->frequency);
@@ -399,7 +398,7 @@ static int vidioc_g_frequency(struct file *file, void *fh, struct v4l2_frequency
 
 static int vidioc_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_S_FREQUENCY: freq:0x%08x\n", f->frequency);
@@ -429,7 +428,7 @@ static int vidioc_s_frequency(struct file *file, void *fh, const struct v4l2_fre
 
 static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_ENUMINPUT: %d\n", i->index);
@@ -449,7 +448,7 @@ static int vidioc_enum_input(struct file *file, void *fh, struct v4l2_input *i)
 
 static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	*input = av7110->current_input;
@@ -459,7 +458,7 @@ static int vidioc_g_input(struct file *file, void *fh, unsigned int *input)
 
 static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_S_INPUT: %d\n", input);
@@ -471,7 +470,7 @@ static int vidioc_s_input(struct file *file, void *fh, unsigned int input)
 		return -EINVAL;
 
 	av7110->current_input = input;
-	return av7110_dvb_c_switch(fh);
+	return av7110_dvb_c_switch(dev);
 }
 
 static int vidioc_enumaudio(struct file *file, void *fh, struct v4l2_audio *a)
@@ -485,7 +484,7 @@ static int vidioc_enumaudio(struct file *file, void *fh, struct v4l2_audio *a)
 
 static int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *a)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_G_AUDIO: %d\n", a->index);
@@ -499,7 +498,7 @@ static int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *a)
 
 static int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *a)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_S_AUDIO: %d\n", a->index);
@@ -511,7 +510,7 @@ static int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *
 static int vidioc_g_sliced_vbi_cap(struct file *file, void *fh,
 					struct v4l2_sliced_vbi_cap *cap)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_G_SLICED_VBI_CAP\n");
@@ -527,7 +526,7 @@ static int vidioc_g_sliced_vbi_cap(struct file *file, void *fh,
 static int vidioc_g_fmt_sliced_vbi_out(struct file *file, void *fh,
 					struct v4l2_format *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_G_FMT:\n");
@@ -545,7 +544,7 @@ static int vidioc_g_fmt_sliced_vbi_out(struct file *file, void *fh,
 static int vidioc_s_fmt_sliced_vbi_out(struct file *file, void *fh,
 					struct v4l2_format *f)
 {
-	struct saa7146_dev *dev = ((struct saa7146_fh *)fh)->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110 *)dev->ext_priv;
 
 	dprintk(2, "VIDIOC_S_FMT\n");
@@ -573,8 +572,7 @@ static int vidioc_s_fmt_sliced_vbi_out(struct file *file, void *fh,
 
 static int av7110_vbi_reset(struct file *file)
 {
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110*) dev->ext_priv;
 
 	dprintk(2, "%s\n", __func__);
@@ -588,8 +586,7 @@ static int av7110_vbi_reset(struct file *file)
 
 static ssize_t av7110_vbi_write(struct file *file, const char __user *data, size_t count, loff_t *ppos)
 {
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = fh->dev;
+	struct saa7146_dev *dev = video_drvdata(file);
 	struct av7110 *av7110 = (struct av7110*) dev->ext_priv;
 	struct v4l2_sliced_vbi_data d;
 	int rc;
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index 932961e8f5ab..fee861670ddb 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -80,15 +80,12 @@ struct saa7146_dmaqueue {
 struct saa7146_fh {
 	/* Must be the first field! */
 	struct v4l2_fh		fh;
-	struct saa7146_dev	*dev;
 
 	/* video capture */
 	struct videobuf_queue	video_q;
 
 	/* vbi capture */
 	struct videobuf_queue	vbi_q;
-
-	unsigned int resources;	/* resource management for device open */
 };
 
 #define STATUS_CAPTURE	0x02
@@ -192,8 +189,8 @@ int saa7146_s_ctrl(struct v4l2_ctrl *ctrl);
 extern const struct saa7146_use_ops saa7146_vbi_uops;
 
 /* resource management functions */
-int saa7146_res_get(struct saa7146_fh *fh, unsigned int bit);
-void saa7146_res_free(struct saa7146_fh *fh, unsigned int bits);
+int saa7146_res_get(struct saa7146_dev *dev, unsigned int bit);
+void saa7146_res_free(struct saa7146_dev *dev, unsigned int bits);
 
 #define RESOURCE_DMA1_HPS	0x1
 #define RESOURCE_DMA2_CLP	0x2
-- 
cgit v1.2.3


From c79dbdae3a7693903a92a2348493e7084d9477b3 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 2 Mar 2023 09:52:59 +0000
Subject: media: common: saa7146: drop 'fmt' from struct saa7146_buf

Use the video_fmt in saa7146_vv instead of having a pointer
to it in struct saa7146_buf.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/saa7146/saa7146_hlp.c   | 37 +++++++++++++++-------------
 drivers/media/common/saa7146/saa7146_video.c | 12 ++++-----
 include/media/drv-intf/saa7146_vv.h          |  1 -
 3 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/saa7146_hlp.c b/drivers/media/common/saa7146/saa7146_hlp.c
index cb8fc326c0a7..98c339c33afa 100644
--- a/drivers/media/common/saa7146/saa7146_hlp.c
+++ b/drivers/media/common/saa7146/saa7146_hlp.c
@@ -407,14 +407,14 @@ void saa7146_write_out_dma(struct saa7146_dev* dev, int which, struct saa7146_vi
 static int calculate_video_dma_grab_packed(struct saa7146_dev* dev, struct saa7146_buf *buf)
 {
 	struct saa7146_vv *vv = dev->vv_data;
+	struct v4l2_pix_format *pix = &vv->video_fmt;
 	struct saa7146_video_dma vdma1;
+	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pix->pixelformat);
 
-	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev,buf->fmt->pixelformat);
-
-	int width = buf->fmt->width;
-	int height = buf->fmt->height;
-	int bytesperline = buf->fmt->bytesperline;
-	enum v4l2_field field = buf->fmt->field;
+	int width = pix->width;
+	int height = pix->height;
+	int bytesperline = pix->bytesperline;
+	enum v4l2_field field = pix->field;
 
 	int depth = sfmt->depth;
 
@@ -469,8 +469,9 @@ static int calculate_video_dma_grab_packed(struct saa7146_dev* dev, struct saa71
 
 static int calc_planar_422(struct saa7146_vv *vv, struct saa7146_buf *buf, struct saa7146_video_dma *vdma2, struct saa7146_video_dma *vdma3)
 {
-	int height = buf->fmt->height;
-	int width = buf->fmt->width;
+	struct v4l2_pix_format *pix = &vv->video_fmt;
+	int height = pix->height;
+	int width = pix->width;
 
 	vdma2->pitch	= width;
 	vdma3->pitch	= width;
@@ -500,8 +501,9 @@ static int calc_planar_422(struct saa7146_vv *vv, struct saa7146_buf *buf, struc
 
 static int calc_planar_420(struct saa7146_vv *vv, struct saa7146_buf *buf, struct saa7146_video_dma *vdma2, struct saa7146_video_dma *vdma3)
 {
-	int height = buf->fmt->height;
-	int width = buf->fmt->width;
+	struct v4l2_pix_format *pix = &vv->video_fmt;
+	int height = pix->height;
+	int width = pix->width;
 
 	vdma2->pitch	= width/2;
 	vdma3->pitch	= width/2;
@@ -530,15 +532,15 @@ static int calc_planar_420(struct saa7146_vv *vv, struct saa7146_buf *buf, struc
 static int calculate_video_dma_grab_planar(struct saa7146_dev* dev, struct saa7146_buf *buf)
 {
 	struct saa7146_vv *vv = dev->vv_data;
+	struct v4l2_pix_format *pix = &vv->video_fmt;
 	struct saa7146_video_dma vdma1;
 	struct saa7146_video_dma vdma2;
 	struct saa7146_video_dma vdma3;
+	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pix->pixelformat);
 
-	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev,buf->fmt->pixelformat);
-
-	int width = buf->fmt->width;
-	int height = buf->fmt->height;
-	enum v4l2_field field = buf->fmt->field;
+	int width = pix->width;
+	int height = pix->height;
+	enum v4l2_field field = pix->field;
 
 	BUG_ON(0 == buf->pt[0].dma);
 	BUG_ON(0 == buf->pt[1].dma);
@@ -717,8 +719,9 @@ static void saa7146_disable_clipping(struct saa7146_dev *dev)
 
 void saa7146_set_capture(struct saa7146_dev *dev, struct saa7146_buf *buf, struct saa7146_buf *next)
 {
-	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev,buf->fmt->pixelformat);
 	struct saa7146_vv *vv = dev->vv_data;
+	struct v4l2_pix_format *pix = &vv->video_fmt;
+	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pix->pixelformat);
 	u32 vdma1_prot_addr;
 
 	DEB_CAP("buf:%p, next:%p\n", buf, next);
@@ -730,7 +733,7 @@ void saa7146_set_capture(struct saa7146_dev *dev, struct saa7146_buf *buf, struc
 		saa7146_write(dev, MC2, MASK_27 );
 	}
 
-	saa7146_set_window(dev, buf->fmt->width, buf->fmt->height, buf->fmt->field);
+	saa7146_set_window(dev, pix->width, pix->height, pix->field);
 	saa7146_set_output_format(dev, sfmt->trans);
 	saa7146_disable_clipping(dev);
 
diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c
index 58f39cf64a1c..dd0d803c38cd 100644
--- a/drivers/media/common/saa7146/saa7146_video.c
+++ b/drivers/media/common/saa7146/saa7146_video.c
@@ -93,11 +93,13 @@ struct saa7146_format* saa7146_format_by_fourcc(struct saa7146_dev *dev, int fou
 
 static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *buf)
 {
+	struct saa7146_vv *vv = dev->vv_data;
 	struct pci_dev *pci = dev->pci;
 	struct videobuf_dmabuf *dma=videobuf_to_dma(&buf->vb);
 	struct scatterlist *list = dma->sglist;
 	int length = dma->sglen;
-	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev,buf->fmt->pixelformat);
+	struct v4l2_pix_format *pix = &vv->video_fmt;
+	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pix->pixelformat);
 
 	DEB_EE("dev:%p, buf:%p, sg_len:%d\n", dev, buf, length);
 
@@ -108,7 +110,7 @@ static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *bu
 		__le32  *ptr1, *ptr2, *ptr3;
 		__le32 fill;
 
-		int size = buf->fmt->width*buf->fmt->height;
+		int size = pix->width * pix->height;
 		int i,p,m1,m2,m3,o1,o2;
 
 		switch( sfmt->depth ) {
@@ -757,8 +759,7 @@ static int buffer_prepare(struct videobuf_queue *q,
 	    buf->vb.height != vv->video_fmt.height ||
 	    buf->vb.size   != size ||
 	    buf->vb.field  != field      ||
-	    buf->vb.field  != vv->video_fmt.field  ||
-	    buf->fmt       != &vv->video_fmt) {
+	    buf->vb.field  != vv->video_fmt.field) {
 		saa7146_dma_free(dev,q,buf);
 	}
 
@@ -770,10 +771,9 @@ static int buffer_prepare(struct videobuf_queue *q,
 		buf->vb.height = vv->video_fmt.height;
 		buf->vb.size   = size;
 		buf->vb.field  = field;
-		buf->fmt       = &vv->video_fmt;
 		buf->vb.field  = vv->video_fmt.field;
 
-		sfmt = saa7146_format_by_fourcc(dev,buf->fmt->pixelformat);
+		sfmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
 
 		release_all_pagetables(dev, buf);
 		if( 0 != IS_PLANAR(sfmt->trans)) {
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index fee861670ddb..80463fdd30eb 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -60,7 +60,6 @@ struct saa7146_buf {
 	struct videobuf_buffer vb;
 
 	/* saa7146 specific */
-	struct v4l2_pix_format  *fmt;
 	int (*activate)(struct saa7146_dev *dev,
 			struct saa7146_buf *buf,
 			struct saa7146_buf *next);
-- 
cgit v1.2.3


From 0b6e30bd37ae14ae34f7dbcc46a05e98903d0eae Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 1 Mar 2023 11:05:55 +0000
Subject: media: saa7146: convert to vb2

Convert this driver from the old videobuf framework to the vb2
frame.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/saa7146/Kconfig         |   2 +-
 drivers/media/common/saa7146/saa7146_fops.c  | 318 ++++-------------
 drivers/media/common/saa7146/saa7146_vbi.c   | 278 +++++++--------
 drivers/media/common/saa7146/saa7146_video.c | 490 ++++++++-------------------
 drivers/media/pci/saa7146/mxb.c              |  10 -
 include/media/drv-intf/saa7146_vv.h          |  36 +-
 6 files changed, 314 insertions(+), 820 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/Kconfig b/drivers/media/common/saa7146/Kconfig
index a0aa155e5d85..dfec86e50dff 100644
--- a/drivers/media/common/saa7146/Kconfig
+++ b/drivers/media/common/saa7146/Kconfig
@@ -6,5 +6,5 @@ config VIDEO_SAA7146
 config VIDEO_SAA7146_VV
 	tristate
 	depends on VIDEO_DEV
-	select VIDEOBUF_DMA_SG
+	select VIDEOBUF2_DMA_SG
 	select VIDEO_SAA7146
diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index 2154249a26d5..1adfffc987b6 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -42,22 +42,6 @@ void saa7146_res_free(struct saa7146_dev *dev, unsigned int bits)
 }
 
 
-/********************************************************************************/
-/* common dma functions */
-
-void saa7146_dma_free(struct saa7146_dev *dev,struct videobuf_queue *q,
-						struct saa7146_buf *buf)
-{
-	struct videobuf_dmabuf *dma=videobuf_to_dma(&buf->vb);
-	DEB_EE("dev:%p, buf:%p\n", dev, buf);
-
-	videobuf_waiton(q, &buf->vb, 0, 0);
-	videobuf_dma_unmap(q->dev, dma);
-	videobuf_dma_free(dma);
-	buf->vb.state = VIDEOBUF_NEEDS_INIT;
-}
-
-
 /********************************************************************************/
 /* common buffer functions */
 
@@ -76,8 +60,7 @@ int saa7146_buffer_queue(struct saa7146_dev *dev,
 		DEB_D("immediately activating buffer %p\n", buf);
 		buf->activate(dev,buf,NULL);
 	} else {
-		list_add_tail(&buf->vb.queue,&q->queue);
-		buf->vb.state = VIDEOBUF_QUEUED;
+		list_add_tail(&buf->list, &q->queue);
 		DEB_D("adding buffer %p to queue. (active buffer present)\n",
 		      buf);
 	}
@@ -88,21 +71,31 @@ void saa7146_buffer_finish(struct saa7146_dev *dev,
 			   struct saa7146_dmaqueue *q,
 			   int state)
 {
+	struct saa7146_vv *vv = dev->vv_data;
+	struct saa7146_buf *buf = q->curr;
+
 	assert_spin_locked(&dev->slock);
 	DEB_EE("dev:%p, dmaq:%p, state:%d\n", dev, q, state);
 	DEB_EE("q->curr:%p\n", q->curr);
 
 	/* finish current buffer */
-	if (NULL == q->curr) {
+	if (!buf) {
 		DEB_D("aiii. no current buffer\n");
 		return;
 	}
 
-	q->curr->vb.state = state;
-	q->curr->vb.ts = ktime_get_ns();
-	wake_up(&q->curr->vb.done);
-
 	q->curr = NULL;
+	buf->vb.vb2_buf.timestamp = ktime_get_ns();
+	if (vv->video_fmt.field == V4L2_FIELD_ALTERNATE)
+		buf->vb.field = vv->last_field;
+	else if (vv->video_fmt.field == V4L2_FIELD_ANY)
+		buf->vb.field = (vv->video_fmt.height > vv->standard->v_max_out / 2)
+			? V4L2_FIELD_INTERLACED
+			: V4L2_FIELD_BOTTOM;
+	else
+		buf->vb.field = vv->video_fmt.field;
+	buf->vb.sequence = vv->seqnr++;
+	vb2_buffer_done(&buf->vb.vb2_buf, state);
 }
 
 void saa7146_buffer_next(struct saa7146_dev *dev,
@@ -118,10 +111,10 @@ void saa7146_buffer_next(struct saa7146_dev *dev,
 	assert_spin_locked(&dev->slock);
 	if (!list_empty(&q->queue)) {
 		/* activate next one from queue */
-		buf = list_entry(q->queue.next,struct saa7146_buf,vb.queue);
-		list_del(&buf->vb.queue);
+		buf = list_entry(q->queue.next, struct saa7146_buf, list);
+		list_del(&buf->list);
 		if (!list_empty(&q->queue))
-			next = list_entry(q->queue.next,struct saa7146_buf, vb.queue);
+			next = list_entry(q->queue.next, struct saa7146_buf, list);
 		q->curr = buf;
 		DEB_INT("next buffer: buf:%p, prev:%p, next:%p\n",
 			buf, q->queue.prev, q->queue.next);
@@ -169,7 +162,7 @@ void saa7146_buffer_timeout(struct timer_list *t)
 	spin_lock_irqsave(&dev->slock,flags);
 	if (q->curr) {
 		DEB_D("timeout on %p\n", q->curr);
-		saa7146_buffer_finish(dev,q,VIDEOBUF_ERROR);
+		saa7146_buffer_finish(dev, q, VB2_BUF_STATE_ERROR);
 	}
 
 	/* we don't restart the transfer here like other drivers do. when
@@ -178,257 +171,39 @@ void saa7146_buffer_timeout(struct timer_list *t)
 	   we mess up our capture logic. if a timeout occurs on another buffer,
 	   then something is seriously broken before, so no need to buffer the
 	   next capture IMHO... */
-/*
-	saa7146_buffer_next(dev,q);
-*/
+
+	saa7146_buffer_next(dev, q, 0);
+
 	spin_unlock_irqrestore(&dev->slock,flags);
 }
 
 /********************************************************************************/
 /* file operations */
 
-static int fops_open(struct file *file)
-{
-	struct video_device *vdev = video_devdata(file);
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = NULL;
-	int result = 0;
-
-	DEB_EE("file:%p, dev:%s\n", file, video_device_node_name(vdev));
-
-	if (mutex_lock_interruptible(vdev->lock))
-		return -ERESTARTSYS;
-
-	DEB_D("using: %p\n", dev);
-
-	/* check if an extension is registered */
-	if( NULL == dev->ext ) {
-		DEB_S("no extension registered for this device\n");
-		result = -ENODEV;
-		goto out;
-	}
-
-	/* allocate per open data */
-	fh = kzalloc(sizeof(*fh),GFP_KERNEL);
-	if (NULL == fh) {
-		DEB_S("cannot allocate memory for per open data\n");
-		result = -ENOMEM;
-		goto out;
-	}
-
-	v4l2_fh_init(&fh->fh, vdev);
-
-	file->private_data = &fh->fh;
-
-	if (vdev->vfl_type == VFL_TYPE_VBI) {
-		DEB_S("initializing vbi...\n");
-		if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE)
-			result = saa7146_vbi_uops.open(dev,file);
-		if (dev->ext_vv_data->vbi_fops.open)
-			dev->ext_vv_data->vbi_fops.open(file);
-	} else {
-		DEB_S("initializing video...\n");
-		result = saa7146_video_uops.open(dev,file);
-	}
-
-	if (0 != result) {
-		goto out;
-	}
-
-	if( 0 == try_module_get(dev->ext->module)) {
-		result = -EINVAL;
-		goto out;
-	}
-
-	result = 0;
-	v4l2_fh_add(&fh->fh);
-out:
-	if (fh && result != 0) {
-		kfree(fh);
-		file->private_data = NULL;
-	}
-	mutex_unlock(vdev->lock);
-	return result;
-}
-
-static int fops_release(struct file *file)
-{
-	struct video_device *vdev = video_devdata(file);
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh  *fh  = file->private_data;
-
-	DEB_EE("file:%p\n", file);
-
-	mutex_lock(vdev->lock);
-
-	if (vdev->vfl_type == VFL_TYPE_VBI) {
-		if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE)
-			saa7146_vbi_uops.release(dev,file);
-		if (dev->ext_vv_data->vbi_fops.release)
-			dev->ext_vv_data->vbi_fops.release(file);
-	} else {
-		saa7146_video_uops.release(dev,file);
-	}
-
-	v4l2_fh_del(&fh->fh);
-	v4l2_fh_exit(&fh->fh);
-	module_put(dev->ext->module);
-	file->private_data = NULL;
-	kfree(fh);
-
-	mutex_unlock(vdev->lock);
-
-	return 0;
-}
-
-static int fops_mmap(struct file *file, struct vm_area_struct * vma)
-{
-	struct video_device *vdev = video_devdata(file);
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = file->private_data;
-	struct videobuf_queue *q;
-	int res;
-
-	switch (vdev->vfl_type) {
-	case VFL_TYPE_VIDEO: {
-		DEB_EE("V4L2_BUF_TYPE_VIDEO_CAPTURE: file:%p, vma:%p\n",
-		       file, vma);
-		q = &fh->video_q;
-		break;
-		}
-	case VFL_TYPE_VBI: {
-		DEB_EE("V4L2_BUF_TYPE_VBI_CAPTURE: file:%p, vma:%p\n",
-		       file, vma);
-		if (dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
-			return -ENODEV;
-		q = &fh->vbi_q;
-		break;
-		}
-	default:
-		BUG();
-	}
-
-	if (mutex_lock_interruptible(vdev->lock))
-		return -ERESTARTSYS;
-	res = videobuf_mmap_mapper(q, vma);
-	mutex_unlock(vdev->lock);
-	return res;
-}
-
-static __poll_t __fops_poll(struct file *file, struct poll_table_struct *wait)
-{
-	struct video_device *vdev = video_devdata(file);
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = file->private_data;
-	struct videobuf_buffer *buf = NULL;
-	struct videobuf_queue *q;
-	__poll_t res = v4l2_ctrl_poll(file, wait);
-
-	DEB_EE("file:%p, poll:%p\n", file, wait);
-
-	if (vdev->vfl_type == VFL_TYPE_VBI) {
-		if (dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
-			return res | EPOLLOUT | EPOLLWRNORM;
-		if( 0 == fh->vbi_q.streaming )
-			return res | videobuf_poll_stream(file, &fh->vbi_q, wait);
-		q = &fh->vbi_q;
-	} else {
-		DEB_D("using video queue\n");
-		q = &fh->video_q;
-	}
-
-	if (!list_empty(&q->stream))
-		buf = list_entry(q->stream.next, struct videobuf_buffer, stream);
-
-	if (!buf) {
-		DEB_D("buf == NULL!\n");
-		return res | EPOLLERR;
-	}
-
-	poll_wait(file, &buf->done, wait);
-	if (buf->state == VIDEOBUF_DONE || buf->state == VIDEOBUF_ERROR) {
-		DEB_D("poll succeeded!\n");
-		return res | EPOLLIN | EPOLLRDNORM;
-	}
-
-	DEB_D("nothing to poll for, buf->state:%d\n", buf->state);
-	return res;
-}
-
-static __poll_t fops_poll(struct file *file, struct poll_table_struct *wait)
-{
-	struct video_device *vdev = video_devdata(file);
-	__poll_t res;
-
-	mutex_lock(vdev->lock);
-	res = __fops_poll(file, wait);
-	mutex_unlock(vdev->lock);
-	return res;
-}
-
-static ssize_t fops_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
-{
-	struct video_device *vdev = video_devdata(file);
-	struct saa7146_dev *dev = video_drvdata(file);
-	int ret;
-
-	switch (vdev->vfl_type) {
-	case VFL_TYPE_VIDEO:
-/*
-		DEB_EE("V4L2_BUF_TYPE_VIDEO_CAPTURE: file:%p, data:%p, count:%lun",
-		       file, data, (unsigned long)count);
-*/
-		return saa7146_video_uops.read(file,data,count,ppos);
-	case VFL_TYPE_VBI:
-/*
-		DEB_EE("V4L2_BUF_TYPE_VBI_CAPTURE: file:%p, data:%p, count:%lu\n",
-		       file, data, (unsigned long)count);
-*/
-		if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE) {
-			if (mutex_lock_interruptible(vdev->lock))
-				return -ERESTARTSYS;
-			ret = saa7146_vbi_uops.read(file, data, count, ppos);
-			mutex_unlock(vdev->lock);
-			return ret;
-		}
-		return -EINVAL;
-	default:
-		BUG();
-	}
-}
-
 static ssize_t fops_write(struct file *file, const char __user *data, size_t count, loff_t *ppos)
 {
 	struct video_device *vdev = video_devdata(file);
 	struct saa7146_dev *dev = video_drvdata(file);
 	int ret;
 
-	switch (vdev->vfl_type) {
-	case VFL_TYPE_VIDEO:
+	if (vdev->vfl_type != VFL_TYPE_VBI || !dev->ext_vv_data->vbi_fops.write)
 		return -EINVAL;
-	case VFL_TYPE_VBI:
-		if (dev->ext_vv_data->vbi_fops.write) {
-			if (mutex_lock_interruptible(vdev->lock))
-				return -ERESTARTSYS;
-			ret = dev->ext_vv_data->vbi_fops.write(file, data, count, ppos);
-			mutex_unlock(vdev->lock);
-			return ret;
-		}
-		return -EINVAL;
-	default:
-		BUG();
-	}
+	if (mutex_lock_interruptible(vdev->lock))
+		return -ERESTARTSYS;
+	ret = dev->ext_vv_data->vbi_fops.write(file, data, count, ppos);
+	mutex_unlock(vdev->lock);
+	return ret;
 }
 
 static const struct v4l2_file_operations video_fops =
 {
 	.owner		= THIS_MODULE,
-	.open		= fops_open,
-	.release	= fops_release,
-	.read		= fops_read,
+	.open		= v4l2_fh_open,
+	.release	= vb2_fop_release,
+	.read		= vb2_fop_read,
 	.write		= fops_write,
-	.poll		= fops_poll,
-	.mmap		= fops_mmap,
+	.poll		= vb2_fop_poll,
+	.mmap		= vb2_fop_mmap,
 	.unlocked_ioctl	= video_ioctl2,
 };
 
@@ -568,16 +343,20 @@ EXPORT_SYMBOL_GPL(saa7146_vv_release);
 int saa7146_register_device(struct video_device *vfd, struct saa7146_dev *dev,
 			    char *name, int type)
 {
+	struct vb2_queue *q;
 	int err;
 	int i;
 
 	DEB_EE("dev:%p, name:'%s', type:%d\n", dev, name, type);
 
 	vfd->fops = &video_fops;
-	if (type == VFL_TYPE_VIDEO)
+	if (type == VFL_TYPE_VIDEO) {
 		vfd->ioctl_ops = &dev->ext_vv_data->vid_ops;
-	else
+		q = &dev->vv_data->video_dmaq.q;
+	} else {
 		vfd->ioctl_ops = &dev->ext_vv_data->vbi_ops;
+		q = &dev->vv_data->vbi_dmaq.q;
+	}
 	vfd->release = video_device_release_empty;
 	vfd->lock = &dev->v4l2_lock;
 	vfd->v4l2_dev = &dev->v4l2_dev;
@@ -598,6 +377,23 @@ int saa7146_register_device(struct video_device *vfd, struct saa7146_dev *dev,
 	} else {
 		vfd->device_caps &= ~V4L2_CAP_VIDEO_CAPTURE;
 	}
+
+	q->type = type == VFL_TYPE_VIDEO ? V4L2_BUF_TYPE_VIDEO_CAPTURE : V4L2_BUF_TYPE_VBI_CAPTURE;
+	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+	q->io_modes = VB2_MMAP | VB2_READ | VB2_DMABUF;
+	q->ops = type == VFL_TYPE_VIDEO ? &video_qops : &vbi_qops;
+	q->mem_ops = &vb2_dma_sg_memops;
+	q->drv_priv = dev;
+	q->gfp_flags = __GFP_DMA32;
+	q->buf_struct_size = sizeof(struct saa7146_buf);
+	q->lock = &dev->v4l2_lock;
+	q->min_buffers_needed = 2;
+	q->dev = &dev->pci->dev;
+	err = vb2_queue_init(q);
+	if (err)
+		return err;
+	vfd->queue = q;
+
 	video_set_drvdata(vfd, dev);
 
 	err = video_register_device(vfd, type, -1);
diff --git a/drivers/media/common/saa7146/saa7146_vbi.c b/drivers/media/common/saa7146/saa7146_vbi.c
index 1a6fb0f381b7..bb7d81f7eba6 100644
--- a/drivers/media/common/saa7146/saa7146_vbi.c
+++ b/drivers/media/common/saa7146/saa7146_vbi.c
@@ -207,7 +207,6 @@ static int buffer_activate(struct saa7146_dev *dev,
 			   struct saa7146_buf *next)
 {
 	struct saa7146_vv *vv = dev->vv_data;
-	buf->vb.state = VIDEOBUF_ACTIVE;
 
 	DEB_VBI("dev:%p, buf:%p, next:%p\n", dev, buf, next);
 	saa7146_set_vbi_capture(dev,buf,next);
@@ -216,111 +215,101 @@ static int buffer_activate(struct saa7146_dev *dev,
 	return 0;
 }
 
-static int buffer_prepare(struct videobuf_queue *q, struct videobuf_buffer *vb,enum v4l2_field field)
-{
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
+/* ------------------------------------------------------------------ */
 
-	int err = 0;
-	int lines, llength, size;
+static int queue_setup(struct vb2_queue *q,
+		       unsigned int *num_buffers, unsigned int *num_planes,
+		       unsigned int sizes[], struct device *alloc_devs[])
+{
+	unsigned int size = 16 * 2 * vbi_pixel_to_capture;
 
-	lines   = 16 * 2 ; /* 2 fields */
-	llength = vbi_pixel_to_capture;
-	size = lines * llength;
+	if (*num_planes)
+		return sizes[0] < size ? -EINVAL : 0;
+	*num_planes = 1;
+	sizes[0] = size;
 
-	DEB_VBI("vb:%p\n", vb);
+	return 0;
+}
 
-	if (0 != buf->vb.baddr  &&  buf->vb.bsize < size) {
-		DEB_VBI("size mismatch\n");
-		return -EINVAL;
-	}
+static void buf_queue(struct vb2_buffer *vb)
+{
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	unsigned long flags;
 
-	if (buf->vb.size != size)
-		saa7146_dma_free(dev,q,buf);
+	spin_lock_irqsave(&dev->slock, flags);
 
-	if (VIDEOBUF_NEEDS_INIT == buf->vb.state) {
-		struct videobuf_dmabuf *dma=videobuf_to_dma(&buf->vb);
+	saa7146_buffer_queue(dev, &dev->vv_data->vbi_dmaq, buf);
+	spin_unlock_irqrestore(&dev->slock, flags);
+}
 
-		buf->vb.width  = llength;
-		buf->vb.height = lines;
-		buf->vb.size   = size;
-		buf->vb.field  = field;	// FIXME: check this
+static int buf_init(struct vb2_buffer *vb)
+{
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	struct sg_table *sgt = vb2_dma_sg_plane_desc(&buf->vb.vb2_buf, 0);
+	struct scatterlist *list = sgt->sgl;
+	int length = sgt->nents;
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
+	int ret;
 
-		saa7146_pgtable_free(dev->pci, &buf->pt[2]);
-		saa7146_pgtable_alloc(dev->pci, &buf->pt[2]);
-
-		err = videobuf_iolock(q,&buf->vb, NULL);
-		if (err)
-			goto oops;
-		err = saa7146_pgtable_build_single(dev->pci, &buf->pt[2],
-						 dma->sglist, dma->sglen);
-		if (0 != err)
-			return err;
-	}
-	buf->vb.state = VIDEOBUF_PREPARED;
 	buf->activate = buffer_activate;
 
-	return 0;
-
- oops:
-	DEB_VBI("error out\n");
-	saa7146_dma_free(dev,q,buf);
+	saa7146_pgtable_alloc(dev->pci, &buf->pt[2]);
 
-	return err;
+	ret = saa7146_pgtable_build_single(dev->pci, &buf->pt[2],
+					   list, length);
+	if (ret)
+		saa7146_pgtable_free(dev->pci, &buf->pt[2]);
+	return ret;
 }
 
-static int buffer_setup(struct videobuf_queue *q, unsigned int *count, unsigned int *size)
+static int buf_prepare(struct vb2_buffer *vb)
 {
-	int llength,lines;
-
-	lines   = 16 * 2 ; /* 2 fields */
-	llength = vbi_pixel_to_capture;
-
-	*size = lines * llength;
-	*count = 2;
-
-	DEB_VBI("count:%d, size:%d\n", *count, *size);
+	unsigned int size = 16 * 2 * vbi_pixel_to_capture;
 
+	if (vb2_plane_size(vb, 0) < size)
+		return -EINVAL;
+	vb2_set_plane_payload(vb, 0, size);
 	return 0;
 }
 
-static void buffer_queue(struct videobuf_queue *q, struct videobuf_buffer *vb)
+static void buf_cleanup(struct vb2_buffer *vb)
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
 
-	DEB_VBI("vb:%p\n", vb);
-	saa7146_buffer_queue(dev, &vv->vbi_dmaq, buf);
+	saa7146_pgtable_free(dev->pci, &buf->pt[2]);
 }
 
-static void buffer_release(struct videobuf_queue *q, struct videobuf_buffer *vb)
+static void return_buffers(struct vb2_queue *q, int state)
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
-
-	DEB_VBI("vb:%p\n", vb);
-	saa7146_dma_free(dev,q,buf);
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	struct saa7146_dmaqueue *dq = &dev->vv_data->vbi_dmaq;
+	struct saa7146_buf *buf;
+
+	if (dq->curr) {
+		buf = dq->curr;
+		dq->curr = NULL;
+		vb2_buffer_done(&buf->vb.vb2_buf, state);
+	}
+	while (!list_empty(&dq->queue)) {
+		buf = list_entry(dq->queue.next, struct saa7146_buf, list);
+		list_del(&buf->list);
+		vb2_buffer_done(&buf->vb.vb2_buf, state);
+	}
 }
 
-static const struct videobuf_queue_ops vbi_qops = {
-	.buf_setup    = buffer_setup,
-	.buf_prepare  = buffer_prepare,
-	.buf_queue    = buffer_queue,
-	.buf_release  = buffer_release,
-};
-
-/* ------------------------------------------------------------------ */
-
-static void vbi_stop(struct saa7146_fh *fh, struct file *file)
+static void vbi_stop(struct saa7146_dev *dev)
 {
-	struct saa7146_dev *dev = video_drvdata(file);
 	struct saa7146_vv *vv = dev->vv_data;
 	unsigned long flags;
-	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
+	DEB_VBI("dev:%p\n", dev);
 
 	spin_lock_irqsave(&dev->slock,flags);
 
@@ -333,13 +322,6 @@ static void vbi_stop(struct saa7146_fh *fh, struct file *file)
 	/* shut down dma 3 transfers */
 	saa7146_write(dev, MC1, MASK_20);
 
-	if (vv->vbi_dmaq.curr)
-		saa7146_buffer_finish(dev, &vv->vbi_dmaq, VIDEOBUF_DONE);
-
-	videobuf_queue_cancel(&fh->vbi_q);
-
-	vv->vbi_streaming = NULL;
-
 	del_timer(&vv->vbi_dmaq.timeout);
 	del_timer(&vv->vbi_read_timeout);
 
@@ -349,36 +331,20 @@ static void vbi_stop(struct saa7146_fh *fh, struct file *file)
 static void vbi_read_timeout(struct timer_list *t)
 {
 	struct saa7146_vv *vv = from_timer(vv, t, vbi_read_timeout);
-	struct file *file = vv->vbi_read_timeout_file;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = file->private_data;
-
-	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
-
-	vbi_stop(fh, file);
-}
+	struct saa7146_dev *dev = vv->vbi_dmaq.dev;
 
-static void vbi_init(struct saa7146_dev *dev, struct saa7146_vv *vv)
-{
 	DEB_VBI("dev:%p\n", dev);
 
-	INIT_LIST_HEAD(&vv->vbi_dmaq.queue);
-
-	timer_setup(&vv->vbi_dmaq.timeout, saa7146_buffer_timeout, 0);
-	vv->vbi_dmaq.dev              = dev;
-
-	init_waitqueue_head(&vv->vbi_wq);
+	vbi_stop(dev);
 }
 
-static int vbi_open(struct saa7146_dev *dev, struct file *file)
+static int vbi_begin(struct saa7146_dev *dev)
 {
-	struct saa7146_fh *fh = file->private_data;
 	struct saa7146_vv *vv = dev->vv_data;
-
 	u32 arbtr_ctrl	= saa7146_read(dev, PCI_BT_V1);
 	int ret = 0;
 
-	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
+	DEB_VBI("dev:%p\n", dev);
 
 	ret = saa7146_res_get(dev, RESOURCE_DMA3_BRS);
 	if (0 == ret) {
@@ -392,15 +358,7 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file)
 	saa7146_write(dev, PCI_BT_V1, arbtr_ctrl);
 	saa7146_write(dev, MC2, (MASK_04|MASK_20));
 
-	videobuf_queue_sg_init(&fh->vbi_q, &vbi_qops,
-			    &dev->pci->dev, &dev->slock,
-			    V4L2_BUF_TYPE_VBI_CAPTURE,
-			    V4L2_FIELD_SEQ_TB, // FIXME: does this really work?
-			    sizeof(struct saa7146_buf),
-			    file, &dev->v4l2_lock);
-
 	vv->vbi_read_timeout.function = vbi_read_timeout;
-	vv->vbi_read_timeout_file = file;
 
 	/* initialize the brs */
 	if ( 0 != (SAA7146_USE_PORT_B_FOR_VBI & dev->ext_vv_data->flags)) {
@@ -419,18 +377,54 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file)
 	return 0;
 }
 
-static void vbi_close(struct saa7146_dev *dev, struct file *file)
+static int start_streaming(struct vb2_queue *q, unsigned int count)
 {
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_vv *vv = dev->vv_data;
-	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	int ret;
+
+	if (!vb2_is_streaming(&dev->vv_data->vbi_dmaq.q))
+		dev->vv_data->seqnr = 0;
+	ret = vbi_begin(dev);
+	if (ret)
+		return_buffers(q, VB2_BUF_STATE_QUEUED);
+	return ret;
+}
 
-	if( fh == vv->vbi_streaming ) {
-		vbi_stop(fh, file);
-	}
+static void stop_streaming(struct vb2_queue *q)
+{
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+
+	vbi_stop(dev);
+	return_buffers(q, VB2_BUF_STATE_ERROR);
 	saa7146_res_free(dev, RESOURCE_DMA3_BRS);
 }
 
+const struct vb2_ops vbi_qops = {
+	.queue_setup	= queue_setup,
+	.buf_queue	= buf_queue,
+	.buf_init	= buf_init,
+	.buf_prepare	= buf_prepare,
+	.buf_cleanup	= buf_cleanup,
+	.start_streaming = start_streaming,
+	.stop_streaming = stop_streaming,
+	.wait_prepare	= vb2_ops_wait_prepare,
+	.wait_finish	= vb2_ops_wait_finish,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void vbi_init(struct saa7146_dev *dev, struct saa7146_vv *vv)
+{
+	DEB_VBI("dev:%p\n", dev);
+
+	INIT_LIST_HEAD(&vv->vbi_dmaq.queue);
+
+	timer_setup(&vv->vbi_dmaq.timeout, saa7146_buffer_timeout, 0);
+	vv->vbi_dmaq.dev              = dev;
+
+	init_waitqueue_head(&vv->vbi_wq);
+}
+
 static void vbi_irq_done(struct saa7146_dev *dev, unsigned long status)
 {
 	struct saa7146_vv *vv = dev->vv_data;
@@ -438,10 +432,7 @@ static void vbi_irq_done(struct saa7146_dev *dev, unsigned long status)
 
 	if (vv->vbi_dmaq.curr) {
 		DEB_VBI("dev:%p, curr:%p\n", dev, vv->vbi_dmaq.curr);
-		/* this must be += 2, one count for each field */
-		vv->vbi_fieldcount+=2;
-		vv->vbi_dmaq.curr->vb.field_count = vv->vbi_fieldcount;
-		saa7146_buffer_finish(dev, &vv->vbi_dmaq, VIDEOBUF_DONE);
+		saa7146_buffer_finish(dev, &vv->vbi_dmaq, VB2_BUF_STATE_DONE);
 	} else {
 		DEB_VBI("dev:%p\n", dev);
 	}
@@ -450,46 +441,7 @@ static void vbi_irq_done(struct saa7146_dev *dev, unsigned long status)
 	spin_unlock(&dev->slock);
 }
 
-static ssize_t vbi_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
-{
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_vv *vv = dev->vv_data;
-	ssize_t ret = 0;
-
-	DEB_VBI("dev:%p, fh:%p\n", dev, fh);
-
-	if( NULL == vv->vbi_streaming ) {
-		// fixme: check if dma3 is available
-		// fixme: activate vbi engine here if necessary. (really?)
-		vv->vbi_streaming = fh;
-	}
-
-	if( fh != vv->vbi_streaming ) {
-		DEB_VBI("open %p is already using vbi capture\n",
-			vv->vbi_streaming);
-		return -EBUSY;
-	}
-
-	mod_timer(&vv->vbi_read_timeout, jiffies+BUFFER_TIMEOUT);
-	ret = videobuf_read_stream(&fh->vbi_q, data, count, ppos, 1,
-				   file->f_flags & O_NONBLOCK);
-/*
-	printk("BASE_ODD3:      0x%08x\n", saa7146_read(dev, BASE_ODD3));
-	printk("BASE_EVEN3:     0x%08x\n", saa7146_read(dev, BASE_EVEN3));
-	printk("PROT_ADDR3:     0x%08x\n", saa7146_read(dev, PROT_ADDR3));
-	printk("PITCH3:         0x%08x\n", saa7146_read(dev, PITCH3));
-	printk("BASE_PAGE3:     0x%08x\n", saa7146_read(dev, BASE_PAGE3));
-	printk("NUM_LINE_BYTE3: 0x%08x\n", saa7146_read(dev, NUM_LINE_BYTE3));
-	printk("BRS_CTRL:       0x%08x\n", saa7146_read(dev, BRS_CTRL));
-*/
-	return ret;
-}
-
 const struct saa7146_use_ops saa7146_vbi_uops = {
 	.init		= vbi_init,
-	.open		= vbi_open,
-	.release	= vbi_close,
 	.irq_done	= vbi_irq_done,
-	.read		= vbi_read,
 };
diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c
index 6af30fece176..5d580b5e61fe 100644
--- a/drivers/media/common/saa7146/saa7146_video.c
+++ b/drivers/media/common/saa7146/saa7146_video.c
@@ -6,14 +6,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 
-static int max_memory = 32;
-
-module_param(max_memory, int, 0644);
-MODULE_PARM_DESC(max_memory, "maximum memory usage for capture buffers (default: 32Mb)");
-
-#define IS_CAPTURE_ACTIVE(fh) \
-	(((vv->video_status & STATUS_CAPTURE) != 0) && (vv->video_fh == fh))
-
 /* format descriptions for capture and preview */
 static struct saa7146_format formats[] = {
 	{
@@ -95,9 +87,9 @@ static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *bu
 {
 	struct saa7146_vv *vv = dev->vv_data;
 	struct pci_dev *pci = dev->pci;
-	struct videobuf_dmabuf *dma=videobuf_to_dma(&buf->vb);
-	struct scatterlist *list = dma->sglist;
-	int length = dma->sglen;
+	struct sg_table *sgt = vb2_dma_sg_plane_desc(&buf->vb.vb2_buf, 0);
+	struct scatterlist *list = sgt->sgl;
+	int length = sgt->nents;
 	struct v4l2_pix_format *pix = &vv->video_fmt;
 	struct saa7146_format *sfmt = saa7146_format_by_fourcc(dev, pix->pixelformat);
 
@@ -151,7 +143,7 @@ static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *bu
 
 		/* if we have a user buffer, the first page may not be
 		   aligned to a page boundary. */
-		pt1->offset = dma->sglist->offset;
+		pt1->offset = sgt->sgl->offset;
 		pt2->offset = pt1->offset + o1;
 		pt3->offset = pt1->offset + o2;
 
@@ -187,23 +179,14 @@ static int saa7146_pgtable_build(struct saa7146_dev *dev, struct saa7146_buf *bu
 /********************************************************************************/
 /* file operations */
 
-static int video_begin(struct saa7146_dev *dev, struct saa7146_fh *fh)
+static int video_begin(struct saa7146_dev *dev)
 {
 	struct saa7146_vv *vv = dev->vv_data;
 	struct saa7146_format *fmt = NULL;
 	unsigned int resource;
 	int ret = 0;
 
-	DEB_EE("dev:%p, fh:%p\n", dev, fh);
-
-	if ((vv->video_status & STATUS_CAPTURE) != 0) {
-		if (vv->video_fh == fh) {
-			DEB_S("already capturing\n");
-			return 0;
-		}
-		DEB_S("already capturing in another open\n");
-		return -EBUSY;
-	}
+	DEB_EE("dev:%p\n", dev);
 
 	fmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
 	/* we need to have a valid format set here */
@@ -228,36 +211,22 @@ static int video_begin(struct saa7146_dev *dev, struct saa7146_fh *fh)
 	/* enable rps0 irqs */
 	SAA7146_IER_ENABLE(dev, MASK_27);
 
-	vv->video_fh = fh;
-	vv->video_status = STATUS_CAPTURE;
-
 	return 0;
 }
 
-static int video_end(struct saa7146_dev *dev, struct saa7146_fh *fh)
+static void video_end(struct saa7146_dev *dev)
 {
 	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_dmaqueue *q = &vv->video_dmaq;
 	struct saa7146_format *fmt = NULL;
 	unsigned long flags;
 	unsigned int resource;
 	u32 dmas = 0;
-	DEB_EE("dev:%p, fh:%p\n", dev, fh);
-
-	if ((vv->video_status & STATUS_CAPTURE) != STATUS_CAPTURE) {
-		DEB_S("not capturing\n");
-		return 0;
-	}
-
-	if (vv->video_fh != fh) {
-		DEB_S("capturing, but in another open\n");
-		return -EBUSY;
-	}
+	DEB_EE("dev:%p\n", dev);
 
 	fmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
 	/* we need to have a valid format set here */
 	if (!fmt)
-		return -EINVAL;
+		return;
 
 	if (0 != (fmt->flags & FORMAT_IS_PLANAR)) {
 		resource = RESOURCE_DMA1_HPS|RESOURCE_DMA2_CLP|RESOURCE_DMA3_BRS;
@@ -277,17 +246,9 @@ static int video_end(struct saa7146_dev *dev, struct saa7146_fh *fh)
 	/* shut down all used video dma transfers */
 	saa7146_write(dev, MC1, dmas);
 
-	if (q->curr)
-		saa7146_buffer_finish(dev, q, VIDEOBUF_DONE);
-
 	spin_unlock_irqrestore(&dev->slock, flags);
 
-	vv->video_fh = NULL;
-	vv->video_status = 0;
-
 	saa7146_res_free(dev, resource);
-
-	return 0;
 }
 
 static int vidioc_querycap(struct file *file, void *fh, struct v4l2_capability *cap)
@@ -345,13 +306,13 @@ int saa7146_s_ctrl(struct v4l2_ctrl *ctrl)
 
 	case V4L2_CID_HFLIP:
 		/* fixme: we can support changing VFLIP and HFLIP here... */
-		if ((vv->video_status & STATUS_CAPTURE))
+		if (vb2_is_busy(&vv->video_dmaq.q))
 			return -EBUSY;
 		vv->hflip = ctrl->val;
 		break;
 
 	case V4L2_CID_VFLIP:
-		if ((vv->video_status & STATUS_CAPTURE))
+		if (vb2_is_busy(&vv->video_dmaq.q))
 			return -EBUSY;
 		vv->vflip = ctrl->val;
 		break;
@@ -459,15 +420,14 @@ static int vidioc_try_fmt_vid_cap(struct file *file, void *fh, struct v4l2_forma
 	return 0;
 }
 
-static int vidioc_s_fmt_vid_cap(struct file *file, void *__fh, struct v4l2_format *f)
+static int vidioc_s_fmt_vid_cap(struct file *file, void *fh, struct v4l2_format *f)
 {
 	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = __fh;
 	struct saa7146_vv *vv = dev->vv_data;
 	int err;
 
-	DEB_EE("V4L2_BUF_TYPE_VIDEO_CAPTURE: dev:%p, fh:%p\n", dev, fh);
-	if (IS_CAPTURE_ACTIVE(fh) != 0) {
+	DEB_EE("V4L2_BUF_TYPE_VIDEO_CAPTURE: dev:%p\n", dev);
+	if (vb2_is_busy(&vv->video_dmaq.q)) {
 		DEB_EE("streaming capture is active\n");
 		return -EBUSY;
 	}
@@ -489,24 +449,6 @@ static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
 	return 0;
 }
 
-	/* the saa7146 supfhrts (used in conjunction with the saa7111a for example)
-	   PAL / NTSC / SECAM. if your hardware does not (or does more)
-	   -- override this function in your extension */
-/*
-	case VIDIOC_ENUMSTD:
-	{
-		struct v4l2_standard *e = arg;
-		if (e->index < 0 )
-			return -EINVAL;
-		if( e->index < dev->ext_vv_data->num_stds ) {
-			DEB_EE("VIDIOC_ENUMSTD: index:%d\n", e->index);
-			v4l2_video_std_construct(e, dev->ext_vv_data->stds[e->index].id, dev->ext_vv_data->stds[e->index].name);
-			return 0;
-		}
-		return -EINVAL;
-	}
-	*/
-
 static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 {
 	struct saa7146_dev *dev = video_drvdata(file);
@@ -516,7 +458,7 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 
 	DEB_EE("VIDIOC_S_STD\n");
 
-	if ((vv->video_status & STATUS_CAPTURE) == STATUS_CAPTURE) {
+	if (vb2_is_busy(&vv->video_dmaq.q) || vb2_is_busy(&vv->vbi_dmaq.q)) {
 		DEB_D("cannot change video standard while streaming capture is active\n");
 		return -EBUSY;
 	}
@@ -540,120 +482,22 @@ static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id id)
 	return 0;
 }
 
-static int vidioc_reqbufs(struct file *file, void *__fh, struct v4l2_requestbuffers *b)
-{
-	struct saa7146_fh *fh = __fh;
-
-	if (b->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		return videobuf_reqbufs(&fh->video_q, b);
-	if (b->type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		return videobuf_reqbufs(&fh->vbi_q, b);
-	return -EINVAL;
-}
-
-static int vidioc_querybuf(struct file *file, void *__fh, struct v4l2_buffer *buf)
-{
-	struct saa7146_fh *fh = __fh;
-
-	if (buf->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		return videobuf_querybuf(&fh->video_q, buf);
-	if (buf->type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		return videobuf_querybuf(&fh->vbi_q, buf);
-	return -EINVAL;
-}
-
-static int vidioc_qbuf(struct file *file, void *__fh, struct v4l2_buffer *buf)
-{
-	struct saa7146_fh *fh = __fh;
-
-	if (buf->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		return videobuf_qbuf(&fh->video_q, buf);
-	if (buf->type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		return videobuf_qbuf(&fh->vbi_q, buf);
-	return -EINVAL;
-}
-
-static int vidioc_dqbuf(struct file *file, void *__fh, struct v4l2_buffer *buf)
-{
-	struct saa7146_fh *fh = __fh;
-
-	if (buf->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		return videobuf_dqbuf(&fh->video_q, buf, file->f_flags & O_NONBLOCK);
-	if (buf->type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		return videobuf_dqbuf(&fh->vbi_q, buf, file->f_flags & O_NONBLOCK);
-	return -EINVAL;
-}
-
-static int vidioc_streamon(struct file *file, void *__fh, enum v4l2_buf_type type)
-{
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = __fh;
-	int err;
-
-	DEB_D("VIDIOC_STREAMON, type:%d\n", type);
-
-	err = video_begin(dev, fh);
-	if (err)
-		return err;
-	if (type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		return videobuf_streamon(&fh->video_q);
-	if (type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		return videobuf_streamon(&fh->vbi_q);
-	return -EINVAL;
-}
-
-static int vidioc_streamoff(struct file *file, void *__fh, enum v4l2_buf_type type)
-{
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = __fh;
-	struct saa7146_vv *vv = dev->vv_data;
-	int err;
-
-	DEB_D("VIDIOC_STREAMOFF, type:%d\n", type);
-
-	/* ugly: we need to copy some checks from video_end(),
-	   because videobuf_streamoff() relies on the capture running.
-	   check and fix this */
-	if ((vv->video_status & STATUS_CAPTURE) != STATUS_CAPTURE) {
-		DEB_S("not capturing\n");
-		return 0;
-	}
-
-	if (vv->video_fh != fh) {
-		DEB_S("capturing, but in another open\n");
-		return -EBUSY;
-	}
-
-	err = -EINVAL;
-	if (type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		err = videobuf_streamoff(&fh->video_q);
-	else if (type == V4L2_BUF_TYPE_VBI_CAPTURE)
-		err = videobuf_streamoff(&fh->vbi_q);
-	if (0 != err) {
-		DEB_D("warning: videobuf_streamoff() failed\n");
-		video_end(dev, fh);
-	} else {
-		err = video_end(dev, fh);
-	}
-	return err;
-}
-
 const struct v4l2_ioctl_ops saa7146_video_ioctl_ops = {
 	.vidioc_querycap             = vidioc_querycap,
 	.vidioc_enum_fmt_vid_cap     = vidioc_enum_fmt_vid_cap,
 	.vidioc_g_fmt_vid_cap        = vidioc_g_fmt_vid_cap,
 	.vidioc_try_fmt_vid_cap      = vidioc_try_fmt_vid_cap,
 	.vidioc_s_fmt_vid_cap        = vidioc_s_fmt_vid_cap,
-
-	.vidioc_reqbufs              = vidioc_reqbufs,
-	.vidioc_querybuf             = vidioc_querybuf,
-	.vidioc_qbuf                 = vidioc_qbuf,
-	.vidioc_dqbuf                = vidioc_dqbuf,
 	.vidioc_g_std                = vidioc_g_std,
 	.vidioc_s_std                = vidioc_s_std,
-	.vidioc_streamon             = vidioc_streamon,
-	.vidioc_streamoff            = vidioc_streamoff,
 	.vidioc_g_parm		     = vidioc_g_parm,
+	.vidioc_reqbufs		     = vb2_ioctl_reqbufs,
+	.vidioc_create_bufs	     = vb2_ioctl_create_bufs,
+	.vidioc_querybuf	     = vb2_ioctl_querybuf,
+	.vidioc_qbuf		     = vb2_ioctl_qbuf,
+	.vidioc_dqbuf		     = vb2_ioctl_dqbuf,
+	.vidioc_streamon	     = vb2_ioctl_streamon,
+	.vidioc_streamoff	     = vb2_ioctl_streamoff,
 	.vidioc_subscribe_event      = v4l2_ctrl_subscribe_event,
 	.vidioc_unsubscribe_event    = v4l2_event_unsubscribe,
 };
@@ -661,16 +505,17 @@ const struct v4l2_ioctl_ops saa7146_video_ioctl_ops = {
 const struct v4l2_ioctl_ops saa7146_vbi_ioctl_ops = {
 	.vidioc_querycap             = vidioc_querycap,
 	.vidioc_g_fmt_vbi_cap        = vidioc_g_fmt_vbi_cap,
-
-	.vidioc_reqbufs              = vidioc_reqbufs,
-	.vidioc_querybuf             = vidioc_querybuf,
-	.vidioc_qbuf                 = vidioc_qbuf,
-	.vidioc_dqbuf                = vidioc_dqbuf,
+	.vidioc_s_fmt_vbi_cap        = vidioc_g_fmt_vbi_cap,
 	.vidioc_g_std                = vidioc_g_std,
 	.vidioc_s_std                = vidioc_s_std,
-	.vidioc_streamon             = vidioc_streamon,
-	.vidioc_streamoff            = vidioc_streamoff,
 	.vidioc_g_parm		     = vidioc_g_parm,
+	.vidioc_reqbufs		     = vb2_ioctl_reqbufs,
+	.vidioc_create_bufs	     = vb2_ioctl_create_bufs,
+	.vidioc_querybuf	     = vb2_ioctl_querybuf,
+	.vidioc_qbuf		     = vb2_ioctl_qbuf,
+	.vidioc_dqbuf		     = vb2_ioctl_dqbuf,
+	.vidioc_streamon	     = vb2_ioctl_streamon,
+	.vidioc_streamoff	     = vb2_ioctl_streamoff,
 	.vidioc_subscribe_event      = v4l2_ctrl_subscribe_event,
 	.vidioc_unsubscribe_event    = v4l2_event_unsubscribe,
 };
@@ -684,7 +529,6 @@ static int buffer_activate (struct saa7146_dev *dev,
 {
 	struct saa7146_vv *vv = dev->vv_data;
 
-	buf->vb.state = VIDEOBUF_ACTIVE;
 	saa7146_set_capture(dev,buf,next);
 
 	mod_timer(&vv->video_dmaq.timeout, jiffies+BUFFER_TIMEOUT);
@@ -698,135 +542,136 @@ static void release_all_pagetables(struct saa7146_dev *dev, struct saa7146_buf *
 	saa7146_pgtable_free(dev->pci, &buf->pt[2]);
 }
 
-static int buffer_prepare(struct videobuf_queue *q,
-			  struct videobuf_buffer *vb, enum v4l2_field field)
+static int queue_setup(struct vb2_queue *q,
+		       unsigned int *num_buffers, unsigned int *num_planes,
+		       unsigned int sizes[], struct device *alloc_devs[])
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
-	int size,err = 0;
-
-	DEB_CAP("vbuf:%p\n", vb);
-
-	/* sanity checks */
-	if (vv->video_fmt.width  < 48 ||
-	    vv->video_fmt.height < 32 ||
-	    vv->video_fmt.width  > vv->standard->h_max_out ||
-	    vv->video_fmt.height > vv->standard->v_max_out) {
-		DEB_D("w (%d) / h (%d) out of bounds\n",
-		      vv->video_fmt.width, vv->video_fmt.height);
-		return -EINVAL;
-	}
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	unsigned int size = dev->vv_data->video_fmt.sizeimage;
 
-	size = vv->video_fmt.sizeimage;
-	if (0 != buf->vb.baddr && buf->vb.bsize < size) {
-		DEB_D("size mismatch\n");
-		return -EINVAL;
-	}
+	if (*num_planes)
+		return sizes[0] < size ? -EINVAL : 0;
+	*num_planes = 1;
+	sizes[0] = size;
 
-	DEB_CAP("buffer_prepare [size=%dx%d,bytes=%d,fields=%s]\n",
-		vv->video_fmt.width, vv->video_fmt.height,
-		size, v4l2_field_names[vv->video_fmt.field]);
-	if (buf->vb.width  != vv->video_fmt.width  ||
-	    buf->vb.bytesperline != vv->video_fmt.bytesperline ||
-	    buf->vb.height != vv->video_fmt.height ||
-	    buf->vb.size   != size ||
-	    buf->vb.field  != field      ||
-	    buf->vb.field  != vv->video_fmt.field) {
-		saa7146_dma_free(dev,q,buf);
-	}
+	return 0;
+}
 
-	if (VIDEOBUF_NEEDS_INIT == buf->vb.state) {
-		struct saa7146_format *sfmt;
+static void buf_queue(struct vb2_buffer *vb)
+{
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	unsigned long flags;
 
-		buf->vb.bytesperline  = vv->video_fmt.bytesperline;
-		buf->vb.width  = vv->video_fmt.width;
-		buf->vb.height = vv->video_fmt.height;
-		buf->vb.size   = size;
-		buf->vb.field  = field;
-		buf->vb.field  = vv->video_fmt.field;
+	spin_lock_irqsave(&dev->slock, flags);
 
-		sfmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
+	saa7146_buffer_queue(dev, &dev->vv_data->video_dmaq, buf);
+	spin_unlock_irqrestore(&dev->slock, flags);
+}
 
-		release_all_pagetables(dev, buf);
-		if( 0 != IS_PLANAR(sfmt->trans)) {
-			saa7146_pgtable_alloc(dev->pci, &buf->pt[0]);
-			saa7146_pgtable_alloc(dev->pci, &buf->pt[1]);
-			saa7146_pgtable_alloc(dev->pci, &buf->pt[2]);
-		} else {
-			saa7146_pgtable_alloc(dev->pci, &buf->pt[0]);
-		}
+static int buf_init(struct vb2_buffer *vb)
+{
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
+	struct saa7146_vv *vv = dev->vv_data;
+	struct saa7146_format *sfmt;
+	int ret;
 
-		err = videobuf_iolock(q, &buf->vb, NULL);
-		if (err)
-			goto oops;
-		err = saa7146_pgtable_build(dev,buf);
-		if (err)
-			goto oops;
-	}
-	buf->vb.state = VIDEOBUF_PREPARED;
 	buf->activate = buffer_activate;
+	sfmt = saa7146_format_by_fourcc(dev, vv->video_fmt.pixelformat);
 
-	return 0;
-
- oops:
-	DEB_D("error out\n");
-	saa7146_dma_free(dev,q,buf);
+	if (IS_PLANAR(sfmt->trans)) {
+		saa7146_pgtable_alloc(dev->pci, &buf->pt[0]);
+		saa7146_pgtable_alloc(dev->pci, &buf->pt[1]);
+		saa7146_pgtable_alloc(dev->pci, &buf->pt[2]);
+	} else {
+		saa7146_pgtable_alloc(dev->pci, &buf->pt[0]);
+	}
 
-	return err;
+	ret = saa7146_pgtable_build(dev, buf);
+	if (ret)
+		release_all_pagetables(dev, buf);
+	return ret;
 }
 
-static int buffer_setup(struct videobuf_queue *q, unsigned int *count, unsigned int *size)
+static int buf_prepare(struct vb2_buffer *vb)
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
 	struct saa7146_vv *vv = dev->vv_data;
+	unsigned int size = vv->video_fmt.sizeimage;
 
-	if (0 == *count || *count > MAX_SAA7146_CAPTURE_BUFFERS)
-		*count = MAX_SAA7146_CAPTURE_BUFFERS;
-
-	*size = vv->video_fmt.sizeimage;
-
-	/* check if we exceed the "max_memory" parameter */
-	if( (*count * *size) > (max_memory*1048576) ) {
-		*count = (max_memory*1048576) / *size;
-	}
-
-	DEB_CAP("%d buffers, %d bytes each\n", *count, *size);
-
+	if (vb2_plane_size(vb, 0) < size)
+		return -EINVAL;
+	vb2_set_plane_payload(vb, 0, size);
 	return 0;
 }
 
-static void buffer_queue(struct videobuf_queue *q, struct videobuf_buffer *vb)
+static void buf_cleanup(struct vb2_buffer *vb)
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_vv *vv = dev->vv_data;
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
+	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+	struct saa7146_buf *buf = container_of(vbuf, struct saa7146_buf, vb);
+	struct vb2_queue *vq = vb->vb2_queue;
+	struct saa7146_dev *dev = vb2_get_drv_priv(vq);
 
-	DEB_CAP("vbuf:%p\n", vb);
-	saa7146_buffer_queue(dev, &vv->video_dmaq, buf);
+	release_all_pagetables(dev, buf);
 }
 
-static void buffer_release(struct videobuf_queue *q, struct videobuf_buffer *vb)
+static void return_buffers(struct vb2_queue *q, int state)
 {
-	struct file *file = q->priv_data;
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_buf *buf = (struct saa7146_buf *)vb;
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	struct saa7146_dmaqueue *dq = &dev->vv_data->video_dmaq;
+	struct saa7146_buf *buf;
+
+	if (dq->curr) {
+		buf = dq->curr;
+		dq->curr = NULL;
+		vb2_buffer_done(&buf->vb.vb2_buf, state);
+	}
+	while (!list_empty(&dq->queue)) {
+		buf = list_entry(dq->queue.next, struct saa7146_buf, list);
+		list_del(&buf->list);
+		vb2_buffer_done(&buf->vb.vb2_buf, state);
+	}
+}
 
-	DEB_CAP("vbuf:%p\n", vb);
+static int start_streaming(struct vb2_queue *q, unsigned int count)
+{
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	int ret;
+
+	if (!vb2_is_streaming(&dev->vv_data->video_dmaq.q))
+		dev->vv_data->seqnr = 0;
+	ret = video_begin(dev);
+	if (ret)
+		return_buffers(q, VB2_BUF_STATE_QUEUED);
+	return ret;
+}
 
-	saa7146_dma_free(dev,q,buf);
+static void stop_streaming(struct vb2_queue *q)
+{
+	struct saa7146_dev *dev = vb2_get_drv_priv(q);
+	struct saa7146_dmaqueue *dq = &dev->vv_data->video_dmaq;
 
-	release_all_pagetables(dev, buf);
+	del_timer(&dq->timeout);
+	video_end(dev);
+	return_buffers(q, VB2_BUF_STATE_ERROR);
 }
 
-static const struct videobuf_queue_ops video_qops = {
-	.buf_setup    = buffer_setup,
-	.buf_prepare  = buffer_prepare,
-	.buf_queue    = buffer_queue,
-	.buf_release  = buffer_release,
+const struct vb2_ops video_qops = {
+	.queue_setup	= queue_setup,
+	.buf_queue	= buf_queue,
+	.buf_init	= buf_init,
+	.buf_prepare	= buf_prepare,
+	.buf_cleanup	= buf_cleanup,
+	.start_streaming = start_streaming,
+	.stop_streaming = stop_streaming,
+	.wait_prepare	= vb2_ops_wait_prepare,
+	.wait_finish	= vb2_ops_wait_finish,
 };
 
 /********************************************************************************/
@@ -847,36 +692,6 @@ static void video_init(struct saa7146_dev *dev, struct saa7146_vv *vv)
 	vv->current_hps_sync = SAA7146_HPS_SYNC_PORT_A;
 }
 
-
-static int video_open(struct saa7146_dev *dev, struct file *file)
-{
-	struct saa7146_fh *fh = file->private_data;
-
-	videobuf_queue_sg_init(&fh->video_q, &video_qops,
-			    &dev->pci->dev, &dev->slock,
-			    V4L2_BUF_TYPE_VIDEO_CAPTURE,
-			    V4L2_FIELD_INTERLACED,
-			    sizeof(struct saa7146_buf),
-			    file, &dev->v4l2_lock);
-
-	return 0;
-}
-
-
-static void video_close(struct saa7146_dev *dev, struct file *file)
-{
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_vv *vv = dev->vv_data;
-	struct videobuf_queue *q = &fh->video_q;
-
-	if (IS_CAPTURE_ACTIVE(fh) != 0)
-		video_end(dev, fh);
-
-	videobuf_stop(q);
-	/* hmm, why is this function declared void? */
-}
-
-
 static void video_irq_done(struct saa7146_dev *dev, unsigned long st)
 {
 	struct saa7146_vv *vv = dev->vv_data;
@@ -886,53 +701,14 @@ static void video_irq_done(struct saa7146_dev *dev, unsigned long st)
 	DEB_CAP("called\n");
 
 	/* only finish the buffer if we have one... */
-	if( NULL != q->curr ) {
-		saa7146_buffer_finish(dev,q,VIDEOBUF_DONE);
-	}
+	if (q->curr)
+		saa7146_buffer_finish(dev, q, VB2_BUF_STATE_DONE);
 	saa7146_buffer_next(dev,q,0);
 
 	spin_unlock(&dev->slock);
 }
 
-static ssize_t video_read(struct file *file, char __user *data, size_t count, loff_t *ppos)
-{
-	struct saa7146_dev *dev = video_drvdata(file);
-	struct saa7146_fh *fh = file->private_data;
-	struct saa7146_vv *vv = dev->vv_data;
-	ssize_t ret = 0;
-
-	DEB_EE("called\n");
-
-	if ((vv->video_status & STATUS_CAPTURE) != 0) {
-		/* fixme: should we allow read() captures while streaming capture? */
-		if (vv->video_fh == fh) {
-			DEB_S("already capturing\n");
-			return -EBUSY;
-		}
-		DEB_S("already capturing in another open\n");
-		return -EBUSY;
-	}
-
-	ret = video_begin(dev, fh);
-	if( 0 != ret) {
-		goto out;
-	}
-
-	ret = videobuf_read_one(&fh->video_q , data, count, ppos,
-				file->f_flags & O_NONBLOCK);
-	if (ret != 0) {
-		video_end(dev, fh);
-	} else {
-		ret = video_end(dev, fh);
-	}
-out:
-	return ret;
-}
-
 const struct saa7146_use_ops saa7146_video_uops = {
 	.init = video_init,
-	.open = video_open,
-	.release = video_close,
 	.irq_done = video_irq_done,
-	.read = video_read,
 };
diff --git a/drivers/media/pci/saa7146/mxb.c b/drivers/media/pci/saa7146/mxb.c
index f518ad8c92ed..557ba89cd12d 100644
--- a/drivers/media/pci/saa7146/mxb.c
+++ b/drivers/media/pci/saa7146/mxb.c
@@ -587,7 +587,6 @@ static int vidioc_s_frequency(struct file *file, void *fh, const struct v4l2_fre
 {
 	struct saa7146_dev *dev = video_drvdata(file);
 	struct mxb *mxb = (struct mxb *)dev->ext_priv;
-	struct saa7146_vv *vv = dev->vv_data;
 
 	if (f->tuner)
 		return -EINVAL;
@@ -604,15 +603,6 @@ static int vidioc_s_frequency(struct file *file, void *fh, const struct v4l2_fre
 	tuner_call(mxb, tuner, g_frequency, &mxb->cur_freq);
 	if (mxb->cur_audinput == 0)
 		mxb_update_audmode(mxb);
-
-	if (mxb->cur_input)
-		return 0;
-
-	/* hack: changing the frequency should invalidate the vbi-counter (=> alevt) */
-	spin_lock(&dev->slock);
-	vv->vbi_fieldcount = 0;
-	spin_unlock(&dev->slock);
-
 	return 0;
 }
 
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index 80463fdd30eb..55c7d70b9feb 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -6,7 +6,7 @@
 #include <media/v4l2-ioctl.h>
 #include <media/v4l2-fh.h>
 #include <media/drv-intf/saa7146.h>
-#include <media/videobuf-dma-sg.h>
+#include <media/videobuf2-dma-sg.h>
 
 #define MAX_SAA7146_CAPTURE_BUFFERS	32	/* arbitrary */
 #define BUFFER_TIMEOUT     (HZ/2)  /* 0.5 seconds */
@@ -57,7 +57,8 @@ struct saa7146_standard
 /* buffer for one video/vbi frame */
 struct saa7146_buf {
 	/* common v4l buffer stuff -- must be first */
-	struct videobuf_buffer vb;
+	struct vb2_v4l2_buffer vb;
+	struct list_head list;
 
 	/* saa7146 specific */
 	int (*activate)(struct saa7146_dev *dev,
@@ -73,41 +74,23 @@ struct saa7146_dmaqueue {
 	struct saa7146_buf	*curr;
 	struct list_head	queue;
 	struct timer_list	timeout;
+	struct vb2_queue	q;
 };
 
-/* per open data */
-struct saa7146_fh {
-	/* Must be the first field! */
-	struct v4l2_fh		fh;
-
-	/* video capture */
-	struct videobuf_queue	video_q;
-
-	/* vbi capture */
-	struct videobuf_queue	vbi_q;
-};
-
-#define STATUS_CAPTURE	0x02
-
 struct saa7146_vv
 {
 	/* vbi capture */
 	struct saa7146_dmaqueue		vbi_dmaq;
 	struct v4l2_vbi_format		vbi_fmt;
 	struct timer_list		vbi_read_timeout;
-	struct file			*vbi_read_timeout_file;
 	/* vbi workaround interrupt queue */
 	wait_queue_head_t		vbi_wq;
-	int				vbi_fieldcount;
-	struct saa7146_fh		*vbi_streaming;
-
-	int				video_status;
-	struct saa7146_fh		*video_fh;
 
 	/* video capture */
 	struct saa7146_dmaqueue		video_dmaq;
 	struct v4l2_pix_format		video_fmt;
 	enum v4l2_field			last_field;
+	u32				seqnr;
 
 	/* common: fixme? shouldn't this be in saa7146_fh?
 	   (this leads to a more complicated question: shall the driver
@@ -122,7 +105,7 @@ struct saa7146_vv
 	int	current_hps_source;
 	int	current_hps_sync;
 
-	unsigned int resources;	/* resource management for device */
+	unsigned int resources; /* resource management for device */
 };
 
 /* flags */
@@ -152,10 +135,7 @@ struct saa7146_ext_vv
 
 struct saa7146_use_ops  {
 	void (*init)(struct saa7146_dev *, struct saa7146_vv *);
-	int(*open)(struct saa7146_dev *, struct file *);
-	void (*release)(struct saa7146_dev *, struct file *);
 	void (*irq_done)(struct saa7146_dev *, unsigned long status);
-	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
 };
 
 /* from saa7146_fops.c */
@@ -165,8 +145,6 @@ void saa7146_buffer_finish(struct saa7146_dev *dev, struct saa7146_dmaqueue *q,
 void saa7146_buffer_next(struct saa7146_dev *dev, struct saa7146_dmaqueue *q,int vbi);
 int saa7146_buffer_queue(struct saa7146_dev *dev, struct saa7146_dmaqueue *q, struct saa7146_buf *buf);
 void saa7146_buffer_timeout(struct timer_list *t);
-void saa7146_dma_free(struct saa7146_dev* dev,struct videobuf_queue *q,
-						struct saa7146_buf *buf);
 
 int saa7146_vv_init(struct saa7146_dev* dev, struct saa7146_ext_vv *ext_vv);
 int saa7146_vv_release(struct saa7146_dev* dev);
@@ -181,11 +159,13 @@ void saa7146_set_gpio(struct saa7146_dev *saa, u8 pin, u8 data);
 extern const struct v4l2_ioctl_ops saa7146_video_ioctl_ops;
 extern const struct v4l2_ioctl_ops saa7146_vbi_ioctl_ops;
 extern const struct saa7146_use_ops saa7146_video_uops;
+extern const struct vb2_ops video_qops;
 long saa7146_video_do_ioctl(struct file *file, unsigned int cmd, void *arg);
 int saa7146_s_ctrl(struct v4l2_ctrl *ctrl);
 
 /* from saa7146_vbi.c */
 extern const struct saa7146_use_ops saa7146_vbi_uops;
+extern const struct vb2_ops vbi_qops;
 
 /* resource management functions */
 int saa7146_res_get(struct saa7146_dev *dev, unsigned int bit);
-- 
cgit v1.2.3


From f57fa2959244026ea5e885249e6c55fe3f133bd7 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 23 Mar 2023 13:58:35 +0000
Subject: media: v4l2-subdev: Add new ioctl for client capabilities

Add new ioctls to set and get subdev client capabilities. Client in this
context means the userspace application which opens the subdev device
node. The client capabilities are stored in the file handle of the
opened subdev device node, and the client must set the capabilities for
each opened subdev.

For now we only add a single flag, V4L2_SUBDEV_CLIENT_CAP_STREAMS, which
indicates that the client is streams-aware.

The reason for needing such a flag is as follows:

Many structs passed via ioctls, e.g. struct v4l2_subdev_format, contain
reserved fields (usually a single array field). These reserved fields
can be used to extend the ioctl. The userspace is required to zero the
reserved fields.

We recently added a new 'stream' field to many of these structs, and the
space for the field was taken from these reserved arrays. The assumption
was that these new 'stream' fields are always initialized to zero if the
userspace does not use them. This was a mistake, as, as mentioned above,
the userspace is required to zero the _reserved_ fields. In other words,
there is no requirement to zero this new stream field, and if the
userspace doesn't use the field (which is the case for all userspace
applications at the moment), the field may contain random data.

This shows that the way the reserved fields are defined in v4l2 is, in
my opinion, somewhat broken, but there is nothing to do about that.

To fix this issue we need a way for the userspace to tell the kernel
that the userspace has indeed set the 'stream' field, and it's fine for
the kernel to access it. This is achieved with the new ioctl, which the
userspace should usually use right after opening the subdev device node.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/user-func.rst          |  1 +
 .../media/v4l/vidioc-subdev-g-client-cap.rst       | 83 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-subdev.c              | 63 ++++++++++++++++
 include/media/v4l2-subdev.h                        |  1 +
 include/uapi/linux/v4l2-subdev.h                   | 21 ++++++
 5 files changed, 169 insertions(+)
 create mode 100644 Documentation/userspace-api/media/v4l/vidioc-subdev-g-client-cap.rst

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/user-func.rst b/Documentation/userspace-api/media/v4l/user-func.rst
index 228c1521f190..15ff0bf7bbe6 100644
--- a/Documentation/userspace-api/media/v4l/user-func.rst
+++ b/Documentation/userspace-api/media/v4l/user-func.rst
@@ -72,6 +72,7 @@ Function Reference
     vidioc-subdev-g-frame-interval
     vidioc-subdev-g-routing
     vidioc-subdev-g-selection
+    vidioc-subdev-g-client-cap
     vidioc-subdev-querycap
     vidioc-subscribe-event
     func-mmap
diff --git a/Documentation/userspace-api/media/v4l/vidioc-subdev-g-client-cap.rst b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-client-cap.rst
new file mode 100644
index 000000000000..20f12a1cc0f7
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-client-cap.rst
@@ -0,0 +1,83 @@
+.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
+.. c:namespace:: V4L
+
+.. _VIDIOC_SUBDEV_G_CLIENT_CAP:
+
+************************************************************
+ioctl VIDIOC_SUBDEV_G_CLIENT_CAP, VIDIOC_SUBDEV_S_CLIENT_CAP
+************************************************************
+
+Name
+====
+
+VIDIOC_SUBDEV_G_CLIENT_CAP - VIDIOC_SUBDEV_S_CLIENT_CAP - Get or set client
+capabilities.
+
+Synopsis
+========
+
+.. c:macro:: VIDIOC_SUBDEV_G_CLIENT_CAP
+
+``int ioctl(int fd, VIDIOC_SUBDEV_G_CLIENT_CAP, struct v4l2_subdev_client_capability *argp)``
+
+.. c:macro:: VIDIOC_SUBDEV_S_CLIENT_CAP
+
+``int ioctl(int fd, VIDIOC_SUBDEV_S_CLIENT_CAP, struct v4l2_subdev_client_capability *argp)``
+
+Arguments
+=========
+
+``fd``
+    File descriptor returned by :ref:`open() <func-open>`.
+
+``argp``
+    Pointer to struct :c:type:`v4l2_subdev_client_capability`.
+
+Description
+===========
+
+These ioctls are used to get and set the client (the application using the
+subdevice ioctls) capabilities. The client capabilities are stored in the file
+handle of the opened subdev device node, and the client must set the
+capabilities for each opened subdev separately.
+
+By default no client capabilities are set when a subdev device node is opened.
+
+The purpose of the client capabilities are to inform the kernel of the behavior
+of the client, mainly related to maintaining compatibility with different
+kernel and userspace versions.
+
+The ``VIDIOC_SUBDEV_G_CLIENT_CAP`` ioctl returns the current client capabilities
+associated with the file handle ``fd``.
+
+The ``VIDIOC_SUBDEV_S_CLIENT_CAP`` ioctl sets client capabilities for the file
+handle ``fd``. The new capabilities fully replace the current capabilities, the
+ioctl can therefore also be used to remove capabilities that have previously
+been set.
+
+``VIDIOC_SUBDEV_S_CLIENT_CAP`` modifies the struct
+:c:type:`v4l2_subdev_client_capability` to reflect the capabilities that have
+been accepted. A common case for the kernel not accepting a capability is that
+the kernel is older than the headers the userspace uses, and thus the capability
+is unknown to the kernel.
+
+.. flat-table:: Client Capabilities
+    :header-rows:  1
+
+    * - Capability
+      - Description
+    * - ``V4L2_SUBDEV_CLIENT_CAP_STREAMS``
+      - The client is aware of streams. Setting this flag enables the use
+        of 'stream' fields (referring to the stream number) with various
+        ioctls. If this is not set (which is the default), the 'stream' fields
+        will be forced to 0 by the kernel.
+
+Return Value
+============
+
+On success 0 is returned, on error -1 and the ``errno`` variable is set
+appropriately. The generic error codes are described at the
+:ref:`Generic Error Codes <gen-errors>` chapter.
+
+ENOIOCTLCMD
+   The kernel does not support this ioctl.
diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index bfb102f2121a..2ec179cd1264 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -510,8 +510,11 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	struct video_device *vdev = video_devdata(file);
 	struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev);
 	struct v4l2_fh *vfh = file->private_data;
+	struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh);
 	bool ro_subdev = test_bit(V4L2_FL_SUBDEV_RO_DEVNODE, &vdev->flags);
 	bool streams_subdev = sd->flags & V4L2_SUBDEV_FL_STREAMS;
+	bool client_supports_streams = subdev_fh->client_caps &
+				       V4L2_SUBDEV_CLIENT_CAP_STREAMS;
 	int rval;
 
 	switch (cmd) {
@@ -636,6 +639,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_G_FMT: {
 		struct v4l2_subdev_format *format = arg;
 
+		if (!client_supports_streams)
+			format->stream = 0;
+
 		memset(format->reserved, 0, sizeof(format->reserved));
 		memset(format->format.reserved, 0, sizeof(format->format.reserved));
 		return v4l2_subdev_call(sd, pad, get_fmt, state, format);
@@ -647,6 +653,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		if (format->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev)
 			return -EPERM;
 
+		if (!client_supports_streams)
+			format->stream = 0;
+
 		memset(format->reserved, 0, sizeof(format->reserved));
 		memset(format->format.reserved, 0, sizeof(format->format.reserved));
 		return v4l2_subdev_call(sd, pad, set_fmt, state, format);
@@ -656,6 +665,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		struct v4l2_subdev_crop *crop = arg;
 		struct v4l2_subdev_selection sel;
 
+		if (!client_supports_streams)
+			crop->stream = 0;
+
 		memset(crop->reserved, 0, sizeof(crop->reserved));
 		memset(&sel, 0, sizeof(sel));
 		sel.which = crop->which;
@@ -677,6 +689,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		if (crop->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev)
 			return -EPERM;
 
+		if (!client_supports_streams)
+			crop->stream = 0;
+
 		memset(crop->reserved, 0, sizeof(crop->reserved));
 		memset(&sel, 0, sizeof(sel));
 		sel.which = crop->which;
@@ -695,6 +710,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_ENUM_MBUS_CODE: {
 		struct v4l2_subdev_mbus_code_enum *code = arg;
 
+		if (!client_supports_streams)
+			code->stream = 0;
+
 		memset(code->reserved, 0, sizeof(code->reserved));
 		return v4l2_subdev_call(sd, pad, enum_mbus_code, state,
 					code);
@@ -703,6 +721,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_ENUM_FRAME_SIZE: {
 		struct v4l2_subdev_frame_size_enum *fse = arg;
 
+		if (!client_supports_streams)
+			fse->stream = 0;
+
 		memset(fse->reserved, 0, sizeof(fse->reserved));
 		return v4l2_subdev_call(sd, pad, enum_frame_size, state,
 					fse);
@@ -711,6 +732,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_G_FRAME_INTERVAL: {
 		struct v4l2_subdev_frame_interval *fi = arg;
 
+		if (!client_supports_streams)
+			fi->stream = 0;
+
 		memset(fi->reserved, 0, sizeof(fi->reserved));
 		return v4l2_subdev_call(sd, video, g_frame_interval, arg);
 	}
@@ -721,6 +745,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		if (ro_subdev)
 			return -EPERM;
 
+		if (!client_supports_streams)
+			fi->stream = 0;
+
 		memset(fi->reserved, 0, sizeof(fi->reserved));
 		return v4l2_subdev_call(sd, video, s_frame_interval, arg);
 	}
@@ -728,6 +755,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL: {
 		struct v4l2_subdev_frame_interval_enum *fie = arg;
 
+		if (!client_supports_streams)
+			fie->stream = 0;
+
 		memset(fie->reserved, 0, sizeof(fie->reserved));
 		return v4l2_subdev_call(sd, pad, enum_frame_interval, state,
 					fie);
@@ -736,6 +766,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	case VIDIOC_SUBDEV_G_SELECTION: {
 		struct v4l2_subdev_selection *sel = arg;
 
+		if (!client_supports_streams)
+			sel->stream = 0;
+
 		memset(sel->reserved, 0, sizeof(sel->reserved));
 		return v4l2_subdev_call(
 			sd, pad, get_selection, state, sel);
@@ -747,6 +780,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		if (sel->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev)
 			return -EPERM;
 
+		if (!client_supports_streams)
+			sel->stream = 0;
+
 		memset(sel->reserved, 0, sizeof(sel->reserved));
 		return v4l2_subdev_call(
 			sd, pad, set_selection, state, sel);
@@ -888,6 +924,33 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 					routing->which, &krouting);
 	}
 
+	case VIDIOC_SUBDEV_G_CLIENT_CAP: {
+		struct v4l2_subdev_client_capability *client_cap = arg;
+
+		client_cap->capabilities = subdev_fh->client_caps;
+
+		return 0;
+	}
+
+	case VIDIOC_SUBDEV_S_CLIENT_CAP: {
+		struct v4l2_subdev_client_capability *client_cap = arg;
+
+		/*
+		 * Clear V4L2_SUBDEV_CLIENT_CAP_STREAMS if streams API is not
+		 * enabled. Remove this when streams API is no longer
+		 * experimental.
+		 */
+		if (!v4l2_subdev_enable_streams_api)
+			client_cap->capabilities &= ~V4L2_SUBDEV_CLIENT_CAP_STREAMS;
+
+		/* Filter out unsupported capabilities */
+		client_cap->capabilities &= V4L2_SUBDEV_CLIENT_CAP_STREAMS;
+
+		subdev_fh->client_caps = client_cap->capabilities;
+
+		return 0;
+	}
+
 	default:
 		return v4l2_subdev_call(sd, core, ioctl, cmd, arg);
 	}
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 1d2877700cdb..cfd19e72d0fc 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1125,6 +1125,7 @@ struct v4l2_subdev_fh {
 	struct module *owner;
 #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API)
 	struct v4l2_subdev_state *state;
+	u64 client_caps;
 #endif
 };
 
diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h
index 654d659de835..4a195b68f28f 100644
--- a/include/uapi/linux/v4l2-subdev.h
+++ b/include/uapi/linux/v4l2-subdev.h
@@ -233,6 +233,24 @@ struct v4l2_subdev_routing {
 	__u32 reserved[6];
 };
 
+/*
+ * The client is aware of streams. Setting this flag enables the use of 'stream'
+ * fields (referring to the stream number) with various ioctls. If this is not
+ * set (which is the default), the 'stream' fields will be forced to 0 by the
+ * kernel.
+ */
+ #define V4L2_SUBDEV_CLIENT_CAP_STREAMS		(1U << 0)
+
+/**
+ * struct v4l2_subdev_client_capability - Capabilities of the client accessing
+ *					  the subdev
+ *
+ * @capabilities: A bitmask of V4L2_SUBDEV_CLIENT_CAP_* flags.
+ */
+struct v4l2_subdev_client_capability {
+	__u64 capabilities;
+};
+
 /* Backwards compatibility define --- to be removed */
 #define v4l2_subdev_edid v4l2_edid
 
@@ -250,6 +268,9 @@ struct v4l2_subdev_routing {
 #define VIDIOC_SUBDEV_S_SELECTION		_IOWR('V', 62, struct v4l2_subdev_selection)
 #define VIDIOC_SUBDEV_G_ROUTING			_IOWR('V', 38, struct v4l2_subdev_routing)
 #define VIDIOC_SUBDEV_S_ROUTING			_IOWR('V', 39, struct v4l2_subdev_routing)
+#define VIDIOC_SUBDEV_G_CLIENT_CAP		_IOR('V',  101, struct v4l2_subdev_client_capability)
+#define VIDIOC_SUBDEV_S_CLIENT_CAP		_IOWR('V',  102, struct v4l2_subdev_client_capability)
+
 /* The following ioctls are identical to the ioctls in videodev2.h */
 #define VIDIOC_SUBDEV_G_STD			_IOR('V', 23, v4l2_std_id)
 #define VIDIOC_SUBDEV_S_STD			_IOW('V', 24, v4l2_std_id)
-- 
cgit v1.2.3


From aa1080404200694aace5989f99664ca75e73b03d Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Wed, 22 Mar 2023 05:13:04 +0000
Subject: media: Add P012 and P012M video format

P012 is a YUV format with 12-bits per component with interleaved UV,
like NV12, expanded to 16 bits.
Data in the 12 high bits, zeros in the 4 low bits,
arranged in little endian order.
And P012M has two non contiguous planes.

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-yuv-planar.rst  | 94 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-common.c              |  2 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  2 +
 include/uapi/linux/videodev2.h                     |  2 +
 4 files changed, 100 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
index f1d5bb7b806d..72324274f20c 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
@@ -123,6 +123,20 @@ All components are stored with the same number of bits per component.
       - Cb, Cr
       - Yes
       - 4x4 tiles
+    * - V4L2_PIX_FMT_P012
+      - 'P012'
+      - 12
+      - 4:2:0
+      - Cb, Cr
+      - Yes
+      - Linear
+    * - V4L2_PIX_FMT_P012M
+      - 'PM12'
+      - 12
+      - 4:2:0
+      - Cb, Cr
+      - No
+      - Linear
     * - V4L2_PIX_FMT_NV16
       - 'NV16'
       - 8
@@ -586,6 +600,86 @@ Data in the 10 high bits, zeros in the 6 low bits, arranged in little endian ord
       - Cb\ :sub:`11`
       - Cr\ :sub:`11`
 
+.. _V4L2-PIX-FMT-P012:
+.. _V4L2-PIX-FMT-P012M:
+
+P012 and P012M
+--------------
+
+P012 is like NV12 with 12 bits per component, expanded to 16 bits.
+Data in the 12 high bits, zeros in the 4 low bits, arranged in little endian order.
+
+.. flat-table:: Sample 4x4 P012 Image
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - start + 0:
+      - Y'\ :sub:`00`
+      - Y'\ :sub:`01`
+      - Y'\ :sub:`02`
+      - Y'\ :sub:`03`
+    * - start + 8:
+      - Y'\ :sub:`10`
+      - Y'\ :sub:`11`
+      - Y'\ :sub:`12`
+      - Y'\ :sub:`13`
+    * - start + 16:
+      - Y'\ :sub:`20`
+      - Y'\ :sub:`21`
+      - Y'\ :sub:`22`
+      - Y'\ :sub:`23`
+    * - start + 24:
+      - Y'\ :sub:`30`
+      - Y'\ :sub:`31`
+      - Y'\ :sub:`32`
+      - Y'\ :sub:`33`
+    * - start + 32:
+      - Cb\ :sub:`00`
+      - Cr\ :sub:`00`
+      - Cb\ :sub:`01`
+      - Cr\ :sub:`01`
+    * - start + 40:
+      - Cb\ :sub:`10`
+      - Cr\ :sub:`10`
+      - Cb\ :sub:`11`
+      - Cr\ :sub:`11`
+
+.. flat-table:: Sample 4x4 P012M Image
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - start0 + 0:
+      - Y'\ :sub:`00`
+      - Y'\ :sub:`01`
+      - Y'\ :sub:`02`
+      - Y'\ :sub:`03`
+    * - start0 + 8:
+      - Y'\ :sub:`10`
+      - Y'\ :sub:`11`
+      - Y'\ :sub:`12`
+      - Y'\ :sub:`13`
+    * - start0 + 16:
+      - Y'\ :sub:`20`
+      - Y'\ :sub:`21`
+      - Y'\ :sub:`22`
+      - Y'\ :sub:`23`
+    * - start0 + 24:
+      - Y'\ :sub:`30`
+      - Y'\ :sub:`31`
+      - Y'\ :sub:`32`
+      - Y'\ :sub:`33`
+    * -
+    * - start1 + 0:
+      - Cb\ :sub:`00`
+      - Cr\ :sub:`00`
+      - Cb\ :sub:`01`
+      - Cr\ :sub:`01`
+    * - start1 + 8:
+      - Cb\ :sub:`10`
+      - Cr\ :sub:`10`
+      - Cb\ :sub:`11`
+      - Cr\ :sub:`11`
+
 
 Fully Planar YUV Formats
 ========================
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index 40f56e044640..a5e8ba370d33 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -267,6 +267,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_NV24,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_NV42,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_P010,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_P012,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 4, 0, 0 }, .hdiv = 2, .vdiv = 2 },
 
 		{ .format = V4L2_PIX_FMT_YUV410,  .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 },
 		{ .format = V4L2_PIX_FMT_YVU410,  .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 },
@@ -292,6 +293,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_NV21M,   .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 },
 		{ .format = V4L2_PIX_FMT_NV16M,   .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_NV61M,   .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_P012M,   .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 2, 4, 0, 0 }, .hdiv = 2, .vdiv = 2 },
 
 		/* Bayer RGB formats */
 		{ .format = V4L2_PIX_FMT_SBGGR8,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 592410ad6e8a..07b9708cf69f 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1348,6 +1348,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_NV24:		descr = "Y/UV 4:4:4"; break;
 	case V4L2_PIX_FMT_NV42:		descr = "Y/VU 4:4:4"; break;
 	case V4L2_PIX_FMT_P010:		descr = "10-bit Y/UV 4:2:0"; break;
+	case V4L2_PIX_FMT_P012:		descr = "12-bit Y/UV 4:2:0"; break;
 	case V4L2_PIX_FMT_NV12_4L4:	descr = "Y/UV 4:2:0 (4x4 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_16L16:	descr = "Y/UV 4:2:0 (16x16 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_32L32:   descr = "Y/UV 4:2:0 (32x32 Linear)"; break;
@@ -1358,6 +1359,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_NV61M:	descr = "Y/VU 4:2:2 (N-C)"; break;
 	case V4L2_PIX_FMT_NV12MT:	descr = "Y/UV 4:2:0 (64x32 MB, N-C)"; break;
 	case V4L2_PIX_FMT_NV12MT_16X16:	descr = "Y/UV 4:2:0 (16x16 MB, N-C)"; break;
+	case V4L2_PIX_FMT_P012M:	descr = "12-bit Y/UV 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_YUV420M:	descr = "Planar YUV 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_YVU420M:	descr = "Planar YVU 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_YUV422M:	descr = "Planar YUV 4:2:2 (N-C)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d452481a5316..1f88e00327ea 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -637,12 +637,14 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_NV24    v4l2_fourcc('N', 'V', '2', '4') /* 24  Y/CbCr 4:4:4  */
 #define V4L2_PIX_FMT_NV42    v4l2_fourcc('N', 'V', '4', '2') /* 24  Y/CrCb 4:4:4  */
 #define V4L2_PIX_FMT_P010    v4l2_fourcc('P', '0', '1', '0') /* 24  Y/CbCr 4:2:0 10-bit per component */
+#define V4L2_PIX_FMT_P012    v4l2_fourcc('P', '0', '1', '2') /* 24  Y/CbCr 4:2:0 12-bit per component */
 
 /* two non contiguous planes - one Y, one Cr + Cb interleaved  */
 #define V4L2_PIX_FMT_NV12M   v4l2_fourcc('N', 'M', '1', '2') /* 12  Y/CbCr 4:2:0  */
 #define V4L2_PIX_FMT_NV21M   v4l2_fourcc('N', 'M', '2', '1') /* 21  Y/CrCb 4:2:0  */
 #define V4L2_PIX_FMT_NV16M   v4l2_fourcc('N', 'M', '1', '6') /* 16  Y/CbCr 4:2:2  */
 #define V4L2_PIX_FMT_NV61M   v4l2_fourcc('N', 'M', '6', '1') /* 16  Y/CrCb 4:2:2  */
+#define V4L2_PIX_FMT_P012M   v4l2_fourcc('P', 'M', '1', '2') /* 24  Y/CbCr 4:2:0 12-bit per component */
 
 /* three planes - Y Cb, Cr */
 #define V4L2_PIX_FMT_YUV410  v4l2_fourcc('Y', 'U', 'V', '9') /*  9  YUV 4:1:0     */
-- 
cgit v1.2.3


From a490ea68444084ec0368c019e11ee4a7e5c8bb13 Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Wed, 22 Mar 2023 05:13:05 +0000
Subject: media: Add Y012 video format

Y012 is a luma-only formats with 12-bits per pixel,
expanded to 16bits.
Data in the 12 high bits, zeros in the 4 low bits,
arranged in little endian order.

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst | 15 +++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c                      |  1 +
 include/uapi/linux/videodev2.h                            |  1 +
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
index 62078a01de76..cf8e4dfbfbd4 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
@@ -103,6 +103,17 @@ are often referred to as greyscale formats.
       - ...
       - ...
 
+    * .. _V4L2-PIX-FMT-Y012:
+
+      - ``V4L2_PIX_FMT_Y012``
+      - 'Y012'
+
+      - Y'\ :sub:`0`\ [3:0] `0000`
+      - Y'\ :sub:`0`\ [11:4]
+      - ...
+      - ...
+      - ...
+
     * .. _V4L2-PIX-FMT-Y14:
 
       - ``V4L2_PIX_FMT_Y14``
@@ -146,3 +157,7 @@ are often referred to as greyscale formats.
     than 16 bits. For example, 10 bits per pixel uses values in the range 0 to
     1023. For the IPU3_Y10 format 25 pixels are packed into 32 bytes, which
     leaves the 6 most significant bits of the last byte padded with 0.
+
+    For Y012 and Y12 formats, Y012 places its data in the 12 high bits, with
+    padding zeros in the 4 low bits, in contrast to the Y12 format, which has
+    its padding located in the most significant bits of the 16 bit word.
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 07b9708cf69f..354ba586078c 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1303,6 +1303,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y6:		descr = "6-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y10:		descr = "10-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y12:		descr = "12-bit Greyscale"; break;
+	case V4L2_PIX_FMT_Y012:		descr = "12-bit Greyscale (bits 15-4)"; break;
 	case V4L2_PIX_FMT_Y14:		descr = "14-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y16:		descr = "16-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y16_BE:	descr = "16-bit Greyscale BE"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 1f88e00327ea..2eb3e66a3427 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -586,6 +586,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_Y6      v4l2_fourcc('Y', '0', '6', ' ') /*  6  Greyscale     */
 #define V4L2_PIX_FMT_Y10     v4l2_fourcc('Y', '1', '0', ' ') /* 10  Greyscale     */
 #define V4L2_PIX_FMT_Y12     v4l2_fourcc('Y', '1', '2', ' ') /* 12  Greyscale     */
+#define V4L2_PIX_FMT_Y012    v4l2_fourcc('Y', '0', '1', '2') /* 12  Greyscale     */
 #define V4L2_PIX_FMT_Y14     v4l2_fourcc('Y', '1', '4', ' ') /* 14  Greyscale     */
 #define V4L2_PIX_FMT_Y16     v4l2_fourcc('Y', '1', '6', ' ') /* 16  Greyscale     */
 #define V4L2_PIX_FMT_Y16_BE  v4l2_fourcc_be('Y', '1', '6', ' ') /* 16  Greyscale BE  */
-- 
cgit v1.2.3


From 99c954967762976b15265ea383354095e1ed1efa Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Wed, 22 Mar 2023 05:13:07 +0000
Subject: media: Add YUV48_12 video format

YUV48_12 is a YUV format with 12-bits per component like YUV24,
expanded to 16bits.
Data in the 12 high bits, zeros in the 4 low bits,
arranged in little endian order.

[hverkuil: replaced a . by ,]

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-packed-yuv.rst  | 28 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-common.c              |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  1 +
 4 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
index 24a771542059..9f111ed594d2 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
@@ -257,6 +257,34 @@ the second byte and Y'\ :sub:`7-0` in the third byte.
     - The padding bits contain undefined values that must be ignored by all
       applications and drivers.
 
+The next table lists the packed YUV 4:4:4 formats with 12 bits per component.
+Expand the bits per component to 16 bits, data in the high bits, zeros in the low bits,
+arranged in little endian order, storing 1 pixel in 6 bytes.
+
+.. flat-table:: Packed YUV 4:4:4 Image Formats (12bpc)
+    :header-rows: 1
+    :stub-columns: 0
+
+    * - Identifier
+      - Code
+      - Byte 1-0
+      - Byte 3-2
+      - Byte 5-4
+      - Byte 7-6
+      - Byte 9-8
+      - Byte 11-10
+
+    * .. _V4L2-PIX-FMT-YUV48-12:
+
+      - ``V4L2_PIX_FMT_YUV48_12``
+      - 'Y312'
+
+      - Y'\ :sub:`0`
+      - Cb\ :sub:`0`
+      - Cr\ :sub:`0`
+      - Y'\ :sub:`1`
+      - Cb\ :sub:`1`
+      - Cr\ :sub:`1`
 
 4:2:2 Subsampling
 =================
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index 21ace56fac04..da313a0637de 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -259,6 +259,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_UYVY,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_VYUY,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_Y212,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_YUV48_12, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 6, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 
 		/* YUV planar formats */
 		{ .format = V4L2_PIX_FMT_NV12,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 354ba586078c..ce8d47f9f53b 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1342,6 +1342,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_YUV420:	descr = "Planar YUV 4:2:0"; break;
 	case V4L2_PIX_FMT_HI240:	descr = "8-bit Dithered RGB (BTTV)"; break;
 	case V4L2_PIX_FMT_M420:		descr = "YUV 4:2:0 (M420)"; break;
+	case V4L2_PIX_FMT_YUV48_12:	descr = "12-bit YUV 4:4:4 Packed"; break;
 	case V4L2_PIX_FMT_NV12:		descr = "Y/UV 4:2:0"; break;
 	case V4L2_PIX_FMT_NV21:		descr = "Y/VU 4:2:0"; break;
 	case V4L2_PIX_FMT_NV16:		descr = "Y/UV 4:2:2"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2eb3e66a3427..25ca4bf83ba4 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -621,6 +621,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YUVA32  v4l2_fourcc('Y', 'U', 'V', 'A') /* 32  YUVA-8-8-8-8  */
 #define V4L2_PIX_FMT_YUVX32  v4l2_fourcc('Y', 'U', 'V', 'X') /* 32  YUVX-8-8-8-8  */
 #define V4L2_PIX_FMT_M420    v4l2_fourcc('M', '4', '2', '0') /* 12  YUV 4:2:0 2 lines y, 1 line uv interleaved */
+#define V4L2_PIX_FMT_YUV48_12    v4l2_fourcc('Y', '3', '1', '2') /* 48  YUV 4:4:4 12-bit per component */
 
 /*
  * YCbCr packed format. For each Y2xx format, xx bits of valid data occupy the MSBs
-- 
cgit v1.2.3


From da0b7a400e4f39726c3c383f377fb51dbd8b0c71 Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Wed, 22 Mar 2023 05:13:08 +0000
Subject: media: Add BGR48_12 video format

BGR48_12 is a reversed RGB format with 12 bits per component like BGR24,
expanded to 16bits.
Data in the 12 high bits, zeros in the 4 low bits,
arranged in little endian order.

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-rgb.rst         | 33 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-common.c              |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  3 ++
 4 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
index d330aeb4d3eb..779f0bdda642 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
@@ -956,6 +956,39 @@ number of bits for each component.
 
     \endgroup
 
+12 Bits Per Component
+==============================
+
+These formats store an RGB triplet in six or eight bytes, with 12 bits per component.
+Expand the bits per component to 16 bits, data in the high bits, zeros in the low bits,
+arranged in little endian order.
+
+.. raw:: latex
+
+    \small
+
+.. flat-table:: RGB Formats With 12 Bits Per Component
+    :header-rows:  1
+
+    * - Identifier
+      - Code
+      - Byte 1-0
+      - Byte 3-2
+      - Byte 5-4
+      - Byte 7-6
+    * .. _V4L2-PIX-FMT-BGR48-12:
+
+      - ``V4L2_PIX_FMT_BGR48_12``
+      - 'B312'
+
+      - B\ :sub:`15-4`
+      - G\ :sub:`15-4`
+      - R\ :sub:`15-4`
+      -
+
+.. raw:: latex
+
+    \normalsize
 
 Deprecated RGB Formats
 ======================
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index da313a0637de..16d3c91c7da2 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -252,6 +252,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_RGB565,  .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_RGB555,  .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_BGR666,  .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_BGR48_12, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 6, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 
 		/* YUV packed formats */
 		{ .format = V4L2_PIX_FMT_YUYV,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index ce8d47f9f53b..7cd0708cb5fb 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1298,6 +1298,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_RGBX1010102:	descr = "32-bit RGBX 10-10-10-2"; break;
 	case V4L2_PIX_FMT_RGBA1010102:	descr = "32-bit RGBA 10-10-10-2"; break;
 	case V4L2_PIX_FMT_ARGB2101010:	descr = "32-bit ARGB 2-10-10-10"; break;
+	case V4L2_PIX_FMT_BGR48_12:	descr = "12-bit Depth BGR"; break;
 	case V4L2_PIX_FMT_GREY:		descr = "8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y4:		descr = "4-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y6:		descr = "6-bit Greyscale"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 25ca4bf83ba4..42358d04075d 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -580,6 +580,9 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_RGBA1010102 v4l2_fourcc('R', 'A', '3', '0') /* 32  RGBA-10-10-10-2 */
 #define V4L2_PIX_FMT_ARGB2101010 v4l2_fourcc('A', 'R', '3', '0') /* 32  ARGB-2-10-10-10 */
 
+/* RGB formats (6 or 8 bytes per pixel) */
+#define V4L2_PIX_FMT_BGR48_12    v4l2_fourcc('B', '3', '1', '2') /* 48  BGR 12-bit per component */
+
 /* Grey formats */
 #define V4L2_PIX_FMT_GREY    v4l2_fourcc('G', 'R', 'E', 'Y') /*  8  Greyscale     */
 #define V4L2_PIX_FMT_Y4      v4l2_fourcc('Y', '0', '4', ' ') /*  4  Greyscale     */
-- 
cgit v1.2.3


From 302b988ca03d83da0a7e006a57efda646c30f978 Mon Sep 17 00:00:00 2001
From: Ming Qian <ming.qian@nxp.com>
Date: Wed, 22 Mar 2023 05:13:09 +0000
Subject: media: Add ABGR64_12 video format

ABGR64_12 is a reversed RGB format with alpha channel last,
12 bits per component like ABGR32,
expanded to 16bits.
Data in the 12 high bits, zeros in the 4 low bits,
arranged in little endian order.

Signed-off-by: Ming Qian <ming.qian@nxp.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-rgb.rst | 9 +++++++++
 drivers/media/v4l2-core/v4l2-common.c                | 1 +
 drivers/media/v4l2-core/v4l2-ioctl.c                 | 1 +
 include/uapi/linux/videodev2.h                       | 1 +
 4 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
index 779f0bdda642..38811025d71b 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-rgb.rst
@@ -985,6 +985,15 @@ arranged in little endian order.
       - G\ :sub:`15-4`
       - R\ :sub:`15-4`
       -
+    * .. _V4L2-PIX-FMT-ABGR64-12:
+
+      - ``V4L2_PIX_FMT_ABGR64_12``
+      - 'B412'
+
+      - B\ :sub:`15-4`
+      - G\ :sub:`15-4`
+      - R\ :sub:`15-4`
+      - A\ :sub:`15-4`
 
 .. raw:: latex
 
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index 16d3c91c7da2..3c5ab5ecd678 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -253,6 +253,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_RGB555,  .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_BGR666,  .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_BGR48_12, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 6, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_ABGR64_12, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 
 		/* YUV packed formats */
 		{ .format = V4L2_PIX_FMT_YUYV,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 7cd0708cb5fb..a858acea6547 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1299,6 +1299,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_RGBA1010102:	descr = "32-bit RGBA 10-10-10-2"; break;
 	case V4L2_PIX_FMT_ARGB2101010:	descr = "32-bit ARGB 2-10-10-10"; break;
 	case V4L2_PIX_FMT_BGR48_12:	descr = "12-bit Depth BGR"; break;
+	case V4L2_PIX_FMT_ABGR64_12:	descr = "12-bit Depth BGRA"; break;
 	case V4L2_PIX_FMT_GREY:		descr = "8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y4:		descr = "4-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y6:		descr = "6-bit Greyscale"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 42358d04075d..aee75eb9e686 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -582,6 +582,7 @@ struct v4l2_pix_format {
 
 /* RGB formats (6 or 8 bytes per pixel) */
 #define V4L2_PIX_FMT_BGR48_12    v4l2_fourcc('B', '3', '1', '2') /* 48  BGR 12-bit per component */
+#define V4L2_PIX_FMT_ABGR64_12   v4l2_fourcc('B', '4', '1', '2') /* 64  BGRA 12-bit per component */
 
 /* Grey formats */
 #define V4L2_PIX_FMT_GREY    v4l2_fourcc('G', 'R', 'E', 'Y') /*  8  Greyscale     */
-- 
cgit v1.2.3


From f4bf3ca2e5cba655824b6e0893a98dfb33ed24e5 Mon Sep 17 00:00:00 2001
From: Lingutla Chandrasekhar <clingutla@codeaurora.org>
Date: Fri, 7 Apr 2023 23:05:26 +0000
Subject: softirq: Add trace points for tasklet entry/exit

Tasklets are supposed to finish their work quickly and should not block the
current running process, but it is not guaranteed that they do so.

Currently softirq_entry/exit can be used to analyse the total tasklets
execution time, but that's not helpful to track individual tasklets
execution time. That makes it hard to identify tasklet functions, which
take more time than expected.

Add tasklet_entry/exit trace point support to track individual tasklet
execution.

Trivial usage example:
   # echo 1 > /sys/kernel/debug/tracing/events/irq/tasklet_entry/enable
   # echo 1 > /sys/kernel/debug/tracing/events/irq/tasklet_exit/enable
   # cat /sys/kernel/debug/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 4/4   #P:4
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
           <idle>-0       [003] ..s1.   314.011428: tasklet_entry: tasklet=0xffffa01ef8db2740 function=tcp_tasklet_func
           <idle>-0       [003] ..s1.   314.011432: tasklet_exit: tasklet=0xffffa01ef8db2740 function=tcp_tasklet_func
           <idle>-0       [003] ..s1.   314.017369: tasklet_entry: tasklet=0xffffa01ef8db2740 function=tcp_tasklet_func
           <idle>-0       [003] ..s1.   314.017371: tasklet_exit: tasklet=0xffffa01ef8db2740 function=tcp_tasklet_func

Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
Signed-off-by: J. Avila <elavila@google.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230407230526.1685443-1-jstultz@google.com

[elavila: Port to android-mainline]
[jstultz: Rebased to upstream, cut unused trace points, added
 comments for the tracepoints, reworded commit]
---
 include/trace/events/irq.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/softirq.c           |  9 +++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index eeceafaaea4c..a07b4607b663 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -160,6 +160,53 @@ DEFINE_EVENT(softirq, softirq_raise,
 	TP_ARGS(vec_nr)
 );
 
+DECLARE_EVENT_CLASS(tasklet,
+
+	TP_PROTO(struct tasklet_struct *t, void *func),
+
+	TP_ARGS(t, func),
+
+	TP_STRUCT__entry(
+		__field(	void *,	tasklet)
+		__field(	void *,	func)
+	),
+
+	TP_fast_assign(
+		__entry->tasklet = t;
+		__entry->func = func;
+	),
+
+	TP_printk("tasklet=%ps function=%ps", __entry->tasklet, __entry->func)
+);
+
+/**
+ * tasklet_entry - called immediately before the tasklet is run
+ * @t: tasklet pointer
+ * @func: tasklet callback or function being run
+ *
+ * Used to find individual tasklet execution time
+ */
+DEFINE_EVENT(tasklet, tasklet_entry,
+
+	TP_PROTO(struct tasklet_struct *t, void *func),
+
+	TP_ARGS(t, func)
+);
+
+/**
+ * tasklet_exit - called immediately after the tasklet is run
+ * @t: tasklet pointer
+ * @func: tasklet callback or function being run
+ *
+ * Used to find individual tasklet execution time
+ */
+DEFINE_EVENT(tasklet, tasklet_exit,
+
+	TP_PROTO(struct tasklet_struct *t, void *func),
+
+	TP_ARGS(t, func)
+);
+
 #endif /*  _TRACE_IRQ_H */
 
 /* This part must be outside protection */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..1b725510dd0f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -793,10 +793,15 @@ static void tasklet_action_common(struct softirq_action *a,
 		if (tasklet_trylock(t)) {
 			if (!atomic_read(&t->count)) {
 				if (tasklet_clear_sched(t)) {
-					if (t->use_callback)
+					if (t->use_callback) {
+						trace_tasklet_entry(t, t->callback);
 						t->callback(t);
-					else
+						trace_tasklet_exit(t, t->callback);
+					} else {
+						trace_tasklet_entry(t, t->func);
 						t->func(t->data);
+						trace_tasklet_exit(t, t->func);
+					}
 				}
 				tasklet_unlock(t);
 				continue;
-- 
cgit v1.2.3


From db24aa04119421149bee3a0ef6c9f651c942cbcb Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused ad9389b video encoder driver

The ad9389b video encoder driver doesn't support DT and relies on
platform data. No board file has ever provided platform data for that
device. The driver has thus never been used in the mainline kernel since
its introduction in v3.7. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |    1 -
 MAINTAINERS                                      |    6 -
 drivers/media/i2c/Kconfig                        |   14 -
 drivers/media/i2c/Makefile                       |    1 -
 drivers/media/i2c/ad9389b.c                      | 1215 ----------------------
 include/media/i2c/ad9389b.h                      |   37 -
 6 files changed, 1274 deletions(-)
 delete mode 100644 drivers/media/i2c/ad9389b.c
 delete mode 100644 include/media/i2c/ad9389b.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index ef3b5fff3b01..4819d9aa55f1 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -222,7 +222,6 @@ Video encoders
 ============  ==========================================================
 Driver        Name
 ============  ==========================================================
-ad9389b       Analog Devices AD9389B encoder
 adv7170       Analog Devices ADV7170 video encoder
 adv7175       Analog Devices ADV7175 video encoder
 adv7343       ADV7343 video encoder
diff --git a/MAINTAINERS b/MAINTAINERS
index 9a6c5ac4dbe3..8b475bf063f6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1227,12 +1227,6 @@ F:	Documentation/devicetree/bindings/iio/addac/adi,ad74413r.yaml
 F:	drivers/iio/addac/ad74413r.c
 F:	include/dt-bindings/iio/addac/adi,ad74413r.h
 
-ANALOG DEVICES INC AD9389B DRIVER
-M:	Hans Verkuil <hverkuil-cisco@xs4all.nl>
-L:	linux-media@vger.kernel.org
-S:	Maintained
-F:	drivers/media/i2c/ad9389b*
-
 ANALOG DEVICES INC ADA4250 DRIVER
 M:	Antoniu Miclaus <antoniu.miclaus@analog.com>
 L:	linux-iio@vger.kernel.org
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index f07fc7ea84f1..c369db886b40 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -1454,20 +1454,6 @@ endmenu
 menu "Video encoders"
 	visible if !MEDIA_HIDE_ANCILLARY_SUBDRV
 
-config VIDEO_AD9389B
-	tristate "Analog Devices AD9389B encoder"
-	depends on VIDEO_DEV && I2C
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-
-	help
-	  Support for the Analog Devices AD9389B video encoder.
-
-	  This is a Analog Devices HDMI transmitter.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ad9389b.
-
 config VIDEO_ADV7170
 	tristate "Analog Devices ADV7170 video encoder"
 	depends on VIDEO_DEV && I2C
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 4f5e9d9cee85..77052f941fec 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -4,7 +4,6 @@ msp3400-objs	:=	msp3400-driver.o msp3400-kthreads.o
 
 obj-$(CONFIG_SDR_MAX2175) += max2175.o
 obj-$(CONFIG_VIDEO_AD5820) += ad5820.o
-obj-$(CONFIG_VIDEO_AD9389B) += ad9389b.o
 obj-$(CONFIG_VIDEO_ADP1653) += adp1653.o
 obj-$(CONFIG_VIDEO_ADV7170) += adv7170.o
 obj-$(CONFIG_VIDEO_ADV7175) += adv7175.o
diff --git a/drivers/media/i2c/ad9389b.c b/drivers/media/i2c/ad9389b.c
deleted file mode 100644
index ad17097a2d25..000000000000
--- a/drivers/media/i2c/ad9389b.c
+++ /dev/null
@@ -1,1215 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Analog Devices AD9389B/AD9889B video encoder driver
- *
- * Copyright 2012 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
- */
-
-/*
- * References (c = chapter, p = page):
- * REF_01 - Analog Devices, Programming Guide, AD9889B/AD9389B,
- * HDMI Transitter, Rev. A, October 2010
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/i2c.h>
-#include <linux/delay.h>
-#include <linux/videodev2.h>
-#include <linux/workqueue.h>
-#include <linux/v4l2-dv-timings.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-common.h>
-#include <media/v4l2-dv-timings.h>
-#include <media/v4l2-ctrls.h>
-#include <media/i2c/ad9389b.h>
-
-static int debug;
-module_param(debug, int, 0644);
-MODULE_PARM_DESC(debug, "debug level (0-2)");
-
-MODULE_DESCRIPTION("Analog Devices AD9389B/AD9889B video encoder driver");
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
-MODULE_AUTHOR("Martin Bugge <marbugge@cisco.com>");
-MODULE_LICENSE("GPL");
-
-#define MASK_AD9389B_EDID_RDY_INT   0x04
-#define MASK_AD9389B_MSEN_INT       0x40
-#define MASK_AD9389B_HPD_INT        0x80
-
-#define MASK_AD9389B_HPD_DETECT     0x40
-#define MASK_AD9389B_MSEN_DETECT    0x20
-#define MASK_AD9389B_EDID_RDY       0x10
-
-#define EDID_MAX_RETRIES (8)
-#define EDID_DELAY 250
-#define EDID_MAX_SEGM 8
-
-/*
-**********************************************************************
-*
-*  Arrays with configuration parameters for the AD9389B
-*
-**********************************************************************
-*/
-
-struct ad9389b_state_edid {
-	/* total number of blocks */
-	u32 blocks;
-	/* Number of segments read */
-	u32 segments;
-	u8 data[EDID_MAX_SEGM * 256];
-	/* Number of EDID read retries left */
-	unsigned read_retries;
-};
-
-struct ad9389b_state {
-	struct ad9389b_platform_data pdata;
-	struct v4l2_subdev sd;
-	struct media_pad pad;
-	struct v4l2_ctrl_handler hdl;
-	int chip_revision;
-	/* Is the ad9389b powered on? */
-	bool power_on;
-	/* Did we receive hotplug and rx-sense signals? */
-	bool have_monitor;
-	/* timings from s_dv_timings */
-	struct v4l2_dv_timings dv_timings;
-	/* controls */
-	struct v4l2_ctrl *hdmi_mode_ctrl;
-	struct v4l2_ctrl *hotplug_ctrl;
-	struct v4l2_ctrl *rx_sense_ctrl;
-	struct v4l2_ctrl *have_edid0_ctrl;
-	struct v4l2_ctrl *rgb_quantization_range_ctrl;
-	struct i2c_client *edid_i2c_client;
-	struct ad9389b_state_edid edid;
-	/* Running counter of the number of detected EDIDs (for debugging) */
-	unsigned edid_detect_counter;
-	struct delayed_work edid_handler; /* work entry */
-};
-
-static void ad9389b_check_monitor_present_status(struct v4l2_subdev *sd);
-static bool ad9389b_check_edid_status(struct v4l2_subdev *sd);
-static void ad9389b_setup(struct v4l2_subdev *sd);
-static int ad9389b_s_i2s_clock_freq(struct v4l2_subdev *sd, u32 freq);
-static int ad9389b_s_clock_freq(struct v4l2_subdev *sd, u32 freq);
-
-static inline struct ad9389b_state *get_ad9389b_state(struct v4l2_subdev *sd)
-{
-	return container_of(sd, struct ad9389b_state, sd);
-}
-
-static inline struct v4l2_subdev *to_sd(struct v4l2_ctrl *ctrl)
-{
-	return &container_of(ctrl->handler, struct ad9389b_state, hdl)->sd;
-}
-
-/* ------------------------ I2C ----------------------------------------------- */
-
-static int ad9389b_rd(struct v4l2_subdev *sd, u8 reg)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-
-	return i2c_smbus_read_byte_data(client, reg);
-}
-
-static int ad9389b_wr(struct v4l2_subdev *sd, u8 reg, u8 val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	int ret;
-	int i;
-
-	for (i = 0; i < 3; i++) {
-		ret = i2c_smbus_write_byte_data(client, reg, val);
-		if (ret == 0)
-			return 0;
-	}
-	v4l2_err(sd, "%s: failed reg 0x%x, val 0x%x\n", __func__, reg, val);
-	return ret;
-}
-
-/* To set specific bits in the register, a clear-mask is given (to be AND-ed),
-   and then the value-mask (to be OR-ed). */
-static inline void ad9389b_wr_and_or(struct v4l2_subdev *sd, u8 reg,
-				     u8 clr_mask, u8 val_mask)
-{
-	ad9389b_wr(sd, reg, (ad9389b_rd(sd, reg) & clr_mask) | val_mask);
-}
-
-static void ad9389b_edid_rd(struct v4l2_subdev *sd, u16 len, u8 *buf)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	int i;
-
-	v4l2_dbg(1, debug, sd, "%s:\n", __func__);
-
-	for (i = 0; i < len; i++)
-		buf[i] = i2c_smbus_read_byte_data(state->edid_i2c_client, i);
-}
-
-static inline bool ad9389b_have_hotplug(struct v4l2_subdev *sd)
-{
-	return ad9389b_rd(sd, 0x42) & MASK_AD9389B_HPD_DETECT;
-}
-
-static inline bool ad9389b_have_rx_sense(struct v4l2_subdev *sd)
-{
-	return ad9389b_rd(sd, 0x42) & MASK_AD9389B_MSEN_DETECT;
-}
-
-static void ad9389b_csc_conversion_mode(struct v4l2_subdev *sd, u8 mode)
-{
-	ad9389b_wr_and_or(sd, 0x17, 0xe7, (mode & 0x3)<<3);
-	ad9389b_wr_and_or(sd, 0x18, 0x9f, (mode & 0x3)<<5);
-}
-
-static void ad9389b_csc_coeff(struct v4l2_subdev *sd,
-			      u16 A1, u16 A2, u16 A3, u16 A4,
-			      u16 B1, u16 B2, u16 B3, u16 B4,
-			      u16 C1, u16 C2, u16 C3, u16 C4)
-{
-	/* A */
-	ad9389b_wr_and_or(sd, 0x18, 0xe0, A1>>8);
-	ad9389b_wr(sd, 0x19, A1);
-	ad9389b_wr_and_or(sd, 0x1A, 0xe0, A2>>8);
-	ad9389b_wr(sd, 0x1B, A2);
-	ad9389b_wr_and_or(sd, 0x1c, 0xe0, A3>>8);
-	ad9389b_wr(sd, 0x1d, A3);
-	ad9389b_wr_and_or(sd, 0x1e, 0xe0, A4>>8);
-	ad9389b_wr(sd, 0x1f, A4);
-
-	/* B */
-	ad9389b_wr_and_or(sd, 0x20, 0xe0, B1>>8);
-	ad9389b_wr(sd, 0x21, B1);
-	ad9389b_wr_and_or(sd, 0x22, 0xe0, B2>>8);
-	ad9389b_wr(sd, 0x23, B2);
-	ad9389b_wr_and_or(sd, 0x24, 0xe0, B3>>8);
-	ad9389b_wr(sd, 0x25, B3);
-	ad9389b_wr_and_or(sd, 0x26, 0xe0, B4>>8);
-	ad9389b_wr(sd, 0x27, B4);
-
-	/* C */
-	ad9389b_wr_and_or(sd, 0x28, 0xe0, C1>>8);
-	ad9389b_wr(sd, 0x29, C1);
-	ad9389b_wr_and_or(sd, 0x2A, 0xe0, C2>>8);
-	ad9389b_wr(sd, 0x2B, C2);
-	ad9389b_wr_and_or(sd, 0x2C, 0xe0, C3>>8);
-	ad9389b_wr(sd, 0x2D, C3);
-	ad9389b_wr_and_or(sd, 0x2E, 0xe0, C4>>8);
-	ad9389b_wr(sd, 0x2F, C4);
-}
-
-static void ad9389b_csc_rgb_full2limit(struct v4l2_subdev *sd, bool enable)
-{
-	if (enable) {
-		u8 csc_mode = 0;
-
-		ad9389b_csc_conversion_mode(sd, csc_mode);
-		ad9389b_csc_coeff(sd,
-				  4096-564, 0, 0, 256,
-				  0, 4096-564, 0, 256,
-				  0, 0, 4096-564, 256);
-		/* enable CSC */
-		ad9389b_wr_and_or(sd, 0x3b, 0xfe, 0x1);
-		/* AVI infoframe: Limited range RGB (16-235) */
-		ad9389b_wr_and_or(sd, 0xcd, 0xf9, 0x02);
-	} else {
-		/* disable CSC */
-		ad9389b_wr_and_or(sd, 0x3b, 0xfe, 0x0);
-		/* AVI infoframe: Full range RGB (0-255) */
-		ad9389b_wr_and_or(sd, 0xcd, 0xf9, 0x04);
-	}
-}
-
-static void ad9389b_set_IT_content_AVI_InfoFrame(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	if (state->dv_timings.bt.flags & V4L2_DV_FL_IS_CE_VIDEO) {
-		/* CE format, not IT  */
-		ad9389b_wr_and_or(sd, 0xcd, 0xbf, 0x00);
-	} else {
-		/* IT format */
-		ad9389b_wr_and_or(sd, 0xcd, 0xbf, 0x40);
-	}
-}
-
-static int ad9389b_set_rgb_quantization_mode(struct v4l2_subdev *sd, struct v4l2_ctrl *ctrl)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	switch (ctrl->val) {
-	case V4L2_DV_RGB_RANGE_AUTO:
-		/* automatic */
-		if (state->dv_timings.bt.flags & V4L2_DV_FL_IS_CE_VIDEO) {
-			/* CE format, RGB limited range (16-235) */
-			ad9389b_csc_rgb_full2limit(sd, true);
-		} else {
-			/* not CE format, RGB full range (0-255) */
-			ad9389b_csc_rgb_full2limit(sd, false);
-		}
-		break;
-	case V4L2_DV_RGB_RANGE_LIMITED:
-		/* RGB limited range (16-235) */
-		ad9389b_csc_rgb_full2limit(sd, true);
-		break;
-	case V4L2_DV_RGB_RANGE_FULL:
-		/* RGB full range (0-255) */
-		ad9389b_csc_rgb_full2limit(sd, false);
-		break;
-	default:
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static void ad9389b_set_manual_pll_gear(struct v4l2_subdev *sd, u32 pixelclock)
-{
-	u8 gear;
-
-	/* Workaround for TMDS PLL problem
-	 * The TMDS PLL in AD9389b change gear when the chip is heated above a
-	 * certain temperature. The output is disabled when the PLL change gear
-	 * so the monitor has to lock on the signal again. A workaround for
-	 * this is to use the manual PLL gears. This is a solution from Analog
-	 * Devices that is not documented in the datasheets.
-	 * 0x98 [7] = enable manual gearing. 0x98 [6:4] = gear
-	 *
-	 * The pixel frequency ranges are based on readout of the gear the
-	 * automatic gearing selects for different pixel clocks
-	 * (read from 0x9e [3:1]).
-	 */
-
-	if (pixelclock > 140000000)
-		gear = 0xc0; /* 4th gear */
-	else if (pixelclock > 117000000)
-		gear = 0xb0; /* 3rd gear */
-	else if (pixelclock > 87000000)
-		gear = 0xa0; /* 2nd gear */
-	else if (pixelclock > 60000000)
-		gear = 0x90; /* 1st gear */
-	else
-		gear = 0x80; /* 0th gear */
-
-	ad9389b_wr_and_or(sd, 0x98, 0x0f, gear);
-}
-
-/* ------------------------------ CTRL OPS ------------------------------ */
-
-static int ad9389b_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct v4l2_subdev *sd = to_sd(ctrl);
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	v4l2_dbg(1, debug, sd,
-		 "%s: ctrl id: %d, ctrl->val %d\n", __func__, ctrl->id, ctrl->val);
-
-	if (state->hdmi_mode_ctrl == ctrl) {
-		/* Set HDMI or DVI-D */
-		ad9389b_wr_and_or(sd, 0xaf, 0xfd,
-				  ctrl->val == V4L2_DV_TX_MODE_HDMI ? 0x02 : 0x00);
-		return 0;
-	}
-	if (state->rgb_quantization_range_ctrl == ctrl)
-		return ad9389b_set_rgb_quantization_mode(sd, ctrl);
-	return -EINVAL;
-}
-
-static const struct v4l2_ctrl_ops ad9389b_ctrl_ops = {
-	.s_ctrl = ad9389b_s_ctrl,
-};
-
-/* ---------------------------- CORE OPS ------------------------------------------- */
-
-#ifdef CONFIG_VIDEO_ADV_DEBUG
-static int ad9389b_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
-{
-	reg->val = ad9389b_rd(sd, reg->reg & 0xff);
-	reg->size = 1;
-	return 0;
-}
-
-static int ad9389b_s_register(struct v4l2_subdev *sd, const struct v4l2_dbg_register *reg)
-{
-	ad9389b_wr(sd, reg->reg & 0xff, reg->val & 0xff);
-	return 0;
-}
-#endif
-
-static int ad9389b_log_status(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	struct ad9389b_state_edid *edid = &state->edid;
-
-	static const char * const states[] = {
-		"in reset",
-		"reading EDID",
-		"idle",
-		"initializing HDCP",
-		"HDCP enabled",
-		"initializing HDCP repeater",
-		"6", "7", "8", "9", "A", "B", "C", "D", "E", "F"
-	};
-	static const char * const errors[] = {
-		"no error",
-		"bad receiver BKSV",
-		"Ri mismatch",
-		"Pj mismatch",
-		"i2c error",
-		"timed out",
-		"max repeater cascade exceeded",
-		"hash check failed",
-		"too many devices",
-		"9", "A", "B", "C", "D", "E", "F"
-	};
-
-	u8 manual_gear;
-
-	v4l2_info(sd, "chip revision %d\n", state->chip_revision);
-	v4l2_info(sd, "power %s\n", state->power_on ? "on" : "off");
-	v4l2_info(sd, "%s hotplug, %s Rx Sense, %s EDID (%d block(s))\n",
-		  (ad9389b_rd(sd, 0x42) & MASK_AD9389B_HPD_DETECT) ?
-		  "detected" : "no",
-		  (ad9389b_rd(sd, 0x42) & MASK_AD9389B_MSEN_DETECT) ?
-		  "detected" : "no",
-		  edid->segments ? "found" : "no", edid->blocks);
-	v4l2_info(sd, "%s output %s\n",
-		  (ad9389b_rd(sd, 0xaf) & 0x02) ?
-		  "HDMI" : "DVI-D",
-		  (ad9389b_rd(sd, 0xa1) & 0x3c) ?
-		  "disabled" : "enabled");
-	v4l2_info(sd, "ad9389b: %s\n", (ad9389b_rd(sd, 0xb8) & 0x40) ?
-		  "encrypted" : "no encryption");
-	v4l2_info(sd, "state: %s, error: %s, detect count: %u, msk/irq: %02x/%02x\n",
-		  states[ad9389b_rd(sd, 0xc8) & 0xf],
-		  errors[ad9389b_rd(sd, 0xc8) >> 4],
-		  state->edid_detect_counter,
-		  ad9389b_rd(sd, 0x94), ad9389b_rd(sd, 0x96));
-	manual_gear = ad9389b_rd(sd, 0x98) & 0x80;
-	v4l2_info(sd, "ad9389b: RGB quantization: %s range\n",
-		  ad9389b_rd(sd, 0x3b) & 0x01 ? "limited" : "full");
-	v4l2_info(sd, "ad9389b: %s gear %d\n",
-		  manual_gear ? "manual" : "automatic",
-		  manual_gear ? ((ad9389b_rd(sd, 0x98) & 0x70) >> 4) :
-		  ((ad9389b_rd(sd, 0x9e) & 0x0e) >> 1));
-	if (ad9389b_rd(sd, 0xaf) & 0x02) {
-		/* HDMI only */
-		u8 manual_cts = ad9389b_rd(sd, 0x0a) & 0x80;
-		u32 N = (ad9389b_rd(sd, 0x01) & 0xf) << 16 |
-			ad9389b_rd(sd, 0x02) << 8 |
-			ad9389b_rd(sd, 0x03);
-		u8 vic_detect = ad9389b_rd(sd, 0x3e) >> 2;
-		u8 vic_sent = ad9389b_rd(sd, 0x3d) & 0x3f;
-		u32 CTS;
-
-		if (manual_cts)
-			CTS = (ad9389b_rd(sd, 0x07) & 0xf) << 16 |
-			      ad9389b_rd(sd, 0x08) << 8 |
-			      ad9389b_rd(sd, 0x09);
-		else
-			CTS = (ad9389b_rd(sd, 0x04) & 0xf) << 16 |
-			      ad9389b_rd(sd, 0x05) << 8 |
-			      ad9389b_rd(sd, 0x06);
-		N = (ad9389b_rd(sd, 0x01) & 0xf) << 16 |
-		    ad9389b_rd(sd, 0x02) << 8 |
-		    ad9389b_rd(sd, 0x03);
-
-		v4l2_info(sd, "ad9389b: CTS %s mode: N %d, CTS %d\n",
-			  manual_cts ? "manual" : "automatic", N, CTS);
-
-		v4l2_info(sd, "ad9389b: VIC: detected %d, sent %d\n",
-			  vic_detect, vic_sent);
-	}
-	if (state->dv_timings.type == V4L2_DV_BT_656_1120)
-		v4l2_print_dv_timings(sd->name, "timings: ",
-				&state->dv_timings, false);
-	else
-		v4l2_info(sd, "no timings set\n");
-	return 0;
-}
-
-/* Power up/down ad9389b */
-static int ad9389b_s_power(struct v4l2_subdev *sd, int on)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	struct ad9389b_platform_data *pdata = &state->pdata;
-	const int retries = 20;
-	int i;
-
-	v4l2_dbg(1, debug, sd, "%s: power %s\n", __func__, on ? "on" : "off");
-
-	state->power_on = on;
-
-	if (!on) {
-		/* Power down */
-		ad9389b_wr_and_or(sd, 0x41, 0xbf, 0x40);
-		return true;
-	}
-
-	/* Power up */
-	/* The ad9389b does not always come up immediately.
-	   Retry multiple times. */
-	for (i = 0; i < retries; i++) {
-		ad9389b_wr_and_or(sd, 0x41, 0xbf, 0x0);
-		if ((ad9389b_rd(sd, 0x41) & 0x40) == 0)
-			break;
-		ad9389b_wr_and_or(sd, 0x41, 0xbf, 0x40);
-		msleep(10);
-	}
-	if (i == retries) {
-		v4l2_dbg(1, debug, sd, "failed to powerup the ad9389b\n");
-		ad9389b_s_power(sd, 0);
-		return false;
-	}
-	if (i > 1)
-		v4l2_dbg(1, debug, sd,
-			 "needed %d retries to powerup the ad9389b\n", i);
-
-	/* Select chip: AD9389B */
-	ad9389b_wr_and_or(sd, 0xba, 0xef, 0x10);
-
-	/* Reserved registers that must be set according to REF_01 p. 11*/
-	ad9389b_wr_and_or(sd, 0x98, 0xf0, 0x07);
-	ad9389b_wr(sd, 0x9c, 0x38);
-	ad9389b_wr_and_or(sd, 0x9d, 0xfc, 0x01);
-
-	/* Differential output drive strength */
-	if (pdata->diff_data_drive_strength > 0)
-		ad9389b_wr(sd, 0xa2, pdata->diff_data_drive_strength);
-	else
-		ad9389b_wr(sd, 0xa2, 0x87);
-
-	if (pdata->diff_clk_drive_strength > 0)
-		ad9389b_wr(sd, 0xa3, pdata->diff_clk_drive_strength);
-	else
-		ad9389b_wr(sd, 0xa3, 0x87);
-
-	ad9389b_wr(sd, 0x0a, 0x01);
-	ad9389b_wr(sd, 0xbb, 0xff);
-
-	/* Set number of attempts to read the EDID */
-	ad9389b_wr(sd, 0xc9, 0xf);
-	return true;
-}
-
-/* Enable interrupts */
-static void ad9389b_set_isr(struct v4l2_subdev *sd, bool enable)
-{
-	u8 irqs = MASK_AD9389B_HPD_INT | MASK_AD9389B_MSEN_INT;
-	u8 irqs_rd;
-	int retries = 100;
-
-	/* The datasheet says that the EDID ready interrupt should be
-	   disabled if there is no hotplug. */
-	if (!enable)
-		irqs = 0;
-	else if (ad9389b_have_hotplug(sd))
-		irqs |= MASK_AD9389B_EDID_RDY_INT;
-
-	/*
-	 * This i2c write can fail (approx. 1 in 1000 writes). But it
-	 * is essential that this register is correct, so retry it
-	 * multiple times.
-	 *
-	 * Note that the i2c write does not report an error, but the readback
-	 * clearly shows the wrong value.
-	 */
-	do {
-		ad9389b_wr(sd, 0x94, irqs);
-		irqs_rd = ad9389b_rd(sd, 0x94);
-	} while (retries-- && irqs_rd != irqs);
-
-	if (irqs_rd != irqs)
-		v4l2_err(sd, "Could not set interrupts: hw failure?\n");
-}
-
-/* Interrupt handler */
-static int ad9389b_isr(struct v4l2_subdev *sd, u32 status, bool *handled)
-{
-	u8 irq_status;
-
-	/* disable interrupts to prevent a race condition */
-	ad9389b_set_isr(sd, false);
-	irq_status = ad9389b_rd(sd, 0x96);
-	/* clear detected interrupts */
-	ad9389b_wr(sd, 0x96, irq_status);
-	/* enable interrupts */
-	ad9389b_set_isr(sd, true);
-
-	v4l2_dbg(1, debug, sd, "%s: irq_status 0x%x\n", __func__, irq_status);
-
-	if (irq_status & (MASK_AD9389B_HPD_INT))
-		ad9389b_check_monitor_present_status(sd);
-	if (irq_status & MASK_AD9389B_EDID_RDY_INT)
-		ad9389b_check_edid_status(sd);
-
-	*handled = true;
-	return 0;
-}
-
-static const struct v4l2_subdev_core_ops ad9389b_core_ops = {
-	.log_status = ad9389b_log_status,
-#ifdef CONFIG_VIDEO_ADV_DEBUG
-	.g_register = ad9389b_g_register,
-	.s_register = ad9389b_s_register,
-#endif
-	.s_power = ad9389b_s_power,
-	.interrupt_service_routine = ad9389b_isr,
-};
-
-/* ------------------------------ VIDEO OPS ------------------------------ */
-
-/* Enable/disable ad9389b output */
-static int ad9389b_s_stream(struct v4l2_subdev *sd, int enable)
-{
-	v4l2_dbg(1, debug, sd, "%s: %sable\n", __func__, (enable ? "en" : "dis"));
-
-	ad9389b_wr_and_or(sd, 0xa1, ~0x3c, (enable ? 0 : 0x3c));
-	if (enable) {
-		ad9389b_check_monitor_present_status(sd);
-	} else {
-		ad9389b_s_power(sd, 0);
-	}
-	return 0;
-}
-
-static const struct v4l2_dv_timings_cap ad9389b_timings_cap = {
-	.type = V4L2_DV_BT_656_1120,
-	/* keep this initialization for compatibility with GCC < 4.4.6 */
-	.reserved = { 0 },
-	V4L2_INIT_BT_TIMINGS(640, 1920, 350, 1200, 25000000, 170000000,
-		V4L2_DV_BT_STD_CEA861 | V4L2_DV_BT_STD_DMT |
-			V4L2_DV_BT_STD_GTF | V4L2_DV_BT_STD_CVT,
-		V4L2_DV_BT_CAP_PROGRESSIVE | V4L2_DV_BT_CAP_REDUCED_BLANKING |
-		V4L2_DV_BT_CAP_CUSTOM)
-};
-
-static int ad9389b_s_dv_timings(struct v4l2_subdev *sd,
-				struct v4l2_dv_timings *timings)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	v4l2_dbg(1, debug, sd, "%s:\n", __func__);
-
-	/* quick sanity check */
-	if (!v4l2_valid_dv_timings(timings, &ad9389b_timings_cap, NULL, NULL))
-		return -EINVAL;
-
-	/* Fill the optional fields .standards and .flags in struct v4l2_dv_timings
-	   if the format is one of the CEA or DMT timings. */
-	v4l2_find_dv_timings_cap(timings, &ad9389b_timings_cap, 0, NULL, NULL);
-
-	timings->bt.flags &= ~V4L2_DV_FL_REDUCED_FPS;
-
-	/* save timings */
-	state->dv_timings = *timings;
-
-	/* update quantization range based on new dv_timings */
-	ad9389b_set_rgb_quantization_mode(sd, state->rgb_quantization_range_ctrl);
-
-	/* update PLL gear based on new dv_timings */
-	if (state->pdata.tmds_pll_gear == AD9389B_TMDS_PLL_GEAR_SEMI_AUTOMATIC)
-		ad9389b_set_manual_pll_gear(sd, (u32)timings->bt.pixelclock);
-
-	/* update AVI infoframe */
-	ad9389b_set_IT_content_AVI_InfoFrame(sd);
-
-	return 0;
-}
-
-static int ad9389b_g_dv_timings(struct v4l2_subdev *sd,
-				struct v4l2_dv_timings *timings)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	v4l2_dbg(1, debug, sd, "%s:\n", __func__);
-
-	if (!timings)
-		return -EINVAL;
-
-	*timings = state->dv_timings;
-
-	return 0;
-}
-
-static int ad9389b_enum_dv_timings(struct v4l2_subdev *sd,
-				   struct v4l2_enum_dv_timings *timings)
-{
-	if (timings->pad != 0)
-		return -EINVAL;
-
-	return v4l2_enum_dv_timings_cap(timings, &ad9389b_timings_cap,
-			NULL, NULL);
-}
-
-static int ad9389b_dv_timings_cap(struct v4l2_subdev *sd,
-				  struct v4l2_dv_timings_cap *cap)
-{
-	if (cap->pad != 0)
-		return -EINVAL;
-
-	*cap = ad9389b_timings_cap;
-	return 0;
-}
-
-static const struct v4l2_subdev_video_ops ad9389b_video_ops = {
-	.s_stream = ad9389b_s_stream,
-	.s_dv_timings = ad9389b_s_dv_timings,
-	.g_dv_timings = ad9389b_g_dv_timings,
-};
-
-/* ------------------------------ PAD OPS ------------------------------ */
-
-static int ad9389b_get_edid(struct v4l2_subdev *sd, struct v4l2_edid *edid)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	if (edid->pad != 0)
-		return -EINVAL;
-	if (edid->blocks == 0 || edid->blocks > 256)
-		return -EINVAL;
-	if (!state->edid.segments) {
-		v4l2_dbg(1, debug, sd, "EDID segment 0 not found\n");
-		return -ENODATA;
-	}
-	if (edid->start_block >= state->edid.segments * 2)
-		return -E2BIG;
-	if (edid->blocks + edid->start_block >= state->edid.segments * 2)
-		edid->blocks = state->edid.segments * 2 - edid->start_block;
-	memcpy(edid->edid, &state->edid.data[edid->start_block * 128],
-	       128 * edid->blocks);
-	return 0;
-}
-
-static const struct v4l2_subdev_pad_ops ad9389b_pad_ops = {
-	.get_edid = ad9389b_get_edid,
-	.enum_dv_timings = ad9389b_enum_dv_timings,
-	.dv_timings_cap = ad9389b_dv_timings_cap,
-};
-
-/* ------------------------------ AUDIO OPS ------------------------------ */
-
-static int ad9389b_s_audio_stream(struct v4l2_subdev *sd, int enable)
-{
-	v4l2_dbg(1, debug, sd, "%s: %sable\n", __func__, (enable ? "en" : "dis"));
-
-	if (enable)
-		ad9389b_wr_and_or(sd, 0x45, 0x3f, 0x80);
-	else
-		ad9389b_wr_and_or(sd, 0x45, 0x3f, 0x40);
-
-	return 0;
-}
-
-static int ad9389b_s_clock_freq(struct v4l2_subdev *sd, u32 freq)
-{
-	u32 N;
-
-	switch (freq) {
-	case 32000:  N = 4096;  break;
-	case 44100:  N = 6272;  break;
-	case 48000:  N = 6144;  break;
-	case 88200:  N = 12544; break;
-	case 96000:  N = 12288; break;
-	case 176400: N = 25088; break;
-	case 192000: N = 24576; break;
-	default:
-	     return -EINVAL;
-	}
-
-	/* Set N (used with CTS to regenerate the audio clock) */
-	ad9389b_wr(sd, 0x01, (N >> 16) & 0xf);
-	ad9389b_wr(sd, 0x02, (N >> 8) & 0xff);
-	ad9389b_wr(sd, 0x03, N & 0xff);
-
-	return 0;
-}
-
-static int ad9389b_s_i2s_clock_freq(struct v4l2_subdev *sd, u32 freq)
-{
-	u32 i2s_sf;
-
-	switch (freq) {
-	case 32000:  i2s_sf = 0x30; break;
-	case 44100:  i2s_sf = 0x00; break;
-	case 48000:  i2s_sf = 0x20; break;
-	case 88200:  i2s_sf = 0x80; break;
-	case 96000:  i2s_sf = 0xa0; break;
-	case 176400: i2s_sf = 0xc0; break;
-	case 192000: i2s_sf = 0xe0; break;
-	default:
-	     return -EINVAL;
-	}
-
-	/* Set sampling frequency for I2S audio to 48 kHz */
-	ad9389b_wr_and_or(sd, 0x15, 0xf, i2s_sf);
-
-	return 0;
-}
-
-static int ad9389b_s_routing(struct v4l2_subdev *sd, u32 input, u32 output, u32 config)
-{
-	/* TODO based on input/output/config */
-	/* TODO See datasheet "Programmers guide" p. 39-40 */
-
-	/* Only 2 channels in use for application */
-	ad9389b_wr_and_or(sd, 0x50, 0x1f, 0x20);
-	/* Speaker mapping */
-	ad9389b_wr(sd, 0x51, 0x00);
-
-	/* TODO Where should this be placed? */
-	/* 16 bit audio word length */
-	ad9389b_wr_and_or(sd, 0x14, 0xf0, 0x02);
-
-	return 0;
-}
-
-static const struct v4l2_subdev_audio_ops ad9389b_audio_ops = {
-	.s_stream = ad9389b_s_audio_stream,
-	.s_clock_freq = ad9389b_s_clock_freq,
-	.s_i2s_clock_freq = ad9389b_s_i2s_clock_freq,
-	.s_routing = ad9389b_s_routing,
-};
-
-/* --------------------- SUBDEV OPS --------------------------------------- */
-
-static const struct v4l2_subdev_ops ad9389b_ops = {
-	.core  = &ad9389b_core_ops,
-	.video = &ad9389b_video_ops,
-	.audio = &ad9389b_audio_ops,
-	.pad = &ad9389b_pad_ops,
-};
-
-/* ----------------------------------------------------------------------- */
-static void ad9389b_dbg_dump_edid(int lvl, int debug, struct v4l2_subdev *sd,
-				  int segment, u8 *buf)
-{
-	int i, j;
-
-	if (debug < lvl)
-		return;
-
-	v4l2_dbg(lvl, debug, sd, "edid segment %d\n", segment);
-	for (i = 0; i < 256; i += 16) {
-		u8 b[128];
-		u8 *bp = b;
-
-		if (i == 128)
-			v4l2_dbg(lvl, debug, sd, "\n");
-		for (j = i; j < i + 16; j++) {
-			sprintf(bp, "0x%02x, ", buf[j]);
-			bp += 6;
-		}
-		bp[0] = '\0';
-		v4l2_dbg(lvl, debug, sd, "%s\n", b);
-	}
-}
-
-static void ad9389b_edid_handler(struct work_struct *work)
-{
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct ad9389b_state *state =
-		container_of(dwork, struct ad9389b_state, edid_handler);
-	struct v4l2_subdev *sd = &state->sd;
-	struct ad9389b_edid_detect ed;
-
-	v4l2_dbg(1, debug, sd, "%s:\n", __func__);
-
-	if (ad9389b_check_edid_status(sd)) {
-		/* Return if we received the EDID. */
-		return;
-	}
-
-	if (ad9389b_have_hotplug(sd)) {
-		/* We must retry reading the EDID several times, it is possible
-		 * that initially the EDID couldn't be read due to i2c errors
-		 * (DVI connectors are particularly prone to this problem). */
-		if (state->edid.read_retries) {
-			state->edid.read_retries--;
-			v4l2_dbg(1, debug, sd, "%s: edid read failed\n", __func__);
-			ad9389b_s_power(sd, false);
-			ad9389b_s_power(sd, true);
-			schedule_delayed_work(&state->edid_handler, EDID_DELAY);
-			return;
-		}
-	}
-
-	/* We failed to read the EDID, so send an event for this. */
-	ed.present = false;
-	ed.segment = ad9389b_rd(sd, 0xc4);
-	v4l2_subdev_notify(sd, AD9389B_EDID_DETECT, (void *)&ed);
-	v4l2_dbg(1, debug, sd, "%s: no edid found\n", __func__);
-}
-
-static void ad9389b_audio_setup(struct v4l2_subdev *sd)
-{
-	v4l2_dbg(1, debug, sd, "%s\n", __func__);
-
-	ad9389b_s_i2s_clock_freq(sd, 48000);
-	ad9389b_s_clock_freq(sd, 48000);
-	ad9389b_s_routing(sd, 0, 0, 0);
-}
-
-/* Initial setup of AD9389b */
-
-/* Configure hdmi transmitter. */
-static void ad9389b_setup(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	v4l2_dbg(1, debug, sd, "%s\n", __func__);
-
-	/* Input format: RGB 4:4:4 */
-	ad9389b_wr_and_or(sd, 0x15, 0xf1, 0x0);
-	/* Output format: RGB 4:4:4 */
-	ad9389b_wr_and_or(sd, 0x16, 0x3f, 0x0);
-	/* 1st order interpolation 4:2:2 -> 4:4:4 up conversion,
-	   Aspect ratio: 16:9 */
-	ad9389b_wr_and_or(sd, 0x17, 0xf9, 0x06);
-	/* Output format: RGB 4:4:4, Active Format Information is valid. */
-	ad9389b_wr_and_or(sd, 0x45, 0xc7, 0x08);
-	/* Underscanned */
-	ad9389b_wr_and_or(sd, 0x46, 0x3f, 0x80);
-	/* Setup video format */
-	ad9389b_wr(sd, 0x3c, 0x0);
-	/* Active format aspect ratio: same as picure. */
-	ad9389b_wr(sd, 0x47, 0x80);
-	/* No encryption */
-	ad9389b_wr_and_or(sd, 0xaf, 0xef, 0x0);
-	/* Positive clk edge capture for input video clock */
-	ad9389b_wr_and_or(sd, 0xba, 0x1f, 0x60);
-
-	ad9389b_audio_setup(sd);
-
-	v4l2_ctrl_handler_setup(&state->hdl);
-
-	ad9389b_set_IT_content_AVI_InfoFrame(sd);
-}
-
-static void ad9389b_notify_monitor_detect(struct v4l2_subdev *sd)
-{
-	struct ad9389b_monitor_detect mdt;
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	mdt.present = state->have_monitor;
-	v4l2_subdev_notify(sd, AD9389B_MONITOR_DETECT, (void *)&mdt);
-}
-
-static void ad9389b_update_monitor_present_status(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	/* read hotplug and rx-sense state */
-	u8 status = ad9389b_rd(sd, 0x42);
-
-	v4l2_dbg(1, debug, sd, "%s: status: 0x%x%s%s\n",
-		 __func__,
-		 status,
-		 status & MASK_AD9389B_HPD_DETECT ? ", hotplug" : "",
-		 status & MASK_AD9389B_MSEN_DETECT ? ", rx-sense" : "");
-
-	if (status & MASK_AD9389B_HPD_DETECT) {
-		v4l2_dbg(1, debug, sd, "%s: hotplug detected\n", __func__);
-		state->have_monitor = true;
-		if (!ad9389b_s_power(sd, true)) {
-			v4l2_dbg(1, debug, sd,
-				 "%s: monitor detected, powerup failed\n", __func__);
-			return;
-		}
-		ad9389b_setup(sd);
-		ad9389b_notify_monitor_detect(sd);
-		state->edid.read_retries = EDID_MAX_RETRIES;
-		schedule_delayed_work(&state->edid_handler, EDID_DELAY);
-	} else if (!(status & MASK_AD9389B_HPD_DETECT)) {
-		v4l2_dbg(1, debug, sd, "%s: hotplug not detected\n", __func__);
-		state->have_monitor = false;
-		ad9389b_notify_monitor_detect(sd);
-		ad9389b_s_power(sd, false);
-		memset(&state->edid, 0, sizeof(struct ad9389b_state_edid));
-	}
-
-	/* update read only ctrls */
-	v4l2_ctrl_s_ctrl(state->hotplug_ctrl, ad9389b_have_hotplug(sd) ? 0x1 : 0x0);
-	v4l2_ctrl_s_ctrl(state->rx_sense_ctrl, ad9389b_have_rx_sense(sd) ? 0x1 : 0x0);
-	v4l2_ctrl_s_ctrl(state->have_edid0_ctrl, state->edid.segments ? 0x1 : 0x0);
-
-	/* update with setting from ctrls */
-	ad9389b_s_ctrl(state->rgb_quantization_range_ctrl);
-	ad9389b_s_ctrl(state->hdmi_mode_ctrl);
-}
-
-static void ad9389b_check_monitor_present_status(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	int retry = 0;
-
-	ad9389b_update_monitor_present_status(sd);
-
-	/*
-	 * Rapid toggling of the hotplug may leave the chip powered off,
-	 * even if we think it is on. In that case reset and power up again.
-	 */
-	while (state->power_on && (ad9389b_rd(sd, 0x41) & 0x40)) {
-		if (++retry > 5) {
-			v4l2_err(sd, "retried %d times, give up\n", retry);
-			return;
-		}
-		v4l2_dbg(1, debug, sd, "%s: reset and re-check status (%d)\n", __func__, retry);
-		ad9389b_notify_monitor_detect(sd);
-		cancel_delayed_work_sync(&state->edid_handler);
-		memset(&state->edid, 0, sizeof(struct ad9389b_state_edid));
-		ad9389b_s_power(sd, false);
-		ad9389b_update_monitor_present_status(sd);
-	}
-}
-
-static bool edid_block_verify_crc(u8 *edid_block)
-{
-	u8 sum = 0;
-	int i;
-
-	for (i = 0; i < 128; i++)
-		sum += edid_block[i];
-	return sum == 0;
-}
-
-static bool edid_verify_crc(struct v4l2_subdev *sd, u32 segment)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	u32 blocks = state->edid.blocks;
-	u8 *data = state->edid.data;
-
-	if (edid_block_verify_crc(&data[segment * 256])) {
-		if ((segment + 1) * 2 <= blocks)
-			return edid_block_verify_crc(&data[segment * 256 + 128]);
-		return true;
-	}
-	return false;
-}
-
-static bool edid_verify_header(struct v4l2_subdev *sd, u32 segment)
-{
-	static const u8 hdmi_header[] = {
-		0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00
-	};
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	u8 *data = state->edid.data;
-	int i;
-
-	if (segment)
-		return true;
-
-	for (i = 0; i < ARRAY_SIZE(hdmi_header); i++)
-		if (data[i] != hdmi_header[i])
-			return false;
-
-	return true;
-}
-
-static bool ad9389b_check_edid_status(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	struct ad9389b_edid_detect ed;
-	int segment;
-	u8 edidRdy = ad9389b_rd(sd, 0xc5);
-
-	v4l2_dbg(1, debug, sd, "%s: edid ready (retries: %d)\n",
-		 __func__, EDID_MAX_RETRIES - state->edid.read_retries);
-
-	if (!(edidRdy & MASK_AD9389B_EDID_RDY))
-		return false;
-
-	segment = ad9389b_rd(sd, 0xc4);
-	if (segment >= EDID_MAX_SEGM) {
-		v4l2_err(sd, "edid segment number too big\n");
-		return false;
-	}
-	v4l2_dbg(1, debug, sd, "%s: got segment %d\n", __func__, segment);
-	ad9389b_edid_rd(sd, 256, &state->edid.data[segment * 256]);
-	ad9389b_dbg_dump_edid(2, debug, sd, segment,
-			      &state->edid.data[segment * 256]);
-	if (segment == 0) {
-		state->edid.blocks = state->edid.data[0x7e] + 1;
-		v4l2_dbg(1, debug, sd, "%s: %d blocks in total\n",
-			 __func__, state->edid.blocks);
-	}
-	if (!edid_verify_crc(sd, segment) ||
-	    !edid_verify_header(sd, segment)) {
-		/* edid crc error, force reread of edid segment */
-		v4l2_err(sd, "%s: edid crc or header error\n", __func__);
-		ad9389b_s_power(sd, false);
-		ad9389b_s_power(sd, true);
-		return false;
-	}
-	/* one more segment read ok */
-	state->edid.segments = segment + 1;
-	if (((state->edid.data[0x7e] >> 1) + 1) > state->edid.segments) {
-		/* Request next EDID segment */
-		v4l2_dbg(1, debug, sd, "%s: request segment %d\n",
-			 __func__, state->edid.segments);
-		ad9389b_wr(sd, 0xc9, 0xf);
-		ad9389b_wr(sd, 0xc4, state->edid.segments);
-		state->edid.read_retries = EDID_MAX_RETRIES;
-		schedule_delayed_work(&state->edid_handler, EDID_DELAY);
-		return false;
-	}
-
-	/* report when we have all segments but report only for segment 0 */
-	ed.present = true;
-	ed.segment = 0;
-	v4l2_subdev_notify(sd, AD9389B_EDID_DETECT, (void *)&ed);
-	state->edid_detect_counter++;
-	v4l2_ctrl_s_ctrl(state->have_edid0_ctrl, state->edid.segments ? 0x1 : 0x0);
-	return ed.present;
-}
-
-/* ----------------------------------------------------------------------- */
-
-static void ad9389b_init_setup(struct v4l2_subdev *sd)
-{
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-	struct ad9389b_state_edid *edid = &state->edid;
-
-	v4l2_dbg(1, debug, sd, "%s\n", __func__);
-
-	/* clear all interrupts */
-	ad9389b_wr(sd, 0x96, 0xff);
-
-	memset(edid, 0, sizeof(struct ad9389b_state_edid));
-	state->have_monitor = false;
-	ad9389b_set_isr(sd, false);
-}
-
-static int ad9389b_probe(struct i2c_client *client)
-{
-	const struct v4l2_dv_timings dv1080p60 = V4L2_DV_BT_CEA_1920X1080P60;
-	struct ad9389b_state *state;
-	struct ad9389b_platform_data *pdata = client->dev.platform_data;
-	struct v4l2_ctrl_handler *hdl;
-	struct v4l2_subdev *sd;
-	int err = -EIO;
-
-	/* Check if the adapter supports the needed features */
-	if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA))
-		return -EIO;
-
-	v4l_dbg(1, debug, client, "detecting ad9389b client on address 0x%x\n",
-		client->addr << 1);
-
-	state = devm_kzalloc(&client->dev, sizeof(*state), GFP_KERNEL);
-	if (!state)
-		return -ENOMEM;
-
-	/* Platform data */
-	if (pdata == NULL) {
-		v4l_err(client, "No platform data!\n");
-		return -ENODEV;
-	}
-	memcpy(&state->pdata, pdata, sizeof(state->pdata));
-
-	sd = &state->sd;
-	v4l2_i2c_subdev_init(sd, client, &ad9389b_ops);
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	hdl = &state->hdl;
-	v4l2_ctrl_handler_init(hdl, 5);
-
-	state->hdmi_mode_ctrl = v4l2_ctrl_new_std_menu(hdl, &ad9389b_ctrl_ops,
-			V4L2_CID_DV_TX_MODE, V4L2_DV_TX_MODE_HDMI,
-			0, V4L2_DV_TX_MODE_DVI_D);
-	state->hotplug_ctrl = v4l2_ctrl_new_std(hdl, NULL,
-			V4L2_CID_DV_TX_HOTPLUG, 0, 1, 0, 0);
-	state->rx_sense_ctrl = v4l2_ctrl_new_std(hdl, NULL,
-			V4L2_CID_DV_TX_RXSENSE, 0, 1, 0, 0);
-	state->have_edid0_ctrl = v4l2_ctrl_new_std(hdl, NULL,
-			V4L2_CID_DV_TX_EDID_PRESENT, 0, 1, 0, 0);
-	state->rgb_quantization_range_ctrl =
-		v4l2_ctrl_new_std_menu(hdl, &ad9389b_ctrl_ops,
-			V4L2_CID_DV_TX_RGB_RANGE, V4L2_DV_RGB_RANGE_FULL,
-			0, V4L2_DV_RGB_RANGE_AUTO);
-	sd->ctrl_handler = hdl;
-	if (hdl->error) {
-		err = hdl->error;
-
-		goto err_hdl;
-	}
-	state->pad.flags = MEDIA_PAD_FL_SINK;
-	sd->entity.function = MEDIA_ENT_F_DV_ENCODER;
-	err = media_entity_pads_init(&sd->entity, 1, &state->pad);
-	if (err)
-		goto err_hdl;
-
-	state->chip_revision = ad9389b_rd(sd, 0x0);
-	if (state->chip_revision != 2) {
-		v4l2_err(sd, "chip_revision %d != 2\n", state->chip_revision);
-		err = -EIO;
-		goto err_entity;
-	}
-	v4l2_dbg(1, debug, sd, "reg 0x41 0x%x, chip version (reg 0x00) 0x%x\n",
-		 ad9389b_rd(sd, 0x41), state->chip_revision);
-
-	state->edid_i2c_client = i2c_new_dummy_device(client->adapter, (0x7e >> 1));
-	if (IS_ERR(state->edid_i2c_client)) {
-		v4l2_err(sd, "failed to register edid i2c client\n");
-		err = PTR_ERR(state->edid_i2c_client);
-		goto err_entity;
-	}
-
-	INIT_DELAYED_WORK(&state->edid_handler, ad9389b_edid_handler);
-	state->dv_timings = dv1080p60;
-
-	ad9389b_init_setup(sd);
-	ad9389b_set_isr(sd, true);
-
-	v4l2_info(sd, "%s found @ 0x%x (%s)\n", client->name,
-		  client->addr << 1, client->adapter->name);
-	return 0;
-
-err_entity:
-	media_entity_cleanup(&sd->entity);
-err_hdl:
-	v4l2_ctrl_handler_free(&state->hdl);
-	return err;
-}
-
-/* ----------------------------------------------------------------------- */
-
-static void ad9389b_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *sd = i2c_get_clientdata(client);
-	struct ad9389b_state *state = get_ad9389b_state(sd);
-
-	state->chip_revision = -1;
-
-	v4l2_dbg(1, debug, sd, "%s removed @ 0x%x (%s)\n", client->name,
-		 client->addr << 1, client->adapter->name);
-
-	ad9389b_s_stream(sd, false);
-	ad9389b_s_audio_stream(sd, false);
-	ad9389b_init_setup(sd);
-	cancel_delayed_work_sync(&state->edid_handler);
-	i2c_unregister_device(state->edid_i2c_client);
-	v4l2_device_unregister_subdev(sd);
-	media_entity_cleanup(&sd->entity);
-	v4l2_ctrl_handler_free(sd->ctrl_handler);
-}
-
-/* ----------------------------------------------------------------------- */
-
-static const struct i2c_device_id ad9389b_id[] = {
-	{ "ad9389b", 0 },
-	{ "ad9889b", 0 },
-	{ }
-};
-MODULE_DEVICE_TABLE(i2c, ad9389b_id);
-
-static struct i2c_driver ad9389b_driver = {
-	.driver = {
-		.name = "ad9389b",
-	},
-	.probe_new = ad9389b_probe,
-	.remove = ad9389b_remove,
-	.id_table = ad9389b_id,
-};
-
-module_i2c_driver(ad9389b_driver);
diff --git a/include/media/i2c/ad9389b.h b/include/media/i2c/ad9389b.h
deleted file mode 100644
index 30f9ea9a1273..000000000000
--- a/include/media/i2c/ad9389b.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Analog Devices AD9389B/AD9889B video encoder driver header
- *
- * Copyright 2012 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
- */
-
-#ifndef AD9389B_H
-#define AD9389B_H
-
-enum ad9389b_tmds_pll_gear {
-	AD9389B_TMDS_PLL_GEAR_AUTOMATIC,
-	AD9389B_TMDS_PLL_GEAR_SEMI_AUTOMATIC,
-};
-
-/* Platform dependent definitions */
-struct ad9389b_platform_data {
-	enum ad9389b_tmds_pll_gear tmds_pll_gear ;
-	/* Differential Data/Clock Output Drive Strength (reg. 0xa2/0xa3) */
-	u8 diff_data_drive_strength;
-	u8 diff_clk_drive_strength;
-};
-
-/* notify events */
-#define AD9389B_MONITOR_DETECT 0
-#define AD9389B_EDID_DETECT 1
-
-struct ad9389b_monitor_detect {
-	int present;
-};
-
-struct ad9389b_edid_detect {
-	int present;
-	int segment;
-};
-
-#endif
-- 
cgit v1.2.3


From c77d3da12e812c695cd9a3a0e428518e1442f73b Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused m5mols camera sensor driver

The m5mols camera sensor driver doesn't support DT and relies on
platform data. The last board files supplying platform data for that
device have been removed from the kernel in v3.11. The driver hasn't
been used since them. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |    1 -
 MAINTAINERS                                      |    8 -
 drivers/media/i2c/Kconfig                        |    1 -
 drivers/media/i2c/Makefile                       |    1 -
 drivers/media/i2c/m5mols/Kconfig                 |    8 -
 drivers/media/i2c/m5mols/Makefile                |    4 -
 drivers/media/i2c/m5mols/m5mols.h                |  349 -------
 drivers/media/i2c/m5mols/m5mols_capture.c        |  158 ----
 drivers/media/i2c/m5mols/m5mols_controls.c       |  625 -------------
 drivers/media/i2c/m5mols/m5mols_core.c           | 1051 ----------------------
 drivers/media/i2c/m5mols/m5mols_reg.h            |  359 --------
 include/media/i2c/m5mols.h                       |   25 -
 12 files changed, 2590 deletions(-)
 delete mode 100644 drivers/media/i2c/m5mols/Kconfig
 delete mode 100644 drivers/media/i2c/m5mols/Makefile
 delete mode 100644 drivers/media/i2c/m5mols/m5mols.h
 delete mode 100644 drivers/media/i2c/m5mols/m5mols_capture.c
 delete mode 100644 drivers/media/i2c/m5mols/m5mols_controls.c
 delete mode 100644 drivers/media/i2c/m5mols/m5mols_core.c
 delete mode 100644 drivers/media/i2c/m5mols/m5mols_reg.h
 delete mode 100644 include/media/i2c/m5mols.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index 4819d9aa55f1..26827f4b4a3d 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -72,7 +72,6 @@ imx319        Sony IMX319 sensor
 imx334        Sony IMX334 sensor
 imx355        Sony IMX355 sensor
 imx412        Sony IMX412 sensor
-m5mols        Fujitsu M-5MOLS 8MP sensor
 mt9m001       mt9m001
 mt9m032       MT9M032 camera sensor
 mt9m111       mt9m111, mt9m112 and mt9m131
diff --git a/MAINTAINERS b/MAINTAINERS
index 8b475bf063f6..0dfc8f3e4e2a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8411,14 +8411,6 @@ L:	platform-driver-x86@vger.kernel.org
 S:	Maintained
 F:	drivers/platform/x86/fujitsu-laptop.c
 
-FUJITSU M-5MO LS CAMERA ISP DRIVER
-M:	Kyungmin Park <kyungmin.park@samsung.com>
-M:	Heungjun Kim <riverful.kim@samsung.com>
-L:	linux-media@vger.kernel.org
-S:	Maintained
-F:	drivers/media/i2c/m5mols/
-F:	include/media/i2c/m5mols.h
-
 FUJITSU TABLET EXTRAS
 M:	Robert Gerlach <khnz@gmx.de>
 L:	platform-driver-x86@vger.kernel.org
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index c369db886b40..5be28ca5b5b2 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -848,7 +848,6 @@ config VIDEO_VS6624
 
 source "drivers/media/i2c/ccs/Kconfig"
 source "drivers/media/i2c/et8ek8/Kconfig"
-source "drivers/media/i2c/m5mols/Kconfig"
 
 endmenu
 
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 77052f941fec..f739b639b429 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -55,7 +55,6 @@ obj-$(CONFIG_VIDEO_KS0127) += ks0127.o
 obj-$(CONFIG_VIDEO_LM3560) += lm3560.o
 obj-$(CONFIG_VIDEO_LM3646) += lm3646.o
 obj-$(CONFIG_VIDEO_M52790) += m52790.o
-obj-$(CONFIG_VIDEO_M5MOLS) += m5mols/
 obj-$(CONFIG_VIDEO_MAX9271_LIB) += max9271.o
 obj-$(CONFIG_VIDEO_MAX9286) += max9286.o
 obj-$(CONFIG_VIDEO_ML86V7667) += ml86v7667.o
diff --git a/drivers/media/i2c/m5mols/Kconfig b/drivers/media/i2c/m5mols/Kconfig
deleted file mode 100644
index 7f0af32f4376..000000000000
--- a/drivers/media/i2c/m5mols/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config VIDEO_M5MOLS
-	tristate "Fujitsu M-5MOLS 8MP sensor support"
-	depends on I2C && VIDEO_DEV
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	help
-	  This driver supports Fujitsu M-5MOLS camera sensor with ISP
diff --git a/drivers/media/i2c/m5mols/Makefile b/drivers/media/i2c/m5mols/Makefile
deleted file mode 100644
index 13fa8ec29ac0..000000000000
--- a/drivers/media/i2c/m5mols/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-m5mols-objs	:= m5mols_core.o m5mols_controls.o m5mols_capture.o
-
-obj-$(CONFIG_VIDEO_M5MOLS)		+= m5mols.o
diff --git a/drivers/media/i2c/m5mols/m5mols.h b/drivers/media/i2c/m5mols/m5mols.h
deleted file mode 100644
index d8545d2280af..000000000000
--- a/drivers/media/i2c/m5mols/m5mols.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Header for M-5MOLS 8M Pixel camera sensor with ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#ifndef M5MOLS_H
-#define M5MOLS_H
-
-#include <linux/sizes.h>
-#include <linux/gpio/consumer.h>
-#include <media/v4l2-subdev.h>
-#include "m5mols_reg.h"
-
-
-/* An amount of data transmitted in addition to the value
- * determined by CAPP_JPEG_SIZE_MAX register.
- */
-#define M5MOLS_JPEG_TAGS_SIZE		0x20000
-#define M5MOLS_MAIN_JPEG_SIZE_MAX	(5 * SZ_1M)
-
-extern int m5mols_debug;
-
-enum m5mols_restype {
-	M5MOLS_RESTYPE_MONITOR,
-	M5MOLS_RESTYPE_CAPTURE,
-	M5MOLS_RESTYPE_MAX,
-};
-
-/**
- * struct m5mols_resolution - structure for the resolution
- * @type: resolution type according to the pixel code
- * @width: width of the resolution
- * @height: height of the resolution
- * @reg: resolution preset register value
- */
-struct m5mols_resolution {
-	u8 reg;
-	enum m5mols_restype type;
-	u16 width;
-	u16 height;
-};
-
-/**
- * struct m5mols_exif - structure for the EXIF information of M-5MOLS
- * @exposure_time: exposure time register value
- * @shutter_speed: speed of the shutter register value
- * @aperture: aperture register value
- * @brightness: brightness register value
- * @exposure_bias: it calls also EV bias
- * @iso_speed: ISO register value
- * @flash: status register value of the flash
- * @sdr: status register value of the Subject Distance Range
- * @qval: not written exact meaning in document
- */
-struct m5mols_exif {
-	u32 exposure_time;
-	u32 shutter_speed;
-	u32 aperture;
-	u32 brightness;
-	u32 exposure_bias;
-	u16 iso_speed;
-	u16 flash;
-	u16 sdr;
-	u16 qval;
-};
-
-/**
- * struct m5mols_capture - Structure for the capture capability
- * @exif: EXIF information
- * @buf_size: internal JPEG frame buffer size, in bytes
- * @main: size in bytes of the main image
- * @thumb: size in bytes of the thumb image, if it was accompanied
- * @total: total size in bytes of the produced image
- */
-struct m5mols_capture {
-	struct m5mols_exif exif;
-	unsigned int buf_size;
-	u32 main;
-	u32 thumb;
-	u32 total;
-};
-
-/**
- * struct m5mols_scenemode - structure for the scenemode capability
- * @metering: metering light register value
- * @ev_bias: EV bias register value
- * @wb_mode: mode which means the WhiteBalance is Auto or Manual
- * @wb_preset: whitebalance preset register value in the Manual mode
- * @chroma_en: register value whether the Chroma capability is enabled or not
- * @chroma_lvl: chroma's level register value
- * @edge_en: register value Whether the Edge capability is enabled or not
- * @edge_lvl: edge's level register value
- * @af_range: Auto Focus's range
- * @fd_mode: Face Detection mode
- * @mcc: Multi-axis Color Conversion which means emotion color
- * @light: status of the Light
- * @flash: status of the Flash
- * @tone: Tone color which means Contrast
- * @iso: ISO register value
- * @capt_mode: Mode of the Image Stabilization while the camera capturing
- * @wdr: Wide Dynamic Range register value
- *
- * The each value according to each scenemode is recommended in the documents.
- */
-struct m5mols_scenemode {
-	u8 metering;
-	u8 ev_bias;
-	u8 wb_mode;
-	u8 wb_preset;
-	u8 chroma_en;
-	u8 chroma_lvl;
-	u8 edge_en;
-	u8 edge_lvl;
-	u8 af_range;
-	u8 fd_mode;
-	u8 mcc;
-	u8 light;
-	u8 flash;
-	u8 tone;
-	u8 iso;
-	u8 capt_mode;
-	u8 wdr;
-};
-
-#define VERSION_STRING_SIZE	22
-
-/**
- * struct m5mols_version - firmware version information
- * @customer:	customer information
- * @project:	version of project information according to customer
- * @fw:		firmware revision
- * @hw:		hardware revision
- * @param:	version of the parameter
- * @awb:	Auto WhiteBalance algorithm version
- * @str:	information about manufacturer and packaging vendor
- * @af:		Auto Focus version
- *
- * The register offset starts the customer version at 0x0, and it ends
- * the awb version at 0x09. The customer, project information occupies 1 bytes
- * each. And also the fw, hw, param, awb each requires 2 bytes. The str is
- * unique string associated with firmware's version. It includes information
- * about manufacturer and the vendor of the sensor's packaging. The least
- * significant 2 bytes of the string indicate packaging manufacturer.
- */
-struct m5mols_version {
-	u8	customer;
-	u8	project;
-	u16	fw;
-	u16	hw;
-	u16	param;
-	u16	awb;
-	u8	str[VERSION_STRING_SIZE];
-	u8	af;
-};
-
-/**
- * struct m5mols_info - M-5MOLS driver data structure
- * @pdata: platform data
- * @sd: v4l-subdev instance
- * @pad: media pad
- * @irq_waitq: waitqueue for the capture
- * @irq_done: set to 1 in the interrupt handler
- * @handle: control handler
- * @auto_exposure: auto/manual exposure control
- * @exposure_bias: exposure compensation control
- * @exposure: manual exposure control
- * @metering: exposure metering control
- * @auto_iso: auto/manual ISO sensitivity control
- * @iso: manual ISO sensitivity control
- * @auto_wb: auto white balance control
- * @lock_3a: 3A lock control
- * @colorfx: color effect control
- * @saturation: saturation control
- * @zoom: zoom control
- * @wdr: wide dynamic range control
- * @stabilization: image stabilization control
- * @jpeg_quality: JPEG compression quality control
- * @set_power: optional power callback to the board code
- * @reset: GPIO driving the reset pin of M-5MOLS
- * @lock: mutex protecting the structure fields below
- * @ffmt: current fmt according to resolution type
- * @res_type: current resolution type
- * @ver: information of the version
- * @cap: the capture mode attributes
- * @isp_ready: 1 when the ISP controller has completed booting
- * @power: current sensor's power status
- * @ctrl_sync: 1 when the control handler state is restored in H/W
- * @resolution:	register value for current resolution
- * @mode: register value for current operation mode
- */
-struct m5mols_info {
-	const struct m5mols_platform_data *pdata;
-	struct v4l2_subdev sd;
-	struct media_pad pad;
-
-	wait_queue_head_t irq_waitq;
-	atomic_t irq_done;
-
-	struct v4l2_ctrl_handler handle;
-	struct {
-		/* exposure/exposure bias/auto exposure cluster */
-		struct v4l2_ctrl *auto_exposure;
-		struct v4l2_ctrl *exposure_bias;
-		struct v4l2_ctrl *exposure;
-		struct v4l2_ctrl *metering;
-	};
-	struct {
-		/* iso/auto iso cluster */
-		struct v4l2_ctrl *auto_iso;
-		struct v4l2_ctrl *iso;
-	};
-	struct v4l2_ctrl *auto_wb;
-
-	struct v4l2_ctrl *lock_3a;
-	struct v4l2_ctrl *colorfx;
-	struct v4l2_ctrl *saturation;
-	struct v4l2_ctrl *zoom;
-	struct v4l2_ctrl *wdr;
-	struct v4l2_ctrl *stabilization;
-	struct v4l2_ctrl *jpeg_quality;
-
-	int (*set_power)(struct device *dev, int on);
-	struct gpio_desc *reset;
-
-	struct mutex lock;
-
-	struct v4l2_mbus_framefmt ffmt[M5MOLS_RESTYPE_MAX];
-	int res_type;
-
-	struct m5mols_version ver;
-	struct m5mols_capture cap;
-
-	unsigned int isp_ready:1;
-	unsigned int power:1;
-	unsigned int ctrl_sync:1;
-
-	u8 resolution;
-	u8 mode;
-};
-
-#define is_available_af(__info)	(__info->ver.af)
-#define is_code(__code, __type) (__code == m5mols_default_ffmt[__type].code)
-#define is_manufacturer(__info, __manufacturer)	\
-				(__info->ver.str[0] == __manufacturer[0] && \
-				 __info->ver.str[1] == __manufacturer[1])
-/*
- * I2C operation of the M-5MOLS
- *
- * The I2C read operation of the M-5MOLS requires 2 messages. The first
- * message sends the information about the command, command category, and total
- * message size. The second message is used to retrieve the data specified in
- * the first message
- *
- *   1st message                                2nd message
- *   +-------+---+----------+-----+-------+     +------+------+------+------+
- *   | size1 | R | category | cmd | size2 |     | d[0] | d[1] | d[2] | d[3] |
- *   +-------+---+----------+-----+-------+     +------+------+------+------+
- *   - size1: message data size(5 in this case)
- *   - size2: desired buffer size of the 2nd message
- *   - d[0..3]: according to size2
- *
- * The I2C write operation needs just one message. The message includes
- * category, command, total size, and desired data.
- *
- *   1st message
- *   +-------+---+----------+-----+------+------+------+------+
- *   | size1 | W | category | cmd | d[0] | d[1] | d[2] | d[3] |
- *   +-------+---+----------+-----+------+------+------+------+
- *   - d[0..3]: according to size1
- */
-int m5mols_read_u8(struct v4l2_subdev *sd, u32 reg_comb, u8 *val);
-int m5mols_read_u16(struct v4l2_subdev *sd, u32 reg_comb, u16 *val);
-int m5mols_read_u32(struct v4l2_subdev *sd, u32 reg_comb, u32 *val);
-int m5mols_write(struct v4l2_subdev *sd, u32 reg_comb, u32 val);
-
-int m5mols_busy_wait(struct v4l2_subdev *sd, u32 reg, u32 value, u32 mask,
-		     int timeout);
-
-/* Mask value for busy waiting until M-5MOLS I2C interface is initialized */
-#define M5MOLS_I2C_RDY_WAIT_FL		(1 << 16)
-/* ISP state transition timeout, in ms */
-#define M5MOLS_MODE_CHANGE_TIMEOUT	200
-#define M5MOLS_BUSY_WAIT_DEF_TIMEOUT	250
-
-/*
- * Mode operation of the M-5MOLS
- *
- * Changing the mode of the M-5MOLS is needed right executing order.
- * There are three modes(PARAMETER, MONITOR, CAPTURE) which can be changed
- * by user. There are various categories associated with each mode.
- *
- * +============================================================+
- * | mode	| category					|
- * +============================================================+
- * | FLASH	| FLASH(only after Stand-by or Power-on)	|
- * | SYSTEM	| SYSTEM(only after sensor arm-booting)		|
- * | PARAMETER	| PARAMETER					|
- * | MONITOR	| MONITOR(preview), Auto Focus, Face Detection	|
- * | CAPTURE	| Single CAPTURE, Preview(recording)		|
- * +============================================================+
- *
- * The available executing order between each modes are as follows:
- *   PARAMETER <---> MONITOR <---> CAPTURE
- */
-int m5mols_set_mode(struct m5mols_info *info, u8 mode);
-
-int m5mols_enable_interrupt(struct v4l2_subdev *sd, u8 reg);
-int m5mols_wait_interrupt(struct v4l2_subdev *sd, u8 condition, u32 timeout);
-int m5mols_restore_controls(struct m5mols_info *info);
-int m5mols_start_capture(struct m5mols_info *info);
-int m5mols_do_scenemode(struct m5mols_info *info, u8 mode);
-int m5mols_lock_3a(struct m5mols_info *info, bool lock);
-int m5mols_set_ctrl(struct v4l2_ctrl *ctrl);
-int m5mols_init_controls(struct v4l2_subdev *sd);
-
-/* The firmware function */
-int m5mols_update_fw(struct v4l2_subdev *sd,
-		     int (*set_power)(struct m5mols_info *, bool));
-
-static inline struct m5mols_info *to_m5mols(struct v4l2_subdev *subdev)
-{
-	return container_of(subdev, struct m5mols_info, sd);
-}
-
-static inline struct v4l2_subdev *to_sd(struct v4l2_ctrl *ctrl)
-{
-	struct m5mols_info *info = container_of(ctrl->handler,
-						struct m5mols_info, handle);
-	return &info->sd;
-}
-
-static inline void m5mols_set_ctrl_mode(struct v4l2_ctrl *ctrl,
-					unsigned int mode)
-{
-	ctrl->priv = (void *)(uintptr_t)mode;
-}
-
-static inline unsigned int m5mols_get_ctrl_mode(struct v4l2_ctrl *ctrl)
-{
-	return (unsigned int)(uintptr_t)ctrl->priv;
-}
-
-#endif	/* M5MOLS_H */
diff --git a/drivers/media/i2c/m5mols/m5mols_capture.c b/drivers/media/i2c/m5mols/m5mols_capture.c
deleted file mode 100644
index 275c5b2539fd..000000000000
--- a/drivers/media/i2c/m5mols/m5mols_capture.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-/*
- * The Capture code for Fujitsu M-5MOLS ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/regulator/consumer.h>
-#include <linux/videodev2.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-#include <media/i2c/m5mols.h>
-#include <media/drv-intf/exynos-fimc.h>
-
-#include "m5mols.h"
-#include "m5mols_reg.h"
-
-/**
- * m5mols_read_rational - I2C read of a rational number
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @addr_num: numerator register
- * @addr_den: denominator register
- * @val: place to store the division result
- *
- * Read numerator and denominator from registers @addr_num and @addr_den
- * respectively and return the division result in @val.
- */
-static int m5mols_read_rational(struct v4l2_subdev *sd, u32 addr_num,
-				u32 addr_den, u32 *val)
-{
-	u32 num, den;
-
-	int ret = m5mols_read_u32(sd, addr_num, &num);
-	if (!ret)
-		ret = m5mols_read_u32(sd, addr_den, &den);
-	if (ret)
-		return ret;
-	*val = den == 0 ? 0 : num / den;
-	return ret;
-}
-
-/**
- * m5mols_capture_info - Gather captured image information
- * @info: M-5MOLS driver data structure
- *
- * For now it gathers only EXIF information and file size.
- */
-static int m5mols_capture_info(struct m5mols_info *info)
-{
-	struct m5mols_exif *exif = &info->cap.exif;
-	struct v4l2_subdev *sd = &info->sd;
-	int ret;
-
-	ret = m5mols_read_rational(sd, EXIF_INFO_EXPTIME_NU,
-				   EXIF_INFO_EXPTIME_DE, &exif->exposure_time);
-	if (ret)
-		return ret;
-	ret = m5mols_read_rational(sd, EXIF_INFO_TV_NU, EXIF_INFO_TV_DE,
-				   &exif->shutter_speed);
-	if (ret)
-		return ret;
-	ret = m5mols_read_rational(sd, EXIF_INFO_AV_NU, EXIF_INFO_AV_DE,
-				   &exif->aperture);
-	if (ret)
-		return ret;
-	ret = m5mols_read_rational(sd, EXIF_INFO_BV_NU, EXIF_INFO_BV_DE,
-				   &exif->brightness);
-	if (ret)
-		return ret;
-	ret = m5mols_read_rational(sd, EXIF_INFO_EBV_NU, EXIF_INFO_EBV_DE,
-				   &exif->exposure_bias);
-	if (ret)
-		return ret;
-
-	ret = m5mols_read_u16(sd, EXIF_INFO_ISO, &exif->iso_speed);
-	if (!ret)
-		ret = m5mols_read_u16(sd, EXIF_INFO_FLASH, &exif->flash);
-	if (!ret)
-		ret = m5mols_read_u16(sd, EXIF_INFO_SDR, &exif->sdr);
-	if (!ret)
-		ret = m5mols_read_u16(sd, EXIF_INFO_QVAL, &exif->qval);
-	if (ret)
-		return ret;
-
-	if (!ret)
-		ret = m5mols_read_u32(sd, CAPC_IMAGE_SIZE, &info->cap.main);
-	if (!ret)
-		ret = m5mols_read_u32(sd, CAPC_THUMB_SIZE, &info->cap.thumb);
-	if (!ret)
-		info->cap.total = info->cap.main + info->cap.thumb;
-
-	return ret;
-}
-
-int m5mols_start_capture(struct m5mols_info *info)
-{
-	unsigned int framesize = info->cap.buf_size - M5MOLS_JPEG_TAGS_SIZE;
-	struct v4l2_subdev *sd = &info->sd;
-	int ret;
-
-	/*
-	 * Synchronize the controls, set the capture frame resolution and color
-	 * format. The frame capture is initiated during switching from Monitor
-	 * to Capture mode.
-	 */
-	ret = m5mols_set_mode(info, REG_MONITOR);
-	if (!ret)
-		ret = m5mols_restore_controls(info);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_YUVOUT_MAIN, REG_JPEG);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_MAIN_IMAGE_SIZE, info->resolution);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_JPEG_SIZE_MAX, framesize);
-	if (!ret)
-		ret = m5mols_set_mode(info, REG_CAPTURE);
-	if (!ret)
-		/* Wait until a frame is captured to ISP internal memory */
-		ret = m5mols_wait_interrupt(sd, REG_INT_CAPTURE, 2000);
-	if (ret)
-		return ret;
-
-	/*
-	 * Initiate the captured data transfer to a MIPI-CSI receiver.
-	 */
-	ret = m5mols_write(sd, CAPC_SEL_FRAME, 1);
-	if (!ret)
-		ret = m5mols_write(sd, CAPC_START, REG_CAP_START_MAIN);
-	if (!ret) {
-		bool captured = false;
-		unsigned int size;
-
-		/* Wait for the capture completion interrupt */
-		ret = m5mols_wait_interrupt(sd, REG_INT_CAPTURE, 2000);
-		if (!ret) {
-			captured = true;
-			ret = m5mols_capture_info(info);
-		}
-		size = captured ? info->cap.main : 0;
-		v4l2_dbg(1, m5mols_debug, sd, "%s: size: %d, thumb.: %d B\n",
-			 __func__, size, info->cap.thumb);
-
-		v4l2_subdev_notify(sd, S5P_FIMC_TX_END_NOTIFY, &size);
-	}
-
-	return ret;
-}
diff --git a/drivers/media/i2c/m5mols/m5mols_controls.c b/drivers/media/i2c/m5mols/m5mols_controls.c
deleted file mode 100644
index b45e0e08b6c8..000000000000
--- a/drivers/media/i2c/m5mols/m5mols_controls.c
+++ /dev/null
@@ -1,625 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Controls for M-5MOLS 8M Pixel camera sensor with ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/delay.h>
-#include <linux/videodev2.h>
-#include <media/v4l2-ctrls.h>
-
-#include "m5mols.h"
-#include "m5mols_reg.h"
-
-static struct m5mols_scenemode m5mols_default_scenemode[] = {
-	[REG_SCENE_NORMAL] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_NORMAL, REG_LIGHT_OFF, REG_FLASH_OFF,
-		5, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_PORTRAIT] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 4,
-		REG_AF_NORMAL, BIT_FD_EN | BIT_FD_DRAW_FACE_FRAME,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_LANDSCAPE] = {
-		REG_AE_ALL, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 4, REG_EDGE_ON, 6,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_SPORTS] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_PARTY_INDOOR] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 4, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_200, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_BEACH_SNOW] = {
-		REG_AE_CENTER, REG_AE_INDEX_10_POS, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 4, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_50, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_SUNSET] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_PRESET,
-		REG_AWB_DAYLIGHT,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_DAWN_DUSK] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_PRESET,
-		REG_AWB_FLUORESCENT_1,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_FALL] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 5, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_NIGHT] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_AGAINST_LIGHT] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_FIRE] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_50, REG_CAP_NONE, REG_WDR_OFF,
-	},
-	[REG_SCENE_TEXT] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 7,
-		REG_AF_MACRO, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_ANTI_SHAKE, REG_WDR_ON,
-	},
-	[REG_SCENE_CANDLE] = {
-		REG_AE_CENTER, REG_AE_INDEX_00, REG_AWB_AUTO, 0,
-		REG_CHROMA_ON, 3, REG_EDGE_ON, 5,
-		REG_AF_NORMAL, REG_FD_OFF,
-		REG_MCC_OFF, REG_LIGHT_OFF, REG_FLASH_OFF,
-		6, REG_ISO_AUTO, REG_CAP_NONE, REG_WDR_OFF,
-	},
-};
-
-/**
- * m5mols_do_scenemode() - Change current scenemode
- * @info: M-5MOLS driver data structure
- * @mode:	Desired mode of the scenemode
- *
- * WARNING: The execution order is important. Do not change the order.
- */
-int m5mols_do_scenemode(struct m5mols_info *info, u8 mode)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	struct m5mols_scenemode scenemode = m5mols_default_scenemode[mode];
-	int ret;
-
-	if (mode > REG_SCENE_CANDLE)
-		return -EINVAL;
-
-	ret = v4l2_ctrl_s_ctrl(info->lock_3a, 0);
-	if (!ret)
-		ret = m5mols_write(sd, AE_EV_PRESET_MONITOR, mode);
-	if (!ret)
-		ret = m5mols_write(sd, AE_EV_PRESET_CAPTURE, mode);
-	if (!ret)
-		ret = m5mols_write(sd, AE_MODE, scenemode.metering);
-	if (!ret)
-		ret = m5mols_write(sd, AE_INDEX, scenemode.ev_bias);
-	if (!ret)
-		ret = m5mols_write(sd, AWB_MODE, scenemode.wb_mode);
-	if (!ret)
-		ret = m5mols_write(sd, AWB_MANUAL, scenemode.wb_preset);
-	if (!ret)
-		ret = m5mols_write(sd, MON_CHROMA_EN, scenemode.chroma_en);
-	if (!ret)
-		ret = m5mols_write(sd, MON_CHROMA_LVL, scenemode.chroma_lvl);
-	if (!ret)
-		ret = m5mols_write(sd, MON_EDGE_EN, scenemode.edge_en);
-	if (!ret)
-		ret = m5mols_write(sd, MON_EDGE_LVL, scenemode.edge_lvl);
-	if (!ret && is_available_af(info))
-		ret = m5mols_write(sd, AF_MODE, scenemode.af_range);
-	if (!ret && is_available_af(info))
-		ret = m5mols_write(sd, FD_CTL, scenemode.fd_mode);
-	if (!ret)
-		ret = m5mols_write(sd, MON_TONE_CTL, scenemode.tone);
-	if (!ret)
-		ret = m5mols_write(sd, AE_ISO, scenemode.iso);
-	if (!ret)
-		ret = m5mols_set_mode(info, REG_CAPTURE);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_WDR_EN, scenemode.wdr);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_MCC_MODE, scenemode.mcc);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_LIGHT_CTRL, scenemode.light);
-	if (!ret)
-		ret = m5mols_write(sd, CAPP_FLASH_CTRL, scenemode.flash);
-	if (!ret)
-		ret = m5mols_write(sd, CAPC_MODE, scenemode.capt_mode);
-	if (!ret)
-		ret = m5mols_set_mode(info, REG_MONITOR);
-
-	return ret;
-}
-
-static int m5mols_3a_lock(struct m5mols_info *info, struct v4l2_ctrl *ctrl)
-{
-	bool af_lock = ctrl->val & V4L2_LOCK_FOCUS;
-	int ret = 0;
-
-	if ((ctrl->val ^ ctrl->cur.val) & V4L2_LOCK_EXPOSURE) {
-		bool ae_lock = ctrl->val & V4L2_LOCK_EXPOSURE;
-
-		ret = m5mols_write(&info->sd, AE_LOCK, ae_lock ?
-				   REG_AE_LOCK : REG_AE_UNLOCK);
-		if (ret)
-			return ret;
-	}
-
-	if (((ctrl->val ^ ctrl->cur.val) & V4L2_LOCK_WHITE_BALANCE)
-	    && info->auto_wb->val) {
-		bool awb_lock = ctrl->val & V4L2_LOCK_WHITE_BALANCE;
-
-		ret = m5mols_write(&info->sd, AWB_LOCK, awb_lock ?
-				   REG_AWB_LOCK : REG_AWB_UNLOCK);
-		if (ret)
-			return ret;
-	}
-
-	if (!info->ver.af || !af_lock)
-		return ret;
-
-	if ((ctrl->val ^ ctrl->cur.val) & V4L2_LOCK_FOCUS)
-		ret = m5mols_write(&info->sd, AF_EXECUTE, REG_AF_STOP);
-
-	return ret;
-}
-
-static int m5mols_set_metering_mode(struct m5mols_info *info, int mode)
-{
-	unsigned int metering;
-
-	switch (mode) {
-	case V4L2_EXPOSURE_METERING_CENTER_WEIGHTED:
-		metering = REG_AE_CENTER;
-		break;
-	case V4L2_EXPOSURE_METERING_SPOT:
-		metering = REG_AE_SPOT;
-		break;
-	default:
-		metering = REG_AE_ALL;
-		break;
-	}
-
-	return m5mols_write(&info->sd, AE_MODE, metering);
-}
-
-static int m5mols_set_exposure(struct m5mols_info *info, int exposure)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	int ret = 0;
-
-	if (exposure == V4L2_EXPOSURE_AUTO) {
-		/* Unlock auto exposure */
-		info->lock_3a->val &= ~V4L2_LOCK_EXPOSURE;
-		m5mols_3a_lock(info, info->lock_3a);
-
-		ret = m5mols_set_metering_mode(info, info->metering->val);
-		if (ret < 0)
-			return ret;
-
-		v4l2_dbg(1, m5mols_debug, sd,
-			 "%s: exposure bias: %#x, metering: %#x\n",
-			 __func__, info->exposure_bias->val,
-			 info->metering->val);
-
-		return m5mols_write(sd, AE_INDEX, info->exposure_bias->val);
-	}
-
-	if (exposure == V4L2_EXPOSURE_MANUAL) {
-		ret = m5mols_write(sd, AE_MODE, REG_AE_OFF);
-		if (ret == 0)
-			ret = m5mols_write(sd, AE_MAN_GAIN_MON,
-					   info->exposure->val);
-		if (ret == 0)
-			ret = m5mols_write(sd, AE_MAN_GAIN_CAP,
-					   info->exposure->val);
-
-		v4l2_dbg(1, m5mols_debug, sd, "%s: exposure: %#x\n",
-			 __func__, info->exposure->val);
-	}
-
-	return ret;
-}
-
-static int m5mols_set_white_balance(struct m5mols_info *info, int val)
-{
-	static const unsigned short wb[][2] = {
-		{ V4L2_WHITE_BALANCE_INCANDESCENT,  REG_AWB_INCANDESCENT },
-		{ V4L2_WHITE_BALANCE_FLUORESCENT,   REG_AWB_FLUORESCENT_1 },
-		{ V4L2_WHITE_BALANCE_FLUORESCENT_H, REG_AWB_FLUORESCENT_2 },
-		{ V4L2_WHITE_BALANCE_HORIZON,       REG_AWB_HORIZON },
-		{ V4L2_WHITE_BALANCE_DAYLIGHT,      REG_AWB_DAYLIGHT },
-		{ V4L2_WHITE_BALANCE_FLASH,         REG_AWB_LEDLIGHT },
-		{ V4L2_WHITE_BALANCE_CLOUDY,        REG_AWB_CLOUDY },
-		{ V4L2_WHITE_BALANCE_SHADE,         REG_AWB_SHADE },
-		{ V4L2_WHITE_BALANCE_AUTO,          REG_AWB_AUTO },
-	};
-	int i;
-	struct v4l2_subdev *sd = &info->sd;
-	int ret = -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(wb); i++) {
-		int awb;
-		if (wb[i][0] != val)
-			continue;
-
-		v4l2_dbg(1, m5mols_debug, sd,
-			 "Setting white balance to: %#x\n", wb[i][0]);
-
-		awb = wb[i][0] == V4L2_WHITE_BALANCE_AUTO;
-		ret = m5mols_write(sd, AWB_MODE, awb ? REG_AWB_AUTO :
-						 REG_AWB_PRESET);
-		if (ret < 0)
-			return ret;
-
-		if (!awb)
-			ret = m5mols_write(sd, AWB_MANUAL, wb[i][1]);
-	}
-
-	return ret;
-}
-
-static int m5mols_set_saturation(struct m5mols_info *info, int val)
-{
-	int ret = m5mols_write(&info->sd, MON_CHROMA_LVL, val);
-	if (ret < 0)
-		return ret;
-
-	return m5mols_write(&info->sd, MON_CHROMA_EN, REG_CHROMA_ON);
-}
-
-static int m5mols_set_color_effect(struct m5mols_info *info, int val)
-{
-	unsigned int m_effect = REG_COLOR_EFFECT_OFF;
-	unsigned int p_effect = REG_EFFECT_OFF;
-	unsigned int cfix_r = 0, cfix_b = 0;
-	struct v4l2_subdev *sd = &info->sd;
-	int ret = 0;
-
-	switch (val) {
-	case V4L2_COLORFX_BW:
-		m_effect = REG_COLOR_EFFECT_ON;
-		break;
-	case V4L2_COLORFX_NEGATIVE:
-		p_effect = REG_EFFECT_NEGA;
-		break;
-	case V4L2_COLORFX_EMBOSS:
-		p_effect = REG_EFFECT_EMBOSS;
-		break;
-	case V4L2_COLORFX_SEPIA:
-		m_effect = REG_COLOR_EFFECT_ON;
-		cfix_r = REG_CFIXR_SEPIA;
-		cfix_b = REG_CFIXB_SEPIA;
-		break;
-	}
-
-	ret = m5mols_write(sd, PARM_EFFECT, p_effect);
-	if (!ret)
-		ret = m5mols_write(sd, MON_EFFECT, m_effect);
-
-	if (ret == 0 && m_effect == REG_COLOR_EFFECT_ON) {
-		ret = m5mols_write(sd, MON_CFIXR, cfix_r);
-		if (!ret)
-			ret = m5mols_write(sd, MON_CFIXB, cfix_b);
-	}
-
-	v4l2_dbg(1, m5mols_debug, sd,
-		 "p_effect: %#x, m_effect: %#x, r: %#x, b: %#x (%d)\n",
-		 p_effect, m_effect, cfix_r, cfix_b, ret);
-
-	return ret;
-}
-
-static int m5mols_set_iso(struct m5mols_info *info, int auto_iso)
-{
-	u32 iso = auto_iso ? 0 : info->iso->val + 1;
-
-	return m5mols_write(&info->sd, AE_ISO, iso);
-}
-
-static int m5mols_set_wdr(struct m5mols_info *info, int wdr)
-{
-	int ret;
-
-	ret = m5mols_write(&info->sd, MON_TONE_CTL, wdr ? 9 : 5);
-	if (ret < 0)
-		return ret;
-
-	ret = m5mols_set_mode(info, REG_CAPTURE);
-	if (ret < 0)
-		return ret;
-
-	return m5mols_write(&info->sd, CAPP_WDR_EN, wdr);
-}
-
-static int m5mols_set_stabilization(struct m5mols_info *info, int val)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	unsigned int evp = val ? 0xe : 0x0;
-	int ret;
-
-	ret = m5mols_write(sd, AE_EV_PRESET_MONITOR, evp);
-	if (ret < 0)
-		return ret;
-
-	return m5mols_write(sd, AE_EV_PRESET_CAPTURE, evp);
-}
-
-static int m5mols_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct v4l2_subdev *sd = to_sd(ctrl);
-	struct m5mols_info *info = to_m5mols(sd);
-	int ret = 0;
-	u8 status = REG_ISO_AUTO;
-
-	v4l2_dbg(1, m5mols_debug, sd, "%s: ctrl: %s (%d)\n",
-		 __func__, ctrl->name, info->isp_ready);
-
-	if (!info->isp_ready)
-		return -EBUSY;
-
-	switch (ctrl->id) {
-	case V4L2_CID_ISO_SENSITIVITY_AUTO:
-		ret = m5mols_read_u8(sd, AE_ISO, &status);
-		if (ret == 0)
-			ctrl->val = !status;
-		if (status != REG_ISO_AUTO)
-			info->iso->val = status - 1;
-		break;
-
-	case V4L2_CID_3A_LOCK:
-		ctrl->val &= ~0x7;
-
-		ret = m5mols_read_u8(sd, AE_LOCK, &status);
-		if (ret)
-			return ret;
-		if (status)
-			info->lock_3a->val |= V4L2_LOCK_EXPOSURE;
-
-		ret = m5mols_read_u8(sd, AWB_LOCK, &status);
-		if (ret)
-			return ret;
-		if (status)
-			info->lock_3a->val |= V4L2_LOCK_EXPOSURE;
-
-		ret = m5mols_read_u8(sd, AF_EXECUTE, &status);
-		if (!status)
-			info->lock_3a->val |= V4L2_LOCK_EXPOSURE;
-		break;
-	}
-
-	return ret;
-}
-
-static int m5mols_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	unsigned int ctrl_mode = m5mols_get_ctrl_mode(ctrl);
-	struct v4l2_subdev *sd = to_sd(ctrl);
-	struct m5mols_info *info = to_m5mols(sd);
-	int last_mode = info->mode;
-	int ret = 0;
-
-	/*
-	 * If needed, defer restoring the controls until
-	 * the device is fully initialized.
-	 */
-	if (!info->isp_ready) {
-		info->ctrl_sync = 0;
-		return 0;
-	}
-
-	v4l2_dbg(1, m5mols_debug, sd, "%s: %s, val: %d, priv: %p\n",
-		 __func__, ctrl->name, ctrl->val, ctrl->priv);
-
-	if (ctrl_mode && ctrl_mode != info->mode) {
-		ret = m5mols_set_mode(info, ctrl_mode);
-		if (ret < 0)
-			return ret;
-	}
-
-	switch (ctrl->id) {
-	case V4L2_CID_3A_LOCK:
-		ret = m5mols_3a_lock(info, ctrl);
-		break;
-
-	case V4L2_CID_ZOOM_ABSOLUTE:
-		ret = m5mols_write(sd, MON_ZOOM, ctrl->val);
-		break;
-
-	case V4L2_CID_EXPOSURE_AUTO:
-		ret = m5mols_set_exposure(info, ctrl->val);
-		break;
-
-	case V4L2_CID_ISO_SENSITIVITY:
-		ret = m5mols_set_iso(info, ctrl->val);
-		break;
-
-	case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE:
-		ret = m5mols_set_white_balance(info, ctrl->val);
-		break;
-
-	case V4L2_CID_SATURATION:
-		ret = m5mols_set_saturation(info, ctrl->val);
-		break;
-
-	case V4L2_CID_COLORFX:
-		ret = m5mols_set_color_effect(info, ctrl->val);
-		break;
-
-	case V4L2_CID_WIDE_DYNAMIC_RANGE:
-		ret = m5mols_set_wdr(info, ctrl->val);
-		break;
-
-	case V4L2_CID_IMAGE_STABILIZATION:
-		ret = m5mols_set_stabilization(info, ctrl->val);
-		break;
-
-	case V4L2_CID_JPEG_COMPRESSION_QUALITY:
-		ret = m5mols_write(sd, CAPP_JPEG_RATIO, ctrl->val);
-		break;
-	}
-
-	if (ret == 0 && info->mode != last_mode)
-		ret = m5mols_set_mode(info, last_mode);
-
-	return ret;
-}
-
-static const struct v4l2_ctrl_ops m5mols_ctrl_ops = {
-	.g_volatile_ctrl	= m5mols_g_volatile_ctrl,
-	.s_ctrl			= m5mols_s_ctrl,
-};
-
-/* Supported manual ISO values */
-static const s64 iso_qmenu[] = {
-	/* AE_ISO: 0x01...0x07 (ISO: 50...3200) */
-	50000, 100000, 200000, 400000, 800000, 1600000, 3200000
-};
-
-/* Supported Exposure Bias values, -2.0EV...+2.0EV */
-static const s64 ev_bias_qmenu[] = {
-	/* AE_INDEX: 0x00...0x08 */
-	-2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000
-};
-
-int m5mols_init_controls(struct v4l2_subdev *sd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	u16 exposure_max;
-	u16 zoom_step;
-	int ret;
-
-	/* Determine the firmware dependent control range and step values */
-	ret = m5mols_read_u16(sd, AE_MAX_GAIN_MON, &exposure_max);
-	if (ret < 0)
-		return ret;
-
-	zoom_step = is_manufacturer(info, REG_SAMSUNG_OPTICS) ? 31 : 1;
-	v4l2_ctrl_handler_init(&info->handle, 20);
-
-	info->auto_wb = v4l2_ctrl_new_std_menu(&info->handle,
-			&m5mols_ctrl_ops, V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE,
-			9, ~0x3fe, V4L2_WHITE_BALANCE_AUTO);
-
-	/* Exposure control cluster */
-	info->auto_exposure = v4l2_ctrl_new_std_menu(&info->handle,
-			&m5mols_ctrl_ops, V4L2_CID_EXPOSURE_AUTO,
-			1, ~0x03, V4L2_EXPOSURE_AUTO);
-
-	info->exposure = v4l2_ctrl_new_std(&info->handle,
-			&m5mols_ctrl_ops, V4L2_CID_EXPOSURE,
-			0, exposure_max, 1, exposure_max / 2);
-
-	info->exposure_bias = v4l2_ctrl_new_int_menu(&info->handle,
-			&m5mols_ctrl_ops, V4L2_CID_AUTO_EXPOSURE_BIAS,
-			ARRAY_SIZE(ev_bias_qmenu) - 1,
-			ARRAY_SIZE(ev_bias_qmenu)/2 - 1,
-			ev_bias_qmenu);
-
-	info->metering = v4l2_ctrl_new_std_menu(&info->handle,
-			&m5mols_ctrl_ops, V4L2_CID_EXPOSURE_METERING,
-			2, ~0x7, V4L2_EXPOSURE_METERING_AVERAGE);
-
-	/* ISO control cluster */
-	info->auto_iso = v4l2_ctrl_new_std_menu(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_ISO_SENSITIVITY_AUTO, 1, ~0x03, 1);
-
-	info->iso = v4l2_ctrl_new_int_menu(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_ISO_SENSITIVITY, ARRAY_SIZE(iso_qmenu) - 1,
-			ARRAY_SIZE(iso_qmenu)/2 - 1, iso_qmenu);
-
-	info->saturation = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_SATURATION, 1, 5, 1, 3);
-
-	info->zoom = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_ZOOM_ABSOLUTE, 1, 70, zoom_step, 1);
-
-	info->colorfx = v4l2_ctrl_new_std_menu(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_COLORFX, 4, 0, V4L2_COLORFX_NONE);
-
-	info->wdr = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_WIDE_DYNAMIC_RANGE, 0, 1, 1, 0);
-
-	info->stabilization = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_IMAGE_STABILIZATION, 0, 1, 1, 0);
-
-	info->jpeg_quality = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_JPEG_COMPRESSION_QUALITY, 1, 100, 1, 80);
-
-	info->lock_3a = v4l2_ctrl_new_std(&info->handle, &m5mols_ctrl_ops,
-			V4L2_CID_3A_LOCK, 0, 0x7, 0, 0);
-
-	if (info->handle.error) {
-		int ret = info->handle.error;
-		v4l2_err(sd, "Failed to initialize controls: %d\n", ret);
-		v4l2_ctrl_handler_free(&info->handle);
-		return ret;
-	}
-
-	v4l2_ctrl_auto_cluster(4, &info->auto_exposure, 1, false);
-	info->auto_iso->flags |= V4L2_CTRL_FLAG_VOLATILE |
-				V4L2_CTRL_FLAG_UPDATE;
-	v4l2_ctrl_auto_cluster(2, &info->auto_iso, 0, false);
-
-	info->lock_3a->flags |= V4L2_CTRL_FLAG_VOLATILE;
-
-	m5mols_set_ctrl_mode(info->auto_exposure, REG_PARAMETER);
-	m5mols_set_ctrl_mode(info->auto_wb, REG_PARAMETER);
-	m5mols_set_ctrl_mode(info->colorfx, REG_MONITOR);
-
-	sd->ctrl_handler = &info->handle;
-
-	return 0;
-}
diff --git a/drivers/media/i2c/m5mols/m5mols_core.c b/drivers/media/i2c/m5mols/m5mols_core.c
deleted file mode 100644
index 2b01873ba0db..000000000000
--- a/drivers/media/i2c/m5mols/m5mols_core.c
+++ /dev/null
@@ -1,1051 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for M-5MOLS 8M Pixel camera sensor with ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/gpio/consumer.h>
-#include <linux/regulator/consumer.h>
-#include <linux/videodev2.h>
-#include <linux/module.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-#include <media/i2c/m5mols.h>
-
-#include "m5mols.h"
-#include "m5mols_reg.h"
-
-int m5mols_debug;
-module_param(m5mols_debug, int, 0644);
-
-#define MODULE_NAME		"M5MOLS"
-#define M5MOLS_I2C_CHECK_RETRY	500
-
-/* The regulator consumer names for external voltage regulators */
-static struct regulator_bulk_data supplies[] = {
-	{
-		.supply = "core",	/* ARM core power, 1.2V */
-	}, {
-		.supply	= "dig_18",	/* digital power 1, 1.8V */
-	}, {
-		.supply	= "d_sensor",	/* sensor power 1, 1.8V */
-	}, {
-		.supply	= "dig_28",	/* digital power 2, 2.8V */
-	}, {
-		.supply	= "a_sensor",	/* analog power */
-	}, {
-		.supply	= "dig_12",	/* digital power 3, 1.2V */
-	},
-};
-
-static struct v4l2_mbus_framefmt m5mols_default_ffmt[M5MOLS_RESTYPE_MAX] = {
-	[M5MOLS_RESTYPE_MONITOR] = {
-		.width		= 1920,
-		.height		= 1080,
-		.code		= MEDIA_BUS_FMT_VYUY8_2X8,
-		.field		= V4L2_FIELD_NONE,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-	},
-	[M5MOLS_RESTYPE_CAPTURE] = {
-		.width		= 1920,
-		.height		= 1080,
-		.code		= MEDIA_BUS_FMT_JPEG_1X8,
-		.field		= V4L2_FIELD_NONE,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-	},
-};
-#define SIZE_DEFAULT_FFMT	ARRAY_SIZE(m5mols_default_ffmt)
-
-static const struct m5mols_resolution m5mols_reg_res[] = {
-	{ 0x01, M5MOLS_RESTYPE_MONITOR, 128, 96 },	/* SUB-QCIF */
-	{ 0x03, M5MOLS_RESTYPE_MONITOR, 160, 120 },	/* QQVGA */
-	{ 0x05, M5MOLS_RESTYPE_MONITOR, 176, 144 },	/* QCIF */
-	{ 0x06, M5MOLS_RESTYPE_MONITOR, 176, 176 },
-	{ 0x08, M5MOLS_RESTYPE_MONITOR, 240, 320 },	/* QVGA */
-	{ 0x09, M5MOLS_RESTYPE_MONITOR, 320, 240 },	/* QVGA */
-	{ 0x0c, M5MOLS_RESTYPE_MONITOR, 240, 400 },	/* WQVGA */
-	{ 0x0d, M5MOLS_RESTYPE_MONITOR, 400, 240 },	/* WQVGA */
-	{ 0x0e, M5MOLS_RESTYPE_MONITOR, 352, 288 },	/* CIF */
-	{ 0x13, M5MOLS_RESTYPE_MONITOR, 480, 360 },
-	{ 0x15, M5MOLS_RESTYPE_MONITOR, 640, 360 },	/* qHD */
-	{ 0x17, M5MOLS_RESTYPE_MONITOR, 640, 480 },	/* VGA */
-	{ 0x18, M5MOLS_RESTYPE_MONITOR, 720, 480 },
-	{ 0x1a, M5MOLS_RESTYPE_MONITOR, 800, 480 },	/* WVGA */
-	{ 0x1f, M5MOLS_RESTYPE_MONITOR, 800, 600 },	/* SVGA */
-	{ 0x21, M5MOLS_RESTYPE_MONITOR, 1280, 720 },	/* HD */
-	{ 0x25, M5MOLS_RESTYPE_MONITOR, 1920, 1080 },	/* 1080p */
-	{ 0x29, M5MOLS_RESTYPE_MONITOR, 3264, 2448 },	/* 2.63fps 8M */
-	{ 0x39, M5MOLS_RESTYPE_MONITOR, 800, 602 },	/* AHS_MON debug */
-
-	{ 0x02, M5MOLS_RESTYPE_CAPTURE, 320, 240 },	/* QVGA */
-	{ 0x04, M5MOLS_RESTYPE_CAPTURE, 400, 240 },	/* WQVGA */
-	{ 0x07, M5MOLS_RESTYPE_CAPTURE, 480, 360 },
-	{ 0x08, M5MOLS_RESTYPE_CAPTURE, 640, 360 },	/* qHD */
-	{ 0x09, M5MOLS_RESTYPE_CAPTURE, 640, 480 },	/* VGA */
-	{ 0x0a, M5MOLS_RESTYPE_CAPTURE, 800, 480 },	/* WVGA */
-	{ 0x10, M5MOLS_RESTYPE_CAPTURE, 1280, 720 },	/* HD */
-	{ 0x14, M5MOLS_RESTYPE_CAPTURE, 1280, 960 },	/* 1M */
-	{ 0x17, M5MOLS_RESTYPE_CAPTURE, 1600, 1200 },	/* 2M */
-	{ 0x19, M5MOLS_RESTYPE_CAPTURE, 1920, 1080 },	/* Full-HD */
-	{ 0x1a, M5MOLS_RESTYPE_CAPTURE, 2048, 1152 },	/* 3Mega */
-	{ 0x1b, M5MOLS_RESTYPE_CAPTURE, 2048, 1536 },
-	{ 0x1c, M5MOLS_RESTYPE_CAPTURE, 2560, 1440 },	/* 4Mega */
-	{ 0x1d, M5MOLS_RESTYPE_CAPTURE, 2560, 1536 },
-	{ 0x1f, M5MOLS_RESTYPE_CAPTURE, 2560, 1920 },	/* 5Mega */
-	{ 0x21, M5MOLS_RESTYPE_CAPTURE, 3264, 1836 },	/* 6Mega */
-	{ 0x22, M5MOLS_RESTYPE_CAPTURE, 3264, 1960 },
-	{ 0x25, M5MOLS_RESTYPE_CAPTURE, 3264, 2448 },	/* 8Mega */
-};
-
-/**
- * m5mols_swap_byte - an byte array to integer conversion function
- * @data: byte array
- * @length: size in bytes of I2C packet defined in the M-5MOLS datasheet
- *
- * Convert I2C data byte array with performing any required byte
- * reordering to assure proper values for each data type, regardless
- * of the architecture endianness.
- */
-static u32 m5mols_swap_byte(u8 *data, u8 length)
-{
-	if (length == 1)
-		return *data;
-	else if (length == 2)
-		return be16_to_cpu(*((__be16 *)data));
-	else
-		return be32_to_cpu(*((__be32 *)data));
-}
-
-/**
- * m5mols_read -  I2C read function
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @size: desired size of I2C packet
- * @reg: combination of size, category and command for the I2C packet
- * @val: read value
- *
- * Returns 0 on success, or else negative errno.
- */
-static int m5mols_read(struct v4l2_subdev *sd, u32 size, u32 reg, u32 *val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct m5mols_info *info = to_m5mols(sd);
-	u8 rbuf[M5MOLS_I2C_MAX_SIZE + 1];
-	u8 category = I2C_CATEGORY(reg);
-	u8 cmd = I2C_COMMAND(reg);
-	struct i2c_msg msg[2];
-	u8 wbuf[5];
-	int ret;
-
-	if (!client->adapter)
-		return -ENODEV;
-
-	msg[0].addr = client->addr;
-	msg[0].flags = 0;
-	msg[0].len = 5;
-	msg[0].buf = wbuf;
-	wbuf[0] = 5;
-	wbuf[1] = M5MOLS_BYTE_READ;
-	wbuf[2] = category;
-	wbuf[3] = cmd;
-	wbuf[4] = size;
-
-	msg[1].addr = client->addr;
-	msg[1].flags = I2C_M_RD;
-	msg[1].len = size + 1;
-	msg[1].buf = rbuf;
-
-	/* minimum stabilization time */
-	usleep_range(200, 300);
-
-	ret = i2c_transfer(client->adapter, msg, 2);
-
-	if (ret == 2) {
-		*val = m5mols_swap_byte(&rbuf[1], size);
-		return 0;
-	}
-
-	if (info->isp_ready)
-		v4l2_err(sd, "read failed: size:%d cat:%02x cmd:%02x. %d\n",
-			 size, category, cmd, ret);
-
-	return ret < 0 ? ret : -EIO;
-}
-
-int m5mols_read_u8(struct v4l2_subdev *sd, u32 reg, u8 *val)
-{
-	u32 val_32;
-	int ret;
-
-	if (I2C_SIZE(reg) != 1) {
-		v4l2_err(sd, "Wrong data size\n");
-		return -EINVAL;
-	}
-
-	ret = m5mols_read(sd, I2C_SIZE(reg), reg, &val_32);
-	if (ret)
-		return ret;
-
-	*val = (u8)val_32;
-	return ret;
-}
-
-int m5mols_read_u16(struct v4l2_subdev *sd, u32 reg, u16 *val)
-{
-	u32 val_32;
-	int ret;
-
-	if (I2C_SIZE(reg) != 2) {
-		v4l2_err(sd, "Wrong data size\n");
-		return -EINVAL;
-	}
-
-	ret = m5mols_read(sd, I2C_SIZE(reg), reg, &val_32);
-	if (ret)
-		return ret;
-
-	*val = (u16)val_32;
-	return ret;
-}
-
-int m5mols_read_u32(struct v4l2_subdev *sd, u32 reg, u32 *val)
-{
-	if (I2C_SIZE(reg) != 4) {
-		v4l2_err(sd, "Wrong data size\n");
-		return -EINVAL;
-	}
-
-	return m5mols_read(sd, I2C_SIZE(reg), reg, val);
-}
-
-/**
- * m5mols_write - I2C command write function
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @reg: combination of size, category and command for the I2C packet
- * @val: value to write
- *
- * Returns 0 on success, or else negative errno.
- */
-int m5mols_write(struct v4l2_subdev *sd, u32 reg, u32 val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct m5mols_info *info = to_m5mols(sd);
-	u8 wbuf[M5MOLS_I2C_MAX_SIZE + 4];
-	u8 category = I2C_CATEGORY(reg);
-	u8 cmd = I2C_COMMAND(reg);
-	u8 size	= I2C_SIZE(reg);
-	u32 *buf = (u32 *)&wbuf[4];
-	struct i2c_msg msg[1];
-	int ret;
-
-	if (!client->adapter)
-		return -ENODEV;
-
-	if (size != 1 && size != 2 && size != 4) {
-		v4l2_err(sd, "Wrong data size\n");
-		return -EINVAL;
-	}
-
-	msg->addr = client->addr;
-	msg->flags = 0;
-	msg->len = (u16)size + 4;
-	msg->buf = wbuf;
-	wbuf[0] = size + 4;
-	wbuf[1] = M5MOLS_BYTE_WRITE;
-	wbuf[2] = category;
-	wbuf[3] = cmd;
-
-	*buf = m5mols_swap_byte((u8 *)&val, size);
-
-	/* minimum stabilization time */
-	usleep_range(200, 300);
-
-	ret = i2c_transfer(client->adapter, msg, 1);
-	if (ret == 1)
-		return 0;
-
-	if (info->isp_ready)
-		v4l2_err(sd, "write failed: cat:%02x cmd:%02x ret:%d\n",
-			 category, cmd, ret);
-
-	return ret < 0 ? ret : -EIO;
-}
-
-/**
- * m5mols_busy_wait - Busy waiting with I2C register polling
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @reg: the I2C_REG() address of an 8-bit status register to check
- * @value: expected status register value
- * @mask: bit mask for the read status register value
- * @timeout: timeout in milliseconds, or -1 for default timeout
- *
- * The @reg register value is ORed with @mask before comparing with @value.
- *
- * Return: 0 if the requested condition became true within less than
- *         @timeout ms, or else negative errno.
- */
-int m5mols_busy_wait(struct v4l2_subdev *sd, u32 reg, u32 value, u32 mask,
-		     int timeout)
-{
-	int ms = timeout < 0 ? M5MOLS_BUSY_WAIT_DEF_TIMEOUT : timeout;
-	unsigned long end = jiffies + msecs_to_jiffies(ms);
-	u8 status;
-
-	do {
-		int ret = m5mols_read_u8(sd, reg, &status);
-
-		if (ret < 0 && !(mask & M5MOLS_I2C_RDY_WAIT_FL))
-			return ret;
-		if (!ret && (status & mask & 0xff) == (value & 0xff))
-			return 0;
-		usleep_range(100, 250);
-	} while (ms > 0 && time_is_after_jiffies(end));
-
-	return -EBUSY;
-}
-
-/**
- * m5mols_enable_interrupt - Clear interrupt pending bits and unmask interrupts
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @reg: combination of size, category and command for the I2C packet
- *
- * Before writing desired interrupt value the INT_FACTOR register should
- * be read to clear pending interrupts.
- */
-int m5mols_enable_interrupt(struct v4l2_subdev *sd, u8 reg)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	u8 mask = is_available_af(info) ? REG_INT_AF : 0;
-	u8 dummy;
-	int ret;
-
-	ret = m5mols_read_u8(sd, SYSTEM_INT_FACTOR, &dummy);
-	if (!ret)
-		ret = m5mols_write(sd, SYSTEM_INT_ENABLE, reg & ~mask);
-	return ret;
-}
-
-int m5mols_wait_interrupt(struct v4l2_subdev *sd, u8 irq_mask, u32 timeout)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-
-	int ret = wait_event_interruptible_timeout(info->irq_waitq,
-				atomic_add_unless(&info->irq_done, -1, 0),
-				msecs_to_jiffies(timeout));
-	if (ret <= 0)
-		return ret ? ret : -ETIMEDOUT;
-
-	return m5mols_busy_wait(sd, SYSTEM_INT_FACTOR, irq_mask,
-				M5MOLS_I2C_RDY_WAIT_FL | irq_mask, -1);
-}
-
-/**
- * m5mols_reg_mode - Write the mode and check busy status
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @mode: the required operation mode
- *
- * It always accompanies a little delay changing the M-5MOLS mode, so it is
- * needed checking current busy status to guarantee right mode.
- */
-static int m5mols_reg_mode(struct v4l2_subdev *sd, u8 mode)
-{
-	int ret = m5mols_write(sd, SYSTEM_SYSMODE, mode);
-	if (ret < 0)
-		return ret;
-	return m5mols_busy_wait(sd, SYSTEM_SYSMODE, mode, 0xff,
-				M5MOLS_MODE_CHANGE_TIMEOUT);
-}
-
-/**
- * m5mols_set_mode - set the M-5MOLS controller mode
- * @info: M-5MOLS driver data structure
- * @mode: the required operation mode
- *
- * The commands of M-5MOLS are grouped into specific modes. Each functionality
- * can be guaranteed only when the sensor is operating in mode which a command
- * belongs to.
- */
-int m5mols_set_mode(struct m5mols_info *info, u8 mode)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	int ret = -EINVAL;
-	u8 reg;
-
-	if (mode < REG_PARAMETER || mode > REG_CAPTURE)
-		return ret;
-
-	ret = m5mols_read_u8(sd, SYSTEM_SYSMODE, &reg);
-	if (ret || reg == mode)
-		return ret;
-
-	switch (reg) {
-	case REG_PARAMETER:
-		ret = m5mols_reg_mode(sd, REG_MONITOR);
-		if (mode == REG_MONITOR)
-			break;
-		if (!ret)
-			ret = m5mols_reg_mode(sd, REG_CAPTURE);
-		break;
-
-	case REG_MONITOR:
-		if (mode == REG_PARAMETER) {
-			ret = m5mols_reg_mode(sd, REG_PARAMETER);
-			break;
-		}
-
-		ret = m5mols_reg_mode(sd, REG_CAPTURE);
-		break;
-
-	case REG_CAPTURE:
-		ret = m5mols_reg_mode(sd, REG_MONITOR);
-		if (mode == REG_MONITOR)
-			break;
-		if (!ret)
-			ret = m5mols_reg_mode(sd, REG_PARAMETER);
-		break;
-
-	default:
-		v4l2_warn(sd, "Wrong mode: %d\n", mode);
-	}
-
-	if (!ret)
-		info->mode = mode;
-
-	return ret;
-}
-
-/**
- * m5mols_get_version - retrieve full revisions information of M-5MOLS
- * @sd: sub-device, as pointed by struct v4l2_subdev
- *
- * The version information includes revisions of hardware and firmware,
- * AutoFocus alghorithm version and the version string.
- */
-static int m5mols_get_version(struct v4l2_subdev *sd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	struct m5mols_version *ver = &info->ver;
-	u8 *str = ver->str;
-	int i;
-	int ret;
-
-	ret = m5mols_read_u8(sd, SYSTEM_VER_CUSTOMER, &ver->customer);
-	if (!ret)
-		ret = m5mols_read_u8(sd, SYSTEM_VER_PROJECT, &ver->project);
-	if (!ret)
-		ret = m5mols_read_u16(sd, SYSTEM_VER_FIRMWARE, &ver->fw);
-	if (!ret)
-		ret = m5mols_read_u16(sd, SYSTEM_VER_HARDWARE, &ver->hw);
-	if (!ret)
-		ret = m5mols_read_u16(sd, SYSTEM_VER_PARAMETER, &ver->param);
-	if (!ret)
-		ret = m5mols_read_u16(sd, SYSTEM_VER_AWB, &ver->awb);
-	if (!ret)
-		ret = m5mols_read_u8(sd, AF_VERSION, &ver->af);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < VERSION_STRING_SIZE; i++) {
-		ret = m5mols_read_u8(sd, SYSTEM_VER_STRING, &str[i]);
-		if (ret)
-			return ret;
-	}
-
-	v4l2_info(sd, "Manufacturer\t[%s]\n",
-			is_manufacturer(info, REG_SAMSUNG_ELECTRO) ?
-			"Samsung Electro-Mechanics" :
-			is_manufacturer(info, REG_SAMSUNG_OPTICS) ?
-			"Samsung Fiber-Optics" :
-			is_manufacturer(info, REG_SAMSUNG_TECHWIN) ?
-			"Samsung Techwin" : "None");
-	v4l2_info(sd, "Customer/Project\t[0x%02x/0x%02x]\n",
-			info->ver.customer, info->ver.project);
-
-	if (!is_available_af(info))
-		v4l2_info(sd, "No support Auto Focus on this firmware\n");
-
-	return ret;
-}
-
-/**
- * __find_restype - Lookup M-5MOLS resolution type according to pixel code
- * @code: pixel code
- */
-static enum m5mols_restype __find_restype(u32 code)
-{
-	enum m5mols_restype type = M5MOLS_RESTYPE_MONITOR;
-
-	do {
-		if (code == m5mols_default_ffmt[type].code)
-			return type;
-	} while (type++ != SIZE_DEFAULT_FFMT);
-
-	return 0;
-}
-
-/**
- * __find_resolution - Lookup preset and type of M-5MOLS's resolution
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @mf: pixel format to find/negotiate the resolution preset for
- * @type: M-5MOLS resolution type
- * @resolution:	M-5MOLS resolution preset register value
- *
- * Find nearest resolution matching resolution preset and adjust mf
- * to supported values.
- */
-static int __find_resolution(struct v4l2_subdev *sd,
-			     struct v4l2_mbus_framefmt *mf,
-			     enum m5mols_restype *type,
-			     u32 *resolution)
-{
-	const struct m5mols_resolution *fsize = &m5mols_reg_res[0];
-	const struct m5mols_resolution *match = NULL;
-	enum m5mols_restype stype = __find_restype(mf->code);
-	int i = ARRAY_SIZE(m5mols_reg_res);
-	unsigned int min_err = ~0;
-
-	while (i--) {
-		int err;
-		if (stype == fsize->type) {
-			err = abs(fsize->width - mf->width)
-				+ abs(fsize->height - mf->height);
-
-			if (err < min_err) {
-				min_err = err;
-				match = fsize;
-			}
-		}
-		fsize++;
-	}
-	if (match) {
-		mf->width  = match->width;
-		mf->height = match->height;
-		*resolution = match->reg;
-		*type = stype;
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
-static struct v4l2_mbus_framefmt *__find_format(struct m5mols_info *info,
-				struct v4l2_subdev_state *sd_state,
-				enum v4l2_subdev_format_whence which,
-				enum m5mols_restype type)
-{
-	if (which == V4L2_SUBDEV_FORMAT_TRY)
-		return sd_state ? v4l2_subdev_get_try_format(&info->sd,
-							     sd_state, 0) : NULL;
-
-	return &info->ffmt[type];
-}
-
-static int m5mols_get_fmt(struct v4l2_subdev *sd,
-			  struct v4l2_subdev_state *sd_state,
-			  struct v4l2_subdev_format *fmt)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	struct v4l2_mbus_framefmt *format;
-	int ret = 0;
-
-	mutex_lock(&info->lock);
-
-	format = __find_format(info, sd_state, fmt->which, info->res_type);
-	if (format)
-		fmt->format = *format;
-	else
-		ret = -EINVAL;
-
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static int m5mols_set_fmt(struct v4l2_subdev *sd,
-			  struct v4l2_subdev_state *sd_state,
-			  struct v4l2_subdev_format *fmt)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	struct v4l2_mbus_framefmt *format = &fmt->format;
-	struct v4l2_mbus_framefmt *sfmt;
-	enum m5mols_restype type;
-	u32 resolution = 0;
-	int ret;
-
-	ret = __find_resolution(sd, format, &type, &resolution);
-	if (ret < 0)
-		return ret;
-
-	sfmt = __find_format(info, sd_state, fmt->which, type);
-	if (!sfmt)
-		return 0;
-
-	mutex_lock(&info->lock);
-
-	format->code = m5mols_default_ffmt[type].code;
-	format->colorspace = V4L2_COLORSPACE_JPEG;
-	format->field = V4L2_FIELD_NONE;
-
-	if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE) {
-		*sfmt = *format;
-		info->resolution = resolution;
-		info->res_type = type;
-	}
-
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static int m5mols_get_frame_desc(struct v4l2_subdev *sd, unsigned int pad,
-				 struct v4l2_mbus_frame_desc *fd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-
-	if (pad != 0 || fd == NULL)
-		return -EINVAL;
-
-	mutex_lock(&info->lock);
-	/*
-	 * .get_frame_desc is only used for compressed formats,
-	 * thus we always return the capture frame parameters here.
-	 */
-	fd->entry[0].length = info->cap.buf_size;
-	fd->entry[0].pixelcode = info->ffmt[M5MOLS_RESTYPE_CAPTURE].code;
-	mutex_unlock(&info->lock);
-
-	fd->entry[0].flags = V4L2_MBUS_FRAME_DESC_FL_LEN_MAX;
-	fd->num_entries = 1;
-
-	return 0;
-}
-
-static int m5mols_set_frame_desc(struct v4l2_subdev *sd, unsigned int pad,
-				 struct v4l2_mbus_frame_desc *fd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	struct v4l2_mbus_framefmt *mf = &info->ffmt[M5MOLS_RESTYPE_CAPTURE];
-
-	if (pad != 0 || fd == NULL)
-		return -EINVAL;
-
-	fd->entry[0].flags = V4L2_MBUS_FRAME_DESC_FL_LEN_MAX;
-	fd->num_entries = 1;
-	fd->entry[0].length = clamp_t(u32, fd->entry[0].length,
-				      mf->width * mf->height,
-				      M5MOLS_MAIN_JPEG_SIZE_MAX);
-	mutex_lock(&info->lock);
-	info->cap.buf_size = fd->entry[0].length;
-	mutex_unlock(&info->lock);
-
-	return 0;
-}
-
-
-static int m5mols_enum_mbus_code(struct v4l2_subdev *sd,
-				 struct v4l2_subdev_state *sd_state,
-				 struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (!code || code->index >= SIZE_DEFAULT_FFMT)
-		return -EINVAL;
-
-	code->code = m5mols_default_ffmt[code->index].code;
-
-	return 0;
-}
-
-static const struct v4l2_subdev_pad_ops m5mols_pad_ops = {
-	.enum_mbus_code	= m5mols_enum_mbus_code,
-	.get_fmt	= m5mols_get_fmt,
-	.set_fmt	= m5mols_set_fmt,
-	.get_frame_desc	= m5mols_get_frame_desc,
-	.set_frame_desc	= m5mols_set_frame_desc,
-};
-
-/**
- * m5mols_restore_controls - Apply current control values to the registers
- * @info: M-5MOLS driver data structure
- *
- * m5mols_do_scenemode() handles all parameters for which there is yet no
- * individual control. It should be replaced at some point by setting each
- * control individually, in required register set up order.
- */
-int m5mols_restore_controls(struct m5mols_info *info)
-{
-	int ret;
-
-	if (info->ctrl_sync)
-		return 0;
-
-	ret = m5mols_do_scenemode(info, REG_SCENE_NORMAL);
-	if (ret)
-		return ret;
-
-	ret = v4l2_ctrl_handler_setup(&info->handle);
-	info->ctrl_sync = !ret;
-
-	return ret;
-}
-
-/**
- * m5mols_start_monitor - Start the monitor mode
- * @info: M-5MOLS driver data structure
- *
- * Before applying the controls setup the resolution and frame rate
- * in PARAMETER mode, and then switch over to MONITOR mode.
- */
-static int m5mols_start_monitor(struct m5mols_info *info)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	int ret;
-
-	ret = m5mols_set_mode(info, REG_PARAMETER);
-	if (!ret)
-		ret = m5mols_write(sd, PARM_MON_SIZE, info->resolution);
-	if (!ret)
-		ret = m5mols_write(sd, PARM_MON_FPS, REG_FPS_30);
-	if (!ret)
-		ret = m5mols_set_mode(info, REG_MONITOR);
-	if (!ret)
-		ret = m5mols_restore_controls(info);
-
-	return ret;
-}
-
-static int m5mols_s_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	u32 code;
-	int ret;
-
-	mutex_lock(&info->lock);
-	code = info->ffmt[info->res_type].code;
-
-	if (enable) {
-		if (is_code(code, M5MOLS_RESTYPE_MONITOR))
-			ret = m5mols_start_monitor(info);
-		else if (is_code(code, M5MOLS_RESTYPE_CAPTURE))
-			ret = m5mols_start_capture(info);
-		else
-			ret = -EINVAL;
-	} else {
-		ret = m5mols_set_mode(info, REG_PARAMETER);
-	}
-
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static const struct v4l2_subdev_video_ops m5mols_video_ops = {
-	.s_stream	= m5mols_s_stream,
-};
-
-static int m5mols_sensor_power(struct m5mols_info *info, bool enable)
-{
-	struct v4l2_subdev *sd = &info->sd;
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	int ret;
-
-	if (info->power == enable)
-		return 0;
-
-	if (enable) {
-		if (info->set_power) {
-			ret = info->set_power(&client->dev, 1);
-			if (ret)
-				return ret;
-		}
-
-		ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
-		if (ret) {
-			if (info->set_power)
-				info->set_power(&client->dev, 0);
-			return ret;
-		}
-
-		gpiod_set_value(info->reset, 0);
-		info->power = 1;
-
-		return ret;
-	}
-
-	ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
-	if (ret)
-		return ret;
-
-	if (info->set_power)
-		info->set_power(&client->dev, 0);
-
-	gpiod_set_value(info->reset, 1);
-
-	info->isp_ready = 0;
-	info->power = 0;
-
-	return ret;
-}
-
-/* m5mols_update_fw - optional firmware update routine */
-int __attribute__ ((weak)) m5mols_update_fw(struct v4l2_subdev *sd,
-		int (*set_power)(struct m5mols_info *, bool))
-{
-	return 0;
-}
-
-/**
- * m5mols_fw_start - M-5MOLS internal ARM controller initialization
- * @sd: sub-device, as pointed by struct v4l2_subdev
- *
- * Execute the M-5MOLS internal ARM controller initialization sequence.
- * This function should be called after the supply voltage has been
- * applied and before any requests to the device are made.
- */
-static int m5mols_fw_start(struct v4l2_subdev *sd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	int ret;
-
-	atomic_set(&info->irq_done, 0);
-	/* Wait until I2C slave is initialized in Flash Writer mode */
-	ret = m5mols_busy_wait(sd, FLASH_CAM_START, REG_IN_FLASH_MODE,
-			       M5MOLS_I2C_RDY_WAIT_FL | 0xff, -1);
-	if (!ret)
-		ret = m5mols_write(sd, FLASH_CAM_START, REG_START_ARM_BOOT);
-	if (!ret)
-		ret = m5mols_wait_interrupt(sd, REG_INT_MODE, 2000);
-	if (ret < 0)
-		return ret;
-
-	info->isp_ready = 1;
-
-	ret = m5mols_get_version(sd);
-	if (!ret)
-		ret = m5mols_update_fw(sd, m5mols_sensor_power);
-	if (ret)
-		return ret;
-
-	v4l2_dbg(1, m5mols_debug, sd, "Success ARM Booting\n");
-
-	ret = m5mols_write(sd, PARM_INTERFACE, REG_INTERFACE_MIPI);
-	if (!ret)
-		ret = m5mols_enable_interrupt(sd,
-				REG_INT_AF | REG_INT_CAPTURE);
-
-	return ret;
-}
-
-/* Execute the lens soft-landing algorithm */
-static int m5mols_auto_focus_stop(struct m5mols_info *info)
-{
-	int ret;
-
-	ret = m5mols_write(&info->sd, AF_EXECUTE, REG_AF_STOP);
-	if (!ret)
-		ret = m5mols_write(&info->sd, AF_MODE, REG_AF_POWEROFF);
-	if (!ret)
-		ret = m5mols_busy_wait(&info->sd, SYSTEM_STATUS, REG_AF_IDLE,
-				       0xff, -1);
-	return ret;
-}
-
-/**
- * m5mols_s_power - Main sensor power control function
- * @sd: sub-device, as pointed by struct v4l2_subdev
- * @on: if true, powers on the device; powers off otherwise.
- *
- * To prevent breaking the lens when the sensor is powered off the Soft-Landing
- * algorithm is called where available. The Soft-Landing algorithm availability
- * dependends on the firmware provider.
- */
-static int m5mols_s_power(struct v4l2_subdev *sd, int on)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-	int ret;
-
-	mutex_lock(&info->lock);
-
-	if (on) {
-		ret = m5mols_sensor_power(info, true);
-		if (!ret)
-			ret = m5mols_fw_start(sd);
-	} else {
-		if (is_manufacturer(info, REG_SAMSUNG_TECHWIN)) {
-			ret = m5mols_set_mode(info, REG_MONITOR);
-			if (!ret)
-				ret = m5mols_auto_focus_stop(info);
-			if (ret < 0)
-				v4l2_warn(sd, "Soft landing lens failed\n");
-		}
-		ret = m5mols_sensor_power(info, false);
-
-		info->ctrl_sync = 0;
-	}
-
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static int m5mols_log_status(struct v4l2_subdev *sd)
-{
-	struct m5mols_info *info = to_m5mols(sd);
-
-	v4l2_ctrl_handler_log_status(&info->handle, sd->name);
-
-	return 0;
-}
-
-static const struct v4l2_subdev_core_ops m5mols_core_ops = {
-	.s_power	= m5mols_s_power,
-	.log_status	= m5mols_log_status,
-};
-
-/*
- * V4L2 subdev internal operations
- */
-static int m5mols_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_mbus_framefmt *format = v4l2_subdev_get_try_format(sd,
-								       fh->state,
-								       0);
-
-	*format = m5mols_default_ffmt[0];
-	return 0;
-}
-
-static const struct v4l2_subdev_internal_ops m5mols_subdev_internal_ops = {
-	.open		= m5mols_open,
-};
-
-static const struct v4l2_subdev_ops m5mols_ops = {
-	.core		= &m5mols_core_ops,
-	.pad		= &m5mols_pad_ops,
-	.video		= &m5mols_video_ops,
-};
-
-static irqreturn_t m5mols_irq_handler(int irq, void *data)
-{
-	struct m5mols_info *info = to_m5mols(data);
-
-	atomic_set(&info->irq_done, 1);
-	wake_up_interruptible(&info->irq_waitq);
-
-	return IRQ_HANDLED;
-}
-
-static int m5mols_probe(struct i2c_client *client)
-{
-	const struct m5mols_platform_data *pdata = client->dev.platform_data;
-	struct m5mols_info *info;
-	struct v4l2_subdev *sd;
-	int ret;
-
-	if (pdata == NULL) {
-		dev_err(&client->dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	if (!client->irq) {
-		dev_err(&client->dev, "Interrupt not assigned\n");
-		return -EINVAL;
-	}
-
-	info = devm_kzalloc(&client->dev, sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	/* This asserts reset, descriptor shall have polarity specified */
-	info->reset = devm_gpiod_get(&client->dev, "reset", GPIOD_OUT_HIGH);
-	if (IS_ERR(info->reset))
-		return PTR_ERR(info->reset);
-	/* Notice: the "N" in M5MOLS_NRST implies active low */
-	gpiod_set_consumer_name(info->reset, "M5MOLS_NRST");
-
-	info->pdata = pdata;
-	info->set_power	= pdata->set_power;
-
-	ret = devm_regulator_bulk_get(&client->dev, ARRAY_SIZE(supplies),
-				      supplies);
-	if (ret) {
-		dev_err(&client->dev, "Failed to get regulators: %d\n", ret);
-		return ret;
-	}
-
-	sd = &info->sd;
-	v4l2_i2c_subdev_init(sd, client, &m5mols_ops);
-	/* Static name; NEVER use in new drivers! */
-	strscpy(sd->name, MODULE_NAME, sizeof(sd->name));
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	sd->internal_ops = &m5mols_subdev_internal_ops;
-	info->pad.flags = MEDIA_PAD_FL_SOURCE;
-	ret = media_entity_pads_init(&sd->entity, 1, &info->pad);
-	if (ret < 0)
-		return ret;
-	sd->entity.function = MEDIA_ENT_F_CAM_SENSOR;
-
-	init_waitqueue_head(&info->irq_waitq);
-	mutex_init(&info->lock);
-
-	ret = devm_request_irq(&client->dev, client->irq, m5mols_irq_handler,
-			       IRQF_TRIGGER_RISING, MODULE_NAME, sd);
-	if (ret) {
-		dev_err(&client->dev, "Interrupt request failed: %d\n", ret);
-		goto error;
-	}
-	info->res_type = M5MOLS_RESTYPE_MONITOR;
-	info->ffmt[0] = m5mols_default_ffmt[0];
-	info->ffmt[1] =	m5mols_default_ffmt[1];
-
-	ret = m5mols_sensor_power(info, true);
-	if (ret)
-		goto error;
-
-	ret = m5mols_fw_start(sd);
-	if (!ret)
-		ret = m5mols_init_controls(sd);
-
-	ret = m5mols_sensor_power(info, false);
-	if (!ret)
-		return 0;
-error:
-	media_entity_cleanup(&sd->entity);
-	return ret;
-}
-
-static void m5mols_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *sd = i2c_get_clientdata(client);
-
-	v4l2_device_unregister_subdev(sd);
-	v4l2_ctrl_handler_free(sd->ctrl_handler);
-	media_entity_cleanup(&sd->entity);
-}
-
-static const struct i2c_device_id m5mols_id[] = {
-	{ MODULE_NAME, 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, m5mols_id);
-
-static struct i2c_driver m5mols_i2c_driver = {
-	.driver = {
-		.name	= MODULE_NAME,
-	},
-	.probe_new	= m5mols_probe,
-	.remove		= m5mols_remove,
-	.id_table	= m5mols_id,
-};
-
-module_i2c_driver(m5mols_i2c_driver);
-
-MODULE_AUTHOR("HeungJun Kim <riverful.kim@samsung.com>");
-MODULE_AUTHOR("Dongsoo Kim <dongsoo45.kim@samsung.com>");
-MODULE_DESCRIPTION("Fujitsu M-5MOLS 8M Pixel camera driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/media/i2c/m5mols/m5mols_reg.h b/drivers/media/i2c/m5mols/m5mols_reg.h
deleted file mode 100644
index 947ee33812d3..000000000000
--- a/drivers/media/i2c/m5mols/m5mols_reg.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Register map for M-5MOLS 8M Pixel camera sensor with ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#ifndef M5MOLS_REG_H
-#define M5MOLS_REG_H
-
-#define M5MOLS_I2C_MAX_SIZE	4
-#define M5MOLS_BYTE_READ	0x01
-#define M5MOLS_BYTE_WRITE	0x02
-
-#define I2C_CATEGORY(__cat)		((__cat >> 16) & 0xff)
-#define I2C_COMMAND(__comm)		((__comm >> 8) & 0xff)
-#define I2C_SIZE(__reg_s)		((__reg_s) & 0xff)
-#define I2C_REG(__cat, __cmd, __reg_s)	((__cat << 16) | (__cmd << 8) | __reg_s)
-
-/*
- * Category section register
- *
- * The category means set including relevant command of M-5MOLS.
- */
-#define CAT_SYSTEM		0x00
-#define CAT_PARAM		0x01
-#define CAT_MONITOR		0x02
-#define CAT_AE			0x03
-#define CAT_WB			0x06
-#define CAT_EXIF		0x07
-#define CAT_FD			0x09
-#define CAT_LENS		0x0a
-#define CAT_CAPT_PARM		0x0b
-#define CAT_CAPT_CTRL		0x0c
-#define CAT_FLASH		0x0f	/* related to FW, revisions, booting */
-
-/*
- * Category 0 - SYSTEM mode
- *
- * The SYSTEM mode in the M-5MOLS means area available to handle with the whole
- * & all-round system of sensor. It deals with version/interrupt/setting mode &
- * even sensor's status. Especially, the M-5MOLS sensor with ISP varies by
- * packaging & manufacturer, even the customer and project code. And the
- * function details may vary among them. The version information helps to
- * determine what methods shall be used in the driver.
- *
- * There is many registers between customer version address and awb one. For
- * more specific contents, see definition if file m5mols.h.
- */
-#define SYSTEM_VER_CUSTOMER	I2C_REG(CAT_SYSTEM, 0x00, 1)
-#define SYSTEM_VER_PROJECT	I2C_REG(CAT_SYSTEM, 0x01, 1)
-#define SYSTEM_VER_FIRMWARE	I2C_REG(CAT_SYSTEM, 0x02, 2)
-#define SYSTEM_VER_HARDWARE	I2C_REG(CAT_SYSTEM, 0x04, 2)
-#define SYSTEM_VER_PARAMETER	I2C_REG(CAT_SYSTEM, 0x06, 2)
-#define SYSTEM_VER_AWB		I2C_REG(CAT_SYSTEM, 0x08, 2)
-
-#define SYSTEM_SYSMODE		I2C_REG(CAT_SYSTEM, 0x0b, 1)
-#define REG_SYSINIT		0x00	/* SYSTEM mode */
-#define REG_PARAMETER		0x01	/* PARAMETER mode */
-#define REG_MONITOR		0x02	/* MONITOR mode */
-#define REG_CAPTURE		0x03	/* CAPTURE mode */
-
-#define SYSTEM_CMD(__cmd)	I2C_REG(CAT_SYSTEM, cmd, 1)
-#define SYSTEM_VER_STRING	I2C_REG(CAT_SYSTEM, 0x0a, 1)
-#define REG_SAMSUNG_ELECTRO	"SE"	/* Samsung Electro-Mechanics */
-#define REG_SAMSUNG_OPTICS	"OP"	/* Samsung Fiber-Optics */
-#define REG_SAMSUNG_TECHWIN	"TB"	/* Samsung Techwin */
-/* SYSTEM mode status */
-#define SYSTEM_STATUS	I2C_REG(CAT_SYSTEM, 0x0c, 1)
-
-/* Interrupt pending register */
-#define SYSTEM_INT_FACTOR	I2C_REG(CAT_SYSTEM, 0x10, 1)
-/* interrupt enable register */
-#define SYSTEM_INT_ENABLE	I2C_REG(CAT_SYSTEM, 0x11, 1)
-#define REG_INT_MODE		(1 << 0)
-#define REG_INT_AF		(1 << 1)
-#define REG_INT_ZOOM		(1 << 2)
-#define REG_INT_CAPTURE		(1 << 3)
-#define REG_INT_FRAMESYNC	(1 << 4)
-#define REG_INT_FD		(1 << 5)
-#define REG_INT_LENS_INIT	(1 << 6)
-#define REG_INT_SOUND		(1 << 7)
-#define REG_INT_MASK		0x0f
-
-/*
- * category 1 - PARAMETER mode
- *
- * This category supports function of camera features of M-5MOLS. It means we
- * can handle with preview(MONITOR) resolution size/frame per second/interface
- * between the sensor and the Application Processor/even the image effect.
- */
-
-/* Resolution in the MONITOR mode */
-#define PARM_MON_SIZE		I2C_REG(CAT_PARAM, 0x01, 1)
-
-/* Frame rate */
-#define PARM_MON_FPS		I2C_REG(CAT_PARAM, 0x02, 1)
-#define REG_FPS_30		0x02
-
-/* Video bus between the sensor and a host processor */
-#define PARM_INTERFACE		I2C_REG(CAT_PARAM, 0x00, 1)
-#define REG_INTERFACE_MIPI	0x02
-
-/* Image effects */
-#define PARM_EFFECT		I2C_REG(CAT_PARAM, 0x0b, 1)
-#define REG_EFFECT_OFF		0x00
-#define REG_EFFECT_NEGA		0x01
-#define REG_EFFECT_EMBOSS	0x06
-#define REG_EFFECT_OUTLINE	0x07
-#define REG_EFFECT_WATERCOLOR	0x08
-
-/*
- * Category 2 - MONITOR mode
- *
- * The MONITOR mode is same as preview mode as we said. The M-5MOLS has another
- * mode named "Preview", but this preview mode is used at the case specific
- * vider-recording mode. This mmode supports only YUYV format. On the other
- * hand, the JPEG & RAW formats is supports by CAPTURE mode. And, there are
- * another options like zoom/color effect(different with effect in PARAMETER
- * mode)/anti hand shaking algorithm.
- */
-
-/* Target digital zoom position */
-#define MON_ZOOM		I2C_REG(CAT_MONITOR, 0x01, 1)
-
-/* CR value for color effect */
-#define MON_CFIXR		I2C_REG(CAT_MONITOR, 0x0a, 1)
-/* CB value for color effect */
-#define MON_CFIXB		I2C_REG(CAT_MONITOR, 0x09, 1)
-#define REG_CFIXB_SEPIA		0xd8
-#define REG_CFIXR_SEPIA		0x18
-
-#define MON_EFFECT		I2C_REG(CAT_MONITOR, 0x0b, 1)
-#define REG_COLOR_EFFECT_OFF	0x00
-#define REG_COLOR_EFFECT_ON	0x01
-
-/* Chroma enable */
-#define MON_CHROMA_EN		I2C_REG(CAT_MONITOR, 0x10, 1)
-/* Chroma level */
-#define MON_CHROMA_LVL		I2C_REG(CAT_MONITOR, 0x0f, 1)
-#define REG_CHROMA_OFF		0x00
-#define REG_CHROMA_ON		0x01
-
-/* Sharpness on/off */
-#define MON_EDGE_EN		I2C_REG(CAT_MONITOR, 0x12, 1)
-/* Sharpness level */
-#define MON_EDGE_LVL		I2C_REG(CAT_MONITOR, 0x11, 1)
-#define REG_EDGE_OFF		0x00
-#define REG_EDGE_ON		0x01
-
-/* Set color tone (contrast) */
-#define MON_TONE_CTL		I2C_REG(CAT_MONITOR, 0x25, 1)
-
-/*
- * Category 3 - Auto Exposure
- *
- * The M-5MOLS exposure capbility is detailed as which is similar to digital
- * camera. This category supports AE locking/various AE mode(range of exposure)
- * /ISO/flickering/EV bias/shutter/meteoring, and anything else. And the
- * maximum/minimum exposure gain value depending on M-5MOLS firmware, may be
- * different. So, this category also provide getting the max/min values. And,
- * each MONITOR and CAPTURE mode has each gain/shutter/max exposure values.
- */
-
-/* Auto Exposure locking */
-#define AE_LOCK			I2C_REG(CAT_AE, 0x00, 1)
-#define REG_AE_UNLOCK		0x00
-#define REG_AE_LOCK		0x01
-
-/* Auto Exposure algorithm mode */
-#define AE_MODE			I2C_REG(CAT_AE, 0x01, 1)
-#define REG_AE_OFF		0x00	/* AE off */
-#define REG_AE_ALL		0x01	/* calc AE in all block integral */
-#define REG_AE_CENTER		0x03	/* calc AE in center weighted */
-#define REG_AE_SPOT		0x06	/* calc AE in specific spot */
-
-#define AE_ISO			I2C_REG(CAT_AE, 0x05, 1)
-#define REG_ISO_AUTO		0x00
-#define REG_ISO_50		0x01
-#define REG_ISO_100		0x02
-#define REG_ISO_200		0x03
-#define REG_ISO_400		0x04
-#define REG_ISO_800		0x05
-
-/* EV (scenemode) preset for MONITOR */
-#define AE_EV_PRESET_MONITOR	I2C_REG(CAT_AE, 0x0a, 1)
-/* EV (scenemode) preset for CAPTURE */
-#define AE_EV_PRESET_CAPTURE	I2C_REG(CAT_AE, 0x0b, 1)
-#define REG_SCENE_NORMAL	0x00
-#define REG_SCENE_PORTRAIT	0x01
-#define REG_SCENE_LANDSCAPE	0x02
-#define REG_SCENE_SPORTS	0x03
-#define REG_SCENE_PARTY_INDOOR	0x04
-#define REG_SCENE_BEACH_SNOW	0x05
-#define REG_SCENE_SUNSET	0x06
-#define REG_SCENE_DAWN_DUSK	0x07
-#define REG_SCENE_FALL		0x08
-#define REG_SCENE_NIGHT		0x09
-#define REG_SCENE_AGAINST_LIGHT	0x0a
-#define REG_SCENE_FIRE		0x0b
-#define REG_SCENE_TEXT		0x0c
-#define REG_SCENE_CANDLE	0x0d
-
-/* Manual gain in MONITOR mode */
-#define AE_MAN_GAIN_MON		I2C_REG(CAT_AE, 0x12, 2)
-/* Maximum gain in MONITOR mode */
-#define AE_MAX_GAIN_MON		I2C_REG(CAT_AE, 0x1a, 2)
-/* Manual gain in CAPTURE mode */
-#define AE_MAN_GAIN_CAP		I2C_REG(CAT_AE, 0x26, 2)
-
-#define AE_INDEX		I2C_REG(CAT_AE, 0x38, 1)
-#define REG_AE_INDEX_20_NEG	0x00
-#define REG_AE_INDEX_15_NEG	0x01
-#define REG_AE_INDEX_10_NEG	0x02
-#define REG_AE_INDEX_05_NEG	0x03
-#define REG_AE_INDEX_00		0x04
-#define REG_AE_INDEX_05_POS	0x05
-#define REG_AE_INDEX_10_POS	0x06
-#define REG_AE_INDEX_15_POS	0x07
-#define REG_AE_INDEX_20_POS	0x08
-
-/*
- * Category 6 - White Balance
- */
-
-/* Auto Whitebalance locking */
-#define AWB_LOCK		I2C_REG(CAT_WB, 0x00, 1)
-#define REG_AWB_UNLOCK		0x00
-#define REG_AWB_LOCK		0x01
-
-#define AWB_MODE		I2C_REG(CAT_WB, 0x02, 1)
-#define REG_AWB_AUTO		0x01	/* AWB off */
-#define REG_AWB_PRESET		0x02	/* AWB preset */
-
-/* Manual WB (preset) */
-#define AWB_MANUAL		I2C_REG(CAT_WB, 0x03, 1)
-#define REG_AWB_INCANDESCENT	0x01
-#define REG_AWB_FLUORESCENT_1	0x02
-#define REG_AWB_FLUORESCENT_2	0x03
-#define REG_AWB_DAYLIGHT	0x04
-#define REG_AWB_CLOUDY		0x05
-#define REG_AWB_SHADE		0x06
-#define REG_AWB_HORIZON		0x07
-#define REG_AWB_LEDLIGHT	0x09
-
-/*
- * Category 7 - EXIF information
- */
-#define EXIF_INFO_EXPTIME_NU	I2C_REG(CAT_EXIF, 0x00, 4)
-#define EXIF_INFO_EXPTIME_DE	I2C_REG(CAT_EXIF, 0x04, 4)
-#define EXIF_INFO_TV_NU		I2C_REG(CAT_EXIF, 0x08, 4)
-#define EXIF_INFO_TV_DE		I2C_REG(CAT_EXIF, 0x0c, 4)
-#define EXIF_INFO_AV_NU		I2C_REG(CAT_EXIF, 0x10, 4)
-#define EXIF_INFO_AV_DE		I2C_REG(CAT_EXIF, 0x14, 4)
-#define EXIF_INFO_BV_NU		I2C_REG(CAT_EXIF, 0x18, 4)
-#define EXIF_INFO_BV_DE		I2C_REG(CAT_EXIF, 0x1c, 4)
-#define EXIF_INFO_EBV_NU	I2C_REG(CAT_EXIF, 0x20, 4)
-#define EXIF_INFO_EBV_DE	I2C_REG(CAT_EXIF, 0x24, 4)
-#define EXIF_INFO_ISO		I2C_REG(CAT_EXIF, 0x28, 2)
-#define EXIF_INFO_FLASH		I2C_REG(CAT_EXIF, 0x2a, 2)
-#define EXIF_INFO_SDR		I2C_REG(CAT_EXIF, 0x2c, 2)
-#define EXIF_INFO_QVAL		I2C_REG(CAT_EXIF, 0x2e, 2)
-
-/*
- * Category 9 - Face Detection
- */
-#define FD_CTL			I2C_REG(CAT_FD, 0x00, 1)
-#define BIT_FD_EN		0
-#define BIT_FD_DRAW_FACE_FRAME	4
-#define BIT_FD_DRAW_SMILE_LVL	6
-#define REG_FD(shift)		(1 << shift)
-#define REG_FD_OFF		0x0
-
-/*
- * Category A - Lens Parameter
- */
-#define AF_MODE			I2C_REG(CAT_LENS, 0x01, 1)
-#define REG_AF_NORMAL		0x00	/* Normal AF, one time */
-#define REG_AF_MACRO		0x01	/* Macro AF, one time */
-#define REG_AF_POWEROFF		0x07
-
-#define AF_EXECUTE		I2C_REG(CAT_LENS, 0x02, 1)
-#define REG_AF_STOP		0x00
-#define REG_AF_EXE_AUTO		0x01
-#define REG_AF_EXE_CAF		0x02
-
-#define AF_STATUS		I2C_REG(CAT_LENS, 0x03, 1)
-#define REG_AF_FAIL		0x00
-#define REG_AF_SUCCESS		0x02
-#define REG_AF_IDLE		0x04
-#define REG_AF_BUSY		0x05
-
-#define AF_VERSION		I2C_REG(CAT_LENS, 0x0a, 1)
-
-/*
- * Category B - CAPTURE Parameter
- */
-#define CAPP_YUVOUT_MAIN	I2C_REG(CAT_CAPT_PARM, 0x00, 1)
-#define REG_YUV422		0x00
-#define REG_BAYER10		0x05
-#define REG_BAYER8		0x06
-#define REG_JPEG		0x10
-
-#define CAPP_MAIN_IMAGE_SIZE	I2C_REG(CAT_CAPT_PARM, 0x01, 1)
-#define CAPP_JPEG_SIZE_MAX	I2C_REG(CAT_CAPT_PARM, 0x0f, 4)
-#define CAPP_JPEG_RATIO		I2C_REG(CAT_CAPT_PARM, 0x17, 1)
-
-#define CAPP_MCC_MODE		I2C_REG(CAT_CAPT_PARM, 0x1d, 1)
-#define REG_MCC_OFF		0x00
-#define REG_MCC_NORMAL		0x01
-
-#define CAPP_WDR_EN		I2C_REG(CAT_CAPT_PARM, 0x2c, 1)
-#define REG_WDR_OFF		0x00
-#define REG_WDR_ON		0x01
-#define REG_WDR_AUTO		0x02
-
-#define CAPP_LIGHT_CTRL		I2C_REG(CAT_CAPT_PARM, 0x40, 1)
-#define REG_LIGHT_OFF		0x00
-#define REG_LIGHT_ON		0x01
-#define REG_LIGHT_AUTO		0x02
-
-#define CAPP_FLASH_CTRL		I2C_REG(CAT_CAPT_PARM, 0x41, 1)
-#define REG_FLASH_OFF		0x00
-#define REG_FLASH_ON		0x01
-#define REG_FLASH_AUTO		0x02
-
-/*
- * Category C - CAPTURE Control
- */
-#define CAPC_MODE		I2C_REG(CAT_CAPT_CTRL, 0x00, 1)
-#define REG_CAP_NONE		0x00
-#define REG_CAP_ANTI_SHAKE	0x02
-
-/* Select single- or multi-shot capture */
-#define CAPC_SEL_FRAME		I2C_REG(CAT_CAPT_CTRL, 0x06, 1)
-
-#define CAPC_START		I2C_REG(CAT_CAPT_CTRL, 0x09, 1)
-#define REG_CAP_START_MAIN	0x01
-#define REG_CAP_START_THUMB	0x03
-
-#define CAPC_IMAGE_SIZE		I2C_REG(CAT_CAPT_CTRL, 0x0d, 4)
-#define CAPC_THUMB_SIZE		I2C_REG(CAT_CAPT_CTRL, 0x11, 4)
-
-/*
- * Category F - Flash
- *
- * This mode provides functions about internal flash stuff and system startup.
- */
-
-/* Starts internal ARM core booting after power-up */
-#define FLASH_CAM_START		I2C_REG(CAT_FLASH, 0x12, 1)
-#define REG_START_ARM_BOOT	0x01	/* write value */
-#define REG_IN_FLASH_MODE	0x00	/* read value */
-
-#endif	/* M5MOLS_REG_H */
diff --git a/include/media/i2c/m5mols.h b/include/media/i2c/m5mols.h
deleted file mode 100644
index a56ae353c891..000000000000
--- a/include/media/i2c/m5mols.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Driver header for M-5MOLS 8M Pixel camera sensor with ISP
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- *
- * Copyright (C) 2009 Samsung Electronics Co., Ltd.
- * Author: Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#ifndef MEDIA_M5MOLS_H
-#define MEDIA_M5MOLS_H
-
-/**
- * struct m5mols_platform_data - platform data for M-5MOLS driver
- * @set_power:	an additional callback to the board setup code
- *		to be called after enabling and before disabling
- *		the sensor's supply regulators
- */
-struct m5mols_platform_data {
-	int (*set_power)(struct device *dev, int on);
-};
-
-#endif	/* MEDIA_M5MOLS_H */
-- 
cgit v1.2.3


From a22eabc825738dfcf06b7fcfe2cd6d020b832786 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused mt9m032 camera sensor driver

The mt9m032 camera sensor driver doesn't support DT and relies on
platform data. No board file has ever provided platform data for that
device. The driver has thus never been used in the mainline kernel since
its introduction in v3.4. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |   1 -
 MAINTAINERS                                      |   8 -
 drivers/media/i2c/Kconfig                        |  10 -
 drivers/media/i2c/Makefile                       |   1 -
 drivers/media/i2c/mt9m032.c                      | 891 -----------------------
 include/media/i2c/mt9m032.h                      |  22 -
 6 files changed, 933 deletions(-)
 delete mode 100644 drivers/media/i2c/mt9m032.c
 delete mode 100644 include/media/i2c/mt9m032.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index 26827f4b4a3d..6db642460e25 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -73,7 +73,6 @@ imx334        Sony IMX334 sensor
 imx355        Sony IMX355 sensor
 imx412        Sony IMX412 sensor
 mt9m001       mt9m001
-mt9m032       MT9M032 camera sensor
 mt9m111       mt9m111, mt9m112 and mt9m131
 mt9p031       Aptina MT9P031
 mt9t001       Aptina MT9T001
diff --git a/MAINTAINERS b/MAINTAINERS
index 0dfc8f3e4e2a..7f258cb5d54d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14168,14 +14168,6 @@ L:	linux-mtd@lists.infradead.org
 S:	Maintained
 F:	drivers/mtd/devices/docg3*
 
-MT9M032 APTINA SENSOR DRIVER
-M:	Laurent Pinchart <laurent.pinchart@ideasonboard.com>
-L:	linux-media@vger.kernel.org
-S:	Maintained
-T:	git git://linuxtv.org/media_tree.git
-F:	drivers/media/i2c/mt9m032.c
-F:	include/media/i2c/mt9m032.h
-
 MT9P031 APTINA CAMERA SENSOR
 M:	Laurent Pinchart <laurent.pinchart@ideasonboard.com>
 L:	linux-media@vger.kernel.org
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index 5be28ca5b5b2..7e5090a6a6b6 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -267,16 +267,6 @@ config VIDEO_MT9M001
 	  This driver supports MT9M001 cameras from Micron, monochrome
 	  and colour models.
 
-config VIDEO_MT9M032
-	tristate "MT9M032 camera sensor support"
-	depends on I2C && VIDEO_DEV
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	select VIDEO_APTINA_PLL
-	help
-	  This driver supports MT9M032 camera sensors from Aptina, monochrome
-	  models only.
-
 config VIDEO_MT9M111
 	tristate "mt9m111, mt9m112 and mt9m131 support"
 	depends on I2C && VIDEO_DEV
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index f739b639b429..18844982ab3d 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -60,7 +60,6 @@ obj-$(CONFIG_VIDEO_MAX9286) += max9286.o
 obj-$(CONFIG_VIDEO_ML86V7667) += ml86v7667.o
 obj-$(CONFIG_VIDEO_MSP3400) += msp3400.o
 obj-$(CONFIG_VIDEO_MT9M001) += mt9m001.o
-obj-$(CONFIG_VIDEO_MT9M032) += mt9m032.o
 obj-$(CONFIG_VIDEO_MT9M111) += mt9m111.o
 obj-$(CONFIG_VIDEO_MT9P031) += mt9p031.o
 obj-$(CONFIG_VIDEO_MT9T001) += mt9t001.o
diff --git a/drivers/media/i2c/mt9m032.c b/drivers/media/i2c/mt9m032.c
deleted file mode 100644
index 958cfdd73d57..000000000000
--- a/drivers/media/i2c/mt9m032.c
+++ /dev/null
@@ -1,891 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Driver for MT9M032 CMOS Image Sensor from Micron
- *
- * Copyright (C) 2010-2011 Lund Engineering
- * Contact: Gil Lund <gwlund@lundeng.com>
- * Author: Martin Hostettler <martin@neutronstar.dyndns.org>
- */
-
-#include <linux/delay.h>
-#include <linux/i2c.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/math64.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/v4l2-mediabus.h>
-
-#include <media/media-entity.h>
-#include <media/i2c/mt9m032.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-
-#include "aptina-pll.h"
-
-/*
- * width and height include active boundary and black parts
- *
- * column    0-  15 active boundary
- * column   16-1455 image
- * column 1456-1471 active boundary
- * column 1472-1599 black
- *
- * row       0-  51 black
- * row      53-  59 active boundary
- * row      60-1139 image
- * row    1140-1147 active boundary
- * row    1148-1151 black
- */
-
-#define MT9M032_PIXEL_ARRAY_WIDTH			1600
-#define MT9M032_PIXEL_ARRAY_HEIGHT			1152
-
-#define MT9M032_CHIP_VERSION				0x00
-#define		MT9M032_CHIP_VERSION_VALUE		0x1402
-#define MT9M032_ROW_START				0x01
-#define		MT9M032_ROW_START_MIN			0
-#define		MT9M032_ROW_START_MAX			1152
-#define		MT9M032_ROW_START_DEF			60
-#define MT9M032_COLUMN_START				0x02
-#define		MT9M032_COLUMN_START_MIN		0
-#define		MT9M032_COLUMN_START_MAX		1600
-#define		MT9M032_COLUMN_START_DEF		16
-#define MT9M032_ROW_SIZE				0x03
-#define		MT9M032_ROW_SIZE_MIN			32
-#define		MT9M032_ROW_SIZE_MAX			1152
-#define		MT9M032_ROW_SIZE_DEF			1080
-#define MT9M032_COLUMN_SIZE				0x04
-#define		MT9M032_COLUMN_SIZE_MIN			32
-#define		MT9M032_COLUMN_SIZE_MAX			1600
-#define		MT9M032_COLUMN_SIZE_DEF			1440
-#define MT9M032_HBLANK					0x05
-#define MT9M032_VBLANK					0x06
-#define		MT9M032_VBLANK_MAX			0x7ff
-#define MT9M032_SHUTTER_WIDTH_HIGH			0x08
-#define MT9M032_SHUTTER_WIDTH_LOW			0x09
-#define		MT9M032_SHUTTER_WIDTH_MIN		1
-#define		MT9M032_SHUTTER_WIDTH_MAX		1048575
-#define		MT9M032_SHUTTER_WIDTH_DEF		1943
-#define MT9M032_PIX_CLK_CTRL				0x0a
-#define		MT9M032_PIX_CLK_CTRL_INV_PIXCLK		0x8000
-#define MT9M032_RESTART					0x0b
-#define MT9M032_RESET					0x0d
-#define MT9M032_PLL_CONFIG1				0x11
-#define		MT9M032_PLL_CONFIG1_PREDIV_MASK		0x3f
-#define		MT9M032_PLL_CONFIG1_MUL_SHIFT		8
-#define MT9M032_READ_MODE1				0x1e
-#define		MT9M032_READ_MODE1_OUTPUT_BAD_FRAMES	(1 << 13)
-#define		MT9M032_READ_MODE1_MAINTAIN_FRAME_RATE	(1 << 12)
-#define		MT9M032_READ_MODE1_XOR_LINE_VALID	(1 << 11)
-#define		MT9M032_READ_MODE1_CONT_LINE_VALID	(1 << 10)
-#define		MT9M032_READ_MODE1_INVERT_TRIGGER	(1 << 9)
-#define		MT9M032_READ_MODE1_SNAPSHOT		(1 << 8)
-#define		MT9M032_READ_MODE1_GLOBAL_RESET		(1 << 7)
-#define		MT9M032_READ_MODE1_BULB_EXPOSURE	(1 << 6)
-#define		MT9M032_READ_MODE1_INVERT_STROBE	(1 << 5)
-#define		MT9M032_READ_MODE1_STROBE_ENABLE	(1 << 4)
-#define		MT9M032_READ_MODE1_STROBE_START_TRIG1	(0 << 2)
-#define		MT9M032_READ_MODE1_STROBE_START_EXP	(1 << 2)
-#define		MT9M032_READ_MODE1_STROBE_START_SHUTTER	(2 << 2)
-#define		MT9M032_READ_MODE1_STROBE_START_TRIG2	(3 << 2)
-#define		MT9M032_READ_MODE1_STROBE_END_TRIG1	(0 << 0)
-#define		MT9M032_READ_MODE1_STROBE_END_EXP	(1 << 0)
-#define		MT9M032_READ_MODE1_STROBE_END_SHUTTER	(2 << 0)
-#define		MT9M032_READ_MODE1_STROBE_END_TRIG2	(3 << 0)
-#define MT9M032_READ_MODE2				0x20
-#define		MT9M032_READ_MODE2_VFLIP_SHIFT		15
-#define		MT9M032_READ_MODE2_HFLIP_SHIFT		14
-#define		MT9M032_READ_MODE2_ROW_BLC		0x40
-#define MT9M032_GAIN_GREEN1				0x2b
-#define MT9M032_GAIN_BLUE				0x2c
-#define MT9M032_GAIN_RED				0x2d
-#define MT9M032_GAIN_GREEN2				0x2e
-
-/* write only */
-#define MT9M032_GAIN_ALL				0x35
-#define		MT9M032_GAIN_DIGITAL_MASK		0x7f
-#define		MT9M032_GAIN_DIGITAL_SHIFT		8
-#define		MT9M032_GAIN_AMUL_SHIFT			6
-#define		MT9M032_GAIN_ANALOG_MASK		0x3f
-#define MT9M032_FORMATTER1				0x9e
-#define		MT9M032_FORMATTER1_PLL_P1_6		(1 << 8)
-#define		MT9M032_FORMATTER1_PARALLEL		(1 << 12)
-#define MT9M032_FORMATTER2				0x9f
-#define		MT9M032_FORMATTER2_DOUT_EN		0x1000
-#define		MT9M032_FORMATTER2_PIXCLK_EN		0x2000
-
-/*
- * The available MT9M032 datasheet is missing documentation for register 0x10
- * MT9P031 seems to be close enough, so use constants from that datasheet for
- * now.
- * But keep the name MT9P031 to remind us, that this isn't really confirmed
- * for this sensor.
- */
-#define MT9P031_PLL_CONTROL				0x10
-#define		MT9P031_PLL_CONTROL_PWROFF		0x0050
-#define		MT9P031_PLL_CONTROL_PWRON		0x0051
-#define		MT9P031_PLL_CONTROL_USEPLL		0x0052
-
-struct mt9m032 {
-	struct v4l2_subdev subdev;
-	struct media_pad pad;
-	struct mt9m032_platform_data *pdata;
-
-	unsigned int pix_clock;
-
-	struct v4l2_ctrl_handler ctrls;
-	struct {
-		struct v4l2_ctrl *hflip;
-		struct v4l2_ctrl *vflip;
-	};
-
-	struct mutex lock; /* Protects streaming, format, interval and crop */
-
-	bool streaming;
-
-	struct v4l2_mbus_framefmt format;
-	struct v4l2_rect crop;
-	struct v4l2_fract frame_interval;
-};
-
-#define to_mt9m032(sd)	container_of(sd, struct mt9m032, subdev)
-#define to_dev(sensor) \
-	(&((struct i2c_client *)v4l2_get_subdevdata(&(sensor)->subdev))->dev)
-
-static int mt9m032_read(struct i2c_client *client, u8 reg)
-{
-	return i2c_smbus_read_word_swapped(client, reg);
-}
-
-static int mt9m032_write(struct i2c_client *client, u8 reg, const u16 data)
-{
-	return i2c_smbus_write_word_swapped(client, reg, data);
-}
-
-static u32 mt9m032_row_time(struct mt9m032 *sensor, unsigned int width)
-{
-	unsigned int effective_width;
-	u32 ns;
-
-	effective_width = width + 716; /* empirical value */
-	ns = div_u64(1000000000ULL * effective_width, sensor->pix_clock);
-	dev_dbg(to_dev(sensor),	"MT9M032 line time: %u ns\n", ns);
-	return ns;
-}
-
-static int mt9m032_update_timing(struct mt9m032 *sensor,
-				 struct v4l2_fract *interval)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	struct v4l2_rect *crop = &sensor->crop;
-	unsigned int min_vblank;
-	unsigned int vblank;
-	u32 row_time;
-
-	if (!interval)
-		interval = &sensor->frame_interval;
-
-	row_time = mt9m032_row_time(sensor, crop->width);
-
-	vblank = div_u64(1000000000ULL * interval->numerator,
-			 (u64)row_time * interval->denominator)
-	       - crop->height;
-
-	if (vblank > MT9M032_VBLANK_MAX) {
-		/* hardware limits to 11 bit values */
-		interval->denominator = 1000;
-		interval->numerator =
-			div_u64((crop->height + MT9M032_VBLANK_MAX) *
-				(u64)row_time * interval->denominator,
-				1000000000ULL);
-		vblank = div_u64(1000000000ULL * interval->numerator,
-				 (u64)row_time * interval->denominator)
-		       - crop->height;
-	}
-	/* enforce minimal 1.6ms blanking time. */
-	min_vblank = 1600000 / row_time;
-	vblank = clamp_t(unsigned int, vblank, min_vblank, MT9M032_VBLANK_MAX);
-
-	return mt9m032_write(client, MT9M032_VBLANK, vblank);
-}
-
-static int mt9m032_update_geom_timing(struct mt9m032 *sensor)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	int ret;
-
-	ret = mt9m032_write(client, MT9M032_COLUMN_SIZE,
-			    sensor->crop.width - 1);
-	if (!ret)
-		ret = mt9m032_write(client, MT9M032_ROW_SIZE,
-				    sensor->crop.height - 1);
-	if (!ret)
-		ret = mt9m032_write(client, MT9M032_COLUMN_START,
-				    sensor->crop.left);
-	if (!ret)
-		ret = mt9m032_write(client, MT9M032_ROW_START,
-				    sensor->crop.top);
-	if (!ret)
-		ret = mt9m032_update_timing(sensor, NULL);
-	return ret;
-}
-
-static int update_formatter2(struct mt9m032 *sensor, bool streaming)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	u16 reg_val =   MT9M032_FORMATTER2_DOUT_EN
-		      | 0x0070;  /* parts reserved! */
-				 /* possibly for changing to 14-bit mode */
-
-	if (streaming)
-		reg_val |= MT9M032_FORMATTER2_PIXCLK_EN;   /* pixclock enable */
-
-	return mt9m032_write(client, MT9M032_FORMATTER2, reg_val);
-}
-
-static int mt9m032_setup_pll(struct mt9m032 *sensor)
-{
-	static const struct aptina_pll_limits limits = {
-		.ext_clock_min = 8000000,
-		.ext_clock_max = 16500000,
-		.int_clock_min = 2000000,
-		.int_clock_max = 24000000,
-		.out_clock_min = 322000000,
-		.out_clock_max = 693000000,
-		.pix_clock_max = 99000000,
-		.n_min = 1,
-		.n_max = 64,
-		.m_min = 16,
-		.m_max = 255,
-		.p1_min = 6,
-		.p1_max = 7,
-	};
-
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	struct mt9m032_platform_data *pdata = sensor->pdata;
-	struct aptina_pll pll;
-	u16 reg_val;
-	int ret;
-
-	pll.ext_clock = pdata->ext_clock;
-	pll.pix_clock = pdata->pix_clock;
-
-	ret = aptina_pll_calculate(&client->dev, &limits, &pll);
-	if (ret < 0)
-		return ret;
-
-	sensor->pix_clock = pdata->pix_clock;
-
-	ret = mt9m032_write(client, MT9M032_PLL_CONFIG1,
-			    (pll.m << MT9M032_PLL_CONFIG1_MUL_SHIFT) |
-			    ((pll.n - 1) & MT9M032_PLL_CONFIG1_PREDIV_MASK));
-	if (!ret)
-		ret = mt9m032_write(client, MT9P031_PLL_CONTROL,
-				    MT9P031_PLL_CONTROL_PWRON |
-				    MT9P031_PLL_CONTROL_USEPLL);
-	if (!ret)		/* more reserved, Continuous, Master Mode */
-		ret = mt9m032_write(client, MT9M032_READ_MODE1, 0x8000 |
-				    MT9M032_READ_MODE1_STROBE_START_EXP |
-				    MT9M032_READ_MODE1_STROBE_END_SHUTTER);
-	if (!ret) {
-		reg_val = (pll.p1 == 6 ? MT9M032_FORMATTER1_PLL_P1_6 : 0)
-			| MT9M032_FORMATTER1_PARALLEL | 0x001e; /* 14-bit */
-		ret = mt9m032_write(client, MT9M032_FORMATTER1, reg_val);
-	}
-
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * Subdev pad operations
- */
-
-static int mt9m032_enum_mbus_code(struct v4l2_subdev *subdev,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (code->index != 0)
-		return -EINVAL;
-
-	code->code = MEDIA_BUS_FMT_Y8_1X8;
-	return 0;
-}
-
-static int mt9m032_enum_frame_size(struct v4l2_subdev *subdev,
-				   struct v4l2_subdev_state *sd_state,
-				   struct v4l2_subdev_frame_size_enum *fse)
-{
-	if (fse->index != 0 || fse->code != MEDIA_BUS_FMT_Y8_1X8)
-		return -EINVAL;
-
-	fse->min_width = MT9M032_COLUMN_SIZE_DEF;
-	fse->max_width = MT9M032_COLUMN_SIZE_DEF;
-	fse->min_height = MT9M032_ROW_SIZE_DEF;
-	fse->max_height = MT9M032_ROW_SIZE_DEF;
-
-	return 0;
-}
-
-/**
- * __mt9m032_get_pad_crop() - get crop rect
- * @sensor: pointer to the sensor struct
- * @sd_state: v4l2_subdev_state for getting the try crop rect from
- * @which: select try or active crop rect
- *
- * Returns a pointer the current active or fh relative try crop rect
- */
-static struct v4l2_rect *
-__mt9m032_get_pad_crop(struct mt9m032 *sensor,
-		       struct v4l2_subdev_state *sd_state,
-		       enum v4l2_subdev_format_whence which)
-{
-	switch (which) {
-	case V4L2_SUBDEV_FORMAT_TRY:
-		return v4l2_subdev_get_try_crop(&sensor->subdev, sd_state, 0);
-	case V4L2_SUBDEV_FORMAT_ACTIVE:
-		return &sensor->crop;
-	default:
-		return NULL;
-	}
-}
-
-/**
- * __mt9m032_get_pad_format() - get format
- * @sensor: pointer to the sensor struct
- * @sd_state: v4l2_subdev_state for getting the try format from
- * @which: select try or active format
- *
- * Returns a pointer the current active or fh relative try format
- */
-static struct v4l2_mbus_framefmt *
-__mt9m032_get_pad_format(struct mt9m032 *sensor,
-			 struct v4l2_subdev_state *sd_state,
-			 enum v4l2_subdev_format_whence which)
-{
-	switch (which) {
-	case V4L2_SUBDEV_FORMAT_TRY:
-		return v4l2_subdev_get_try_format(&sensor->subdev, sd_state,
-						  0);
-	case V4L2_SUBDEV_FORMAT_ACTIVE:
-		return &sensor->format;
-	default:
-		return NULL;
-	}
-}
-
-static int mt9m032_get_pad_format(struct v4l2_subdev *subdev,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_format *fmt)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-
-	mutex_lock(&sensor->lock);
-	fmt->format = *__mt9m032_get_pad_format(sensor, sd_state, fmt->which);
-	mutex_unlock(&sensor->lock);
-
-	return 0;
-}
-
-static int mt9m032_set_pad_format(struct v4l2_subdev *subdev,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_format *fmt)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-	int ret;
-
-	mutex_lock(&sensor->lock);
-
-	if (sensor->streaming && fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE) {
-		ret = -EBUSY;
-		goto done;
-	}
-
-	/* Scaling is not supported, the format is thus fixed. */
-	fmt->format = *__mt9m032_get_pad_format(sensor, sd_state, fmt->which);
-	ret = 0;
-
-done:
-	mutex_unlock(&sensor->lock);
-	return ret;
-}
-
-static int mt9m032_get_pad_selection(struct v4l2_subdev *subdev,
-				     struct v4l2_subdev_state *sd_state,
-				     struct v4l2_subdev_selection *sel)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	mutex_lock(&sensor->lock);
-	sel->r = *__mt9m032_get_pad_crop(sensor, sd_state, sel->which);
-	mutex_unlock(&sensor->lock);
-
-	return 0;
-}
-
-static int mt9m032_set_pad_selection(struct v4l2_subdev *subdev,
-				     struct v4l2_subdev_state *sd_state,
-				     struct v4l2_subdev_selection *sel)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-	struct v4l2_mbus_framefmt *format;
-	struct v4l2_rect *__crop;
-	struct v4l2_rect rect;
-	int ret = 0;
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	mutex_lock(&sensor->lock);
-
-	if (sensor->streaming && sel->which == V4L2_SUBDEV_FORMAT_ACTIVE) {
-		ret = -EBUSY;
-		goto done;
-	}
-
-	/* Clamp the crop rectangle boundaries and align them to a multiple of 2
-	 * pixels to ensure a GRBG Bayer pattern.
-	 */
-	rect.left = clamp(ALIGN(sel->r.left, 2), MT9M032_COLUMN_START_MIN,
-			  MT9M032_COLUMN_START_MAX);
-	rect.top = clamp(ALIGN(sel->r.top, 2), MT9M032_ROW_START_MIN,
-			 MT9M032_ROW_START_MAX);
-	rect.width = clamp_t(unsigned int, ALIGN(sel->r.width, 2),
-			     MT9M032_COLUMN_SIZE_MIN, MT9M032_COLUMN_SIZE_MAX);
-	rect.height = clamp_t(unsigned int, ALIGN(sel->r.height, 2),
-			      MT9M032_ROW_SIZE_MIN, MT9M032_ROW_SIZE_MAX);
-
-	rect.width = min_t(unsigned int, rect.width,
-			   MT9M032_PIXEL_ARRAY_WIDTH - rect.left);
-	rect.height = min_t(unsigned int, rect.height,
-			    MT9M032_PIXEL_ARRAY_HEIGHT - rect.top);
-
-	__crop = __mt9m032_get_pad_crop(sensor, sd_state, sel->which);
-
-	if (rect.width != __crop->width || rect.height != __crop->height) {
-		/* Reset the output image size if the crop rectangle size has
-		 * been modified.
-		 */
-		format = __mt9m032_get_pad_format(sensor, sd_state,
-						  sel->which);
-		format->width = rect.width;
-		format->height = rect.height;
-	}
-
-	*__crop = rect;
-	sel->r = rect;
-
-	if (sel->which == V4L2_SUBDEV_FORMAT_ACTIVE)
-		ret = mt9m032_update_geom_timing(sensor);
-
-done:
-	mutex_unlock(&sensor->lock);
-	return ret;
-}
-
-static int mt9m032_get_frame_interval(struct v4l2_subdev *subdev,
-				      struct v4l2_subdev_frame_interval *fi)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-
-	mutex_lock(&sensor->lock);
-	memset(fi, 0, sizeof(*fi));
-	fi->interval = sensor->frame_interval;
-	mutex_unlock(&sensor->lock);
-
-	return 0;
-}
-
-static int mt9m032_set_frame_interval(struct v4l2_subdev *subdev,
-				      struct v4l2_subdev_frame_interval *fi)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-	int ret;
-
-	mutex_lock(&sensor->lock);
-
-	if (sensor->streaming) {
-		ret = -EBUSY;
-		goto done;
-	}
-
-	/* Avoid divisions by 0. */
-	if (fi->interval.denominator == 0)
-		fi->interval.denominator = 1;
-
-	ret = mt9m032_update_timing(sensor, &fi->interval);
-	if (!ret)
-		sensor->frame_interval = fi->interval;
-
-done:
-	mutex_unlock(&sensor->lock);
-	return ret;
-}
-
-static int mt9m032_s_stream(struct v4l2_subdev *subdev, int streaming)
-{
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-	int ret;
-
-	mutex_lock(&sensor->lock);
-	ret = update_formatter2(sensor, streaming);
-	if (!ret)
-		sensor->streaming = streaming;
-	mutex_unlock(&sensor->lock);
-
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev core operations
- */
-
-#ifdef CONFIG_VIDEO_ADV_DEBUG
-static int mt9m032_g_register(struct v4l2_subdev *sd,
-			      struct v4l2_dbg_register *reg)
-{
-	struct mt9m032 *sensor = to_mt9m032(sd);
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	int val;
-
-	if (reg->reg > 0xff)
-		return -EINVAL;
-
-	val = mt9m032_read(client, reg->reg);
-	if (val < 0)
-		return -EIO;
-
-	reg->size = 2;
-	reg->val = val;
-
-	return 0;
-}
-
-static int mt9m032_s_register(struct v4l2_subdev *sd,
-			      const struct v4l2_dbg_register *reg)
-{
-	struct mt9m032 *sensor = to_mt9m032(sd);
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-
-	if (reg->reg > 0xff)
-		return -EINVAL;
-
-	return mt9m032_write(client, reg->reg, reg->val);
-}
-#endif
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev control operations
- */
-
-static int update_read_mode2(struct mt9m032 *sensor, bool vflip, bool hflip)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	int reg_val = (vflip << MT9M032_READ_MODE2_VFLIP_SHIFT)
-		    | (hflip << MT9M032_READ_MODE2_HFLIP_SHIFT)
-		    | MT9M032_READ_MODE2_ROW_BLC
-		    | 0x0007;
-
-	return mt9m032_write(client, MT9M032_READ_MODE2, reg_val);
-}
-
-static int mt9m032_set_gain(struct mt9m032 *sensor, s32 val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	int digital_gain_val;	/* in 1/8th (0..127) */
-	int analog_mul;		/* 0 or 1 */
-	int analog_gain_val;	/* in 1/16th. (0..63) */
-	u16 reg_val;
-
-	digital_gain_val = 51; /* from setup example */
-
-	if (val < 63) {
-		analog_mul = 0;
-		analog_gain_val = val;
-	} else {
-		analog_mul = 1;
-		analog_gain_val = val / 2;
-	}
-
-	/* a_gain = (1 + analog_mul) + (analog_gain_val + 1) / 16 */
-	/* overall_gain = a_gain * (1 + digital_gain_val / 8) */
-
-	reg_val = ((digital_gain_val & MT9M032_GAIN_DIGITAL_MASK)
-		   << MT9M032_GAIN_DIGITAL_SHIFT)
-		| ((analog_mul & 1) << MT9M032_GAIN_AMUL_SHIFT)
-		| (analog_gain_val & MT9M032_GAIN_ANALOG_MASK);
-
-	return mt9m032_write(client, MT9M032_GAIN_ALL, reg_val);
-}
-
-static int mt9m032_try_ctrl(struct v4l2_ctrl *ctrl)
-{
-	if (ctrl->id == V4L2_CID_GAIN && ctrl->val >= 63) {
-		/* round because of multiplier used for values >= 63 */
-		ctrl->val &= ~1;
-	}
-
-	return 0;
-}
-
-static int mt9m032_set_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct mt9m032 *sensor =
-		container_of(ctrl->handler, struct mt9m032, ctrls);
-	struct i2c_client *client = v4l2_get_subdevdata(&sensor->subdev);
-	int ret;
-
-	switch (ctrl->id) {
-	case V4L2_CID_GAIN:
-		return mt9m032_set_gain(sensor, ctrl->val);
-
-	case V4L2_CID_HFLIP:
-	/* case V4L2_CID_VFLIP: -- In the same cluster */
-		return update_read_mode2(sensor, sensor->vflip->val,
-					 sensor->hflip->val);
-
-	case V4L2_CID_EXPOSURE:
-		ret = mt9m032_write(client, MT9M032_SHUTTER_WIDTH_HIGH,
-				    (ctrl->val >> 16) & 0xffff);
-		if (ret < 0)
-			return ret;
-
-		return mt9m032_write(client, MT9M032_SHUTTER_WIDTH_LOW,
-				     ctrl->val & 0xffff);
-	}
-
-	return 0;
-}
-
-static const struct v4l2_ctrl_ops mt9m032_ctrl_ops = {
-	.s_ctrl = mt9m032_set_ctrl,
-	.try_ctrl = mt9m032_try_ctrl,
-};
-
-/* -------------------------------------------------------------------------- */
-
-static const struct v4l2_subdev_core_ops mt9m032_core_ops = {
-#ifdef CONFIG_VIDEO_ADV_DEBUG
-	.g_register = mt9m032_g_register,
-	.s_register = mt9m032_s_register,
-#endif
-};
-
-static const struct v4l2_subdev_video_ops mt9m032_video_ops = {
-	.s_stream = mt9m032_s_stream,
-	.g_frame_interval = mt9m032_get_frame_interval,
-	.s_frame_interval = mt9m032_set_frame_interval,
-};
-
-static const struct v4l2_subdev_pad_ops mt9m032_pad_ops = {
-	.enum_mbus_code = mt9m032_enum_mbus_code,
-	.enum_frame_size = mt9m032_enum_frame_size,
-	.get_fmt = mt9m032_get_pad_format,
-	.set_fmt = mt9m032_set_pad_format,
-	.set_selection = mt9m032_set_pad_selection,
-	.get_selection = mt9m032_get_pad_selection,
-};
-
-static const struct v4l2_subdev_ops mt9m032_ops = {
-	.core = &mt9m032_core_ops,
-	.video = &mt9m032_video_ops,
-	.pad = &mt9m032_pad_ops,
-};
-
-/* -----------------------------------------------------------------------------
- * Driver initialization and probing
- */
-
-static int mt9m032_probe(struct i2c_client *client)
-{
-	struct mt9m032_platform_data *pdata = client->dev.platform_data;
-	struct i2c_adapter *adapter = client->adapter;
-	struct mt9m032 *sensor;
-	int chip_version;
-	int ret;
-
-	if (pdata == NULL) {
-		dev_err(&client->dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_WORD_DATA)) {
-		dev_warn(&client->dev,
-			 "I2C-Adapter doesn't support I2C_FUNC_SMBUS_WORD\n");
-		return -EIO;
-	}
-
-	if (!client->dev.platform_data)
-		return -ENODEV;
-
-	sensor = devm_kzalloc(&client->dev, sizeof(*sensor), GFP_KERNEL);
-	if (sensor == NULL)
-		return -ENOMEM;
-
-	mutex_init(&sensor->lock);
-
-	sensor->pdata = pdata;
-
-	v4l2_i2c_subdev_init(&sensor->subdev, client, &mt9m032_ops);
-	sensor->subdev.flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	chip_version = mt9m032_read(client, MT9M032_CHIP_VERSION);
-	if (chip_version != MT9M032_CHIP_VERSION_VALUE) {
-		dev_err(&client->dev, "MT9M032 not detected, wrong version "
-			"0x%04x\n", chip_version);
-		ret = -ENODEV;
-		goto error_sensor;
-	}
-
-	dev_info(&client->dev, "MT9M032 detected at address 0x%02x\n",
-		 client->addr);
-
-	sensor->frame_interval.numerator = 1;
-	sensor->frame_interval.denominator = 30;
-
-	sensor->crop.left = MT9M032_COLUMN_START_DEF;
-	sensor->crop.top = MT9M032_ROW_START_DEF;
-	sensor->crop.width = MT9M032_COLUMN_SIZE_DEF;
-	sensor->crop.height = MT9M032_ROW_SIZE_DEF;
-
-	sensor->format.width = sensor->crop.width;
-	sensor->format.height = sensor->crop.height;
-	sensor->format.code = MEDIA_BUS_FMT_Y8_1X8;
-	sensor->format.field = V4L2_FIELD_NONE;
-	sensor->format.colorspace = V4L2_COLORSPACE_SRGB;
-
-	v4l2_ctrl_handler_init(&sensor->ctrls, 5);
-
-	v4l2_ctrl_new_std(&sensor->ctrls, &mt9m032_ctrl_ops,
-			  V4L2_CID_GAIN, 0, 127, 1, 64);
-
-	sensor->hflip = v4l2_ctrl_new_std(&sensor->ctrls,
-					  &mt9m032_ctrl_ops,
-					  V4L2_CID_HFLIP, 0, 1, 1, 0);
-	sensor->vflip = v4l2_ctrl_new_std(&sensor->ctrls,
-					  &mt9m032_ctrl_ops,
-					  V4L2_CID_VFLIP, 0, 1, 1, 0);
-
-	v4l2_ctrl_new_std(&sensor->ctrls, &mt9m032_ctrl_ops,
-			  V4L2_CID_EXPOSURE, MT9M032_SHUTTER_WIDTH_MIN,
-			  MT9M032_SHUTTER_WIDTH_MAX, 1,
-			  MT9M032_SHUTTER_WIDTH_DEF);
-	v4l2_ctrl_new_std(&sensor->ctrls, &mt9m032_ctrl_ops,
-			  V4L2_CID_PIXEL_RATE, pdata->pix_clock,
-			  pdata->pix_clock, 1, pdata->pix_clock);
-
-	if (sensor->ctrls.error) {
-		ret = sensor->ctrls.error;
-		dev_err(&client->dev, "control initialization error %d\n", ret);
-		goto error_ctrl;
-	}
-
-	v4l2_ctrl_cluster(2, &sensor->hflip);
-
-	sensor->subdev.ctrl_handler = &sensor->ctrls;
-	sensor->subdev.entity.function = MEDIA_ENT_F_CAM_SENSOR;
-	sensor->pad.flags = MEDIA_PAD_FL_SOURCE;
-	ret = media_entity_pads_init(&sensor->subdev.entity, 1, &sensor->pad);
-	if (ret < 0)
-		goto error_ctrl;
-
-	ret = mt9m032_write(client, MT9M032_RESET, 1);	/* reset on */
-	if (ret < 0)
-		goto error_entity;
-	ret = mt9m032_write(client, MT9M032_RESET, 0);	/* reset off */
-	if (ret < 0)
-		goto error_entity;
-
-	ret = mt9m032_setup_pll(sensor);
-	if (ret < 0)
-		goto error_entity;
-	usleep_range(10000, 11000);
-
-	ret = v4l2_ctrl_handler_setup(&sensor->ctrls);
-	if (ret < 0)
-		goto error_entity;
-
-	/* SIZE */
-	ret = mt9m032_update_geom_timing(sensor);
-	if (ret < 0)
-		goto error_entity;
-
-	ret = mt9m032_write(client, 0x41, 0x0000);	/* reserved !!! */
-	if (ret < 0)
-		goto error_entity;
-	ret = mt9m032_write(client, 0x42, 0x0003);	/* reserved !!! */
-	if (ret < 0)
-		goto error_entity;
-	ret = mt9m032_write(client, 0x43, 0x0003);	/* reserved !!! */
-	if (ret < 0)
-		goto error_entity;
-	ret = mt9m032_write(client, 0x7f, 0x0000);	/* reserved !!! */
-	if (ret < 0)
-		goto error_entity;
-	if (sensor->pdata->invert_pixclock) {
-		ret = mt9m032_write(client, MT9M032_PIX_CLK_CTRL,
-				    MT9M032_PIX_CLK_CTRL_INV_PIXCLK);
-		if (ret < 0)
-			goto error_entity;
-	}
-
-	ret = mt9m032_write(client, MT9M032_RESTART, 1); /* Restart on */
-	if (ret < 0)
-		goto error_entity;
-	msleep(100);
-	ret = mt9m032_write(client, MT9M032_RESTART, 0); /* Restart off */
-	if (ret < 0)
-		goto error_entity;
-	msleep(100);
-	ret = update_formatter2(sensor, false);
-	if (ret < 0)
-		goto error_entity;
-
-	return ret;
-
-error_entity:
-	media_entity_cleanup(&sensor->subdev.entity);
-error_ctrl:
-	v4l2_ctrl_handler_free(&sensor->ctrls);
-error_sensor:
-	mutex_destroy(&sensor->lock);
-	return ret;
-}
-
-static void mt9m032_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *subdev = i2c_get_clientdata(client);
-	struct mt9m032 *sensor = to_mt9m032(subdev);
-
-	v4l2_device_unregister_subdev(subdev);
-	v4l2_ctrl_handler_free(&sensor->ctrls);
-	media_entity_cleanup(&subdev->entity);
-	mutex_destroy(&sensor->lock);
-}
-
-static const struct i2c_device_id mt9m032_id_table[] = {
-	{ MT9M032_NAME, 0 },
-	{ }
-};
-
-MODULE_DEVICE_TABLE(i2c, mt9m032_id_table);
-
-static struct i2c_driver mt9m032_i2c_driver = {
-	.driver = {
-		.name = MT9M032_NAME,
-	},
-	.probe_new = mt9m032_probe,
-	.remove = mt9m032_remove,
-	.id_table = mt9m032_id_table,
-};
-
-module_i2c_driver(mt9m032_i2c_driver);
-
-MODULE_AUTHOR("Martin Hostettler <martin@neutronstar.dyndns.org>");
-MODULE_DESCRIPTION("MT9M032 camera sensor driver");
-MODULE_LICENSE("GPL v2");
diff --git a/include/media/i2c/mt9m032.h b/include/media/i2c/mt9m032.h
deleted file mode 100644
index 1bd58757717a..000000000000
--- a/include/media/i2c/mt9m032.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Driver for MT9M032 CMOS Image Sensor from Micron
- *
- * Copyright (C) 2010-2011 Lund Engineering
- * Contact: Gil Lund <gwlund@lundeng.com>
- * Author: Martin Hostettler <martin@neutronstar.dyndns.org>
- */
-
-#ifndef MT9M032_H
-#define MT9M032_H
-
-#define MT9M032_NAME		"mt9m032"
-#define MT9M032_I2C_ADDR	(0xb8 >> 1)
-
-struct mt9m032_platform_data {
-	u32 ext_clock;
-	u32 pix_clock;
-	bool invert_pixclock;
-
-};
-#endif /* MT9M032_H */
-- 
cgit v1.2.3


From 95a9ea8a13885ef540f851091e87c78594c08829 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused mt9t001 camera sensor driver

The mt9t001 camera sensor driver doesn't support DT and relies on
platform data. No board file has ever provided platform data for that
device. The driver has thus never been used in the mainline kernel since
its introduction in v3.2. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |   1 -
 MAINTAINERS                                      |   8 -
 drivers/media/i2c/Kconfig                        |   9 -
 drivers/media/i2c/Makefile                       |   1 -
 drivers/media/i2c/mt9t001.c                      | 992 -----------------------
 include/media/i2c/mt9t001.h                      |  10 -
 6 files changed, 1021 deletions(-)
 delete mode 100644 drivers/media/i2c/mt9t001.c
 delete mode 100644 include/media/i2c/mt9t001.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index 6db642460e25..35660b6c6cf5 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -75,7 +75,6 @@ imx412        Sony IMX412 sensor
 mt9m001       mt9m001
 mt9m111       mt9m111, mt9m112 and mt9m131
 mt9p031       Aptina MT9P031
-mt9t001       Aptina MT9T001
 mt9t112       Aptina MT9T111/MT9T112
 mt9v011       Micron mt9v011 sensor
 mt9v032       Micron MT9V032 sensor
diff --git a/MAINTAINERS b/MAINTAINERS
index 7f258cb5d54d..da34d2eb2b0f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14177,14 +14177,6 @@ F:	Documentation/devicetree/bindings/media/i2c/aptina,mt9p031.yaml
 F:	drivers/media/i2c/mt9p031.c
 F:	include/media/i2c/mt9p031.h
 
-MT9T001 APTINA CAMERA SENSOR
-M:	Laurent Pinchart <laurent.pinchart@ideasonboard.com>
-L:	linux-media@vger.kernel.org
-S:	Maintained
-T:	git git://linuxtv.org/media_tree.git
-F:	drivers/media/i2c/mt9t001.c
-F:	include/media/i2c/mt9t001.h
-
 MT9T112 APTINA CAMERA SENSOR
 M:	Jacopo Mondi <jacopo@jmondi.org>
 L:	linux-media@vger.kernel.org
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index 7e5090a6a6b6..8444cfe24306 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -286,15 +286,6 @@ config VIDEO_MT9P031
 	  This is a Video4Linux2 sensor driver for the Aptina
 	  (Micron) mt9p031 5 Mpixel camera.
 
-config VIDEO_MT9T001
-	tristate "Aptina MT9T001 support"
-	depends on I2C && VIDEO_DEV
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	help
-	  This is a Video4Linux2 sensor driver for the Aptina
-	  (Micron) mt0t001 3 Mpixel camera.
-
 config VIDEO_MT9T112
 	tristate "Aptina MT9T111/MT9T112 support"
 	depends on I2C && VIDEO_DEV
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 18844982ab3d..77c6875c808f 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -62,7 +62,6 @@ obj-$(CONFIG_VIDEO_MSP3400) += msp3400.o
 obj-$(CONFIG_VIDEO_MT9M001) += mt9m001.o
 obj-$(CONFIG_VIDEO_MT9M111) += mt9m111.o
 obj-$(CONFIG_VIDEO_MT9P031) += mt9p031.o
-obj-$(CONFIG_VIDEO_MT9T001) += mt9t001.o
 obj-$(CONFIG_VIDEO_MT9T112) += mt9t112.o
 obj-$(CONFIG_VIDEO_MT9V011) += mt9v011.o
 obj-$(CONFIG_VIDEO_MT9V032) += mt9v032.o
diff --git a/drivers/media/i2c/mt9t001.c b/drivers/media/i2c/mt9t001.c
deleted file mode 100644
index c635ed11388a..000000000000
--- a/drivers/media/i2c/mt9t001.c
+++ /dev/null
@@ -1,992 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Driver for MT9T001 CMOS Image Sensor from Aptina (Micron)
- *
- * Copyright (C) 2010-2011, Laurent Pinchart <laurent.pinchart@ideasonboard.com>
- *
- * Based on the MT9M001 driver,
- *
- * Copyright (C) 2008, Guennadi Liakhovetski <kernel@pengutronix.de>
- */
-
-#include <linux/clk.h>
-#include <linux/i2c.h>
-#include <linux/log2.h>
-#include <linux/module.h>
-#include <linux/regulator/consumer.h>
-#include <linux/slab.h>
-#include <linux/videodev2.h>
-#include <linux/v4l2-mediabus.h>
-
-#include <media/i2c/mt9t001.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-
-#define MT9T001_PIXEL_ARRAY_HEIGHT			1568
-#define MT9T001_PIXEL_ARRAY_WIDTH			2112
-
-#define MT9T001_CHIP_VERSION				0x00
-#define		MT9T001_CHIP_ID				0x1621
-#define MT9T001_ROW_START				0x01
-#define		MT9T001_ROW_START_MIN			0
-#define		MT9T001_ROW_START_DEF			20
-#define		MT9T001_ROW_START_MAX			1534
-#define MT9T001_COLUMN_START				0x02
-#define		MT9T001_COLUMN_START_MIN		0
-#define		MT9T001_COLUMN_START_DEF		32
-#define		MT9T001_COLUMN_START_MAX		2046
-#define MT9T001_WINDOW_HEIGHT				0x03
-#define		MT9T001_WINDOW_HEIGHT_MIN		1
-#define		MT9T001_WINDOW_HEIGHT_DEF		1535
-#define		MT9T001_WINDOW_HEIGHT_MAX		1567
-#define MT9T001_WINDOW_WIDTH				0x04
-#define		MT9T001_WINDOW_WIDTH_MIN		1
-#define		MT9T001_WINDOW_WIDTH_DEF		2047
-#define		MT9T001_WINDOW_WIDTH_MAX		2111
-#define MT9T001_HORIZONTAL_BLANKING			0x05
-#define		MT9T001_HORIZONTAL_BLANKING_MIN		21
-#define		MT9T001_HORIZONTAL_BLANKING_MAX		1023
-#define MT9T001_VERTICAL_BLANKING			0x06
-#define		MT9T001_VERTICAL_BLANKING_MIN		3
-#define		MT9T001_VERTICAL_BLANKING_MAX		1023
-#define MT9T001_OUTPUT_CONTROL				0x07
-#define		MT9T001_OUTPUT_CONTROL_SYNC		(1 << 0)
-#define		MT9T001_OUTPUT_CONTROL_CHIP_ENABLE	(1 << 1)
-#define		MT9T001_OUTPUT_CONTROL_TEST_DATA	(1 << 6)
-#define		MT9T001_OUTPUT_CONTROL_DEF		0x0002
-#define MT9T001_SHUTTER_WIDTH_HIGH			0x08
-#define MT9T001_SHUTTER_WIDTH_LOW			0x09
-#define		MT9T001_SHUTTER_WIDTH_MIN		1
-#define		MT9T001_SHUTTER_WIDTH_DEF		1561
-#define		MT9T001_SHUTTER_WIDTH_MAX		(1024 * 1024)
-#define MT9T001_PIXEL_CLOCK				0x0a
-#define		MT9T001_PIXEL_CLOCK_INVERT		(1 << 15)
-#define		MT9T001_PIXEL_CLOCK_SHIFT_MASK		(7 << 8)
-#define		MT9T001_PIXEL_CLOCK_SHIFT_SHIFT		8
-#define		MT9T001_PIXEL_CLOCK_DIVIDE_MASK		(0x7f << 0)
-#define MT9T001_FRAME_RESTART				0x0b
-#define MT9T001_SHUTTER_DELAY				0x0c
-#define		MT9T001_SHUTTER_DELAY_MAX		2047
-#define MT9T001_RESET					0x0d
-#define MT9T001_READ_MODE1				0x1e
-#define		MT9T001_READ_MODE_SNAPSHOT		(1 << 8)
-#define		MT9T001_READ_MODE_STROBE_ENABLE		(1 << 9)
-#define		MT9T001_READ_MODE_STROBE_WIDTH		(1 << 10)
-#define		MT9T001_READ_MODE_STROBE_OVERRIDE	(1 << 11)
-#define MT9T001_READ_MODE2				0x20
-#define		MT9T001_READ_MODE_BAD_FRAMES		(1 << 0)
-#define		MT9T001_READ_MODE_LINE_VALID_CONTINUOUS	(1 << 9)
-#define		MT9T001_READ_MODE_LINE_VALID_FRAME	(1 << 10)
-#define MT9T001_READ_MODE3				0x21
-#define		MT9T001_READ_MODE_GLOBAL_RESET		(1 << 0)
-#define		MT9T001_READ_MODE_GHST_CTL		(1 << 1)
-#define MT9T001_ROW_ADDRESS_MODE			0x22
-#define		MT9T001_ROW_SKIP_MASK			(7 << 0)
-#define		MT9T001_ROW_BIN_MASK			(3 << 3)
-#define		MT9T001_ROW_BIN_SHIFT			3
-#define MT9T001_COLUMN_ADDRESS_MODE			0x23
-#define		MT9T001_COLUMN_SKIP_MASK		(7 << 0)
-#define		MT9T001_COLUMN_BIN_MASK			(3 << 3)
-#define		MT9T001_COLUMN_BIN_SHIFT		3
-#define MT9T001_GREEN1_GAIN				0x2b
-#define MT9T001_BLUE_GAIN				0x2c
-#define MT9T001_RED_GAIN				0x2d
-#define MT9T001_GREEN2_GAIN				0x2e
-#define MT9T001_TEST_DATA				0x32
-#define MT9T001_GLOBAL_GAIN				0x35
-#define		MT9T001_GLOBAL_GAIN_MIN			8
-#define		MT9T001_GLOBAL_GAIN_MAX			1024
-#define MT9T001_BLACK_LEVEL				0x49
-#define MT9T001_ROW_BLACK_DEFAULT_OFFSET		0x4b
-#define MT9T001_BLC_DELTA_THRESHOLDS			0x5d
-#define MT9T001_CAL_THRESHOLDS				0x5f
-#define MT9T001_GREEN1_OFFSET				0x60
-#define MT9T001_GREEN2_OFFSET				0x61
-#define MT9T001_BLACK_LEVEL_CALIBRATION			0x62
-#define		MT9T001_BLACK_LEVEL_OVERRIDE		(1 << 0)
-#define		MT9T001_BLACK_LEVEL_DISABLE_OFFSET	(1 << 1)
-#define		MT9T001_BLACK_LEVEL_RECALCULATE		(1 << 12)
-#define		MT9T001_BLACK_LEVEL_LOCK_RED_BLUE	(1 << 13)
-#define		MT9T001_BLACK_LEVEL_LOCK_GREEN		(1 << 14)
-#define MT9T001_RED_OFFSET				0x63
-#define MT9T001_BLUE_OFFSET				0x64
-
-struct mt9t001 {
-	struct v4l2_subdev subdev;
-	struct media_pad pad;
-
-	struct clk *clk;
-	struct regulator_bulk_data regulators[2];
-
-	struct mutex power_lock; /* lock to protect power_count */
-	int power_count;
-
-	struct v4l2_mbus_framefmt format;
-	struct v4l2_rect crop;
-
-	struct v4l2_ctrl_handler ctrls;
-	struct v4l2_ctrl *gains[4];
-
-	u16 output_control;
-	u16 black_level;
-};
-
-static inline struct mt9t001 *to_mt9t001(struct v4l2_subdev *sd)
-{
-	return container_of(sd, struct mt9t001, subdev);
-}
-
-static int mt9t001_read(struct i2c_client *client, u8 reg)
-{
-	return i2c_smbus_read_word_swapped(client, reg);
-}
-
-static int mt9t001_write(struct i2c_client *client, u8 reg, u16 data)
-{
-	return i2c_smbus_write_word_swapped(client, reg, data);
-}
-
-static int mt9t001_set_output_control(struct mt9t001 *mt9t001, u16 clear,
-				      u16 set)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&mt9t001->subdev);
-	u16 value = (mt9t001->output_control & ~clear) | set;
-	int ret;
-
-	if (value == mt9t001->output_control)
-		return 0;
-
-	ret = mt9t001_write(client, MT9T001_OUTPUT_CONTROL, value);
-	if (ret < 0)
-		return ret;
-
-	mt9t001->output_control = value;
-	return 0;
-}
-
-static int mt9t001_reset(struct mt9t001 *mt9t001)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&mt9t001->subdev);
-	int ret;
-
-	/* Reset the chip and stop data read out */
-	ret = mt9t001_write(client, MT9T001_RESET, 1);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_RESET, 0);
-	if (ret < 0)
-		return ret;
-
-	mt9t001->output_control = MT9T001_OUTPUT_CONTROL_DEF;
-
-	return mt9t001_set_output_control(mt9t001,
-					  MT9T001_OUTPUT_CONTROL_CHIP_ENABLE,
-					  0);
-}
-
-static int mt9t001_power_on(struct mt9t001 *mt9t001)
-{
-	int ret;
-
-	/* Bring up the supplies */
-	ret = regulator_bulk_enable(ARRAY_SIZE(mt9t001->regulators),
-				   mt9t001->regulators);
-	if (ret < 0)
-		return ret;
-
-	/* Enable clock */
-	ret = clk_prepare_enable(mt9t001->clk);
-	if (ret < 0)
-		regulator_bulk_disable(ARRAY_SIZE(mt9t001->regulators),
-				       mt9t001->regulators);
-
-	return ret;
-}
-
-static void mt9t001_power_off(struct mt9t001 *mt9t001)
-{
-	regulator_bulk_disable(ARRAY_SIZE(mt9t001->regulators),
-			       mt9t001->regulators);
-
-	clk_disable_unprepare(mt9t001->clk);
-}
-
-static int __mt9t001_set_power(struct mt9t001 *mt9t001, bool on)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&mt9t001->subdev);
-	int ret;
-
-	if (!on) {
-		mt9t001_power_off(mt9t001);
-		return 0;
-	}
-
-	ret = mt9t001_power_on(mt9t001);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_reset(mt9t001);
-	if (ret < 0) {
-		dev_err(&client->dev, "Failed to reset the camera\n");
-		goto e_power;
-	}
-
-	ret = v4l2_ctrl_handler_setup(&mt9t001->ctrls);
-	if (ret < 0) {
-		dev_err(&client->dev, "Failed to set up control handlers\n");
-		goto e_power;
-	}
-
-	return 0;
-
-e_power:
-	mt9t001_power_off(mt9t001);
-
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev video operations
- */
-
-static struct v4l2_mbus_framefmt *
-__mt9t001_get_pad_format(struct mt9t001 *mt9t001,
-			 struct v4l2_subdev_state *sd_state,
-			 unsigned int pad, enum v4l2_subdev_format_whence which)
-{
-	switch (which) {
-	case V4L2_SUBDEV_FORMAT_TRY:
-		return v4l2_subdev_get_try_format(&mt9t001->subdev, sd_state,
-						  pad);
-	case V4L2_SUBDEV_FORMAT_ACTIVE:
-		return &mt9t001->format;
-	default:
-		return NULL;
-	}
-}
-
-static struct v4l2_rect *
-__mt9t001_get_pad_crop(struct mt9t001 *mt9t001,
-		       struct v4l2_subdev_state *sd_state,
-		       unsigned int pad, enum v4l2_subdev_format_whence which)
-{
-	switch (which) {
-	case V4L2_SUBDEV_FORMAT_TRY:
-		return v4l2_subdev_get_try_crop(&mt9t001->subdev, sd_state,
-						pad);
-	case V4L2_SUBDEV_FORMAT_ACTIVE:
-		return &mt9t001->crop;
-	default:
-		return NULL;
-	}
-}
-
-static int mt9t001_s_stream(struct v4l2_subdev *subdev, int enable)
-{
-	const u16 mode = MT9T001_OUTPUT_CONTROL_CHIP_ENABLE;
-	struct i2c_client *client = v4l2_get_subdevdata(subdev);
-	struct mt9t001_platform_data *pdata = client->dev.platform_data;
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-	struct v4l2_mbus_framefmt *format = &mt9t001->format;
-	struct v4l2_rect *crop = &mt9t001->crop;
-	unsigned int hratio;
-	unsigned int vratio;
-	int ret;
-
-	if (!enable)
-		return mt9t001_set_output_control(mt9t001, mode, 0);
-
-	/* Configure the pixel clock polarity */
-	if (pdata->clk_pol) {
-		ret  = mt9t001_write(client, MT9T001_PIXEL_CLOCK,
-				     MT9T001_PIXEL_CLOCK_INVERT);
-		if (ret < 0)
-			return ret;
-	}
-
-	/* Configure the window size and row/column bin */
-	hratio = DIV_ROUND_CLOSEST(crop->width, format->width);
-	vratio = DIV_ROUND_CLOSEST(crop->height, format->height);
-
-	ret = mt9t001_write(client, MT9T001_ROW_ADDRESS_MODE, hratio - 1);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_COLUMN_ADDRESS_MODE, vratio - 1);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_COLUMN_START, crop->left);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_ROW_START, crop->top);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_WINDOW_WIDTH, crop->width - 1);
-	if (ret < 0)
-		return ret;
-
-	ret = mt9t001_write(client, MT9T001_WINDOW_HEIGHT, crop->height - 1);
-	if (ret < 0)
-		return ret;
-
-	/* Switch to master "normal" mode */
-	return mt9t001_set_output_control(mt9t001, 0, mode);
-}
-
-static int mt9t001_enum_mbus_code(struct v4l2_subdev *subdev,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (code->index > 0)
-		return -EINVAL;
-
-	code->code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	return 0;
-}
-
-static int mt9t001_enum_frame_size(struct v4l2_subdev *subdev,
-				   struct v4l2_subdev_state *sd_state,
-				   struct v4l2_subdev_frame_size_enum *fse)
-{
-	if (fse->index >= 8 || fse->code != MEDIA_BUS_FMT_SGRBG10_1X10)
-		return -EINVAL;
-
-	fse->min_width = (MT9T001_WINDOW_WIDTH_DEF + 1) / fse->index;
-	fse->max_width = fse->min_width;
-	fse->min_height = (MT9T001_WINDOW_HEIGHT_DEF + 1) / fse->index;
-	fse->max_height = fse->min_height;
-
-	return 0;
-}
-
-static int mt9t001_get_format(struct v4l2_subdev *subdev,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *format)
-{
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-
-	format->format = *__mt9t001_get_pad_format(mt9t001, sd_state,
-						   format->pad,
-						   format->which);
-	return 0;
-}
-
-static int mt9t001_set_format(struct v4l2_subdev *subdev,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_format *format)
-{
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-	struct v4l2_mbus_framefmt *__format;
-	struct v4l2_rect *__crop;
-	unsigned int width;
-	unsigned int height;
-	unsigned int hratio;
-	unsigned int vratio;
-
-	__crop = __mt9t001_get_pad_crop(mt9t001, sd_state, format->pad,
-					format->which);
-
-	/* Clamp the width and height to avoid dividing by zero. */
-	width = clamp_t(unsigned int, ALIGN(format->format.width, 2),
-			max_t(unsigned int, __crop->width / 8,
-			      MT9T001_WINDOW_HEIGHT_MIN + 1),
-			__crop->width);
-	height = clamp_t(unsigned int, ALIGN(format->format.height, 2),
-			 max_t(unsigned int, __crop->height / 8,
-			       MT9T001_WINDOW_HEIGHT_MIN + 1),
-			 __crop->height);
-
-	hratio = DIV_ROUND_CLOSEST(__crop->width, width);
-	vratio = DIV_ROUND_CLOSEST(__crop->height, height);
-
-	__format = __mt9t001_get_pad_format(mt9t001, sd_state, format->pad,
-					    format->which);
-	__format->width = __crop->width / hratio;
-	__format->height = __crop->height / vratio;
-
-	format->format = *__format;
-
-	return 0;
-}
-
-static int mt9t001_get_selection(struct v4l2_subdev *subdev,
-				 struct v4l2_subdev_state *sd_state,
-				 struct v4l2_subdev_selection *sel)
-{
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	sel->r = *__mt9t001_get_pad_crop(mt9t001, sd_state, sel->pad,
-					 sel->which);
-	return 0;
-}
-
-static int mt9t001_set_selection(struct v4l2_subdev *subdev,
-				 struct v4l2_subdev_state *sd_state,
-				 struct v4l2_subdev_selection *sel)
-{
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-	struct v4l2_mbus_framefmt *__format;
-	struct v4l2_rect *__crop;
-	struct v4l2_rect rect;
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	/* Clamp the crop rectangle boundaries and align them to a multiple of 2
-	 * pixels.
-	 */
-	rect.left = clamp(ALIGN(sel->r.left, 2),
-			  MT9T001_COLUMN_START_MIN,
-			  MT9T001_COLUMN_START_MAX);
-	rect.top = clamp(ALIGN(sel->r.top, 2),
-			 MT9T001_ROW_START_MIN,
-			 MT9T001_ROW_START_MAX);
-	rect.width = clamp_t(unsigned int, ALIGN(sel->r.width, 2),
-			     MT9T001_WINDOW_WIDTH_MIN + 1,
-			     MT9T001_WINDOW_WIDTH_MAX + 1);
-	rect.height = clamp_t(unsigned int, ALIGN(sel->r.height, 2),
-			      MT9T001_WINDOW_HEIGHT_MIN + 1,
-			      MT9T001_WINDOW_HEIGHT_MAX + 1);
-
-	rect.width = min_t(unsigned int, rect.width,
-			   MT9T001_PIXEL_ARRAY_WIDTH - rect.left);
-	rect.height = min_t(unsigned int, rect.height,
-			    MT9T001_PIXEL_ARRAY_HEIGHT - rect.top);
-
-	__crop = __mt9t001_get_pad_crop(mt9t001, sd_state, sel->pad,
-					sel->which);
-
-	if (rect.width != __crop->width || rect.height != __crop->height) {
-		/* Reset the output image size if the crop rectangle size has
-		 * been modified.
-		 */
-		__format = __mt9t001_get_pad_format(mt9t001, sd_state,
-						    sel->pad,
-						    sel->which);
-		__format->width = rect.width;
-		__format->height = rect.height;
-	}
-
-	*__crop = rect;
-	sel->r = rect;
-
-	return 0;
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev control operations
- */
-
-#define V4L2_CID_TEST_PATTERN_COLOR	(V4L2_CID_USER_BASE | 0x1001)
-#define V4L2_CID_BLACK_LEVEL_AUTO	(V4L2_CID_USER_BASE | 0x1002)
-#define V4L2_CID_BLACK_LEVEL_OFFSET	(V4L2_CID_USER_BASE | 0x1003)
-#define V4L2_CID_BLACK_LEVEL_CALIBRATE	(V4L2_CID_USER_BASE | 0x1004)
-
-#define V4L2_CID_GAIN_RED		(V4L2_CTRL_CLASS_CAMERA | 0x1001)
-#define V4L2_CID_GAIN_GREEN_RED		(V4L2_CTRL_CLASS_CAMERA | 0x1002)
-#define V4L2_CID_GAIN_GREEN_BLUE	(V4L2_CTRL_CLASS_CAMERA | 0x1003)
-#define V4L2_CID_GAIN_BLUE		(V4L2_CTRL_CLASS_CAMERA | 0x1004)
-
-static u16 mt9t001_gain_value(s32 *gain)
-{
-	/* Gain is controlled by 2 analog stages and a digital stage. Valid
-	 * values for the 3 stages are
-	 *
-	 * Stage		Min	Max	Step
-	 * ------------------------------------------
-	 * First analog stage	x1	x2	1
-	 * Second analog stage	x1	x4	0.125
-	 * Digital stage	x1	x16	0.125
-	 *
-	 * To minimize noise, the gain stages should be used in the second
-	 * analog stage, first analog stage, digital stage order. Gain from a
-	 * previous stage should be pushed to its maximum value before the next
-	 * stage is used.
-	 */
-	if (*gain <= 32)
-		return *gain;
-
-	if (*gain <= 64) {
-		*gain &= ~1;
-		return (1 << 6) | (*gain >> 1);
-	}
-
-	*gain &= ~7;
-	return ((*gain - 64) << 5) | (1 << 6) | 32;
-}
-
-static int mt9t001_ctrl_freeze(struct mt9t001 *mt9t001, bool freeze)
-{
-	return mt9t001_set_output_control(mt9t001,
-		freeze ? 0 : MT9T001_OUTPUT_CONTROL_SYNC,
-		freeze ? MT9T001_OUTPUT_CONTROL_SYNC : 0);
-}
-
-static int mt9t001_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	static const u8 gains[4] = {
-		MT9T001_RED_GAIN, MT9T001_GREEN1_GAIN,
-		MT9T001_GREEN2_GAIN, MT9T001_BLUE_GAIN
-	};
-
-	struct mt9t001 *mt9t001 =
-			container_of(ctrl->handler, struct mt9t001, ctrls);
-	struct i2c_client *client = v4l2_get_subdevdata(&mt9t001->subdev);
-	unsigned int count;
-	unsigned int i;
-	u16 value;
-	int ret;
-
-	switch (ctrl->id) {
-	case V4L2_CID_GAIN_RED:
-	case V4L2_CID_GAIN_GREEN_RED:
-	case V4L2_CID_GAIN_GREEN_BLUE:
-	case V4L2_CID_GAIN_BLUE:
-
-		/* Disable control updates if more than one control has changed
-		 * in the cluster.
-		 */
-		for (i = 0, count = 0; i < 4; ++i) {
-			struct v4l2_ctrl *gain = mt9t001->gains[i];
-
-			if (gain->val != gain->cur.val)
-				count++;
-		}
-
-		if (count > 1) {
-			ret = mt9t001_ctrl_freeze(mt9t001, true);
-			if (ret < 0)
-				return ret;
-		}
-
-		/* Update the gain controls. */
-		for (i = 0; i < 4; ++i) {
-			struct v4l2_ctrl *gain = mt9t001->gains[i];
-
-			if (gain->val == gain->cur.val)
-				continue;
-
-			value = mt9t001_gain_value(&gain->val);
-			ret = mt9t001_write(client, gains[i], value);
-			if (ret < 0) {
-				mt9t001_ctrl_freeze(mt9t001, false);
-				return ret;
-			}
-		}
-
-		/* Enable control updates. */
-		if (count > 1) {
-			ret = mt9t001_ctrl_freeze(mt9t001, false);
-			if (ret < 0)
-				return ret;
-		}
-
-		break;
-
-	case V4L2_CID_EXPOSURE:
-		ret = mt9t001_write(client, MT9T001_SHUTTER_WIDTH_LOW,
-				    ctrl->val & 0xffff);
-		if (ret < 0)
-			return ret;
-
-		return mt9t001_write(client, MT9T001_SHUTTER_WIDTH_HIGH,
-				     ctrl->val >> 16);
-
-	case V4L2_CID_TEST_PATTERN:
-		return mt9t001_set_output_control(mt9t001,
-			ctrl->val ? 0 : MT9T001_OUTPUT_CONTROL_TEST_DATA,
-			ctrl->val ? MT9T001_OUTPUT_CONTROL_TEST_DATA : 0);
-
-	case V4L2_CID_TEST_PATTERN_COLOR:
-		return mt9t001_write(client, MT9T001_TEST_DATA, ctrl->val << 2);
-
-	case V4L2_CID_BLACK_LEVEL_AUTO:
-		value = ctrl->val ? 0 : MT9T001_BLACK_LEVEL_OVERRIDE;
-		ret = mt9t001_write(client, MT9T001_BLACK_LEVEL_CALIBRATION,
-				    value);
-		if (ret < 0)
-			return ret;
-
-		mt9t001->black_level = value;
-		break;
-
-	case V4L2_CID_BLACK_LEVEL_OFFSET:
-		ret = mt9t001_write(client, MT9T001_GREEN1_OFFSET, ctrl->val);
-		if (ret < 0)
-			return ret;
-
-		ret = mt9t001_write(client, MT9T001_GREEN2_OFFSET, ctrl->val);
-		if (ret < 0)
-			return ret;
-
-		ret = mt9t001_write(client, MT9T001_RED_OFFSET, ctrl->val);
-		if (ret < 0)
-			return ret;
-
-		return mt9t001_write(client, MT9T001_BLUE_OFFSET, ctrl->val);
-
-	case V4L2_CID_BLACK_LEVEL_CALIBRATE:
-		return mt9t001_write(client, MT9T001_BLACK_LEVEL_CALIBRATION,
-				     MT9T001_BLACK_LEVEL_RECALCULATE |
-				     mt9t001->black_level);
-	}
-
-	return 0;
-}
-
-static const struct v4l2_ctrl_ops mt9t001_ctrl_ops = {
-	.s_ctrl = mt9t001_s_ctrl,
-};
-
-static const char * const mt9t001_test_pattern_menu[] = {
-	"Disabled",
-	"Enabled",
-};
-
-static const struct v4l2_ctrl_config mt9t001_ctrls[] = {
-	{
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_TEST_PATTERN_COLOR,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Test Pattern Color",
-		.min		= 0,
-		.max		= 1023,
-		.step		= 1,
-		.def		= 0,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_BLACK_LEVEL_AUTO,
-		.type		= V4L2_CTRL_TYPE_BOOLEAN,
-		.name		= "Black Level, Auto",
-		.min		= 0,
-		.max		= 1,
-		.step		= 1,
-		.def		= 1,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_BLACK_LEVEL_OFFSET,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Black Level, Offset",
-		.min		= -256,
-		.max		= 255,
-		.step		= 1,
-		.def		= 32,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_BLACK_LEVEL_CALIBRATE,
-		.type		= V4L2_CTRL_TYPE_BUTTON,
-		.name		= "Black Level, Calibrate",
-		.min		= 0,
-		.max		= 0,
-		.step		= 0,
-		.def		= 0,
-		.flags		= V4L2_CTRL_FLAG_WRITE_ONLY,
-	},
-};
-
-static const struct v4l2_ctrl_config mt9t001_gains[] = {
-	{
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_GAIN_RED,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Gain, Red",
-		.min		= MT9T001_GLOBAL_GAIN_MIN,
-		.max		= MT9T001_GLOBAL_GAIN_MAX,
-		.step		= 1,
-		.def		= MT9T001_GLOBAL_GAIN_MIN,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_GAIN_GREEN_RED,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Gain, Green (R)",
-		.min		= MT9T001_GLOBAL_GAIN_MIN,
-		.max		= MT9T001_GLOBAL_GAIN_MAX,
-		.step		= 1,
-		.def		= MT9T001_GLOBAL_GAIN_MIN,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_GAIN_GREEN_BLUE,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Gain, Green (B)",
-		.min		= MT9T001_GLOBAL_GAIN_MIN,
-		.max		= MT9T001_GLOBAL_GAIN_MAX,
-		.step		= 1,
-		.def		= MT9T001_GLOBAL_GAIN_MIN,
-		.flags		= 0,
-	}, {
-		.ops		= &mt9t001_ctrl_ops,
-		.id		= V4L2_CID_GAIN_BLUE,
-		.type		= V4L2_CTRL_TYPE_INTEGER,
-		.name		= "Gain, Blue",
-		.min		= MT9T001_GLOBAL_GAIN_MIN,
-		.max		= MT9T001_GLOBAL_GAIN_MAX,
-		.step		= 1,
-		.def		= MT9T001_GLOBAL_GAIN_MIN,
-		.flags		= 0,
-	},
-};
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev core operations
- */
-
-static int mt9t001_set_power(struct v4l2_subdev *subdev, int on)
-{
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-	int ret = 0;
-
-	mutex_lock(&mt9t001->power_lock);
-
-	/* If the power count is modified from 0 to != 0 or from != 0 to 0,
-	 * update the power state.
-	 */
-	if (mt9t001->power_count == !on) {
-		ret = __mt9t001_set_power(mt9t001, !!on);
-		if (ret < 0)
-			goto out;
-	}
-
-	/* Update the power count. */
-	mt9t001->power_count += on ? 1 : -1;
-	WARN_ON(mt9t001->power_count < 0);
-
-out:
-	mutex_unlock(&mt9t001->power_lock);
-	return ret;
-}
-
-/* -----------------------------------------------------------------------------
- * V4L2 subdev internal operations
- */
-
-static int mt9t001_registered(struct v4l2_subdev *subdev)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(subdev);
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-	s32 data;
-	int ret;
-
-	ret = mt9t001_power_on(mt9t001);
-	if (ret < 0) {
-		dev_err(&client->dev, "MT9T001 power up failed\n");
-		return ret;
-	}
-
-	/* Read out the chip version register */
-	data = mt9t001_read(client, MT9T001_CHIP_VERSION);
-	mt9t001_power_off(mt9t001);
-
-	if (data != MT9T001_CHIP_ID) {
-		dev_err(&client->dev,
-			"MT9T001 not detected, wrong version 0x%04x\n", data);
-		return -ENODEV;
-	}
-
-	dev_info(&client->dev, "MT9T001 detected at address 0x%02x\n",
-		 client->addr);
-
-	return 0;
-}
-
-static int mt9t001_open(struct v4l2_subdev *subdev, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_mbus_framefmt *format;
-	struct v4l2_rect *crop;
-
-	crop = v4l2_subdev_get_try_crop(subdev, fh->state, 0);
-	crop->left = MT9T001_COLUMN_START_DEF;
-	crop->top = MT9T001_ROW_START_DEF;
-	crop->width = MT9T001_WINDOW_WIDTH_DEF + 1;
-	crop->height = MT9T001_WINDOW_HEIGHT_DEF + 1;
-
-	format = v4l2_subdev_get_try_format(subdev, fh->state, 0);
-	format->code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	format->width = MT9T001_WINDOW_WIDTH_DEF + 1;
-	format->height = MT9T001_WINDOW_HEIGHT_DEF + 1;
-	format->field = V4L2_FIELD_NONE;
-	format->colorspace = V4L2_COLORSPACE_SRGB;
-
-	return mt9t001_set_power(subdev, 1);
-}
-
-static int mt9t001_close(struct v4l2_subdev *subdev, struct v4l2_subdev_fh *fh)
-{
-	return mt9t001_set_power(subdev, 0);
-}
-
-static const struct v4l2_subdev_core_ops mt9t001_subdev_core_ops = {
-	.s_power = mt9t001_set_power,
-};
-
-static const struct v4l2_subdev_video_ops mt9t001_subdev_video_ops = {
-	.s_stream = mt9t001_s_stream,
-};
-
-static const struct v4l2_subdev_pad_ops mt9t001_subdev_pad_ops = {
-	.enum_mbus_code = mt9t001_enum_mbus_code,
-	.enum_frame_size = mt9t001_enum_frame_size,
-	.get_fmt = mt9t001_get_format,
-	.set_fmt = mt9t001_set_format,
-	.get_selection = mt9t001_get_selection,
-	.set_selection = mt9t001_set_selection,
-};
-
-static const struct v4l2_subdev_ops mt9t001_subdev_ops = {
-	.core = &mt9t001_subdev_core_ops,
-	.video = &mt9t001_subdev_video_ops,
-	.pad = &mt9t001_subdev_pad_ops,
-};
-
-static const struct v4l2_subdev_internal_ops mt9t001_subdev_internal_ops = {
-	.registered = mt9t001_registered,
-	.open = mt9t001_open,
-	.close = mt9t001_close,
-};
-
-static int mt9t001_probe(struct i2c_client *client)
-{
-	struct mt9t001_platform_data *pdata = client->dev.platform_data;
-	struct mt9t001 *mt9t001;
-	unsigned int i;
-	int ret;
-
-	if (pdata == NULL) {
-		dev_err(&client->dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	if (!i2c_check_functionality(client->adapter,
-				     I2C_FUNC_SMBUS_WORD_DATA)) {
-		dev_warn(&client->adapter->dev,
-			 "I2C-Adapter doesn't support I2C_FUNC_SMBUS_WORD\n");
-		return -EIO;
-	}
-
-	mt9t001 = devm_kzalloc(&client->dev, sizeof(*mt9t001), GFP_KERNEL);
-	if (!mt9t001)
-		return -ENOMEM;
-
-	mutex_init(&mt9t001->power_lock);
-	mt9t001->output_control = MT9T001_OUTPUT_CONTROL_DEF;
-
-	mt9t001->regulators[0].supply = "vdd";
-	mt9t001->regulators[1].supply = "vaa";
-
-	ret = devm_regulator_bulk_get(&client->dev, 2, mt9t001->regulators);
-	if (ret < 0) {
-		dev_err(&client->dev, "Unable to get regulators\n");
-		return ret;
-	}
-
-	mt9t001->clk = devm_clk_get(&client->dev, NULL);
-	if (IS_ERR(mt9t001->clk)) {
-		dev_err(&client->dev, "Unable to get clock\n");
-		return PTR_ERR(mt9t001->clk);
-	}
-
-	v4l2_ctrl_handler_init(&mt9t001->ctrls, ARRAY_SIZE(mt9t001_ctrls) +
-						ARRAY_SIZE(mt9t001_gains) + 4);
-
-	v4l2_ctrl_new_std(&mt9t001->ctrls, &mt9t001_ctrl_ops,
-			  V4L2_CID_EXPOSURE, MT9T001_SHUTTER_WIDTH_MIN,
-			  MT9T001_SHUTTER_WIDTH_MAX, 1,
-			  MT9T001_SHUTTER_WIDTH_DEF);
-	v4l2_ctrl_new_std(&mt9t001->ctrls, &mt9t001_ctrl_ops,
-			  V4L2_CID_BLACK_LEVEL, 1, 1, 1, 1);
-	v4l2_ctrl_new_std(&mt9t001->ctrls, &mt9t001_ctrl_ops,
-			  V4L2_CID_PIXEL_RATE, pdata->ext_clk, pdata->ext_clk,
-			  1, pdata->ext_clk);
-	v4l2_ctrl_new_std_menu_items(&mt9t001->ctrls, &mt9t001_ctrl_ops,
-			V4L2_CID_TEST_PATTERN,
-			ARRAY_SIZE(mt9t001_test_pattern_menu) - 1, 0,
-			0, mt9t001_test_pattern_menu);
-
-	for (i = 0; i < ARRAY_SIZE(mt9t001_ctrls); ++i)
-		v4l2_ctrl_new_custom(&mt9t001->ctrls, &mt9t001_ctrls[i], NULL);
-
-	for (i = 0; i < ARRAY_SIZE(mt9t001_gains); ++i)
-		mt9t001->gains[i] = v4l2_ctrl_new_custom(&mt9t001->ctrls,
-			&mt9t001_gains[i], NULL);
-
-	v4l2_ctrl_cluster(ARRAY_SIZE(mt9t001_gains), mt9t001->gains);
-
-	mt9t001->subdev.ctrl_handler = &mt9t001->ctrls;
-
-	if (mt9t001->ctrls.error) {
-		printk(KERN_INFO "%s: control initialization error %d\n",
-		       __func__, mt9t001->ctrls.error);
-		ret = -EINVAL;
-		goto done;
-	}
-
-	mt9t001->crop.left = MT9T001_COLUMN_START_DEF;
-	mt9t001->crop.top = MT9T001_ROW_START_DEF;
-	mt9t001->crop.width = MT9T001_WINDOW_WIDTH_DEF + 1;
-	mt9t001->crop.height = MT9T001_WINDOW_HEIGHT_DEF + 1;
-
-	mt9t001->format.code = MEDIA_BUS_FMT_SGRBG10_1X10;
-	mt9t001->format.width = MT9T001_WINDOW_WIDTH_DEF + 1;
-	mt9t001->format.height = MT9T001_WINDOW_HEIGHT_DEF + 1;
-	mt9t001->format.field = V4L2_FIELD_NONE;
-	mt9t001->format.colorspace = V4L2_COLORSPACE_SRGB;
-
-	v4l2_i2c_subdev_init(&mt9t001->subdev, client, &mt9t001_subdev_ops);
-	mt9t001->subdev.internal_ops = &mt9t001_subdev_internal_ops;
-	mt9t001->subdev.flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	mt9t001->subdev.entity.function = MEDIA_ENT_F_CAM_SENSOR;
-	mt9t001->pad.flags = MEDIA_PAD_FL_SOURCE;
-	ret = media_entity_pads_init(&mt9t001->subdev.entity, 1, &mt9t001->pad);
-
-done:
-	if (ret < 0) {
-		v4l2_ctrl_handler_free(&mt9t001->ctrls);
-		media_entity_cleanup(&mt9t001->subdev.entity);
-	}
-
-	return ret;
-}
-
-static void mt9t001_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *subdev = i2c_get_clientdata(client);
-	struct mt9t001 *mt9t001 = to_mt9t001(subdev);
-
-	v4l2_ctrl_handler_free(&mt9t001->ctrls);
-	v4l2_device_unregister_subdev(subdev);
-	media_entity_cleanup(&subdev->entity);
-}
-
-static const struct i2c_device_id mt9t001_id[] = {
-	{ "mt9t001", 0 },
-	{ }
-};
-MODULE_DEVICE_TABLE(i2c, mt9t001_id);
-
-static struct i2c_driver mt9t001_driver = {
-	.driver = {
-		.name = "mt9t001",
-	},
-	.probe_new	= mt9t001_probe,
-	.remove		= mt9t001_remove,
-	.id_table	= mt9t001_id,
-};
-
-module_i2c_driver(mt9t001_driver);
-
-MODULE_DESCRIPTION("Aptina (Micron) MT9T001 Camera driver");
-MODULE_AUTHOR("Laurent Pinchart <laurent.pinchart@ideasonboard.com>");
-MODULE_LICENSE("GPL");
diff --git a/include/media/i2c/mt9t001.h b/include/media/i2c/mt9t001.h
deleted file mode 100644
index 4b1090554270..000000000000
--- a/include/media/i2c/mt9t001.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _MEDIA_MT9T001_H
-#define _MEDIA_MT9T001_H
-
-struct mt9t001_platform_data {
-	unsigned int clk_pol:1;
-	unsigned int ext_clk;
-};
-
-#endif
-- 
cgit v1.2.3


From 4604236d67278885fee3459a39315fb5ff7dc1c6 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused noon010pc30 camera sensor driver

The noon010pc30 camera sensor driver doesn't support DT and relies on
platform data. The last board files supplying platform data for that
device have been removed from the kernel in v3.16. A device tree file
referencing the device has been added in v3.17, but without
corresponding DT bindings, and with DT support in the driver. The driver
thus hasn't been used since v316. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |   1 -
 drivers/media/i2c/Kconfig                        |   8 -
 drivers/media/i2c/Makefile                       |   1 -
 drivers/media/i2c/noon010pc30.c                  | 821 -----------------------
 include/media/i2c/noon010pc30.h                  |  21 -
 5 files changed, 852 deletions(-)
 delete mode 100644 drivers/media/i2c/noon010pc30.c
 delete mode 100644 include/media/i2c/noon010pc30.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index 35660b6c6cf5..e6c2ae43d02d 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -79,7 +79,6 @@ mt9t112       Aptina MT9T111/MT9T112
 mt9v011       Micron mt9v011 sensor
 mt9v032       Micron MT9V032 sensor
 mt9v111       Aptina MT9V111 sensor
-noon010pc30   Siliconfile NOON010PC30 sensor
 ov13858       OmniVision OV13858 sensor
 ov13b10       OmniVision OV13B10 sensor
 ov2640        OmniVision OV2640 sensor
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index 8444cfe24306..40304e017458 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -325,14 +325,6 @@ config VIDEO_MT9V111
 	  To compile this driver as a module, choose M here: the
 	  module will be called mt9v111.
 
-config VIDEO_NOON010PC30
-	tristate "Siliconfile NOON010PC30 sensor support"
-	depends on I2C && VIDEO_DEV
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	help
-	  This driver supports NOON010PC30 CIF camera from Siliconfile
-
 config VIDEO_OG01A1B
 	tristate "OmniVision OG01A1B sensor support"
 	depends on I2C && VIDEO_DEV
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 77c6875c808f..e5e06c56efcd 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -66,7 +66,6 @@ obj-$(CONFIG_VIDEO_MT9T112) += mt9t112.o
 obj-$(CONFIG_VIDEO_MT9V011) += mt9v011.o
 obj-$(CONFIG_VIDEO_MT9V032) += mt9v032.o
 obj-$(CONFIG_VIDEO_MT9V111) += mt9v111.o
-obj-$(CONFIG_VIDEO_NOON010PC30) += noon010pc30.o
 obj-$(CONFIG_VIDEO_OG01A1B) += og01a1b.o
 obj-$(CONFIG_VIDEO_OV02A10) += ov02a10.o
 obj-$(CONFIG_VIDEO_OV08D10) += ov08d10.o
diff --git a/drivers/media/i2c/noon010pc30.c b/drivers/media/i2c/noon010pc30.c
deleted file mode 100644
index 144bef2835f7..000000000000
--- a/drivers/media/i2c/noon010pc30.c
+++ /dev/null
@@ -1,821 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for SiliconFile NOON010PC30 CIF (1/11") Image Sensor with ISP
- *
- * Copyright (C) 2010 - 2011 Samsung Electronics Co., Ltd.
- * Contact: Sylwester Nawrocki, <s.nawrocki@samsung.com>
- *
- * Initial register configuration based on a driver authored by
- * HeungJun Kim <riverful.kim@samsung.com>.
- */
-
-#include <linux/delay.h>
-#include <linux/gpio/consumer.h>
-#include <linux/i2c.h>
-#include <linux/slab.h>
-#include <linux/regulator/consumer.h>
-#include <media/i2c/noon010pc30.h>
-#include <linux/videodev2.h>
-#include <linux/module.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-mediabus.h>
-#include <media/v4l2-subdev.h>
-
-static int debug;
-module_param(debug, int, 0644);
-MODULE_PARM_DESC(debug, "Enable module debug trace. Set to 1 to enable.");
-
-#define MODULE_NAME		"NOON010PC30"
-
-/*
- * Register offsets within a page
- * b15..b8 - page id, b7..b0 - register address
- */
-#define POWER_CTRL_REG		0x0001
-#define PAGEMODE_REG		0x03
-#define DEVICE_ID_REG		0x0004
-#define NOON010PC30_ID		0x86
-#define VDO_CTL_REG(n)		(0x0010 + (n))
-#define SYNC_CTL_REG		0x0012
-/* Window size and position */
-#define WIN_ROWH_REG		0x0013
-#define WIN_ROWL_REG		0x0014
-#define WIN_COLH_REG		0x0015
-#define WIN_COLL_REG		0x0016
-#define WIN_HEIGHTH_REG		0x0017
-#define WIN_HEIGHTL_REG		0x0018
-#define WIN_WIDTHH_REG		0x0019
-#define WIN_WIDTHL_REG		0x001A
-#define HBLANKH_REG		0x001B
-#define HBLANKL_REG		0x001C
-#define VSYNCH_REG		0x001D
-#define VSYNCL_REG		0x001E
-/* VSYNC control */
-#define VS_CTL_REG(n)		(0x00A1 + (n))
-/* page 1 */
-#define ISP_CTL_REG(n)		(0x0110 + (n))
-#define YOFS_REG		0x0119
-#define DARK_YOFS_REG		0x011A
-#define SAT_CTL_REG		0x0120
-#define BSAT_REG		0x0121
-#define RSAT_REG		0x0122
-/* Color correction */
-#define CMC_CTL_REG		0x0130
-#define CMC_OFSGH_REG		0x0133
-#define CMC_OFSGL_REG		0x0135
-#define CMC_SIGN_REG		0x0136
-#define CMC_GOFS_REG		0x0137
-#define CMC_COEF_REG(n)		(0x0138 + (n))
-#define CMC_OFS_REG(n)		(0x0141 + (n))
-/* Gamma correction */
-#define GMA_CTL_REG		0x0160
-#define GMA_COEF_REG(n)		(0x0161 + (n))
-/* Lens Shading */
-#define LENS_CTRL_REG		0x01D0
-#define LENS_XCEN_REG		0x01D1
-#define LENS_YCEN_REG		0x01D2
-#define LENS_RC_REG		0x01D3
-#define LENS_GC_REG		0x01D4
-#define LENS_BC_REG		0x01D5
-#define L_AGON_REG		0x01D6
-#define L_AGOFF_REG		0x01D7
-/* Page 3 - Auto Exposure */
-#define AE_CTL_REG(n)		(0x0310 + (n))
-#define AE_CTL9_REG		0x032C
-#define AE_CTL10_REG		0x032D
-#define AE_YLVL_REG		0x031C
-#define AE_YTH_REG(n)		(0x031D + (n))
-#define AE_WGT_REG		0x0326
-#define EXP_TIMEH_REG		0x0333
-#define EXP_TIMEM_REG		0x0334
-#define EXP_TIMEL_REG		0x0335
-#define EXP_MMINH_REG		0x0336
-#define EXP_MMINL_REG		0x0337
-#define EXP_MMAXH_REG		0x0338
-#define EXP_MMAXM_REG		0x0339
-#define EXP_MMAXL_REG		0x033A
-/* Page 4 - Auto White Balance */
-#define AWB_CTL_REG(n)		(0x0410 + (n))
-#define AWB_ENABE		0x80
-#define AWB_WGHT_REG		0x0419
-#define BGAIN_PAR_REG(n)	(0x044F + (n))
-/* Manual white balance, when AWB_CTL2[0]=1 */
-#define MWB_RGAIN_REG		0x0466
-#define MWB_BGAIN_REG		0x0467
-
-/* The token to mark an array end */
-#define REG_TERM		0xFFFF
-
-struct noon010_format {
-	u32 code;
-	enum v4l2_colorspace colorspace;
-	u16 ispctl1_reg;
-};
-
-struct noon010_frmsize {
-	u16 width;
-	u16 height;
-	int vid_ctl1;
-};
-
-static const char * const noon010_supply_name[] = {
-	"vdd_core", "vddio", "vdda"
-};
-
-#define NOON010_NUM_SUPPLIES ARRAY_SIZE(noon010_supply_name)
-
-struct noon010_info {
-	struct v4l2_subdev sd;
-	struct media_pad pad;
-	struct v4l2_ctrl_handler hdl;
-	struct regulator_bulk_data supply[NOON010_NUM_SUPPLIES];
-	struct gpio_desc *reset;
-	struct gpio_desc *stby;
-
-	/* Protects the struct members below */
-	struct mutex lock;
-
-	const struct noon010_format *curr_fmt;
-	const struct noon010_frmsize *curr_win;
-	unsigned int apply_new_cfg:1;
-	unsigned int streaming:1;
-	unsigned int hflip:1;
-	unsigned int vflip:1;
-	unsigned int power:1;
-	u8 i2c_reg_page;
-};
-
-struct i2c_regval {
-	u16 addr;
-	u16 val;
-};
-
-/* Supported resolutions. */
-static const struct noon010_frmsize noon010_sizes[] = {
-	{
-		.width		= 352,
-		.height		= 288,
-		.vid_ctl1	= 0,
-	}, {
-		.width		= 176,
-		.height		= 144,
-		.vid_ctl1	= 0x10,
-	}, {
-		.width		= 88,
-		.height		= 72,
-		.vid_ctl1	= 0x20,
-	},
-};
-
-/* Supported pixel formats. */
-static const struct noon010_format noon010_formats[] = {
-	{
-		.code		= MEDIA_BUS_FMT_YUYV8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x03,
-	}, {
-		.code		= MEDIA_BUS_FMT_YVYU8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x02,
-	}, {
-		.code		= MEDIA_BUS_FMT_VYUY8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0,
-	}, {
-		.code		= MEDIA_BUS_FMT_UYVY8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x01,
-	}, {
-		.code		= MEDIA_BUS_FMT_RGB565_2X8_BE,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x40,
-	},
-};
-
-static const struct i2c_regval noon010_base_regs[] = {
-	{ WIN_COLL_REG,		0x06 },	{ HBLANKL_REG,		0x7C },
-	/* Color corection and saturation */
-	{ ISP_CTL_REG(0),	0x30 }, { ISP_CTL_REG(2),	0x30 },
-	{ YOFS_REG,		0x80 }, { DARK_YOFS_REG,	0x04 },
-	{ SAT_CTL_REG,		0x1F }, { BSAT_REG,		0x90 },
-	{ CMC_CTL_REG,		0x0F }, { CMC_OFSGH_REG,	0x3C },
-	{ CMC_OFSGL_REG,	0x2C }, { CMC_SIGN_REG,		0x3F },
-	{ CMC_COEF_REG(0),	0x79 }, { CMC_OFS_REG(0),	0x00 },
-	{ CMC_COEF_REG(1),	0x39 }, { CMC_OFS_REG(1),	0x00 },
-	{ CMC_COEF_REG(2),	0x00 }, { CMC_OFS_REG(2),	0x00 },
-	{ CMC_COEF_REG(3),	0x11 }, { CMC_OFS_REG(3),	0x8B },
-	{ CMC_COEF_REG(4),	0x65 }, { CMC_OFS_REG(4),	0x07 },
-	{ CMC_COEF_REG(5),	0x14 }, { CMC_OFS_REG(5),	0x04 },
-	{ CMC_COEF_REG(6),	0x01 }, { CMC_OFS_REG(6),	0x9C },
-	{ CMC_COEF_REG(7),	0x33 }, { CMC_OFS_REG(7),	0x89 },
-	{ CMC_COEF_REG(8),	0x74 }, { CMC_OFS_REG(8),	0x25 },
-	/* Automatic white balance */
-	{ AWB_CTL_REG(0),	0x78 }, { AWB_CTL_REG(1),	0x2E },
-	{ AWB_CTL_REG(2),	0x20 }, { AWB_CTL_REG(3),	0x85 },
-	/* Auto exposure */
-	{ AE_CTL_REG(0),	0xDC }, { AE_CTL_REG(1),	0x81 },
-	{ AE_CTL_REG(2),	0x30 }, { AE_CTL_REG(3),	0xA5 },
-	{ AE_CTL_REG(4),	0x40 }, { AE_CTL_REG(5),	0x51 },
-	{ AE_CTL_REG(6),	0x33 }, { AE_CTL_REG(7),	0x7E },
-	{ AE_CTL9_REG,		0x00 }, { AE_CTL10_REG,		0x02 },
-	{ AE_YLVL_REG,		0x44 },	{ AE_YTH_REG(0),	0x34 },
-	{ AE_YTH_REG(1),	0x30 },	{ AE_WGT_REG,		0xD5 },
-	/* Lens shading compensation */
-	{ LENS_CTRL_REG,	0x01 }, { LENS_XCEN_REG,	0x80 },
-	{ LENS_YCEN_REG,	0x70 }, { LENS_RC_REG,		0x53 },
-	{ LENS_GC_REG,		0x40 }, { LENS_BC_REG,		0x3E },
-	{ REG_TERM,		0 },
-};
-
-static inline struct noon010_info *to_noon010(struct v4l2_subdev *sd)
-{
-	return container_of(sd, struct noon010_info, sd);
-}
-
-static inline struct v4l2_subdev *to_sd(struct v4l2_ctrl *ctrl)
-{
-	return &container_of(ctrl->handler, struct noon010_info, hdl)->sd;
-}
-
-static inline int set_i2c_page(struct noon010_info *info,
-			       struct i2c_client *client, unsigned int reg)
-{
-	u32 page = reg >> 8 & 0xFF;
-	int ret = 0;
-
-	if (info->i2c_reg_page != page && (reg & 0xFF) != 0x03) {
-		ret = i2c_smbus_write_byte_data(client, PAGEMODE_REG, page);
-		if (!ret)
-			info->i2c_reg_page = page;
-	}
-	return ret;
-}
-
-static int cam_i2c_read(struct v4l2_subdev *sd, u32 reg_addr)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct noon010_info *info = to_noon010(sd);
-	int ret = set_i2c_page(info, client, reg_addr);
-
-	if (ret)
-		return ret;
-	return i2c_smbus_read_byte_data(client, reg_addr & 0xFF);
-}
-
-static int cam_i2c_write(struct v4l2_subdev *sd, u32 reg_addr, u32 val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct noon010_info *info = to_noon010(sd);
-	int ret = set_i2c_page(info, client, reg_addr);
-
-	if (ret)
-		return ret;
-	return i2c_smbus_write_byte_data(client, reg_addr & 0xFF, val);
-}
-
-static inline int noon010_bulk_write_reg(struct v4l2_subdev *sd,
-					 const struct i2c_regval *msg)
-{
-	while (msg->addr != REG_TERM) {
-		int ret = cam_i2c_write(sd, msg->addr, msg->val);
-
-		if (ret)
-			return ret;
-		msg++;
-	}
-	return 0;
-}
-
-/* Device reset and sleep mode control */
-static int noon010_power_ctrl(struct v4l2_subdev *sd, bool reset, bool sleep)
-{
-	struct noon010_info *info = to_noon010(sd);
-	u8 reg = sleep ? 0xF1 : 0xF0;
-	int ret = 0;
-
-	if (reset) {
-		ret = cam_i2c_write(sd, POWER_CTRL_REG, reg | 0x02);
-		udelay(20);
-	}
-	if (!ret) {
-		ret = cam_i2c_write(sd, POWER_CTRL_REG, reg);
-		if (reset && !ret)
-			info->i2c_reg_page = -1;
-	}
-	return ret;
-}
-
-/* Automatic white balance control */
-static int noon010_enable_autowhitebalance(struct v4l2_subdev *sd, int on)
-{
-	int ret;
-
-	ret = cam_i2c_write(sd, AWB_CTL_REG(1), on ? 0x2E : 0x2F);
-	if (!ret)
-		ret = cam_i2c_write(sd, AWB_CTL_REG(0), on ? 0xFB : 0x7B);
-	return ret;
-}
-
-/* Called with struct noon010_info.lock mutex held */
-static int noon010_set_flip(struct v4l2_subdev *sd, int hflip, int vflip)
-{
-	struct noon010_info *info = to_noon010(sd);
-	int reg, ret;
-
-	reg = cam_i2c_read(sd, VDO_CTL_REG(1));
-	if (reg < 0)
-		return reg;
-
-	reg &= 0x7C;
-	if (hflip)
-		reg |= 0x01;
-	if (vflip)
-		reg |= 0x02;
-
-	ret = cam_i2c_write(sd, VDO_CTL_REG(1), reg | 0x80);
-	if (!ret) {
-		info->hflip = hflip;
-		info->vflip = vflip;
-	}
-	return ret;
-}
-
-/* Configure resolution and color format */
-static int noon010_set_params(struct v4l2_subdev *sd)
-{
-	struct noon010_info *info = to_noon010(sd);
-
-	int ret = cam_i2c_write(sd, VDO_CTL_REG(0),
-				info->curr_win->vid_ctl1);
-	if (ret)
-		return ret;
-	return cam_i2c_write(sd, ISP_CTL_REG(0),
-			     info->curr_fmt->ispctl1_reg);
-}
-
-/* Find nearest matching image pixel size. */
-static int noon010_try_frame_size(struct v4l2_mbus_framefmt *mf,
-				  const struct noon010_frmsize **size)
-{
-	unsigned int min_err = ~0;
-	int i = ARRAY_SIZE(noon010_sizes);
-	const struct noon010_frmsize *fsize = &noon010_sizes[0],
-		*match = NULL;
-
-	while (i--) {
-		int err = abs(fsize->width - mf->width)
-				+ abs(fsize->height - mf->height);
-
-		if (err < min_err) {
-			min_err = err;
-			match = fsize;
-		}
-		fsize++;
-	}
-	if (match) {
-		mf->width  = match->width;
-		mf->height = match->height;
-		if (size)
-			*size = match;
-		return 0;
-	}
-	return -EINVAL;
-}
-
-/* Called with info.lock mutex held */
-static int power_enable(struct noon010_info *info)
-{
-	int ret;
-
-	if (info->power) {
-		v4l2_info(&info->sd, "%s: sensor is already on\n", __func__);
-		return 0;
-	}
-
-	/* Assert standby: line should be flagged active low in descriptor */
-	if (info->stby)
-		gpiod_set_value(info->stby, 1);
-
-	/* Assert reset: line should be flagged active low in descriptor */
-	if (info->reset)
-		gpiod_set_value(info->reset, 1);
-
-	ret = regulator_bulk_enable(NOON010_NUM_SUPPLIES, info->supply);
-	if (ret)
-		return ret;
-
-	/* De-assert reset and standby */
-	if (info->reset) {
-		msleep(50);
-		gpiod_set_value(info->reset, 0);
-	}
-	if (info->stby) {
-		udelay(1000);
-		gpiod_set_value(info->stby, 0);
-	}
-	/* Cycle reset: assert and deassert */
-	if (info->reset) {
-		udelay(1000);
-		gpiod_set_value(info->reset, 1);
-		msleep(100);
-		gpiod_set_value(info->reset, 0);
-		msleep(20);
-	}
-	info->power = 1;
-
-	v4l2_dbg(1, debug, &info->sd,  "%s: sensor is on\n", __func__);
-	return 0;
-}
-
-/* Called with info.lock mutex held */
-static int power_disable(struct noon010_info *info)
-{
-	int ret;
-
-	if (!info->power) {
-		v4l2_info(&info->sd, "%s: sensor is already off\n", __func__);
-		return 0;
-	}
-
-	ret = regulator_bulk_disable(NOON010_NUM_SUPPLIES, info->supply);
-	if (ret)
-		return ret;
-
-	/* Assert standby and reset */
-	if (info->stby)
-		gpiod_set_value(info->stby, 1);
-
-	if (info->reset)
-		gpiod_set_value(info->reset, 1);
-
-	info->power = 0;
-
-	v4l2_dbg(1, debug, &info->sd,  "%s: sensor is off\n", __func__);
-
-	return 0;
-}
-
-static int noon010_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct v4l2_subdev *sd = to_sd(ctrl);
-	struct noon010_info *info = to_noon010(sd);
-	int ret = 0;
-
-	v4l2_dbg(1, debug, sd, "%s: ctrl_id: %d, value: %d\n",
-		 __func__, ctrl->id, ctrl->val);
-
-	mutex_lock(&info->lock);
-	/*
-	 * If the device is not powered up by the host driver do
-	 * not apply any controls to H/W at this time. Instead
-	 * the controls will be restored right after power-up.
-	 */
-	if (!info->power)
-		goto unlock;
-
-	switch (ctrl->id) {
-	case V4L2_CID_AUTO_WHITE_BALANCE:
-		ret = noon010_enable_autowhitebalance(sd, ctrl->val);
-		break;
-	case V4L2_CID_BLUE_BALANCE:
-		ret = cam_i2c_write(sd, MWB_BGAIN_REG, ctrl->val);
-		break;
-	case V4L2_CID_RED_BALANCE:
-		ret =  cam_i2c_write(sd, MWB_RGAIN_REG, ctrl->val);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-unlock:
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static int noon010_enum_mbus_code(struct v4l2_subdev *sd,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (code->index >= ARRAY_SIZE(noon010_formats))
-		return -EINVAL;
-
-	code->code = noon010_formats[code->index].code;
-	return 0;
-}
-
-static int noon010_get_fmt(struct v4l2_subdev *sd,
-			   struct v4l2_subdev_state *sd_state,
-			   struct v4l2_subdev_format *fmt)
-{
-	struct noon010_info *info = to_noon010(sd);
-	struct v4l2_mbus_framefmt *mf;
-
-	if (fmt->which == V4L2_SUBDEV_FORMAT_TRY) {
-		if (sd_state) {
-			mf = v4l2_subdev_get_try_format(sd, sd_state, 0);
-			fmt->format = *mf;
-		}
-		return 0;
-	}
-	mf = &fmt->format;
-
-	mutex_lock(&info->lock);
-	mf->width = info->curr_win->width;
-	mf->height = info->curr_win->height;
-	mf->code = info->curr_fmt->code;
-	mf->colorspace = info->curr_fmt->colorspace;
-	mf->field = V4L2_FIELD_NONE;
-
-	mutex_unlock(&info->lock);
-	return 0;
-}
-
-/* Return nearest media bus frame format. */
-static const struct noon010_format *noon010_try_fmt(struct v4l2_subdev *sd,
-					    struct v4l2_mbus_framefmt *mf)
-{
-	int i = ARRAY_SIZE(noon010_formats);
-
-	while (--i)
-		if (mf->code == noon010_formats[i].code)
-			break;
-	mf->code = noon010_formats[i].code;
-
-	return &noon010_formats[i];
-}
-
-static int noon010_set_fmt(struct v4l2_subdev *sd,
-			   struct v4l2_subdev_state *sd_state,
-			   struct v4l2_subdev_format *fmt)
-{
-	struct noon010_info *info = to_noon010(sd);
-	const struct noon010_frmsize *size = NULL;
-	const struct noon010_format *nf;
-	struct v4l2_mbus_framefmt *mf;
-	int ret = 0;
-
-	nf = noon010_try_fmt(sd, &fmt->format);
-	noon010_try_frame_size(&fmt->format, &size);
-	fmt->format.colorspace = V4L2_COLORSPACE_JPEG;
-	fmt->format.field = V4L2_FIELD_NONE;
-
-	if (fmt->which == V4L2_SUBDEV_FORMAT_TRY) {
-		if (sd_state) {
-			mf = v4l2_subdev_get_try_format(sd, sd_state, 0);
-			*mf = fmt->format;
-		}
-		return 0;
-	}
-	mutex_lock(&info->lock);
-	if (!info->streaming) {
-		info->apply_new_cfg = 1;
-		info->curr_fmt = nf;
-		info->curr_win = size;
-	} else {
-		ret = -EBUSY;
-	}
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-/* Called with struct noon010_info.lock mutex held */
-static int noon010_base_config(struct v4l2_subdev *sd)
-{
-	int ret = noon010_bulk_write_reg(sd, noon010_base_regs);
-	if (!ret)
-		ret = noon010_set_params(sd);
-	if (!ret)
-		ret = noon010_set_flip(sd, 1, 0);
-
-	return ret;
-}
-
-static int noon010_s_power(struct v4l2_subdev *sd, int on)
-{
-	struct noon010_info *info = to_noon010(sd);
-	int ret;
-
-	mutex_lock(&info->lock);
-	if (on) {
-		ret = power_enable(info);
-		if (!ret)
-			ret = noon010_base_config(sd);
-	} else {
-		noon010_power_ctrl(sd, false, true);
-		ret = power_disable(info);
-	}
-	mutex_unlock(&info->lock);
-
-	/* Restore the controls state */
-	if (!ret && on)
-		ret = v4l2_ctrl_handler_setup(&info->hdl);
-
-	return ret;
-}
-
-static int noon010_s_stream(struct v4l2_subdev *sd, int on)
-{
-	struct noon010_info *info = to_noon010(sd);
-	int ret = 0;
-
-	mutex_lock(&info->lock);
-	if (!info->streaming != !on) {
-		ret = noon010_power_ctrl(sd, false, !on);
-		if (!ret)
-			info->streaming = on;
-	}
-	if (!ret && on && info->apply_new_cfg) {
-		ret = noon010_set_params(sd);
-		if (!ret)
-			info->apply_new_cfg = 0;
-	}
-	mutex_unlock(&info->lock);
-	return ret;
-}
-
-static int noon010_log_status(struct v4l2_subdev *sd)
-{
-	struct noon010_info *info = to_noon010(sd);
-
-	v4l2_ctrl_handler_log_status(&info->hdl, sd->name);
-	return 0;
-}
-
-static int noon010_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_mbus_framefmt *mf = v4l2_subdev_get_try_format(sd,
-								   fh->state,
-								   0);
-
-	mf->width = noon010_sizes[0].width;
-	mf->height = noon010_sizes[0].height;
-	mf->code = noon010_formats[0].code;
-	mf->colorspace = V4L2_COLORSPACE_JPEG;
-	mf->field = V4L2_FIELD_NONE;
-	return 0;
-}
-
-static const struct v4l2_subdev_internal_ops noon010_subdev_internal_ops = {
-	.open = noon010_open,
-};
-
-static const struct v4l2_ctrl_ops noon010_ctrl_ops = {
-	.s_ctrl = noon010_s_ctrl,
-};
-
-static const struct v4l2_subdev_core_ops noon010_core_ops = {
-	.s_power	= noon010_s_power,
-	.log_status	= noon010_log_status,
-};
-
-static const struct v4l2_subdev_pad_ops noon010_pad_ops = {
-	.enum_mbus_code	= noon010_enum_mbus_code,
-	.get_fmt	= noon010_get_fmt,
-	.set_fmt	= noon010_set_fmt,
-};
-
-static const struct v4l2_subdev_video_ops noon010_video_ops = {
-	.s_stream	= noon010_s_stream,
-};
-
-static const struct v4l2_subdev_ops noon010_ops = {
-	.core	= &noon010_core_ops,
-	.pad	= &noon010_pad_ops,
-	.video	= &noon010_video_ops,
-};
-
-/* Return 0 if NOON010PC30L sensor type was detected or -ENODEV otherwise. */
-static int noon010_detect(struct i2c_client *client, struct noon010_info *info)
-{
-	int ret;
-
-	ret = power_enable(info);
-	if (ret)
-		return ret;
-
-	ret = i2c_smbus_read_byte_data(client, DEVICE_ID_REG);
-	if (ret < 0)
-		dev_err(&client->dev, "I2C read failed: 0x%X\n", ret);
-
-	power_disable(info);
-
-	return ret == NOON010PC30_ID ? 0 : -ENODEV;
-}
-
-static int noon010_probe(struct i2c_client *client)
-{
-	struct noon010_info *info;
-	struct v4l2_subdev *sd;
-	const struct noon010pc30_platform_data *pdata
-		= client->dev.platform_data;
-	int ret;
-	int i;
-
-	if (!pdata) {
-		dev_err(&client->dev, "No platform data!\n");
-		return -EIO;
-	}
-
-	info = devm_kzalloc(&client->dev, sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	mutex_init(&info->lock);
-	sd = &info->sd;
-	v4l2_i2c_subdev_init(sd, client, &noon010_ops);
-	/* Static name; NEVER use in new drivers! */
-	strscpy(sd->name, MODULE_NAME, sizeof(sd->name));
-
-	sd->internal_ops = &noon010_subdev_internal_ops;
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	v4l2_ctrl_handler_init(&info->hdl, 3);
-
-	v4l2_ctrl_new_std(&info->hdl, &noon010_ctrl_ops,
-			  V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, 1);
-	v4l2_ctrl_new_std(&info->hdl, &noon010_ctrl_ops,
-			  V4L2_CID_RED_BALANCE, 0, 127, 1, 64);
-	v4l2_ctrl_new_std(&info->hdl, &noon010_ctrl_ops,
-			  V4L2_CID_BLUE_BALANCE, 0, 127, 1, 64);
-
-	sd->ctrl_handler = &info->hdl;
-
-	ret = info->hdl.error;
-	if (ret)
-		goto np_err;
-
-	info->i2c_reg_page	= -1;
-	info->curr_fmt		= &noon010_formats[0];
-	info->curr_win		= &noon010_sizes[0];
-
-	/* Request reset asserted so we get put into reset */
-	info->reset = devm_gpiod_get(&client->dev, "reset", GPIOD_OUT_HIGH);
-	if (IS_ERR(info->reset)) {
-		ret = PTR_ERR(info->reset);
-		goto np_err;
-	}
-	gpiod_set_consumer_name(info->reset, "NOON010PC30 NRST");
-
-	/* Request standby asserted so we get put into standby */
-	info->stby = devm_gpiod_get(&client->dev, "standby", GPIOD_OUT_HIGH);
-	if (IS_ERR(info->stby)) {
-		ret = PTR_ERR(info->stby);
-		goto np_err;
-	}
-	gpiod_set_consumer_name(info->reset, "NOON010PC30 STBY");
-
-	for (i = 0; i < NOON010_NUM_SUPPLIES; i++)
-		info->supply[i].supply = noon010_supply_name[i];
-
-	ret = devm_regulator_bulk_get(&client->dev, NOON010_NUM_SUPPLIES,
-				 info->supply);
-	if (ret)
-		goto np_err;
-
-	info->pad.flags = MEDIA_PAD_FL_SOURCE;
-	sd->entity.function = MEDIA_ENT_F_CAM_SENSOR;
-	ret = media_entity_pads_init(&sd->entity, 1, &info->pad);
-	if (ret < 0)
-		goto np_err;
-
-	ret = noon010_detect(client, info);
-	if (!ret)
-		return 0;
-
-np_err:
-	v4l2_ctrl_handler_free(&info->hdl);
-	v4l2_device_unregister_subdev(sd);
-	return ret;
-}
-
-static void noon010_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *sd = i2c_get_clientdata(client);
-	struct noon010_info *info = to_noon010(sd);
-
-	v4l2_device_unregister_subdev(sd);
-	v4l2_ctrl_handler_free(&info->hdl);
-	media_entity_cleanup(&sd->entity);
-}
-
-static const struct i2c_device_id noon010_id[] = {
-	{ MODULE_NAME, 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, noon010_id);
-
-
-static struct i2c_driver noon010_i2c_driver = {
-	.driver = {
-		.name = MODULE_NAME
-	},
-	.probe_new	= noon010_probe,
-	.remove		= noon010_remove,
-	.id_table	= noon010_id,
-};
-
-module_i2c_driver(noon010_i2c_driver);
-
-MODULE_DESCRIPTION("Siliconfile NOON010PC30 camera driver");
-MODULE_AUTHOR("Sylwester Nawrocki <s.nawrocki@samsung.com>");
-MODULE_LICENSE("GPL");
diff --git a/include/media/i2c/noon010pc30.h b/include/media/i2c/noon010pc30.h
deleted file mode 100644
index 1880dad25cf0..000000000000
--- a/include/media/i2c/noon010pc30.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Driver header for NOON010PC30L camera sensor chip.
- *
- * Copyright (c) 2010 Samsung Electronics, Co. Ltd
- * Contact: Sylwester Nawrocki <s.nawrocki@samsung.com>
- */
-
-#ifndef NOON010PC30_H
-#define NOON010PC30_H
-
-/**
- * struct noon010pc30_platform_data - platform data
- * @clk_rate: the clock frequency in Hz
- */
-
-struct noon010pc30_platform_data {
-	unsigned long clk_rate;
-};
-
-#endif /* NOON010PC30_H */
-- 
cgit v1.2.3


From 56ac4aa4ed76639c4786e7167b2c48c78d04b706 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused s5k6aa camera sensor driver

The s5k6aa camera sensor driver doesn't support DT and relies on
platform data. The last board files supplying platform data for that
device have been removed from the kernel in v3.11. The driver hasn't
been used since them. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |    1 -
 drivers/media/i2c/Kconfig                        |    9 -
 drivers/media/i2c/Makefile                       |    1 -
 drivers/media/i2c/s5k6aa.c                       | 1652 ----------------------
 include/media/i2c/s5k6aa.h                       |   48 -
 5 files changed, 1711 deletions(-)
 delete mode 100644 drivers/media/i2c/s5k6aa.c
 delete mode 100644 include/media/i2c/s5k6aa.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index e6c2ae43d02d..b9a3a561183f 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -105,7 +105,6 @@ s5c73m3       Samsung S5C73M3 sensor
 s5k4ecgx      Samsung S5K4ECGX sensor
 s5k5baf       Samsung S5K5BAF sensor
 s5k6a3        Samsung S5K6A3 sensor
-s5k6aa        Samsung S5K6AAFX sensor
 sr030pc30     Siliconfile SR030PC30 sensor
 vs6624        ST VS6624 sensor
 ============  ==========================================================
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index 40304e017458..a7f2a8ba2fd3 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -784,15 +784,6 @@ config VIDEO_S5K6A3
 	  This is a V4L2 sensor driver for Samsung S5K6A3 raw
 	  camera sensor.
 
-config VIDEO_S5K6AA
-	tristate "Samsung S5K6AAFX sensor support"
-	depends on I2C && VIDEO_DEV
-	select MEDIA_CONTROLLER
-	select VIDEO_V4L2_SUBDEV_API
-	help
-	  This is a V4L2 sensor driver for Samsung S5K6AA(FX) 1.3M
-	  camera sensor with an embedded SoC image signal processor.
-
 config VIDEO_SR030PC30
 	tristate "Siliconfile SR030PC30 sensor support"
 	depends on I2C && VIDEO_DEV
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index e5e06c56efcd..6bc6c4682132 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -105,7 +105,6 @@ obj-$(CONFIG_VIDEO_RJ54N1) += rj54n1cb0c.o
 obj-$(CONFIG_VIDEO_S5C73M3) += s5c73m3/
 obj-$(CONFIG_VIDEO_S5K5BAF) += s5k5baf.o
 obj-$(CONFIG_VIDEO_S5K6A3) += s5k6a3.o
-obj-$(CONFIG_VIDEO_S5K6AA) += s5k6aa.o
 obj-$(CONFIG_VIDEO_SAA6588) += saa6588.o
 obj-$(CONFIG_VIDEO_SAA6752HS) += saa6752hs.o
 obj-$(CONFIG_VIDEO_SAA7110) += saa7110.o
diff --git a/drivers/media/i2c/s5k6aa.c b/drivers/media/i2c/s5k6aa.c
deleted file mode 100644
index 5996153371fc..000000000000
--- a/drivers/media/i2c/s5k6aa.c
+++ /dev/null
@@ -1,1652 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for Samsung S5K6AAFX SXGA 1/6" 1.3M CMOS Image Sensor
- * with embedded SoC ISP.
- *
- * Copyright (C) 2011, Samsung Electronics Co., Ltd.
- * Sylwester Nawrocki <s.nawrocki@samsung.com>
- *
- * Based on a driver authored by Dongsoo Nathaniel Kim.
- * Copyright (C) 2009, Dongsoo Nathaniel Kim <dongsoo45.kim@samsung.com>
- */
-
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <linux/media.h>
-#include <linux/module.h>
-#include <linux/regulator/consumer.h>
-#include <linux/slab.h>
-
-#include <media/media-entity.h>
-#include <media/v4l2-ctrls.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-#include <media/v4l2-mediabus.h>
-#include <media/i2c/s5k6aa.h>
-
-static int debug;
-module_param(debug, int, 0644);
-
-#define DRIVER_NAME			"S5K6AA"
-
-/* The token to indicate array termination */
-#define S5K6AA_TERM			0xffff
-#define S5K6AA_OUT_WIDTH_DEF		640
-#define S5K6AA_OUT_HEIGHT_DEF		480
-#define S5K6AA_WIN_WIDTH_MAX		1280
-#define S5K6AA_WIN_HEIGHT_MAX		1024
-#define S5K6AA_WIN_WIDTH_MIN		8
-#define S5K6AA_WIN_HEIGHT_MIN		8
-
-/*
- * H/W register Interface (0xD0000000 - 0xD0000FFF)
- */
-#define AHB_MSB_ADDR_PTR		0xfcfc
-#define GEN_REG_OFFSH			0xd000
-#define REG_CMDWR_ADDRH			0x0028
-#define REG_CMDWR_ADDRL			0x002a
-#define REG_CMDRD_ADDRH			0x002c
-#define REG_CMDRD_ADDRL			0x002e
-#define REG_CMDBUF0_ADDR		0x0f12
-#define REG_CMDBUF1_ADDR		0x0f10
-
-/*
- * Host S/W Register interface (0x70000000 - 0x70002000)
- * The value of the two most significant address bytes is 0x7000,
- * (HOST_SWIF_OFFS_H). The register addresses below specify 2 LSBs.
- */
-#define HOST_SWIF_OFFSH			0x7000
-
-/* Initialization parameters */
-/* Master clock frequency in KHz */
-#define REG_I_INCLK_FREQ_L		0x01b8
-#define REG_I_INCLK_FREQ_H		0x01ba
-#define  MIN_MCLK_FREQ_KHZ		6000U
-#define  MAX_MCLK_FREQ_KHZ		27000U
-#define REG_I_USE_NPVI_CLOCKS		0x01c6
-#define REG_I_USE_NMIPI_CLOCKS		0x01c8
-
-/* Clock configurations, n = 0..2. REG_I_* frequency unit is 4 kHz. */
-#define REG_I_OPCLK_4KHZ(n)		((n) * 6 + 0x01cc)
-#define REG_I_MIN_OUTRATE_4KHZ(n)	((n) * 6 + 0x01ce)
-#define REG_I_MAX_OUTRATE_4KHZ(n)	((n) * 6 + 0x01d0)
-#define  SYS_PLL_OUT_FREQ		(48000000 / 4000)
-#define  PCLK_FREQ_MIN			(24000000 / 4000)
-#define  PCLK_FREQ_MAX			(48000000 / 4000)
-#define REG_I_INIT_PARAMS_UPDATED	0x01e0
-#define REG_I_ERROR_INFO		0x01e2
-
-/* General purpose parameters */
-#define REG_USER_BRIGHTNESS		0x01e4
-#define REG_USER_CONTRAST		0x01e6
-#define REG_USER_SATURATION		0x01e8
-#define REG_USER_SHARPBLUR		0x01ea
-
-#define REG_G_SPEC_EFFECTS		0x01ee
-#define REG_G_ENABLE_PREV		0x01f0
-#define REG_G_ENABLE_PREV_CHG		0x01f2
-#define REG_G_NEW_CFG_SYNC		0x01f8
-#define REG_G_PREVZOOM_IN_WIDTH		0x020a
-#define REG_G_PREVZOOM_IN_HEIGHT	0x020c
-#define REG_G_PREVZOOM_IN_XOFFS		0x020e
-#define REG_G_PREVZOOM_IN_YOFFS		0x0210
-#define REG_G_INPUTS_CHANGE_REQ		0x021a
-#define REG_G_ACTIVE_PREV_CFG		0x021c
-#define REG_G_PREV_CFG_CHG		0x021e
-#define REG_G_PREV_OPEN_AFTER_CH	0x0220
-#define REG_G_PREV_CFG_ERROR		0x0222
-
-/* Preview control section. n = 0...4. */
-#define PREG(n, x)			((n) * 0x26 + x)
-#define REG_P_OUT_WIDTH(n)		PREG(n, 0x0242)
-#define REG_P_OUT_HEIGHT(n)		PREG(n, 0x0244)
-#define REG_P_FMT(n)			PREG(n, 0x0246)
-#define REG_P_MAX_OUT_RATE(n)		PREG(n, 0x0248)
-#define REG_P_MIN_OUT_RATE(n)		PREG(n, 0x024a)
-#define REG_P_PVI_MASK(n)		PREG(n, 0x024c)
-#define REG_P_CLK_INDEX(n)		PREG(n, 0x024e)
-#define REG_P_FR_RATE_TYPE(n)		PREG(n, 0x0250)
-#define  FR_RATE_DYNAMIC		0
-#define  FR_RATE_FIXED			1
-#define  FR_RATE_FIXED_ACCURATE		2
-#define REG_P_FR_RATE_Q_TYPE(n)		PREG(n, 0x0252)
-#define  FR_RATE_Q_BEST_FRRATE		1 /* Binning enabled */
-#define  FR_RATE_Q_BEST_QUALITY		2 /* Binning disabled */
-/* Frame period in 0.1 ms units */
-#define REG_P_MAX_FR_TIME(n)		PREG(n, 0x0254)
-#define REG_P_MIN_FR_TIME(n)		PREG(n, 0x0256)
-/* Conversion to REG_P_[MAX/MIN]_FR_TIME value; __t: time in us */
-#define  US_TO_FR_TIME(__t)		((__t) / 100)
-#define  S5K6AA_MIN_FR_TIME		33300  /* us */
-#define  S5K6AA_MAX_FR_TIME		650000 /* us */
-#define  S5K6AA_MAX_HIGHRES_FR_TIME	666    /* x100 us */
-/* The below 5 registers are for "device correction" values */
-#define REG_P_COLORTEMP(n)		PREG(n, 0x025e)
-#define REG_P_PREV_MIRROR(n)		PREG(n, 0x0262)
-
-/* Extended image property controls */
-/* Exposure time in 10 us units */
-#define REG_SF_USR_EXPOSURE_L		0x03c6
-#define REG_SF_USR_EXPOSURE_H		0x03c8
-#define REG_SF_USR_EXPOSURE_CHG		0x03ca
-#define REG_SF_USR_TOT_GAIN		0x03cc
-#define REG_SF_USR_TOT_GAIN_CHG		0x03ce
-#define REG_SF_RGAIN			0x03d0
-#define REG_SF_RGAIN_CHG		0x03d2
-#define REG_SF_GGAIN			0x03d4
-#define REG_SF_GGAIN_CHG		0x03d6
-#define REG_SF_BGAIN			0x03d8
-#define REG_SF_BGAIN_CHG		0x03da
-#define REG_SF_FLICKER_QUANT		0x03dc
-#define REG_SF_FLICKER_QUANT_CHG	0x03de
-
-/* Output interface (parallel/MIPI) setup */
-#define REG_OIF_EN_MIPI_LANES		0x03fa
-#define REG_OIF_EN_PACKETS		0x03fc
-#define REG_OIF_CFG_CHG			0x03fe
-
-/* Auto-algorithms enable mask */
-#define REG_DBG_AUTOALG_EN		0x0400
-#define  AALG_ALL_EN_MASK		(1 << 0)
-#define  AALG_AE_EN_MASK		(1 << 1)
-#define  AALG_DIVLEI_EN_MASK		(1 << 2)
-#define  AALG_WB_EN_MASK		(1 << 3)
-#define  AALG_FLICKER_EN_MASK		(1 << 5)
-#define  AALG_FIT_EN_MASK		(1 << 6)
-#define  AALG_WRHW_EN_MASK		(1 << 7)
-
-/* Firmware revision information */
-#define REG_FW_APIVER			0x012e
-#define  S5K6AAFX_FW_APIVER		0x0001
-#define REG_FW_REVISION			0x0130
-
-/* For now we use only one user configuration register set */
-#define S5K6AA_MAX_PRESETS		1
-
-static const char * const s5k6aa_supply_names[] = {
-	"vdd_core",	/* Digital core supply 1.5V (1.4V to 1.6V) */
-	"vdda",		/* Analog power supply 2.8V (2.6V to 3.0V) */
-	"vdd_reg",	/* Regulator input power 1.8V (1.7V to 1.9V)
-			   or 2.8V (2.6V to 3.0) */
-	"vddio",	/* I/O supply 1.8V (1.65V to 1.95V)
-			   or 2.8V (2.5V to 3.1V) */
-};
-#define S5K6AA_NUM_SUPPLIES ARRAY_SIZE(s5k6aa_supply_names)
-
-enum s5k6aa_gpio_id {
-	STBY,
-	RSET,
-	GPIO_NUM,
-};
-
-struct s5k6aa_regval {
-	u16 addr;
-	u16 val;
-};
-
-struct s5k6aa_pixfmt {
-	u32 code;
-	u32 colorspace;
-	/* REG_P_FMT(x) register value */
-	u16 reg_p_fmt;
-};
-
-struct s5k6aa_preset {
-	/* output pixel format and resolution */
-	struct v4l2_mbus_framefmt mbus_fmt;
-	u8 clk_id;
-	u8 index;
-};
-
-struct s5k6aa_ctrls {
-	struct v4l2_ctrl_handler handler;
-	/* Auto / manual white balance cluster */
-	struct v4l2_ctrl *awb;
-	struct v4l2_ctrl *gain_red;
-	struct v4l2_ctrl *gain_blue;
-	struct v4l2_ctrl *gain_green;
-	/* Mirror cluster */
-	struct v4l2_ctrl *hflip;
-	struct v4l2_ctrl *vflip;
-	/* Auto exposure / manual exposure and gain cluster */
-	struct v4l2_ctrl *auto_exp;
-	struct v4l2_ctrl *exposure;
-	struct v4l2_ctrl *gain;
-};
-
-struct s5k6aa_interval {
-	u16 reg_fr_time;
-	struct v4l2_fract interval;
-	/* Maximum rectangle for the interval */
-	struct v4l2_frmsize_discrete size;
-};
-
-struct s5k6aa {
-	struct v4l2_subdev sd;
-	struct media_pad pad;
-
-	enum v4l2_mbus_type bus_type;
-	u8 mipi_lanes;
-
-	int (*s_power)(int enable);
-	struct regulator_bulk_data supplies[S5K6AA_NUM_SUPPLIES];
-	struct s5k6aa_gpio gpio[GPIO_NUM];
-
-	/* external master clock frequency */
-	unsigned long mclk_frequency;
-	/* ISP internal master clock frequency */
-	u16 clk_fop;
-	/* output pixel clock frequency range */
-	u16 pclk_fmin;
-	u16 pclk_fmax;
-
-	unsigned int inv_hflip:1;
-	unsigned int inv_vflip:1;
-
-	/* protects the struct members below */
-	struct mutex lock;
-
-	/* sensor matrix scan window */
-	struct v4l2_rect ccd_rect;
-
-	struct s5k6aa_ctrls ctrls;
-	struct s5k6aa_preset presets[S5K6AA_MAX_PRESETS];
-	struct s5k6aa_preset *preset;
-	const struct s5k6aa_interval *fiv;
-
-	unsigned int streaming:1;
-	unsigned int apply_cfg:1;
-	unsigned int apply_crop:1;
-	unsigned int power;
-};
-
-static struct s5k6aa_regval s5k6aa_analog_config[] = {
-	/* Analog settings */
-	{ 0x112a, 0x0000 }, { 0x1132, 0x0000 },
-	{ 0x113e, 0x0000 }, { 0x115c, 0x0000 },
-	{ 0x1164, 0x0000 }, { 0x1174, 0x0000 },
-	{ 0x1178, 0x0000 }, { 0x077a, 0x0000 },
-	{ 0x077c, 0x0000 }, { 0x077e, 0x0000 },
-	{ 0x0780, 0x0000 }, { 0x0782, 0x0000 },
-	{ 0x0784, 0x0000 }, { 0x0786, 0x0000 },
-	{ 0x0788, 0x0000 }, { 0x07a2, 0x0000 },
-	{ 0x07a4, 0x0000 }, { 0x07a6, 0x0000 },
-	{ 0x07a8, 0x0000 }, { 0x07b6, 0x0000 },
-	{ 0x07b8, 0x0002 }, { 0x07ba, 0x0004 },
-	{ 0x07bc, 0x0004 }, { 0x07be, 0x0005 },
-	{ 0x07c0, 0x0005 }, { S5K6AA_TERM, 0 },
-};
-
-/* TODO: Add RGB888 and Bayer format */
-static const struct s5k6aa_pixfmt s5k6aa_formats[] = {
-	{ MEDIA_BUS_FMT_YUYV8_2X8,	V4L2_COLORSPACE_JPEG,	5 },
-	/* range 16-240 */
-	{ MEDIA_BUS_FMT_YUYV8_2X8,	V4L2_COLORSPACE_REC709,	6 },
-	{ MEDIA_BUS_FMT_RGB565_2X8_BE,	V4L2_COLORSPACE_JPEG,	0 },
-};
-
-static const struct s5k6aa_interval s5k6aa_intervals[] = {
-	{ 1000, {10000, 1000000}, {1280, 1024} }, /* 10 fps */
-	{ 666,  {15000, 1000000}, {1280, 1024} }, /* 15 fps */
-	{ 500,  {20000, 1000000}, {1280, 720} },  /* 20 fps */
-	{ 400,  {25000, 1000000}, {640, 480} },   /* 25 fps */
-	{ 333,  {33300, 1000000}, {640, 480} },   /* 30 fps */
-};
-
-#define S5K6AA_INTERVAL_DEF_INDEX 1 /* 15 fps */
-
-static inline struct v4l2_subdev *ctrl_to_sd(struct v4l2_ctrl *ctrl)
-{
-	return &container_of(ctrl->handler, struct s5k6aa, ctrls.handler)->sd;
-}
-
-static inline struct s5k6aa *to_s5k6aa(struct v4l2_subdev *sd)
-{
-	return container_of(sd, struct s5k6aa, sd);
-}
-
-/* Set initial values for all preview presets */
-static void s5k6aa_presets_data_init(struct s5k6aa *s5k6aa)
-{
-	struct s5k6aa_preset *preset = &s5k6aa->presets[0];
-	int i;
-
-	for (i = 0; i < S5K6AA_MAX_PRESETS; i++) {
-		preset->mbus_fmt.width	= S5K6AA_OUT_WIDTH_DEF;
-		preset->mbus_fmt.height	= S5K6AA_OUT_HEIGHT_DEF;
-		preset->mbus_fmt.code	= s5k6aa_formats[0].code;
-		preset->index		= i;
-		preset->clk_id		= 0;
-		preset++;
-	}
-
-	s5k6aa->fiv = &s5k6aa_intervals[S5K6AA_INTERVAL_DEF_INDEX];
-	s5k6aa->preset = &s5k6aa->presets[0];
-}
-
-static int s5k6aa_i2c_read(struct i2c_client *client, u16 addr, u16 *val)
-{
-	u8 wbuf[2] = {addr >> 8, addr & 0xFF};
-	struct i2c_msg msg[2];
-	u8 rbuf[2];
-	int ret;
-
-	msg[0].addr = client->addr;
-	msg[0].flags = 0;
-	msg[0].len = 2;
-	msg[0].buf = wbuf;
-
-	msg[1].addr = client->addr;
-	msg[1].flags = I2C_M_RD;
-	msg[1].len = 2;
-	msg[1].buf = rbuf;
-
-	ret = i2c_transfer(client->adapter, msg, 2);
-	*val = be16_to_cpu(*((__be16 *)rbuf));
-
-	v4l2_dbg(3, debug, client, "i2c_read: 0x%04X : 0x%04x\n", addr, *val);
-
-	return ret == 2 ? 0 : ret;
-}
-
-static int s5k6aa_i2c_write(struct i2c_client *client, u16 addr, u16 val)
-{
-	u8 buf[4] = {addr >> 8, addr & 0xFF, val >> 8, val & 0xFF};
-
-	int ret = i2c_master_send(client, buf, 4);
-	v4l2_dbg(3, debug, client, "i2c_write: 0x%04X : 0x%04x\n", addr, val);
-
-	return ret == 4 ? 0 : ret;
-}
-
-/* The command register write, assumes Command_Wr_addH = 0x7000. */
-static int s5k6aa_write(struct i2c_client *c, u16 addr, u16 val)
-{
-	int ret = s5k6aa_i2c_write(c, REG_CMDWR_ADDRL, addr);
-	if (ret)
-		return ret;
-	return s5k6aa_i2c_write(c, REG_CMDBUF0_ADDR, val);
-}
-
-/* The command register read, assumes Command_Rd_addH = 0x7000. */
-static int s5k6aa_read(struct i2c_client *client, u16 addr, u16 *val)
-{
-	int ret = s5k6aa_i2c_write(client, REG_CMDRD_ADDRL, addr);
-	if (ret)
-		return ret;
-	return s5k6aa_i2c_read(client, REG_CMDBUF0_ADDR, val);
-}
-
-static int s5k6aa_write_array(struct v4l2_subdev *sd,
-			      const struct s5k6aa_regval *msg)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	u16 addr_incr = 0;
-	int ret = 0;
-
-	while (msg->addr != S5K6AA_TERM) {
-		if (addr_incr != 2)
-			ret = s5k6aa_i2c_write(client, REG_CMDWR_ADDRL,
-					       msg->addr);
-		if (ret)
-			break;
-		ret = s5k6aa_i2c_write(client, REG_CMDBUF0_ADDR, msg->val);
-		if (ret)
-			break;
-		/* Assume that msg->addr is always less than 0xfffc */
-		addr_incr = (msg + 1)->addr - msg->addr;
-		msg++;
-	}
-
-	return ret;
-}
-
-/* Configure the AHB high address bytes for GTG registers access */
-static int s5k6aa_set_ahb_address(struct i2c_client *client)
-{
-	int ret = s5k6aa_i2c_write(client, AHB_MSB_ADDR_PTR, GEN_REG_OFFSH);
-	if (ret)
-		return ret;
-	ret = s5k6aa_i2c_write(client, REG_CMDRD_ADDRH, HOST_SWIF_OFFSH);
-	if (ret)
-		return ret;
-	return s5k6aa_i2c_write(client, REG_CMDWR_ADDRH, HOST_SWIF_OFFSH);
-}
-
-/**
- * s5k6aa_configure_pixel_clocks - apply ISP main clock/PLL configuration
- * @s5k6aa: pointer to &struct s5k6aa describing the device
- *
- * Configure the internal ISP PLL for the required output frequency.
- * Locking: called with s5k6aa.lock mutex held.
- */
-static int s5k6aa_configure_pixel_clocks(struct s5k6aa *s5k6aa)
-{
-	struct i2c_client *c = v4l2_get_subdevdata(&s5k6aa->sd);
-	unsigned long fmclk = s5k6aa->mclk_frequency / 1000;
-	u16 status;
-	int ret;
-
-	if (WARN(fmclk < MIN_MCLK_FREQ_KHZ || fmclk > MAX_MCLK_FREQ_KHZ,
-		 "Invalid clock frequency: %ld\n", fmclk))
-		return -EINVAL;
-
-	s5k6aa->pclk_fmin = PCLK_FREQ_MIN;
-	s5k6aa->pclk_fmax = PCLK_FREQ_MAX;
-	s5k6aa->clk_fop = SYS_PLL_OUT_FREQ;
-
-	/* External input clock frequency in kHz */
-	ret = s5k6aa_write(c, REG_I_INCLK_FREQ_H, fmclk >> 16);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_INCLK_FREQ_L, fmclk & 0xFFFF);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_USE_NPVI_CLOCKS, 1);
-	/* Internal PLL frequency */
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_OPCLK_4KHZ(0), s5k6aa->clk_fop);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_MIN_OUTRATE_4KHZ(0),
-				   s5k6aa->pclk_fmin);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_MAX_OUTRATE_4KHZ(0),
-				   s5k6aa->pclk_fmax);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_I_INIT_PARAMS_UPDATED, 1);
-	if (!ret)
-		ret = s5k6aa_read(c, REG_I_ERROR_INFO, &status);
-
-	return ret ? ret : (status ? -EINVAL : 0);
-}
-
-/* Set horizontal and vertical image flipping */
-static int s5k6aa_set_mirror(struct s5k6aa *s5k6aa, int horiz_flip)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	int index = s5k6aa->preset->index;
-
-	unsigned int vflip = s5k6aa->ctrls.vflip->val ^ s5k6aa->inv_vflip;
-	unsigned int flip = (horiz_flip ^ s5k6aa->inv_hflip) | (vflip << 1);
-
-	return s5k6aa_write(client, REG_P_PREV_MIRROR(index), flip);
-}
-
-/* Configure auto/manual white balance and R/G/B gains */
-static int s5k6aa_set_awb(struct s5k6aa *s5k6aa, int awb)
-{
-	struct i2c_client *c = v4l2_get_subdevdata(&s5k6aa->sd);
-	struct s5k6aa_ctrls *ctrls = &s5k6aa->ctrls;
-	u16 reg;
-
-	int ret = s5k6aa_read(c, REG_DBG_AUTOALG_EN, &reg);
-
-	if (!ret && !awb) {
-		ret = s5k6aa_write(c, REG_SF_RGAIN, ctrls->gain_red->val);
-		if (!ret)
-			ret = s5k6aa_write(c, REG_SF_RGAIN_CHG, 1);
-		if (ret)
-			return ret;
-
-		ret = s5k6aa_write(c, REG_SF_GGAIN, ctrls->gain_green->val);
-		if (!ret)
-			ret = s5k6aa_write(c, REG_SF_GGAIN_CHG, 1);
-		if (ret)
-			return ret;
-
-		ret = s5k6aa_write(c, REG_SF_BGAIN, ctrls->gain_blue->val);
-		if (!ret)
-			ret = s5k6aa_write(c, REG_SF_BGAIN_CHG, 1);
-	}
-	if (!ret) {
-		reg = awb ? reg | AALG_WB_EN_MASK : reg & ~AALG_WB_EN_MASK;
-		ret = s5k6aa_write(c, REG_DBG_AUTOALG_EN, reg);
-	}
-
-	return ret;
-}
-
-/* Program FW with exposure time, 'exposure' in us units */
-static int s5k6aa_set_user_exposure(struct i2c_client *client, int exposure)
-{
-	unsigned int time = exposure / 10;
-
-	int ret = s5k6aa_write(client, REG_SF_USR_EXPOSURE_L, time & 0xffff);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_SF_USR_EXPOSURE_H, time >> 16);
-	if (ret)
-		return ret;
-	return s5k6aa_write(client, REG_SF_USR_EXPOSURE_CHG, 1);
-}
-
-static int s5k6aa_set_user_gain(struct i2c_client *client, int gain)
-{
-	int ret = s5k6aa_write(client, REG_SF_USR_TOT_GAIN, gain);
-	if (ret)
-		return ret;
-	return s5k6aa_write(client, REG_SF_USR_TOT_GAIN_CHG, 1);
-}
-
-/* Set auto/manual exposure and total gain */
-static int s5k6aa_set_auto_exposure(struct s5k6aa *s5k6aa, int value)
-{
-	struct i2c_client *c = v4l2_get_subdevdata(&s5k6aa->sd);
-	unsigned int exp_time = s5k6aa->ctrls.exposure->val;
-	u16 auto_alg;
-
-	int ret = s5k6aa_read(c, REG_DBG_AUTOALG_EN, &auto_alg);
-	if (ret)
-		return ret;
-
-	v4l2_dbg(1, debug, c, "man_exp: %d, auto_exp: %d, a_alg: 0x%x\n",
-		 exp_time, value, auto_alg);
-
-	if (value == V4L2_EXPOSURE_AUTO) {
-		auto_alg |= AALG_AE_EN_MASK | AALG_DIVLEI_EN_MASK;
-	} else {
-		ret = s5k6aa_set_user_exposure(c, exp_time);
-		if (ret)
-			return ret;
-		ret = s5k6aa_set_user_gain(c, s5k6aa->ctrls.gain->val);
-		if (ret)
-			return ret;
-		auto_alg &= ~(AALG_AE_EN_MASK | AALG_DIVLEI_EN_MASK);
-	}
-
-	return s5k6aa_write(c, REG_DBG_AUTOALG_EN, auto_alg);
-}
-
-static int s5k6aa_set_anti_flicker(struct s5k6aa *s5k6aa, int value)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	u16 auto_alg;
-	int ret;
-
-	ret = s5k6aa_read(client, REG_DBG_AUTOALG_EN, &auto_alg);
-	if (ret)
-		return ret;
-
-	if (value == V4L2_CID_POWER_LINE_FREQUENCY_AUTO) {
-		auto_alg |= AALG_FLICKER_EN_MASK;
-	} else {
-		auto_alg &= ~AALG_FLICKER_EN_MASK;
-		/* The V4L2_CID_LINE_FREQUENCY control values match
-		 * the register values */
-		ret = s5k6aa_write(client, REG_SF_FLICKER_QUANT, value);
-		if (ret)
-			return ret;
-		ret = s5k6aa_write(client, REG_SF_FLICKER_QUANT_CHG, 1);
-		if (ret)
-			return ret;
-	}
-
-	return s5k6aa_write(client, REG_DBG_AUTOALG_EN, auto_alg);
-}
-
-static int s5k6aa_set_colorfx(struct s5k6aa *s5k6aa, int val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	static const struct v4l2_control colorfx[] = {
-		{ V4L2_COLORFX_NONE,	 0 },
-		{ V4L2_COLORFX_BW,	 1 },
-		{ V4L2_COLORFX_NEGATIVE, 2 },
-		{ V4L2_COLORFX_SEPIA,	 3 },
-		{ V4L2_COLORFX_SKY_BLUE, 4 },
-		{ V4L2_COLORFX_SKETCH,	 5 },
-	};
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(colorfx); i++) {
-		if (colorfx[i].id == val)
-			return s5k6aa_write(client, REG_G_SPEC_EFFECTS,
-					    colorfx[i].value);
-	}
-	return -EINVAL;
-}
-
-static int s5k6aa_preview_config_status(struct i2c_client *client)
-{
-	u16 error = 0;
-	int ret = s5k6aa_read(client, REG_G_PREV_CFG_ERROR, &error);
-
-	v4l2_dbg(1, debug, client, "error: 0x%x (%d)\n", error, ret);
-	return ret ? ret : (error ? -EINVAL : 0);
-}
-
-static int s5k6aa_get_pixfmt_index(struct s5k6aa *s5k6aa,
-				   struct v4l2_mbus_framefmt *mf)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(s5k6aa_formats); i++)
-		if (mf->colorspace == s5k6aa_formats[i].colorspace &&
-		    mf->code == s5k6aa_formats[i].code)
-			return i;
-	return 0;
-}
-
-static int s5k6aa_set_output_framefmt(struct s5k6aa *s5k6aa,
-				      struct s5k6aa_preset *preset)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	int fmt_index = s5k6aa_get_pixfmt_index(s5k6aa, &preset->mbus_fmt);
-	int ret;
-
-	ret = s5k6aa_write(client, REG_P_OUT_WIDTH(preset->index),
-			   preset->mbus_fmt.width);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_OUT_HEIGHT(preset->index),
-				   preset->mbus_fmt.height);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_FMT(preset->index),
-				   s5k6aa_formats[fmt_index].reg_p_fmt);
-	return ret;
-}
-
-static int s5k6aa_set_input_params(struct s5k6aa *s5k6aa)
-{
-	struct i2c_client *c = v4l2_get_subdevdata(&s5k6aa->sd);
-	struct v4l2_rect *r = &s5k6aa->ccd_rect;
-	int ret;
-
-	ret = s5k6aa_write(c, REG_G_PREVZOOM_IN_WIDTH, r->width);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_G_PREVZOOM_IN_HEIGHT, r->height);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_G_PREVZOOM_IN_XOFFS, r->left);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_G_PREVZOOM_IN_YOFFS, r->top);
-	if (!ret)
-		ret = s5k6aa_write(c, REG_G_INPUTS_CHANGE_REQ, 1);
-	if (!ret)
-		s5k6aa->apply_crop = 0;
-
-	return ret;
-}
-
-/**
- * s5k6aa_configure_video_bus - configure the video output interface
- * @s5k6aa: pointer to &struct s5k6aa describing the device
- * @bus_type: video bus type: parallel or MIPI-CSI
- * @nlanes: number of MIPI lanes to be used (MIPI-CSI only)
- *
- * Note: Only parallel bus operation has been tested.
- */
-static int s5k6aa_configure_video_bus(struct s5k6aa *s5k6aa,
-				      enum v4l2_mbus_type bus_type, int nlanes)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	u16 cfg = 0;
-	int ret;
-
-	/*
-	 * TODO: The sensor is supposed to support BT.601 and BT.656
-	 * but there is nothing indicating how to switch between both
-	 * in the datasheet. For now default BT.601 interface is assumed.
-	 */
-	if (bus_type == V4L2_MBUS_CSI2_DPHY)
-		cfg = nlanes;
-	else if (bus_type != V4L2_MBUS_PARALLEL)
-		return -EINVAL;
-
-	ret = s5k6aa_write(client, REG_OIF_EN_MIPI_LANES, cfg);
-	if (ret)
-		return ret;
-	return s5k6aa_write(client, REG_OIF_CFG_CHG, 1);
-}
-
-/* This function should be called when switching to new user configuration set*/
-static int s5k6aa_new_config_sync(struct i2c_client *client, int timeout,
-				  int cid)
-{
-	unsigned long end = jiffies + msecs_to_jiffies(timeout);
-	u16 reg = 1;
-	int ret;
-
-	ret = s5k6aa_write(client, REG_G_ACTIVE_PREV_CFG, cid);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_G_PREV_CFG_CHG, 1);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_G_NEW_CFG_SYNC, 1);
-	if (timeout == 0)
-		return ret;
-
-	while (ret >= 0 && time_is_after_jiffies(end)) {
-		ret = s5k6aa_read(client, REG_G_NEW_CFG_SYNC, &reg);
-		if (!reg)
-			return 0;
-		usleep_range(1000, 5000);
-	}
-	return ret ? ret : -ETIMEDOUT;
-}
-
-/**
- * s5k6aa_set_prev_config - write user preview register set
- * @s5k6aa: pointer to &struct s5k6aa describing the device
- * @preset: s5kaa preset to be applied
- *
- * Configure output resolution and color format, pixel clock
- * frequency range, device frame rate type and frame period range.
- */
-static int s5k6aa_set_prev_config(struct s5k6aa *s5k6aa,
-				  struct s5k6aa_preset *preset)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	int idx = preset->index;
-	u16 frame_rate_q;
-	int ret;
-
-	if (s5k6aa->fiv->reg_fr_time >= S5K6AA_MAX_HIGHRES_FR_TIME)
-		frame_rate_q = FR_RATE_Q_BEST_FRRATE;
-	else
-		frame_rate_q = FR_RATE_Q_BEST_QUALITY;
-
-	ret = s5k6aa_set_output_framefmt(s5k6aa, preset);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_MAX_OUT_RATE(idx),
-				   s5k6aa->pclk_fmax);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_MIN_OUT_RATE(idx),
-				   s5k6aa->pclk_fmin);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_CLK_INDEX(idx),
-				   preset->clk_id);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_FR_RATE_TYPE(idx),
-				   FR_RATE_DYNAMIC);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_FR_RATE_Q_TYPE(idx),
-				   frame_rate_q);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_MAX_FR_TIME(idx),
-				   s5k6aa->fiv->reg_fr_time + 33);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_P_MIN_FR_TIME(idx),
-				   s5k6aa->fiv->reg_fr_time - 33);
-	if (!ret)
-		ret = s5k6aa_new_config_sync(client, 250, idx);
-	if (!ret)
-		ret = s5k6aa_preview_config_status(client);
-	if (!ret)
-		s5k6aa->apply_cfg = 0;
-
-	v4l2_dbg(1, debug, client, "Frame interval: %d +/- 3.3ms. (%d)\n",
-		 s5k6aa->fiv->reg_fr_time, ret);
-	return ret;
-}
-
-/**
- * s5k6aa_initialize_isp - basic ISP MCU initialization
- * @sd: pointer to V4L2 sub-device descriptor
- *
- * Configure AHB addresses for registers read/write; configure PLLs for
- * required output pixel clock. The ISP power supply needs to be already
- * enabled, with an optional H/W reset.
- * Locking: called with s5k6aa.lock mutex held.
- */
-static int s5k6aa_initialize_isp(struct v4l2_subdev *sd)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int ret;
-
-	s5k6aa->apply_crop = 1;
-	s5k6aa->apply_cfg = 1;
-	msleep(100);
-
-	ret = s5k6aa_set_ahb_address(client);
-	if (ret)
-		return ret;
-	ret = s5k6aa_configure_video_bus(s5k6aa, s5k6aa->bus_type,
-					 s5k6aa->mipi_lanes);
-	if (ret)
-		return ret;
-	ret = s5k6aa_write_array(sd, s5k6aa_analog_config);
-	if (ret)
-		return ret;
-	msleep(20);
-
-	return s5k6aa_configure_pixel_clocks(s5k6aa);
-}
-
-static int s5k6aa_gpio_set_value(struct s5k6aa *priv, int id, u32 val)
-{
-	if (!gpio_is_valid(priv->gpio[id].gpio))
-		return 0;
-	gpio_set_value(priv->gpio[id].gpio, !!val);
-	return 1;
-}
-
-static int s5k6aa_gpio_assert(struct s5k6aa *priv, int id)
-{
-	return s5k6aa_gpio_set_value(priv, id, priv->gpio[id].level);
-}
-
-static int s5k6aa_gpio_deassert(struct s5k6aa *priv, int id)
-{
-	return s5k6aa_gpio_set_value(priv, id, !priv->gpio[id].level);
-}
-
-static int __s5k6aa_power_on(struct s5k6aa *s5k6aa)
-{
-	int ret;
-
-	ret = regulator_bulk_enable(S5K6AA_NUM_SUPPLIES, s5k6aa->supplies);
-	if (ret)
-		return ret;
-	if (s5k6aa_gpio_deassert(s5k6aa, STBY))
-		usleep_range(150, 200);
-
-	if (s5k6aa->s_power)
-		ret = s5k6aa->s_power(1);
-	usleep_range(4000, 5000);
-
-	if (s5k6aa_gpio_deassert(s5k6aa, RSET))
-		msleep(20);
-
-	return ret;
-}
-
-static int __s5k6aa_power_off(struct s5k6aa *s5k6aa)
-{
-	int ret;
-
-	if (s5k6aa_gpio_assert(s5k6aa, RSET))
-		usleep_range(100, 150);
-
-	if (s5k6aa->s_power) {
-		ret = s5k6aa->s_power(0);
-		if (ret)
-			return ret;
-	}
-	if (s5k6aa_gpio_assert(s5k6aa, STBY))
-		usleep_range(50, 100);
-	s5k6aa->streaming = 0;
-
-	return regulator_bulk_disable(S5K6AA_NUM_SUPPLIES, s5k6aa->supplies);
-}
-
-/*
- * V4L2 subdev core and video operations
- */
-static int s5k6aa_set_power(struct v4l2_subdev *sd, int on)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int ret = 0;
-
-	mutex_lock(&s5k6aa->lock);
-
-	if (s5k6aa->power == !on) {
-		if (on) {
-			ret = __s5k6aa_power_on(s5k6aa);
-			if (!ret)
-				ret = s5k6aa_initialize_isp(sd);
-		} else {
-			ret = __s5k6aa_power_off(s5k6aa);
-		}
-
-		if (!ret)
-			s5k6aa->power += on ? 1 : -1;
-	}
-
-	mutex_unlock(&s5k6aa->lock);
-
-	if (!on || ret || s5k6aa->power != 1)
-		return ret;
-
-	return v4l2_ctrl_handler_setup(sd->ctrl_handler);
-}
-
-static int __s5k6aa_stream(struct s5k6aa *s5k6aa, int enable)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	int ret = 0;
-
-	ret = s5k6aa_write(client, REG_G_ENABLE_PREV, enable);
-	if (!ret)
-		ret = s5k6aa_write(client, REG_G_ENABLE_PREV_CHG, 1);
-	if (!ret)
-		s5k6aa->streaming = enable;
-
-	return ret;
-}
-
-static int s5k6aa_s_stream(struct v4l2_subdev *sd, int on)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int ret = 0;
-
-	mutex_lock(&s5k6aa->lock);
-
-	if (s5k6aa->streaming == !on) {
-		if (!ret && s5k6aa->apply_cfg)
-			ret = s5k6aa_set_prev_config(s5k6aa, s5k6aa->preset);
-		if (s5k6aa->apply_crop)
-			ret = s5k6aa_set_input_params(s5k6aa);
-		if (!ret)
-			ret = __s5k6aa_stream(s5k6aa, !!on);
-	}
-	mutex_unlock(&s5k6aa->lock);
-
-	return ret;
-}
-
-static int s5k6aa_g_frame_interval(struct v4l2_subdev *sd,
-				   struct v4l2_subdev_frame_interval *fi)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-
-	mutex_lock(&s5k6aa->lock);
-	fi->interval = s5k6aa->fiv->interval;
-	mutex_unlock(&s5k6aa->lock);
-
-	return 0;
-}
-
-static int __s5k6aa_set_frame_interval(struct s5k6aa *s5k6aa,
-				       struct v4l2_subdev_frame_interval *fi)
-{
-	struct v4l2_mbus_framefmt *mbus_fmt = &s5k6aa->preset->mbus_fmt;
-	const struct s5k6aa_interval *fiv = &s5k6aa_intervals[0];
-	unsigned int err, min_err = UINT_MAX;
-	unsigned int i, fr_time;
-
-	if (fi->interval.denominator == 0)
-		return -EINVAL;
-
-	fr_time = fi->interval.numerator * 10000 / fi->interval.denominator;
-
-	for (i = 0; i < ARRAY_SIZE(s5k6aa_intervals); i++) {
-		const struct s5k6aa_interval *iv = &s5k6aa_intervals[i];
-
-		if (mbus_fmt->width > iv->size.width ||
-		    mbus_fmt->height > iv->size.height)
-			continue;
-
-		err = abs(iv->reg_fr_time - fr_time);
-		if (err < min_err) {
-			fiv = iv;
-			min_err = err;
-		}
-	}
-	s5k6aa->fiv = fiv;
-
-	v4l2_dbg(1, debug, &s5k6aa->sd, "Changed frame interval to %d us\n",
-		 fiv->reg_fr_time * 100);
-	return 0;
-}
-
-static int s5k6aa_s_frame_interval(struct v4l2_subdev *sd,
-				   struct v4l2_subdev_frame_interval *fi)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int ret;
-
-	v4l2_dbg(1, debug, sd, "Setting %d/%d frame interval\n",
-		 fi->interval.numerator, fi->interval.denominator);
-
-	mutex_lock(&s5k6aa->lock);
-	ret = __s5k6aa_set_frame_interval(s5k6aa, fi);
-	s5k6aa->apply_cfg = 1;
-
-	mutex_unlock(&s5k6aa->lock);
-	return ret;
-}
-
-/*
- * V4L2 subdev pad level and video operations
- */
-static int s5k6aa_enum_frame_interval(struct v4l2_subdev *sd,
-			      struct v4l2_subdev_state *sd_state,
-			      struct v4l2_subdev_frame_interval_enum *fie)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	const struct s5k6aa_interval *fi;
-	int ret = 0;
-
-	if (fie->index >= ARRAY_SIZE(s5k6aa_intervals))
-		return -EINVAL;
-
-	v4l_bound_align_image(&fie->width, S5K6AA_WIN_WIDTH_MIN,
-			      S5K6AA_WIN_WIDTH_MAX, 1,
-			      &fie->height, S5K6AA_WIN_HEIGHT_MIN,
-			      S5K6AA_WIN_HEIGHT_MAX, 1, 0);
-
-	mutex_lock(&s5k6aa->lock);
-	fi = &s5k6aa_intervals[fie->index];
-	if (fie->width > fi->size.width || fie->height > fi->size.height)
-		ret = -EINVAL;
-	else
-		fie->interval = fi->interval;
-	mutex_unlock(&s5k6aa->lock);
-
-	return ret;
-}
-
-static int s5k6aa_enum_mbus_code(struct v4l2_subdev *sd,
-				 struct v4l2_subdev_state *sd_state,
-				 struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (code->index >= ARRAY_SIZE(s5k6aa_formats))
-		return -EINVAL;
-
-	code->code = s5k6aa_formats[code->index].code;
-	return 0;
-}
-
-static int s5k6aa_enum_frame_size(struct v4l2_subdev *sd,
-				  struct v4l2_subdev_state *sd_state,
-				  struct v4l2_subdev_frame_size_enum *fse)
-{
-	int i = ARRAY_SIZE(s5k6aa_formats);
-
-	if (fse->index > 0)
-		return -EINVAL;
-
-	while (--i)
-		if (fse->code == s5k6aa_formats[i].code)
-			break;
-
-	fse->code = s5k6aa_formats[i].code;
-	fse->min_width  = S5K6AA_WIN_WIDTH_MIN;
-	fse->max_width  = S5K6AA_WIN_WIDTH_MAX;
-	fse->max_height = S5K6AA_WIN_HEIGHT_MIN;
-	fse->min_height = S5K6AA_WIN_HEIGHT_MAX;
-
-	return 0;
-}
-
-static struct v4l2_rect *
-__s5k6aa_get_crop_rect(struct s5k6aa *s5k6aa,
-		       struct v4l2_subdev_state *sd_state,
-		       enum v4l2_subdev_format_whence which)
-{
-	if (which == V4L2_SUBDEV_FORMAT_ACTIVE)
-		return &s5k6aa->ccd_rect;
-
-	WARN_ON(which != V4L2_SUBDEV_FORMAT_TRY);
-	return v4l2_subdev_get_try_crop(&s5k6aa->sd, sd_state, 0);
-}
-
-static void s5k6aa_try_format(struct s5k6aa *s5k6aa,
-			      struct v4l2_mbus_framefmt *mf)
-{
-	unsigned int index;
-
-	v4l_bound_align_image(&mf->width, S5K6AA_WIN_WIDTH_MIN,
-			      S5K6AA_WIN_WIDTH_MAX, 1,
-			      &mf->height, S5K6AA_WIN_HEIGHT_MIN,
-			      S5K6AA_WIN_HEIGHT_MAX, 1, 0);
-
-	if (mf->colorspace != V4L2_COLORSPACE_JPEG &&
-	    mf->colorspace != V4L2_COLORSPACE_REC709)
-		mf->colorspace = V4L2_COLORSPACE_JPEG;
-
-	index = s5k6aa_get_pixfmt_index(s5k6aa, mf);
-
-	mf->colorspace	= s5k6aa_formats[index].colorspace;
-	mf->code	= s5k6aa_formats[index].code;
-	mf->field	= V4L2_FIELD_NONE;
-}
-
-static int s5k6aa_get_fmt(struct v4l2_subdev *sd,
-			  struct v4l2_subdev_state *sd_state,
-			  struct v4l2_subdev_format *fmt)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	struct v4l2_mbus_framefmt *mf;
-
-	memset(fmt->reserved, 0, sizeof(fmt->reserved));
-
-	if (fmt->which == V4L2_SUBDEV_FORMAT_TRY) {
-		mf = v4l2_subdev_get_try_format(sd, sd_state, 0);
-		fmt->format = *mf;
-		return 0;
-	}
-
-	mutex_lock(&s5k6aa->lock);
-	fmt->format = s5k6aa->preset->mbus_fmt;
-	mutex_unlock(&s5k6aa->lock);
-
-	return 0;
-}
-
-static int s5k6aa_set_fmt(struct v4l2_subdev *sd,
-			  struct v4l2_subdev_state *sd_state,
-			  struct v4l2_subdev_format *fmt)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	struct s5k6aa_preset *preset = s5k6aa->preset;
-	struct v4l2_mbus_framefmt *mf;
-	struct v4l2_rect *crop;
-	int ret = 0;
-
-	mutex_lock(&s5k6aa->lock);
-	s5k6aa_try_format(s5k6aa, &fmt->format);
-
-	if (fmt->which == V4L2_SUBDEV_FORMAT_TRY) {
-		mf = v4l2_subdev_get_try_format(sd, sd_state, fmt->pad);
-		crop = v4l2_subdev_get_try_crop(sd, sd_state, 0);
-	} else {
-		if (s5k6aa->streaming) {
-			ret = -EBUSY;
-		} else {
-			mf = &preset->mbus_fmt;
-			crop = &s5k6aa->ccd_rect;
-			s5k6aa->apply_cfg = 1;
-		}
-	}
-
-	if (ret == 0) {
-		struct v4l2_subdev_frame_interval fiv = {
-			.interval = {0, 1}
-		};
-
-		*mf = fmt->format;
-		/*
-		 * Make sure the crop window is valid, i.e. its size is
-		 * greater than the output window, as the ISP supports
-		 * only down-scaling.
-		 */
-		crop->width = clamp_t(unsigned int, crop->width, mf->width,
-				      S5K6AA_WIN_WIDTH_MAX);
-		crop->height = clamp_t(unsigned int, crop->height, mf->height,
-				       S5K6AA_WIN_HEIGHT_MAX);
-		crop->left = clamp_t(unsigned int, crop->left, 0,
-				     S5K6AA_WIN_WIDTH_MAX - crop->width);
-		crop->top  = clamp_t(unsigned int, crop->top, 0,
-				     S5K6AA_WIN_HEIGHT_MAX - crop->height);
-
-		/* Reset to minimum possible frame interval */
-		ret = __s5k6aa_set_frame_interval(s5k6aa, &fiv);
-	}
-	mutex_unlock(&s5k6aa->lock);
-
-	return ret;
-}
-
-static int s5k6aa_get_selection(struct v4l2_subdev *sd,
-				struct v4l2_subdev_state *sd_state,
-				struct v4l2_subdev_selection *sel)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	struct v4l2_rect *rect;
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	memset(sel->reserved, 0, sizeof(sel->reserved));
-
-	mutex_lock(&s5k6aa->lock);
-	rect = __s5k6aa_get_crop_rect(s5k6aa, sd_state, sel->which);
-	sel->r = *rect;
-	mutex_unlock(&s5k6aa->lock);
-
-	v4l2_dbg(1, debug, sd, "Current crop rectangle: (%d,%d)/%dx%d\n",
-		 rect->left, rect->top, rect->width, rect->height);
-
-	return 0;
-}
-
-static int s5k6aa_set_selection(struct v4l2_subdev *sd,
-				struct v4l2_subdev_state *sd_state,
-				struct v4l2_subdev_selection *sel)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	struct v4l2_mbus_framefmt *mf;
-	unsigned int max_x, max_y;
-	struct v4l2_rect *crop_r;
-
-	if (sel->target != V4L2_SEL_TGT_CROP)
-		return -EINVAL;
-
-	mutex_lock(&s5k6aa->lock);
-	crop_r = __s5k6aa_get_crop_rect(s5k6aa, sd_state, sel->which);
-
-	if (sel->which == V4L2_SUBDEV_FORMAT_ACTIVE) {
-		mf = &s5k6aa->preset->mbus_fmt;
-		s5k6aa->apply_crop = 1;
-	} else {
-		mf = v4l2_subdev_get_try_format(sd, sd_state, 0);
-	}
-	v4l_bound_align_image(&sel->r.width, mf->width,
-			      S5K6AA_WIN_WIDTH_MAX, 1,
-			      &sel->r.height, mf->height,
-			      S5K6AA_WIN_HEIGHT_MAX, 1, 0);
-
-	max_x = (S5K6AA_WIN_WIDTH_MAX - sel->r.width) & ~1;
-	max_y = (S5K6AA_WIN_HEIGHT_MAX - sel->r.height) & ~1;
-
-	sel->r.left = clamp_t(unsigned int, sel->r.left, 0, max_x);
-	sel->r.top  = clamp_t(unsigned int, sel->r.top, 0, max_y);
-
-	*crop_r = sel->r;
-
-	mutex_unlock(&s5k6aa->lock);
-
-	v4l2_dbg(1, debug, sd, "Set crop rectangle: (%d,%d)/%dx%d\n",
-		 crop_r->left, crop_r->top, crop_r->width, crop_r->height);
-
-	return 0;
-}
-
-static const struct v4l2_subdev_pad_ops s5k6aa_pad_ops = {
-	.enum_mbus_code		= s5k6aa_enum_mbus_code,
-	.enum_frame_size	= s5k6aa_enum_frame_size,
-	.enum_frame_interval	= s5k6aa_enum_frame_interval,
-	.get_fmt		= s5k6aa_get_fmt,
-	.set_fmt		= s5k6aa_set_fmt,
-	.get_selection		= s5k6aa_get_selection,
-	.set_selection		= s5k6aa_set_selection,
-};
-
-static const struct v4l2_subdev_video_ops s5k6aa_video_ops = {
-	.g_frame_interval	= s5k6aa_g_frame_interval,
-	.s_frame_interval	= s5k6aa_s_frame_interval,
-	.s_stream		= s5k6aa_s_stream,
-};
-
-/*
- * V4L2 subdev controls
- */
-
-static int s5k6aa_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct v4l2_subdev *sd = ctrl_to_sd(ctrl);
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int idx, err = 0;
-
-	v4l2_dbg(1, debug, sd, "ctrl: 0x%x, value: %d\n", ctrl->id, ctrl->val);
-
-	mutex_lock(&s5k6aa->lock);
-	/*
-	 * If the device is not powered up by the host driver do
-	 * not apply any controls to H/W at this time. Instead
-	 * the controls will be restored right after power-up.
-	 */
-	if (s5k6aa->power == 0)
-		goto unlock;
-	idx = s5k6aa->preset->index;
-
-	switch (ctrl->id) {
-	case V4L2_CID_AUTO_WHITE_BALANCE:
-		err = s5k6aa_set_awb(s5k6aa, ctrl->val);
-		break;
-
-	case V4L2_CID_BRIGHTNESS:
-		err = s5k6aa_write(client, REG_USER_BRIGHTNESS, ctrl->val);
-		break;
-
-	case V4L2_CID_COLORFX:
-		err = s5k6aa_set_colorfx(s5k6aa, ctrl->val);
-		break;
-
-	case V4L2_CID_CONTRAST:
-		err = s5k6aa_write(client, REG_USER_CONTRAST, ctrl->val);
-		break;
-
-	case V4L2_CID_EXPOSURE_AUTO:
-		err = s5k6aa_set_auto_exposure(s5k6aa, ctrl->val);
-		break;
-
-	case V4L2_CID_HFLIP:
-		err = s5k6aa_set_mirror(s5k6aa, ctrl->val);
-		if (err)
-			break;
-		err = s5k6aa_write(client, REG_G_PREV_CFG_CHG, 1);
-		break;
-
-	case V4L2_CID_POWER_LINE_FREQUENCY:
-		err = s5k6aa_set_anti_flicker(s5k6aa, ctrl->val);
-		break;
-
-	case V4L2_CID_SATURATION:
-		err = s5k6aa_write(client, REG_USER_SATURATION, ctrl->val);
-		break;
-
-	case V4L2_CID_SHARPNESS:
-		err = s5k6aa_write(client, REG_USER_SHARPBLUR, ctrl->val);
-		break;
-
-	case V4L2_CID_WHITE_BALANCE_TEMPERATURE:
-		err = s5k6aa_write(client, REG_P_COLORTEMP(idx), ctrl->val);
-		if (err)
-			break;
-		err = s5k6aa_write(client, REG_G_PREV_CFG_CHG, 1);
-		break;
-	}
-unlock:
-	mutex_unlock(&s5k6aa->lock);
-	return err;
-}
-
-static const struct v4l2_ctrl_ops s5k6aa_ctrl_ops = {
-	.s_ctrl	= s5k6aa_s_ctrl,
-};
-
-static int s5k6aa_log_status(struct v4l2_subdev *sd)
-{
-	v4l2_ctrl_handler_log_status(sd->ctrl_handler, sd->name);
-	return 0;
-}
-
-#define V4L2_CID_RED_GAIN	(V4L2_CTRL_CLASS_CAMERA | 0x1001)
-#define V4L2_CID_GREEN_GAIN	(V4L2_CTRL_CLASS_CAMERA | 0x1002)
-#define V4L2_CID_BLUE_GAIN	(V4L2_CTRL_CLASS_CAMERA | 0x1003)
-
-static const struct v4l2_ctrl_config s5k6aa_ctrls[] = {
-	{
-		.ops	= &s5k6aa_ctrl_ops,
-		.id	= V4L2_CID_RED_GAIN,
-		.type	= V4L2_CTRL_TYPE_INTEGER,
-		.name	= "Gain, Red",
-		.min	= 0,
-		.max	= 256,
-		.def	= 127,
-		.step	= 1,
-	}, {
-		.ops	= &s5k6aa_ctrl_ops,
-		.id	= V4L2_CID_GREEN_GAIN,
-		.type	= V4L2_CTRL_TYPE_INTEGER,
-		.name	= "Gain, Green",
-		.min	= 0,
-		.max	= 256,
-		.def	= 127,
-		.step	= 1,
-	}, {
-		.ops	= &s5k6aa_ctrl_ops,
-		.id	= V4L2_CID_BLUE_GAIN,
-		.type	= V4L2_CTRL_TYPE_INTEGER,
-		.name	= "Gain, Blue",
-		.min	= 0,
-		.max	= 256,
-		.def	= 127,
-		.step	= 1,
-	},
-};
-
-static int s5k6aa_initialize_ctrls(struct s5k6aa *s5k6aa)
-{
-	const struct v4l2_ctrl_ops *ops = &s5k6aa_ctrl_ops;
-	struct s5k6aa_ctrls *ctrls = &s5k6aa->ctrls;
-	struct v4l2_ctrl_handler *hdl = &ctrls->handler;
-
-	int ret = v4l2_ctrl_handler_init(hdl, 16);
-	if (ret)
-		return ret;
-	/* Auto white balance cluster */
-	ctrls->awb = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_AUTO_WHITE_BALANCE,
-				       0, 1, 1, 1);
-	ctrls->gain_red = v4l2_ctrl_new_custom(hdl, &s5k6aa_ctrls[0], NULL);
-	ctrls->gain_green = v4l2_ctrl_new_custom(hdl, &s5k6aa_ctrls[1], NULL);
-	ctrls->gain_blue = v4l2_ctrl_new_custom(hdl, &s5k6aa_ctrls[2], NULL);
-	v4l2_ctrl_auto_cluster(4, &ctrls->awb, 0, false);
-
-	ctrls->hflip = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_HFLIP, 0, 1, 1, 0);
-	ctrls->vflip = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_VFLIP, 0, 1, 1, 0);
-	v4l2_ctrl_cluster(2, &ctrls->hflip);
-
-	ctrls->auto_exp = v4l2_ctrl_new_std_menu(hdl, ops,
-				V4L2_CID_EXPOSURE_AUTO,
-				V4L2_EXPOSURE_MANUAL, 0, V4L2_EXPOSURE_AUTO);
-	/* Exposure time: x 1 us */
-	ctrls->exposure = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_EXPOSURE,
-					    0, 6000000U, 1, 100000U);
-	/* Total gain: 256 <=> 1x */
-	ctrls->gain = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_GAIN,
-					0, 256, 1, 256);
-	v4l2_ctrl_auto_cluster(3, &ctrls->auto_exp, 0, false);
-
-	v4l2_ctrl_new_std_menu(hdl, ops, V4L2_CID_POWER_LINE_FREQUENCY,
-			       V4L2_CID_POWER_LINE_FREQUENCY_AUTO, 0,
-			       V4L2_CID_POWER_LINE_FREQUENCY_AUTO);
-
-	v4l2_ctrl_new_std_menu(hdl, ops, V4L2_CID_COLORFX,
-			       V4L2_COLORFX_SKY_BLUE, ~0x6f, V4L2_COLORFX_NONE);
-
-	v4l2_ctrl_new_std(hdl, ops, V4L2_CID_WHITE_BALANCE_TEMPERATURE,
-			  0, 256, 1, 0);
-
-	v4l2_ctrl_new_std(hdl, ops, V4L2_CID_SATURATION, -127, 127, 1, 0);
-	v4l2_ctrl_new_std(hdl, ops, V4L2_CID_BRIGHTNESS, -127, 127, 1, 0);
-	v4l2_ctrl_new_std(hdl, ops, V4L2_CID_CONTRAST, -127, 127, 1, 0);
-	v4l2_ctrl_new_std(hdl, ops, V4L2_CID_SHARPNESS, -127, 127, 1, 0);
-
-	if (hdl->error) {
-		ret = hdl->error;
-		v4l2_ctrl_handler_free(hdl);
-		return ret;
-	}
-
-	s5k6aa->sd.ctrl_handler = hdl;
-	return 0;
-}
-
-/*
- * V4L2 subdev internal operations
- */
-static int s5k6aa_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
-{
-	struct v4l2_mbus_framefmt *format = v4l2_subdev_get_try_format(sd,
-								       fh->state,
-								       0);
-	struct v4l2_rect *crop = v4l2_subdev_get_try_crop(sd, fh->state, 0);
-
-	format->colorspace = s5k6aa_formats[0].colorspace;
-	format->code = s5k6aa_formats[0].code;
-	format->width = S5K6AA_OUT_WIDTH_DEF;
-	format->height = S5K6AA_OUT_HEIGHT_DEF;
-	format->field = V4L2_FIELD_NONE;
-
-	crop->width = S5K6AA_WIN_WIDTH_MAX;
-	crop->height = S5K6AA_WIN_HEIGHT_MAX;
-	crop->left = 0;
-	crop->top = 0;
-
-	return 0;
-}
-
-static int s5k6aa_check_fw_revision(struct s5k6aa *s5k6aa)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	u16 api_ver = 0, fw_rev = 0;
-
-	int ret = s5k6aa_set_ahb_address(client);
-
-	if (!ret)
-		ret = s5k6aa_read(client, REG_FW_APIVER, &api_ver);
-	if (!ret)
-		ret = s5k6aa_read(client, REG_FW_REVISION, &fw_rev);
-	if (ret) {
-		v4l2_err(&s5k6aa->sd, "FW revision check failed!\n");
-		return ret;
-	}
-
-	v4l2_info(&s5k6aa->sd, "FW API ver.: 0x%X, FW rev.: 0x%X\n",
-		  api_ver, fw_rev);
-
-	return api_ver == S5K6AAFX_FW_APIVER ? 0 : -ENODEV;
-}
-
-static int s5k6aa_registered(struct v4l2_subdev *sd)
-{
-	struct s5k6aa *s5k6aa = to_s5k6aa(sd);
-	int ret;
-
-	mutex_lock(&s5k6aa->lock);
-	ret = __s5k6aa_power_on(s5k6aa);
-	if (!ret) {
-		msleep(100);
-		ret = s5k6aa_check_fw_revision(s5k6aa);
-		__s5k6aa_power_off(s5k6aa);
-	}
-	mutex_unlock(&s5k6aa->lock);
-
-	return ret;
-}
-
-static const struct v4l2_subdev_internal_ops s5k6aa_subdev_internal_ops = {
-	.registered = s5k6aa_registered,
-	.open = s5k6aa_open,
-};
-
-static const struct v4l2_subdev_core_ops s5k6aa_core_ops = {
-	.s_power = s5k6aa_set_power,
-	.log_status = s5k6aa_log_status,
-};
-
-static const struct v4l2_subdev_ops s5k6aa_subdev_ops = {
-	.core = &s5k6aa_core_ops,
-	.pad = &s5k6aa_pad_ops,
-	.video = &s5k6aa_video_ops,
-};
-
-/*
- * GPIO setup
- */
-
-static int s5k6aa_configure_gpios(struct s5k6aa *s5k6aa,
-				  const struct s5k6aa_platform_data *pdata)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(&s5k6aa->sd);
-	const struct s5k6aa_gpio *gpio;
-	unsigned long flags;
-	int ret;
-
-	s5k6aa->gpio[STBY].gpio = -EINVAL;
-	s5k6aa->gpio[RSET].gpio  = -EINVAL;
-
-	gpio = &pdata->gpio_stby;
-	if (gpio_is_valid(gpio->gpio)) {
-		flags = (gpio->level ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW)
-		      | GPIOF_EXPORT;
-		ret = devm_gpio_request_one(&client->dev, gpio->gpio, flags,
-					    "S5K6AA_STBY");
-		if (ret < 0)
-			return ret;
-
-		s5k6aa->gpio[STBY] = *gpio;
-	}
-
-	gpio = &pdata->gpio_reset;
-	if (gpio_is_valid(gpio->gpio)) {
-		flags = (gpio->level ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW)
-		      | GPIOF_EXPORT;
-		ret = devm_gpio_request_one(&client->dev, gpio->gpio, flags,
-					    "S5K6AA_RST");
-		if (ret < 0)
-			return ret;
-
-		s5k6aa->gpio[RSET] = *gpio;
-	}
-
-	return 0;
-}
-
-static int s5k6aa_probe(struct i2c_client *client)
-{
-	const struct s5k6aa_platform_data *pdata = client->dev.platform_data;
-	struct v4l2_subdev *sd;
-	struct s5k6aa *s5k6aa;
-	int i, ret;
-
-	if (pdata == NULL) {
-		dev_err(&client->dev, "Platform data not specified\n");
-		return -EINVAL;
-	}
-
-	if (pdata->mclk_frequency == 0) {
-		dev_err(&client->dev, "MCLK frequency not specified\n");
-		return -EINVAL;
-	}
-
-	s5k6aa = devm_kzalloc(&client->dev, sizeof(*s5k6aa), GFP_KERNEL);
-	if (!s5k6aa)
-		return -ENOMEM;
-
-	mutex_init(&s5k6aa->lock);
-
-	s5k6aa->mclk_frequency = pdata->mclk_frequency;
-	s5k6aa->bus_type = pdata->bus_type;
-	s5k6aa->mipi_lanes = pdata->nlanes;
-	s5k6aa->s_power	= pdata->set_power;
-	s5k6aa->inv_hflip = pdata->horiz_flip;
-	s5k6aa->inv_vflip = pdata->vert_flip;
-
-	sd = &s5k6aa->sd;
-	v4l2_i2c_subdev_init(sd, client, &s5k6aa_subdev_ops);
-	/* Static name; NEVER use in new drivers! */
-	strscpy(sd->name, DRIVER_NAME, sizeof(sd->name));
-
-	sd->internal_ops = &s5k6aa_subdev_internal_ops;
-	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
-
-	s5k6aa->pad.flags = MEDIA_PAD_FL_SOURCE;
-	sd->entity.function = MEDIA_ENT_F_CAM_SENSOR;
-	ret = media_entity_pads_init(&sd->entity, 1, &s5k6aa->pad);
-	if (ret)
-		return ret;
-
-	ret = s5k6aa_configure_gpios(s5k6aa, pdata);
-	if (ret)
-		goto out_err;
-
-	for (i = 0; i < S5K6AA_NUM_SUPPLIES; i++)
-		s5k6aa->supplies[i].supply = s5k6aa_supply_names[i];
-
-	ret = devm_regulator_bulk_get(&client->dev, S5K6AA_NUM_SUPPLIES,
-				 s5k6aa->supplies);
-	if (ret) {
-		dev_err(&client->dev, "Failed to get regulators\n");
-		goto out_err;
-	}
-
-	ret = s5k6aa_initialize_ctrls(s5k6aa);
-	if (ret)
-		goto out_err;
-
-	s5k6aa_presets_data_init(s5k6aa);
-
-	s5k6aa->ccd_rect.width = S5K6AA_WIN_WIDTH_MAX;
-	s5k6aa->ccd_rect.height	= S5K6AA_WIN_HEIGHT_MAX;
-	s5k6aa->ccd_rect.left = 0;
-	s5k6aa->ccd_rect.top = 0;
-
-	return 0;
-
-out_err:
-	media_entity_cleanup(&s5k6aa->sd.entity);
-	return ret;
-}
-
-static void s5k6aa_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *sd = i2c_get_clientdata(client);
-
-	v4l2_device_unregister_subdev(sd);
-	v4l2_ctrl_handler_free(sd->ctrl_handler);
-	media_entity_cleanup(&sd->entity);
-}
-
-static const struct i2c_device_id s5k6aa_id[] = {
-	{ DRIVER_NAME, 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, s5k6aa_id);
-
-
-static struct i2c_driver s5k6aa_i2c_driver = {
-	.driver = {
-		.name = DRIVER_NAME
-	},
-	.probe_new	= s5k6aa_probe,
-	.remove		= s5k6aa_remove,
-	.id_table	= s5k6aa_id,
-};
-
-module_i2c_driver(s5k6aa_i2c_driver);
-
-MODULE_DESCRIPTION("Samsung S5K6AA(FX) SXGA camera driver");
-MODULE_AUTHOR("Sylwester Nawrocki <s.nawrocki@samsung.com>");
-MODULE_LICENSE("GPL");
diff --git a/include/media/i2c/s5k6aa.h b/include/media/i2c/s5k6aa.h
deleted file mode 100644
index eb3444d8b731..000000000000
--- a/include/media/i2c/s5k6aa.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * S5K6AAFX camera sensor driver header
- *
- * Copyright (C) 2011 Samsung Electronics Co., Ltd.
- */
-
-#ifndef S5K6AA_H
-#define S5K6AA_H
-
-#include <media/v4l2-mediabus.h>
-
-/**
- * struct s5k6aa_gpio - data structure describing a GPIO
- * @gpio:  GPIO number
- * @level: indicates active state of the @gpio
- */
-struct s5k6aa_gpio {
-	int gpio;
-	int level;
-};
-
-/**
- * struct s5k6aa_platform_data - s5k6aa driver platform data
- * @set_power:   an additional callback to the board code, called
- *               after enabling the regulators and before switching
- *               the sensor off
- * @mclk_frequency: sensor's master clock frequency in Hz
- * @gpio_reset:  GPIO driving RESET pin
- * @gpio_stby:   GPIO driving STBY pin
- * @bus_type:    bus type
- * @nlanes:      maximum number of MIPI-CSI lanes used
- * @horiz_flip:  default horizontal image flip value, non zero to enable
- * @vert_flip:   default vertical image flip value, non zero to enable
- */
-
-struct s5k6aa_platform_data {
-	int (*set_power)(int enable);
-	unsigned long mclk_frequency;
-	struct s5k6aa_gpio gpio_reset;
-	struct s5k6aa_gpio gpio_stby;
-	enum v4l2_mbus_type bus_type;
-	u8 nlanes;
-	u8 horiz_flip;
-	u8 vert_flip;
-};
-
-#endif /* S5K6AA_H */
-- 
cgit v1.2.3


From 6a692b059c821887a6277a294c2788bf99a47a08 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 25 Jan 2023 18:29:48 +0000
Subject: media: i2c: Drop unused sr030pc30 camera sensor driver

The sr030pc30 camera sensor driver doesn't support DT and relies on
platform data. No board file has ever provided platform data for that
device. The driver has thus never been used in the mainline kernel since
its introduction in v2.6.37. Drop it.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/admin-guide/media/i2c-cardlist.rst |   1 -
 drivers/media/i2c/Kconfig                        |   6 -
 drivers/media/i2c/Makefile                       |   1 -
 drivers/media/i2c/sr030pc30.c                    | 762 -----------------------
 include/media/i2c/sr030pc30.h                    |  17 -
 5 files changed, 787 deletions(-)
 delete mode 100644 drivers/media/i2c/sr030pc30.c
 delete mode 100644 include/media/i2c/sr030pc30.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index b9a3a561183f..ada06fd8a377 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -105,7 +105,6 @@ s5c73m3       Samsung S5C73M3 sensor
 s5k4ecgx      Samsung S5K4ECGX sensor
 s5k5baf       Samsung S5K5BAF sensor
 s5k6a3        Samsung S5K6A3 sensor
-sr030pc30     Siliconfile SR030PC30 sensor
 vs6624        ST VS6624 sensor
 ============  ==========================================================
 
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index a7f2a8ba2fd3..d326399b8a48 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -784,12 +784,6 @@ config VIDEO_S5K6A3
 	  This is a V4L2 sensor driver for Samsung S5K6A3 raw
 	  camera sensor.
 
-config VIDEO_SR030PC30
-	tristate "Siliconfile SR030PC30 sensor support"
-	depends on I2C && VIDEO_DEV
-	help
-	  This driver supports SR030PC30 VGA camera from Siliconfile
-
 config VIDEO_ST_VGXY61
 	tristate "ST VGXY61 sensor support"
 	depends on OF && GPIOLIB && VIDEO_DEV && I2C
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 6bc6c4682132..035a96f8ffaf 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -113,7 +113,6 @@ obj-$(CONFIG_VIDEO_SAA7127) += saa7127.o
 obj-$(CONFIG_VIDEO_SAA717X) += saa717x.o
 obj-$(CONFIG_VIDEO_SAA7185) += saa7185.o
 obj-$(CONFIG_VIDEO_SONY_BTF_MPX) += sony-btf-mpx.o
-obj-$(CONFIG_VIDEO_SR030PC30) += sr030pc30.o
 obj-$(CONFIG_VIDEO_ST_MIPID02) += st-mipid02.o
 obj-$(CONFIG_VIDEO_ST_VGXY61) += st-vgxy61.o
 obj-$(CONFIG_VIDEO_TC358743) += tc358743.o
diff --git a/drivers/media/i2c/sr030pc30.c b/drivers/media/i2c/sr030pc30.c
deleted file mode 100644
index a83c8bf1c5dd..000000000000
--- a/drivers/media/i2c/sr030pc30.c
+++ /dev/null
@@ -1,762 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for SiliconFile SR030PC30 VGA (1/10-Inch) Image Sensor with ISP
- *
- * Copyright (C) 2010 Samsung Electronics Co., Ltd
- * Author: Sylwester Nawrocki, s.nawrocki@samsung.com
- *
- * Based on original driver authored by Dongsoo Nathaniel Kim
- * and HeungJun Kim <riverful.kim@samsung.com>.
- *
- * Based on mt9v011 Micron Digital Image Sensor driver
- * Copyright (c) 2009 Mauro Carvalho Chehab
- */
-
-#include <linux/i2c.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <media/v4l2-device.h>
-#include <media/v4l2-subdev.h>
-#include <media/v4l2-mediabus.h>
-#include <media/v4l2-ctrls.h>
-#include <media/i2c/sr030pc30.h>
-
-static int debug;
-module_param(debug, int, 0644);
-
-#define MODULE_NAME	"SR030PC30"
-
-/*
- * Register offsets within a page
- * b15..b8 - page id, b7..b0 - register address
- */
-#define POWER_CTRL_REG		0x0001
-#define PAGEMODE_REG		0x03
-#define DEVICE_ID_REG		0x0004
-#define NOON010PC30_ID		0x86
-#define SR030PC30_ID		0x8C
-#define VDO_CTL1_REG		0x0010
-#define SUBSAMPL_NONE_VGA	0
-#define SUBSAMPL_QVGA		0x10
-#define SUBSAMPL_QQVGA		0x20
-#define VDO_CTL2_REG		0x0011
-#define SYNC_CTL_REG		0x0012
-#define WIN_ROWH_REG		0x0020
-#define WIN_ROWL_REG		0x0021
-#define WIN_COLH_REG		0x0022
-#define WIN_COLL_REG		0x0023
-#define WIN_HEIGHTH_REG		0x0024
-#define WIN_HEIGHTL_REG		0x0025
-#define WIN_WIDTHH_REG		0x0026
-#define WIN_WIDTHL_REG		0x0027
-#define HBLANKH_REG		0x0040
-#define HBLANKL_REG		0x0041
-#define VSYNCH_REG		0x0042
-#define VSYNCL_REG		0x0043
-/* page 10 */
-#define ISP_CTL_REG(n)		(0x1010 + (n))
-#define YOFS_REG		0x1040
-#define DARK_YOFS_REG		0x1041
-#define AG_ABRTH_REG		0x1050
-#define SAT_CTL_REG		0x1060
-#define BSAT_REG		0x1061
-#define RSAT_REG		0x1062
-#define AG_SAT_TH_REG		0x1063
-/* page 11 */
-#define ZLPF_CTRL_REG		0x1110
-#define ZLPF_CTRL2_REG		0x1112
-#define ZLPF_AGH_THR_REG	0x1121
-#define ZLPF_THR_REG		0x1160
-#define ZLPF_DYN_THR_REG	0x1160
-/* page 12 */
-#define YCLPF_CTL1_REG		0x1240
-#define YCLPF_CTL2_REG		0x1241
-#define YCLPF_THR_REG		0x1250
-#define BLPF_CTL_REG		0x1270
-#define BLPF_THR1_REG		0x1274
-#define BLPF_THR2_REG		0x1275
-/* page 14 - Lens Shading Compensation */
-#define LENS_CTRL_REG		0x1410
-#define LENS_XCEN_REG		0x1420
-#define LENS_YCEN_REG		0x1421
-#define LENS_R_COMP_REG		0x1422
-#define LENS_G_COMP_REG		0x1423
-#define LENS_B_COMP_REG		0x1424
-/* page 15 - Color correction */
-#define CMC_CTL_REG		0x1510
-#define CMC_OFSGH_REG		0x1514
-#define CMC_OFSGL_REG		0x1516
-#define CMC_SIGN_REG		0x1517
-/* Color correction coefficients */
-#define CMC_COEF_REG(n)		(0x1530 + (n))
-/* Color correction offset coefficients */
-#define CMC_OFS_REG(n)		(0x1540 + (n))
-/* page 16 - Gamma correction */
-#define GMA_CTL_REG		0x1610
-/* Gamma correction coefficients 0.14 */
-#define GMA_COEF_REG(n)		(0x1630 + (n))
-/* page 20 - Auto Exposure */
-#define AE_CTL1_REG		0x2010
-#define AE_CTL2_REG		0x2011
-#define AE_FRM_CTL_REG		0x2020
-#define AE_FINE_CTL_REG(n)	(0x2028 + (n))
-#define EXP_TIMEH_REG		0x2083
-#define EXP_TIMEM_REG		0x2084
-#define EXP_TIMEL_REG		0x2085
-#define EXP_MMINH_REG		0x2086
-#define EXP_MMINL_REG		0x2087
-#define EXP_MMAXH_REG		0x2088
-#define EXP_MMAXM_REG		0x2089
-#define EXP_MMAXL_REG		0x208A
-/* page 22 - Auto White Balance */
-#define AWB_CTL1_REG		0x2210
-#define AWB_ENABLE		0x80
-#define AWB_CTL2_REG		0x2211
-#define MWB_ENABLE		0x01
-/* RGB gain control (manual WB) when AWB_CTL1[7]=0 */
-#define AWB_RGAIN_REG		0x2280
-#define AWB_GGAIN_REG		0x2281
-#define AWB_BGAIN_REG		0x2282
-#define AWB_RMAX_REG		0x2283
-#define AWB_RMIN_REG		0x2284
-#define AWB_BMAX_REG		0x2285
-#define AWB_BMIN_REG		0x2286
-/* R, B gain range in bright light conditions */
-#define AWB_RMAXB_REG		0x2287
-#define AWB_RMINB_REG		0x2288
-#define AWB_BMAXB_REG		0x2289
-#define AWB_BMINB_REG		0x228A
-/* manual white balance, when AWB_CTL2[0]=1 */
-#define MWB_RGAIN_REG		0x22B2
-#define MWB_BGAIN_REG		0x22B3
-/* the token to mark an array end */
-#define REG_TERM		0xFFFF
-
-/* Minimum and maximum exposure time in ms */
-#define EXPOS_MIN_MS		1
-#define EXPOS_MAX_MS		125
-
-struct sr030pc30_info {
-	struct v4l2_subdev sd;
-	struct v4l2_ctrl_handler hdl;
-	const struct sr030pc30_platform_data *pdata;
-	const struct sr030pc30_format *curr_fmt;
-	const struct sr030pc30_frmsize *curr_win;
-	unsigned int hflip:1;
-	unsigned int vflip:1;
-	unsigned int sleep:1;
-	struct {
-		/* auto whitebalance control cluster */
-		struct v4l2_ctrl *awb;
-		struct v4l2_ctrl *red;
-		struct v4l2_ctrl *blue;
-	};
-	struct {
-		/* auto exposure control cluster */
-		struct v4l2_ctrl *autoexp;
-		struct v4l2_ctrl *exp;
-	};
-	u8 i2c_reg_page;
-};
-
-struct sr030pc30_format {
-	u32 code;
-	enum v4l2_colorspace colorspace;
-	u16 ispctl1_reg;
-};
-
-struct sr030pc30_frmsize {
-	u16 width;
-	u16 height;
-	int vid_ctl1;
-};
-
-struct i2c_regval {
-	u16 addr;
-	u16 val;
-};
-
-/* supported resolutions */
-static const struct sr030pc30_frmsize sr030pc30_sizes[] = {
-	{
-		.width		= 640,
-		.height		= 480,
-		.vid_ctl1	= SUBSAMPL_NONE_VGA,
-	}, {
-		.width		= 320,
-		.height		= 240,
-		.vid_ctl1	= SUBSAMPL_QVGA,
-	}, {
-		.width		= 160,
-		.height		= 120,
-		.vid_ctl1	= SUBSAMPL_QQVGA,
-	},
-};
-
-/* supported pixel formats */
-static const struct sr030pc30_format sr030pc30_formats[] = {
-	{
-		.code		= MEDIA_BUS_FMT_YUYV8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x03,
-	}, {
-		.code		= MEDIA_BUS_FMT_YVYU8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x02,
-	}, {
-		.code		= MEDIA_BUS_FMT_VYUY8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0,
-	}, {
-		.code		= MEDIA_BUS_FMT_UYVY8_2X8,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x01,
-	}, {
-		.code		= MEDIA_BUS_FMT_RGB565_2X8_BE,
-		.colorspace	= V4L2_COLORSPACE_JPEG,
-		.ispctl1_reg	= 0x40,
-	},
-};
-
-static const struct i2c_regval sr030pc30_base_regs[] = {
-	/* Window size and position within pixel matrix */
-	{ WIN_ROWH_REG,		0x00 }, { WIN_ROWL_REG,		0x06 },
-	{ WIN_COLH_REG,		0x00 },	{ WIN_COLL_REG,		0x06 },
-	{ WIN_HEIGHTH_REG,	0x01 }, { WIN_HEIGHTL_REG,	0xE0 },
-	{ WIN_WIDTHH_REG,	0x02 }, { WIN_WIDTHL_REG,	0x80 },
-	{ HBLANKH_REG,		0x01 }, { HBLANKL_REG,		0x50 },
-	{ VSYNCH_REG,		0x00 }, { VSYNCL_REG,		0x14 },
-	{ SYNC_CTL_REG,		0 },
-	/* Color corection and saturation */
-	{ ISP_CTL_REG(0),	0x30 }, { YOFS_REG,		0x80 },
-	{ DARK_YOFS_REG,	0x04 }, { AG_ABRTH_REG,		0x78 },
-	{ SAT_CTL_REG,		0x1F }, { BSAT_REG,		0x90 },
-	{ AG_SAT_TH_REG,	0xF0 }, { 0x1064,		0x80 },
-	{ CMC_CTL_REG,		0x03 }, { CMC_OFSGH_REG,	0x3C },
-	{ CMC_OFSGL_REG,	0x2C }, { CMC_SIGN_REG,		0x2F },
-	{ CMC_COEF_REG(0),	0xCB }, { CMC_OFS_REG(0),	0x87 },
-	{ CMC_COEF_REG(1),	0x61 }, { CMC_OFS_REG(1),	0x18 },
-	{ CMC_COEF_REG(2),	0x16 }, { CMC_OFS_REG(2),	0x91 },
-	{ CMC_COEF_REG(3),	0x23 }, { CMC_OFS_REG(3),	0x94 },
-	{ CMC_COEF_REG(4),	0xCE }, { CMC_OFS_REG(4),	0x9f },
-	{ CMC_COEF_REG(5),	0x2B }, { CMC_OFS_REG(5),	0x33 },
-	{ CMC_COEF_REG(6),	0x01 }, { CMC_OFS_REG(6),	0x00 },
-	{ CMC_COEF_REG(7),	0x34 }, { CMC_OFS_REG(7),	0x94 },
-	{ CMC_COEF_REG(8),	0x75 }, { CMC_OFS_REG(8),	0x14 },
-	/* Color corection coefficients */
-	{ GMA_CTL_REG,		0x03 },	{ GMA_COEF_REG(0),	0x00 },
-	{ GMA_COEF_REG(1),	0x19 },	{ GMA_COEF_REG(2),	0x26 },
-	{ GMA_COEF_REG(3),	0x3B },	{ GMA_COEF_REG(4),	0x5D },
-	{ GMA_COEF_REG(5),	0x79 }, { GMA_COEF_REG(6),	0x8E },
-	{ GMA_COEF_REG(7),	0x9F },	{ GMA_COEF_REG(8),	0xAF },
-	{ GMA_COEF_REG(9),	0xBD },	{ GMA_COEF_REG(10),	0xCA },
-	{ GMA_COEF_REG(11),	0xDD }, { GMA_COEF_REG(12),	0xEC },
-	{ GMA_COEF_REG(13),	0xF7 },	{ GMA_COEF_REG(14),	0xFF },
-	/* Noise reduction, Z-LPF, YC-LPF and BLPF filters setup */
-	{ ZLPF_CTRL_REG,	0x99 }, { ZLPF_CTRL2_REG,	0x0E },
-	{ ZLPF_AGH_THR_REG,	0x29 }, { ZLPF_THR_REG,		0x0F },
-	{ ZLPF_DYN_THR_REG,	0x63 }, { YCLPF_CTL1_REG,	0x23 },
-	{ YCLPF_CTL2_REG,	0x3B }, { YCLPF_THR_REG,	0x05 },
-	{ BLPF_CTL_REG,		0x1D }, { BLPF_THR1_REG,	0x05 },
-	{ BLPF_THR2_REG,	0x04 },
-	/* Automatic white balance */
-	{ AWB_CTL1_REG,		0xFB }, { AWB_CTL2_REG,		0x26 },
-	{ AWB_RMAX_REG,		0x54 }, { AWB_RMIN_REG,		0x2B },
-	{ AWB_BMAX_REG,		0x57 }, { AWB_BMIN_REG,		0x29 },
-	{ AWB_RMAXB_REG,	0x50 }, { AWB_RMINB_REG,	0x43 },
-	{ AWB_BMAXB_REG,	0x30 }, { AWB_BMINB_REG,	0x22 },
-	/* Auto exposure */
-	{ AE_CTL1_REG,		0x8C }, { AE_CTL2_REG,		0x04 },
-	{ AE_FRM_CTL_REG,	0x01 }, { AE_FINE_CTL_REG(0),	0x3F },
-	{ AE_FINE_CTL_REG(1),	0xA3 }, { AE_FINE_CTL_REG(3),	0x34 },
-	/* Lens shading compensation */
-	{ LENS_CTRL_REG,	0x01 }, { LENS_XCEN_REG,	0x80 },
-	{ LENS_YCEN_REG,	0x70 }, { LENS_R_COMP_REG,	0x53 },
-	{ LENS_G_COMP_REG,	0x40 }, { LENS_B_COMP_REG,	0x3e },
-	{ REG_TERM,		0 },
-};
-
-static inline struct sr030pc30_info *to_sr030pc30(struct v4l2_subdev *sd)
-{
-	return container_of(sd, struct sr030pc30_info, sd);
-}
-
-static inline int set_i2c_page(struct sr030pc30_info *info,
-			       struct i2c_client *client, unsigned int reg)
-{
-	int ret = 0;
-	u32 page = reg >> 8 & 0xFF;
-
-	if (info->i2c_reg_page != page && (reg & 0xFF) != 0x03) {
-		ret = i2c_smbus_write_byte_data(client, PAGEMODE_REG, page);
-		if (!ret)
-			info->i2c_reg_page = page;
-	}
-	return ret;
-}
-
-static int cam_i2c_read(struct v4l2_subdev *sd, u32 reg_addr)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-
-	int ret = set_i2c_page(info, client, reg_addr);
-	if (!ret)
-		ret = i2c_smbus_read_byte_data(client, reg_addr & 0xFF);
-	return ret;
-}
-
-static int cam_i2c_write(struct v4l2_subdev *sd, u32 reg_addr, u32 val)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-
-	int ret = set_i2c_page(info, client, reg_addr);
-	if (!ret)
-		ret = i2c_smbus_write_byte_data(
-			client, reg_addr & 0xFF, val);
-	return ret;
-}
-
-static inline int sr030pc30_bulk_write_reg(struct v4l2_subdev *sd,
-				const struct i2c_regval *msg)
-{
-	while (msg->addr != REG_TERM) {
-		int ret = cam_i2c_write(sd, msg->addr, msg->val);
-		if (ret)
-			return ret;
-		msg++;
-	}
-	return 0;
-}
-
-/* Device reset and sleep mode control */
-static int sr030pc30_pwr_ctrl(struct v4l2_subdev *sd,
-				     bool reset, bool sleep)
-{
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-	u8 reg = sleep ? 0xF1 : 0xF0;
-	int ret = 0;
-
-	if (reset)
-		ret = cam_i2c_write(sd, POWER_CTRL_REG, reg | 0x02);
-	if (!ret) {
-		ret = cam_i2c_write(sd, POWER_CTRL_REG, reg);
-		if (!ret) {
-			info->sleep = sleep;
-			if (reset)
-				info->i2c_reg_page = -1;
-		}
-	}
-	return ret;
-}
-
-static int sr030pc30_set_flip(struct v4l2_subdev *sd)
-{
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-
-	s32 reg = cam_i2c_read(sd, VDO_CTL2_REG);
-	if (reg < 0)
-		return reg;
-
-	reg &= 0x7C;
-	if (info->hflip)
-		reg |= 0x01;
-	if (info->vflip)
-		reg |= 0x02;
-	return cam_i2c_write(sd, VDO_CTL2_REG, reg | 0x80);
-}
-
-/* Configure resolution, color format and image flip */
-static int sr030pc30_set_params(struct v4l2_subdev *sd)
-{
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-	int ret;
-
-	if (!info->curr_win)
-		return -EINVAL;
-
-	/* Configure the resolution through subsampling */
-	ret = cam_i2c_write(sd, VDO_CTL1_REG,
-			    info->curr_win->vid_ctl1);
-
-	if (!ret && info->curr_fmt)
-		ret = cam_i2c_write(sd, ISP_CTL_REG(0),
-				info->curr_fmt->ispctl1_reg);
-	if (!ret)
-		ret = sr030pc30_set_flip(sd);
-
-	return ret;
-}
-
-/* Find nearest matching image pixel size. */
-static int sr030pc30_try_frame_size(struct v4l2_mbus_framefmt *mf)
-{
-	unsigned int min_err = ~0;
-	int i = ARRAY_SIZE(sr030pc30_sizes);
-	const struct sr030pc30_frmsize *fsize = &sr030pc30_sizes[0],
-					*match = NULL;
-	while (i--) {
-		int err = abs(fsize->width - mf->width)
-				+ abs(fsize->height - mf->height);
-		if (err < min_err) {
-			min_err = err;
-			match = fsize;
-		}
-		fsize++;
-	}
-	if (match) {
-		mf->width  = match->width;
-		mf->height = match->height;
-		return 0;
-	}
-	return -EINVAL;
-}
-
-static int sr030pc30_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct sr030pc30_info *info =
-		container_of(ctrl->handler, struct sr030pc30_info, hdl);
-	struct v4l2_subdev *sd = &info->sd;
-	int ret = 0;
-
-	v4l2_dbg(1, debug, sd, "%s: ctrl_id: %d, value: %d\n",
-			 __func__, ctrl->id, ctrl->val);
-
-	switch (ctrl->id) {
-	case V4L2_CID_AUTO_WHITE_BALANCE:
-		if (ctrl->is_new) {
-			ret = cam_i2c_write(sd, AWB_CTL2_REG,
-					ctrl->val ? 0x2E : 0x2F);
-			if (!ret)
-				ret = cam_i2c_write(sd, AWB_CTL1_REG,
-						ctrl->val ? 0xFB : 0x7B);
-		}
-		if (!ret && info->blue->is_new)
-			ret = cam_i2c_write(sd, MWB_BGAIN_REG, info->blue->val);
-		if (!ret && info->red->is_new)
-			ret = cam_i2c_write(sd, MWB_RGAIN_REG, info->red->val);
-		return ret;
-
-	case V4L2_CID_EXPOSURE_AUTO:
-		/* auto anti-flicker is also enabled here */
-		if (ctrl->is_new)
-			ret = cam_i2c_write(sd, AE_CTL1_REG,
-				ctrl->val == V4L2_EXPOSURE_AUTO ? 0xDC : 0x0C);
-		if (info->exp->is_new) {
-			unsigned long expos = info->exp->val;
-
-			expos = expos * info->pdata->clk_rate / (8 * 1000);
-
-			if (!ret)
-				ret = cam_i2c_write(sd, EXP_TIMEH_REG,
-						expos >> 16 & 0xFF);
-			if (!ret)
-				ret = cam_i2c_write(sd, EXP_TIMEM_REG,
-						expos >> 8 & 0xFF);
-			if (!ret)
-				ret = cam_i2c_write(sd, EXP_TIMEL_REG,
-						expos & 0xFF);
-		}
-		return ret;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int sr030pc30_enum_mbus_code(struct v4l2_subdev *sd,
-		struct v4l2_subdev_state *sd_state,
-		struct v4l2_subdev_mbus_code_enum *code)
-{
-	if (!code || code->pad ||
-	    code->index >= ARRAY_SIZE(sr030pc30_formats))
-		return -EINVAL;
-
-	code->code = sr030pc30_formats[code->index].code;
-	return 0;
-}
-
-static int sr030pc30_get_fmt(struct v4l2_subdev *sd,
-		struct v4l2_subdev_state *sd_state,
-		struct v4l2_subdev_format *format)
-{
-	struct v4l2_mbus_framefmt *mf;
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-
-	if (!format || format->pad)
-		return -EINVAL;
-
-	mf = &format->format;
-
-	if (!info->curr_win || !info->curr_fmt)
-		return -EINVAL;
-
-	mf->width	= info->curr_win->width;
-	mf->height	= info->curr_win->height;
-	mf->code	= info->curr_fmt->code;
-	mf->colorspace	= info->curr_fmt->colorspace;
-	mf->field	= V4L2_FIELD_NONE;
-
-	return 0;
-}
-
-/* Return nearest media bus frame format. */
-static const struct sr030pc30_format *try_fmt(struct v4l2_subdev *sd,
-					      struct v4l2_mbus_framefmt *mf)
-{
-	int i;
-
-	sr030pc30_try_frame_size(mf);
-
-	for (i = 0; i < ARRAY_SIZE(sr030pc30_formats); i++) {
-		if (mf->code == sr030pc30_formats[i].code)
-			break;
-	}
-	if (i == ARRAY_SIZE(sr030pc30_formats))
-		i = 0;
-
-	mf->code = sr030pc30_formats[i].code;
-
-	return &sr030pc30_formats[i];
-}
-
-/* Return nearest media bus frame format. */
-static int sr030pc30_set_fmt(struct v4l2_subdev *sd,
-		struct v4l2_subdev_state *sd_state,
-		struct v4l2_subdev_format *format)
-{
-	struct sr030pc30_info *info = sd ? to_sr030pc30(sd) : NULL;
-	const struct sr030pc30_format *fmt;
-	struct v4l2_mbus_framefmt *mf;
-
-	if (!sd || !format)
-		return -EINVAL;
-
-	mf = &format->format;
-	if (format->pad)
-		return -EINVAL;
-
-	fmt = try_fmt(sd, mf);
-	if (format->which == V4L2_SUBDEV_FORMAT_TRY) {
-		sd_state->pads->try_fmt = *mf;
-		return 0;
-	}
-
-	info->curr_fmt = fmt;
-
-	return sr030pc30_set_params(sd);
-}
-
-static int sr030pc30_base_config(struct v4l2_subdev *sd)
-{
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-	int ret;
-	unsigned long expmin, expmax;
-
-	ret = sr030pc30_bulk_write_reg(sd, sr030pc30_base_regs);
-	if (!ret) {
-		info->curr_fmt = &sr030pc30_formats[0];
-		info->curr_win = &sr030pc30_sizes[0];
-		ret = sr030pc30_set_params(sd);
-	}
-	if (!ret)
-		ret = sr030pc30_pwr_ctrl(sd, false, false);
-
-	if (ret)
-		return ret;
-
-	expmin = EXPOS_MIN_MS * info->pdata->clk_rate / (8 * 1000);
-	expmax = EXPOS_MAX_MS * info->pdata->clk_rate / (8 * 1000);
-
-	v4l2_dbg(1, debug, sd, "%s: expmin= %lx, expmax= %lx", __func__,
-		 expmin, expmax);
-
-	/* Setting up manual exposure time range */
-	ret = cam_i2c_write(sd, EXP_MMINH_REG, expmin >> 8 & 0xFF);
-	if (!ret)
-		ret = cam_i2c_write(sd, EXP_MMINL_REG, expmin & 0xFF);
-	if (!ret)
-		ret = cam_i2c_write(sd, EXP_MMAXH_REG, expmax >> 16 & 0xFF);
-	if (!ret)
-		ret = cam_i2c_write(sd, EXP_MMAXM_REG, expmax >> 8 & 0xFF);
-	if (!ret)
-		ret = cam_i2c_write(sd, EXP_MMAXL_REG, expmax & 0xFF);
-
-	return ret;
-}
-
-static int sr030pc30_s_power(struct v4l2_subdev *sd, int on)
-{
-	struct i2c_client *client = v4l2_get_subdevdata(sd);
-	struct sr030pc30_info *info = to_sr030pc30(sd);
-	const struct sr030pc30_platform_data *pdata = info->pdata;
-	int ret;
-
-	if (pdata == NULL) {
-		WARN(1, "No platform data!\n");
-		return -EINVAL;
-	}
-
-	/*
-	 * Put sensor into power sleep mode before switching off
-	 * power and disabling MCLK.
-	 */
-	if (!on)
-		sr030pc30_pwr_ctrl(sd, false, true);
-
-	/* set_power controls sensor's power and clock */
-	if (pdata->set_power) {
-		ret = pdata->set_power(&client->dev, on);
-		if (ret)
-			return ret;
-	}
-
-	if (on) {
-		ret = sr030pc30_base_config(sd);
-	} else {
-		ret = 0;
-		info->curr_win = NULL;
-		info->curr_fmt = NULL;
-	}
-
-	return ret;
-}
-
-static const struct v4l2_ctrl_ops sr030pc30_ctrl_ops = {
-	.s_ctrl = sr030pc30_s_ctrl,
-};
-
-static const struct v4l2_subdev_core_ops sr030pc30_core_ops = {
-	.s_power	= sr030pc30_s_power,
-};
-
-static const struct v4l2_subdev_pad_ops sr030pc30_pad_ops = {
-	.enum_mbus_code = sr030pc30_enum_mbus_code,
-	.get_fmt	= sr030pc30_get_fmt,
-	.set_fmt	= sr030pc30_set_fmt,
-};
-
-static const struct v4l2_subdev_ops sr030pc30_ops = {
-	.core	= &sr030pc30_core_ops,
-	.pad	= &sr030pc30_pad_ops,
-};
-
-/*
- * Detect sensor type. Return 0 if SR030PC30 was detected
- * or -ENODEV otherwise.
- */
-static int sr030pc30_detect(struct i2c_client *client)
-{
-	const struct sr030pc30_platform_data *pdata
-		= client->dev.platform_data;
-	int ret;
-
-	/* Enable sensor's power and clock */
-	if (pdata->set_power) {
-		ret = pdata->set_power(&client->dev, 1);
-		if (ret)
-			return ret;
-	}
-
-	ret = i2c_smbus_read_byte_data(client, DEVICE_ID_REG);
-
-	if (pdata->set_power)
-		pdata->set_power(&client->dev, 0);
-
-	if (ret < 0) {
-		dev_err(&client->dev, "%s: I2C read failed\n", __func__);
-		return ret;
-	}
-
-	return ret == SR030PC30_ID ? 0 : -ENODEV;
-}
-
-
-static int sr030pc30_probe(struct i2c_client *client)
-{
-	struct sr030pc30_info *info;
-	struct v4l2_subdev *sd;
-	struct v4l2_ctrl_handler *hdl;
-	const struct sr030pc30_platform_data *pdata
-		= client->dev.platform_data;
-	int ret;
-
-	if (!pdata) {
-		dev_err(&client->dev, "No platform data!");
-		return -EIO;
-	}
-
-	ret = sr030pc30_detect(client);
-	if (ret)
-		return ret;
-
-	info = devm_kzalloc(&client->dev, sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	sd = &info->sd;
-	info->pdata = client->dev.platform_data;
-
-	v4l2_i2c_subdev_init(sd, client, &sr030pc30_ops);
-
-	hdl = &info->hdl;
-	v4l2_ctrl_handler_init(hdl, 6);
-	info->awb = v4l2_ctrl_new_std(hdl, &sr030pc30_ctrl_ops,
-			V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, 1);
-	info->red = v4l2_ctrl_new_std(hdl, &sr030pc30_ctrl_ops,
-			V4L2_CID_RED_BALANCE, 0, 127, 1, 64);
-	info->blue = v4l2_ctrl_new_std(hdl, &sr030pc30_ctrl_ops,
-			V4L2_CID_BLUE_BALANCE, 0, 127, 1, 64);
-	info->autoexp = v4l2_ctrl_new_std(hdl, &sr030pc30_ctrl_ops,
-			V4L2_CID_EXPOSURE_AUTO, 0, 1, 1, 1);
-	info->exp = v4l2_ctrl_new_std(hdl, &sr030pc30_ctrl_ops,
-			V4L2_CID_EXPOSURE, EXPOS_MIN_MS, EXPOS_MAX_MS, 1, 30);
-	sd->ctrl_handler = hdl;
-	if (hdl->error) {
-		int err = hdl->error;
-
-		v4l2_ctrl_handler_free(hdl);
-		return err;
-	}
-	v4l2_ctrl_auto_cluster(3, &info->awb, 0, false);
-	v4l2_ctrl_auto_cluster(2, &info->autoexp, V4L2_EXPOSURE_MANUAL, false);
-	v4l2_ctrl_handler_setup(hdl);
-
-	info->i2c_reg_page	= -1;
-	info->hflip		= 1;
-
-	return 0;
-}
-
-static void sr030pc30_remove(struct i2c_client *client)
-{
-	struct v4l2_subdev *sd = i2c_get_clientdata(client);
-
-	v4l2_device_unregister_subdev(sd);
-	v4l2_ctrl_handler_free(sd->ctrl_handler);
-}
-
-static const struct i2c_device_id sr030pc30_id[] = {
-	{ MODULE_NAME, 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, sr030pc30_id);
-
-
-static struct i2c_driver sr030pc30_i2c_driver = {
-	.driver = {
-		.name = MODULE_NAME
-	},
-	.probe_new	= sr030pc30_probe,
-	.remove		= sr030pc30_remove,
-	.id_table	= sr030pc30_id,
-};
-
-module_i2c_driver(sr030pc30_i2c_driver);
-
-MODULE_DESCRIPTION("Siliconfile SR030PC30 camera driver");
-MODULE_AUTHOR("Sylwester Nawrocki <s.nawrocki@samsung.com>");
-MODULE_LICENSE("GPL");
diff --git a/include/media/i2c/sr030pc30.h b/include/media/i2c/sr030pc30.h
deleted file mode 100644
index 84c602d681fa..000000000000
--- a/include/media/i2c/sr030pc30.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Driver header for SR030PC30 camera sensor
- *
- * Copyright (c) 2010 Samsung Electronics, Co. Ltd
- * Contact: Sylwester Nawrocki <s.nawrocki@samsung.com>
- */
-
-#ifndef SR030PC30_H
-#define SR030PC30_H
-
-struct sr030pc30_platform_data {
-	unsigned long clk_rate;	/* master clock frequency in Hz */
-	int (*set_power)(struct device *dev, int on);
-};
-
-#endif /* SR030PC30_H */
-- 
cgit v1.2.3


From 4ea15b56f0810f0d8795d475db1bb74b3a7c1b2f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Apr 2023 15:28:08 +0100
Subject: io_uring/rsrc: use wq for quiescing

Replace completions with waitqueues for rsrc data quiesce, the main
wakeup condition is when data refs hit zero. Note that data refs are
only changes under ->uring_lock, so we prepare before mutex_unlock()
reacquire it after taking the lock back. This change will be needed
in the next patch.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1d0dbc74b3b4fd67c8f01819e680c5e0da252956.1681395792.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            |  1 +
 io_uring/rsrc.c                | 18 ++++++++++++------
 io_uring/rsrc.h                |  1 -
 4 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 40cab420b1bd..5c9645319770 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -333,6 +333,7 @@ struct io_ring_ctx {
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
+	struct wait_queue_head		rsrc_quiesce_wq;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9083a8466ebf..3c1c8c788b7b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -321,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
 	init_waitqueue_head(&ctx->poll_wq);
+	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d7e7528f7159..f9ce4076c73d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -158,6 +158,7 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 {
 	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
+	struct io_ring_ctx *ctx = rsrc_data->ctx;
 	struct io_rsrc_put *prsrc, *tmp;
 
 	if (ref_node->inline_items)
@@ -171,13 +172,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 
 	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
 	if (io_put_rsrc_data_ref(rsrc_data))
-		complete(&rsrc_data->done);
+		wake_up_all(&ctx->rsrc_quiesce_wq);
 }
 
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
-	if (data && !io_put_rsrc_data_ref(data))
-		wait_for_completion(&data->done);
+	if (data)
+		WARN_ON_ONCE(!io_put_rsrc_data_ref(data));
 }
 
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -257,6 +258,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 				      struct io_ring_ctx *ctx)
 {
+	DEFINE_WAIT(we);
 	int ret;
 
 	/* As we may drop ->uring_lock, other task may have started quiesce */
@@ -273,7 +275,9 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 
 	data->quiesce = true;
 	do {
+		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
 		mutex_unlock(&ctx->uring_lock);
+
 		ret = io_run_task_work_sig(ctx);
 		if (ret < 0) {
 			mutex_lock(&ctx->uring_lock);
@@ -285,12 +289,15 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 			}
 			break;
 		}
-		wait_for_completion_interruptible(&data->done);
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
 		mutex_lock(&ctx->uring_lock);
 		ret = 0;
 	} while (data->refs);
-	data->quiesce = false;
 
+	finish_wait(&ctx->rsrc_quiesce_wq, &we);
+	data->quiesce = false;
 	return ret;
 }
 
@@ -366,7 +373,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
 				goto fail;
 		}
 	}
-	init_completion(&data->done);
 	*pdata = data;
 	return 0;
 fail:
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 88adcb0b7963..d93ba4e9742a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -35,7 +35,6 @@ struct io_rsrc_data {
 	u64				**tags;
 	unsigned int			nr;
 	rsrc_put_fn			*do_put;
-	struct completion		done;
 	int				refs;
 	bool				quiesce;
 };
-- 
cgit v1.2.3


From 0b222eeb6514ba6c3457b667fa4f3645032e1fc9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Apr 2023 15:28:10 +0100
Subject: io_uring/rsrc: remove rsrc_data refs

Instead of waiting for rsrc_data->refs to be downed to zero, check
whether there are rsrc nodes queued for completion, that's easier then
maintaining references.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8e33fd143d83e11af3e386aea28eb6d6c6a1be10.1681395792.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            |  4 ++--
 io_uring/rsrc.c                | 32 ++++++++------------------------
 io_uring/rsrc.h                |  2 --
 4 files changed, 11 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5c9645319770..1b2a20a42413 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -334,6 +334,7 @@ struct io_ring_ctx {
 	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
 	struct wait_queue_head		rsrc_quiesce_wq;
+	unsigned			rsrc_quiesce;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3c1c8c788b7b..3d43df8f1e4e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2831,8 +2831,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
 	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
-	io_wait_rsrc_data(ctx->buf_data);
-	io_wait_rsrc_data(ctx->file_data);
+	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
+		return;
 
 	mutex_lock(&ctx->uring_lock);
 	if (ctx->buf_data)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index e634ef384724..5415a18844e0 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -31,11 +31,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 #define IORING_MAX_FIXED_FILES	(1U << 20)
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
-static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data)
-{
-	return !--rsrc_data->refs;
-}
-
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
@@ -158,7 +153,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 {
 	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
-	struct io_ring_ctx *ctx = rsrc_data->ctx;
 	struct io_rsrc_put *prsrc, *tmp;
 
 	if (ref_node->inline_items)
@@ -171,14 +165,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 	}
 
 	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
-	if (io_put_rsrc_data_ref(rsrc_data))
-		wake_up_all(&ctx->rsrc_quiesce_wq);
-}
-
-void io_wait_rsrc_data(struct io_rsrc_data *data)
-{
-	if (data)
-		WARN_ON_ONCE(!io_put_rsrc_data_ref(data));
 }
 
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -201,6 +187,8 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 		list_del(&node->node);
 		__io_rsrc_put_work(node);
 	}
+	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
+		wake_up_all(&ctx->rsrc_quiesce_wq);
 }
 
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
@@ -235,7 +223,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 	if (WARN_ON_ONCE(!backup))
 		return;
 
-	data_to_kill->refs++;
 	node->rsrc_data = data_to_kill;
 	list_add_tail(&node->node, &ctx->rsrc_ref_list);
 	/* put master ref */
@@ -269,8 +256,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		return ret;
 	io_rsrc_node_switch(ctx, data);
 
-	/* kill initial ref */
-	if (io_put_rsrc_data_ref(data))
+	if (list_empty(&ctx->rsrc_ref_list))
 		return 0;
 
 	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
@@ -278,6 +264,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		smp_mb();
 	}
 
+	ctx->rsrc_quiesce++;
 	data->quiesce = true;
 	do {
 		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
@@ -286,12 +273,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		ret = io_run_task_work_sig(ctx);
 		if (ret < 0) {
 			mutex_lock(&ctx->uring_lock);
-			if (!data->refs) {
+			if (list_empty(&ctx->rsrc_ref_list))
 				ret = 0;
-			} else {
-				/* restore the master reference */
-				data->refs++;
-			}
 			break;
 		}
 
@@ -299,10 +282,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		__set_current_state(TASK_RUNNING);
 		mutex_lock(&ctx->uring_lock);
 		ret = 0;
-	} while (data->refs);
+	} while (!list_empty(&ctx->rsrc_ref_list));
 
 	finish_wait(&ctx->rsrc_quiesce_wq, &we);
 	data->quiesce = false;
+	ctx->rsrc_quiesce--;
+
 	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 		atomic_set(&ctx->cq_wait_nr, 0);
 		smp_mb();
@@ -371,7 +356,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
 	data->nr = nr;
 	data->ctx = ctx;
 	data->do_put = do_put;
-	data->refs = 1;
 	if (utags) {
 		ret = -EFAULT;
 		for (i = 0; i < nr; i++) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index d93ba4e9742a..5dd2fcb28069 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -35,7 +35,6 @@ struct io_rsrc_data {
 	u64				**tags;
 	unsigned int			nr;
 	rsrc_put_fn			*do_put;
-	int				refs;
 	bool				quiesce;
 };
 
@@ -69,7 +68,6 @@ struct io_mapped_ubuf {
 void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_put_work(struct work_struct *work);
-void io_wait_rsrc_data(struct io_rsrc_data *data);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-- 
cgit v1.2.3


From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:03 -0700
Subject: bpf: Remove btf_field_offs, use btf_record's fields instead

The btf_field_offs struct contains (offset, size) for btf_record fields,
sorted by offset. btf_field_offs is always used in conjunction with
btf_record, which has btf_field 'fields' array with (offset, type), the
latter of which btf_field_offs' size is derived from via
btf_field_type_size.

This patch adds a size field to struct btf_field and sorts btf_record's
fields by offset, making it possible to get rid of btf_field_offs. Less
data duplication and less code complexity results.

Since btf_field_offs' lifetime closely followed the btf_record used to
populate it, most complexity wins are from removal of initialization
code like:

  if (btf_record_successfully_initialized) {
    foffs = btf_parse_field_offs(rec);
    if (IS_ERR_OR_NULL(foffs))
      // free the btf_record and return err
  }

Other changes in this patch are pretty mechanical:

  * foffs->field_off[i] -> rec->fields[i].offset
  * foffs->field_sz[i] -> rec->fields[i].size
  * Sort rec->fields in btf_parse_fields before returning
    * It's possible that this is necessary independently of other
      changes in this patch. btf_record_find in syscall.c expects
      btf_record's fields to be sorted by offset, yet there's no
      explicit sorting of them before this patch, record's fields are
      populated in the order they're read from BTF struct definition.
      BTF docs don't say anything about the sortedness of struct fields.
  * All functions taking struct btf_field_offs * input now instead take
    struct btf_record *. All callsites of these functions already have
    access to the correct btf_record.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     | 44 ++++++++++-------------
 include/linux/btf.h     |  2 --
 kernel/bpf/btf.c        | 93 +++++++++++--------------------------------------
 kernel/bpf/helpers.c    |  2 +-
 kernel/bpf/map_in_map.c | 15 --------
 kernel/bpf/syscall.c    | 17 ++-------
 6 files changed, 43 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 88845aadc47d..7888ed497432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -210,6 +210,7 @@ struct btf_field_graph_root {
 
 struct btf_field {
 	u32 offset;
+	u32 size;
 	enum btf_field_type type;
 	union {
 		struct btf_field_kptr kptr;
@@ -225,12 +226,6 @@ struct btf_record {
 	struct btf_field fields[];
 };
 
-struct btf_field_offs {
-	u32 cnt;
-	u32 field_off[BTF_FIELDS_MAX];
-	u8 field_sz[BTF_FIELDS_MAX];
-};
-
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -257,7 +252,6 @@ struct bpf_map {
 	struct obj_cgroup *objcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
-	struct btf_field_offs *field_offs;
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
@@ -360,14 +354,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f
 	return rec->field_mask & type;
 }
 
-static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
+static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 {
 	int i;
 
-	if (!foffs)
+	if (IS_ERR_OR_NULL(rec))
 		return;
-	for (i = 0; i < foffs->cnt; i++)
-		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
+	for (i = 0; i < rec->cnt; i++)
+		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
@@ -379,7 +373,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
  */
 static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_init(map->field_offs, dst);
+	bpf_obj_init(map->record, dst);
 }
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
@@ -399,14 +393,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
-static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
+static inline void bpf_obj_memcpy(struct btf_record *rec,
 				  void *dst, void *src, u32 size,
 				  bool long_memcpy)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		if (long_memcpy)
 			bpf_long_memcpy(dst, src, round_up(size, 8));
 		else
@@ -414,49 +408,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memcpy(dst + curr_off, src + curr_off, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memcpy(dst + curr_off, src + curr_off, size - curr_off);
 }
 
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, false);
 }
 
 static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
 }
 
-static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size)
+static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		memset(dst, 0, size);
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memset(dst + curr_off, 0, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memset(dst + curr_off, 0, size - curr_off);
 }
 
 static inline void zero_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_memzero(map->field_offs, dst, map->value_size);
+	bpf_obj_memzero(map->record, dst, map->value_size);
 }
 
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 495250162422..813227bff58a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc {
 struct btf_struct_meta {
 	u32 btf_id;
 	struct btf_record *record;
-	struct btf_field_offs *field_offs;
 };
 
 struct btf_struct_metas {
@@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size);
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2c2d1fb9f410..f3c998feeccb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1666,10 +1666,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)
 
 	if (!tab)
 		return;
-	for (i = 0; i < tab->cnt; i++) {
+	for (i = 0; i < tab->cnt; i++)
 		btf_record_free(tab->types[i].record);
-		kfree(tab->types[i].field_offs);
-	}
 	kfree(tab);
 }
 
@@ -3700,12 +3698,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
 					    __alignof__(struct bpf_rb_node));
 }
 
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+	const struct btf_field *a = (const struct btf_field *)_a;
+	const struct btf_field *b = (const struct btf_field *)_b;
+
+	if (a->offset < b->offset)
+		return -1;
+	else if (a->offset > b->offset)
+		return 1;
+	return 0;
+}
+
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size)
 {
 	struct btf_field_info info_arr[BTF_FIELDS_MAX];
+	u32 next_off = 0, field_type_size;
 	struct btf_record *rec;
-	u32 next_off = 0;
 	int ret, i, cnt;
 
 	ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3725,7 +3735,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
-		if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+		field_type_size = btf_field_type_size(info_arr[i].type);
+		if (info_arr[i].off + field_type_size > value_size) {
 			WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
 			ret = -EFAULT;
 			goto end;
@@ -3734,11 +3745,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			ret = -EEXIST;
 			goto end;
 		}
-		next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
+		next_off = info_arr[i].off + field_type_size;
 
 		rec->field_mask |= info_arr[i].type;
 		rec->fields[i].offset = info_arr[i].off;
 		rec->fields[i].type = info_arr[i].type;
+		rec->fields[i].size = field_type_size;
 
 		switch (info_arr[i].type) {
 		case BPF_SPIN_LOCK:
@@ -3808,6 +3820,9 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		goto end;
 	}
 
+	sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+	       NULL, rec);
+
 	return rec;
 end:
 	btf_record_free(rec);
@@ -3889,61 +3904,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	return 0;
 }
 
-static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
-{
-	const u32 a = *(const u32 *)_a;
-	const u32 b = *(const u32 *)_b;
-
-	if (a < b)
-		return -1;
-	else if (a > b)
-		return 1;
-	return 0;
-}
-
-static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
-{
-	struct btf_field_offs *foffs = (void *)priv;
-	u32 *off_base = foffs->field_off;
-	u32 *a = _a, *b = _b;
-	u8 *sz_a, *sz_b;
-
-	sz_a = foffs->field_sz + (a - off_base);
-	sz_b = foffs->field_sz + (b - off_base);
-
-	swap(*a, *b);
-	swap(*sz_a, *sz_b);
-}
-
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
-{
-	struct btf_field_offs *foffs;
-	u32 i, *off;
-	u8 *sz;
-
-	BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
-	if (IS_ERR_OR_NULL(rec))
-		return NULL;
-
-	foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
-	if (!foffs)
-		return ERR_PTR(-ENOMEM);
-
-	off = foffs->field_off;
-	sz = foffs->field_sz;
-	for (i = 0; i < rec->cnt; i++) {
-		off[i] = rec->fields[i].offset;
-		sz[i] = btf_field_type_size(rec->fields[i].type);
-	}
-	foffs->cnt = rec->cnt;
-
-	if (foffs->cnt == 1)
-		return foffs;
-	sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
-	       btf_field_offs_cmp, btf_field_offs_swap, foffs);
-	return foffs;
-}
-
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
@@ -5386,7 +5346,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 	for (i = 1; i < n; i++) {
 		struct btf_struct_metas *new_tab;
 		const struct btf_member *member;
-		struct btf_field_offs *foffs;
 		struct btf_struct_meta *type;
 		struct btf_record *record;
 		const struct btf_type *t;
@@ -5428,17 +5387,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
 			goto free;
 		}
-		foffs = btf_parse_field_offs(record);
-		/* We need the field_offs to be valid for a valid record,
-		 * either both should be set or both should be unset.
-		 */
-		if (IS_ERR_OR_NULL(foffs)) {
-			btf_record_free(record);
-			ret = -EFAULT;
-			goto free;
-		}
 		type->record = record;
-		type->field_offs = foffs;
 		tab->cnt++;
 	}
 	return tab;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f04e60a4847f..e989b6460147 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1893,7 +1893,7 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	if (!p)
 		return NULL;
 	if (meta)
-		bpf_obj_init(meta->field_offs, p);
+		bpf_obj_init(meta->record, p);
 	return p;
 }
 
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 38136ec4e095..2c5c64c2a53b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		ret = PTR_ERR(inner_map_meta->record);
 		goto free;
 	}
-	if (inner_map_meta->record) {
-		struct btf_field_offs *field_offs;
-		/* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
-		 * valid.
-		 */
-		field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
-		if (!field_offs) {
-			ret = -ENOMEM;
-			goto free_rec;
-		}
-		inner_map_meta->field_offs = field_offs;
-	}
 	/* Note: We must use the same BTF, as we also used btf_record_dup above
 	 * which relies on BTF being same for both maps, as some members like
 	 * record->fields.list_head have pointers like value_rec pointing into
@@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 
 	fdput(f);
 	return inner_map_meta;
-free_rec:
-	btf_record_free(inner_map_meta->record);
 free:
 	kfree(inner_map_meta);
 put:
@@ -99,7 +85,6 @@ put:
 
 void bpf_map_meta_free(struct bpf_map *map_meta)
 {
-	kfree(map_meta->field_offs);
 	bpf_map_free_record(map_meta);
 	btf_put(map_meta->btf);
 	kfree(map_meta);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6d575505f89c..c08b7933bf8f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -717,14 +717,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
-	struct btf_field_offs *foffs = map->field_offs;
 	struct btf_record *rec = map->record;
 
 	security_bpf_map_free(map);
 	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
-	/* Delay freeing of field_offs and btf_record for maps, as map_free
+	/* Delay freeing of btf_record for maps, as map_free
 	 * callback usually needs access to them. It is better to do it here
 	 * than require each callback to do the free itself manually.
 	 *
@@ -733,7 +732,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
 	 * template bpf_map struct used during verification.
 	 */
-	kfree(foffs);
 	btf_record_free(rec);
 }
 
@@ -1125,7 +1123,6 @@ free_map_tab:
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct btf_field_offs *foffs;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -1205,17 +1202,9 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-
-	foffs = btf_parse_field_offs(map->record);
-	if (IS_ERR(foffs)) {
-		err = PTR_ERR(foffs);
-		goto free_map;
-	}
-	map->field_offs = foffs;
-
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_field_offs;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
@@ -1239,8 +1228,6 @@ static int map_create(union bpf_attr *attr)
 
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_field_offs:
-	kfree(map->field_offs);
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
-- 
cgit v1.2.3


From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:04 -0700
Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing

A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types
meant for use in BPF programs. Similarly to other opaque types like
bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in
user-defined struct types a bpf_refcount can be located, so necessary
btf_record plumbing is added to enable this. bpf_refcount is sized to
hold a refcount_t.

Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in
btf_record as refcount_off in addition to being in the field array.
Caching refcount_off makes sense for this field because further patches
in the series will modify functions that take local kptrs (e.g.
bpf_obj_drop) to change their behavior if the type they're operating on
is refcounted. So enabling fast "is this type refcounted?" checks is
desirable.

No such verifier behavior changes are introduced in this patch, just
logic to recognize 'struct bpf_refcount' in btf_record.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 ++++++++
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/btf.c               | 12 +++++++++++-
 kernel/bpf/syscall.c           |  6 +++++-
 tools/include/uapi/linux/bpf.h |  4 ++++
 5 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7888ed497432..be44d765b7a4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,7 @@ enum btf_field_type {
 	BPF_RB_NODE    = (1 << 7),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_REFCOUNT   = (1 << 8),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -223,6 +224,7 @@ struct btf_record {
 	u32 field_mask;
 	int spin_lock_off;
 	int timer_off;
+	int refcount_off;
 	struct btf_field fields[];
 };
 
@@ -293,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_root";
 	case BPF_RB_NODE:
 		return "bpf_rb_node";
+	case BPF_REFCOUNT:
+		return "bpf_refcount";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -317,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return sizeof(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return sizeof(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -341,6 +347,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return __alignof__(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return __alignof__(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3823100b7934..4b20a7269bee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6985,6 +6985,10 @@ struct bpf_rb_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_refcount {
+	__u32 :32;
+} __attribute__((aligned(4)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f3c998feeccb..14889fd5ba8e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3391,6 +3391,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
 	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
+	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & BPF_KPTR) {
@@ -3439,6 +3440,7 @@ static int btf_find_struct_field(const struct btf *btf,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, member_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3504,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, var_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3734,6 +3737,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
+	rec->refcount_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
 		if (info_arr[i].off + field_type_size > value_size) {
@@ -3763,6 +3767,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			/* Cache offset for faster lookup at runtime */
 			rec->timer_off = rec->fields[i].offset;
 			break;
+		case BPF_REFCOUNT:
+			WARN_ON_ONCE(rec->refcount_off >= 0);
+			/* Cache offset for faster lookup at runtime */
+			rec->refcount_off = rec->fields[i].offset;
+			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
@@ -5308,6 +5317,7 @@ static const char *alloc_obj_fields[] = {
 	"bpf_list_node",
 	"bpf_rb_root",
 	"bpf_rb_node",
+	"bpf_refcount",
 };
 
 static struct btf_struct_metas *
@@ -5381,7 +5391,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
 		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
-						  BPF_RB_ROOT | BPF_RB_NODE, t->size);
+						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c08b7933bf8f..28eac7434d32 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -552,6 +552,7 @@ void btf_record_free(struct btf_record *rec)
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_REFCOUNT:
 			/* Nothing to release */
 			break;
 		default:
@@ -599,6 +600,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_REFCOUNT:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -705,6 +707,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			break;
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -1032,7 +1035,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
-				       BPF_RB_ROOT,
+				       BPF_RB_ROOT | BPF_REFCOUNT,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
@@ -1071,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3823100b7934..4b20a7269bee 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6985,6 +6985,10 @@ struct bpf_rb_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_refcount {
+	__u32 :32;
+} __attribute__((aligned(4)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
cgit v1.2.3


From 1512217c47f0e8ea076dd0e67262e5a668a78f01 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:05 -0700
Subject: bpf: Support refcounted local kptrs in existing semantics

A local kptr is considered 'refcounted' when it is of a type that has a
bpf_refcount field. When such a kptr is created, its refcount should be
initialized to 1; when destroyed, the object should be free'd only if a
refcount decr results in 0 refcount.

Existing logic always frees the underlying memory when destroying a
local kptr, and 0-initializes all btf_record fields. This patch adds
checks for "is local kptr refcounted?" and new logic for that case in
the appropriate places.

This patch focuses on changing existing semantics and thus conspicuously
does _not_ provide a way for BPF programs in increment refcount. That
follows later in the series.

__bpf_obj_drop_impl is modified to do the right thing when it sees a
refcounted type. Container types for graph nodes (list, tree, stashed in
map) are migrated to use __bpf_obj_drop_impl as a destructor for their
nodes instead of each having custom destruction code in their _free
paths. Now that "drop" isn't a synonym for "free" when the type is
refcounted it makes sense to centralize this logic.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  3 +++
 kernel/bpf/helpers.c | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be44d765b7a4..b065de2cfcea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -370,6 +370,9 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 		return;
 	for (i = 0; i < rec->cnt; i++)
 		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
+
+	if (rec->refcount_off >= 0)
+		refcount_set((refcount_t *)(obj + rec->refcount_off), 1);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e989b6460147..e2dbd9644e5c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1798,6 +1798,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
@@ -1828,13 +1830,8 @@ unlock:
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		bpf_obj_free_fields(field->graph_root.value_rec, obj);
-		/* bpf_mem_free requires migrate_disable(), since we can be
-		 * called from map free path as well apart from BPF program (as
-		 * part of map ops doing bpf_obj_free_fields).
-		 */
 		migrate_disable();
-		bpf_mem_free(&bpf_global_ma, obj);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
 		migrate_enable();
 	}
 }
@@ -1871,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		obj = pos;
 		obj -= field->graph_root.node_offset;
 
-		bpf_obj_free_fields(field->graph_root.value_rec, obj);
 
 		migrate_disable();
-		bpf_mem_free(&bpf_global_ma, obj);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
 		migrate_enable();
 	}
 }
@@ -1897,8 +1893,17 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 {
+	if (rec && rec->refcount_off >= 0 &&
+	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+		/* Object is refcounted and refcount_dec didn't result in 0
+		 * refcount. Return without freeing the object
+		 */
+		return;
+	}
+
 	if (rec)
 		bpf_obj_free_fields(rec, p);
 	bpf_mem_free(&bpf_global_ma, p);
-- 
cgit v1.2.3


From d2dcc67df910dd85253a701b6a5b747f955d28f5 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:07 -0700
Subject: bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to
 possibly fail

Consider this code snippet:

  struct node {
    long key;
    bpf_list_node l;
    bpf_rb_node r;
    bpf_refcount ref;
  }

  int some_bpf_prog(void *ctx)
  {
    struct node *n = bpf_obj_new(/*...*/), *m;

    bpf_spin_lock(&glock);

    bpf_rbtree_add(&some_tree, &n->r, /* ... */);
    m = bpf_refcount_acquire(n);
    bpf_rbtree_add(&other_tree, &m->r, /* ... */);

    bpf_spin_unlock(&glock);

    /* ... */
  }

After bpf_refcount_acquire, n and m point to the same underlying memory,
and that node's bpf_rb_node field is being used by the some_tree insert,
so overwriting it as a result of the second insert is an error. In order
to properly support refcounted nodes, the rbtree and list insert
functions must be allowed to fail. This patch adds such support.

The kfuncs bpf_rbtree_add, bpf_list_push_{front,back} are modified to
return an int indicating success/failure, with 0 -> success, nonzero ->
failure.

bpf_obj_drop on failure
=======================

Currently the only reason an insert can fail is the example above: the
bpf_{list,rb}_node is already in use. When such a failure occurs, the
insert kfuncs will bpf_obj_drop the input node. This allows the insert
operations to logically fail without changing their verifier owning ref
behavior, namely the unconditional release_reference of the input
owning ref.

With insert that always succeeds, ownership of the node is always passed
to the collection, since the node always ends up in the collection.

With a possibly-failed insert w/ bpf_obj_drop, ownership of the node
is always passed either to the collection (success), or to bpf_obj_drop
(failure). Regardless, it's correct to continue unconditionally
releasing the input owning ref, as something is always taking ownership
from the calling program on insert.

Keeping owning ref behavior unchanged results in a nice default UX for
insert functions that can fail. If the program's reaction to a failed
insert is "fine, just get rid of this owning ref for me and let me go
on with my business", then there's no reason to check for failure since
that's default behavior. e.g.:

  long important_failures = 0;

  int some_bpf_prog(void *ctx)
  {
    struct node *n, *m, *o; /* all bpf_obj_new'd */

    bpf_spin_lock(&glock);
    bpf_rbtree_add(&some_tree, &n->node, /* ... */);
    bpf_rbtree_add(&some_tree, &m->node, /* ... */);
    if (bpf_rbtree_add(&some_tree, &o->node, /* ... */)) {
      important_failures++;
    }
    bpf_spin_unlock(&glock);
  }

If we instead chose to pass ownership back to the program on failed
insert - by returning NULL on success or an owning ref on failure -
programs would always have to do something with the returned ref on
failure. The most likely action is probably "I'll just get rid of this
owning ref and go about my business", which ideally would look like:

  if (n = bpf_rbtree_add(&some_tree, &n->node, /* ... */))
    bpf_obj_drop(n);

But bpf_obj_drop isn't allowed in a critical section and inserts must
occur within one, so in reality error handling would become a
hard-to-parse mess.

For refcounted nodes, we can replicate the "pass ownership back to
program on failure" logic with this patch's semantics, albeit in an ugly
way:

  struct node *n = bpf_obj_new(/* ... */), *m;

  bpf_spin_lock(&glock);

  m = bpf_refcount_acquire(n);
  if (bpf_rbtree_add(&some_tree, &n->node, /* ... */)) {
    /* Do something with m */
  }

  bpf_spin_unlock(&glock);
  bpf_obj_drop(m);

bpf_refcount_acquire is used to simulate "return owning ref on failure".
This should be an uncommon occurrence, though.

Addition of two verifier-fixup'd args to collection inserts
===========================================================

The actual bpf_obj_drop kfunc is
bpf_obj_drop_impl(void *, struct btf_struct_meta *), with bpf_obj_drop
macro populating the second arg with 0 and the verifier later filling in
the arg during insn fixup.

Because bpf_rbtree_add and bpf_list_push_{front,back} now might do
bpf_obj_drop, these kfuncs need a btf_struct_meta parameter that can be
passed to bpf_obj_drop_impl.

Similarly, because the 'node' param to those insert functions is the
bpf_{list,rb}_node within the node type, and bpf_obj_drop expects a
pointer to the beginning of the node, the insert functions need to be
able to find the beginning of the node struct. A second
verifier-populated param is necessary: the offset of {list,rb}_node within the
node type.

These two new params allow the insert kfuncs to correctly call
__bpf_obj_drop_impl:

  beginning_of_node = bpf_rb_node_ptr - offset
  if (already_inserted)
    __bpf_obj_drop_impl(beginning_of_node, btf_struct_meta->record);

Similarly to other kfuncs with "hidden" verifier-populated params, the
insert functions are renamed with _impl prefix and a macro is provided
for common usage. For example, bpf_rbtree_add kfunc is now
bpf_rbtree_add_impl and bpf_rbtree_add is now a macro which sets
"hidden" args to 0.

Due to the two new args BPF progs will need to be recompiled to work
with the new _impl kfuncs.

This patch also rewrites the "hidden argument" explanation to more
directly say why the BPF program writer doesn't need to populate the
arguments with anything meaningful.

How does this new logic affect non-owning references?
=====================================================

Currently, non-owning refs are valid until the end of the critical
section in which they're created. We can make this guarantee because, if
a non-owning ref exists, the referent was added to some collection. The
collection will drop() its nodes when it goes away, but it can't go away
while our program is accessing it, so that's not a problem. If the
referent is removed from the collection in the same CS that it was added
in, it can't be bpf_obj_drop'd until after CS end. Those are the only
two ways to free the referent's memory and neither can happen until
after the non-owning ref's lifetime ends.

On first glance, having these collection insert functions potentially
bpf_obj_drop their input seems like it breaks the "can't be
bpf_obj_drop'd until after CS end" line of reasoning. But we care about
the memory not being _freed_ until end of CS end, and a previous patch
in the series modified bpf_obj_drop such that it doesn't free refcounted
nodes until refcount == 0. So the statement can be more accurately
rewritten as "can't be free'd until after CS end".

We can prove that this rewritten statement holds for any non-owning
reference produced by collection insert functions:

* If the input to the insert function is _not_ refcounted
  * We have an owning reference to the input, and can conclude it isn't
    in any collection
    * Inserting a node in a collection turns owning refs into
      non-owning, and since our input type isn't refcounted, there's no
      way to obtain additional owning refs to the same underlying
      memory
  * Because our node isn't in any collection, the insert operation
    cannot fail, so bpf_obj_drop will not execute
  * If bpf_obj_drop is guaranteed not to execute, there's no risk of
    memory being free'd

* Otherwise, the input to the insert function is refcounted
  * If the insert operation fails due to the node's list_head or rb_root
    already being in some collection, there was some previous successful
    insert which passed refcount to the collection
  * We have an owning reference to the input, it must have been
    acquired via bpf_refcount_acquire, which bumped the refcount
  * refcount must be >= 2 since there's a valid owning reference and the
    node is already in a collection
  * Insert triggering bpf_obj_drop will decr refcount to >= 1, never
    resulting in a free

So although we may do bpf_obj_drop during the critical section, this
will never result in memory being free'd, and no changes to non-owning
ref logic are needed in this patch.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                   |  7 ++-
 kernel/bpf/helpers.c                           | 65 +++++++++++++++------
 kernel/bpf/verifier.c                          | 78 ++++++++++++++++++--------
 tools/testing/selftests/bpf/bpf_experimental.h | 49 ++++++++++++----
 4 files changed, 148 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f03852b89d28..3dd29a53b711 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -464,7 +464,12 @@ struct bpf_insn_aux_data {
 		 */
 		struct bpf_loop_inline_state loop_inline_state;
 	};
-	u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */
+	union {
+		/* remember the size of type passed to bpf_obj_new to rewrite R1 */
+		u64 obj_new_size;
+		/* remember the offset of node field within type to rewrite */
+		u64 insert_off;
+	};
 	struct btf_struct_meta *kptr_struct_meta;
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 57ff8a60222c..5067f8d46872 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1931,7 +1931,8 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
 	return (void *)p__refcounted_kptr;
 }
 
-static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+			  bool tail, struct btf_record *rec, u64 off)
 {
 	struct list_head *n = (void *)node, *h = (void *)head;
 
@@ -1939,17 +1940,35 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea
 		INIT_LIST_HEAD(h);
 	if (unlikely(!n->next))
 		INIT_LIST_HEAD(n);
+	if (!list_empty(n)) {
+		/* Only called from BPF prog, no need to migrate_disable */
+		__bpf_obj_drop_impl(n - off, rec);
+		return -EINVAL;
+	}
+
 	tail ? list_add_tail(n, h) : list_add(n, h);
+
+	return 0;
 }
 
-__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+					 struct bpf_list_node *node,
+					 void *meta__ign, u64 off)
 {
-	return __bpf_list_add(node, head, false);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_list_add(node, head, false,
+			      meta ? meta->record : NULL, off);
 }
 
-__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+					struct bpf_list_node *node,
+					void *meta__ign, u64 off)
 {
-	return __bpf_list_add(node, head, true);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_list_add(node, head, true,
+			      meta ? meta->record : NULL, off);
 }
 
 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
@@ -1989,14 +2008,23 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
  * program
  */
-static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-			     void *less)
+static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			    void *less, struct btf_record *rec, u64 off)
 {
 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+	struct rb_node *parent = NULL, *n = (struct rb_node *)node;
 	bpf_callback_t cb = (bpf_callback_t)less;
-	struct rb_node *parent = NULL;
 	bool leftmost = true;
 
+	if (!n->__rb_parent_color)
+		RB_CLEAR_NODE(n);
+
+	if (!RB_EMPTY_NODE(n)) {
+		/* Only called from BPF prog, no need to migrate_disable */
+		__bpf_obj_drop_impl(n - off, rec);
+		return -EINVAL;
+	}
+
 	while (*link) {
 		parent = *link;
 		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
@@ -2007,15 +2035,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 		}
 	}
 
-	rb_link_node((struct rb_node *)node, parent, link);
-	rb_insert_color_cached((struct rb_node *)node,
-			       (struct rb_root_cached *)root, leftmost);
+	rb_link_node(n, parent, link);
+	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+	return 0;
 }
 
-__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+				    void *meta__ign, u64 off)
 {
-	__bpf_rbtree_add(root, node, (void *)less);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -2291,14 +2322,14 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_list_push_front)
-BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 
 #ifdef CONFIG_CGROUPS
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 29e106f7ccaa..736cb7cec0bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8500,10 +8500,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
 					 struct bpf_func_state *callee,
 					 int insn_idx)
 {
-	/* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+	/* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
 	 *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
 	 *
-	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
 	 * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
 	 * by this point, so look at 'root'
 	 */
@@ -9571,8 +9571,8 @@ enum special_kfunc_type {
 	KF_bpf_obj_new_impl,
 	KF_bpf_obj_drop_impl,
 	KF_bpf_refcount_acquire_impl,
-	KF_bpf_list_push_front,
-	KF_bpf_list_push_back,
+	KF_bpf_list_push_front_impl,
+	KF_bpf_list_push_back_impl,
 	KF_bpf_list_pop_front,
 	KF_bpf_list_pop_back,
 	KF_bpf_cast_to_kern_ctx,
@@ -9580,7 +9580,7 @@ enum special_kfunc_type {
 	KF_bpf_rcu_read_lock,
 	KF_bpf_rcu_read_unlock,
 	KF_bpf_rbtree_remove,
-	KF_bpf_rbtree_add,
+	KF_bpf_rbtree_add_impl,
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
@@ -9592,14 +9592,14 @@ BTF_SET_START(special_kfunc_set)
 BTF_ID(func, bpf_obj_new_impl)
 BTF_ID(func, bpf_obj_drop_impl)
 BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_cast_to_kern_ctx)
 BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -9611,8 +9611,8 @@ BTF_ID_LIST(special_kfunc_list)
 BTF_ID(func, bpf_obj_new_impl)
 BTF_ID(func, bpf_obj_drop_impl)
 BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_cast_to_kern_ctx)
@@ -9620,7 +9620,7 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rcu_read_lock)
 BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -9954,15 +9954,15 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
 
 static bool is_bpf_list_api_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
-	       btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+	return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_back];
 }
 
 static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
 	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
 	       btf_id == special_kfunc_list[KF_bpf_rbtree_first];
 }
@@ -9975,7 +9975,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
 
 static bool is_callback_calling_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
@@ -10016,12 +10016,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 
 	switch (node_field_type) {
 	case BPF_LIST_NODE:
-		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
-		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]);
+		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
 		break;
 	case BPF_RB_NODE:
 		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
-		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]);
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
 		break;
 	default:
 		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -10702,10 +10702,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
-	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
-	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
 		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+		insn_aux->insert_off = regs[BPF_REG_2].off;
 		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
@@ -10721,7 +10722,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_rbtree_add_callback_state);
 		if (err) {
@@ -14764,7 +14765,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 		       const struct bpf_reg_state *rcur,
 		       struct bpf_id_pair *idmap)
 {
-	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && 
+	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 	       check_ids(rold->id, rcur->id, idmap) &&
 	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
 }
@@ -17407,6 +17408,23 @@ static void specialize_kfunc(struct bpf_verifier_env *env,
 	}
 }
 
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+					    u16 struct_meta_reg,
+					    u16 node_offset_reg,
+					    struct bpf_insn *insn,
+					    struct bpf_insn *insn_buf,
+					    int *cnt)
+{
+	struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+	struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+	insn_buf[0] = addr[0];
+	insn_buf[1] = addr[1];
+	insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+	insn_buf[3] = *insn;
+	*cnt = 4;
+}
+
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
@@ -17453,6 +17471,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn_buf[1] = addr[1];
 		insn_buf[2] = *insn;
 		*cnt = 3;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+		int struct_meta_reg = BPF_REG_3;
+		int node_offset_reg = BPF_REG_4;
+
+		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+		if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+			struct_meta_reg = BPF_REG_4;
+			node_offset_reg = BPF_REG_5;
+		}
+
+		__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+						node_offset_reg, insn, insn_buf, cnt);
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 619afcab2ab0..209811b1993a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -14,7 +14,8 @@
  *	type ID of a struct in program BTF.
  *
  *	The 'local_type_id' parameter must be a known constant.
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	A pointer to an object of the type corresponding to the passed in
  *	'local_type_id', or NULL on failure.
@@ -28,7 +29,8 @@ extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
  *	Free an allocated object. All fields of the object that require
  *	destruction will be destructed before the storage is freed.
  *
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	Void.
  */
@@ -41,7 +43,8 @@ extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
  *	Increment the refcount on a refcounted local kptr, turning the
  *	non-owning reference input into an owning reference in the process.
  *
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	An owning reference to the object pointed to by 'kptr'
  */
@@ -52,17 +55,35 @@ extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
 
 /* Description
  *	Add a new entry to the beginning of the BPF linked list.
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Void.
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a list
  */
-extern void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+extern int bpf_list_push_front_impl(struct bpf_list_head *head,
+				    struct bpf_list_node *node,
+				    void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_list_push_front_impl */
+#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0)
 
 /* Description
  *	Add a new entry to the end of the BPF linked list.
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Void.
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a list
  */
-extern void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+extern int bpf_list_push_back_impl(struct bpf_list_head *head,
+				   struct bpf_list_node *node,
+				   void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_list_push_back_impl */
+#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0)
 
 /* Description
  *	Remove the entry at the beginning of the BPF linked list.
@@ -88,11 +109,19 @@ extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 
 /* Description
  *	Add 'node' to rbtree with root 'root' using comparator 'less'
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Nothing
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a tree
  */
-extern void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-			   bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym;
+extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+			       void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_rbtree_add_impl */
+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
 
 /* Description
  *	Return the first (leftmost) node in input tree
-- 
cgit v1.2.3


From 3e81740a90626024a9d9c6f9bfa3d36204dafefb Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:10 -0700
Subject: bpf: Centralize btf_field-specific initialization logic

All btf_fields in an object are 0-initialized by memset in
bpf_obj_init. This might not be a valid initial state for some field
types, in which case kfuncs that use the type will properly initialize
their input if it's been 0-initialized. Some BPF graph collection types
and kfuncs do this: bpf_list_{head,node} and bpf_rb_node.

An earlier patch in this series added the bpf_refcount field, for which
the 0 state indicates that the refcounted object should be free'd.
bpf_obj_init treats this field specially, setting refcount to 1 instead
of relying on scattered "refcount is 0? Must have just been initialized,
let's set to 1" logic in kfuncs.

This patch extends this treatment to list and rbtree field types,
allowing most scattered initialization logic in kfuncs to be removed.

Note that bpf_{list_head,rb_root} may be inside a BPF map, in which case
they'll be 0-initialized without passing through the newly-added logic,
so scattered initialization logic must remain for these collection root
types.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-9-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 33 +++++++++++++++++++++++++++++----
 kernel/bpf/helpers.c | 14 ++++++--------
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b065de2cfcea..18b592fde896 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,34 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 	}
 }
 
+static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
+{
+	memset(addr, 0, field->size);
+
+	switch (field->type) {
+	case BPF_REFCOUNT:
+		refcount_set((refcount_t *)addr, 1);
+		break;
+	case BPF_RB_NODE:
+		RB_CLEAR_NODE((struct rb_node *)addr);
+		break;
+	case BPF_LIST_HEAD:
+	case BPF_LIST_NODE:
+		INIT_LIST_HEAD((struct list_head *)addr);
+		break;
+	case BPF_RB_ROOT:
+		/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
+	case BPF_SPIN_LOCK:
+	case BPF_TIMER:
+	case BPF_KPTR_UNREF:
+	case BPF_KPTR_REF:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+}
+
 static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type)
 {
 	if (IS_ERR_OR_NULL(rec))
@@ -369,10 +397,7 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 	if (IS_ERR_OR_NULL(rec))
 		return;
 	for (i = 0; i < rec->cnt; i++)
-		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
-
-	if (rec->refcount_off >= 0)
-		refcount_set((refcount_t *)(obj + rec->refcount_off), 1);
+		bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1835df333287..00e5fb0682ac 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1936,10 +1936,11 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head
 {
 	struct list_head *n = (void *)node, *h = (void *)head;
 
+	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+	 * called on its fields, so init here
+	 */
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
-	if (unlikely(!n->next))
-		INIT_LIST_HEAD(n);
 	if (!list_empty(n)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl(n - off, rec);
@@ -1975,6 +1976,9 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
 {
 	struct list_head *n, *h = (void *)head;
 
+	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+	 * called on its fields, so init here
+	 */
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
 	if (list_empty(h))
@@ -2000,9 +2004,6 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 	struct rb_root_cached *r = (struct rb_root_cached *)root;
 	struct rb_node *n = (struct rb_node *)node;
 
-	if (!n->__rb_parent_color)
-		RB_CLEAR_NODE(n);
-
 	if (RB_EMPTY_NODE(n))
 		return NULL;
 
@@ -2022,9 +2023,6 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 	bpf_callback_t cb = (bpf_callback_t)less;
 	bool leftmost = true;
 
-	if (!n->__rb_parent_color)
-		RB_CLEAR_NODE(n);
-
 	if (!RB_EMPTY_NODE(n)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl(n - off, rec);
-- 
cgit v1.2.3


From 8b0977ecc8b30a30966e76fcb64cef5041626b02 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Thu, 13 Apr 2023 10:57:37 -0700
Subject: swiotlb: track and report io_tlb_used high water marks in debugfs

swiotlb currently reports the total number of slabs and the instantaneous
in-use slabs in debugfs. But with increased usage of swiotlb for all I/O
in Confidential Computing (coco) VMs, it has become difficult to know
how much memory to allocate for swiotlb bounce buffers, either via the
automatic algorithm in the kernel or by specifying a value on the
kernel boot line. The current automatic algorithm generously allocates
swiotlb bounce buffer memory, and may be wasting significant memory in
many use cases.

To support better understanding of swiotlb usage, add tracking of the
the high water mark for usage of the default swiotlb bounce buffer memory
pool and any reserved memory pools. Report these high water marks in
debugfs along with the other swiotlb pool metrics.  Allow the high water
marks to be reset to zero at runtime by writing to them.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h |  7 ++++++
 kernel/dma/swiotlb.c    | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index bcef10e20ea4..6dc4598d2260 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -87,6 +87,11 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @for_alloc:  %true if the pool is used for memory allocation
  * @nareas:  The area number in the pool.
  * @area_nslabs: The slot number in the area.
+ * @total_used:	The total number of slots in the pool that are currently used
+ *		across all areas. Used only for calculating used_hiwater in
+ *		debugfs.
+ * @used_hiwater: The high water mark for total_used.  Used only for reporting
+ *		in debugfs.
  */
 struct io_tlb_mem {
 	phys_addr_t start;
@@ -102,6 +107,8 @@ struct io_tlb_mem {
 	unsigned int area_nslabs;
 	struct io_tlb_area *areas;
 	struct io_tlb_slot *slots;
+	atomic_long_t total_used;
+	atomic_long_t used_hiwater;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 938c959ab19e..9bbc2802a444 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -608,6 +608,40 @@ static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
 	return index;
 }
 
+/*
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
+ */
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+	unsigned long old_hiwater, new_used;
+
+	new_used = atomic_long_add_return(nslots, &mem->total_used);
+	old_hiwater = atomic_long_read(&mem->used_hiwater);
+	do {
+		if (new_used <= old_hiwater)
+			break;
+	} while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+					  &old_hiwater, new_used));
+}
+
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+	atomic_long_sub(nslots, &mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
 /*
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
@@ -702,6 +736,8 @@ found:
 	area->index = wrap_area_index(mem, index + nslots);
 	area->used += nslots;
 	spin_unlock_irqrestore(&area->lock, flags);
+
+	inc_used_and_hiwater(mem, nslots);
 	return slot_index;
 }
 
@@ -834,6 +870,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 		mem->slots[i].list = ++count;
 	area->used -= nslots;
 	spin_unlock_irqrestore(&area->lock, flags);
+
+	dec_used(mem, nslots);
 }
 
 /*
@@ -935,11 +973,37 @@ static int io_tlb_used_get(void *data, u64 *val)
 	*val = mem_used(mem);
 	return 0;
 }
+
+static int io_tlb_hiwater_get(void *data, u64 *val)
+{
+	struct io_tlb_mem *mem = data;
+
+	*val = atomic_long_read(&mem->used_hiwater);
+	return 0;
+}
+
+static int io_tlb_hiwater_set(void *data, u64 val)
+{
+	struct io_tlb_mem *mem = data;
+
+	/* Only allow setting to zero */
+	if (val != 0)
+		return -EINVAL;
+
+	atomic_long_set(&mem->used_hiwater, val);
+	return 0;
+}
+
 DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+				io_tlb_hiwater_set, "%llu\n");
 
 static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
 					 const char *dirname)
 {
+	atomic_long_set(&mem->total_used, 0);
+	atomic_long_set(&mem->used_hiwater, 0);
+
 	mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
 	if (!mem->nslabs)
 		return;
@@ -947,6 +1011,8 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
 	debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
 			&fops_io_tlb_used);
+	debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+			&fops_io_tlb_hiwater);
 }
 
 static int __init __maybe_unused swiotlb_create_default_debugfs(void)
-- 
cgit v1.2.3


From 8d7c7c0eeb74281c846ef9231ce20536c79a99b4 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 14 Apr 2023 10:58:29 -0300
Subject: RDMA: Add ib_virt_dma_to_page()

Make it clearer what is going on by adding a function to go back from the
"virtual" dma_addr to a kva and another to a struct page. This is used in the
ib_uses_virt_dma() style drivers (siw, rxe, hfi, qib).

Call them instead of a naked casting and  virt_to_page() when working with dma_addr
values encoded by the various ib_map functions.

This also fixes the virt_to_page() casting problem Linus Walleij has been
chasing.

Cc: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/0-v2-05ea785520ed+10-ib_virt_page_jgg@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/sw/rxe/rxe_mr.c    | 16 ++++++++--------
 drivers/infiniband/sw/rxe/rxe_verbs.c |  2 +-
 drivers/infiniband/sw/siw/siw_qp_rx.c |  6 +++---
 drivers/infiniband/sw/siw/siw_qp_tx.c | 19 ++++++-------------
 drivers/infiniband/sw/siw/siw_verbs.c |  4 ++--
 include/rdma/ib_verbs.h               | 25 +++++++++++++++++++++++++
 6 files changed, 45 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 1e17f8086d59..0e538fafcc20 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -210,10 +210,10 @@ err1:
 	return err;
 }
 
-static int rxe_set_page(struct ib_mr *ibmr, u64 iova)
+static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
 {
 	struct rxe_mr *mr = to_rmr(ibmr);
-	struct page *page = virt_to_page(iova & mr->page_mask);
+	struct page *page = ib_virt_dma_to_page(dma_addr);
 	bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
 	int err;
 
@@ -279,16 +279,16 @@ static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,
 	return 0;
 }
 
-static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr,
+static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr,
 			    unsigned int length, enum rxe_mr_copy_dir dir)
 {
-	unsigned int page_offset = iova & (PAGE_SIZE - 1);
+	unsigned int page_offset = dma_addr & (PAGE_SIZE - 1);
 	unsigned int bytes;
 	struct page *page;
 	u8 *va;
 
 	while (length) {
-		page = virt_to_page(iova & mr->page_mask);
+		page = ib_virt_dma_to_page(dma_addr);
 		bytes = min_t(unsigned int, length,
 				PAGE_SIZE - page_offset);
 		va = kmap_local_page(page);
@@ -300,7 +300,7 @@ static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr,
 
 		kunmap_local(va);
 		page_offset = 0;
-		iova += bytes;
+		dma_addr += bytes;
 		addr += bytes;
 		length -= bytes;
 	}
@@ -488,7 +488,7 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 
 	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
 		page_offset = iova & (PAGE_SIZE - 1);
-		page = virt_to_page(iova & PAGE_MASK);
+		page = ib_virt_dma_to_page(iova);
 	} else {
 		unsigned long index;
 		int err;
@@ -545,7 +545,7 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 
 	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
 		page_offset = iova & (PAGE_SIZE - 1);
-		page = virt_to_page(iova & PAGE_MASK);
+		page = ib_virt_dma_to_page(iova);
 	} else {
 		unsigned long index;
 		int err;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 4e2db7c2e4ed..12819153bda7 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -760,7 +760,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe,
 	int i;
 
 	for (i = 0; i < ibwr->num_sge; i++, sge++) {
-		memcpy(p, (void *)(uintptr_t)sge->addr, sge->length);
+		memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length);
 		p += sge->length;
 	}
 }
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
index fd721cc19682..58bbf738e4e5 100644
--- a/drivers/infiniband/sw/siw/siw_qp_rx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -139,7 +139,7 @@ static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
 			break;
 
 		bytes = min(bytes, len);
-		if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) ==
+		if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf_addr), bytes) ==
 		    bytes) {
 			copied += bytes;
 			offset += bytes;
@@ -487,7 +487,7 @@ int siw_proc_send(struct siw_qp *qp)
 		mem_p = *mem;
 		if (mem_p->mem_obj == NULL)
 			rv = siw_rx_kva(srx,
-				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
+				ib_virt_dma_to_ptr(sge->laddr + frx->sge_off),
 				sge_bytes);
 		else if (!mem_p->is_pbl)
 			rv = siw_rx_umem(srx, mem_p->umem,
@@ -852,7 +852,7 @@ int siw_proc_rresp(struct siw_qp *qp)
 
 	if (mem_p->mem_obj == NULL)
 		rv = siw_rx_kva(srx,
-			(void *)(uintptr_t)(sge->laddr + wqe->processed),
+			ib_virt_dma_to_ptr(sge->laddr + wqe->processed),
 			bytes);
 	else if (!mem_p->is_pbl)
 		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index 6bb9e9e81ff4..4b292e0504f1 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -29,7 +29,7 @@ static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
 	dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
 
 	if (paddr)
-		return virt_to_page((void *)(uintptr_t)paddr);
+		return ib_virt_dma_to_page(paddr);
 
 	return NULL;
 }
@@ -56,8 +56,7 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
 
 		if (!mem->mem_obj) {
 			/* Kernel client using kva */
-			memcpy(paddr,
-			       (const void *)(uintptr_t)sge->laddr, bytes);
+			memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes);
 		} else if (c_tx->in_syscall) {
 			if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
 					   bytes))
@@ -477,7 +476,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
 			 * or memory region with assigned kernel buffer
 			 */
 			iov[seg].iov_base =
-				(void *)(uintptr_t)(sge->laddr + sge_off);
+				ib_virt_dma_to_ptr(sge->laddr + sge_off);
 			iov[seg].iov_len = sge_len;
 
 			if (do_crc)
@@ -537,19 +536,13 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
 				 * Cast to an uintptr_t to preserve all 64 bits
 				 * in sge->laddr.
 				 */
-				uintptr_t va = (uintptr_t)(sge->laddr + sge_off);
+				u64 va = sge->laddr + sge_off;
 
-				/*
-				 * virt_to_page() takes a (void *) pointer
-				 * so cast to a (void *) meaning it will be 64
-				 * bits on a 64 bit platform and 32 bits on a
-				 * 32 bit platform.
-				 */
-				page_array[seg] = virt_to_page((void *)(va & PAGE_MASK));
+				page_array[seg] = ib_virt_dma_to_page(va);
 				if (do_crc)
 					crypto_shash_update(
 						c_tx->mpa_crc_hd,
-						(void *)va,
+						ib_virt_dma_to_ptr(va),
 						plen);
 			}
 
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 906fde1a2a0d..398ec13db624 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -660,7 +660,7 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 			bytes = -EINVAL;
 			break;
 		}
-		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
+		memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr),
 		       core_sge->length);
 
 		kbuf += core_sge->length;
@@ -1523,7 +1523,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
 		}
 		siw_dbg_mem(mem,
 			"sge[%d], size %u, addr 0x%p, total %lu\n",
-			i, pble->size, (void *)(uintptr_t)pble->addr,
+			i, pble->size, ib_virt_dma_to_ptr(pble->addr),
 			pbl_size);
 	}
 	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 949cf4ffc536..1e7774ac808f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4035,6 +4035,31 @@ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev)
 	return dma_pci_p2pdma_supported(dev->dma_device);
 }
 
+/**
+ * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer
+ * @dma_addr: The DMA address
+ *
+ * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after
+ * going through the dma_addr marshalling.
+ */
+static inline void *ib_virt_dma_to_ptr(u64 dma_addr)
+{
+	/* virt_dma mode maps the kvs's directly into the dma addr */
+	return (void *)(uintptr_t)dma_addr;
+}
+
+/**
+ * ib_virt_dma_to_page - Convert a dma_addr to a struct page
+ * @dma_addr: The DMA address
+ *
+ * Used by ib_uses_virt_dma() device to get back to the struct page after going
+ * through the dma_addr marshalling.
+ */
+static inline struct page *ib_virt_dma_to_page(u64 dma_addr)
+{
+	return virt_to_page(ib_virt_dma_to_ptr(dma_addr));
+}
+
 /**
  * ib_dma_mapping_error - check a DMA addr for error
  * @dev: The device for which the dma_addr was created
-- 
cgit v1.2.3


From ccbbfe0682f2fff1e157413c30092dd27c50e20e Mon Sep 17 00:00:00 2001
From: Avihai Horon <avihaih@nvidia.com>
Date: Mon, 10 Apr 2023 16:07:52 +0300
Subject: net/mlx5: Update relaxed ordering read HCA capabilities

Rename existing HCA capability relaxed_ordering_read to
relaxed_ordering_read_pci_enabled. This is in accordance with recent PRM
change to better describe the capability, as it's set only if both the
device supports relaxed ordering (RO) read and RO is enabled in PCI
config space.

In addition, add new HCA capability relaxed_ordering_read which is set
if the device supports RO read, regardless of RO in PCI config space.
This will be used in the following patch to allow RO in VFs and VMs.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Link: https://lore.kernel.org/r/caa0002fd8135086357dfcc368e2f5cc73b08480.1681131553.git.leon@kernel.org
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/mr.c                     | 5 +++--
 drivers/infiniband/hw/mlx5/umr.h                    | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 2 +-
 include/linux/mlx5/mlx5_ifc.h                       | 5 +++--
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index bd0a818ba1cd..6a3a8e00bfaa 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -70,7 +70,8 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 	if (acc & IB_ACCESS_RELAXED_ORDERING) {
 		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
 			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
-		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+		if (MLX5_CAP_GEN(dev->mdev,
+				 relaxed_ordering_read_pci_enabled) &&
 		    pcie_relaxed_ordering_enabled(dev->mdev->pdev))
 			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
 	}
@@ -791,7 +792,7 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
 		ret |= IB_ACCESS_RELAXED_ORDERING;
 
 	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
-	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled) &&
 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 		ret |= IB_ACCESS_RELAXED_ORDERING;
 
diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h
index c9d0021381a2..e12ecd7e079c 100644
--- a/drivers/infiniband/hw/mlx5/umr.h
+++ b/drivers/infiniband/hw/mlx5/umr.h
@@ -62,7 +62,7 @@ static inline bool mlx5r_umr_can_reconfig(struct mlx5_ib_dev *dev,
 		return false;
 
 	if ((diffs & IB_ACCESS_RELAXED_ORDERING) &&
-	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled) &&
 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 		return false;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 993af4c12d90..3c765a1f91a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -41,7 +41,7 @@ void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
 {
 	bool ro_pci_enable = pcie_relaxed_ordering_enabled(mdev->pdev);
 	bool ro_write = MLX5_CAP_GEN(mdev, relaxed_ordering_write);
-	bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read);
+	bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read_pci_enabled);
 
 	MLX5_SET(mkc, mkc, relaxed_ordering_read, ro_pci_enable && ro_read);
 	MLX5_SET(mkc, mkc, relaxed_ordering_write, ro_write);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e4306cd87cd7..b54339a1b1c6 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1511,7 +1511,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_max_eq_sz[0x8];
 	u8         relaxed_ordering_write[0x1];
-	u8         relaxed_ordering_read[0x1];
+	u8         relaxed_ordering_read_pci_enabled[0x1];
 	u8         log_max_mkey[0x6];
 	u8         reserved_at_f0[0x6];
 	u8	   terminate_scatter_list_mkey[0x1];
@@ -1727,7 +1727,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_320[0x3];
 	u8         log_max_transport_domain[0x5];
-	u8         reserved_at_328[0x3];
+	u8         reserved_at_328[0x2];
+	u8	   relaxed_ordering_read[0x1];
 	u8         log_max_pd[0x5];
 	u8         reserved_at_330[0x9];
 	u8         q_counter_aggregation[0x1];
-- 
cgit v1.2.3


From 3f67987cdc09778e75098f9f5168832f8f8e1f1c Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry.memverge@gmail.com>
Date: Fri, 7 Apr 2023 13:18:33 -0400
Subject: ptrace: Provide set/get interface for syscall user dispatch

The syscall user dispatch configuration can only be set by the task itself,
but lacks a ptrace set/get interface which makes it impossible to implement
checkpoint/restore for it.

Add the required ptrace requests and the get/set functions in the syscall
user dispatch code to make that possible.

Signed-off-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20230407171834.3558-4-gregory.price@memverge.com
---
 .../admin-guide/syscall-user-dispatch.rst          |  4 +++
 include/linux/syscall_user_dispatch.h              | 18 ++++++++++
 include/uapi/linux/ptrace.h                        | 30 ++++++++++++++++
 kernel/entry/syscall_user_dispatch.c               | 40 ++++++++++++++++++++++
 kernel/ptrace.c                                    |  9 +++++
 5 files changed, 101 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index 60314953c728..e3cfffef5a63 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -73,6 +73,10 @@ thread-wide, without the need to invoke the kernel directly.  selector
 can be set to SYSCALL_DISPATCH_FILTER_ALLOW or SYSCALL_DISPATCH_FILTER_BLOCK.
 Any other value should terminate the program with a SIGSYS.
 
+Additionally, a tasks syscall user dispatch configuration can be peeked
+and poked via the PTRACE_(GET|SET)_SYSCALL_USER_DISPATCH_CONFIG ptrace
+requests. This is useful for checkpoint/restart software.
+
 Security Notes
 --------------
 
diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h
index a0ae443fb7df..641ca8880995 100644
--- a/include/linux/syscall_user_dispatch.h
+++ b/include/linux/syscall_user_dispatch.h
@@ -22,6 +22,12 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 #define clear_syscall_work_syscall_user_dispatch(tsk) \
 	clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH)
 
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+				     void __user *data);
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+				     void __user *data);
+
 #else
 struct syscall_user_dispatch {};
 
@@ -35,6 +41,18 @@ static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *
 {
 }
 
+static inline int syscall_user_dispatch_get_config(struct task_struct *task,
+						   unsigned long size, void __user *data)
+{
+	return -EINVAL;
+}
+
+static inline int syscall_user_dispatch_set_config(struct task_struct *task,
+						   unsigned long size, void __user *data)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_GENERIC_ENTRY */
 
 #endif /* _SYSCALL_USER_DISPATCH_H */
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 195ae64a8c87..72c038fc71d0 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -112,6 +112,36 @@ struct ptrace_rseq_configuration {
 	__u32 pad;
 };
 
+#define PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG 0x4210
+#define PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG 0x4211
+
+/*
+ * struct ptrace_sud_config - Per-task configuration for Syscall User Dispatch
+ * @mode:	One of PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF
+ * @selector:	Tracees user virtual address of SUD selector
+ * @offset:	SUD exclusion area (virtual address)
+ * @len:	Length of SUD exclusion area
+ *
+ * Used to get/set the syscall user dispatch configuration for a tracee.
+ * Selector is optional (may be NULL), and if invalid will produce
+ * a SIGSEGV in the tracee upon first access.
+ *
+ * If mode is PR_SYS_DISPATCH_ON, syscall dispatch will be enabled. If
+ * PR_SYS_DISPATCH_OFF, syscall dispatch will be disabled and all other
+ * parameters must be 0.  The value in *selector (if not null), also determines
+ * whether syscall dispatch will occur.
+ *
+ * The Syscall User Dispatch Exclusion area described by offset/len is the
+ * virtual address space from which syscalls will not produce a user
+ * dispatch.
+ */
+struct ptrace_sud_config {
+	__u64 mode;
+	__u64 selector;
+	__u64 offset;
+	__u64 len;
+};
+
 /*
  * These values are stored in task->ptrace_message
  * by ptrace_stop to describe the current syscall-stop.
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 7f2add43672d..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
  */
 #include <linux/sched.h>
 #include <linux/prctl.h>
+#include <linux/ptrace.h>
 #include <linux/syscall_user_dispatch.h>
 #include <linux/uaccess.h>
 #include <linux/signal.h>
@@ -122,3 +123,42 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 {
 	return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
 }
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+				     void __user *data)
+{
+	struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+	struct ptrace_sud_config cfg;
+
+	if (size != sizeof(cfg))
+		return -EINVAL;
+
+	if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+		cfg.mode = PR_SYS_DISPATCH_ON;
+	else
+		cfg.mode = PR_SYS_DISPATCH_OFF;
+
+	cfg.offset = sd->offset;
+	cfg.len = sd->len;
+	cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+	if (copy_to_user(data, &cfg, sizeof(cfg)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+				     void __user *data)
+{
+	struct ptrace_sud_config cfg;
+
+	if (size != sizeof(cfg))
+		return -EINVAL;
+
+	if (copy_from_user(&cfg, data, sizeof(cfg)))
+		return -EFAULT;
+
+	return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+					      (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0786450074c1..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 #include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
 
 #include <asm/syscall.h>	/* for syscall_get_* */
 
@@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
 		break;
 #endif
 
+	case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+		ret = syscall_user_dispatch_set_config(child, addr, datavp);
+		break;
+
+	case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+		ret = syscall_user_dispatch_get_config(child, addr, datavp);
+		break;
+
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 7b4ddf3920d247c2949073b9c274301c8131332a Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Sun, 16 Apr 2023 03:49:27 -0500
Subject: bpf: Remove KF_KPTR_GET kfunc flag

We've managed to improve the UX for kptrs significantly over the last 9
months. All of the existing use cases which previously had KF_KPTR_GET
kfuncs (struct bpf_cpumask *, struct task_struct *, and struct cgroup *)
have all been updated to be synchronized using RCU. In other words,
their KF_KPTR_GET kfuncs have been removed in favor of KF_RCU |
KF_ACQUIRE kfuncs, with the pointers themselves also being readable from
maps in an RCU read region thanks to the types being RCU safe.

While KF_KPTR_GET was a logical starting point for kptrs, it's become
clear that they're not the correct abstraction. KF_KPTR_GET is a flag
that essentially does nothing other than enforcing that the argument to
a function is a pointer to a referenced kptr map value. At first glance,
that's a useful thing to guarantee to a kfunc. It gives kfuncs the
ability to try and acquire a reference on that kptr without requiring
the BPF prog to do something like this:

struct kptr_type *in_map, *new = NULL;

in_map = bpf_kptr_xchg(&map->value, NULL);
if (in_map) {
        new = bpf_kptr_type_acquire(in_map);
        in_map = bpf_kptr_xchg(&map->value, in_map);
        if (in_map)
                bpf_kptr_type_release(in_map);
}

That's clearly a pretty ugly (and racy) UX, and if using KF_KPTR_GET is
the only alternative, it's better than nothing. However, the problem
with any KF_KPTR_GET kfunc lies in the fact that it always requires some
kind of synchronization in order to safely do an opportunistic acquire
of the kptr in the map. This is because a BPF program running on another
CPU could do a bpf_kptr_xchg() on that map value, and free the kptr
after it's been read by the KF_KPTR_GET kfunc. For example, the
now-removed bpf_task_kptr_get() kfunc did the following:

struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
{
            struct task_struct *p;

        rcu_read_lock();
        p = READ_ONCE(*pp);
        /* If p is non-NULL, it could still be freed by another CPU,
         * so we have to do an opportunistic refcount_inc_not_zero()
         * and return NULL if the task will be freed after the
         * current RCU read region.
         */
        |f (p && !refcount_inc_not_zero(&p->rcu_users))
                p = NULL;
        rcu_read_unlock();

        return p;
}

In other words, the kfunc uses RCU to ensure that the task remains valid
after it's been peeked from the map. However, this is completely
redundant with just defining a KF_RCU kfunc that itself does a
refcount_inc_not_zero(), which is exactly what bpf_task_acquire() now
does.

So, the question of whether KF_KPTR_GET is useful is actually, "Are
there any synchronization mechanisms / safety flags that are required by
certain kptrs, but which are not provided by the verifier to kfuncs?"
The answer to that question today is "No", because every kptr we
currently care about is RCU protected.

Even if the answer ever became "yes", the proper way to support that
referenced kptr type would be to add support for whatever
synchronization mechanism it requires in the verifier, rather than
giving kfuncs a flag that says, "Here's a pointer to a referenced kptr
in a map, do whatever you need to do."

With all that said -- so as to allow us to consolidate the kfunc API,
and simplify the verifier a bit, this patch removes KF_KPTR_GET, and all
relevant logic from the verifier.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230416084928.326135-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   |  1 -
 kernel/bpf/verifier.c | 65 ---------------------------------------------------
 2 files changed, 66 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 813227bff58a..508199e38415 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -18,7 +18,6 @@
 #define KF_ACQUIRE	(1 << 0) /* kfunc is an acquire function */
 #define KF_RELEASE	(1 << 1) /* kfunc is a release function */
 #define KF_RET_NULL	(1 << 2) /* kfunc returns a pointer that may be NULL */
-#define KF_KPTR_GET	(1 << 3) /* kfunc returns reference to a kptr */
 /* Trusted arguments are those which are guaranteed to be valid when passed to
  * the kfunc. It is used to enforce that pointers obtained from either acquire
  * kfuncs, or from the main kernel on a tracepoint or struct_ops callback
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a41b69a424e..5dae11ee01c3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9339,11 +9339,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU;
 }
 
-static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
-{
-	return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
-}
-
 static bool __kfunc_param_match_suffix(const struct btf *btf,
 				       const struct btf_param *arg,
 				       const char *suffix)
@@ -9554,7 +9549,6 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_CTX,
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,    /* Allocated object */
 	KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
-	KF_ARG_PTR_TO_KPTR,	       /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
 	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
@@ -9666,21 +9660,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
 
-	if (is_kfunc_arg_kptr_get(meta, argno)) {
-		if (!btf_type_is_ptr(ref_t)) {
-			verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
-			return -EINVAL;
-		}
-		ref_t = btf_type_by_id(meta->btf, ref_t->type);
-		ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
-		if (!btf_type_is_struct(ref_t)) {
-			verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
-				meta->func_name, btf_type_str(ref_t), ref_tname);
-			return -EINVAL;
-		}
-		return KF_ARG_PTR_TO_KPTR;
-	}
-
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
@@ -9794,40 +9773,6 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
-				      struct bpf_reg_state *reg,
-				      const struct btf_type *ref_t,
-				      const char *ref_tname,
-				      struct bpf_kfunc_call_arg_meta *meta,
-				      int argno)
-{
-	struct btf_field *kptr_field;
-
-	/* check_func_arg_reg_off allows var_off for
-	 * PTR_TO_MAP_VALUE, but we need fixed offset to find
-	 * off_desc.
-	 */
-	if (!tnum_is_const(reg->var_off)) {
-		verbose(env, "arg#0 must have constant offset\n");
-		return -EINVAL;
-	}
-
-	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
-	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
-		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
-			reg->off + reg->var_off.value);
-		return -EINVAL;
-	}
-
-	if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
-				  kptr_field->kptr.btf_id, true)) {
-		verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
-			meta->func_name, argno, btf_type_str(ref_t), ref_tname);
-		return -EINVAL;
-	}
-	return 0;
-}
-
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -10315,7 +10260,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			/* Trusted arguments have the same offset checks as release arguments */
 			arg_type |= OBJ_RELEASE;
 			break;
-		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
 		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
@@ -10368,15 +10312,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				meta->arg_obj_drop.btf_id = reg->btf_id;
 			}
 			break;
-		case KF_ARG_PTR_TO_KPTR:
-			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#0 expected pointer to map value\n");
-				return -EINVAL;
-			}
-			ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
-			if (ret < 0)
-				return ret;
-			break;
 		case KF_ARG_PTR_TO_DYNPTR:
 		{
 			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
-- 
cgit v1.2.3


From 3acea5fc335420ba7ef53947cf2d98d07fac39f7 Mon Sep 17 00:00:00 2001
From: Jingbo Xu <jefflexu@linux.alibaba.com>
Date: Mon, 13 Mar 2023 21:53:08 +0800
Subject: erofs: avoid hardcoded blocksize for subpage block support

As the first step of converting hardcoded blocksize to that specified in
on-disk superblock, convert all call sites of hardcoded blocksize to
sb->s_blocksize except for:

1) use sbi->blkszbits instead of sb->s_blocksize in
erofs_superblock_csum_verify() since sb->s_blocksize has not been
updated with the on-disk blocksize yet when the function is called.

2) use inode->i_blkbits instead of sb->s_blocksize in erofs_bread(),
since the inode operated on may be an anonymous inode in fscache mode.
Currently the anonymous inode is allocated from an anonymous mount
maintained in erofs, while in the near future we may allocate anonymous
inodes from a generic API directly and thus have no access to the
anonymous inode's i_sb.  Thus we keep the block size in i_blkbits for
anonymous inodes in fscache mode.

Be noted that this patch only gets rid of the hardcoded blocksize, in
preparation for actually setting the on-disk block size in the following
patch.  The hard limit of constraining the block size to PAGE_SIZE still
exists until the next patch.

Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Link: https://lore.kernel.org/r/20230313135309.75269-2-jefflexu@linux.alibaba.com
[ Gao Xiang: fold a patch to fix incorrect truncated offsets. ]
Link: https://lore.kernel.org/r/20230413035734.15457-1-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/data.c              | 50 ++++++++++++++++++++++++--------------------
 fs/erofs/decompressor.c      |  6 +++---
 fs/erofs/decompressor_lzma.c |  4 ++--
 fs/erofs/dir.c               | 22 +++++++++----------
 fs/erofs/fscache.c           |  5 +++--
 fs/erofs/inode.c             | 20 ++++++++++--------
 fs/erofs/internal.h          | 20 +++++++-----------
 fs/erofs/namei.c             | 14 ++++++-------
 fs/erofs/super.c             | 27 ++++++++++++++----------
 fs/erofs/xattr.c             | 40 +++++++++++++++++------------------
 fs/erofs/xattr.h             | 10 ++++-----
 fs/erofs/zdata.c             | 18 +++++++++-------
 fs/erofs/zmap.c              | 29 ++++++++++++-------------
 include/trace/events/erofs.h |  4 ++--
 14 files changed, 137 insertions(+), 132 deletions(-)

(limited to 'include')

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c08c0f578bc6..1c931e32d28e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -27,11 +27,15 @@ void erofs_put_metabuf(struct erofs_buf *buf)
 	buf->page = NULL;
 }
 
+/*
+ * Derive the block size from inode->i_blkbits to make compatible with
+ * anonymous inode in fscache mode.
+ */
 void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
 		  erofs_blk_t blkaddr, enum erofs_kmap_type type)
 {
+	erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
 	struct address_space *const mapping = inode->i_mapping;
-	erofs_off_t offset = blknr_to_addr(blkaddr);
 	pgoff_t index = offset >> PAGE_SHIFT;
 	struct page *page = buf->page;
 	struct folio *folio;
@@ -79,33 +83,32 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
 	erofs_blk_t nblocks, lastblk;
 	u64 offset = map->m_la;
 	struct erofs_inode *vi = EROFS_I(inode);
+	struct super_block *sb = inode->i_sb;
 	bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
 
-	nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+	nblocks = erofs_iblks(inode);
 	lastblk = nblocks - tailendpacking;
 
 	/* there is no hole in flatmode */
 	map->m_flags = EROFS_MAP_MAPPED;
-	if (offset < blknr_to_addr(lastblk)) {
-		map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
-		map->m_plen = blknr_to_addr(lastblk) - offset;
+	if (offset < erofs_pos(sb, lastblk)) {
+		map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
+		map->m_plen = erofs_pos(sb, lastblk) - offset;
 	} else if (tailendpacking) {
 		map->m_pa = erofs_iloc(inode) + vi->inode_isize +
-			vi->xattr_isize + erofs_blkoff(offset);
+			vi->xattr_isize + erofs_blkoff(sb, offset);
 		map->m_plen = inode->i_size - offset;
 
 		/* inline data should be located in the same meta block */
-		if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) {
-			erofs_err(inode->i_sb,
-				  "inline data cross block boundary @ nid %llu",
+		if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+			erofs_err(sb, "inline data cross block boundary @ nid %llu",
 				  vi->nid);
 			DBG_BUGON(1);
 			return -EFSCORRUPTED;
 		}
 		map->m_flags |= EROFS_MAP_META;
 	} else {
-		erofs_err(inode->i_sb,
-			  "internal error @ nid: %llu (size %llu), m_la 0x%llx",
+		erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
 			  vi->nid, inode->i_size, map->m_la);
 		DBG_BUGON(1);
 		return -EIO;
@@ -148,29 +151,29 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
 	pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
 		    vi->xattr_isize, unit) + unit * chunknr;
 
-	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
 	if (IS_ERR(kaddr)) {
 		err = PTR_ERR(kaddr);
 		goto out;
 	}
 	map->m_la = chunknr << vi->chunkbits;
 	map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
-			    roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
+			round_up(inode->i_size - map->m_la, sb->s_blocksize));
 
 	/* handle block map */
 	if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
-		__le32 *blkaddr = kaddr + erofs_blkoff(pos);
+		__le32 *blkaddr = kaddr + erofs_blkoff(sb, pos);
 
 		if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
 			map->m_flags = 0;
 		} else {
-			map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
+			map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr));
 			map->m_flags = EROFS_MAP_MAPPED;
 		}
 		goto out_unlock;
 	}
 	/* parse chunk indexes */
-	idx = kaddr + erofs_blkoff(pos);
+	idx = kaddr + erofs_blkoff(sb, pos);
 	switch (le32_to_cpu(idx->blkaddr)) {
 	case EROFS_NULL_ADDR:
 		map->m_flags = 0;
@@ -178,7 +181,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
 	default:
 		map->m_deviceid = le16_to_cpu(idx->device_id) &
 			EROFS_SB(sb)->device_id_mask;
-		map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
+		map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr));
 		map->m_flags = EROFS_MAP_MAPPED;
 		break;
 	}
@@ -222,8 +225,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
 
 			if (!dif->mapped_blkaddr)
 				continue;
-			startoff = blknr_to_addr(dif->mapped_blkaddr);
-			length = blknr_to_addr(dif->blocks);
+			startoff = erofs_pos(sb, dif->mapped_blkaddr);
+			length = erofs_pos(sb, dif->blocks);
 
 			if (map->m_pa >= startoff &&
 			    map->m_pa < startoff + length) {
@@ -244,6 +247,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
 {
 	int ret;
+	struct super_block *sb = inode->i_sb;
 	struct erofs_map_blocks map;
 	struct erofs_map_dev mdev;
 
@@ -258,7 +262,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		.m_deviceid = map.m_deviceid,
 		.m_pa = map.m_pa,
 	};
-	ret = erofs_map_dev(inode->i_sb, &mdev);
+	ret = erofs_map_dev(sb, &mdev);
 	if (ret)
 		return ret;
 
@@ -284,11 +288,11 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
 
 		iomap->type = IOMAP_INLINE;
-		ptr = erofs_read_metabuf(&buf, inode->i_sb,
-					 erofs_blknr(mdev.m_pa), EROFS_KMAP);
+		ptr = erofs_read_metabuf(&buf, sb,
+				erofs_blknr(sb, mdev.m_pa), EROFS_KMAP);
 		if (IS_ERR(ptr))
 			return PTR_ERR(ptr);
-		iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa);
+		iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa);
 		iomap->private = buf.base;
 	} else {
 		iomap->type = IOMAP_MAPPED;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 51b7ac7166d9..7021e2cf6146 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -42,7 +42,7 @@ int z_erofs_load_lz4_config(struct super_block *sb,
 		if (!sbi->lz4.max_pclusterblks) {
 			sbi->lz4.max_pclusterblks = 1;	/* reserved case */
 		} else if (sbi->lz4.max_pclusterblks >
-			   Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+			   erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) {
 			erofs_err(sb, "too large lz4 pclusterblks %u",
 				  sbi->lz4.max_pclusterblks);
 			return -EINVAL;
@@ -221,13 +221,13 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
 		support_0padding = true;
 		ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
 				min_t(unsigned int, rq->inputsize,
-				      EROFS_BLKSIZ - rq->pageofs_in));
+				      rq->sb->s_blocksize - rq->pageofs_in));
 		if (ret) {
 			kunmap_atomic(headpage);
 			return ret;
 		}
 		may_inplace = !((rq->pageofs_in + rq->inputsize) &
-				(EROFS_BLKSIZ - 1));
+				(rq->sb->s_blocksize - 1));
 	}
 
 	inputmargin = rq->pageofs_in;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index d38e19c11270..73091fbe3ea4 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -166,8 +166,8 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
 	/* 1. get the exact LZMA compressed size */
 	kin = kmap(*rq->in);
 	err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
-				   min_t(unsigned int, rq->inputsize,
-					 EROFS_BLKSIZ - rq->pageofs_in));
+			min_t(unsigned int, rq->inputsize,
+			      rq->sb->s_blocksize - rq->pageofs_in));
 	if (err) {
 		kunmap(*rq->in);
 		return err;
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 6970b09b8307..963bbed0b699 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -50,9 +50,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(f);
 	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+	struct super_block *sb = dir->i_sb;
+	unsigned long bsz = sb->s_blocksize;
 	const size_t dirsize = i_size_read(dir);
-	unsigned int i = ctx->pos / EROFS_BLKSIZ;
-	unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
+	unsigned int i = erofs_blknr(sb, ctx->pos);
+	unsigned int ofs = erofs_blkoff(sb, ctx->pos);
 	int err = 0;
 	bool initial = true;
 
@@ -62,32 +64,28 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
 
 		de = erofs_bread(&buf, dir, i, EROFS_KMAP);
 		if (IS_ERR(de)) {
-			erofs_err(dir->i_sb,
-				  "fail to readdir of logical block %u of nid %llu",
+			erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
 				  i, EROFS_I(dir)->nid);
 			err = PTR_ERR(de);
 			break;
 		}
 
 		nameoff = le16_to_cpu(de->nameoff);
-		if (nameoff < sizeof(struct erofs_dirent) ||
-		    nameoff >= EROFS_BLKSIZ) {
-			erofs_err(dir->i_sb,
-				  "invalid de[0].nameoff %u @ nid %llu",
+		if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) {
+			erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu",
 				  nameoff, EROFS_I(dir)->nid);
 			err = -EFSCORRUPTED;
 			break;
 		}
 
-		maxsize = min_t(unsigned int,
-				dirsize - ctx->pos + ofs, EROFS_BLKSIZ);
+		maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz);
 
 		/* search dirents at the arbitrary position */
 		if (initial) {
 			initial = false;
 
 			ofs = roundup(ofs, sizeof(struct erofs_dirent));
-			ctx->pos = blknr_to_addr(i) + ofs;
+			ctx->pos = erofs_pos(sb, i) + ofs;
 			if (ofs >= nameoff)
 				goto skip_this;
 		}
@@ -97,7 +95,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
 		if (err)
 			break;
 skip_this:
-		ctx->pos = blknr_to_addr(i) + maxsize;
+		ctx->pos = erofs_pos(sb, i) + maxsize;
 		++i;
 		ofs = 0;
 	}
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 96a87c023128..87ff35bff8d5 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -209,8 +209,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
 		void *src;
 
 		/* For tail packing layout, the offset may be non-zero. */
-		offset = erofs_blkoff(map.m_pa);
-		blknr = erofs_blknr(map.m_pa);
+		offset = erofs_blkoff(sb, map.m_pa);
+		blknr = erofs_blknr(sb, map.m_pa);
 		size = map.m_llen;
 
 		src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
@@ -460,6 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
 	inode->i_size = OFFSET_MAX;
 	inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	inode->i_blkbits = EROFS_SB(sb)->blkszbits;
 	inode->i_private = ctx;
 
 	ctx->cookie = cookie;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 4be7dda3cd24..3502daee8d3b 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -23,8 +23,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 	unsigned int ifmt;
 	int err;
 
-	blkaddr = erofs_blknr(inode_loc);
-	*ofs = erofs_blkoff(inode_loc);
+	blkaddr = erofs_blknr(sb, inode_loc);
+	*ofs = erofs_blkoff(sb, inode_loc);
 
 	erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
 		  __func__, vi->nid, *ofs, blkaddr);
@@ -58,11 +58,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 	case EROFS_INODE_LAYOUT_EXTENDED:
 		vi->inode_isize = sizeof(struct erofs_inode_extended);
 		/* check if the extended inode acrosses block boundary */
-		if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) {
+		if (*ofs + vi->inode_isize <= sb->s_blocksize) {
 			*ofs += vi->inode_isize;
 			die = (struct erofs_inode_extended *)dic;
 		} else {
-			const unsigned int gotten = EROFS_BLKSIZ - *ofs;
+			const unsigned int gotten = sb->s_blocksize - *ofs;
 
 			copied = kmalloc(vi->inode_isize, GFP_NOFS);
 			if (!copied) {
@@ -176,7 +176,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 			err = -EOPNOTSUPP;
 			goto err_out;
 		}
-		vi->chunkbits = LOG_BLOCK_SIZE +
+		vi->chunkbits = sb->s_blocksize_bits +
 			(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
 	}
 	inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
@@ -188,11 +188,12 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 	if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
 	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
 		inode->i_flags |= S_DAX;
+
 	if (!nblks)
 		/* measure inode.i_blocks as generic filesystems */
-		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
+		inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
 	else
-		inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
+		inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
 	return kaddr;
 
 bogusimode:
@@ -210,11 +211,12 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
 			      unsigned int m_pofs)
 {
 	struct erofs_inode *vi = EROFS_I(inode);
+	unsigned int bsz = i_blocksize(inode);
 	char *lnk;
 
 	/* if it cannot be handled with fast symlink scheme */
 	if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
-	    inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
+	    inode->i_size >= bsz || inode->i_size < 0) {
 		inode->i_op = &erofs_symlink_iops;
 		return 0;
 	}
@@ -225,7 +227,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
 
 	m_pofs += vi->xattr_isize;
 	/* inline symlink data shouldn't cross block boundary */
-	if (m_pofs + inode->i_size > EROFS_BLKSIZ) {
+	if (m_pofs + inode->i_size > bsz) {
 		kfree(lnk);
 		erofs_err(inode->i_sb,
 			  "inline data cross block boundary @ nid %llu",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1db018f8c2e8..66ff0513866f 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -147,8 +147,8 @@ struct erofs_sb_info {
 #endif
 	u16 device_id_mask;	/* valid bits of device id to be used */
 
-	/* inode slot unit size in bit shift */
-	unsigned char islotbits;
+	unsigned char islotbits;	/* inode slot unit size in bit shift */
+	unsigned char blkszbits;
 
 	u32 sb_size;			/* total superblock size */
 	u32 build_time_nsec;
@@ -242,13 +242,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
-
-#undef LOG_SECTORS_PER_BLOCK
-#define LOG_SECTORS_PER_BLOCK	(PAGE_SHIFT - 9)
-
-#undef SECTORS_PER_BLOCK
-#define SECTORS_PER_BLOCK	(1 << SECTORS_PER_BLOCK)
-
 #define EROFS_BLKSIZ		(1 << LOG_BLOCK_SIZE)
 
 #if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
@@ -269,9 +262,10 @@ struct erofs_buf {
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
-#define erofs_blknr(addr)       ((addr) / EROFS_BLKSIZ)
-#define erofs_blkoff(addr)      ((addr) % EROFS_BLKSIZ)
-#define blknr_to_addr(nr)       ((erofs_off_t)(nr) * EROFS_BLKSIZ)
+#define erofs_blknr(sb, addr)	((addr) >> (sb)->s_blocksize_bits)
+#define erofs_blkoff(sb, addr)	((addr) & ((sb)->s_blocksize - 1))
+#define erofs_pos(sb, blk)	((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
+#define erofs_iblks(i)	(round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
 
 #define EROFS_FEATURE_FUNCS(name, compat, feature) \
 static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
@@ -343,7 +337,7 @@ static inline erofs_off_t erofs_iloc(struct inode *inode)
 {
 	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
 
-	return blknr_to_addr(sbi->meta_blkaddr) +
+	return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
 		(EROFS_I(inode)->nid << sbi->islotbits);
 }
 
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 966eabc61c13..f091e9a0f0a1 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -89,7 +89,8 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
 static void *erofs_find_target_block(struct erofs_buf *target,
 		struct inode *dir, struct erofs_qstr *name, int *_ndirents)
 {
-	int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1;
+	unsigned int bsz = i_blocksize(dir);
+	int head = 0, back = erofs_iblks(dir) - 1;
 	unsigned int startprfx = 0, endprfx = 0;
 	void *candidate = ERR_PTR(-ENOENT);
 
@@ -100,8 +101,7 @@ static void *erofs_find_target_block(struct erofs_buf *target,
 
 		de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
 		if (!IS_ERR(de)) {
-			const int nameoff = nameoff_from_disk(de->nameoff,
-							      EROFS_BLKSIZ);
+			const int nameoff = nameoff_from_disk(de->nameoff, bsz);
 			const int ndirents = nameoff / sizeof(*de);
 			int diff;
 			unsigned int matched;
@@ -121,11 +121,10 @@ static void *erofs_find_target_block(struct erofs_buf *target,
 
 			dname.name = (u8 *)de + nameoff;
 			if (ndirents == 1)
-				dname.end = (u8 *)de + EROFS_BLKSIZ;
+				dname.end = (u8 *)de + bsz;
 			else
 				dname.end = (u8 *)de +
-					nameoff_from_disk(de[1].nameoff,
-							  EROFS_BLKSIZ);
+					nameoff_from_disk(de[1].nameoff, bsz);
 
 			/* string comparison without already matched prefix */
 			diff = erofs_dirnamecmp(name, &dname, &matched);
@@ -178,7 +177,8 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
 		return PTR_ERR(de);
 
 	if (ndirents)
-		de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
+		de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir),
+					ndirents);
 
 	if (!IS_ERR(de)) {
 		*nid = le64_to_cpu(de->nid);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 19b1ae79cec4..4303b4e69e55 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -52,18 +52,21 @@ void _erofs_info(struct super_block *sb, const char *function,
 
 static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
 {
+	size_t len = 1 << EROFS_SB(sb)->blkszbits;
 	struct erofs_super_block *dsb;
 	u32 expected_crc, crc;
 
-	dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET,
-		      EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL);
+	if (len > EROFS_SUPER_OFFSET)
+		len -= EROFS_SUPER_OFFSET;
+
+	dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL);
 	if (!dsb)
 		return -ENOMEM;
 
 	expected_crc = le32_to_cpu(dsb->checksum);
 	dsb->checksum = 0;
 	/* to allow for x86 boot sectors and other oddities. */
-	crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET);
+	crc = crc32c(~0, dsb, len);
 	kfree(dsb);
 
 	if (crc != expected_crc) {
@@ -132,11 +135,11 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
 	int len, i, cnt;
 
 	*offset = round_up(*offset, 4);
-	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP);
+	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *offset), EROFS_KMAP);
 	if (IS_ERR(ptr))
 		return ptr;
 
-	len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+	len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
 	if (!len)
 		len = U16_MAX + 1;
 	buffer = kmalloc(len, GFP_KERNEL);
@@ -146,14 +149,15 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
 	*lengthp = len;
 
 	for (i = 0; i < len; i += cnt) {
-		cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
-		ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset),
+		cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
+			    len - i);
+		ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *offset),
 					 EROFS_KMAP);
 		if (IS_ERR(ptr)) {
 			kfree(buffer);
 			return ptr;
 		}
-		memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+		memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
 		*offset += cnt;
 	}
 	return buffer;
@@ -228,10 +232,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	struct block_device *bdev;
 	void *ptr;
 
-	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
+	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
-	dis = ptr + erofs_blkoff(*pos);
+	dis = ptr + erofs_blkoff(sb, *pos);
 
 	if (!dif->path) {
 		if (!dis->tag[0]) {
@@ -733,6 +737,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 	sbi->domain_id = ctx->domain_id;
 	ctx->domain_id = NULL;
 
+	sbi->blkszbits = PAGE_SHIFT;
 	if (erofs_is_fscache_mode(sb)) {
 		sb->s_blocksize = EROFS_BLKSIZ;
 		sb->s_blocksize_bits = LOG_BLOCK_SIZE;
@@ -1060,7 +1065,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = sb->s_magic;
-	buf->f_bsize = EROFS_BLKSIZ;
+	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = sbi->total_blocks;
 	buf->f_bfree = buf->f_bavail = 0;
 
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 60729b1220b6..459caa3cd65d 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -68,8 +68,8 @@ static int init_inode_xattrs(struct inode *inode)
 	}
 
 	it.buf = __EROFS_BUF_INITIALIZER;
-	it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize);
-	it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize);
+	it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize);
+	it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize);
 
 	/* read in shared xattr array (non-atomic, see kmalloc below) */
 	it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP);
@@ -92,9 +92,9 @@ static int init_inode_xattrs(struct inode *inode)
 	it.ofs += sizeof(struct erofs_xattr_ibody_header);
 
 	for (i = 0; i < vi->xattr_shared_count; ++i) {
-		if (it.ofs >= EROFS_BLKSIZ) {
+		if (it.ofs >= sb->s_blocksize) {
 			/* cannot be unaligned */
-			DBG_BUGON(it.ofs != EROFS_BLKSIZ);
+			DBG_BUGON(it.ofs != sb->s_blocksize);
 
 			it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr,
 						      EROFS_KMAP);
@@ -139,15 +139,15 @@ struct xattr_iter_handlers {
 
 static inline int xattr_iter_fixup(struct xattr_iter *it)
 {
-	if (it->ofs < EROFS_BLKSIZ)
+	if (it->ofs < it->sb->s_blocksize)
 		return 0;
 
-	it->blkaddr += erofs_blknr(it->ofs);
+	it->blkaddr += erofs_blknr(it->sb, it->ofs);
 	it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
 				       EROFS_KMAP);
 	if (IS_ERR(it->kaddr))
 		return PTR_ERR(it->kaddr);
-	it->ofs = erofs_blkoff(it->ofs);
+	it->ofs = erofs_blkoff(it->sb, it->ofs);
 	return 0;
 }
 
@@ -165,8 +165,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
 
 	inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
 
-	it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs);
-	it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs);
+	it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs);
+	it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs);
 	it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
 				       EROFS_KMAP);
 	if (IS_ERR(it->kaddr))
@@ -222,8 +222,8 @@ static int xattr_foreach(struct xattr_iter *it,
 	processed = 0;
 
 	while (processed < entry.e_name_len) {
-		if (it->ofs >= EROFS_BLKSIZ) {
-			DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+		if (it->ofs >= it->sb->s_blocksize) {
+			DBG_BUGON(it->ofs > it->sb->s_blocksize);
 
 			err = xattr_iter_fixup(it);
 			if (err)
@@ -231,7 +231,7 @@ static int xattr_foreach(struct xattr_iter *it,
 			it->ofs = 0;
 		}
 
-		slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
+		slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs,
 			      entry.e_name_len - processed);
 
 		/* handle name */
@@ -257,8 +257,8 @@ static int xattr_foreach(struct xattr_iter *it,
 	}
 
 	while (processed < value_sz) {
-		if (it->ofs >= EROFS_BLKSIZ) {
-			DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+		if (it->ofs >= it->sb->s_blocksize) {
+			DBG_BUGON(it->ofs > it->sb->s_blocksize);
 
 			err = xattr_iter_fixup(it);
 			if (err)
@@ -266,7 +266,7 @@ static int xattr_foreach(struct xattr_iter *it,
 			it->ofs = 0;
 		}
 
-		slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
+		slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs,
 			      value_sz - processed);
 		op->value(it, processed, it->kaddr + it->ofs, slice);
 		it->ofs += slice;
@@ -352,15 +352,14 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
 {
 	struct erofs_inode *const vi = EROFS_I(inode);
 	struct super_block *const sb = inode->i_sb;
-	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	unsigned int i;
 	int ret = -ENOATTR;
 
 	for (i = 0; i < vi->xattr_shared_count; ++i) {
 		erofs_blk_t blkaddr =
-			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+			xattrblock_addr(sb, vi->xattr_shared_xattrs[i]);
 
-		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+		it->it.ofs = xattrblock_offset(sb, vi->xattr_shared_xattrs[i]);
 		it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
 						  EROFS_KMAP);
 		if (IS_ERR(it->it.kaddr))
@@ -564,15 +563,14 @@ static int shared_listxattr(struct listxattr_iter *it)
 	struct inode *const inode = d_inode(it->dentry);
 	struct erofs_inode *const vi = EROFS_I(inode);
 	struct super_block *const sb = inode->i_sb;
-	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	unsigned int i;
 	int ret = 0;
 
 	for (i = 0; i < vi->xattr_shared_count; ++i) {
 		erofs_blk_t blkaddr =
-			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+			xattrblock_addr(sb, vi->xattr_shared_xattrs[i]);
 
-		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+		it->it.ofs = xattrblock_offset(sb, vi->xattr_shared_xattrs[i]);
 		it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
 						  EROFS_KMAP);
 		if (IS_ERR(it->it.kaddr))
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 0a43c9ee9f8f..f7a21aaa9755 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -19,21 +19,21 @@ static inline unsigned int inlinexattr_header_size(struct inode *inode)
 		sizeof(u32) * EROFS_I(inode)->xattr_shared_count;
 }
 
-static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
+static inline erofs_blk_t xattrblock_addr(struct super_block *sb,
 					  unsigned int xattr_id)
 {
 #ifdef CONFIG_EROFS_FS_XATTR
-	return sbi->xattr_blkaddr +
-		xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
+	return EROFS_SB(sb)->xattr_blkaddr +
+		xattr_id * sizeof(__u32) / sb->s_blocksize;
 #else
 	return 0;
 #endif
 }
 
-static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
+static inline unsigned int xattrblock_offset(struct super_block *sb,
 					     unsigned int xattr_id)
 {
-	return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
+	return (xattr_id * sizeof(__u32)) % sb->s_blocksize;
 }
 
 #ifdef CONFIG_EROFS_FS_XATTR
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index f1708c77a991..a90d37c7bdd7 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -807,7 +807,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 
 	if (ztailpacking) {
 		pcl->obj.index = 0;	/* which indicates ztailpacking */
-		pcl->pageofs_in = erofs_blkoff(map->m_pa);
+		pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa);
 		pcl->tailpacking_size = map->m_plen;
 	} else {
 		pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -930,6 +930,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
 				 struct page *page, unsigned int pageofs,
 				 unsigned int len)
 {
+	struct super_block *sb = inode->i_sb;
 	struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
 	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
 	u8 *src, *dst;
@@ -941,16 +942,16 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
 	pos += EROFS_I(inode)->z_fragmentoff;
 	for (i = 0; i < len; i += cnt) {
 		cnt = min_t(unsigned int, len - i,
-			    EROFS_BLKSIZ - erofs_blkoff(pos));
+			    sb->s_blocksize - erofs_blkoff(sb, pos));
 		src = erofs_bread(&buf, packed_inode,
-				  erofs_blknr(pos), EROFS_KMAP);
+				  erofs_blknr(sb, pos), EROFS_KMAP);
 		if (IS_ERR(src)) {
 			erofs_put_metabuf(&buf);
 			return PTR_ERR(src);
 		}
 
 		dst = kmap_local_page(page);
-		memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
+		memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt);
 		kunmap_local(dst);
 		pos += cnt;
 	}
@@ -1005,7 +1006,8 @@ repeat:
 		void *mp;
 
 		mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
-					erofs_blknr(map->m_pa), EROFS_NO_KMAP);
+					erofs_blknr(inode->i_sb, map->m_pa),
+					EROFS_NO_KMAP);
 		if (IS_ERR(mp)) {
 			err = PTR_ERR(mp);
 			erofs_err(inode->i_sb,
@@ -1726,11 +1728,11 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 
 		/* no device id here, thus it will always succeed */
 		mdev = (struct erofs_map_dev) {
-			.m_pa = blknr_to_addr(pcl->obj.index),
+			.m_pa = erofs_pos(sb, pcl->obj.index),
 		};
 		(void)erofs_map_dev(sb, &mdev);
 
-		cur = erofs_blknr(mdev.m_pa);
+		cur = erofs_blknr(sb, mdev.m_pa);
 		end = cur + pcl->pclusterpages;
 
 		do {
@@ -1764,7 +1766,7 @@ submit_bio_retry:
 
 				last_bdev = mdev.m_bdev;
 				bio->bi_iter.bi_sector = (sector_t)cur <<
-					LOG_SECTORS_PER_BLOCK;
+					(sb->s_blocksize_bits - 9);
 				bio->bi_private = q[JQ_SUBMIT];
 				if (f->readahead)
 					bio->bi_opf |= REQ_RAHEAD;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 655da4d739cb..ecbcae9b5494 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -18,7 +18,7 @@ int z_erofs_fill_inode(struct inode *inode)
 		vi->z_advise = 0;
 		vi->z_algorithmtype[0] = 0;
 		vi->z_algorithmtype[1] = 0;
-		vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
+		vi->z_logical_clusterbits = inode->i_sb->s_blocksize_bits;
 		set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
 	}
 	inode->i_mapping->a_ops = &z_erofs_aops;
@@ -53,13 +53,13 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	unsigned int advise, type;
 
 	m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
-				      erofs_blknr(pos), EROFS_KMAP);
+				      erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
 	if (IS_ERR(m->kaddr))
 		return PTR_ERR(m->kaddr);
 
 	m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
 	m->lcn = lcn;
-	di = m->kaddr + erofs_blkoff(pos);
+	di = m->kaddr + erofs_blkoff(inode->i_sb, pos);
 
 	advise = le16_to_cpu(di->di_advise);
 	type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
@@ -156,7 +156,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 			 (vcnt << amortizedshift);
 	big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
 	encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
-	eofs = erofs_blkoff(pos);
+	eofs = erofs_blkoff(m->inode->i_sb, pos);
 	base = round_down(eofs, vcnt << amortizedshift);
 	in = m->kaddr + base;
 
@@ -249,7 +249,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	const unsigned int lclusterbits = vi->z_logical_clusterbits;
 	const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
 		ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
-	const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+	unsigned int totalidx = erofs_iblks(inode);
 	unsigned int compacted_4b_initial, compacted_2b;
 	unsigned int amortizedshift;
 	erofs_off_t pos;
@@ -290,7 +290,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 out:
 	pos += lcn * (1 << amortizedshift);
 	m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
-				      erofs_blknr(pos), EROFS_KMAP);
+				      erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
 	if (IS_ERR(m->kaddr))
 		return PTR_ERR(m->kaddr);
 	return unpack_compacted_index(m, amortizedshift, pos, lookahead);
@@ -360,6 +360,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
 static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 					    unsigned int initial_lcn)
 {
+	struct super_block *sb = m->inode->i_sb;
 	struct erofs_inode *const vi = EROFS_I(m->inode);
 	struct erofs_map_blocks *const map = m->map;
 	const unsigned int lclusterbits = vi->z_logical_clusterbits;
@@ -406,7 +407,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 		 * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
 		 * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
 		 */
-		m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE);
+		m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
 		break;
 	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
 		if (m->delta[0] != 1)
@@ -422,7 +423,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 		return -EFSCORRUPTED;
 	}
 out:
-	map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE;
+	map->m_plen = erofs_pos(sb, m->compressedblks);
 	return 0;
 err_bonus_cblkcnt:
 	erofs_err(m->inode->i_sb,
@@ -565,7 +566,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 	} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
 		map->m_flags |= EROFS_MAP_FRAGMENT;
 	} else {
-		map->m_pa = blknr_to_addr(m.pblk);
+		map->m_pa = erofs_pos(inode->i_sb, m.pblk);
 		err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
 		if (err)
 			goto unmap_out;
@@ -592,7 +593,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 	if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
 	    ((flags & EROFS_GET_BLOCKS_READMORE) &&
 	     map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
-	     map->m_llen >= EROFS_BLKSIZ)) {
+	     map->m_llen >= i_blocksize(inode))) {
 		err = z_erofs_get_extent_decompressedlen(&m);
 		if (!err)
 			map->m_flags |= EROFS_MAP_FULL_MAPPED;
@@ -633,13 +634,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 		goto out_unlock;
 
 	pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
-	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
 	if (IS_ERR(kaddr)) {
 		err = PTR_ERR(kaddr);
 		goto out_unlock;
 	}
 
-	h = kaddr + erofs_blkoff(pos);
+	h = kaddr + erofs_blkoff(sb, pos);
 	/*
 	 * if the highest bit of the 8-byte map header is set, the whole file
 	 * is stored in the packed inode. The rest bits keeps z_fragmentoff.
@@ -663,7 +664,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 		goto out_put_metabuf;
 	}
 
-	vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
+	vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
 	if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
 	    vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
 			    Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -692,7 +693,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 		erofs_put_metabuf(&map.buf);
 
 		if (!map.m_plen ||
-		    erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
+		    erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
 			erofs_err(sb, "invalid tail-packing pclustersize %llu",
 				  map.m_plen);
 			err = -EFSCORRUPTED;
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index cf4a0d28b178..71dbe8bfa7db 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -71,8 +71,8 @@ TRACE_EVENT(erofs_fill_inode,
 	TP_fast_assign(
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->nid		= EROFS_I(inode)->nid;
-		__entry->blkaddr	= erofs_blknr(erofs_iloc(inode));
-		__entry->ofs		= erofs_blkoff(erofs_iloc(inode));
+		__entry->blkaddr	= erofs_blknr(inode->i_sb, erofs_iloc(inode));
+		__entry->ofs		= erofs_blkoff(inode->i_sb, erofs_iloc(inode));
 	),
 
 	TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u",
-- 
cgit v1.2.3


From 3838c406a594f15600ad6f83c1e3b16bfd1829d0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 14 Apr 2023 07:30:16 -0600
Subject: block: re-arrange the struct block_device fields for better layout

This moves struct device out-of-line as it's just used at open/close
time, so we can keep some of the commonly used fields closer together.
On a standard setup, it also reduces the size from 864 bytes to 848
bytes. Yes, struct device is a pig...

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..c1da616fadae 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -40,26 +40,25 @@ struct bio_crypt_ctx;
 struct block_device {
 	sector_t		bd_start_sect;
 	sector_t		bd_nr_sectors;
+	struct gendisk *	bd_disk;
+	struct request_queue *	bd_queue;
 	struct disk_stats __percpu *bd_stats;
 	unsigned long		bd_stamp;
 	bool			bd_read_only;	/* read-only policy */
+	u8			bd_partno;
+	bool			bd_write_holder;
 	dev_t			bd_dev;
 	atomic_t		bd_openers;
+	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
 	struct inode *		bd_inode;	/* will die */
 	struct super_block *	bd_super;
 	void *			bd_claiming;
-	struct device		bd_device;
 	void *			bd_holder;
+	/* The counter of freeze processes */
+	int			bd_fsfreeze_count;
 	int			bd_holders;
-	bool			bd_write_holder;
 	struct kobject		*bd_holder_dir;
-	u8			bd_partno;
-	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
-	struct gendisk *	bd_disk;
-	struct request_queue *	bd_queue;
 
-	/* The counter of freeze processes */
-	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
 	struct super_block	*bd_fsfreeze_sb;
@@ -68,6 +67,11 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	/*
+	 * keep this out-of-line as it's both big and not needed in the fast
+	 * path
+	 */
+	struct device		bd_device;
 } __randomize_layout;
 
 #define bdev_whole(_bdev) \
-- 
cgit v1.2.3


From 9f4107b07b17b5ee68af680150f91227bea2df6f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 14 Apr 2023 07:32:02 -0600
Subject: block: store bdev->bd_disk->fops->submit_bio state in bdev

We have a long chain of memory dereferencing just to whether or not
this disk has a special submit_bio helper. As that's not necessarily
the common case, add a bd_has_submit_bio state in the bdev to avoid
traversing this memory dependency chain if we don't need to.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bdev.c              | 1 +
 block/blk-core.c          | 8 ++++----
 block/genhd.c             | 3 +++
 include/linux/blk_types.h | 1 +
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 1795c7d4b99e..850852fe4b78 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -419,6 +419,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	bdev->bd_inode = inode;
 	bdev->bd_queue = disk->queue;
 	bdev->bd_stats = alloc_percpu(struct disk_stats);
+	bdev->bd_has_submit_bio = false;
 	if (!bdev->bd_stats) {
 		iput(inode);
 		return NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index 269765d16cfd..06bdb352568c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -587,14 +587,14 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 
 static void __submit_bio(struct bio *bio)
 {
-	struct gendisk *disk = bio->bi_bdev->bd_disk;
-
 	if (unlikely(!blk_crypto_bio_prep(&bio)))
 		return;
 
-	if (!disk->fops->submit_bio) {
+	if (!bio->bi_bdev->bd_has_submit_bio) {
 		blk_mq_submit_bio(bio);
 	} else if (likely(bio_queue_enter(bio) == 0)) {
+		struct gendisk *disk = bio->bi_bdev->bd_disk;
+
 		disk->fops->submit_bio(bio);
 		blk_queue_exit(disk->queue);
 	}
@@ -698,7 +698,7 @@ void submit_bio_noacct_nocheck(struct bio *bio)
 	 */
 	if (current->bio_list)
 		bio_list_add(&current->bio_list[0], bio);
-	else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+	else if (!bio->bi_bdev->bd_has_submit_bio)
 		__submit_bio_noacct_mq(bio);
 	else
 		__submit_bio_noacct(bio);
diff --git a/block/genhd.c b/block/genhd.c
index 02d9cfb9e077..cadcb472dfc3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -420,6 +420,9 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 	 */
 	elevator_init_mq(disk->queue);
 
+	/* Mark bdev as having a submit_bio, if needed */
+	disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL;
+
 	/*
 	 * If the driver provides an explicit major number it also must provide
 	 * the number of minors numbers supported, and those will be used to
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c1da616fadae..3e8e9a97de46 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -47,6 +47,7 @@ struct block_device {
 	bool			bd_read_only;	/* read-only policy */
 	u8			bd_partno;
 	bool			bd_write_holder;
+	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
 	atomic_t		bd_openers;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
-- 
cgit v1.2.3


From bd4b28189469492df2b962d737842c311ce2659c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 14 Apr 2023 17:21:15 -0400
Subject: sctp: delete the obsolete code for the host name address param

In the latest RFC9260, the Host Name Address param has been deprecated.
For INIT chunk:

  Note 3: An INIT chunk MUST NOT contain the Host Name Address
  parameter.  The receiver of an INIT chunk containing a Host Name
  Address parameter MUST send an ABORT chunk and MAY include an
  "Unresolvable Address" error cause.

For Supported Address Types:

  The value indicating the Host Name Address parameter MUST NOT be
  used when sending this parameter and MUST be ignored when receiving
  this parameter.

Currently Linux SCTP doesn't really support Host Name Address param,
but only saves some flag and print debug info, which actually won't
even be triggered due to the verification in sctp_verify_param().
This patch is to delete those dead code.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 -
 net/sctp/sm_make_chunk.c   | 10 +---------
 net/sctp/socket.c          |  2 +-
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a0933efd93c3..070c9458fff4 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1711,7 +1711,6 @@ struct sctp_association {
 		__u16	ecn_capable:1,      /* Can peer do ECN? */
 			ipv4_address:1,     /* Peer understands IPv4 addresses? */
 			ipv6_address:1,     /* Peer understands IPv6 addresses? */
-			hostname_address:1, /* Peer understands DNS addresses? */
 			asconf_capable:1,   /* Does peer support ADDIP? */
 			prsctp_capable:1,   /* Can peer do PR-SCTP? */
 			reconf_capable:1,   /* Can peer do RE-CONFIG? */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c7503fd64915..c8f4ec5d5f98 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2207,7 +2207,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
 		break;
 
 	case SCTP_PARAM_HOST_NAME_ADDRESS:
-		/* Tell the peer, we won't support this param.  */
+		/* This param has been Deprecated, send ABORT.  */
 		sctp_process_hn_param(asoc, param, chunk, err_chunk);
 		retval = SCTP_IERROR_ABORT;
 		break;
@@ -2589,10 +2589,6 @@ do_addr_param:
 		asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
 		break;
 
-	case SCTP_PARAM_HOST_NAME_ADDRESS:
-		pr_debug("%s: unimplemented SCTP_HOST_NAME_ADDRESS\n", __func__);
-		break;
-
 	case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
 		/* Turn off the default values first so we'll know which
 		 * ones are really set by the peer.
@@ -2624,10 +2620,6 @@ do_addr_param:
 					asoc->peer.ipv6_address = 1;
 				break;
 
-			case SCTP_PARAM_HOST_NAME_ADDRESS:
-				asoc->peer.hostname_address = 1;
-				break;
-
 			default: /* Just ignore anything else.  */
 				break;
 			}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 218e0982c370..079e726909b4 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5195,7 +5195,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 	mask = asoc->peer.ecn_capable << 1;
 	mask = (mask | asoc->peer.ipv4_address) << 1;
 	mask = (mask | asoc->peer.ipv6_address) << 1;
-	mask = (mask | asoc->peer.hostname_address) << 1;
+	mask = mask << 1;
 	mask = (mask | asoc->peer.asconf_capable) << 1;
 	mask = (mask | asoc->peer.prsctp_capable) << 1;
 	mask = (mask | asoc->peer.auth_capable);
-- 
cgit v1.2.3


From c55c0e91c813589dc55bea6bf9a9fbfaa10ae41d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Apr 2023 10:21:36 +0200
Subject: netfilter: nf_tables: fix ifdef to also consider nf_tables=m

nftables can be built as a module, so fix the preprocessor conditional
accordingly.

Fixes: 478b360a47b7 ("netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n")
Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1ec3530c8191..dbcaac8b6966 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4713,7 +4713,7 @@ static inline void nf_reset_ct(struct sk_buff *skb)
 
 static inline void nf_reset_trace(struct sk_buff *skb)
 {
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	skb->nf_trace = 0;
 #endif
 }
@@ -4733,7 +4733,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 	dst->_nfct = src->_nfct;
 	nf_conntrack_get(skb_nfct(src));
 #endif
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	if (copy)
 		dst->nf_trace = src->nf_trace;
 #endif
-- 
cgit v1.2.3


From 4a778bdc7afbc422bd513c4f1cd7a9faf4bebaab Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 6 Apr 2023 00:15:48 +0000
Subject: ASoC: expand snd_soc_dapm_mutex_lock/unlock()

soc.h has snd_soc_dapm_mutex_lock/unlock() definition and
many drivers are using it, but soc-dapm.c is not.

1st reason is snd_soc_dapm_mutex_lock/unlock() requests
snd_soc_dapm_context pointer as parameter (A), but sometimes soc-dapm.c
needs to use snd_soc_card (B).

(A)	static inline void snd_soc_dapm_mutex_lock(struct snd_soc_dapm_context *dapm)
	{
		mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
	}			   ^^^^^^^^^^

(B)	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
			   ^^^^

2nd reason is it want to use SND_SOC_DAPM_CLASS_INIT for mutex_lock_nested(),
but helper is using _RUNTIME (A).

The conclusion is we want to use "dapm vs card" and "_RUNTIME vs _INIT"
for dapm lock/unlock. To enable this selfish request, this patch uses
_Generic macro. We can use snd_soc_dapm_mutex_lock/unlock() for both
dapm and card case.

	snd_soc_dapm_mutex_lock(dapm);	snd_soc_dapm_mutex_unlock(dapm);
	snd_soc_dapm_mutex_lock(card);	snd_soc_dapm_mutex_unlock(card);

Current soc-dapm.c is using both mutex_lock() and mutex_lock_nested().
This patch handles mutex_lock() as mutex_lock_nested(..., 0),
in other words, handles below as same.

	mutex_lock(&card->dapm_mutex);
	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT);

Because people might misunderstand that _init() is mutex initialization,
this patch renames _INIT to _ROOT and adds new
snd_soc_dapm_mutex_lock_root() for it.

This patch also moves snd_soc_dapm_subclass definition from soc-dapm.h
to soc.h to keep related code together.

Because very complex soc.h vs soc-dapm.h relationship,
it is difficult/impossible to define these helper into soc-dapm.h.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87cz4hx3v0.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h |   5 --
 include/sound/soc.h      |  60 ++++++++++++++++++++++--
 sound/soc/soc-dapm.c     | 119 +++++++++++++++++++++++------------------------
 3 files changed, 113 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index 64915ebd641e..87f8e1793af1 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -527,11 +527,6 @@ enum snd_soc_dapm_type {
 	SND_SOC_DAPM_TYPE_COUNT
 };
 
-enum snd_soc_dapm_subclass {
-	SND_SOC_DAPM_CLASS_INIT		= 0,
-	SND_SOC_DAPM_CLASS_RUNTIME	= 1,
-};
-
 /*
  * DAPM audio route definition.
  *
diff --git a/include/sound/soc.h b/include/sound/soc.h
index 3833184c187f..0e17e3230c7a 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1364,17 +1364,67 @@ extern struct dentry *snd_soc_debugfs_root;
 
 extern const struct dev_pm_ops snd_soc_pm_ops;
 
-/* Helper functions */
-static inline void snd_soc_dapm_mutex_lock(struct snd_soc_dapm_context *dapm)
+/*
+ *	DAPM helper functions
+ */
+enum snd_soc_dapm_subclass {
+	SND_SOC_DAPM_CLASS_ROOT		= 0,
+	SND_SOC_DAPM_CLASS_RUNTIME	= 1,
+};
+
+static inline void _snd_soc_dapm_mutex_lock_root_c(struct snd_soc_card *card)
+{
+	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_ROOT);
+}
+
+static inline void _snd_soc_dapm_mutex_lock_c(struct snd_soc_card *card)
+{
+	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+}
+
+static inline void _snd_soc_dapm_mutex_unlock_c(struct snd_soc_card *card)
 {
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	mutex_unlock(&card->dapm_mutex);
 }
 
-static inline void snd_soc_dapm_mutex_unlock(struct snd_soc_dapm_context *dapm)
+static inline void _snd_soc_dapm_mutex_assert_held_c(struct snd_soc_card *card)
 {
-	mutex_unlock(&dapm->card->dapm_mutex);
+	lockdep_assert_held(&card->dapm_mutex);
 }
 
+static inline void _snd_soc_dapm_mutex_lock_root_d(struct snd_soc_dapm_context *dapm)
+{
+	_snd_soc_dapm_mutex_lock_root_c(dapm->card);
+}
+
+static inline void _snd_soc_dapm_mutex_lock_d(struct snd_soc_dapm_context *dapm)
+{
+	_snd_soc_dapm_mutex_lock_c(dapm->card);
+}
+
+static inline void _snd_soc_dapm_mutex_unlock_d(struct snd_soc_dapm_context *dapm)
+{
+	_snd_soc_dapm_mutex_unlock_c(dapm->card);
+}
+
+static inline void _snd_soc_dapm_mutex_assert_held_d(struct snd_soc_dapm_context *dapm)
+{
+	_snd_soc_dapm_mutex_assert_held_c(dapm->card);
+}
+
+#define snd_soc_dapm_mutex_lock_root(x) _Generic((x),			\
+	struct snd_soc_card * :		_snd_soc_dapm_mutex_lock_root_c, \
+	struct snd_soc_dapm_context * :	_snd_soc_dapm_mutex_lock_root_d)(x)
+#define snd_soc_dapm_mutex_lock(x) _Generic((x),			\
+	struct snd_soc_card * :		_snd_soc_dapm_mutex_lock_c,	\
+	struct snd_soc_dapm_context * :	_snd_soc_dapm_mutex_lock_d)(x)
+#define snd_soc_dapm_mutex_unlock(x) _Generic((x),			\
+	struct snd_soc_card * :		_snd_soc_dapm_mutex_unlock_c,	\
+	struct snd_soc_dapm_context * :	_snd_soc_dapm_mutex_unlock_d)(x)
+#define snd_soc_dapm_mutex_assert_held(x) _Generic((x),			\
+	struct snd_soc_card * :		_snd_soc_dapm_mutex_assert_held_c, \
+	struct snd_soc_dapm_context * :	_snd_soc_dapm_mutex_assert_held_d)(x)
+
 #include <sound/soc-component.h>
 #include <sound/soc-card.h>
 #include <sound/soc-jack.h>
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 36e6f261bcf7..f2f04ce693a1 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -150,7 +150,7 @@ static int dapm_down_seq[] = {
 static void dapm_assert_locked(struct snd_soc_dapm_context *dapm)
 {
 	if (snd_soc_card_is_instantiated(dapm->card))
-		lockdep_assert_held(&dapm->card->dapm_mutex);
+		snd_soc_dapm_mutex_assert_held(dapm);
 }
 
 static void pop_wait(u32 pop_time)
@@ -302,7 +302,7 @@ void dapm_mark_endpoints_dirty(struct snd_soc_card *card)
 {
 	struct snd_soc_dapm_widget *w;
 
-	mutex_lock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_lock_root(card);
 
 	for_each_card_widgets(card, w) {
 		if (w->is_ep) {
@@ -314,7 +314,7 @@ void dapm_mark_endpoints_dirty(struct snd_soc_card *card)
 		}
 	}
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 }
 EXPORT_SYMBOL_GPL(dapm_mark_endpoints_dirty);
 
@@ -604,7 +604,7 @@ static void dapm_reset(struct snd_soc_card *card)
 {
 	struct snd_soc_dapm_widget *w;
 
-	lockdep_assert_held(&card->dapm_mutex);
+	snd_soc_dapm_mutex_assert_held(card);
 
 	memset(&card->dapm_stats, 0, sizeof(card->dapm_stats));
 
@@ -1302,7 +1302,7 @@ int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream,
 	int paths;
 	int ret;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 
 	if (stream == SNDRV_PCM_STREAM_PLAYBACK) {
 		invalidate_paths_ep(w, SND_SOC_DAPM_DIR_OUT);
@@ -1322,7 +1322,7 @@ int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream,
 		paths = ret;
 
 	trace_snd_soc_dapm_connected(paths, stream);
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 
 	return paths;
 }
@@ -1952,7 +1952,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event)
 	enum snd_soc_bias_level bias;
 	int ret;
 
-	lockdep_assert_held(&card->dapm_mutex);
+	snd_soc_dapm_mutex_assert_held(card);
 
 	trace_snd_soc_dapm_start(card);
 
@@ -2090,7 +2090,6 @@ static ssize_t dapm_widget_power_read_file(struct file *file,
 					   size_t count, loff_t *ppos)
 {
 	struct snd_soc_dapm_widget *w = file->private_data;
-	struct snd_soc_card *card = w->dapm->card;
 	enum snd_soc_dapm_direction dir, rdir;
 	char *buf;
 	int in, out;
@@ -2101,7 +2100,7 @@ static ssize_t dapm_widget_power_read_file(struct file *file,
 	if (!buf)
 		return -ENOMEM;
 
-	mutex_lock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_lock_root(w->dapm);
 
 	/* Supply widgets are not handled by is_connected_{input,output}_ep() */
 	if (w->is_supply) {
@@ -2145,7 +2144,7 @@ static ssize_t dapm_widget_power_read_file(struct file *file,
 		}
 	}
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(w->dapm);
 
 	ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret);
 
@@ -2266,7 +2265,7 @@ static int soc_dapm_mux_update_power(struct snd_soc_card *card,
 	int found = 0;
 	bool connect;
 
-	lockdep_assert_held(&card->dapm_mutex);
+	snd_soc_dapm_mutex_assert_held(card);
 
 	/* find dapm widget path assoc with kcontrol */
 	dapm_kcontrol_for_each_path(path, kcontrol) {
@@ -2293,11 +2292,11 @@ int snd_soc_dapm_mux_update_power(struct snd_soc_dapm_context *dapm,
 	struct snd_soc_card *card = dapm->card;
 	int ret;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 	card->update = update;
 	ret = soc_dapm_mux_update_power(card, kcontrol, mux, e);
 	card->update = NULL;
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 	if (ret > 0)
 		snd_soc_dpcm_runtime_update(card);
 	return ret;
@@ -2312,7 +2311,7 @@ static int soc_dapm_mixer_update_power(struct snd_soc_card *card,
 	struct snd_soc_dapm_path *path;
 	int found = 0;
 
-	lockdep_assert_held(&card->dapm_mutex);
+	snd_soc_dapm_mutex_assert_held(card);
 
 	/* find dapm widget path assoc with kcontrol */
 	dapm_kcontrol_for_each_path(path, kcontrol) {
@@ -2358,11 +2357,11 @@ int snd_soc_dapm_mixer_update_power(struct snd_soc_dapm_context *dapm,
 	struct snd_soc_card *card = dapm->card;
 	int ret;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 	card->update = update;
 	ret = soc_dapm_mixer_update_power(card, kcontrol, connect, -1);
 	card->update = NULL;
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 	if (ret > 0)
 		snd_soc_dpcm_runtime_update(card);
 	return ret;
@@ -2441,7 +2440,7 @@ static ssize_t dapm_widget_show(struct device *dev,
 	struct snd_soc_dai *codec_dai;
 	int i, count = 0;
 
-	mutex_lock(&rtd->card->dapm_mutex);
+	snd_soc_dapm_mutex_lock_root(rtd->card);
 
 	for_each_rtd_codec_dais(rtd, i, codec_dai) {
 		struct snd_soc_component *cmpnt = codec_dai->component;
@@ -2449,7 +2448,7 @@ static ssize_t dapm_widget_show(struct device *dev,
 		count = dapm_widget_show_component(cmpnt, buf, count);
 	}
 
-	mutex_unlock(&rtd->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(rtd->card);
 
 	return count;
 }
@@ -2632,9 +2631,9 @@ int snd_soc_dapm_sync(struct snd_soc_dapm_context *dapm)
 {
 	int ret;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	ret = snd_soc_dapm_sync_unlocked(dapm);
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(snd_soc_dapm_sync);
@@ -2703,9 +2702,9 @@ int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream,
 	struct snd_soc_pcm_runtime *rtd = asoc_substream_to_rtd(substream);
 	int ret;
 
-	mutex_lock_nested(&rtd->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(rtd->card);
 	ret = dapm_update_dai_unlocked(substream, params, dai);
-	mutex_unlock(&rtd->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(rtd->card);
 
 	return ret;
 }
@@ -3090,14 +3089,14 @@ int snd_soc_dapm_add_routes(struct snd_soc_dapm_context *dapm,
 {
 	int i, ret = 0;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	for (i = 0; i < num; i++) {
 		int r = snd_soc_dapm_add_route(dapm, route);
 		if (r < 0)
 			ret = r;
 		route++;
 	}
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -3116,12 +3115,12 @@ int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm,
 {
 	int i;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	for (i = 0; i < num; i++) {
 		snd_soc_dapm_del_route(dapm, route);
 		route++;
 	}
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return 0;
 }
@@ -3194,14 +3193,14 @@ int snd_soc_dapm_weak_routes(struct snd_soc_dapm_context *dapm,
 	int i;
 	int ret = 0;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT);
+	snd_soc_dapm_mutex_lock_root(dapm);
 	for (i = 0; i < num; i++) {
 		int err = snd_soc_dapm_weak_route(dapm, route);
 		if (err)
 			ret = err;
 		route++;
 	}
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -3220,7 +3219,7 @@ int snd_soc_dapm_new_widgets(struct snd_soc_card *card)
 	struct snd_soc_dapm_widget *w;
 	unsigned int val;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT);
+	snd_soc_dapm_mutex_lock_root(card);
 
 	for_each_card_widgets(card, w)
 	{
@@ -3232,7 +3231,7 @@ int snd_soc_dapm_new_widgets(struct snd_soc_card *card)
 						sizeof(struct snd_kcontrol *),
 						GFP_KERNEL);
 			if (!w->kcontrols) {
-				mutex_unlock(&card->dapm_mutex);
+				snd_soc_dapm_mutex_unlock(card);
 				return -ENOMEM;
 			}
 		}
@@ -3275,7 +3274,7 @@ int snd_soc_dapm_new_widgets(struct snd_soc_card *card)
 	}
 
 	dapm_power_widgets(card, SND_SOC_DAPM_STREAM_NOP);
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(snd_soc_dapm_new_widgets);
@@ -3293,7 +3292,6 @@ int snd_soc_dapm_get_volsw(struct snd_kcontrol *kcontrol,
 	struct snd_ctl_elem_value *ucontrol)
 {
 	struct snd_soc_dapm_context *dapm = snd_soc_dapm_kcontrol_dapm(kcontrol);
-	struct snd_soc_card *card = dapm->card;
 	struct soc_mixer_control *mc =
 		(struct soc_mixer_control *)kcontrol->private_value;
 	int reg = mc->reg;
@@ -3304,7 +3302,7 @@ int snd_soc_dapm_get_volsw(struct snd_kcontrol *kcontrol,
 	unsigned int invert = mc->invert;
 	unsigned int reg_val, val, rval = 0;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	if (dapm_kcontrol_is_powered(kcontrol) && reg != SND_SOC_NOPM) {
 		reg_val = soc_dapm_read(dapm, reg);
 		val = (reg_val >> shift) & mask;
@@ -3321,7 +3319,7 @@ int snd_soc_dapm_get_volsw(struct snd_kcontrol *kcontrol,
 		if (snd_soc_volsw_is_stereo(mc))
 			rval = (reg_val >> width) & mask;
 	}
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	if (invert)
 		ucontrol->value.integer.value[0] = max - val;
@@ -3379,7 +3377,7 @@ int snd_soc_dapm_put_volsw(struct snd_kcontrol *kcontrol,
 			rval = max - rval;
 	}
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 
 	/* This assumes field width < (bits in unsigned int / 2) */
 	if (width > sizeof(unsigned int) * 8 / 2)
@@ -3421,7 +3419,7 @@ int snd_soc_dapm_put_volsw(struct snd_kcontrol *kcontrol,
 		card->update = NULL;
 	}
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 
 	if (ret > 0)
 		snd_soc_dpcm_runtime_update(card);
@@ -3443,17 +3441,16 @@ int snd_soc_dapm_get_enum_double(struct snd_kcontrol *kcontrol,
 	struct snd_ctl_elem_value *ucontrol)
 {
 	struct snd_soc_dapm_context *dapm = snd_soc_dapm_kcontrol_dapm(kcontrol);
-	struct snd_soc_card *card = dapm->card;
 	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
 	unsigned int reg_val, val;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	if (e->reg != SND_SOC_NOPM && dapm_kcontrol_is_powered(kcontrol)) {
 		reg_val = soc_dapm_read(dapm, e->reg);
 	} else {
 		reg_val = dapm_kcontrol_get_value(kcontrol);
 	}
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	val = (reg_val >> e->shift_l) & e->mask;
 	ucontrol->value.enumerated.item[0] = snd_soc_enum_val_to_item(e, val);
@@ -3500,7 +3497,7 @@ int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol,
 		mask |= e->mask << e->shift_r;
 	}
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 
 	change = dapm_kcontrol_set_value(kcontrol, val);
 
@@ -3521,7 +3518,7 @@ int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol,
 		card->update = NULL;
 	}
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 
 	if (ret > 0)
 		snd_soc_dpcm_runtime_update(card);
@@ -3562,12 +3559,12 @@ int snd_soc_dapm_get_pin_switch(struct snd_kcontrol *kcontrol,
 	struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 	const char *pin = (const char *)kcontrol->private_value;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 
 	ucontrol->value.integer.value[0] =
 		snd_soc_dapm_get_pin_status(&card->dapm, pin);
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 
 	return 0;
 }
@@ -3586,10 +3583,10 @@ int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol,
 	const char *pin = (const char *)kcontrol->private_value;
 	int ret;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 	ret = __snd_soc_dapm_set_pin(&card->dapm, pin,
 				     !!ucontrol->value.integer.value[0]);
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 
 	snd_soc_dapm_sync(&card->dapm);
 	return ret;
@@ -3762,9 +3759,9 @@ snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm,
 {
 	struct snd_soc_dapm_widget *w;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 	w = snd_soc_dapm_new_control_unlocked(dapm, widget);
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return w;
 }
@@ -3787,7 +3784,7 @@ int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm,
 	int i;
 	int ret = 0;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT);
+	snd_soc_dapm_mutex_lock_root(dapm);
 	for (i = 0; i < num; i++) {
 		struct snd_soc_dapm_widget *w = snd_soc_dapm_new_control_unlocked(dapm, widget);
 		if (IS_ERR(w)) {
@@ -3796,7 +3793,7 @@ int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm,
 		}
 		widget++;
 	}
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(snd_soc_dapm_new_controls);
@@ -4499,9 +4496,9 @@ void snd_soc_dapm_stream_event(struct snd_soc_pcm_runtime *rtd, int stream,
 {
 	struct snd_soc_card *card = rtd->card;
 
-	mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(card);
 	soc_dapm_stream_event(rtd, stream, event);
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 }
 
 void snd_soc_dapm_stream_stop(struct snd_soc_pcm_runtime *rtd, int stream)
@@ -4562,11 +4559,11 @@ int snd_soc_dapm_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin)
 {
 	int ret;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 
 	ret = snd_soc_dapm_set_pin(dapm, pin, 1);
 
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -4630,11 +4627,11 @@ int snd_soc_dapm_force_enable_pin(struct snd_soc_dapm_context *dapm,
 {
 	int ret;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 
 	ret = snd_soc_dapm_force_enable_pin_unlocked(dapm, pin);
 
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -4674,11 +4671,11 @@ int snd_soc_dapm_disable_pin(struct snd_soc_dapm_context *dapm,
 {
 	int ret;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 
 	ret = snd_soc_dapm_set_pin(dapm, pin, 0);
 
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -4725,11 +4722,11 @@ int snd_soc_dapm_nc_pin(struct snd_soc_dapm_context *dapm, const char *pin)
 {
 	int ret;
 
-	mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
+	snd_soc_dapm_mutex_lock(dapm);
 
 	ret = snd_soc_dapm_set_pin(dapm, pin, 0);
 
-	mutex_unlock(&dapm->card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(dapm);
 
 	return ret;
 }
@@ -4826,7 +4823,7 @@ static void soc_dapm_shutdown_dapm(struct snd_soc_dapm_context *dapm)
 	LIST_HEAD(down_list);
 	int powerdown = 0;
 
-	mutex_lock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_lock_root(card);
 
 	for_each_card_widgets(dapm->card, w) {
 		if (w->dapm != dapm)
@@ -4851,7 +4848,7 @@ static void soc_dapm_shutdown_dapm(struct snd_soc_dapm_context *dapm)
 						    SND_SOC_BIAS_STANDBY);
 	}
 
-	mutex_unlock(&card->dapm_mutex);
+	snd_soc_dapm_mutex_unlock(card);
 }
 
 /*
-- 
cgit v1.2.3


From 38e42f6d6c6702bbfc633fce9b579fb80cec2d59 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 6 Apr 2023 00:16:10 +0000
Subject: ASoC: expand snd_soc_dpcm_mutex_lock/unlock()

soc-pcm.c has snd_soc_dpcm_mutex_lock/unlock(),
but other files can't use it because it is static function.

It requests snd_soc_pcm_runtime as parameter (A), but sometimes we
want to use it by snd_soc_card (B).

(A)	static inline void snd_soc_dpcm_mutex_lock(struct snd_soc_pcm_runtime *rtd)
	{
		mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
	}			   ^^^^^^^^^

(B)	mutex_lock_nested(&card->pcm_mutex, card->pcm_subclass);
			   ^^^^

We want to use it with both "rtd" and "card" for dapm lock/unlock.
To enable it, this patch uses _Generic macro.

This patch makes snd_soc_dpcm_mutex_{un}lock() global function, and use it on
each files.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87bkk1x3ud.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 sound/soc/soc-component.c | 12 ++++++------
 sound/soc/soc-compress.c  | 42 +++++++++++++++++++++---------------------
 sound/soc/soc-core.c      |  4 ++--
 sound/soc/soc-pcm.c       | 17 ++---------------
 5 files changed, 76 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 0e17e3230c7a..05004c048dd5 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1425,6 +1425,51 @@ static inline void _snd_soc_dapm_mutex_assert_held_d(struct snd_soc_dapm_context
 	struct snd_soc_card * :		_snd_soc_dapm_mutex_assert_held_c, \
 	struct snd_soc_dapm_context * :	_snd_soc_dapm_mutex_assert_held_d)(x)
 
+/*
+ *	PCM helper functions
+ */
+static inline void _snd_soc_dpcm_mutex_lock_c(struct snd_soc_card *card)
+{
+	mutex_lock_nested(&card->pcm_mutex, card->pcm_subclass);
+}
+
+static inline void _snd_soc_dpcm_mutex_unlock_c(struct snd_soc_card *card)
+{
+	mutex_unlock(&card->pcm_mutex);
+}
+
+static inline void _snd_soc_dpcm_mutex_assert_held_c(struct snd_soc_card *card)
+{
+	lockdep_assert_held(&card->pcm_mutex);
+}
+
+static inline void _snd_soc_dpcm_mutex_lock_r(struct snd_soc_pcm_runtime *rtd)
+{
+	_snd_soc_dpcm_mutex_lock_c(rtd->card);
+}
+
+static inline void _snd_soc_dpcm_mutex_unlock_r(struct snd_soc_pcm_runtime *rtd)
+{
+	_snd_soc_dpcm_mutex_unlock_c(rtd->card);
+}
+
+static inline void _snd_soc_dpcm_mutex_assert_held_r(struct snd_soc_pcm_runtime *rtd)
+{
+	_snd_soc_dpcm_mutex_assert_held_c(rtd->card);
+}
+
+#define snd_soc_dpcm_mutex_lock(x) _Generic((x),			\
+	 struct snd_soc_card * :	_snd_soc_dpcm_mutex_lock_c,	\
+	 struct snd_soc_pcm_runtime * :	_snd_soc_dpcm_mutex_lock_r)(x)
+
+#define snd_soc_dpcm_mutex_unlock(x) _Generic((x),			\
+	 struct snd_soc_card * :	_snd_soc_dpcm_mutex_unlock_c,	\
+	 struct snd_soc_pcm_runtime * :	_snd_soc_dpcm_mutex_unlock_r)(x)
+
+#define snd_soc_dpcm_mutex_assert_held(x) _Generic((x),		\
+	struct snd_soc_card * :		_snd_soc_dpcm_mutex_assert_held_c, \
+	struct snd_soc_pcm_runtime * :	_snd_soc_dpcm_mutex_assert_held_r)(x)
+
 #include <sound/soc-component.h>
 #include <sound/soc-card.h>
 #include <sound/soc-jack.h>
diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c
index 3cd6952212e1..ff25718ff2e8 100644
--- a/sound/soc/soc-component.c
+++ b/sound/soc/soc-component.c
@@ -550,7 +550,7 @@ int snd_soc_component_compr_get_caps(struct snd_compr_stream *cstream,
 	struct snd_soc_component *component;
 	int i, ret = 0;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	for_each_rtd_components(rtd, i, component) {
 		if (component->driver->compress_ops &&
@@ -561,7 +561,7 @@ int snd_soc_component_compr_get_caps(struct snd_compr_stream *cstream,
 		}
 	}
 
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 
 	return soc_component_ret(component, ret);
 }
@@ -574,7 +574,7 @@ int snd_soc_component_compr_get_codec_caps(struct snd_compr_stream *cstream,
 	struct snd_soc_component *component;
 	int i, ret = 0;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	for_each_rtd_components(rtd, i, component) {
 		if (component->driver->compress_ops &&
@@ -585,7 +585,7 @@ int snd_soc_component_compr_get_codec_caps(struct snd_compr_stream *cstream,
 		}
 	}
 
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 
 	return soc_component_ret(component, ret);
 }
@@ -638,7 +638,7 @@ int snd_soc_component_compr_copy(struct snd_compr_stream *cstream,
 	struct snd_soc_component *component;
 	int i, ret = 0;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	for_each_rtd_components(rtd, i, component) {
 		if (component->driver->compress_ops &&
@@ -649,7 +649,7 @@ int snd_soc_component_compr_copy(struct snd_compr_stream *cstream,
 		}
 	}
 
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 
 	return soc_component_ret(component, ret);
 }
diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c
index 6e74a6c48986..661e9d70994f 100644
--- a/sound/soc/soc-compress.c
+++ b/sound/soc/soc-compress.c
@@ -62,7 +62,7 @@ static int soc_compr_clean(struct snd_compr_stream *cstream, int rollback)
 	struct snd_soc_dai *codec_dai = asoc_rtd_to_codec(rtd, 0);
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	if (!rollback)
 		snd_soc_runtime_deactivate(rtd, stream);
@@ -84,7 +84,7 @@ static int soc_compr_clean(struct snd_compr_stream *cstream, int rollback)
 	if (!rollback)
 		snd_soc_dapm_stream_stop(rtd, stream);
 
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 
 	snd_soc_pcm_component_pm_runtime_put(rtd, cstream, rollback);
 
@@ -107,7 +107,7 @@ static int soc_compr_open(struct snd_compr_stream *cstream)
 	if (ret < 0)
 		goto err_no_lock;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	ret = snd_soc_dai_compr_startup(cpu_dai, cstream);
 	if (ret < 0)
@@ -123,7 +123,7 @@ static int soc_compr_open(struct snd_compr_stream *cstream)
 
 	snd_soc_runtime_activate(rtd, stream);
 err:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 err_no_lock:
 	if (ret < 0)
 		soc_compr_clean(cstream, 1);
@@ -146,7 +146,7 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 	if (ret < 0)
 		goto be_err;
 
-	mutex_lock_nested(&fe->card->pcm_mutex, fe->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(fe);
 
 	/* calculate valid and active FE <-> BE dpcms */
 	dpcm_process_paths(fe, stream, &list, 1);
@@ -182,7 +182,7 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_NO;
 
 	snd_soc_runtime_activate(fe, stream);
-	mutex_unlock(&fe->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(fe);
 
 	mutex_unlock(&fe->card->mutex);
 
@@ -209,7 +209,7 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream)
 
 	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
 
-	mutex_lock_nested(&fe->card->pcm_mutex, fe->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(fe);
 	snd_soc_runtime_deactivate(fe, stream);
 
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_FE;
@@ -229,7 +229,7 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream)
 
 	dpcm_be_disconnect(fe, stream);
 
-	mutex_unlock(&fe->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(fe);
 
 	snd_soc_link_compr_shutdown(cstream, 0);
 
@@ -249,7 +249,7 @@ static int soc_compr_trigger(struct snd_compr_stream *cstream, int cmd)
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 	int ret;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	ret = snd_soc_component_compr_trigger(cstream, cmd);
 	if (ret < 0)
@@ -269,7 +269,7 @@ static int soc_compr_trigger(struct snd_compr_stream *cstream, int cmd)
 	}
 
 out:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 	return ret;
 }
 
@@ -327,7 +327,7 @@ static int soc_compr_set_params(struct snd_compr_stream *cstream,
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 	int ret;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	/*
 	 * First we call set_params for the CPU DAI, then the component
@@ -352,14 +352,14 @@ static int soc_compr_set_params(struct snd_compr_stream *cstream,
 
 	/* cancel any delayed stream shutdown that is pending */
 	rtd->pop_wait = 0;
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 
 	cancel_delayed_work_sync(&rtd->delayed_work);
 
 	return 0;
 
 err:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 	return ret;
 }
 
@@ -404,9 +404,9 @@ static int soc_compr_set_params_fe(struct snd_compr_stream *cstream,
 	ret = snd_soc_link_compr_set_params(cstream);
 	if (ret < 0)
 		goto out;
-	mutex_lock_nested(&fe->card->pcm_mutex, fe->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(fe);
 	dpcm_dapm_stream_event(fe, stream, SND_SOC_DAPM_STREAM_START);
-	mutex_unlock(&fe->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(fe);
 	fe->dpcm[stream].state = SND_SOC_DPCM_STATE_PREPARE;
 
 out:
@@ -422,7 +422,7 @@ static int soc_compr_get_params(struct snd_compr_stream *cstream,
 	struct snd_soc_dai *cpu_dai = asoc_rtd_to_cpu(rtd, 0);
 	int ret = 0;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	ret = snd_soc_dai_compr_get_params(cpu_dai, cstream, params);
 	if (ret < 0)
@@ -430,7 +430,7 @@ static int soc_compr_get_params(struct snd_compr_stream *cstream,
 
 	ret = snd_soc_component_compr_get_params(cstream, params);
 err:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 	return ret;
 }
 
@@ -440,7 +440,7 @@ static int soc_compr_ack(struct snd_compr_stream *cstream, size_t bytes)
 	struct snd_soc_dai *cpu_dai = asoc_rtd_to_cpu(rtd, 0);
 	int ret;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	ret = snd_soc_dai_compr_ack(cpu_dai, cstream, bytes);
 	if (ret < 0)
@@ -448,7 +448,7 @@ static int soc_compr_ack(struct snd_compr_stream *cstream, size_t bytes)
 
 	ret = snd_soc_component_compr_ack(cstream, bytes);
 err:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 	return ret;
 }
 
@@ -459,7 +459,7 @@ static int soc_compr_pointer(struct snd_compr_stream *cstream,
 	int ret;
 	struct snd_soc_dai *cpu_dai = asoc_rtd_to_cpu(rtd, 0);
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	ret = snd_soc_dai_compr_pointer(cpu_dai, cstream, tstamp);
 	if (ret < 0)
@@ -467,7 +467,7 @@ static int soc_compr_pointer(struct snd_compr_stream *cstream,
 
 	ret = snd_soc_component_compr_pointer(cstream, tstamp);
 out:
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 	return ret;
 }
 
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 9bbcff492c1e..4594505cdae2 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -348,7 +348,7 @@ void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd)
 	struct snd_soc_dai *codec_dai = asoc_rtd_to_codec(rtd, 0);
 	int playback = SNDRV_PCM_STREAM_PLAYBACK;
 
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(rtd);
 
 	dev_dbg(rtd->dev,
 		"ASoC: pop wq checking: %s status: %s waiting: %s\n",
@@ -364,7 +364,7 @@ void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd)
 					  SND_SOC_DAPM_STREAM_STOP);
 	}
 
-	mutex_unlock(&rtd->card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(rtd);
 }
 EXPORT_SYMBOL_GPL(snd_soc_close_delayed_work);
 
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index 913a7d98e742..adb69d7820a8 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -49,19 +49,6 @@ static inline int _soc_pcm_ret(struct snd_soc_pcm_runtime *rtd,
 	return ret;
 }
 
-static inline void snd_soc_dpcm_mutex_lock(struct snd_soc_pcm_runtime *rtd)
-{
-	mutex_lock_nested(&rtd->card->pcm_mutex, rtd->card->pcm_subclass);
-}
-
-static inline void snd_soc_dpcm_mutex_unlock(struct snd_soc_pcm_runtime *rtd)
-{
-	mutex_unlock(&rtd->card->pcm_mutex);
-}
-
-#define snd_soc_dpcm_mutex_assert_held(rtd) \
-	lockdep_assert_held(&(rtd)->card->pcm_mutex)
-
 static inline void snd_soc_dpcm_stream_lock_irq(struct snd_soc_pcm_runtime *rtd,
 						int stream)
 {
@@ -2664,7 +2651,7 @@ int snd_soc_dpcm_runtime_update(struct snd_soc_card *card)
 	struct snd_soc_pcm_runtime *fe;
 	int ret = 0;
 
-	mutex_lock_nested(&card->pcm_mutex, card->pcm_subclass);
+	snd_soc_dpcm_mutex_lock(card);
 	/* shutdown all old paths first */
 	for_each_card_rtds(card, fe) {
 		ret = soc_dpcm_fe_runtime_update(fe, 0);
@@ -2680,7 +2667,7 @@ int snd_soc_dpcm_runtime_update(struct snd_soc_card *card)
 	}
 
 out:
-	mutex_unlock(&card->pcm_mutex);
+	snd_soc_dpcm_mutex_unlock(card);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(snd_soc_dpcm_runtime_update);
-- 
cgit v1.2.3


From 0f3b818486796ec8895fa4ccdf15edb759bff40a Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 6 Apr 2023 00:16:27 +0000
Subject: ASoC: add snd_soc_card_mutex_lock/unlock()

ASoC need to use card->mutex with _INIT or _RUNTIME,
but there is no helper function for it.

This patch adds its helper function and use it.

Because people might misunderstand that _init() is mutex initialization,
this patch renames _INIT to _ROOT and adds new
snd_soc_card_mutex_lock_root() for it.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87a5zlx3tw.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-card.h | 17 ++++++++++++++++-
 sound/soc/soc-compress.c | 18 +++++++++---------
 sound/soc/soc-core.c     |  4 ++--
 3 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-card.h b/include/sound/soc-card.h
index 9d31a5c0db33..fc94dfb0021f 100644
--- a/include/sound/soc-card.h
+++ b/include/sound/soc-card.h
@@ -9,10 +9,25 @@
 #define __SOC_CARD_H
 
 enum snd_soc_card_subclass {
-	SND_SOC_CARD_CLASS_INIT		= 0,
+	SND_SOC_CARD_CLASS_ROOT		= 0,
 	SND_SOC_CARD_CLASS_RUNTIME	= 1,
 };
 
+static inline void snd_soc_card_mutex_lock_root(struct snd_soc_card *card)
+{
+	mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_ROOT);
+}
+
+static inline void snd_soc_card_mutex_lock(struct snd_soc_card *card)
+{
+	mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
+}
+
+static inline void snd_soc_card_mutex_unlock(struct snd_soc_card *card)
+{
+	mutex_unlock(&card->mutex);
+}
+
 struct snd_kcontrol *snd_soc_card_get_kcontrol(struct snd_soc_card *soc_card,
 					       const char *name);
 int snd_soc_card_jack_new(struct snd_soc_card *card, const char *id, int type,
diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c
index 661e9d70994f..d8715db5e415 100644
--- a/sound/soc/soc-compress.c
+++ b/sound/soc/soc-compress.c
@@ -140,7 +140,7 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 	int ret;
 
-	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
+	snd_soc_card_mutex_lock(fe->card);
 
 	ret = dpcm_path_get(fe, stream, &list);
 	if (ret < 0)
@@ -184,7 +184,7 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream)
 	snd_soc_runtime_activate(fe, stream);
 	snd_soc_dpcm_mutex_unlock(fe);
 
-	mutex_unlock(&fe->card->mutex);
+	snd_soc_card_mutex_unlock(fe->card);
 
 	return 0;
 
@@ -196,7 +196,7 @@ out:
 	dpcm_path_put(&list);
 be_err:
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_NO;
-	mutex_unlock(&fe->card->mutex);
+	snd_soc_card_mutex_unlock(fe->card);
 	return ret;
 }
 
@@ -207,7 +207,7 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream)
 	struct snd_soc_dpcm *dpcm;
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 
-	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
+	snd_soc_card_mutex_lock(fe->card);
 
 	snd_soc_dpcm_mutex_lock(fe);
 	snd_soc_runtime_deactivate(fe, stream);
@@ -237,7 +237,7 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream)
 
 	snd_soc_dai_compr_shutdown(cpu_dai, cstream, 0);
 
-	mutex_unlock(&fe->card->mutex);
+	snd_soc_card_mutex_unlock(fe->card);
 	return 0;
 }
 
@@ -284,7 +284,7 @@ static int soc_compr_trigger_fe(struct snd_compr_stream *cstream, int cmd)
 	    cmd == SND_COMPR_TRIGGER_DRAIN)
 		return snd_soc_component_compr_trigger(cstream, cmd);
 
-	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
+	snd_soc_card_mutex_lock(fe->card);
 
 	ret = snd_soc_dai_compr_trigger(cpu_dai, cstream, cmd);
 	if (ret < 0)
@@ -315,7 +315,7 @@ static int soc_compr_trigger_fe(struct snd_compr_stream *cstream, int cmd)
 
 out:
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_NO;
-	mutex_unlock(&fe->card->mutex);
+	snd_soc_card_mutex_unlock(fe->card);
 	return ret;
 }
 
@@ -373,7 +373,7 @@ static int soc_compr_set_params_fe(struct snd_compr_stream *cstream,
 	int stream = cstream->direction; /* SND_COMPRESS_xxx is same as SNDRV_PCM_STREAM_xxx */
 	int ret;
 
-	mutex_lock_nested(&fe->card->mutex, SND_SOC_CARD_CLASS_RUNTIME);
+	snd_soc_card_mutex_lock(fe->card);
 
 	/*
 	 * Create an empty hw_params for the BE as the machine driver must
@@ -411,7 +411,7 @@ static int soc_compr_set_params_fe(struct snd_compr_stream *cstream,
 
 out:
 	fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_NO;
-	mutex_unlock(&fe->card->mutex);
+	snd_soc_card_mutex_unlock(fe->card);
 	return ret;
 }
 
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 4594505cdae2..b48efc3a08d2 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -1938,7 +1938,7 @@ static int snd_soc_bind_card(struct snd_soc_card *card)
 	int ret;
 
 	mutex_lock(&client_mutex);
-	mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_INIT);
+	snd_soc_card_mutex_lock_root(card);
 
 	snd_soc_dapm_init(&card->dapm, card, NULL);
 
@@ -2093,7 +2093,7 @@ probe_end:
 	if (ret < 0)
 		soc_cleanup_card_resources(card);
 
-	mutex_unlock(&card->mutex);
+	snd_soc_card_mutex_unlock(card);
 	mutex_unlock(&client_mutex);
 
 	return ret;
-- 
cgit v1.2.3


From 18d758a2d81a97b9a54a37d535870ce3170cc208 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 17 Feb 2023 13:37:03 +0800
Subject: btrfs: replace btrfs_io_context::raid_map with a fixed u64 value

In btrfs_io_context structure, we have a pointer raid_map, which
indicates the logical bytenr for each stripe.

But considering we always call sort_parity_stripes(), the result
raid_map[] is always sorted, thus raid_map[0] is always the logical
bytenr of the full stripe.

So why we waste the space and time (for sorting) for raid_map?

This patch will replace btrfs_io_context::raid_map with a single u64
number, full_stripe_start, by:

- Replace btrfs_io_context::raid_map with full_stripe_start

- Replace call sites using raid_map[0] to use full_stripe_start

- Replace call sites using raid_map[i] to compare with nr_data_stripes.

The benefits are:

- Less memory wasted on raid_map
  It's sizeof(u64) * num_stripes vs sizeof(u64).
  It'll always save at least one u64, and the benefit grows larger with
  num_stripes.

- No more weird alloc_btrfs_io_context() behavior
  As there is only one fixed size + one variable length array.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c            | 31 ++++++++--------
 fs/btrfs/scrub.c             | 25 +++++++------
 fs/btrfs/volumes.c           | 84 +++++++++++++++++---------------------------
 fs/btrfs/volumes.h           | 19 +++++++---
 include/trace/events/btrfs.h |  2 +-
 5 files changed, 78 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0ac1fc7896dd..6cbbaa6c06ca 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 {
-	u64 num = rbio->bioc->raid_map[0];
+	u64 num = rbio->bioc->full_stripe_logical;
 
 	/*
 	 * we shift down quite a bit.  We're using byte
@@ -567,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
 		return 0;
 
-	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
+	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
 		return 0;
 
 	/* we can't merge with different operations */
@@ -661,7 +661,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 
 	spin_lock(&h->lock);
 	list_for_each_entry(cur, &h->hash_list, hash_list) {
-		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
+		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
 			continue;
 
 		spin_lock(&cur->bio_list_lock);
@@ -1113,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
-		     rbio->bioc->raid_map[0];
+		     rbio->bioc->full_stripe_logical;
 
 	bio_for_each_segment(bvec, bio, iter) {
 		u32 bvec_offset;
@@ -1337,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
 {
 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
-		     rbio->bioc->raid_map[0];
+		     rbio->bioc->full_stripe_logical;
 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
 
 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
@@ -1614,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
 {
 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	const u64 full_stripe_start = rbio->bioc->raid_map[0];
+	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
 	const u32 orig_len = orig_bio->bi_iter.bi_size;
 	const u32 sectorsize = fs_info->sectorsize;
 	u64 cur_logical;
@@ -1801,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
 		 * here due to a crc mismatch and we can't give them the
 		 * data they want.
 		 */
-		if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
-			if (rbio->bioc->raid_map[faila] ==
-			    RAID5_P_STRIPE)
+		if (failb == rbio->real_stripes - 1) {
+			if (faila == rbio->real_stripes - 2)
 				/*
 				 * Only P and Q are corrupted.
 				 * We only care about data stripes recovery,
@@ -1817,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
 			goto pstripe;
 		}
 
-		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+		if (failb == rbio->real_stripes - 2) {
 			raid6_datap_recov(rbio->real_stripes, sectorsize,
 					  faila, pointers);
 		} else {
@@ -2080,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
-						       rbio->bioc->raid_map[0]);
-	const u64 start = rbio->bioc->raid_map[0];
+						       rbio->bioc->full_stripe_logical);
+	const u64 start = rbio->bioc->full_stripe_logical;
 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
 			fs_info->sectorsize_bits;
 	int ret;
@@ -2129,7 +2128,7 @@ error:
 	 */
 	btrfs_warn_rl(fs_info,
 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
-			rbio->bioc->raid_map[0], ret);
+			rbio->bioc->full_stripe_logical, ret);
 no_csum:
 	kfree(rbio->csum_buf);
 	bitmap_free(rbio->csum_bitmap);
@@ -2385,10 +2384,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
 	int stripe_offset;
 	int index;
 
-	ASSERT(logical >= rbio->bioc->raid_map[0]);
-	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
+	ASSERT(logical >= rbio->bioc->full_stripe_logical);
+	ASSERT(logical + sectorsize <= rbio->bioc->full_stripe_logical +
 				       BTRFS_STRIPE_LEN * rbio->nr_data);
-	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
+	stripe_offset = (int)(logical - rbio->bioc->full_stripe_logical);
 	index = stripe_offset / sectorsize;
 	rbio->bio_sectors[index].page = page;
 	rbio->bio_sectors[index].pgoff = pgoff;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 64b52be6bf0b..91aeac36ebc9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1430,7 +1430,7 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
 }
 
 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
-						 u64 *raid_map,
+						 u64 full_stripe_logical,
 						 int nstripes, int mirror,
 						 int *stripe_index,
 						 u64 *stripe_offset)
@@ -1438,19 +1438,22 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
 	int i;
 
 	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+					    nstripes - 1 : nstripes - 2;
+
 		/* RAID5/6 */
-		for (i = 0; i < nstripes; i++) {
-			if (raid_map[i] == RAID6_Q_STRIPE ||
-			    raid_map[i] == RAID5_P_STRIPE)
-				continue;
+		for (i = 0; i < nr_data_stripes; i++) {
+			const u64 data_stripe_start = full_stripe_logical +
+						(i * BTRFS_STRIPE_LEN);
 
-			if (logical >= raid_map[i] &&
-			    logical < raid_map[i] + BTRFS_STRIPE_LEN)
+			if (logical >= data_stripe_start &&
+			    logical < data_stripe_start + BTRFS_STRIPE_LEN)
 				break;
 		}
 
 		*stripe_index = i;
-		*stripe_offset = logical - raid_map[i];
+		*stripe_offset = (logical - full_stripe_logical) &
+				 BTRFS_STRIPE_LEN_MASK;
 	} else {
 		/* The other RAID type */
 		*stripe_index = mirror;
@@ -1538,7 +1541,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 
 			scrub_stripe_index_and_offset(logical,
 						      bioc->map_type,
-						      bioc->raid_map,
+						      bioc->full_stripe_logical,
 						      bioc->num_stripes -
 						      bioc->replace_nr_stripes,
 						      mirror_index,
@@ -2398,7 +2401,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
 			       &length, &bioc);
-	if (ret || !bioc || !bioc->raid_map)
+	if (ret || !bioc)
 		goto bioc_out;
 
 	if (WARN_ON(!sctx->is_dev_replace ||
@@ -3007,7 +3010,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
 			       &length, &bioc);
-	if (ret || !bioc || !bioc->raid_map)
+	if (ret || !bioc)
 		goto bioc_out;
 
 	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f06f0e47ba8..b7e1d7dc4509 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5894,25 +5894,6 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return preferred_mirror;
 }
 
-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
-{
-	int i;
-	int again = 1;
-
-	while (again) {
-		again = 0;
-		for (i = 0; i < num_stripes - 1; i++) {
-			/* Swap if parity is on a smaller index */
-			if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
-				swap(bioc->stripes[i], bioc->stripes[i + 1]);
-				swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
-				again = 1;
-			}
-		}
-	}
-}
-
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
 						       u16 total_stripes)
 {
@@ -5922,12 +5903,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 		 /* The size of btrfs_io_context */
 		sizeof(struct btrfs_io_context) +
 		/* Plus the variable array for the stripes */
-		sizeof(struct btrfs_io_stripe) * (total_stripes) +
-		/*
-		 * Plus the raid_map, which includes both the tgt dev
-		 * and the stripes.
-		 */
-		sizeof(u64) * (total_stripes),
+		sizeof(struct btrfs_io_stripe) * (total_stripes),
 		GFP_NOFS);
 
 	if (!bioc)
@@ -5936,8 +5912,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	refcount_set(&bioc->refs, 1);
 
 	bioc->fs_info = fs_info;
-	bioc->raid_map = (u64 *)(bioc->stripes + total_stripes);
 	bioc->replace_stripe_src = -1;
+	bioc->full_stripe_logical = (u64)-1;
 
 	return bioc;
 }
@@ -6541,33 +6517,39 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	}
 	bioc->map_type = map->type;
 
-	for (i = 0; i < num_stripes; i++) {
-		set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
-			      stripe_nr);
-		stripe_index++;
-	}
-
-	/* Build raid_map */
+	/*
+	 * For RAID56 full map, we need to make sure the stripes[] follows the
+	 * rule that data stripes are all ordered, then followed with P and Q
+	 * (if we have).
+	 *
+	 * It's still mostly the same as other profiles, just with extra rotation.
+	 */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
 	    (need_full_stripe(op) || mirror_num > 1)) {
-		u64 tmp;
-		unsigned rot;
-
-		/* Work out the disk rotation on this stripe-set */
-		rot = stripe_nr % num_stripes;
-
-		/* Fill in the logical address of each stripe */
-		tmp = stripe_nr * data_stripes;
-		for (i = 0; i < data_stripes; i++)
-			bioc->raid_map[(i + rot) % num_stripes] =
-				em->start + ((tmp + i) << BTRFS_STRIPE_LEN_SHIFT);
-
-		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
-		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-			bioc->raid_map[(i + rot + 1) % num_stripes] =
-				RAID6_Q_STRIPE;
-
-		sort_parity_stripes(bioc, num_stripes);
+		/*
+		 * For RAID56 @stripe_nr is already the number of full stripes
+		 * before us, which is also the rotation value (needs to modulo
+		 * with num_stripes).
+		 *
+		 * In this case, we just add @stripe_nr with @i, then do the
+		 * modulo, to reduce one modulo call.
+		 */
+		bioc->full_stripe_logical = em->start +
+			((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
+		for (i = 0; i < num_stripes; i++)
+			set_io_stripe(&bioc->stripes[i], map,
+				      (i + stripe_nr) % num_stripes,
+				      stripe_offset, stripe_nr);
+	} else {
+		/*
+		 * For all other non-RAID56 profiles, just copy the target
+		 * stripe into the bioc.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			set_io_stripe(&bioc->stripes[i], map, stripe_index,
+				      stripe_offset, stripe_nr);
+			stripe_index++;
+		}
 	}
 
 	if (need_full_stripe(op))
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e86e9f25ba0f..650e131d079e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -460,11 +460,22 @@ struct btrfs_io_context {
 	u16 replace_nr_stripes;
 	s16 replace_stripe_src;
 	/*
-	 * logical block numbers for the start of each stripe
-	 * The last one or two are p/q.  These are sorted,
-	 * so raid_map[0] is the start of our full stripe
+	 * Logical bytenr of the full stripe start, only for RAID56 cases.
+	 *
+	 * When this value is set to other than (u64)-1, the stripes[] should
+	 * follow this pattern:
+	 *
+	 * (real_stripes = num_stripes - replace_nr_stripes)
+	 * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1))
+	 *
+	 * stripes[0]:			The first data stripe
+	 * stripes[1]:			The second data stripe
+	 * ...
+	 * stripes[data_stripes - 1]:	The last data stripe
+	 * stripes[data_stripes]:	The P stripe
+	 * stripes[data_stripes + 1]:	The Q stripe (only for RAID6).
 	 */
-	u64 *raid_map;
+	u64 full_stripe_logical;
 	struct btrfs_io_stripe stripes[];
 };
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 75d7d22c3a27..8ea9cea9bfeb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2422,7 +2422,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio,
 	),
 
 	TP_fast_assign_btrfs(rbio->bioc->fs_info,
-		__entry->full_stripe	= rbio->bioc->raid_map[0];
+		__entry->full_stripe	= rbio->bioc->full_stripe_logical;
 		__entry->physical	= bio->bi_iter.bi_sector << SECTOR_SHIFT;
 		__entry->len		= bio->bi_iter.bi_size;
 		__entry->opf		= bio_op(bio);
-- 
cgit v1.2.3


From 0a0596fbbe5bddd28b1dfae7e7ecb6d70bdbf059 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Mar 2023 09:49:50 +0900
Subject: btrfs, mm: remove the punt_to_cgroup field in struct
 writeback_control

punt_to_cgroup is only used by extent_write_locked_range, but that
function also directly controls the bio flags for the actual submission.
Remove th punt_to_cgroup field, and just set REQ_CGROUP_PUNT directly
in extent_write_locked_range.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c      | 6 +++---
 include/linux/writeback.h | 5 -----
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1221f699ffc5..f5702b1e2b86 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2533,13 +2533,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= start,
 		.range_end	= end + 1,
-		/* We're called from an async helper function */
-		.punt_to_cgroup	= 1,
 		.no_cgroup_owner = 1,
 	};
 	struct btrfs_bio_ctrl bio_ctrl = {
 		.wbc = &wbc_writepages,
-		.opf = REQ_OP_WRITE | wbc_to_write_flags(&wbc_writepages),
+		/* We're called from an async helper function */
+		.opf = REQ_OP_WRITE | REQ_CGROUP_PUNT |
+			wbc_to_write_flags(&wbc_writepages),
 		.extent_locked = 1,
 	};
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 46020373e155..fba937999fbf 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -70,8 +70,6 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
-	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
-
 	/* To enable batching of swap writes to non-block-device backends,
 	 * "plug" can be set point to a 'struct swap_iocb *'.  When all swap
 	 * writes have been submitted, if with swap_iocb is not NULL,
@@ -97,9 +95,6 @@ static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
 {
 	blk_opf_t flags = 0;
 
-	if (wbc->punt_to_cgroup)
-		flags = REQ_CGROUP_PUNT;
-
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
-- 
cgit v1.2.3


From 3480373ebdf7625ee29bee6508c9fc4ae70c00bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Mar 2023 09:49:51 +0900
Subject: btrfs, block: move REQ_CGROUP_PUNT to btrfs

REQ_CGROUP_PUNT is a bit annoying as it is hard to follow and adds
a branch to the bio submission hot path.  To fix this, export
blkcg_punt_bio_submit and let btrfs call it directly.  Add a new
REQ_FS_PRIVATE flag for btrfs to indicate to it's own low-level
bio submission code that a punt to the cgroup submission helper
is required.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 block/blk-cgroup.c        | 31 +++++++++++++++++--------------
 block/blk-cgroup.h        | 12 ------------
 block/blk-core.c          |  3 ---
 fs/btrfs/bio.c            | 12 ++++++++----
 fs/btrfs/bio.h            |  3 +++
 fs/btrfs/extent_io.c      |  2 +-
 fs/btrfs/inode.c          |  2 +-
 include/linux/bio.h       |  5 +++++
 include/linux/blk_types.h | 18 +++++-------------
 9 files changed, 40 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bd50b55bdb61..9f5f3263c178 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1688,24 +1688,27 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
-bool __blkcg_punt_bio_submit(struct bio *bio)
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
 {
 	struct blkcg_gq *blkg = bio->bi_blkg;
 
-	/* consume the flag first */
-	bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
-	/* never bounce for the root cgroup */
-	if (!blkg->parent)
-		return false;
-
-	spin_lock_bh(&blkg->async_bio_lock);
-	bio_list_add(&blkg->async_bios, bio);
-	spin_unlock_bh(&blkg->async_bio_lock);
-
-	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
-	return true;
+	if (blkg->parent) {
+		spin_lock_bh(&blkg->async_bio_lock);
+		bio_list_add(&blkg->async_bios, bio);
+		spin_unlock_bh(&blkg->async_bio_lock);
+		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	} else {
+		/* never bounce for the root cgroup */
+		submit_bio(bio);
+	}
 }
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
 
 /*
  * Scale the accumulated delay based on how long it has been since we updated
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9c5078755e5e..64758ab9f1f1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -375,16 +375,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),	\
 					    (p_blkg)->q)))
 
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_CGROUP_PUNT)
-		return __blkcg_punt_bio_submit(bio);
-	else
-		return false;
-}
-
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
@@ -506,8 +496,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..478978dcb2bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct);
  */
 void submit_bio(struct bio *bio)
 {
-	if (blkcg_punt_bio_submit(bio))
-		return;
-
 	if (bio_op(bio) == REQ_OP_READ) {
 		task_io_account_read(bio->bi_iter.bi_size);
 		count_vm_events(PGPGIN, bio_sectors(bio));
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index afd2f90fdbff..ed5aa8a176b9 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -435,7 +435,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		dev->devid, bio->bi_iter.bi_size);
 
 	btrfsic_check_bio(bio);
-	submit_bio(bio);
+
+	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+		blkcg_punt_bio_submit(bio);
+	else
+		submit_bio(bio);
 }
 
 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -551,10 +555,10 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/*
 	 * All of the bios that pass through here are from async helpers.
-	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
-	 * This changes nothing when cgroups aren't in use.
+	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+	 * context.  This changes nothing when cgroups aren't in use.
 	 */
-	bio->bi_opf |= REQ_CGROUP_PUNT;
+	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
 
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index dbf125f6fa33..8edf3c35eead 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -88,6 +88,9 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 /* Bio only refers to one ordered extent. */
 #define REQ_BTRFS_ONE_ORDERED			REQ_DRV
 
+/* Submit using blkcg_punt_bio_submit. */
+#define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
+
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f5702b1e2b86..f40e4a002f78 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2538,7 +2538,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_bio_ctrl bio_ctrl = {
 		.wbc = &wbc_writepages,
 		/* We're called from an async helper function */
-		.opf = REQ_OP_WRITE | REQ_CGROUP_PUNT |
+		.opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
 			wbc_to_write_flags(&wbc_writepages),
 		.extent_locked = 1,
 	};
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c216cab2076..93e16a408f43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1616,7 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
 		if (blkcg_css != blkcg_root_css) {
 			css_get(blkcg_css);
 			async_chunk[i].blkcg_css = blkcg_css;
-			async_chunk[i].write_flags |= REQ_CGROUP_PUNT;
+			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 		} else {
 			async_chunk[i].blkcg_css = NULL;
 		}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d766be7152e1..b3e7529ff55e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
+void blkcg_punt_bio_submit(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio,
 { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
+static inline void blkcg_punt_bio_submit(struct bio *bio)
+{
+	submit_bio(bio);
+}
 #endif	/* CONFIG_BLK_CGROUP */
 
 static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..fb8843990d28 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -404,18 +404,11 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
-	/*
-	 * When a shared kthread needs to issue a bio for a cgroup, doing
-	 * so synchronously can lead to priority inversions as the kthread
-	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
-	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
-	 * work item to avoid such priority inversions.
-	 */
-	__REQ_CGROUP_PUNT,
 	__REQ_POLLED,		/* caller polls for completion using bio_poll */
 	__REQ_ALLOC_CACHE,	/* allocate IO from cache if available */
 	__REQ_SWAP,		/* swap I/O */
 	__REQ_DRV,		/* for driver use */
+	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 
 	/*
 	 * Command specific flags, keep last:
@@ -443,14 +436,13 @@ enum req_flag_bits {
 #define REQ_RAHEAD	(__force blk_opf_t)(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND	(__force blk_opf_t)(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT	(__force blk_opf_t)(1ULL << __REQ_NOWAIT)
-#define REQ_CGROUP_PUNT	(__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT)
-
-#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 #define REQ_POLLED	(__force blk_opf_t)(1ULL << __REQ_POLLED)
 #define REQ_ALLOC_CACHE	(__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
-
-#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_SWAP	(__force blk_opf_t)(1ULL << __REQ_SWAP)
+#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
+#define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
+
+#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
-- 
cgit v1.2.3


From 7533583e125d65bac1435410ecb56433e95eade0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2023 07:49:05 +0200
Subject: libcrc32c: remove crc32c_impl

This was only ever used by btrfs, and the usage just went away.
This effectively reverts df91f56adce1 ("libcrc32c: Add crc32c_impl
function").

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/crc32c.h | 1 -
 lib/libcrc32c.c        | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index bd21af828ff6..357ae4611a45 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 
 extern u32 crc32c(u32 crc, const void *address, unsigned int length);
-extern const char *crc32c_impl(void);
 
 /* This macro exists for backwards-compatibility. */
 #define crc32c_le crc32c
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 5ca0d815a95d..649e687413a0 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void)
 	crypto_free_shash(tfm);
 }
 
-const char *crc32c_impl(void)
-{
-	return crypto_shash_driver_name(tfm);
-}
-EXPORT_SYMBOL(crc32c_impl);
-
 module_init(libcrc32c_mod_init);
 module_exit(libcrc32c_mod_fini);
 
-- 
cgit v1.2.3


From 604e6681e114d05a2e384c4d1e8ef81918037ef5 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 6 Apr 2023 13:00:34 +0800
Subject: btrfs: scrub: reject unsupported scrub flags

Since the introduction of scrub interface, the only flag that we support
is BTRFS_SCRUB_READONLY.  Thus there is no sanity checks, if there are
some undefined flags passed in, we just ignore them.

This is problematic if we want to introduce new scrub flags, as we have
no way to determine if such flags are supported.

Address the problem by introducing a check for the flags, and if
unsupported flags are set, return -EOPNOTSUPP to inform the user space.

This check should be backported for all supported kernels before any new
scrub flags are introduced.

CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c           | 5 +++++
 include/uapi/linux/btrfs.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba769a1eb87a..25833b4eeaf5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 	if (IS_ERR(sa))
 		return PTR_ERR(sa);
 
+	if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
 		ret = mnt_want_write_file(file);
 		if (ret)
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index ada0a489bf2b..dbb8b96da50d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -187,6 +187,7 @@ struct btrfs_scrub_progress {
 };
 
 #define BTRFS_SCRUB_READONLY	1
+#define BTRFS_SCRUB_SUPPORTED_FLAGS	(BTRFS_SCRUB_READONLY)
 struct btrfs_ioctl_scrub_args {
 	__u64 devid;				/* in */
 	__u64 start;				/* in */
-- 
cgit v1.2.3


From 1f6277bf716cc5ba0e3fa0c3e0af1adb4160fb5d Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Mon, 20 Mar 2023 00:47:37 -0700
Subject: ACPI: bus: Add stub acpi_sleep_state_supported() in non-ACPI cases

acpi_sleep_state_supported() is defined only when CONFIG_ACPI=y. The
function is in acpi_bus.h, and acpi_bus.h can only be used in
CONFIG_ACPI=y cases. Add the stub function to linux/acpi.h to make
compilation successful for !CONFIG_ACPI cases.

Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/1679298460-11855-3-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index efff750f326d..d331f76b0c19 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1075,6 +1075,11 @@ static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
 	return 0;
 }
 
+static inline bool acpi_sleep_state_supported(u8 sleep_state)
+{
+	return false;
+}
+
 #endif	/* !CONFIG_ACPI */
 
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
-- 
cgit v1.2.3


From 0459ff4873739986dccafbb417cfc69e71bdacf4 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Sun, 26 Mar 2023 06:52:02 -0700
Subject: swiotlb: Remove bounce buffer remapping for Hyper-V

With changes to how Hyper-V guest VMs flip memory between private
(encrypted) and shared (decrypted), creating a second kernel virtual
mapping for shared memory is no longer necessary. Everything needed
for the transition to shared is handled by set_memory_decrypted().

As such, remove swiotlb_unencrypted_base and the associated
code.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/1679838727-87310-8-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c |  7 +------
 include/linux/swiotlb.h        |  2 --
 kernel/dma/swiotlb.c           | 45 +-----------------------------------------
 3 files changed, 2 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 315fc358e584..ac630ecde795 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,7 +18,6 @@
 #include <linux/kexec.h>
 #include <linux/i8253.h>
 #include <linux/random.h>
-#include <linux/swiotlb.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv-tlfs.h>
@@ -408,12 +407,8 @@ static void __init ms_hyperv_init_platform(void)
 		pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
 			ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
 
-		if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+		if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
 			static_branch_enable(&isolation_type_snp);
-#ifdef CONFIG_SWIOTLB
-			swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
-#endif
-		}
 	}
 
 	if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index bcef10e20ea4..2ef25e6fa1b4 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -180,6 +180,4 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
 }
 #endif /* CONFIG_DMA_RESTRICTED_POOL */
 
-extern phys_addr_t swiotlb_unencrypted_base;
-
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index dac42a2ad588..0fb16bdbbcae 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -73,8 +73,6 @@ static bool swiotlb_force_disable;
 
 struct io_tlb_mem io_tlb_default_mem;
 
-phys_addr_t swiotlb_unencrypted_base;
-
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
 static unsigned long default_nareas;
 
@@ -201,34 +199,6 @@ static inline unsigned long nr_slots(u64 val)
 	return DIV_ROUND_UP(val, IO_TLB_SIZE);
 }
 
-/*
- * Remap swioltb memory in the unencrypted physical address space
- * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
- * Isolation VMs).
- */
-#ifdef CONFIG_HAS_IOMEM
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
-	void *vaddr = NULL;
-
-	if (swiotlb_unencrypted_base) {
-		phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
-
-		vaddr = memremap(paddr, bytes, MEMREMAP_WB);
-		if (!vaddr)
-			pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
-			       &paddr, bytes);
-	}
-
-	return vaddr;
-}
-#else
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
-	return NULL;
-}
-#endif
-
 /*
  * Early SWIOTLB allocation may be too early to allow an architecture to
  * perform the desired operations.  This function allows the architecture to
@@ -238,18 +208,12 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
 void __init swiotlb_update_mem_attributes(void)
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	void *vaddr;
 	unsigned long bytes;
 
 	if (!mem->nslabs || mem->late_alloc)
 		return;
-	vaddr = phys_to_virt(mem->start);
 	bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
-	set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
-
-	mem->vaddr = swiotlb_mem_remap(mem, bytes);
-	if (!mem->vaddr)
-		mem->vaddr = vaddr;
+	set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -280,13 +244,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 		mem->slots[i].alloc_size = 0;
 	}
 
-	/*
-	 * If swiotlb_unencrypted_base is set, the bounce buffer memory will
-	 * be remapped and cleared in swiotlb_update_mem_attributes.
-	 */
-	if (swiotlb_unencrypted_base)
-		return;
-
 	memset(vaddr, 0, bytes);
 	mem->vaddr = vaddr;
 	return;
-- 
cgit v1.2.3


From 25727aaed6514b88f98a18862c6f2d65a0b0ec3b Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Sun, 26 Mar 2023 06:52:05 -0700
Subject: hv_netvsc: Remove second mapping of send and recv buffers

With changes to how Hyper-V guest VMs flip memory between private
(encrypted) and shared (decrypted), creating a second kernel virtual
mapping for shared memory is no longer necessary.  Everything needed
for the transition to shared is handled by set_memory_decrypted().

As such, remove the code to create and manage the second
mapping for the pre-allocated send and recv buffers.  This mapping
is the last user of hv_map_memory()/hv_unmap_memory(), so delete
these functions as well.  Finally, hv_map_memory() is the last
user of vmap_pfn() in Hyper-V guest code, so remove the Kconfig
selection of VMAP_PFN.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Link: https://lore.kernel.org/r/1679838727-87310-11-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/hyperv/ivm.c           | 28 ------------------------
 drivers/hv/Kconfig              |  1 -
 drivers/hv/hv_common.c          | 11 ----------
 drivers/net/hyperv/hyperv_net.h |  2 --
 drivers/net/hyperv/netvsc.c     | 48 ++---------------------------------------
 include/asm-generic/mshyperv.h  |  2 --
 6 files changed, 2 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index f6a020cb1a24..127d5b7b63de 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -376,34 +376,6 @@ void __init hv_vtom_init(void)
 
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
-/*
- * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
- */
-void *hv_map_memory(void *addr, unsigned long size)
-{
-	unsigned long *pfns = kcalloc(size / PAGE_SIZE,
-				      sizeof(unsigned long), GFP_KERNEL);
-	void *vaddr;
-	int i;
-
-	if (!pfns)
-		return NULL;
-
-	for (i = 0; i < size / PAGE_SIZE; i++)
-		pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
-			(ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
-
-	vaddr = vmap_pfn(pfns, size / PAGE_SIZE, pgprot_decrypted(PAGE_KERNEL));
-	kfree(pfns);
-
-	return vaddr;
-}
-
-void hv_unmap_memory(void *addr)
-{
-	vunmap(addr);
-}
-
 enum hv_isolation_type hv_get_isolation_type(void)
 {
 	if (!(ms_hyperv.priv_high & HV_ISOLATION))
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 47132b30b7ee..94982f08b661 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -8,7 +8,6 @@ config HYPERV
 		|| (ACPI && ARM64 && !CPU_BIG_ENDIAN)
 	select PARAVIRT
 	select X86_HV_CALLBACK_VECTOR if X86
-	select VMAP_PFN
 	select OF_EARLY_FLATTREE if OF
 	help
 	  Select this option to run Linux as a Hyper-V client operating
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 52a6f89ccdbd..6d40b6c7b23b 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -311,14 +311,3 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s
 	return HV_STATUS_INVALID_PARAMETER;
 }
 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
-
-void __weak *hv_map_memory(void *addr, unsigned long size)
-{
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(hv_map_memory);
-
-void __weak hv_unmap_memory(void *addr)
-{
-}
-EXPORT_SYMBOL_GPL(hv_unmap_memory);
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index dd5919ec408b..33d51e363913 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -1139,7 +1139,6 @@ struct netvsc_device {
 
 	/* Receive buffer allocated by us but manages by NetVSP */
 	void *recv_buf;
-	void *recv_original_buf;
 	u32 recv_buf_size; /* allocated bytes */
 	struct vmbus_gpadl recv_buf_gpadl_handle;
 	u32 recv_section_cnt;
@@ -1148,7 +1147,6 @@ struct netvsc_device {
 
 	/* Send buffer allocated by us */
 	void *send_buf;
-	void *send_original_buf;
 	u32 send_buf_size;
 	struct vmbus_gpadl send_buf_gpadl_handle;
 	u32 send_section_cnt;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index da737d959e81..82e9796c8f5e 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -154,17 +154,8 @@ static void free_netvsc_device(struct rcu_head *head)
 	int i;
 
 	kfree(nvdev->extension);
-
-	if (nvdev->recv_original_buf)
-		vfree(nvdev->recv_original_buf);
-	else
-		vfree(nvdev->recv_buf);
-
-	if (nvdev->send_original_buf)
-		vfree(nvdev->send_original_buf);
-	else
-		vfree(nvdev->send_buf);
-
+	vfree(nvdev->recv_buf);
+	vfree(nvdev->send_buf);
 	bitmap_free(nvdev->send_section_map);
 
 	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
@@ -347,7 +338,6 @@ static int netvsc_init_buf(struct hv_device *device,
 	struct nvsp_message *init_packet;
 	unsigned int buf_size;
 	int i, ret = 0;
-	void *vaddr;
 
 	/* Get receive buffer area. */
 	buf_size = device_info->recv_sections * device_info->recv_section_size;
@@ -383,17 +373,6 @@ static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
-	if (hv_isolation_type_snp()) {
-		vaddr = hv_map_memory(net_device->recv_buf, buf_size);
-		if (!vaddr) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-
-		net_device->recv_original_buf = net_device->recv_buf;
-		net_device->recv_buf = vaddr;
-	}
-
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -497,17 +476,6 @@ static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
-	if (hv_isolation_type_snp()) {
-		vaddr = hv_map_memory(net_device->send_buf, buf_size);
-		if (!vaddr) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-
-		net_device->send_original_buf = net_device->send_buf;
-		net_device->send_buf = vaddr;
-	}
-
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -762,12 +730,6 @@ void netvsc_device_remove(struct hv_device *device)
 		netvsc_teardown_send_gpadl(device, net_device, ndev);
 	}
 
-	if (net_device->recv_original_buf)
-		hv_unmap_memory(net_device->recv_buf);
-
-	if (net_device->send_original_buf)
-		hv_unmap_memory(net_device->send_buf);
-
 	/* Release all resources */
 	free_netvsc_device_rcu(net_device);
 }
@@ -1844,12 +1806,6 @@ cleanup:
 	netif_napi_del(&net_device->chan_table[0].napi);
 
 cleanup2:
-	if (net_device->recv_original_buf)
-		hv_unmap_memory(net_device->recv_buf);
-
-	if (net_device->send_original_buf)
-		hv_unmap_memory(net_device->send_buf);
-
 	free_netvsc_device(&net_device->rcu);
 
 	return ERR_PTR(ret);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 90d7f68ed39d..afcd9ae9588c 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -271,8 +271,6 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
-void *hv_map_memory(void *addr, unsigned long size);
-void hv_unmap_memory(void *addr);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
 static inline bool hv_is_hibernation_supported(void) { return false; }
-- 
cgit v1.2.3


From 2c6ba4216844ca7918289b49ed5f3f7138ee2402 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Sun, 26 Mar 2023 06:52:07 -0700
Subject: PCI: hv: Enable PCI pass-thru devices in Confidential VMs

For PCI pass-thru devices in a Confidential VM, Hyper-V requires
that PCI config space be accessed via hypercalls.  In normal VMs,
config space accesses are trapped to the Hyper-V host and emulated.
But in a confidential VM, the host can't access guest memory to
decode the instruction for emulation, so an explicit hypercall must
be used.

Add functions to make the new MMIO read and MMIO write hypercalls.
Update the PCI config space access functions to use the hypercalls
when such use is indicated by Hyper-V flags.  Also, set the flag to
allow the Hyper-V PCI driver to be loaded and used in a Confidential
VM (a.k.a., "Isolation VM").  The driver has previously been hardened
against a malicious Hyper-V host[1].

[1] https://lore.kernel.org/all/20220511223207.3386-2-parri.andrea@gmail.com/

Co-developed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://lore.kernel.org/r/1679838727-87310-13-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/include/asm/hyperv-tlfs.h  |   3 +
 drivers/hv/channel_mgmt.c           |   2 +-
 drivers/pci/controller/pci-hyperv.c | 232 ++++++++++++++++++++++++++----------
 include/asm-generic/hyperv-tlfs.h   |  22 ++++
 4 files changed, 194 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 0b73a809e9e1..b4fb75bd1013 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -122,6 +122,9 @@
 /* Recommend using enlightened VMCS */
 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED		BIT(14)
 
+/* Use hypercalls for MMIO config space access */
+#define HV_X64_USE_MMIO_HYPERCALLS			BIT(21)
+
 /*
  * CPU management features identification.
  * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index cc23b90cae02..007f26d5f1a4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -67,7 +67,7 @@ const struct vmbus_device vmbus_devs[] = {
 	{ .dev_type = HV_PCIE,
 	  HV_PCIE_GUID,
 	  .perf_device = false,
-	  .allowed_in_isolated = false,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Synthetic Frame Buffer */
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index f33370b75628..337f3b4a04fc 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -514,6 +514,7 @@ struct hv_pcibus_device {
 
 	/* Highest slot of child device with resources allocated */
 	int wslot_res_allocated;
+	bool use_calls; /* Use hypercalls to access mmio cfg space */
 
 	/* hypercall arg, must not cross page boundary */
 	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
@@ -1041,6 +1042,70 @@ static int wslot_to_devfn(u32 wslot)
 	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
 }
 
+static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val)
+{
+	struct hv_mmio_read_input *in;
+	struct hv_mmio_read_output *out;
+	u64 ret;
+
+	/*
+	 * Must be called with interrupts disabled so it is safe
+	 * to use the per-cpu input argument page.  Use it for
+	 * both input and output.
+	 */
+	in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in);
+	in->gpa = gpa;
+	in->size = size;
+
+	ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out);
+	if (hv_result_success(ret)) {
+		switch (size) {
+		case 1:
+			*val = *(u8 *)(out->data);
+			break;
+		case 2:
+			*val = *(u16 *)(out->data);
+			break;
+		default:
+			*val = *(u32 *)(out->data);
+			break;
+		}
+	} else
+		dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n",
+				ret, gpa, size);
+}
+
+static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val)
+{
+	struct hv_mmio_write_input *in;
+	u64 ret;
+
+	/*
+	 * Must be called with interrupts disabled so it is safe
+	 * to use the per-cpu input argument memory.
+	 */
+	in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	in->gpa = gpa;
+	in->size = size;
+	switch (size) {
+	case 1:
+		*(u8 *)(in->data) = val;
+		break;
+	case 2:
+		*(u16 *)(in->data) = val;
+		break;
+	default:
+		*(u32 *)(in->data) = val;
+		break;
+	}
+
+	ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL);
+	if (!hv_result_success(ret))
+		dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n",
+				ret, gpa, size);
+}
+
 /*
  * PCI Configuration Space for these root PCI buses is implemented as a pair
  * of pages in memory-mapped I/O space.  Writing to the first page chooses
@@ -1059,8 +1124,10 @@ static int wslot_to_devfn(u32 wslot)
 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 				     int size, u32 *val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	int offset = where + CFG_PAGE_OFFSET;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
 	/*
 	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
@@ -1088,56 +1155,79 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 		 */
 		*val = 0;
 	} else if (where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be read. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start reading. */
-		mb();
-		/* Read from that function's config space. */
-		switch (size) {
-		case 1:
-			*val = readb(addr);
-			break;
-		case 2:
-			*val = readw(addr);
-			break;
-		default:
-			*val = readl(addr);
-			break;
+
+		spin_lock_irqsave(&hbus->config_lock, flags);
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+						hpdev->desc.win_slot.slot);
+			hv_pci_read_mmio(dev, addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to be read. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before reading. */
+			mb();
+			/* Read from that function's config space. */
+			switch (size) {
+			case 1:
+				*val = readb(addr);
+				break;
+			case 2:
+				*val = readw(addr);
+				break;
+			default:
+				*val = readl(addr);
+				break;
+			}
+			/*
+			 * Make sure the read was done before we release the
+			 * spinlock allowing consecutive reads/writes.
+			 */
+			mb();
 		}
-		/*
-		 * Make sure the read was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
 	} else {
-		dev_err(&hpdev->hbus->hdev->device,
-			"Attempt to read beyond a function's config space.\n");
+		dev_err(dev, "Attempt to read beyond a function's config space.\n");
 	}
 }
 
 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	u32 val;
 	u16 ret;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
-			     PCI_VENDOR_ID;
 
-	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
+	spin_lock_irqsave(&hbus->config_lock, flags);
 
-	/* Choose the function to be read. (See comment above) */
-	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-	/* Make sure the function was chosen before we start reading. */
-	mb();
-	/* Read from that function's config space. */
-	ret = readw(addr);
-	/*
-	 * mb() is not required here, because the spin_unlock_irqrestore()
-	 * is a barrier.
-	 */
+	if (hbus->use_calls) {
+		phys_addr_t addr = hbus->mem_config->start +
+					 CFG_PAGE_OFFSET + PCI_VENDOR_ID;
+
+		hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+					hpdev->desc.win_slot.slot);
+		hv_pci_read_mmio(dev, addr, 2, &val);
+		ret = val;  /* Truncates to 16 bits */
+	} else {
+		void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
+					     PCI_VENDOR_ID;
+		/* Choose the function to be read. (See comment above) */
+		writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+		/* Make sure the function was chosen before we start reading. */
+		mb();
+		/* Read from that function's config space. */
+		ret = readw(addr);
+		/*
+		 * mb() is not required here, because the
+		 * spin_unlock_irqrestore() is a barrier.
+		 */
+	}
 
-	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+	spin_unlock_irqrestore(&hbus->config_lock, flags);
 
 	return ret;
 }
@@ -1152,39 +1242,51 @@ static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
 				      int size, u32 val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	int offset = where + CFG_PAGE_OFFSET;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
 	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
 	    where + size <= PCI_CAPABILITY_LIST) {
 		/* SSIDs and ROM BARs are read-only */
 	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be written. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start writing. */
-		wmb();
-		/* Write to that function's config space. */
-		switch (size) {
-		case 1:
-			writeb(val, addr);
-			break;
-		case 2:
-			writew(val, addr);
-			break;
-		default:
-			writel(val, addr);
-			break;
+		spin_lock_irqsave(&hbus->config_lock, flags);
+
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+						hpdev->desc.win_slot.slot);
+			hv_pci_write_mmio(dev, addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to write. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before writing. */
+			wmb();
+			/* Write to that function's config space. */
+			switch (size) {
+			case 1:
+				writeb(val, addr);
+				break;
+			case 2:
+				writew(val, addr);
+				break;
+			default:
+				writel(val, addr);
+				break;
+			}
+			/*
+			 * Make sure the write was done before we release the
+			 * spinlock allowing consecutive reads/writes.
+			 */
+			mb();
 		}
-		/*
-		 * Make sure the write was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
 	} else {
-		dev_err(&hpdev->hbus->hdev->device,
-			"Attempt to write beyond a function's config space.\n");
+		dev_err(dev, "Attempt to write beyond a function's config space.\n");
 	}
 }
 
@@ -3563,6 +3665,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 	hbus->bridge->domain_nr = dom;
 #ifdef CONFIG_X86
 	hbus->sysdata.domain = dom;
+	hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
 #elif defined(CONFIG_ARM64)
 	/*
 	 * Set the PCI bus parent to be the corresponding VMbus
@@ -3572,6 +3675,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 	 * information to devices created on the bus.
 	 */
 	hbus->sysdata.parent = hdev->device.parent;
+	hbus->use_calls = false;
 #endif
 
 	hbus->hdev = hdev;
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index b870983596b9..ea406e901469 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -168,6 +168,8 @@ union hv_reference_tsc_msr {
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
 #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
+#define HVCALL_MMIO_READ			0x0106
+#define HVCALL_MMIO_WRITE			0x0107
 
 /* Extended hypercalls */
 #define HV_EXT_CALL_QUERY_CAPABILITIES		0x8001
@@ -796,4 +798,24 @@ struct hv_memory_hint {
 	union hv_gpa_page_range ranges[];
 } __packed;
 
+/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */
+#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64
+
+struct hv_mmio_read_input {
+	u64 gpa;
+	u32 size;
+	u32 reserved;
+} __packed;
+
+struct hv_mmio_read_output {
+	u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH];
+} __packed;
+
+struct hv_mmio_write_input {
+	u64 gpa;
+	u32 size;
+	u32 reserved;
+	u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH];
+} __packed;
+
 #endif
-- 
cgit v1.2.3


From d7b6ba9611aefc4bef207b16db8ff06b726efe35 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Mon, 27 Mar 2023 06:16:06 -0700
Subject: x86/hyperv: Add callback filter to cpumask_to_vpset()

When copying CPUs from a Linux cpumask to a Hyper-V VPset,
cpumask_to_vpset() currently has a "_noself" variant that doesn't copy
the current CPU to the VPset. Generalize this variant by replacing it
with a "_skip" variant having a callback function that is invoked for
each CPU to decide if that CPU should be copied. Update the one caller
of cpumask_to_vpset_noself() to use the new "_skip" variant instead.

No functional change.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/1679922967-26582-2-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/hyperv/hv_apic.c      | 12 ++++++++----
 include/asm-generic/mshyperv.h | 22 ++++++++++++++--------
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..1fbda2f94184 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -96,6 +96,11 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
 	wrmsr(HV_X64_MSR_EOI, val, 0);
 }
 
+static bool cpu_is_self(int cpu)
+{
+	return cpu == smp_processor_id();
+}
+
 /*
  * IPI implementation on Hyper-V.
  */
@@ -128,10 +133,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
 	 */
 	if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
 		ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
-		if (exclude_self)
-			nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
-		else
-			nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+
+		nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask,
+				exclude_self ? cpu_is_self : NULL);
 
 		/*
 		 * 'nr_bank <= 0' means some CPUs in cpumask can't be
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index afcd9ae9588c..402a8c1c202d 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -210,10 +210,9 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
 
 static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
 				    const struct cpumask *cpus,
-				    bool exclude_self)
+				    bool (*func)(int cpu))
 {
 	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
-	int this_cpu = smp_processor_id();
 	int max_vcpu_bank = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
 
 	/* vpset.valid_bank_mask can represent up to HV_MAX_SPARSE_VCPU_BANKS banks */
@@ -232,7 +231,7 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
 	 * Some banks may end up being empty but this is acceptable.
 	 */
 	for_each_cpu(cpu, cpus) {
-		if (exclude_self && cpu == this_cpu)
+		if (func && func(cpu))
 			continue;
 		vcpu = hv_cpu_number_to_vp_number(cpu);
 		if (vcpu == VP_INVAL)
@@ -248,17 +247,24 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
 	return nr_bank;
 }
 
+/*
+ * Convert a Linux cpumask into a Hyper-V VPset. In the _skip variant,
+ * 'func' is called for each CPU present in cpumask.  If 'func' returns
+ * true, that CPU is skipped -- i.e., that CPU from cpumask is *not*
+ * added to the Hyper-V VPset. If 'func' is NULL, no CPUs are
+ * skipped.
+ */
 static inline int cpumask_to_vpset(struct hv_vpset *vpset,
 				    const struct cpumask *cpus)
 {
-	return __cpumask_to_vpset(vpset, cpus, false);
+	return __cpumask_to_vpset(vpset, cpus, NULL);
 }
 
-static inline int cpumask_to_vpset_noself(struct hv_vpset *vpset,
-				    const struct cpumask *cpus)
+static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset,
+				    const struct cpumask *cpus,
+				    bool (*func)(int cpu))
 {
-	WARN_ON_ONCE(preemptible());
-	return __cpumask_to_vpset(vpset, cpus, true);
+	return __cpumask_to_vpset(vpset, cpus, func);
 }
 
 void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
-- 
cgit v1.2.3


From 59e498a3289f685261c076b998a8a2f8a516874f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Apr 2023 15:49:15 +0200
Subject: bpf: Set skb redirect and from_ingress info in __bpf_tx_skb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are some use-cases where it is desirable to use bpf_redirect()
in combination with ifb device, which currently is not supported, for
example, around filtering inbound traffic with BPF to then push it to
ifb which holds the qdisc for shaping in contrast to doing that on the
egress device.

Toke mentions the following case related to OpenWrt:

   Because there's not always a single egress on the other side. These are
   mainly home routers, which tend to have one or more WiFi devices bridged
   to one or more ethernet ports on the LAN side, and a single upstream WAN
   port. And the objective is to control the total amount of traffic going
   over the WAN link (in both directions), to deal with bufferbloat in the
   ISP network (which is sadly still all too prevalent).

   In this setup, the traffic can be split arbitrarily between the links
   on the LAN side, and the only "single bottleneck" is the WAN link. So we
   install both egress and ingress shapers on this, configured to something
   like 95-98% of the true link bandwidth, thus moving the queues into the
   qdisc layer in the router. It's usually necessary to set the ingress
   bandwidth shaper a bit lower than the egress due to being "downstream"
   of the bottleneck link, but it does work surprisingly well.

   We usually use something like a matchall filter to put all ingress
   traffic on the ifb, so doing the redirect from BPF has not been an
   immediate requirement thus far. However, it does seem a bit odd that
   this is not possible, and we do have a BPF-based filter that layers on
   top of this kind of setup, which currently uses u32 as the ingress
   filter and so it could presumably be improved to use BPF instead if
   that was available.

Reported-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reported-by: Yafang Shao <laoar.shao@gmail.com>
Reported-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://git.openwrt.org/?p=project/qosify.git;a=blob;f=README
Link: https://lore.kernel.org/bpf/875y9yzbuy.fsf@toke.dk
Link: https://lore.kernel.org/r/8cebc8b2b6e967e10cbafe2ffd6795050e74accd.1681739137.git.daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 9 +++++++++
 net/core/filter.c      | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..9ff2e3d57329 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5041,6 +5041,15 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 	skb->redirected = 0;
 }
 
+static inline void skb_set_redirected_noclear(struct sk_buff *skb,
+					      bool from_ingress)
+{
+	skb->redirected = 1;
+#ifdef CONFIG_NET_REDIRECT
+	skb->from_ingress = from_ingress;
+#endif
+}
+
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
 {
 	return skb->csum_not_inet;
diff --git a/net/core/filter.c b/net/core/filter.c
index df0df59814ae..44fb997434ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2122,6 +2122,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 	}
 
 	skb->dev = dev;
+	skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
 	skb_clear_tstamp(skb);
 
 	dev_xmit_recursion_inc();
-- 
cgit v1.2.3


From 1210af3b99561bbe140af8d9a6b16d1bd0ba3fda Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 13 Apr 2023 15:29:19 +0300
Subject: net/mlx5e: Add IPsec packet offload tunnel bits

Extend packet reformat types and flow table capabilities with
IPsec packet offload tunnel bits.

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6c84bf6eec85..20d00e09b168 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -463,9 +463,11 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         max_ft_level[0x8];
 
 	u8         reformat_add_esp_trasport[0x1];
-	u8         reserved_at_41[0x2];
+	u8         reformat_l2_to_l3_esp_tunnel[0x1];
+	u8         reserved_at_42[0x1];
 	u8         reformat_del_esp_trasport[0x1];
-	u8         reserved_at_44[0x2];
+	u8         reformat_l3_esp_tunnel_to_l2[0x1];
+	u8         reserved_at_45[0x1];
 	u8         execute_aso[0x1];
 	u8         reserved_at_47[0x19];
 
@@ -6630,7 +6632,9 @@ enum mlx5_reformat_ctx_type {
 	MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
 	MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
 	MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5,
+	MLX5_REFORMAT_TYPE_L2_TO_L3_ESP_TUNNEL = 0x6,
 	MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8,
+	MLX5_REFORMAT_TYPE_L3_ESP_TUNNEL_TO_L2 = 0x9,
 	MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb,
 	MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf,
 	MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10,
-- 
cgit v1.2.3


From 15f93f46f31232da863316769182c699e364c45f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:45 +0300
Subject: net: mscc: ocelot: export a single ocelot_mm_irq()

When the switch emits an IRQ, we don't know what caused it, and we
iterate through all ports to check the MAC Merge status.

Move that iteration inside the ocelot lib; we will change the locking in
a future change and it would be good to encapsulate that lock completely
within the ocelot lib.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  5 +----
 drivers/net/ethernet/mscc/ocelot_mm.c  | 12 ++++++++++--
 include/soc/mscc/ocelot.h              |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index dddb28984bdf..478893c06f56 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -2610,12 +2610,9 @@ static const struct felix_info felix_info_vsc9959 = {
 static irqreturn_t felix_irq_handler(int irq, void *data)
 {
 	struct ocelot *ocelot = (struct ocelot *)data;
-	int port;
 
 	ocelot_get_txtstamp(ocelot);
-
-	for (port = 0; port < ocelot->num_phys_ports; port++)
-		ocelot_port_mm_irq(ocelot, port);
+	ocelot_mm_irq(ocelot);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index 0a8f21ae23f0..ddaf1fb05e48 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -49,7 +49,7 @@ static enum ethtool_mm_verify_status ocelot_mm_verify_status(u32 val)
 	}
 }
 
-void ocelot_port_mm_irq(struct ocelot *ocelot, int port)
+static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct ocelot_mm_state *mm = &ocelot->mm[port];
@@ -91,7 +91,15 @@ void ocelot_port_mm_irq(struct ocelot *ocelot, int port)
 
 	mutex_unlock(&mm->lock);
 }
-EXPORT_SYMBOL_GPL(ocelot_port_mm_irq);
+
+void ocelot_mm_irq(struct ocelot *ocelot)
+{
+	int port;
+
+	for (port = 0; port < ocelot->num_phys_ports; port++)
+		ocelot_mm_update_port_status(ocelot, port);
+}
+EXPORT_SYMBOL_GPL(ocelot_mm_irq);
 
 int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_cfg *cfg,
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 277e6d1f2096..eb8e3935375d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1148,7 +1148,7 @@ int ocelot_vcap_policer_add(struct ocelot *ocelot, u32 pol_ix,
 			    struct ocelot_policer *pol);
 int ocelot_vcap_policer_del(struct ocelot *ocelot, u32 pol_ix);
 
-void ocelot_port_mm_irq(struct ocelot *ocelot, int port);
+void ocelot_mm_irq(struct ocelot *ocelot);
 int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_cfg *cfg,
 		       struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 3ff468ef987e38740de9ca0a811c55e11bfb2141 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:46 +0300
Subject: net: mscc: ocelot: remove struct ocelot_mm_state :: lock

Unfortunately, the workarounds for the hardware bugs make it pointless
to keep fine-grained locking for the MAC Merge state of each port.

Our vsc9959_cut_through_fwd() implementation requires
ocelot->fwd_domain_lock to be held, in order to serialize with changes
to the bridging domains and to port speed changes (which affect which
ports can be cut-through). Simultaneously, the traffic classes which can
be cut-through cannot be preemptible at the same time, and this will
depend on the MAC Merge layer state (which changes from threaded
interrupt context).

Since vsc9959_cut_through_fwd() would have to hold the mm->lock of all
ports for a correct and race-free implementation with respect to
ocelot_mm_irq(), in practice it means that any time a port's mm->lock is
held, it would potentially block holders of ocelot->fwd_domain_lock.

In the interest of simple locking rules, make all MAC Merge layer state
changes (and preemptible traffic class changes) be serialized by the
ocelot->fwd_domain_lock.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_mm.c | 20 ++++++++------------
 include/soc/mscc/ocelot.h             |  1 -
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index ddaf1fb05e48..d2df47e6f8f6 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -56,8 +56,6 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	enum ethtool_mm_verify_status verify_status;
 	u32 val;
 
-	mutex_lock(&mm->lock);
-
 	val = ocelot_port_readl(ocelot_port, DEV_MM_STATUS);
 
 	verify_status = ocelot_mm_verify_status(val);
@@ -88,16 +86,18 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	}
 
 	ocelot_port_writel(ocelot_port, val, DEV_MM_STATUS);
-
-	mutex_unlock(&mm->lock);
 }
 
 void ocelot_mm_irq(struct ocelot *ocelot)
 {
 	int port;
 
+	mutex_lock(&ocelot->fwd_domain_lock);
+
 	for (port = 0; port < ocelot->num_phys_ports; port++)
 		ocelot_mm_update_port_status(ocelot, port);
+
+	mutex_unlock(&ocelot->fwd_domain_lock);
 }
 EXPORT_SYMBOL_GPL(ocelot_mm_irq);
 
@@ -107,14 +107,11 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	u32 mm_enable = 0, verify_disable = 0, add_frag_size;
-	struct ocelot_mm_state *mm;
 	int err;
 
 	if (!ocelot->mm_supported)
 		return -EOPNOTSUPP;
 
-	mm = &ocelot->mm[port];
-
 	err = ethtool_mm_frag_size_min_to_add(cfg->tx_min_frag_size,
 					      &add_frag_size, extack);
 	if (err)
@@ -129,7 +126,7 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 	if (!cfg->verify_enabled)
 		verify_disable = DEV_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_DIS;
 
-	mutex_lock(&mm->lock);
+	mutex_lock(&ocelot->fwd_domain_lock);
 
 	ocelot_port_rmwl(ocelot_port, mm_enable,
 			 DEV_MM_CONFIG_ENABLE_CONFIG_MM_TX_ENA |
@@ -148,7 +145,7 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       QSYS_PREEMPTION_CFG,
 		       port);
 
-	mutex_unlock(&mm->lock);
+	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
 }
@@ -166,7 +163,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 
 	mm = &ocelot->mm[port];
 
-	mutex_lock(&mm->lock);
+	mutex_lock(&ocelot->fwd_domain_lock);
 
 	val = ocelot_port_readl(ocelot_port, DEV_MM_ENABLE_CONFIG);
 	state->pmac_enabled = !!(val & DEV_MM_CONFIG_ENABLE_CONFIG_MM_RX_ENA);
@@ -185,7 +182,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 	state->verify_status = mm->verify_status;
 	state->tx_active = mm->tx_active;
 
-	mutex_unlock(&mm->lock);
+	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
 }
@@ -209,7 +206,6 @@ int ocelot_mm_init(struct ocelot *ocelot)
 		u32 val;
 
 		mm = &ocelot->mm[port];
-		mutex_init(&mm->lock);
 		ocelot_port = ocelot->ports[port];
 
 		/* Update initial status variable for the
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index eb8e3935375d..9599be6a0a39 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -744,7 +744,6 @@ struct ocelot_mirror {
 };
 
 struct ocelot_mm_state {
-	struct mutex lock;
 	enum ethtool_mm_verify_status verify_status;
 	bool tx_active;
 };
-- 
cgit v1.2.3


From 7bf4a5b071e59f48de8d39dfde07a3a65e7f6488 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:47 +0300
Subject: net: mscc: ocelot: optimize ocelot_mm_irq()

The MAC Merge IRQ of all ports is shared with the PTP TX timestamp IRQ
of all ports, which means that currently, when a PTP TX timestamp is
generated, felix_irq_handler() also polls for the MAC Merge layer status
of all ports, looking for changes. This makes the kernel do more work,
and under certain circumstances may make ptp4l require a
tx_timestamp_timeout argument higher than before.

Changes to the MAC Merge layer status are only to be expected under
certain conditions - its TX direction needs to be enabled - so we can
check early if that is the case, and omit register access otherwise.

Make ocelot_mm_update_port_status() skip register access if
mm->tx_enabled is unset, and also call it once more, outside IRQ
context, from ocelot_port_set_mm(), when mm->tx_enabled transitions from
true to false, because an IRQ is also expected in that case.

Also, a port may have its MAC Merge layer enabled but it may not have
generated the interrupt. In that case, there's no point in writing to
DEV_MM_STATUS to acknowledge that IRQ. We can reduce the number of
register writes per port with MM enabled by keeping an "ack" variable
which writes the "write-one-to-clear" bits. Those are 3 in number:
PRMPT_ACTIVE_STICKY, UNEXP_RX_PFRM_STICKY and UNEXP_TX_PFRM_STICKY.
The other fields in DEV_MM_STATUS are read-only and it doesn't matter
what is written to them, so writing zero is just fine.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_mm.c | 30 ++++++++++++++++++++++++++++--
 include/soc/mscc/ocelot.h             |  1 +
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index d2df47e6f8f6..ce6429d46814 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -54,7 +54,10 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct ocelot_mm_state *mm = &ocelot->mm[port];
 	enum ethtool_mm_verify_status verify_status;
-	u32 val;
+	u32 val, ack = 0;
+
+	if (!mm->tx_enabled)
+		return;
 
 	val = ocelot_port_readl(ocelot_port, DEV_MM_STATUS);
 
@@ -71,21 +74,28 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 
 		dev_dbg(ocelot->dev, "Port %d TX preemption %s\n",
 			port, mm->tx_active ? "active" : "inactive");
+
+		ack |= DEV_MM_STAT_MM_STATUS_PRMPT_ACTIVE_STICKY;
 	}
 
 	if (val & DEV_MM_STAT_MM_STATUS_UNEXP_RX_PFRM_STICKY) {
 		dev_err(ocelot->dev,
 			"Unexpected P-frame received on port %d while verification was unsuccessful or not yet verified\n",
 			port);
+
+		ack |= DEV_MM_STAT_MM_STATUS_UNEXP_RX_PFRM_STICKY;
 	}
 
 	if (val & DEV_MM_STAT_MM_STATUS_UNEXP_TX_PFRM_STICKY) {
 		dev_err(ocelot->dev,
 			"Unexpected P-frame requested to be transmitted on port %d while verification was unsuccessful or not yet verified, or MM_TX_ENA=0\n",
 			port);
+
+		ack |= DEV_MM_STAT_MM_STATUS_UNEXP_TX_PFRM_STICKY;
 	}
 
-	ocelot_port_writel(ocelot_port, val, DEV_MM_STATUS);
+	if (ack)
+		ocelot_port_writel(ocelot_port, ack, DEV_MM_STATUS);
 }
 
 void ocelot_mm_irq(struct ocelot *ocelot)
@@ -107,11 +117,14 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	u32 mm_enable = 0, verify_disable = 0, add_frag_size;
+	struct ocelot_mm_state *mm;
 	int err;
 
 	if (!ocelot->mm_supported)
 		return -EOPNOTSUPP;
 
+	mm = &ocelot->mm[port];
+
 	err = ethtool_mm_frag_size_min_to_add(cfg->tx_min_frag_size,
 					      &add_frag_size, extack);
 	if (err)
@@ -145,6 +158,19 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       QSYS_PREEMPTION_CFG,
 		       port);
 
+	/* The switch will emit an IRQ when TX is disabled, to notify that it
+	 * has become inactive. We optimize ocelot_mm_update_port_status() to
+	 * not bother processing MM IRQs at all for ports with TX disabled,
+	 * but we need to ACK this IRQ now, while mm->tx_enabled is still set,
+	 * otherwise we get an IRQ storm.
+	 */
+	if (mm->tx_enabled && !cfg->tx_enabled) {
+		ocelot_mm_update_port_status(ocelot, port);
+		WARN_ON(mm->tx_active);
+	}
+
+	mm->tx_enabled = cfg->tx_enabled;
+
 	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 9599be6a0a39..ee8d43dc5c06 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -745,6 +745,7 @@ struct ocelot_mirror {
 
 struct ocelot_mm_state {
 	enum ethtool_mm_verify_status verify_status;
+	bool tx_enabled;
 	bool tx_active;
 };
 
-- 
cgit v1.2.3


From aac80140dc31963d818a65a522c2e2da81979857 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:49 +0300
Subject: net: mscc: ocelot: add support for mqprio offload

This doesn't apply anything to hardware and in general doesn't do
anything that the software variant doesn't do, except for checking that
there isn't more than 1 TXQ per TC (TXQs for a DSA switch are a dubious
concept anyway). The reason we add this is to be able to parse one more
field added to struct tc_mqprio_qopt_offload, namely preemptible_tcs.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  9 ++++++
 drivers/net/ethernet/mscc/ocelot.c     | 50 ++++++++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h              |  4 +++
 3 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 478893c06f56..66ec2740e3cb 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1612,6 +1612,13 @@ static int vsc9959_qos_port_cbs_set(struct dsa_switch *ds, int port,
 static int vsc9959_qos_query_caps(struct tc_query_caps_base *base)
 {
 	switch (base->type) {
+	case TC_SETUP_QDISC_MQPRIO: {
+		struct tc_mqprio_caps *caps = base->caps;
+
+		caps->validate_queue_counts = true;
+
+		return 0;
+	}
 	case TC_SETUP_QDISC_TAPRIO: {
 		struct tc_taprio_caps *caps = base->caps;
 
@@ -1635,6 +1642,8 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port,
 		return vsc9959_qos_query_caps(type_data);
 	case TC_SETUP_QDISC_TAPRIO:
 		return vsc9959_qos_port_tas_set(ocelot, port, type_data);
+	case TC_SETUP_QDISC_MQPRIO:
+		return ocelot_port_mqprio(ocelot, port, type_data);
 	case TC_SETUP_QDISC_CBS:
 		return vsc9959_qos_port_cbs_set(ds, port, type_data);
 	default:
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 1502bb2c8ea7..8dc5fb1bc61b 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -8,6 +8,7 @@
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
 #include <linux/phy/phy.h>
+#include <net/pkt_sched.h>
 #include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
@@ -2699,6 +2700,55 @@ void ocelot_port_mirror_del(struct ocelot *ocelot, int from, bool ingress)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_mirror_del);
 
+static void ocelot_port_reset_mqprio(struct ocelot *ocelot, int port)
+{
+	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
+
+	netdev_reset_tc(dev);
+}
+
+int ocelot_port_mqprio(struct ocelot *ocelot, int port,
+		       struct tc_mqprio_qopt_offload *mqprio)
+{
+	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
+	struct netlink_ext_ack *extack = mqprio->extack;
+	struct tc_mqprio_qopt *qopt = &mqprio->qopt;
+	int num_tc = qopt->num_tc;
+	int tc, err;
+
+	if (!num_tc) {
+		ocelot_port_reset_mqprio(ocelot, port);
+		return 0;
+	}
+
+	err = netdev_set_num_tc(dev, num_tc);
+	if (err)
+		return err;
+
+	for (tc = 0; tc < num_tc; tc++) {
+		if (qopt->count[tc] != 1) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Only one TXQ per TC supported");
+			return -EINVAL;
+		}
+
+		err = netdev_set_tc_queue(dev, tc, 1, qopt->offset[tc]);
+		if (err)
+			goto err_reset_tc;
+	}
+
+	err = netif_set_real_num_tx_queues(dev, num_tc);
+	if (err)
+		goto err_reset_tc;
+
+	return 0;
+
+err_reset_tc:
+	ocelot_port_reset_mqprio(ocelot, port);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ocelot_port_mqprio);
+
 void ocelot_init_port(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index ee8d43dc5c06..9596c79e9223 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -11,6 +11,8 @@
 #include <linux/regmap.h>
 #include <net/dsa.h>
 
+struct tc_mqprio_qopt_offload;
+
 /* Port Group IDs (PGID) are masks of destination ports.
  *
  * For L2 forwarding, the switch performs 3 lookups in the PGID table for each
@@ -1154,6 +1156,8 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct netlink_ext_ack *extack);
 int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_state *state);
+int ocelot_port_mqprio(struct ocelot *ocelot, int port,
+		       struct tc_mqprio_qopt_offload *mqprio);
 
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
 int ocelot_mrp_add(struct ocelot *ocelot, int port,
-- 
cgit v1.2.3


From 403ffc2c34de5297d007e0e169bf022094d444c2 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:51 +0300
Subject: net: mscc: ocelot: add support for preemptible traffic classes

In order to not transmit (preemptible) frames which will be received by
the link partner as corrupted (because it doesn't support FP), the
hardware requires the driver to program the QSYS_PREEMPTION_CFG_P_QUEUES
register only after the MAC Merge layer becomes active (verification
succeeds, or was disabled).

There are some cases when FP is known (through experimentation) to be
broken. Give priority to FP over cut-through switching, and disable FP
for known broken link modes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  7 +++--
 drivers/net/ethernet/mscc/ocelot.c     | 10 ++++++-
 drivers/net/ethernet/mscc/ocelot.h     |  3 ++
 drivers/net/ethernet/mscc/ocelot_mm.c  | 54 ++++++++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h              |  3 ++
 5 files changed, 74 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index e055b3980ccc..cfb3faeaa5bf 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -2519,6 +2519,7 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot)
 
 	for (port = 0; port < ocelot->num_phys_ports; port++) {
 		struct ocelot_port *ocelot_port = ocelot->ports[port];
+		struct ocelot_mm_state *mm = &ocelot->mm[port];
 		int min_speed = ocelot_port->speed;
 		unsigned long mask = 0;
 		u32 tmp, val = 0;
@@ -2559,10 +2560,12 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot)
 
 		/* Enable cut-through forwarding for all traffic classes that
 		 * don't have oversized dropping enabled, since this check is
-		 * bypassed in cut-through mode.
+		 * bypassed in cut-through mode. Also exclude preemptible
+		 * traffic classes, since these would hang the port for some
+		 * reason, if sent as cut-through.
 		 */
 		if (ocelot_port->speed == min_speed) {
-			val = GENMASK(7, 0);
+			val = GENMASK(7, 0) & ~mm->active_preemptible_tcs;
 
 			for (tc = 0; tc < OCELOT_NUM_TC; tc++)
 				if (vsc9959_port_qmaxsdu_get(ocelot, port, tc))
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8dc5fb1bc61b..1f5f00b30441 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1006,7 +1006,12 @@ void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port,
 	 */
 	if (ocelot->ops->cut_through_fwd) {
 		mutex_lock(&ocelot->fwd_domain_lock);
-		ocelot->ops->cut_through_fwd(ocelot);
+		/* Workaround for hardware bug - FP doesn't work
+		 * at all link speeds for all PHY modes. The function
+		 * below also calls ocelot->ops->cut_through_fwd(),
+		 * so we don't need to do it twice.
+		 */
+		ocelot_port_update_active_preemptible_tcs(ocelot, port);
 		mutex_unlock(&ocelot->fwd_domain_lock);
 	}
 
@@ -2705,6 +2710,7 @@ static void ocelot_port_reset_mqprio(struct ocelot *ocelot, int port)
 	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
 
 	netdev_reset_tc(dev);
+	ocelot_port_change_fp(ocelot, port, 0);
 }
 
 int ocelot_port_mqprio(struct ocelot *ocelot, int port,
@@ -2741,6 +2747,8 @@ int ocelot_port_mqprio(struct ocelot *ocelot, int port,
 	if (err)
 		goto err_reset_tc;
 
+	ocelot_port_change_fp(ocelot, port, mqprio->preemptible_tcs);
+
 	return 0;
 
 err_reset_tc:
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index d920ca930690..14440a3b04c3 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -119,6 +119,9 @@ int ocelot_stats_init(struct ocelot *ocelot);
 void ocelot_stats_deinit(struct ocelot *ocelot);
 
 int ocelot_mm_init(struct ocelot *ocelot);
+void ocelot_port_change_fp(struct ocelot *ocelot, int port,
+			   unsigned long preemptible_tcs);
+void ocelot_port_update_active_preemptible_tcs(struct ocelot *ocelot, int port);
 
 extern struct notifier_block ocelot_netdevice_nb;
 extern struct notifier_block ocelot_switchdev_nb;
diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index 3e458f72f645..fb3145118d68 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -49,6 +49,59 @@ static enum ethtool_mm_verify_status ocelot_mm_verify_status(u32 val)
 	}
 }
 
+void ocelot_port_update_active_preemptible_tcs(struct ocelot *ocelot, int port)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_mm_state *mm = &ocelot->mm[port];
+	u32 val = 0;
+
+	lockdep_assert_held(&ocelot->fwd_domain_lock);
+
+	/* Only commit preemptible TCs when MAC Merge is active.
+	 * On NXP LS1028A, when using QSGMII, the port hangs if transmitting
+	 * preemptible frames at any other link speed than gigabit, so avoid
+	 * preemption at lower speeds in this PHY mode.
+	 */
+	if ((ocelot_port->phy_mode != PHY_INTERFACE_MODE_QSGMII ||
+	     ocelot_port->speed == SPEED_1000) && mm->tx_active)
+		val = mm->preemptible_tcs;
+
+	/* Cut through switching doesn't work for preemptible priorities,
+	 * so first make sure it is disabled.
+	 */
+	mm->active_preemptible_tcs = val;
+	ocelot->ops->cut_through_fwd(ocelot);
+
+	dev_dbg(ocelot->dev,
+		"port %d %s/%s, MM TX %s, preemptible TCs 0x%x, active 0x%x\n",
+		port, phy_modes(ocelot_port->phy_mode),
+		phy_speed_to_str(ocelot_port->speed),
+		mm->tx_active ? "active" : "inactive", mm->preemptible_tcs,
+		mm->active_preemptible_tcs);
+
+	ocelot_rmw_rix(ocelot, QSYS_PREEMPTION_CFG_P_QUEUES(val),
+		       QSYS_PREEMPTION_CFG_P_QUEUES_M,
+		       QSYS_PREEMPTION_CFG, port);
+}
+
+void ocelot_port_change_fp(struct ocelot *ocelot, int port,
+			   unsigned long preemptible_tcs)
+{
+	struct ocelot_mm_state *mm = &ocelot->mm[port];
+
+	mutex_lock(&ocelot->fwd_domain_lock);
+
+	if (mm->preemptible_tcs == preemptible_tcs)
+		goto out_unlock;
+
+	mm->preemptible_tcs = preemptible_tcs;
+
+	ocelot_port_update_active_preemptible_tcs(ocelot, port);
+
+out_unlock:
+	mutex_unlock(&ocelot->fwd_domain_lock);
+}
+
 static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
@@ -74,6 +127,7 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 
 		dev_dbg(ocelot->dev, "Port %d TX preemption %s\n",
 			port, mm->tx_active ? "active" : "inactive");
+		ocelot_port_update_active_preemptible_tcs(ocelot, port);
 
 		ack |= DEV_MM_STAT_MM_STATUS_PRMPT_ACTIVE_STICKY;
 	}
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 9596c79e9223..cb8fbb241879 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -749,6 +749,8 @@ struct ocelot_mm_state {
 	enum ethtool_mm_verify_status verify_status;
 	bool tx_enabled;
 	bool tx_active;
+	u8 preemptible_tcs;
+	u8 active_preemptible_tcs;
 };
 
 struct ocelot_port;
@@ -1158,6 +1160,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_state *state);
 int ocelot_port_mqprio(struct ocelot *ocelot, int port,
 		       struct tc_mqprio_qopt_offload *mqprio);
+void ocelot_port_update_preemptible_tcs(struct ocelot *ocelot, int port);
 
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
 int ocelot_mrp_add(struct ocelot *ocelot, int port,
-- 
cgit v1.2.3


From 3e358ea8614ddfbc59ca7a3f5dff5dde2b350b2c Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Thu, 13 Apr 2023 12:23:09 +0300
Subject: RDMA/mlx5: Fix flow counter query via DEVX

Commit cited in "fixes" tag added bulk support for flow counters but it
didn't account that's also possible to query a counter using a non-base id
if the counter was allocated as bulk.

When a user performs a query, validate the flow counter id given in the
mailbox is inside the valid range taking bulk value into account.

Fixes: 208d70f562e5 ("IB/mlx5: Support flow counters offset for bulk counters")
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Link: https://lore.kernel.org/r/79d7fbe291690128e44672418934256254d93115.1681377114.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/devx.c | 31 ++++++++++++++++++++++++++-----
 include/linux/mlx5/mlx5_ifc.h     |  3 ++-
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 07037b829c7e..db5fb196c728 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -666,7 +666,21 @@ static bool devx_is_valid_obj_id(struct uverbs_attr_bundle *attrs,
 				      obj_id;
 
 	case MLX5_IB_OBJECT_DEVX_OBJ:
-		return ((struct devx_obj *)uobj->object)->obj_id == obj_id;
+	{
+		u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
+		struct devx_obj *devx_uobj = uobj->object;
+
+		if (opcode == MLX5_CMD_OP_QUERY_FLOW_COUNTER &&
+		    devx_uobj->flow_counter_bulk_size) {
+			u64 end;
+
+			end = devx_uobj->obj_id +
+				devx_uobj->flow_counter_bulk_size;
+			return devx_uobj->obj_id <= obj_id && end > obj_id;
+		}
+
+		return devx_uobj->obj_id == obj_id;
+	}
 
 	default:
 		return false;
@@ -1517,10 +1531,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 		goto obj_free;
 
 	if (opcode == MLX5_CMD_OP_ALLOC_FLOW_COUNTER) {
-		u8 bulk = MLX5_GET(alloc_flow_counter_in,
-				   cmd_in,
-				   flow_counter_bulk);
-		obj->flow_counter_bulk_size = 128UL * bulk;
+		u32 bulk = MLX5_GET(alloc_flow_counter_in,
+				    cmd_in,
+				    flow_counter_bulk_log_size);
+
+		if (bulk)
+			bulk = 1 << bulk;
+		else
+			bulk = 128UL * MLX5_GET(alloc_flow_counter_in,
+						cmd_in,
+						flow_counter_bulk);
+		obj->flow_counter_bulk_size = bulk;
 	}
 
 	uobj->object = obj;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b54339a1b1c6..3976e6266bcc 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9283,7 +9283,8 @@ struct mlx5_ifc_alloc_flow_counter_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x38];
+	u8         reserved_at_40[0x33];
+	u8         flow_counter_bulk_log_size[0x5];
 	u8         flow_counter_bulk[0x8];
 };
 
-- 
cgit v1.2.3


From d46fc894147cf98dd6e8210aa99ed46854191840 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 17 Apr 2023 12:14:29 +0200
Subject: netfilter: nf_tables: validate catch-all set elements

catch-all set element might jump/goto to chain that uses expressions
that require validation.

Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +++
 net/netfilter/nf_tables_api.c     | 64 +++++++++++++++++++++++++++++++++++----
 net/netfilter/nft_lookup.c        | 36 +++-------------------
 3 files changed, 66 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9430128aae99..1b8e305bb54a 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1085,6 +1085,10 @@ struct nft_chain {
 };
 
 int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+			 const struct nft_set_iter *iter,
+			 struct nft_set_elem *elem);
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cd52109e674a..98043e83af71 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 	return 0;
 }
 
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+			 const struct nft_set_iter *iter,
+			 struct nft_set_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+	const struct nft_data *data;
+	int err;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	data = nft_set_ext_data(ext);
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		pctx->level++;
+		err = nft_chain_validate(ctx, data->verdict.chain);
+		if (err < 0)
+			return err;
+		pctx->level--;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+struct nft_set_elem_catchall {
+	struct list_head	list;
+	struct rcu_head		rcu;
+	void			*elem;
+};
+
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask))
+			continue;
+
+		elem.priv = catchall->elem;
+		ret = nft_setelem_validate(ctx, set, NULL, &elem);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
 static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
 					     const struct nft_chain *chain,
 					     const struct nlattr *nla);
@@ -4759,12 +4817,6 @@ err_set_name:
 	return err;
 }
 
-struct nft_set_elem_catchall {
-	struct list_head	list;
-	struct rcu_head		rcu;
-	void			*elem;
-};
-
 static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
 				     struct nft_set *set)
 {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index cae5a6724163..cecf8ab90e58 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -199,37 +199,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
-				       struct nft_set *set,
-				       const struct nft_set_iter *iter,
-				       struct nft_set_elem *elem)
-{
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
-	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
-	const struct nft_data *data;
-	int err;
-
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
-	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
-		return 0;
-
-	data = nft_set_ext_data(ext);
-	switch (data->verdict.code) {
-	case NFT_JUMP:
-	case NFT_GOTO:
-		pctx->level++;
-		err = nft_chain_validate(ctx, data->verdict.chain);
-		if (err < 0)
-			return err;
-		pctx->level--;
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
 static int nft_lookup_validate(const struct nft_ctx *ctx,
 			       const struct nft_expr *expr,
 			       const struct nft_data **d)
@@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx,
 	iter.skip	= 0;
 	iter.count	= 0;
 	iter.err	= 0;
-	iter.fn		= nft_lookup_validate_setelem;
+	iter.fn		= nft_setelem_validate;
 
 	priv->set->ops->walk(ctx, priv->set, &iter);
+	if (!iter.err)
+		iter.err = nft_set_catchall_validate(ctx, priv->set);
+
 	if (iter.err < 0)
 		return iter.err;
 
-- 
cgit v1.2.3


From cb18e5595df7f4e01d0dd4a0f9c4e71b68ae351e Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 17 Apr 2023 11:36:15 +0200
Subject: net: add macro netif_subqueue_completed_wake

Add netif_subqueue_completed_wake, complementing the subqueue versions
netif_subqueue_try_stop and netif_subqueue_maybe_stop.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netdev_queues.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index b26fdb441e39..d68b0a483431 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -160,4 +160,14 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
 		netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
 	})
 
+#define netif_subqueue_completed_wake(dev, idx, pkts, bytes,		\
+				      get_desc, start_thrs)		\
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_completed_wake(txq, pkts, bytes,		\
+					 get_desc, start_thrs);		\
+	})
+
 #endif
-- 
cgit v1.2.3


From 9ae708f00161e1d789268fa9cc05bf6bec2af474 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 17 Apr 2023 15:37:51 +0200
Subject: wifi: mac80211: remove ieee80211_tx_status_8023

It is unused and should not be used. In order to avoid limitations in
4-address mode, the driver should always use ieee80211_tx_status_ext for
802.3 frames with a valid sta pointer.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20230417133751.79160-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 20 --------------------
 net/mac80211/status.c  | 24 ------------------------
 2 files changed, 44 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a8dadbd83d95..ac0370e76874 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5215,26 +5215,6 @@ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw,
 void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw,
 				 struct sk_buff *skb);
 
-/**
- * ieee80211_tx_status_8023 - transmit status callback for 802.3 frame format
- *
- * Call this function for all transmitted data frames after their transmit
- * completion. This callback should only be called for data frames which
- * are using driver's (or hardware's) offload capability of encap/decap
- * 802.11 frames.
- *
- * This function may not be called in IRQ context. Calls to this function
- * for a single hardware must be synchronized against each other and all
- * calls in the same tx status family.
- *
- * @hw: the hardware the frame was transmitted by
- * @vif: the interface for which the frame was transmitted
- * @skb: the frame that was transmitted, owned by mac80211 after this call
- */
-void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
-			       struct ieee80211_vif *vif,
-			       struct sk_buff *skb);
-
 /**
  * ieee80211_report_low_ack - report non-responding station
  *
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 3f9ddd7f04b6..2b13a52ce96c 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -1244,30 +1244,6 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL(ieee80211_tx_rate_update);
 
-void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
-			      struct ieee80211_vif *vif,
-			      struct sk_buff *skb)
-{
-	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_tx_status status = {
-		.skb = skb,
-		.info = IEEE80211_SKB_CB(skb),
-	};
-	struct sta_info *sta;
-
-	sdata = vif_to_sdata(vif);
-
-	rcu_read_lock();
-
-	if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta))
-		status.sta = &sta->sta;
-
-	ieee80211_tx_status_ext(hw, &status);
-
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(ieee80211_tx_status_8023);
-
 void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-- 
cgit v1.2.3


From 5ab28c78a125a724684958f4caf8210127d3f528 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 18 Apr 2023 15:43:07 +0100
Subject: ASoC: cs35l56: Remove SDW1 TX5 and TX6

Reduce SDW1 to 4 channels and remove the controls for SDW1
TX5 and TX6.

The TX5 and TX6 channels have been removed from B0 silicon.
There is no need to support them on A1 silicon.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230418144309.1100721-3-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  3 ---
 sound/soc/codecs/cs35l56-shared.c | 27 ---------------------------
 sound/soc/codecs/cs35l56.c        | 28 +---------------------------
 3 files changed, 1 insertion(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index b3300bce74f4..dd7503464651 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -43,8 +43,6 @@
 #define CS35L56_SWIRE_DP3_CH2_INPUT			0x0004C74
 #define CS35L56_SWIRE_DP3_CH3_INPUT			0x0004C78
 #define CS35L56_SWIRE_DP3_CH4_INPUT			0x0004C7C
-#define CS35L56_SWIRE_DP3_CH5_INPUT			0x0004C80
-#define CS35L56_SWIRE_DP3_CH6_INPUT			0x0004C84
 #define CS35L56_IRQ1_CFG				0x000E000
 #define CS35L56_IRQ1_STATUS				0x000E004
 #define CS35L56_IRQ1_EINT_1				0x000E010
@@ -262,7 +260,6 @@ extern const struct cs_dsp_region cs35l56_dsp1_regions[CS35L56_NUM_DSP_REGIONS];
 extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC];
 extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC];
 
-void cs35l56_patch(struct device *dev, struct regmap *regmap, u8 revid);
 void cs35l56_reread_firmware_registers(struct device *dev, struct regmap *regmap);
 int cs35l56_get_bclk_freq_id(unsigned int freq);
 void cs35l56_fill_supply_names(struct regulator_bulk_data *data);
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index d8bc06ad4888..f5fa6bb04d38 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -28,8 +28,6 @@ static const struct reg_default cs35l56_reg_defaults[] = {
 	{ CS35L56_SWIRE_DP3_CH2_INPUT,		0x00000019 },
 	{ CS35L56_SWIRE_DP3_CH3_INPUT,		0x00000029 },
 	{ CS35L56_SWIRE_DP3_CH4_INPUT,		0x00000028 },
-	{ CS35L56_SWIRE_DP3_CH5_INPUT,		0x00000018 },
-	{ CS35L56_SWIRE_DP3_CH6_INPUT,		0x00000018 },
 	{ CS35L56_IRQ1_CFG,			0x00000000 },
 	{ CS35L56_IRQ1_MASK_1,			0x83ffffff },
 	{ CS35L56_IRQ1_MASK_2,			0xffff7fff },
@@ -42,29 +40,6 @@ static const struct reg_default cs35l56_reg_defaults[] = {
 	/* CS35L56_MAIN_POSTURE_NUMBER - soft register, no default	*/
 };
 
-/*
- * The Ax devices have different default register values to that of B0,
- * establish a common set of register defaults.
- */
-static const struct reg_sequence cs35l56_reva_patch[] = {
-	{ CS35L56_SWIRE_DP3_CH5_INPUT,	0x00000018 },
-	{ CS35L56_SWIRE_DP3_CH6_INPUT,	0x00000018 },
-};
-
-void cs35l56_patch(struct device *dev, struct regmap *regmap, u8 revid)
-{
-	int ret;
-
-	if (revid >= CS35L56_REVID_B0)
-		return;
-
-	ret = regmap_register_patch(regmap, cs35l56_reva_patch,
-				    ARRAY_SIZE(cs35l56_reva_patch));
-	if (ret)
-		dev_err(dev, "Failed to apply patch: %d\n", ret);
-}
-EXPORT_SYMBOL_NS_GPL(cs35l56_patch, SND_SOC_CS35L56_SHARED);
-
 static bool cs35l56_is_dsp_memory(unsigned int reg)
 {
 	switch (reg) {
@@ -114,8 +89,6 @@ static bool cs35l56_readable_reg(struct device *dev, unsigned int reg)
 	case CS35L56_SWIRE_DP3_CH2_INPUT:
 	case CS35L56_SWIRE_DP3_CH3_INPUT:
 	case CS35L56_SWIRE_DP3_CH4_INPUT:
-	case CS35L56_SWIRE_DP3_CH5_INPUT:
-	case CS35L56_SWIRE_DP3_CH6_INPUT:
 	case CS35L56_IRQ1_CFG:
 	case CS35L56_IRQ1_STATUS:
 	case CS35L56_IRQ1_EINT_1 ... CS35L56_IRQ1_EINT_8:
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index 1b80cae5026e..c0a857cfb8cb 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -166,24 +166,6 @@ static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx4_enum,
 static const struct snd_kcontrol_new sdw1_tx4_mux =
 	SOC_DAPM_ENUM("SDW1TX4 SRC", cs35l56_sdw1tx4_enum);
 
-static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx5_enum,
-				CS35L56_SWIRE_DP3_CH5_INPUT,
-				0, CS35L56_SWIRETXn_SRC_MASK,
-				cs35l56_tx_input_texts,
-				cs35l56_tx_input_values);
-
-static const struct snd_kcontrol_new sdw1_tx5_mux =
-	SOC_DAPM_ENUM("SDW1TX5 SRC", cs35l56_sdw1tx5_enum);
-
-static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_sdw1tx6_enum,
-				CS35L56_SWIRE_DP3_CH6_INPUT,
-				0, CS35L56_SWIRETXn_SRC_MASK,
-				cs35l56_tx_input_texts,
-				cs35l56_tx_input_values);
-
-static const struct snd_kcontrol_new sdw1_tx6_mux =
-	SOC_DAPM_ENUM("SDW1TX6 SRC", cs35l56_sdw1tx6_enum);
-
 static int cs35l56_play_event(struct snd_soc_dapm_widget *w,
 			      struct snd_kcontrol *kcontrol, int event)
 {
@@ -251,8 +233,6 @@ static const struct snd_soc_dapm_widget cs35l56_dapm_widgets[] = {
 	SND_SOC_DAPM_MUX("SDW1 TX2 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx2_mux),
 	SND_SOC_DAPM_MUX("SDW1 TX3 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx3_mux),
 	SND_SOC_DAPM_MUX("SDW1 TX4 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx4_mux),
-	SND_SOC_DAPM_MUX("SDW1 TX5 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx5_mux),
-	SND_SOC_DAPM_MUX("SDW1 TX6 Source", SND_SOC_NOPM, 0, 0, &sdw1_tx6_mux),
 
 	SND_SOC_DAPM_SIGGEN("VMON ADC"),
 	SND_SOC_DAPM_SIGGEN("IMON ADC"),
@@ -318,14 +298,10 @@ static const struct snd_soc_dapm_route cs35l56_audio_map[] = {
 	CS35L56_SRC_ROUTE("SDW1 TX2")
 	CS35L56_SRC_ROUTE("SDW1 TX3")
 	CS35L56_SRC_ROUTE("SDW1 TX4")
-	CS35L56_SRC_ROUTE("SDW1 TX5")
-	CS35L56_SRC_ROUTE("SDW1 TX6")
 	{ "SDW1 Capture", NULL, "SDW1 TX1 Source" },
 	{ "SDW1 Capture", NULL, "SDW1 TX2 Source" },
 	{ "SDW1 Capture", NULL, "SDW1 TX3 Source" },
 	{ "SDW1 Capture", NULL, "SDW1 TX4 Source" },
-	{ "SDW1 Capture", NULL, "SDW1 TX5 Source" },
-	{ "SDW1 Capture", NULL, "SDW1 TX6 Source" },
 };
 
 static int cs35l56_dsp_event(struct snd_soc_dapm_widget *w,
@@ -779,7 +755,7 @@ static struct snd_soc_dai_driver cs35l56_dai[] = {
 		.capture = {
 			.stream_name = "SDW1 Capture",
 			.channels_min = 1,
-			.channels_max = 6,
+			.channels_max = 4,
 			.rates = CS35L56_RATES,
 			.formats = CS35L56_TX_FORMATS,
 		},
@@ -1535,8 +1511,6 @@ int cs35l56_init(struct cs35l56_private *cs35l56)
 	dev_info(cs35l56->dev, "Cirrus Logic CS35L56%s Rev %02X OTP%d\n",
 		 cs35l56->secured ? "s" : "", cs35l56->rev, otpid);
 
-	cs35l56_patch(cs35l56->dev, cs35l56->regmap, cs35l56->rev);
-
 	/* Wake source and *_BLOCKED interrupts default to unmasked, so mask them */
 	regmap_write(cs35l56->regmap, CS35L56_IRQ1_MASK_20, 0xffffffff);
 	regmap_update_bits(cs35l56->regmap, CS35L56_IRQ1_MASK_1,
-- 
cgit v1.2.3


From d3a4efb334e5f6cbb3f2741fa07a873a2a78b016 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 18 Apr 2023 15:43:08 +0100
Subject: ASoC: cs35l56: Remove SDW2RX1 mixer source

The mixer source index value for SDW2RX1 is different between
A1 and B0 silicon. As the driver doesn't provide a DAI for SDW2
just remove it as a mixer source option.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230418144309.1100721-4-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           | 3 +--
 sound/soc/codecs/cs35l56-shared.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index dd7503464651..0b2f7cfc6a4a 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -181,10 +181,9 @@
 #define CS35L56_INPUT_SRC_INTERPOLATOR			0x40
 #define CS35L56_INPUT_SRC_SWIRE_RX1			0x44
 #define CS35L56_INPUT_SRC_SWIRE_RX2			0x45
-#define CS35L56_INPUT_SRC_SWIRE_RX3			0x46
 #define CS35L56_INPUT_MASK				0x7F
 
-#define CS35L56_NUM_INPUT_SRC				22
+#define CS35L56_NUM_INPUT_SRC				21
 
 /* ASP formats */
 #define CS35L56_ASP_FMT_DSP_A				0
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index f5fa6bb04d38..93a1b056660b 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -278,7 +278,7 @@ const char * const cs35l56_tx_input_texts[] = {
 	"None", "ASP1RX1", "ASP1RX2", "VMON", "IMON", "ERRVOL", "CLASSH",
 	"VDDBMON", "VBSTMON", "DSP1TX1", "DSP1TX2", "DSP1TX3", "DSP1TX4",
 	"DSP1TX5", "DSP1TX6", "DSP1TX7", "DSP1TX8", "TEMPMON",
-	"INTERPOLATOR", "SDW1RX1", "SDW1RX2", "SDW2RX1",
+	"INTERPOLATOR", "SDW1RX1", "SDW1RX2",
 };
 EXPORT_SYMBOL_NS_GPL(cs35l56_tx_input_texts, SND_SOC_CS35L56_SHARED);
 
@@ -304,7 +304,6 @@ const unsigned int cs35l56_tx_input_values[] = {
 	CS35L56_INPUT_SRC_INTERPOLATOR,
 	CS35L56_INPUT_SRC_SWIRE_RX1,
 	CS35L56_INPUT_SRC_SWIRE_RX2,
-	CS35L56_INPUT_SRC_SWIRE_RX3,
 };
 EXPORT_SYMBOL_NS_GPL(cs35l56_tx_input_values, SND_SOC_CS35L56_SHARED);
 
-- 
cgit v1.2.3


From d29a966b72fb370128096393961f2c456ff24e3d Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 18 Apr 2023 15:43:09 +0100
Subject: ASoC: cs35l56: Rename mixer source defines for SoundWire DP1

Rename the mixer source defines from CS35L56_INPUT_SRC_SWIRE_RXn
to CS35L56_INPUT_SRC_SWIRE_DP1_CHANNELn to match the latest
datasheet.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20230418144309.1100721-5-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           | 4 ++--
 sound/soc/codecs/cs35l56-shared.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 0b2f7cfc6a4a..002042b1c73c 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -179,8 +179,8 @@
 #define CS35L56_INPUT_SRC_DSP1TX8			0x39
 #define CS35L56_INPUT_SRC_TEMPMON			0x3A
 #define CS35L56_INPUT_SRC_INTERPOLATOR			0x40
-#define CS35L56_INPUT_SRC_SWIRE_RX1			0x44
-#define CS35L56_INPUT_SRC_SWIRE_RX2			0x45
+#define CS35L56_INPUT_SRC_SWIRE_DP1_CHANNEL1		0x44
+#define CS35L56_INPUT_SRC_SWIRE_DP1_CHANNEL2		0x45
 #define CS35L56_INPUT_MASK				0x7F
 
 #define CS35L56_NUM_INPUT_SRC				21
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 93a1b056660b..60da8c75b7b9 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -302,8 +302,8 @@ const unsigned int cs35l56_tx_input_values[] = {
 	CS35L56_INPUT_SRC_DSP1TX8,
 	CS35L56_INPUT_SRC_TEMPMON,
 	CS35L56_INPUT_SRC_INTERPOLATOR,
-	CS35L56_INPUT_SRC_SWIRE_RX1,
-	CS35L56_INPUT_SRC_SWIRE_RX2,
+	CS35L56_INPUT_SRC_SWIRE_DP1_CHANNEL1,
+	CS35L56_INPUT_SRC_SWIRE_DP1_CHANNEL2,
 };
 EXPORT_SYMBOL_NS_GPL(cs35l56_tx_input_values, SND_SOC_CS35L56_SHARED);
 
-- 
cgit v1.2.3


From 85a953806557dbf25d16e8c132b5b9b100d16496 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Mon, 10 Apr 2023 09:16:52 -0700
Subject: mailbox: Allow direct registration to a channel

Support virtual mailbox controllers and clients which are not platform
devices or come from the devicetree by allowing them to match client to
channel via some other mechanism.

Tested-by: Sudeep Holla <sudeep.holla@arm.com> (pcc)
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mailbox.c      | 96 ++++++++++++++++++++++++++++++------------
 include/linux/mailbox_client.h |  1 +
 2 files changed, 69 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index 4229b9b5da98..adf36c05fa43 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -317,6 +317,71 @@ int mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 }
 EXPORT_SYMBOL_GPL(mbox_flush);
 
+static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl)
+{
+	struct device *dev = cl->dev;
+	unsigned long flags;
+	int ret;
+
+	if (chan->cl || !try_module_get(chan->mbox->dev->driver->owner)) {
+		dev_dbg(dev, "%s: mailbox not free\n", __func__);
+		return -EBUSY;
+	}
+
+	spin_lock_irqsave(&chan->lock, flags);
+	chan->msg_free = 0;
+	chan->msg_count = 0;
+	chan->active_req = NULL;
+	chan->cl = cl;
+	init_completion(&chan->tx_complete);
+
+	if (chan->txdone_method	== TXDONE_BY_POLL && cl->knows_txdone)
+		chan->txdone_method = TXDONE_BY_ACK;
+
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	if (chan->mbox->ops->startup) {
+		ret = chan->mbox->ops->startup(chan);
+
+		if (ret) {
+			dev_err(dev, "Unable to startup the chan (%d)\n", ret);
+			mbox_free_channel(chan);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * mbox_bind_client - Request a mailbox channel.
+ * @chan: The mailbox channel to bind the client to.
+ * @cl: Identity of the client requesting the channel.
+ *
+ * The Client specifies its requirements and capabilities while asking for
+ * a mailbox channel. It can't be called from atomic context.
+ * The channel is exclusively allocated and can't be used by another
+ * client before the owner calls mbox_free_channel.
+ * After assignment, any packet received on this channel will be
+ * handed over to the client via the 'rx_callback'.
+ * The framework holds reference to the client, so the mbox_client
+ * structure shouldn't be modified until the mbox_free_channel returns.
+ *
+ * Return: 0 if the channel was assigned to the client successfully.
+ *         <0 for request failure.
+ */
+int mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl)
+{
+	int ret;
+
+	mutex_lock(&con_mutex);
+	ret = __mbox_bind_client(chan, cl);
+	mutex_unlock(&con_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mbox_bind_client);
+
 /**
  * mbox_request_channel - Request a mailbox channel.
  * @cl: Identity of the client requesting the channel.
@@ -340,7 +405,6 @@ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index)
 	struct mbox_controller *mbox;
 	struct of_phandle_args spec;
 	struct mbox_chan *chan;
-	unsigned long flags;
 	int ret;
 
 	if (!dev || !dev->of_node) {
@@ -372,33 +436,9 @@ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index)
 		return chan;
 	}
 
-	if (chan->cl || !try_module_get(mbox->dev->driver->owner)) {
-		dev_dbg(dev, "%s: mailbox not free\n", __func__);
-		mutex_unlock(&con_mutex);
-		return ERR_PTR(-EBUSY);
-	}
-
-	spin_lock_irqsave(&chan->lock, flags);
-	chan->msg_free = 0;
-	chan->msg_count = 0;
-	chan->active_req = NULL;
-	chan->cl = cl;
-	init_completion(&chan->tx_complete);
-
-	if (chan->txdone_method	== TXDONE_BY_POLL && cl->knows_txdone)
-		chan->txdone_method = TXDONE_BY_ACK;
-
-	spin_unlock_irqrestore(&chan->lock, flags);
-
-	if (chan->mbox->ops->startup) {
-		ret = chan->mbox->ops->startup(chan);
-
-		if (ret) {
-			dev_err(dev, "Unable to startup the chan (%d)\n", ret);
-			mbox_free_channel(chan);
-			chan = ERR_PTR(ret);
-		}
-	}
+	ret = __mbox_bind_client(chan, cl);
+	if (ret)
+		chan = ERR_PTR(ret);
 
 	mutex_unlock(&con_mutex);
 	return chan;
diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h
index 65229a45590f..734694912ef7 100644
--- a/include/linux/mailbox_client.h
+++ b/include/linux/mailbox_client.h
@@ -37,6 +37,7 @@ struct mbox_client {
 	void (*tx_done)(struct mbox_client *cl, void *mssg, int r);
 };
 
+int mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl);
 struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl,
 					      const char *name);
 struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index);
-- 
cgit v1.2.3


From c26e0527aaf84a34b4774d80c9a9baa65f4d77f2 Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Mon, 10 Apr 2023 22:55:29 -0700
Subject: x86/hyperv: Add VTL specific structs and hypercalls

Add structs and hypercalls required to enable VTL support on x86.

Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
Link: https://lore.kernel.org/r/1681192532-15460-3-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/include/asm/hyperv-tlfs.h | 75 ++++++++++++++++++++++++++++++++++++++
 include/asm-generic/hyperv-tlfs.h  |  4 ++
 2 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index b4fb75bd1013..cea95dcd27c2 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -716,6 +716,81 @@ union hv_msi_entry {
 	} __packed;
 };
 
+struct hv_x64_segment_register {
+	u64 base;
+	u32 limit;
+	u16 selector;
+	union {
+		struct {
+			u16 segment_type : 4;
+			u16 non_system_segment : 1;
+			u16 descriptor_privilege_level : 2;
+			u16 present : 1;
+			u16 reserved : 4;
+			u16 available : 1;
+			u16 _long : 1;
+			u16 _default : 1;
+			u16 granularity : 1;
+		} __packed;
+		u16 attributes;
+	};
+} __packed;
+
+struct hv_x64_table_register {
+	u16 pad[3];
+	u16 limit;
+	u64 base;
+} __packed;
+
+struct hv_init_vp_context {
+	u64 rip;
+	u64 rsp;
+	u64 rflags;
+
+	struct hv_x64_segment_register cs;
+	struct hv_x64_segment_register ds;
+	struct hv_x64_segment_register es;
+	struct hv_x64_segment_register fs;
+	struct hv_x64_segment_register gs;
+	struct hv_x64_segment_register ss;
+	struct hv_x64_segment_register tr;
+	struct hv_x64_segment_register ldtr;
+
+	struct hv_x64_table_register idtr;
+	struct hv_x64_table_register gdtr;
+
+	u64 efer;
+	u64 cr0;
+	u64 cr3;
+	u64 cr4;
+	u64 msr_cr_pat;
+} __packed;
+
+union hv_input_vtl {
+	u8 as_uint8;
+	struct {
+		u8 target_vtl: 4;
+		u8 use_target_vtl: 1;
+		u8 reserved_z: 3;
+	};
+} __packed;
+
+struct hv_enable_vp_vtl {
+	u64				partition_id;
+	u32				vp_index;
+	union hv_input_vtl		target_vtl;
+	u8				mbz0;
+	u16				mbz1;
+	struct hv_init_vp_context	vp_context;
+} __packed;
+
+struct hv_get_vp_from_apic_id_in {
+	u64 partition_id;
+	union hv_input_vtl target_vtl;
+	u8 res[7];
+	u32 apic_ids[];
+} __packed;
+
 #include <asm-generic/hyperv-tlfs.h>
 
 #endif
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index ea406e901469..f4e4cc4f965f 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -146,6 +146,7 @@ union hv_reference_tsc_msr {
 /* Declare the various hypercall operations. */
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
+#define HVCALL_ENABLE_VP_VTL			0x000f
 #define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
 #define HVCALL_SEND_IPI				0x000b
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX	0x0013
@@ -165,6 +166,8 @@ union hv_reference_tsc_msr {
 #define HVCALL_MAP_DEVICE_INTERRUPT		0x007c
 #define HVCALL_UNMAP_DEVICE_INTERRUPT		0x007d
 #define HVCALL_RETARGET_INTERRUPT		0x007e
+#define HVCALL_START_VP				0x0099
+#define HVCALL_GET_VP_ID_FROM_APIC_ID		0x009a
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
 #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
@@ -220,6 +223,7 @@ enum HV_GENERIC_SET_FORMAT {
 #define HV_STATUS_INVALID_PORT_ID		17
 #define HV_STATUS_INVALID_CONNECTION_ID		18
 #define HV_STATUS_INSUFFICIENT_BUFFERS		19
+#define HV_STATUS_VTL_ALREADY_ENABLED		134
 
 /*
  * The Hyper-V TimeRefCount register and the TSC
-- 
cgit v1.2.3


From 62e8b17ffc2ff0b0e29d5e05a18570c3e70b35ff Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 11 Mar 2023 15:40:07 +0100
Subject: PCI/DOE: Provide synchronous API and use it internally

The DOE API only allows asynchronous exchanges and forces callers to
provide a completion callback.  Yet all existing callers only perform
synchronous exchanges.  Upcoming commits for CMA (Component Measurement
and Authentication, PCIe r6.0 sec 6.31) likewise require only
synchronous DOE exchanges.

Provide a synchronous pci_doe() API call which builds on the internal
asynchronous machinery.

Convert the internal pci_doe_discovery() to the new call.

The new API allows submission of const-declared requests, necessitating
the addition of a const qualifier in struct pci_doe_task.

Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ming Li <ming4.li@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/0f444206da9615c56301fbaff459c0f45d27f122.1678543498.git.lukas@wunner.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/doe.c       | 69 ++++++++++++++++++++++++++++++++++++++-----------
 include/linux/pci-doe.h |  6 ++++-
 2 files changed, 59 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index e5e9b287b976..adde8c66499c 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -321,26 +321,15 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
 	__le32 request_pl_le = cpu_to_le32(request_pl);
 	__le32 response_pl_le;
 	u32 response_pl;
-	DECLARE_COMPLETION_ONSTACK(c);
-	struct pci_doe_task task = {
-		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
-		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
-		.request_pl = &request_pl_le,
-		.request_pl_sz = sizeof(request_pl),
-		.response_pl = &response_pl_le,
-		.response_pl_sz = sizeof(response_pl),
-		.complete = pci_doe_task_complete,
-		.private = &c,
-	};
 	int rc;
 
-	rc = pci_doe_submit_task(doe_mb, &task);
+	rc = pci_doe(doe_mb, PCI_VENDOR_ID_PCI_SIG, PCI_DOE_PROTOCOL_DISCOVERY,
+		     &request_pl_le, sizeof(request_pl_le),
+		     &response_pl_le, sizeof(response_pl_le));
 	if (rc < 0)
 		return rc;
 
-	wait_for_completion(&c);
-
-	if (task.rv != sizeof(response_pl))
+	if (rc != sizeof(response_pl_le))
 		return -EIO;
 
 	response_pl = le32_to_cpu(response_pl_le);
@@ -552,3 +541,53 @@ int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_doe_submit_task);
+
+/**
+ * pci_doe() - Perform Data Object Exchange
+ *
+ * @doe_mb: DOE Mailbox
+ * @vendor: Vendor ID
+ * @type: Data Object Type
+ * @request: Request payload
+ * @request_sz: Size of request payload (bytes)
+ * @response: Response payload
+ * @response_sz: Size of response payload (bytes)
+ *
+ * Submit @request to @doe_mb and store the @response.
+ * The DOE exchange is performed synchronously and may therefore sleep.
+ *
+ * Payloads are treated as opaque byte streams which are transmitted verbatim,
+ * without byte-swapping.  If payloads contain little-endian register values,
+ * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
+ *
+ * RETURNS: Length of received response or negative errno.
+ * Received data in excess of @response_sz is discarded.
+ * The length may be smaller than @response_sz and the caller
+ * is responsible for checking that.
+ */
+int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
+	    const void *request, size_t request_sz,
+	    void *response, size_t response_sz)
+{
+	DECLARE_COMPLETION_ONSTACK(c);
+	struct pci_doe_task task = {
+		.prot.vid = vendor,
+		.prot.type = type,
+		.request_pl = request,
+		.request_pl_sz = request_sz,
+		.response_pl = response,
+		.response_pl_sz = response_sz,
+		.complete = pci_doe_task_complete,
+		.private = &c,
+	};
+	int rc;
+
+	rc = pci_doe_submit_task(doe_mb, &task);
+	if (rc)
+		return rc;
+
+	wait_for_completion(&c);
+
+	return task.rv;
+}
+EXPORT_SYMBOL_GPL(pci_doe);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index 43765eaf2342..5dcd54f892e5 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -49,7 +49,7 @@ struct pci_doe_mb;
  */
 struct pci_doe_task {
 	struct pci_doe_protocol prot;
-	__le32 *request_pl;
+	const __le32 *request_pl;
 	size_t request_pl_sz;
 	__le32 *response_pl;
 	size_t response_pl_sz;
@@ -78,4 +78,8 @@ struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
 bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
 int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task);
 
+int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
+	    const void *request, size_t request_sz,
+	    void *response, size_t response_sz);
+
 #endif
-- 
cgit v1.2.3


From 0821ff8ed059d38dfc9bec2106c5cc53bcaa15b1 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 11 Mar 2023 15:40:09 +0100
Subject: PCI/DOE: Make asynchronous API private

A synchronous API for DOE has just been introduced.  CXL (the only
in-tree DOE user so far) was converted to use it instead of the
asynchronous API.

Consequently, pci_doe_submit_task() as well as the pci_doe_task struct
are only used internally, so make them private.

Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ming Li <ming4.li@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/cc19544068483681e91dfe27545c2180cd09f931.1678543498.git.lukas@wunner.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/doe.c       | 45 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/pci-doe.h | 48 ------------------------------------------------
 2 files changed, 43 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index adde8c66499c..dfdec73f6abc 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -56,6 +56,47 @@ struct pci_doe_mb {
 	unsigned long flags;
 };
 
+struct pci_doe_protocol {
+	u16 vid;
+	u8 type;
+};
+
+/**
+ * struct pci_doe_task - represents a single query/response
+ *
+ * @prot: DOE Protocol
+ * @request_pl: The request payload
+ * @request_pl_sz: Size of the request payload (bytes)
+ * @response_pl: The response payload
+ * @response_pl_sz: Size of the response payload (bytes)
+ * @rv: Return value.  Length of received response or error (bytes)
+ * @complete: Called when task is complete
+ * @private: Private data for the consumer
+ * @work: Used internally by the mailbox
+ * @doe_mb: Used internally by the mailbox
+ *
+ * The payload sizes and rv are specified in bytes with the following
+ * restrictions concerning the protocol.
+ *
+ *	1) The request_pl_sz must be a multiple of double words (4 bytes)
+ *	2) The response_pl_sz must be >= a single double word (4 bytes)
+ *	3) rv is returned as bytes but it will be a multiple of double words
+ */
+struct pci_doe_task {
+	struct pci_doe_protocol prot;
+	const __le32 *request_pl;
+	size_t request_pl_sz;
+	__le32 *response_pl;
+	size_t response_pl_sz;
+	int rv;
+	void (*complete)(struct pci_doe_task *task);
+	void *private;
+
+	/* initialized by pci_doe_submit_task() */
+	struct work_struct work;
+	struct pci_doe_mb *doe_mb;
+};
+
 static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout)
 {
 	if (wait_event_timeout(doe_mb->wq,
@@ -519,7 +560,8 @@ EXPORT_SYMBOL_GPL(pci_doe_supports_prot);
  *
  * RETURNS: 0 when task has been successfully queued, -ERRNO on error
  */
-int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+static int pci_doe_submit_task(struct pci_doe_mb *doe_mb,
+			       struct pci_doe_task *task)
 {
 	if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type))
 		return -EINVAL;
@@ -540,7 +582,6 @@ int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
 	queue_work(doe_mb->work_queue, &task->work);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pci_doe_submit_task);
 
 /**
  * pci_doe() - Perform Data Object Exchange
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index 5dcd54f892e5..7f16749c6aa3 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -13,55 +13,8 @@
 #ifndef LINUX_PCI_DOE_H
 #define LINUX_PCI_DOE_H
 
-struct pci_doe_protocol {
-	u16 vid;
-	u8 type;
-};
-
 struct pci_doe_mb;
 
-/**
- * struct pci_doe_task - represents a single query/response
- *
- * @prot: DOE Protocol
- * @request_pl: The request payload
- * @request_pl_sz: Size of the request payload (bytes)
- * @response_pl: The response payload
- * @response_pl_sz: Size of the response payload (bytes)
- * @rv: Return value.  Length of received response or error (bytes)
- * @complete: Called when task is complete
- * @private: Private data for the consumer
- * @work: Used internally by the mailbox
- * @doe_mb: Used internally by the mailbox
- *
- * Payloads are treated as opaque byte streams which are transmitted verbatim,
- * without byte-swapping.  If payloads contain little-endian register values,
- * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
- *
- * The payload sizes and rv are specified in bytes with the following
- * restrictions concerning the protocol.
- *
- *	1) The request_pl_sz must be a multiple of double words (4 bytes)
- *	2) The response_pl_sz must be >= a single double word (4 bytes)
- *	3) rv is returned as bytes but it will be a multiple of double words
- *
- * NOTE there is no need for the caller to initialize work or doe_mb.
- */
-struct pci_doe_task {
-	struct pci_doe_protocol prot;
-	const __le32 *request_pl;
-	size_t request_pl_sz;
-	__le32 *response_pl;
-	size_t response_pl_sz;
-	int rv;
-	void (*complete)(struct pci_doe_task *task);
-	void *private;
-
-	/* No need for the user to initialize these fields */
-	struct work_struct work;
-	struct pci_doe_mb *doe_mb;
-};
-
 /**
  * pci_doe_for_each_off - Iterate each DOE capability
  * @pdev: struct pci_dev to iterate
@@ -76,7 +29,6 @@ struct pci_doe_task {
 
 struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
 bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
-int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task);
 
 int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
 	    const void *request, size_t request_sz,
-- 
cgit v1.2.3


From ac04840350e2c21a17d867b262a1586603b87a92 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 11 Mar 2023 15:40:12 +0100
Subject: PCI/DOE: Create mailboxes on device enumeration

Currently a DOE instance cannot be shared by multiple drivers because
each driver creates its own pci_doe_mb struct for a given DOE instance.
For the same reason a DOE instance cannot be shared between the PCI core
and a driver.

Moreover, finding out which protocols a DOE instance supports requires
creating a pci_doe_mb for it.  If a device has multiple DOE instances,
a driver looking for a specific protocol may need to create a pci_doe_mb
for each of the device's DOE instances and then destroy those which
do not support the desired protocol.  That's obviously an inefficient
way to do things.

Overcome these issues by creating mailboxes in the PCI core on device
enumeration.

Provide a pci_find_doe_mailbox() API call to allow drivers to get a
pci_doe_mb for a given (pci_dev, vendor, protocol) triple.  This API is
modeled after pci_find_capability() and can later be amended with a
pci_find_next_doe_mailbox() call to iterate over all mailboxes of a
given pci_dev which support a specific protocol.

On removal, destroy the mailboxes in pci_destroy_dev(), after the driver
is unbound.  This allows drivers to use DOE in their ->remove() hook.

On surprise removal, cancel ongoing DOE exchanges and prevent new ones
from being scheduled.  Thereby ensure that a hot-removed device doesn't
needlessly wait for a running exchange to time out.

Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ming Li <ming4.li@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/40a6f973f72ef283d79dd55e7e6fddc7481199af.1678543498.git.lukas@wunner.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/doe.c       | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h       | 11 ++++++++
 drivers/pci/probe.c     |  1 +
 drivers/pci/remove.c    |  1 +
 include/linux/pci-doe.h |  2 ++
 include/linux/pci.h     |  3 ++
 6 files changed, 91 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index 7539e72db5cb..9c577f5b3878 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -20,6 +20,8 @@
 #include <linux/pci-doe.h>
 #include <linux/workqueue.h>
 
+#include "pci.h"
+
 #define PCI_DOE_PROTOCOL_DISCOVERY 0
 
 /* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
@@ -658,3 +660,74 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
 	return task.rv;
 }
 EXPORT_SYMBOL_GPL(pci_doe);
+
+/**
+ * pci_find_doe_mailbox() - Find Data Object Exchange mailbox
+ *
+ * @pdev: PCI device
+ * @vendor: Vendor ID
+ * @type: Data Object Type
+ *
+ * Find first DOE mailbox of a PCI device which supports the given protocol.
+ *
+ * RETURNS: Pointer to the DOE mailbox or NULL if none was found.
+ */
+struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
+					u8 type)
+{
+	struct pci_doe_mb *doe_mb;
+	unsigned long index;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb)
+		if (pci_doe_supports_prot(doe_mb, vendor, type))
+			return doe_mb;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pci_find_doe_mailbox);
+
+void pci_doe_init(struct pci_dev *pdev)
+{
+	struct pci_doe_mb *doe_mb;
+	u16 offset = 0;
+	int rc;
+
+	xa_init(&pdev->doe_mbs);
+
+	while ((offset = pci_find_next_ext_capability(pdev, offset,
+						      PCI_EXT_CAP_ID_DOE))) {
+		doe_mb = pci_doe_create_mb(pdev, offset);
+		if (IS_ERR(doe_mb)) {
+			pci_err(pdev, "[%x] failed to create mailbox: %ld\n",
+				offset, PTR_ERR(doe_mb));
+			continue;
+		}
+
+		rc = xa_insert(&pdev->doe_mbs, offset, doe_mb, GFP_KERNEL);
+		if (rc) {
+			pci_err(pdev, "[%x] failed to insert mailbox: %d\n",
+				offset, rc);
+			pci_doe_destroy_mb(doe_mb);
+		}
+	}
+}
+
+void pci_doe_destroy(struct pci_dev *pdev)
+{
+	struct pci_doe_mb *doe_mb;
+	unsigned long index;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb)
+		pci_doe_destroy_mb(doe_mb);
+
+	xa_destroy(&pdev->doe_mbs);
+}
+
+void pci_doe_disconnected(struct pci_dev *pdev)
+{
+	struct pci_doe_mb *doe_mb;
+	unsigned long index;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb)
+		pci_doe_cancel_tasks(doe_mb);
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d2c08670a20e..815a4d2a41da 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -318,6 +318,16 @@ struct pci_sriov {
 	bool		drivers_autoprobe; /* Auto probing of VFs by driver */
 };
 
+#ifdef CONFIG_PCI_DOE
+void pci_doe_init(struct pci_dev *pdev);
+void pci_doe_destroy(struct pci_dev *pdev);
+void pci_doe_disconnected(struct pci_dev *pdev);
+#else
+static inline void pci_doe_init(struct pci_dev *pdev) { }
+static inline void pci_doe_destroy(struct pci_dev *pdev) { }
+static inline void pci_doe_disconnected(struct pci_dev *pdev) { }
+#endif
+
 /**
  * pci_dev_set_io_state - Set the new error state if possible.
  *
@@ -354,6 +364,7 @@ static inline bool pci_dev_set_io_state(struct pci_dev *dev,
 static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
 {
 	pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
+	pci_doe_disconnected(dev);
 
 	return 0;
 }
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a3f68b6ba6ac..02d2bf80eedb 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2479,6 +2479,7 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_aer_init(dev);		/* Advanced Error Reporting */
 	pci_dpc_init(dev);		/* Downstream Port Containment */
 	pci_rcec_init(dev);		/* Root Complex Event Collector */
+	pci_doe_init(dev);		/* Data Object Exchange */
 
 	pcie_report_downtraining(dev);
 	pci_init_reset_methods(dev);
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 0145aef1b930..f25acf50879f 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -39,6 +39,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
 	list_del(&dev->bus_list);
 	up_write(&pci_bus_sem);
 
+	pci_doe_destroy(dev);
 	pcie_aspm_exit_link_state(dev);
 	pci_bridge_d3_update(dev);
 	pci_free_resources(dev);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index 7f16749c6aa3..d6192ee0ac07 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -29,6 +29,8 @@ struct pci_doe_mb;
 
 struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
 bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
+struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
+					u8 type);
 
 int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
 	    const void *request, size_t request_sz,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b50e5c79f7e3..29d99530ec17 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -511,6 +511,9 @@ struct pci_dev {
 #endif
 #ifdef CONFIG_PCI_P2PDMA
 	struct pci_p2pdma __rcu *p2pdma;
+#endif
+#ifdef CONFIG_PCI_DOE
+	struct xarray	doe_mbs;	/* Data Object Exchange mailboxes */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
 	phys_addr_t	rom;		/* Physical address if not from BAR */
-- 
cgit v1.2.3


From 74e491e5d1bcc35a699291df720191760ff4130e Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 11 Mar 2023 15:40:14 +0100
Subject: PCI/DOE: Make mailbox creation API private

The PCI core has just been amended to create a pci_doe_mb struct for
every DOE instance on device enumeration.  CXL (the only in-tree DOE
user so far) has been migrated to use those mailboxes instead of
creating its own.

That leaves pcim_doe_create_mb() and pci_doe_for_each_off() without any
callers, so drop them.

pci_doe_supports_prot() is now only used internally, so declare it
static.

pci_doe_destroy_mb() is no longer used as callback for
devm_add_action(), so refactor it to accept a struct pci_doe_mb pointer
instead of a generic void pointer.

Because pci_doe_create_mb() is only called on device enumeration, i.e.
before driver binding, the workqueue name never contains a driver name.
So replace dev_driver_string() with dev_bus_name() when generating the
workqueue name.

Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ming Li <ming4.li@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/64f614b6584982986c55d2c6229b4ee2b276dd59.1678543498.git.lukas@wunner.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format           |  1 -
 drivers/pci/doe.c       | 41 ++++-------------------------------------
 include/linux/pci-doe.h | 14 --------------
 3 files changed, 4 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index d988e9fa9b26..1c85f6ddc71a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -520,7 +520,6 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
-  - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
   - 'pcm_for_each_format'
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index 9c577f5b3878..db9a6b0c33c7 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -455,7 +455,7 @@ static struct pci_doe_mb *pci_doe_create_mb(struct pci_dev *pdev,
 	xa_init(&doe_mb->prots);
 
 	doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0,
-						dev_driver_string(&pdev->dev),
+						dev_bus_name(&pdev->dev),
 						pci_name(pdev),
 						doe_mb->cap_offset);
 	if (!doe_mb->work_queue) {
@@ -499,50 +499,18 @@ err_free:
 /**
  * pci_doe_destroy_mb() - Destroy a DOE mailbox object
  *
- * @ptr: Pointer to DOE mailbox
+ * @doe_mb: DOE mailbox
  *
  * Destroy all internal data structures created for the DOE mailbox.
  */
-static void pci_doe_destroy_mb(void *ptr)
+static void pci_doe_destroy_mb(struct pci_doe_mb *doe_mb)
 {
-	struct pci_doe_mb *doe_mb = ptr;
-
 	pci_doe_cancel_tasks(doe_mb);
 	xa_destroy(&doe_mb->prots);
 	destroy_workqueue(doe_mb->work_queue);
 	kfree(doe_mb);
 }
 
-/**
- * pcim_doe_create_mb() - Create a DOE mailbox object
- *
- * @pdev: PCI device to create the DOE mailbox for
- * @cap_offset: Offset of the DOE mailbox
- *
- * Create a single mailbox object to manage the mailbox protocol at the
- * cap_offset specified.  The mailbox will automatically be destroyed on
- * driver unbinding from @pdev.
- *
- * RETURNS: created mailbox object on success
- *	    ERR_PTR(-errno) on failure
- */
-struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset)
-{
-	struct pci_doe_mb *doe_mb;
-	int rc;
-
-	doe_mb = pci_doe_create_mb(pdev, cap_offset);
-	if (IS_ERR(doe_mb))
-		return doe_mb;
-
-	rc = devm_add_action_or_reset(&pdev->dev, pci_doe_destroy_mb, doe_mb);
-	if (rc)
-		return ERR_PTR(rc);
-
-	return doe_mb;
-}
-EXPORT_SYMBOL_GPL(pcim_doe_create_mb);
-
 /**
  * pci_doe_supports_prot() - Return if the DOE instance supports the given
  *			     protocol
@@ -552,7 +520,7 @@ EXPORT_SYMBOL_GPL(pcim_doe_create_mb);
  *
  * RETURNS: True if the DOE mailbox supports the protocol specified
  */
-bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
+static bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
 {
 	unsigned long index;
 	void *entry;
@@ -567,7 +535,6 @@ bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(pci_doe_supports_prot);
 
 /**
  * pci_doe_submit_task() - Submit a task to be processed by the state machine
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index d6192ee0ac07..1f14aed4354b 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -15,20 +15,6 @@
 
 struct pci_doe_mb;
 
-/**
- * pci_doe_for_each_off - Iterate each DOE capability
- * @pdev: struct pci_dev to iterate
- * @off: u16 of config space offset of each mailbox capability found
- */
-#define pci_doe_for_each_off(pdev, off) \
-	for (off = pci_find_next_ext_capability(pdev, off, \
-					PCI_EXT_CAP_ID_DOE); \
-		off > 0; \
-		off = pci_find_next_ext_capability(pdev, off, \
-					PCI_EXT_CAP_ID_DOE))
-
-struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
-bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
 struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
 					u8 type);
 
-- 
cgit v1.2.3


From 5c899820baaf4a5e9e027d5066c01c04afed290d Mon Sep 17 00:00:00 2001
From: Alain Volmat <avolmat@me.com>
Date: Sun, 16 Apr 2023 22:04:41 +0200
Subject: dt-bindings: reset: remove stih415/stih416 reset

Remove the stih415 and stih416 reset dt-bindings since those
two platforms are no more supported.

Signed-off-by: Alain Volmat <avolmat@me.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230416200442.61554-1-avolmat@me.com
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/dt-bindings/reset/stih415-resets.h | 28 ----------------
 include/dt-bindings/reset/stih416-resets.h | 52 ------------------------------
 2 files changed, 80 deletions(-)
 delete mode 100644 include/dt-bindings/reset/stih415-resets.h
 delete mode 100644 include/dt-bindings/reset/stih416-resets.h

(limited to 'include')

diff --git a/include/dt-bindings/reset/stih415-resets.h b/include/dt-bindings/reset/stih415-resets.h
deleted file mode 100644
index 96f7831a1db0..000000000000
--- a/include/dt-bindings/reset/stih415-resets.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This header provides constants for the reset controller
- * based peripheral powerdown requests on the STMicroelectronics
- * STiH415 SoC.
- */
-#ifndef _DT_BINDINGS_RESET_CONTROLLER_STIH415
-#define _DT_BINDINGS_RESET_CONTROLLER_STIH415
-
-#define STIH415_EMISS_POWERDOWN		0
-#define STIH415_NAND_POWERDOWN		1
-#define STIH415_KEYSCAN_POWERDOWN	2
-#define STIH415_USB0_POWERDOWN		3
-#define STIH415_USB1_POWERDOWN		4
-#define STIH415_USB2_POWERDOWN		5
-#define STIH415_SATA0_POWERDOWN		6
-#define STIH415_SATA1_POWERDOWN		7
-#define STIH415_PCIE_POWERDOWN		8
-
-#define STIH415_ETH0_SOFTRESET		0
-#define STIH415_ETH1_SOFTRESET		1
-#define STIH415_IRB_SOFTRESET		2
-#define STIH415_USB0_SOFTRESET		3
-#define STIH415_USB1_SOFTRESET		4
-#define STIH415_USB2_SOFTRESET		5
-#define STIH415_KEYSCAN_SOFTRESET	6
-
-#endif /* _DT_BINDINGS_RESET_CONTROLLER_STIH415 */
diff --git a/include/dt-bindings/reset/stih416-resets.h b/include/dt-bindings/reset/stih416-resets.h
deleted file mode 100644
index f682c906ed5a..000000000000
--- a/include/dt-bindings/reset/stih416-resets.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This header provides constants for the reset controller
- * based peripheral powerdown requests on the STMicroelectronics
- * STiH416 SoC.
- */
-#ifndef _DT_BINDINGS_RESET_CONTROLLER_STIH416
-#define _DT_BINDINGS_RESET_CONTROLLER_STIH416
-
-#define STIH416_EMISS_POWERDOWN		0
-#define STIH416_NAND_POWERDOWN		1
-#define STIH416_KEYSCAN_POWERDOWN	2
-#define STIH416_USB0_POWERDOWN		3
-#define STIH416_USB1_POWERDOWN		4
-#define STIH416_USB2_POWERDOWN		5
-#define STIH416_USB3_POWERDOWN		6
-#define STIH416_SATA0_POWERDOWN		7
-#define STIH416_SATA1_POWERDOWN		8
-#define STIH416_PCIE0_POWERDOWN		9
-#define STIH416_PCIE1_POWERDOWN		10
-
-#define STIH416_ETH0_SOFTRESET		0
-#define STIH416_ETH1_SOFTRESET		1
-#define STIH416_IRB_SOFTRESET		2
-#define STIH416_USB0_SOFTRESET		3
-#define STIH416_USB1_SOFTRESET		4
-#define STIH416_USB2_SOFTRESET		5
-#define STIH416_USB3_SOFTRESET		6
-#define STIH416_SATA0_SOFTRESET		7
-#define STIH416_SATA1_SOFTRESET		8
-#define STIH416_PCIE0_SOFTRESET		9
-#define STIH416_PCIE1_SOFTRESET		10
-#define STIH416_AUD_DAC_SOFTRESET	11
-#define STIH416_HDTVOUT_SOFTRESET	12
-#define STIH416_VTAC_M_RX_SOFTRESET	13
-#define STIH416_VTAC_A_RX_SOFTRESET	14
-#define STIH416_SYNC_HD_SOFTRESET	15
-#define STIH416_SYNC_SD_SOFTRESET	16
-#define STIH416_BLITTER_SOFTRESET	17
-#define STIH416_GPU_SOFTRESET		18
-#define STIH416_VTAC_M_TX_SOFTRESET	19
-#define STIH416_VTAC_A_TX_SOFTRESET	20
-#define STIH416_VTG_AUX_SOFTRESET	21
-#define STIH416_JPEG_DEC_SOFTRESET	22
-#define STIH416_HVA_SOFTRESET		23
-#define STIH416_COMPO_M_SOFTRESET	24
-#define STIH416_COMPO_A_SOFTRESET	25
-#define STIH416_VP8_DEC_SOFTRESET	26
-#define STIH416_VTG_MAIN_SOFTRESET	27
-#define STIH416_KEYSCAN_SOFTRESET	28
-
-#endif /* _DT_BINDINGS_RESET_CONTROLLER_STIH416 */
-- 
cgit v1.2.3


From 48380368dec14859723b9e3fbd43e042638d9a76 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Mar 2023 12:14:42 +0200
Subject: Change DEFINE_SEMAPHORE() to take a number argument

Fundamentally semaphores are a counted primitive, but
DEFINE_SEMAPHORE() does not expose this and explicitly creates a
binary semaphore.

Change DEFINE_SEMAPHORE() to take a number argument and use that in the
few places that open-coded it using __SEMAPHORE_INITIALIZER().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[mcgrof: add some tribal knowledge about why some folks prefer
 binary sempahores over mutexes]
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/mips/cavium-octeon/setup.c                               |  2 +-
 arch/x86/kernel/cpu/intel.c                                   |  2 +-
 drivers/firmware/efi/runtime-wrappers.c                       |  2 +-
 drivers/firmware/efi/vars.c                                   |  2 +-
 drivers/macintosh/adb.c                                       |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c              |  2 +-
 drivers/platform/x86/intel/ifs/sysfs.c                        |  2 +-
 drivers/scsi/esas2r/esas2r_ioctl.c                            |  2 +-
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c |  2 +-
 include/linux/semaphore.h                                     | 10 ++++++++--
 kernel/printk/printk.c                                        |  2 +-
 net/rxrpc/call_object.c                                       |  6 ++----
 12 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index a71727f7a608..c5561016f577 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -72,7 +72,7 @@ extern void pci_console_init(const char *arg);
 static unsigned long long max_memory = ULLONG_MAX;
 static unsigned long long reserve_low_mem;
 
-DEFINE_SEMAPHORE(octeon_bootbus_sem);
+DEFINE_SEMAPHORE(octeon_bootbus_sem, 1);
 EXPORT_SYMBOL(octeon_bootbus_sem);
 
 static struct octeon_boot_descriptor *octeon_boot_desc_ptr;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 291d4167fab8..12bad63822f0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1177,7 +1177,7 @@ static const struct {
 static struct ratelimit_state bld_ratelimit;
 
 static unsigned int sysctl_sld_mitigate = 1;
-static DEFINE_SEMAPHORE(buslock_sem);
+static DEFINE_SEMAPHORE(buslock_sem, 1);
 
 #ifdef CONFIG_PROC_SYSCTL
 static struct ctl_table sld_sysctls[] = {
diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index 1fba4e09cdcf..a400c4312c82 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -158,7 +158,7 @@ void efi_call_virt_check_flags(unsigned long flags, const char *call)
  * none of the remaining functions are actually ever called at runtime.
  * So let's just use a single lock to serialize all Runtime Services calls.
  */
-static DEFINE_SEMAPHORE(efi_runtime_lock);
+static DEFINE_SEMAPHORE(efi_runtime_lock, 1);
 
 /*
  * Expose the EFI runtime lock to the UV platform
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index bd75b87f5fc1..bfc5fa6aa47b 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -21,7 +21,7 @@
 /* Private pointer to registered efivars */
 static struct efivars *__efivars;
 
-static DEFINE_SEMAPHORE(efivars_lock);
+static DEFINE_SEMAPHORE(efivars_lock, 1);
 
 static efi_status_t check_var_size(bool nonblocking, u32 attributes,
 				   unsigned long size)
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 23bd0c77ac1a..56599515d51a 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -80,7 +80,7 @@ static struct adb_driver *adb_controller;
 BLOCKING_NOTIFIER_HEAD(adb_client_list);
 static int adb_got_sleep;
 static int adb_inited;
-static DEFINE_SEMAPHORE(adb_probe_mutex);
+static DEFINE_SEMAPHORE(adb_probe_mutex, 1);
 static int sleepy_trackpad;
 static int autopoll_devs;
 int __adb_probe_sync;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 5d1e4fe335aa..5a105bab4387 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -298,7 +298,7 @@ const u32 dmae_reg_go_c[] = {
 
 /* Global resources for unloading a previously loaded device */
 #define BNX2X_PREV_WAIT_NEEDED 1
-static DEFINE_SEMAPHORE(bnx2x_prev_sem);
+static DEFINE_SEMAPHORE(bnx2x_prev_sem, 1);
 static LIST_HEAD(bnx2x_prev_list);
 
 /* Forward declaration */
diff --git a/drivers/platform/x86/intel/ifs/sysfs.c b/drivers/platform/x86/intel/ifs/sysfs.c
index ee636a76b083..4c3c642ee19a 100644
--- a/drivers/platform/x86/intel/ifs/sysfs.c
+++ b/drivers/platform/x86/intel/ifs/sysfs.c
@@ -13,7 +13,7 @@
  * Protects against simultaneous tests on multiple cores, or
  * reloading can file while a test is in progress
  */
-static DEFINE_SEMAPHORE(ifs_sem);
+static DEFINE_SEMAPHORE(ifs_sem, 1);
 
 /*
  * The sysfs interface to check additional details of last test
diff --git a/drivers/scsi/esas2r/esas2r_ioctl.c b/drivers/scsi/esas2r/esas2r_ioctl.c
index e003d923acbf..055d2e87a2c8 100644
--- a/drivers/scsi/esas2r/esas2r_ioctl.c
+++ b/drivers/scsi/esas2r/esas2r_ioctl.c
@@ -56,7 +56,7 @@ dma_addr_t esas2r_buffered_ioctl_addr;
 u32 esas2r_buffered_ioctl_size;
 struct pci_dev *esas2r_buffered_ioctl_pcid;
 
-static DEFINE_SEMAPHORE(buffered_ioctl_semaphore);
+static DEFINE_SEMAPHORE(buffered_ioctl_semaphore, 1);
 typedef int (*BUFFERED_IOCTL_CALLBACK)(struct esas2r_adapter *,
 				       struct esas2r_request *,
 				       struct esas2r_sg_context *,
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index cddcd3c596c9..1a656fdc9445 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -149,7 +149,7 @@ static char *g_fragments_base;
 static char *g_free_fragments;
 static struct semaphore g_free_fragments_sema;
 
-static DEFINE_SEMAPHORE(g_free_fragments_mutex);
+static DEFINE_SEMAPHORE(g_free_fragments_mutex, 1);
 
 static int
 vchiq_blocking_bulk_transfer(struct vchiq_instance *instance, unsigned int handle, void *data,
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 6694d0019a68..04655faadc2d 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -25,8 +25,14 @@ struct semaphore {
 	.wait_list	= LIST_HEAD_INIT((name).wait_list),		\
 }
 
-#define DEFINE_SEMAPHORE(name)	\
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
+/*
+ * Unlike mutexes, binary semaphores do not have an owner, so up() can
+ * be called in a different thread from the one which called down().
+ * It is also safe to call down_trylock() and up() from interrupt
+ * context.
+ */
+#define DEFINE_SEMAPHORE(_name, _n)	\
+	struct semaphore _name = __SEMAPHORE_INITIALIZER(_name, _n)
 
 static inline void sema_init(struct semaphore *sem, int val)
 {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd0c9f913940..76987aaa5a45 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -89,7 +89,7 @@ static DEFINE_MUTEX(console_mutex);
  * console_sem protects updates to console->seq and console_suspended,
  * and also provides serialization for console printing.
  */
-static DEFINE_SEMAPHORE(console_sem);
+static DEFINE_SEMAPHORE(console_sem, 1);
 HLIST_HEAD(console_list);
 EXPORT_SYMBOL_GPL(console_list);
 DEFINE_STATIC_SRCU(console_srcu);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index e9f1f49d18c2..3e5cc70884dd 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -40,10 +40,8 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
 
 struct kmem_cache *rxrpc_call_jar;
 
-static struct semaphore rxrpc_call_limiter =
-	__SEMAPHORE_INITIALIZER(rxrpc_call_limiter, 1000);
-static struct semaphore rxrpc_kernel_call_limiter =
-	__SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000);
+static DEFINE_SEMAPHORE(rxrpc_call_limiter, 1000);
+static DEFINE_SEMAPHORE(rxrpc_kernel_call_limiter, 1000);
 
 void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
 {
-- 
cgit v1.2.3


From 47ebd0310e89c087f56e58c103c44b72a2f6b216 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 13 Apr 2023 15:12:20 +0200
Subject: mm: kmsan: handle alloc failures in kmsan_vmap_pages_range_noflush()

As reported by Dipanjan Das, when KMSAN is used together with kernel fault
injection (or, generally, even without the latter), calls to kcalloc() or
__vmap_pages_range_noflush() may fail, leaving the metadata mappings for
the virtual mapping in an inconsistent state.  When these metadata
mappings are accessed later, the kernel crashes.

To address the problem, we return a non-zero error code from
kmsan_vmap_pages_range_noflush() in the case of any allocation/mapping
failure inside it, and make vmap_pages_range_noflush() return an error if
KMSAN fails to allocate the metadata.

This patch also removes KMSAN_WARN_ON() from vmap_pages_range_noflush(),
as these allocation failures are not fatal anymore.

Link: https://lkml.kernel.org/r/20230413131223.4135168-1-glider@google.com
Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations")
Signed-off-by: Alexander Potapenko <glider@google.com>
Reported-by: Dipanjan Das <mail.dipanjan.das@gmail.com>
  Link: https://lore.kernel.org/linux-mm/CANX2M5ZRrRA64k0hOif02TjmY9kbbO2aCBPyq79es34RXZ=cAw@mail.gmail.com/
Reviewed-by: Marco Elver <elver@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmsan.h | 20 +++++++++++---------
 mm/kmsan/shadow.c     | 27 ++++++++++++++++++---------
 mm/vmalloc.c          |  6 +++++-
 3 files changed, 34 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index e38ae3c34618..c7ff3aefc5a1 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr);
  * @page_shift:	page_shift passed to vmap_range_noflush().
  *
  * KMSAN maps shadow and origin pages of @pages into contiguous ranges in
- * vmalloc metadata address range.
+ * vmalloc metadata address range. Returns 0 on success, callers must check
+ * for non-zero return value.
  */
-void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
-				    pgprot_t prot, struct page **pages,
-				    unsigned int page_shift);
+int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages,
+				   unsigned int page_shift);
 
 /**
  * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -281,12 +282,13 @@ static inline void kmsan_kfree_large(const void *ptr)
 {
 }
 
-static inline void kmsan_vmap_pages_range_noflush(unsigned long start,
-						  unsigned long end,
-						  pgprot_t prot,
-						  struct page **pages,
-						  unsigned int page_shift)
+static inline int kmsan_vmap_pages_range_noflush(unsigned long start,
+						 unsigned long end,
+						 pgprot_t prot,
+						 struct page **pages,
+						 unsigned int page_shift)
 {
+	return 0;
 }
 
 static inline void kmsan_vunmap_range_noflush(unsigned long start,
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index a787c04e9583..b8bb95eea5e3 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order)
 	kmsan_leave_runtime();
 }
 
-void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
-				    pgprot_t prot, struct page **pages,
-				    unsigned int page_shift)
+int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages,
+				   unsigned int page_shift)
 {
 	unsigned long shadow_start, origin_start, shadow_end, origin_end;
 	struct page **s_pages, **o_pages;
-	int nr, mapped;
+	int nr, mapped, err = 0;
 
 	if (!kmsan_enabled)
-		return;
+		return 0;
 
 	shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
 	shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
 	if (!shadow_start)
-		return;
+		return 0;
 
 	nr = (end - start) / PAGE_SIZE;
 	s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
 	o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
-	if (!s_pages || !o_pages)
+	if (!s_pages || !o_pages) {
+		err = -ENOMEM;
 		goto ret;
+	}
 	for (int i = 0; i < nr; i++) {
 		s_pages[i] = shadow_page_for(pages[i]);
 		o_pages[i] = origin_page_for(pages[i]);
@@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
 	kmsan_enter_runtime();
 	mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
 					    s_pages, page_shift);
-	KMSAN_WARN_ON(mapped);
+	if (mapped) {
+		err = mapped;
+		goto ret;
+	}
 	mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
 					    o_pages, page_shift);
-	KMSAN_WARN_ON(mapped);
+	if (mapped) {
+		err = mapped;
+		goto ret;
+	}
 	kmsan_leave_runtime();
 	flush_tlb_kernel_range(shadow_start, shadow_end);
 	flush_tlb_kernel_range(origin_start, origin_end);
@@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
 ret:
 	kfree(s_pages);
 	kfree(o_pages);
+	return err;
 }
 
 /* Allocate metadata for pages allocated at boot time. */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a50072066221..1355d95cce1c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
-	kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
+						 page_shift);
+
+	if (ret)
+		return ret;
 	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
 }
 
-- 
cgit v1.2.3


From fdea03e12aa2a44a7bb34144208be97fc25dfd90 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 13 Apr 2023 15:12:21 +0200
Subject: mm: kmsan: handle alloc failures in kmsan_ioremap_page_range()

Similarly to kmsan_vmap_pages_range_noflush(), kmsan_ioremap_page_range()
must also properly handle allocation/mapping failures.  In the case of
such, it must clean up the already created metadata mappings and return an
error code, so that the error can be propagated to ioremap_page_range().
Without doing so, KMSAN may silently fail to bring the metadata for the
page range into a consistent state, which will result in user-visible
crashes when trying to access them.

Link: https://lkml.kernel.org/r/20230413131223.4135168-2-glider@google.com
Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations")
Signed-off-by: Alexander Potapenko <glider@google.com>
Reported-by: Dipanjan Das <mail.dipanjan.das@gmail.com>
  Link: https://lore.kernel.org/linux-mm/CANX2M5ZRrRA64k0hOif02TjmY9kbbO2aCBPyq79es34RXZ=cAw@mail.gmail.com/
Reviewed-by: Marco Elver <elver@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmsan.h | 19 +++++++++---------
 mm/kmsan/hooks.c      | 55 +++++++++++++++++++++++++++++++++++++++++++--------
 mm/vmalloc.c          |  4 ++--
 3 files changed, 59 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index c7ff3aefc5a1..30b17647ce3c 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -160,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
  * @page_shift:	page_shift argument passed to vmap_range_noflush().
  *
  * KMSAN creates new metadata pages for the physical pages mapped into the
- * virtual memory.
+ * virtual memory. Returns 0 on success, callers must check for non-zero return
+ * value.
  */
-void kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
-			      phys_addr_t phys_addr, pgprot_t prot,
-			      unsigned int page_shift);
+int kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+			     phys_addr_t phys_addr, pgprot_t prot,
+			     unsigned int page_shift);
 
 /**
  * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
@@ -296,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start,
 {
 }
 
-static inline void kmsan_ioremap_page_range(unsigned long start,
-					    unsigned long end,
-					    phys_addr_t phys_addr,
-					    pgprot_t prot,
-					    unsigned int page_shift)
+static inline int kmsan_ioremap_page_range(unsigned long start,
+					   unsigned long end,
+					   phys_addr_t phys_addr, pgprot_t prot,
+					   unsigned int page_shift)
 {
+	return 0;
 }
 
 static inline void kmsan_iounmap_page_range(unsigned long start,
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3807502766a3..ec0da72e65aa 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
  * into the virtual memory. If those physical pages already had shadow/origin,
  * those are ignored.
  */
-void kmsan_ioremap_page_range(unsigned long start, unsigned long end,
-			      phys_addr_t phys_addr, pgprot_t prot,
-			      unsigned int page_shift)
+int kmsan_ioremap_page_range(unsigned long start, unsigned long end,
+			     phys_addr_t phys_addr, pgprot_t prot,
+			     unsigned int page_shift)
 {
 	gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
 	struct page *shadow, *origin;
 	unsigned long off = 0;
-	int nr;
+	int nr, err = 0, clean = 0, mapped;
 
 	if (!kmsan_enabled || kmsan_in_runtime())
-		return;
+		return 0;
 
 	nr = (end - start) / PAGE_SIZE;
 	kmsan_enter_runtime();
-	for (int i = 0; i < nr; i++, off += PAGE_SIZE) {
+	for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) {
 		shadow = alloc_pages(gfp_mask, 1);
 		origin = alloc_pages(gfp_mask, 1);
-		__vmap_pages_range_noflush(
+		if (!shadow || !origin) {
+			err = -ENOMEM;
+			goto ret;
+		}
+		mapped = __vmap_pages_range_noflush(
 			vmalloc_shadow(start + off),
 			vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
 			PAGE_SHIFT);
-		__vmap_pages_range_noflush(
+		if (mapped) {
+			err = mapped;
+			goto ret;
+		}
+		shadow = NULL;
+		mapped = __vmap_pages_range_noflush(
 			vmalloc_origin(start + off),
 			vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
 			PAGE_SHIFT);
+		if (mapped) {
+			__vunmap_range_noflush(
+				vmalloc_shadow(start + off),
+				vmalloc_shadow(start + off + PAGE_SIZE));
+			err = mapped;
+			goto ret;
+		}
+		origin = NULL;
+	}
+	/* Page mapping loop finished normally, nothing to clean up. */
+	clean = 0;
+
+ret:
+	if (clean > 0) {
+		/*
+		 * Something went wrong. Clean up shadow/origin pages allocated
+		 * on the last loop iteration, then delete mappings created
+		 * during the previous iterations.
+		 */
+		if (shadow)
+			__free_pages(shadow, 1);
+		if (origin)
+			__free_pages(origin, 1);
+		__vunmap_range_noflush(
+			vmalloc_shadow(start),
+			vmalloc_shadow(start + clean * PAGE_SIZE));
+		__vunmap_range_noflush(
+			vmalloc_origin(start),
+			vmalloc_origin(start + clean * PAGE_SIZE));
 	}
 	flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
 	flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
 	kmsan_leave_runtime();
+	return err;
 }
 
 void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1355d95cce1c..31ff782d368b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
 				 ioremap_max_page_shift);
 	flush_cache_vmap(addr, end);
 	if (!err)
-		kmsan_ioremap_page_range(addr, end, phys_addr, prot,
-					 ioremap_max_page_shift);
+		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+					       ioremap_max_page_shift);
 	return err;
 }
 
-- 
cgit v1.2.3


From 5f300fd59a2ae90b8a7fb5ed3d5fd43768236c38 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 14 Apr 2023 10:03:53 +0200
Subject: mm: make arch_has_descending_max_zone_pfns() static

clang produces a build failure on x86 for some randconfig builds after a
change that moves around code to mm/mm_init.c:

Cannot find symbol for section 2: .text.
mm/mm_init.o: failed

I have not been able to figure out why this happens, but the __weak
annotation on arch_has_descending_max_zone_pfns() is the trigger here.

Removing the weak function in favor of an open-coded Kconfig option check
avoids the problem and becomes clearer as well as better to optimize by
the compiler.

[arnd@arndb.de: fix logic bug]
  Link: https://lkml.kernel.org/r/20230415081904.969049-1-arnd@kernel.org
Link: https://lkml.kernel.org/r/20230414080418.110236-1-arnd@kernel.org
Fixes: 9420f89db2dd ("mm: move most of core MM initialization to mm/mm_init.c")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: SeongJae Park <sj@kernel.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arc/mm/init.c | 5 -----
 include/linux/mm.h | 1 -
 mm/mm_init.c       | 4 ++--
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index ce4e939a7f07..2b89b6c53801 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -74,11 +74,6 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 		base, TO_MB(size), !in_use ? "Not used":"");
 }
 
-bool arch_has_descending_max_zone_pfns(void)
-{
-	return !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
-}
-
 /*
  * First memory setup routine called from setup_arch()
  * 1. setup swapper's mm @init_mm
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e249208f8fbe..5e5ef6ee4003 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3035,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
 extern int min_free_kbytes;
 extern int watermark_boost_factor;
 extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index dd3a6ed9663f..a0ec3b3acb5e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1752,9 +1752,9 @@ static void __init free_area_init_memoryless_node(int nid)
  * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
  * such cases we allow max_zone_pfn sorted in the descending order
  */
-bool __weak arch_has_descending_max_zone_pfns(void)
+static bool arch_has_descending_max_zone_pfns(void)
 {
-	return false;
+	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
 }
 
 /**
-- 
cgit v1.2.3


From 2ce0bdfebc74f6cbd4e97a4e767d505a81c38cf2 Mon Sep 17 00:00:00 2001
From: Ivan Orlov <ivan.orlov0322@gmail.com>
Date: Wed, 29 Mar 2023 18:53:30 +0400
Subject: mm: khugepaged: fix kernel BUG in hpage_collapse_scan_file()

Syzkaller reported the following issue:

kernel BUG at mm/khugepaged.c:1823!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 5097 Comm: syz-executor220 Not tainted 6.2.0-syzkaller-13154-g857f1268a591 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/16/2023
RIP: 0010:collapse_file mm/khugepaged.c:1823 [inline]
RIP: 0010:hpage_collapse_scan_file+0x67c8/0x7580 mm/khugepaged.c:2233
Code: 00 00 89 de e8 c9 66 a3 ff 31 ff 89 de e8 c0 66 a3 ff 45 84 f6 0f 85 28 0d 00 00 e8 22 64 a3 ff e9 dc f7 ff ff e8 18 64 a3 ff <0f> 0b f3 0f 1e fa e8 0d 64 a3 ff e9 93 f6 ff ff f3 0f 1e fa 4c 89
RSP: 0018:ffffc90003dff4e0 EFLAGS: 00010093
RAX: ffffffff81e95988 RBX: 00000000000001c1 RCX: ffff8880205b3a80
RDX: 0000000000000000 RSI: 00000000000001c0 RDI: 00000000000001c1
RBP: ffffc90003dff830 R08: ffffffff81e90e67 R09: fffffbfff1a433c3
R10: 0000000000000000 R11: dffffc0000000001 R12: 0000000000000000
R13: ffffc90003dff6c0 R14: 00000000000001c0 R15: 0000000000000000
FS:  00007fdbae5ee700(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fdbae6901e0 CR3: 000000007b2dd000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 madvise_collapse+0x721/0xf50 mm/khugepaged.c:2693
 madvise_vma_behavior mm/madvise.c:1086 [inline]
 madvise_walk_vmas mm/madvise.c:1260 [inline]
 do_madvise+0x9e5/0x4680 mm/madvise.c:1439
 __do_sys_madvise mm/madvise.c:1452 [inline]
 __se_sys_madvise mm/madvise.c:1450 [inline]
 __x64_sys_madvise+0xa5/0xb0 mm/madvise.c:1450
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

The xas_store() call during page cache scanning can potentially translate
'xas' into the error state (with the reproducer provided by the syzkaller
the error code is -ENOMEM).  However, there are no further checks after
the 'xas_store', and the next call of 'xas_next' at the start of the
scanning cycle doesn't increase the xa_index, and the issue occurs.

This patch will add the xarray state error checking after the xas_store()
and the corresponding result error code.

Tested via syzbot.

[akpm@linux-foundation.org: update include/trace/events/huge_memory.h's SCAN_STATUS]
Link: https://lkml.kernel.org/r/20230329145330.23191-1-ivan.orlov0322@gmail.com
Link: https://syzkaller.appspot.com/bug?id=7d6bb3760e026ece7524500fe44fb024a0e959fc
Signed-off-by: Ivan Orlov <ivan.orlov0322@gmail.com>
Reported-by: syzbot+9578faa5475acb35fa50@syzkaller.appspotmail.com
Tested-by: Zach O'Keefe <zokeefe@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Himadri Pandya <himadrispandya@gmail.com>
Cc: Ivan Orlov <ivan.orlov0322@gmail.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |  3 ++-
 mm/khugepaged.c                    | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 3e6fb05852f9..c84c7af70158 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -36,7 +36,8 @@
 	EM( SCAN_ALLOC_HUGE_PAGE_FAIL,	"alloc_huge_page_failed")	\
 	EM( SCAN_CGROUP_CHARGE_FAIL,	"ccgroup_charge_failed")	\
 	EM( SCAN_TRUNCATED,		"truncated")			\
-	EMe(SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
+	EM( SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
+	EMe(SCAN_STORE_FAILED,		"store_failed")
 
 #undef EM
 #undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2c6548cd18a9..3b61cd188f7b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -55,6 +55,7 @@ enum scan_result {
 	SCAN_CGROUP_CHARGE_FAIL,
 	SCAN_TRUNCATED,
 	SCAN_PAGE_HAS_PRIVATE,
+	SCAN_STORE_FAILED,
 };
 
 #define CREATE_TRACE_POINTS
@@ -1857,6 +1858,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 					goto xa_locked;
 				}
 				xas_store(&xas, hpage);
+				if (xas_error(&xas)) {
+					/* revert shmem_charge performed
+					 * in the previous condition
+					 */
+					mapping->nrpages--;
+					shmem_uncharge(mapping->host, 1);
+					result = SCAN_STORE_FAILED;
+					goto xa_locked;
+				}
 				nr_none++;
 				continue;
 			}
@@ -2009,6 +2019,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 
 		/* Finally, replace with the new page. */
 		xas_store(&xas, hpage);
+		/* We can't get an ENOMEM here (because the allocation happened before)
+		 * but let's check for errors (XArray implementation can be
+		 * changed in the future)
+		 */
+		WARN_ON_ONCE(xas_error(&xas));
 		continue;
 out_unlock:
 		unlock_page(page);
@@ -2046,6 +2061,11 @@ out_unlock:
 	/* Join all the small entries into a single multi-index entry */
 	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
 	xas_store(&xas, hpage);
+	/* Here we can't get an ENOMEM (because entries were
+	 * previously allocated) But let's check for errors
+	 * (XArray implementation can be changed in the future)
+	 */
+	WARN_ON_ONCE(xas_error(&xas));
 xa_locked:
 	xas_unlock_irq(&xas);
 xa_unlocked:
-- 
cgit v1.2.3


From a85c2257a8ac353af16dbcbf32c50d3380860bc5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Mar 2023 14:44:47 +0100
Subject: sched/isolation: add cpu_is_isolated() API

Patch series "memcg, cpuisol: do not interfere pcp cache charges draining
with cpuisol workloads".

Leonardo has reported [1] that pcp memcg charge draining can interfere
with cpu isolated workloads.  The said draining is done from a WQ context
with a pcp worker scheduled on each CPU which holds any cached charges for
a specific memcg hierarchy.  Operation is not really a common operation
[2].  It can be triggered from the userspace though so some care is
definitely due.

Leonardo has tried to address the issue by allowing remote charge draining
[3].  This approach requires an additional locking to synchronize pcp
caches sync from a remote cpu from local pcp consumers.  Even though the
proposed lock was per-cpu there is still potential for contention and less
predictable behavior.

This patchset addresses the issue from a different angle.  Rather than
dealing with a potential synchronization, cpus which are isolated are
simply never scheduled to be drained.  This means that a small amount of
charges could be laying around and waiting for a later use or they are
flushed when a different memcg is charged from the same cpu.  More details
are in patch 2.  The first patch from Frederic is implementing an
abstraction to tell whether a specific cpu has been isolated and therefore
require a special treatment.


This patch (of 2):

Provide this new API to check if a CPU has been isolated either through
isolcpus= or nohz_full= kernel parameter.

It aims at avoiding kernel load deemed to be safely spared on CPUs running
sensitive workload that can't bear any disturbance, such as pcp cache
draining.

Link: https://lkml.kernel.org/r/20230317134448.11082-1-mhocko@kernel.org
Link: https://lkml.kernel.org/r/20230317134448.11082-2-mhocko@kernel.org
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/isolation.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 8c15abd67aed..fe1a46f30d24 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -46,6 +46,12 @@ static inline bool housekeeping_enabled(enum hk_type type)
 
 static inline void housekeeping_affine(struct task_struct *t,
 				       enum hk_type type) { }
+
+static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
+{
+	return true;
+}
+
 static inline void housekeeping_init(void) { }
 #endif /* CONFIG_CPU_ISOLATION */
 
@@ -58,4 +64,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
 	return true;
 }
 
+static inline bool cpu_is_isolated(int cpu)
+{
+	return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
+		 !housekeeping_test_cpu(cpu, HK_TYPE_TICK);
+}
+
 #endif /* _LINUX_SCHED_ISOLATION_H */
-- 
cgit v1.2.3


From 957ebbdf434013ee01f29f6c9174eced995ebbe7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 27 Mar 2023 16:10:50 +0100
Subject: hugetlb: remove PageHeadHuge()

Sidhartha Kumar removed the last caller of PageHeadHuge(), so we can now
remove it and make folio_test_hugetlb() the real implementation.  Add
kernel-doc for folio_test_hugetlb().

Link: https://lkml.kernel.org/r/20230327151050.1787744-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  7 +------
 mm/hugetlb.c               | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index dcda20c47b8f..efc42fae3d96 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -815,14 +815,9 @@ static inline void ClearPageCompound(struct page *page)
 
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
-int PageHeadHuge(struct page *page);
-static inline bool folio_test_hugetlb(struct folio *folio)
-{
-	return PageHeadHuge(&folio->page);
-}
+bool folio_test_hugetlb(struct folio *folio);
 #else
 TESTPAGEFLAG_FALSE(Huge, hugetlb)
-TESTPAGEFLAG_FALSE(HeadHuge, headhuge)
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a58b3739ed4b..7e4a80769c9e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2050,19 +2050,23 @@ int PageHuge(struct page *page)
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
-/*
- * PageHeadHuge() only returns true for hugetlbfs head page, but not for
- * normal or transparent huge pages.
+/**
+ * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
+ * @folio: The folio to test.
+ *
+ * Context: Any context.  Caller should have a reference on the folio to
+ * prevent it from being turned into a tail page.
+ * Return: True for hugetlbfs folios, false for anon folios or folios
+ * belonging to other filesystems.
  */
-int PageHeadHuge(struct page *page_head)
+bool folio_test_hugetlb(struct folio *folio)
 {
-	struct folio *folio = (struct folio *)page_head;
 	if (!folio_test_large(folio))
-		return 0;
+		return false;
 
 	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
 }
-EXPORT_SYMBOL_GPL(PageHeadHuge);
+EXPORT_SYMBOL_GPL(folio_test_hugetlb);
 
 /*
  * Find and lock address space (mapping) in write mode.
-- 
cgit v1.2.3


From 62f31bd4dcedffe3c919deb76ed65bf62c3cf80b Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 26 Mar 2023 19:02:15 +0300
Subject: mm: move free_area_empty() to mm/internal.h

The free_area_empty() helper is only used inside mm/ so move it there to
reduce noise in include/linux/mmzone.h

Link: https://lkml.kernel.org/r/20230326160215.2674531-1-rppt@kernel.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 5 -----
 mm/internal.h          | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2d22e47dc1eb..337956748976 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -110,11 +110,6 @@ struct free_area {
 	unsigned long		nr_free;
 };
 
-static inline bool free_area_empty(struct free_area *area, int migratetype)
-{
-	return list_empty(&area->free_list[migratetype]);
-}
-
 struct pglist_data;
 
 #ifdef CONFIG_NUMA
diff --git a/mm/internal.h b/mm/internal.h
index 73b167b59cc5..92ddd3a05b74 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -517,6 +517,11 @@ void init_cma_reserved_pageblock(struct page *page);
 int find_suitable_fallback(struct free_area *area, unsigned int order,
 			int migratetype, bool only_stealable, bool *can_steal);
 
+static inline bool free_area_empty(struct free_area *area, int migratetype)
+{
+	return list_empty(&area->free_list[migratetype]);
+}
+
 /*
  * These three helpers classifies VMAs for virtual memory accounting.
  */
-- 
cgit v1.2.3


From 8bff9a04ca33476213ea6155850505787c540c25 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 30 Mar 2023 19:17:54 +0000
Subject: cgroup: rename cgroup_rstat_flush_"irqsafe" to "atomic"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "memcg: avoid flushing stats atomically where possible", v3.

rstat flushing is an expensive operation that scales with the number of
cpus and the number of cgroups in the system.  The purpose of this series
is to minimize the contexts where we flush stats atomically.

Patches 1 and 2 are cleanups requested during reviews of prior versions of
this series.

Patch 3 makes sure we never try to flush from within an irq context.

Patches 4 to 7 introduce separate variants of mem_cgroup_flush_stats() for
atomic and non-atomic flushing, and make sure we only flush the stats
atomically when necessary.

Patch 8 is a slightly tangential optimization that limits the work done by
rstat flushing in some scenarios.


This patch (of 8):

cgroup_rstat_flush_irqsafe() can be a confusing name.  It may read as
"irqs are disabled throughout", which is what the current implementation
does (currently under discussion [1]), but is not the intention.  The
intention is that this function is safe to call from atomic contexts.
Name it as such.

Link: https://lkml.kernel.org/r/20230330191801.1967435-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20230330191801.1967435-2-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup.h | 2 +-
 kernel/cgroup/rstat.c  | 4 ++--
 mm/memcontrol.c        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3410aecffdb4..885f5395fcd0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -692,7 +692,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
  */
 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
 void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
+void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
 void cgroup_rstat_flush_hold(struct cgroup *cgrp);
 void cgroup_rstat_flush_release(void);
 
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 831f1f472bb8..d3252b0416b6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -241,12 +241,12 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
  * @cgrp: target cgroup
  *
  * This function can be called from any context.
  */
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
 {
 	unsigned long flags;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 681e7528a714..b7c3eab6b4fe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -643,7 +643,7 @@ static void __mem_cgroup_flush_stats(void)
 		return;
 
 	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
-	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+	cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
 	atomic_set(&stats_flush_threshold, 0);
 	spin_unlock_irqrestore(&stats_flush_lock, flag);
 }
-- 
cgit v1.2.3


From 92fbbc7202ac4fb16250dc91c88211814ae2190b Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 30 Mar 2023 19:17:55 +0000
Subject: memcg: rename mem_cgroup_flush_stats_"delayed" to "ratelimited"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mem_cgroup_flush_stats_delayed() suggests his is using a delayed_work, but
this is actually sometimes flushing directly from the callsite.

What it's doing is ratelimited calls.  A better name would be
mem_cgroup_flush_stats_ratelimited().

Link: https://lkml.kernel.org/r/20230330191801.1967435-3-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++--
 mm/memcontrol.c            | 2 +-
 mm/workingset.c            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index aa69ea98e2d8..00a88cf947e1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1038,7 +1038,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_delayed(void);
+void mem_cgroup_flush_stats_ratelimited(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1536,7 +1536,7 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }
 
-static inline void mem_cgroup_flush_stats_delayed(void)
+static inline void mem_cgroup_flush_stats_ratelimited(void)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b7c3eab6b4fe..a01f062e7007 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -654,7 +654,7 @@ void mem_cgroup_flush_stats(void)
 		__mem_cgroup_flush_stats();
 }
 
-void mem_cgroup_flush_stats_delayed(void)
+void mem_cgroup_flush_stats_ratelimited(void)
 {
 	if (time_after64(jiffies_64, flush_next_time))
 		mem_cgroup_flush_stats();
diff --git a/mm/workingset.c b/mm/workingset.c
index 00c6f4d9d9be..af862c6738c3 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -462,7 +462,7 @@ void workingset_refault(struct folio *folio, void *shadow)
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-	mem_cgroup_flush_stats_delayed();
+	mem_cgroup_flush_stats_ratelimited();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
-- 
cgit v1.2.3


From 9fad9aee1f267a8ad1f86b87ae70b2c4d6796164 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 30 Mar 2023 19:17:58 +0000
Subject: memcg: sleep during flushing stats in safe contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, all contexts that flush memcg stats do so with sleeping not
allowed.  Some of these contexts are perfectly safe to sleep in, such as
reading cgroup files from userspace or the background periodic flusher.
Flushing is an expensive operation that scales with the number of cpus and
the number of cgroups in the system, so avoid doing it atomically where
possible.

Refactor the code to make mem_cgroup_flush_stats() non-atomic (aka
sleepable), and provide a separate atomic version.  The atomic version is
used in reclaim, refault, writeback, and in mem_cgroup_usage().  All other
code paths are left to use the non-atomic version.  This includes
callbacks for userspace reads and the periodic flusher.

Since refault is the only caller of mem_cgroup_flush_stats_ratelimited(),
change it to mem_cgroup_flush_stats_atomic_ratelimited().  Reclaim and
refault code paths are modified to do non-atomic flushing in separate
later patches -- so it will eventually be changed back to
mem_cgroup_flush_stats_ratelimited().

Link: https://lkml.kernel.org/r/20230330191801.1967435-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  9 +++++++--
 mm/memcontrol.c            | 45 ++++++++++++++++++++++++++++++++++++---------
 mm/vmscan.c                |  2 +-
 mm/workingset.c            |  2 +-
 4 files changed, 45 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 00a88cf947e1..3db355e6677f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1038,7 +1038,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_ratelimited(void);
+void mem_cgroup_flush_stats_atomic(void);
+void mem_cgroup_flush_stats_atomic_ratelimited(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1536,7 +1537,11 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }
 
-static inline void mem_cgroup_flush_stats_ratelimited(void)
+static inline void mem_cgroup_flush_stats_atomic(void)
+{
+}
+
+static inline void mem_cgroup_flush_stats_atomic_ratelimited(void)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a9f18456a16b..06786dea50b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -635,7 +635,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	}
 }
 
-static void __mem_cgroup_flush_stats(void)
+static void do_flush_stats(bool atomic)
 {
 	/*
 	 * We always flush the entire tree, so concurrent flushers can just
@@ -647,26 +647,46 @@ static void __mem_cgroup_flush_stats(void)
 		return;
 
 	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
-	cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+
+	if (atomic)
+		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+	else
+		cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+
 	atomic_set(&stats_flush_threshold, 0);
 	atomic_set(&stats_flush_ongoing, 0);
 }
 
+static bool should_flush_stats(void)
+{
+	return atomic_read(&stats_flush_threshold) > num_online_cpus();
+}
+
 void mem_cgroup_flush_stats(void)
 {
-	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
-		__mem_cgroup_flush_stats();
+	if (should_flush_stats())
+		do_flush_stats(false);
 }
 
-void mem_cgroup_flush_stats_ratelimited(void)
+void mem_cgroup_flush_stats_atomic(void)
+{
+	if (should_flush_stats())
+		do_flush_stats(true);
+}
+
+void mem_cgroup_flush_stats_atomic_ratelimited(void)
 {
 	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
-		mem_cgroup_flush_stats();
+		mem_cgroup_flush_stats_atomic();
 }
 
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
-	__mem_cgroup_flush_stats();
+	/*
+	 * Always flush here so that flushing in latency-sensitive paths is
+	 * as cheap as possible.
+	 */
+	do_flush_stats(false);
 	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
@@ -3686,9 +3706,12 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		 * done from irq context; use stale stats in this case.
 		 * Arguably, usage threshold events are not reliable on the root
 		 * memcg anyway since its usage is ill-defined.
+		 *
+		 * Additionally, other call paths through memcg_check_events()
+		 * disable irqs, so make sure we are flushing stats atomically.
 		 */
 		if (in_task())
-			mem_cgroup_flush_stats();
+			mem_cgroup_flush_stats_atomic();
 		val = memcg_page_state(memcg, NR_FILE_PAGES) +
 			memcg_page_state(memcg, NR_ANON_MAPPED);
 		if (swap)
@@ -4611,7 +4634,11 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	mem_cgroup_flush_stats();
+	/*
+	 * wb_writeback() takes a spinlock and calls
+	 * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
+	 */
+	mem_cgroup_flush_stats_atomic();
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98719e72b5e2..c68db50ec109 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2861,7 +2861,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 	 * lruvec stats for heuristics.
 	 */
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_atomic();
 
 	/*
 	 * Determine the scan balance between anon and file LRUs.
diff --git a/mm/workingset.c b/mm/workingset.c
index af862c6738c3..dab0c362b9e3 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -462,7 +462,7 @@ void workingset_refault(struct folio *folio, void *shadow)
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-	mem_cgroup_flush_stats_ratelimited();
+	mem_cgroup_flush_stats_atomic_ratelimited();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
-- 
cgit v1.2.3


From 4009b2f1887036d30637bc06dd0ade7e18408bb3 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 30 Mar 2023 19:17:59 +0000
Subject: workingset: memcg: sleep when flushing stats in workingset_refault()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In workingset_refault(), we call
mem_cgroup_flush_stats_atomic_ratelimited() to read accurate stats within
an RCU read section and with sleeping disallowed.  Move the call above the
RCU read section to make it non-atomic.

Flushing is an expensive operation that scales with the number of cpus and
the number of cgroups in the system, so avoid doing it atomically where
possible.

Since workingset_refault() is the only caller of
mem_cgroup_flush_stats_atomic_ratelimited(), just make it non-atomic, and
rename it to mem_cgroup_flush_stats_ratelimited().

Link: https://lkml.kernel.org/r/20230330191801.1967435-7-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++--
 mm/memcontrol.c            | 4 ++--
 mm/workingset.c            | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3db355e6677f..222d7370134c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1039,7 +1039,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 
 void mem_cgroup_flush_stats(void);
 void mem_cgroup_flush_stats_atomic(void);
-void mem_cgroup_flush_stats_atomic_ratelimited(void);
+void mem_cgroup_flush_stats_ratelimited(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1541,7 +1541,7 @@ static inline void mem_cgroup_flush_stats_atomic(void)
 {
 }
 
-static inline void mem_cgroup_flush_stats_atomic_ratelimited(void)
+static inline void mem_cgroup_flush_stats_ratelimited(void)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06786dea50b8..e9e79ceb9a11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -674,10 +674,10 @@ void mem_cgroup_flush_stats_atomic(void)
 		do_flush_stats(true);
 }
 
-void mem_cgroup_flush_stats_atomic_ratelimited(void)
+void mem_cgroup_flush_stats_ratelimited(void)
 {
 	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
-		mem_cgroup_flush_stats_atomic();
+		mem_cgroup_flush_stats();
 }
 
 static void flush_memcg_stats_dwork(struct work_struct *w)
diff --git a/mm/workingset.c b/mm/workingset.c
index dab0c362b9e3..3025beee9b34 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -406,6 +406,9 @@ void workingset_refault(struct folio *folio, void *shadow)
 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
 	eviction <<= bucket_order;
 
+	/* Flush stats (and potentially sleep) before holding RCU read lock */
+	mem_cgroup_flush_stats_ratelimited();
+
 	rcu_read_lock();
 	/*
 	 * Look up the memcg associated with the stored ID. It might
@@ -461,8 +464,6 @@ void workingset_refault(struct folio *folio, void *shadow)
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
-
-	mem_cgroup_flush_stats_atomic_ratelimited();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
-- 
cgit v1.2.3


From 98c76c9f1ef7599b39bfd4bd99b8a760d4a8cd3b Mon Sep 17 00:00:00 2001
From: Jiaqi Yan <jiaqiyan@google.com>
Date: Wed, 29 Mar 2023 08:11:19 -0700
Subject: mm/khugepaged: recover from poisoned anonymous memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem
=======
Memory DIMMs are subject to multi-bit flips, i.e.  memory errors.  As
memory size and density increase, the chances of and number of memory
errors increase.  The increasing size and density of server RAM in the
data center and cloud have shown increased uncorrectable memory errors.
There are already mechanisms in the kernel to recover from uncorrectable
memory errors.  This series of patches provides the recovery mechanism for
the particular kernel agent khugepaged when it collapses memory pages.

Impact
======
The main reason we chose to make khugepaged collapsing tolerant of memory
failures was its high possibility of accessing poisoned memory while
performing functionally optional compaction actions.  Standard
applications typically don't have strict requirements on the size of its
pages.  So they are given 4K pages by the kernel.  The kernel is able to
improve application performance by either

  1) giving applications 2M pages to begin with, or
  2) collapsing 4K pages into 2M pages when possible.

This collapsing operation is done by khugepaged, a kernel agent that is
constantly scanning memory.  When collapsing 4K pages into a 2M page, it
must copy the data from the 4K pages into a physically contiguous 2M page.
Therefore, as long as there exists one poisoned cache line in collapsible
4K pages, khugepaged will eventually access it.  The current impact to
users is a machine check exception triggered kernel panic.  However,
khugepaged’s compaction operations are not functionally required kernel
actions.  Therefore making khugepaged tolerant to poisoned memory will
greatly improve user experience.

This patch series is for cases where khugepaged is the first guy that
detects the memory errors on the poisoned pages.  IOW, the pages are not
known to have memory errors when khugepaged collapsing gets to them.  In
our observation, this happens frequently when the huge page ratio of the
system is relatively low, which is fairly common in virtual machines
running on cloud.

Solution
========
As stated before, it is less desirable to crash the system only because
khugepaged accesses poisoned pages while it is collapsing 4K pages.  The
high level idea of this patch series is to skip the group of pages
(usually 512 4K-size pages) once khugepaged finds one of them is poisoned,
as these pages have become ineligible to be collapsed.

We are also careful to unwind operations khuagepaged has performed before
it detects memory failures.  For example, before copying and collapsing a
group of anonymous pages into a huge page, the source pages will be
isolated and their page table is unlinked from their PMD.  These
operations need to be undone in order to ensure these pages are not
changed/lost from the perspective of other threads (both user and kernel
space).  As for file backed memory pages, there already exists a rollback
case.  This patch just extends it so that khugepaged also correctly rolls
back when it fails to copy poisoned 4K pages.


This patch (of 3):

Make __collapse_huge_page_copy return whether copying anonymous pages
succeeded, and make collapse_huge_page handle the return status.

Break existing PTE scan loop into two for-loops.  The first loop copies
source pages into target huge page, and can fail gracefully when running
into memory errors in source pages.  If copying all pages succeeds, the
second loop releases and clears up these normal pages.  Otherwise, the
second loop rolls back the page table and page states by:

- re-establishing the original PTEs-to-PMD connection.
- releasing source pages back to their LRU list.

Tested manually:
0. Enable khugepaged on system under test.
1. Start a two-thread application. Each thread allocates a chunk of
   non-huge anonymous memory buffer.
2. Pick 4 random buffer locations (2 in each thread) and inject
   uncorrectable memory errors at corresponding physical addresses.
3. Signal both threads to make their memory buffer collapsible, i.e.
   calling madvise(MADV_HUGEPAGE).
4. Wait and check kernel log: khugepaged is able to recover from poisoned
   pages and skips collapsing them.
5. Signal both threads to inspect their buffer contents and make sure no
   data corruption.

Link: https://lkml.kernel.org/r/20230329151121.949896-1-jiaqiyan@google.com
Link: https://lkml.kernel.org/r/20230329151121.949896-2-jiaqiyan@google.com
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
Cc: David Stevens <stevensd@chromium.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Tong Tiangen <tongtiangen@huawei.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |   3 +-
 mm/khugepaged.c                    | 112 ++++++++++++++++++++++++++++++++-----
 2 files changed, 101 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index c84c7af70158..eca4c6f3625e 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -37,7 +37,8 @@
 	EM( SCAN_CGROUP_CHARGE_FAIL,	"ccgroup_charge_failed")	\
 	EM( SCAN_TRUNCATED,		"truncated")			\
 	EM( SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
-	EMe(SCAN_STORE_FAILED,		"store_failed")
+	EM( SCAN_STORE_FAILED,		"store_failed")			\
+	EMe(SCAN_COPY_MC,		"copy_poisoned_page")
 
 #undef EM
 #undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3b61cd188f7b..c66933d8a8b8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -56,6 +56,7 @@ enum scan_result {
 	SCAN_TRUNCATED,
 	SCAN_PAGE_HAS_PRIVATE,
 	SCAN_STORE_FAILED,
+	SCAN_COPY_MC,
 };
 
 #define CREATE_TRACE_POINTS
@@ -686,20 +687,21 @@ out:
 	return result;
 }
 
-static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
-				      struct vm_area_struct *vma,
-				      unsigned long address,
-				      spinlock_t *ptl,
-				      struct list_head *compound_pagelist)
+static void __collapse_huge_page_copy_succeeded(pte_t *pte,
+						struct vm_area_struct *vma,
+						unsigned long address,
+						spinlock_t *ptl,
+						struct list_head *compound_pagelist)
 {
-	struct page *src_page, *tmp;
+	struct page *src_page;
+	struct page *tmp;
 	pte_t *_pte;
-	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
-				_pte++, page++, address += PAGE_SIZE) {
-		pte_t pteval = *_pte;
+	pte_t pteval;
 
+	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+	     _pte++, address += PAGE_SIZE) {
+		pteval = *_pte;
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-			clear_user_highpage(page, address);
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
 			if (is_zero_pfn(pte_pfn(pteval))) {
 				/*
@@ -711,7 +713,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 			}
 		} else {
 			src_page = pte_page(pteval);
-			copy_user_highpage(page, src_page, address, vma);
 			if (!PageCompound(src_page))
 				release_pte_page(src_page);
 			/*
@@ -738,6 +739,87 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 	}
 }
 
+static void __collapse_huge_page_copy_failed(pte_t *pte,
+					     pmd_t *pmd,
+					     pmd_t orig_pmd,
+					     struct vm_area_struct *vma,
+					     struct list_head *compound_pagelist)
+{
+	spinlock_t *pmd_ptl;
+
+	/*
+	 * Re-establish the PMD to point to the original page table
+	 * entry. Restoring PMD needs to be done prior to releasing
+	 * pages. Since pages are still isolated and locked here,
+	 * acquiring anon_vma_lock_write is unnecessary.
+	 */
+	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
+	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
+	spin_unlock(pmd_ptl);
+	/*
+	 * Release both raw and compound pages isolated
+	 * in __collapse_huge_page_isolate.
+	 */
+	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+}
+
+/*
+ * __collapse_huge_page_copy - attempts to copy memory contents from raw
+ * pages to a hugepage. Cleans up the raw pages if copying succeeds;
+ * otherwise restores the original page table and releases isolated raw pages.
+ * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
+ *
+ * @pte: starting of the PTEs to copy from
+ * @page: the new hugepage to copy contents to
+ * @pmd: pointer to the new hugepage's PMD
+ * @orig_pmd: the original raw pages' PMD
+ * @vma: the original raw pages' virtual memory area
+ * @address: starting address to copy
+ * @ptl: lock on raw pages' PTEs
+ * @compound_pagelist: list that stores compound pages
+ */
+static int __collapse_huge_page_copy(pte_t *pte,
+				     struct page *page,
+				     pmd_t *pmd,
+				     pmd_t orig_pmd,
+				     struct vm_area_struct *vma,
+				     unsigned long address,
+				     spinlock_t *ptl,
+				     struct list_head *compound_pagelist)
+{
+	struct page *src_page;
+	pte_t *_pte;
+	pte_t pteval;
+	unsigned long _address;
+	int result = SCAN_SUCCEED;
+
+	/*
+	 * Copying pages' contents is subject to memory poison at any iteration.
+	 */
+	for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
+	     _pte++, page++, _address += PAGE_SIZE) {
+		pteval = *_pte;
+		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+			clear_user_highpage(page, _address);
+			continue;
+		}
+		src_page = pte_page(pteval);
+		if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
+			result = SCAN_COPY_MC;
+			break;
+		}
+	}
+
+	if (likely(result == SCAN_SUCCEED))
+		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
+						    compound_pagelist);
+	else
+		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
+						 compound_pagelist);
+
+	return result;
+}
+
 static void khugepaged_alloc_sleep(void)
 {
 	DEFINE_WAIT(wait);
@@ -1111,9 +1193,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 */
 	anon_vma_unlock_write(vma->anon_vma);
 
-	__collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
-				  &compound_pagelist);
+	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
+					   vma, address, pte_ptl,
+					   &compound_pagelist);
 	pte_unmap(pte);
+	if (unlikely(result != SCAN_SUCCEED))
+		goto out_up_write;
+
 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), but
 	 * the smp_wmb() inside __SetPageUptodate() can be reused to
-- 
cgit v1.2.3


From 6efc7afb5cc98488410d44695685d003d832534d Mon Sep 17 00:00:00 2001
From: Jiaqi Yan <jiaqiyan@google.com>
Date: Wed, 29 Mar 2023 08:11:20 -0700
Subject: mm/hwpoison: introduce copy_mc_highpage

Similar to how copy_mc_user_highpage is implemented for copy_user_highpage
on #MC supported architecture, introduce the #MC handled version of
copy_highpage.

This helper has immediate usage when khugepaged wants to copy file-backed
memory pages and tolerate #MC.

Link: https://lkml.kernel.org/r/20230329151121.949896-3-jiaqiyan@google.com
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: David Stevens <stevensd@chromium.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Tong Tiangen <tongtiangen@huawei.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 54 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 9c7cdaa3de8c..4de1dbcd3ef6 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -315,7 +315,29 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 
 #endif
 
+#ifndef __HAVE_ARCH_COPY_HIGHPAGE
+
+static inline void copy_highpage(struct page *to, struct page *from)
+{
+	char *vfrom, *vto;
+
+	vfrom = kmap_local_page(from);
+	vto = kmap_local_page(to);
+	copy_page(vto, vfrom);
+	kmsan_copy_page_meta(to, from);
+	kunmap_local(vto);
+	kunmap_local(vfrom);
+}
+
+#endif
+
 #ifdef copy_mc_to_kernel
+/*
+ * If architecture supports machine check exception handling, define the
+ * #MC versions of copy_user_highpage and copy_highpage. They copy a memory
+ * page with #MC in source page (@from) handled, and return the number
+ * of bytes not copied if there was a #MC, otherwise 0 for success.
+ */
 static inline int copy_mc_user_highpage(struct page *to, struct page *from,
 					unsigned long vaddr, struct vm_area_struct *vma)
 {
@@ -332,29 +354,35 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from,
 
 	return ret;
 }
-#else
-static inline int copy_mc_user_highpage(struct page *to, struct page *from,
-					unsigned long vaddr, struct vm_area_struct *vma)
-{
-	copy_user_highpage(to, from, vaddr, vma);
-	return 0;
-}
-#endif
 
-#ifndef __HAVE_ARCH_COPY_HIGHPAGE
-
-static inline void copy_highpage(struct page *to, struct page *from)
+static inline int copy_mc_highpage(struct page *to, struct page *from)
 {
+	unsigned long ret;
 	char *vfrom, *vto;
 
 	vfrom = kmap_local_page(from);
 	vto = kmap_local_page(to);
-	copy_page(vto, vfrom);
-	kmsan_copy_page_meta(to, from);
+	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+	if (!ret)
+		kmsan_copy_page_meta(to, from);
 	kunmap_local(vto);
 	kunmap_local(vfrom);
+
+	return ret;
+}
+#else
+static inline int copy_mc_user_highpage(struct page *to, struct page *from,
+					unsigned long vaddr, struct vm_area_struct *vma)
+{
+	copy_user_highpage(to, from, vaddr, vma);
+	return 0;
 }
 
+static inline int copy_mc_highpage(struct page *to, struct page *from)
+{
+	copy_highpage(to, from);
+	return 0;
+}
 #endif
 
 static inline void memcpy_page(struct page *dst_page, size_t dst_off,
-- 
cgit v1.2.3


From ac492b9c70cac4d887e9dce4410b1d521851e142 Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Tue, 4 Apr 2023 21:01:16 +0900
Subject: mm/khugepaged: skip shmem with userfaultfd

Make sure that collapse_file respects any userfaultfds registered with
MODE_MISSING.  If userspace has any such userfaultfds registered, then for
any page which it knows to be missing, it may expect a
UFFD_EVENT_PAGEFAULT.  This means collapse_file needs to be careful when
collapsing a shmem range would result in replacing an empty page with a
THP, to avoid breaking userfaultfd.

Synchronization when checking for userfaultfds in collapse_file is tricky
because the mmap locks can't be used to prevent races with the
registration of new userfaultfds.  Instead, we provide synchronization by
ensuring that userspace cannot observe the fact that pages are missing
before we check for userfaultfds.  Although this allows registration of a
userfaultfd to race with collapse_file, it ensures that userspace cannot
observe any pages transition from missing to present after such a race
occurs.  This makes such a race indistinguishable to the collapse
occurring immediately before the userfaultfd registration.

The first step to provide this synchronization is to stop filling gaps
during the loop iterating over the target range, since the page cache lock
can be dropped during that loop.  The second step is to fill the gaps with
XA_RETRY_ENTRY after the page cache lock is acquired the final time, to
avoid races with accesses to the page cache that only take the RCU read
lock.

The fact that we don't fill holes during the initial iteration means that
collapse_file now has to handle faults occurring during the collapse.
This is done by re-validating the number of missing pages after acquiring
the page cache lock for the final time.

This fix is targeted at khugepaged, but the change also applies to
MADV_COLLAPSE.  MADV_COLLAPSE on a range with a userfaultfd will now
return EBUSY if there are any missing pages (instead of succeeding on
shmem and returning EINVAL on anonymous memory).  There is also now a
window during MADV_COLLAPSE where a fault on a missing page will cause the
syscall to fail with EAGAIN.

The fact that intermediate page cache state can no longer be observed
before the rollback of a failed collapse is also technically a
userspace-visible change (via at least SEEK_DATA and SEEK_END), but it is
exceedingly unlikely that anything relies on being able to observe that
transient state.

Link: https://lkml.kernel.org/r/20230404120117.2562166-4-stevensd@google.com
Signed-off-by: David Stevens <stevensd@chromium.org>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |  3 +-
 mm/khugepaged.c                    | 99 ++++++++++++++++++++++++++++++--------
 2 files changed, 81 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index eca4c6f3625e..6e2ef1d4b002 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -38,7 +38,8 @@
 	EM( SCAN_TRUNCATED,		"truncated")			\
 	EM( SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
 	EM( SCAN_STORE_FAILED,		"store_failed")			\
-	EMe(SCAN_COPY_MC,		"copy_poisoned_page")
+	EM( SCAN_COPY_MC,		"copy_poisoned_page")		\
+	EMe(SCAN_PAGE_FILLED,		"page_filled")
 
 #undef EM
 #undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 762877539634..434674ca0c8a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -57,6 +57,7 @@ enum scan_result {
 	SCAN_PAGE_HAS_PRIVATE,
 	SCAN_STORE_FAILED,
 	SCAN_COPY_MC,
+	SCAN_PAGE_FILLED,
 };
 
 #define CREATE_TRACE_POINTS
@@ -1860,8 +1861,8 @@ next:
  *  - allocate and lock a new huge page;
  *  - scan page cache replacing old pages with the new one
  *    + swap/gup in pages if necessary;
- *    + fill in gaps;
  *    + keep old pages around in case rollback is required;
+ *  - finalize updates to the page cache;
  *  - if replacing succeeds:
  *    + copy data over;
  *    + free old pages;
@@ -1939,7 +1940,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 						result = SCAN_TRUNCATED;
 						goto xa_locked;
 					}
-					xas_set(&xas, index);
+					xas_set(&xas, index + 1);
 				}
 				if (!shmem_charge(mapping->host, 1)) {
 					result = SCAN_FAIL;
@@ -2176,22 +2177,66 @@ xa_unlocked:
 		index++;
 	}
 
-	/*
-	 * Copying old pages to huge one has succeeded, now we
-	 * need to free the old pages.
-	 */
-	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
-		list_del(&page->lru);
-		page->mapping = NULL;
-		page_ref_unfreeze(page, 1);
-		ClearPageActive(page);
-		ClearPageUnevictable(page);
-		unlock_page(page);
-		put_page(page);
+	if (nr_none) {
+		struct vm_area_struct *vma;
+		int nr_none_check = 0;
+
+		i_mmap_lock_read(mapping);
+		xas_lock_irq(&xas);
+
+		xas_set(&xas, start);
+		for (index = start; index < end; index++) {
+			if (!xas_next(&xas)) {
+				xas_store(&xas, XA_RETRY_ENTRY);
+				if (xas_error(&xas)) {
+					result = SCAN_STORE_FAILED;
+					goto immap_locked;
+				}
+				nr_none_check++;
+			}
+		}
+
+		if (nr_none != nr_none_check) {
+			result = SCAN_PAGE_FILLED;
+			goto immap_locked;
+		}
+
+		/*
+		 * If userspace observed a missing page in a VMA with a MODE_MISSING
+		 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
+		 * page. If so, we need to roll back to avoid suppressing such an
+		 * event. Since wp/minor userfaultfds don't give userspace any
+		 * guarantees that the kernel doesn't fill a missing page with a zero
+		 * page, so they don't matter here.
+		 *
+		 * Any userfaultfds registered after this point will not be able to
+		 * observe any missing pages due to the previously inserted retry
+		 * entries.
+		 */
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+			if (userfaultfd_missing(vma)) {
+				result = SCAN_EXCEED_NONE_PTE;
+				goto immap_locked;
+			}
+		}
+
+immap_locked:
+		i_mmap_unlock_read(mapping);
+		if (result != SCAN_SUCCEED) {
+			xas_set(&xas, start);
+			for (index = start; index < end; index++) {
+				if (xas_next(&xas) == XA_RETRY_ENTRY)
+					xas_store(&xas, NULL);
+			}
+
+			xas_unlock_irq(&xas);
+			goto rollback;
+		}
+	} else {
+		xas_lock_irq(&xas);
 	}
 
 	nr = thp_nr_pages(hpage);
-	xas_lock_irq(&xas);
 	if (is_shmem)
 		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
 	else
@@ -2221,6 +2266,20 @@ xa_unlocked:
 	result = retract_page_tables(mapping, start, mm, addr, hpage,
 				     cc);
 	unlock_page(hpage);
+
+	/*
+	 * The collapse has succeeded, so free the old pages.
+	 */
+	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+		list_del(&page->lru);
+		page->mapping = NULL;
+		page_ref_unfreeze(page, 1);
+		ClearPageActive(page);
+		ClearPageUnevictable(page);
+		unlock_page(page);
+		put_page(page);
+	}
+
 	goto out;
 
 rollback:
@@ -2232,15 +2291,13 @@ rollback:
 	}
 
 	xas_set(&xas, start);
-	xas_for_each(&xas, page, end - 1) {
+	end = index;
+	for (index = start; index < end; index++) {
+		xas_next(&xas);
 		page = list_first_entry_or_null(&pagelist,
 				struct page, lru);
 		if (!page || xas.xa_index < page->index) {
-			if (!nr_none)
-				break;
 			nr_none--;
-			/* Put holes back where they were */
-			xas_store(&xas, NULL);
 			continue;
 		}
 
@@ -2764,12 +2821,14 @@ static int madvise_collapse_errno(enum scan_result r)
 	case SCAN_ALLOC_HUGE_PAGE_FAIL:
 		return -ENOMEM;
 	case SCAN_CGROUP_CHARGE_FAIL:
+	case SCAN_EXCEED_NONE_PTE:
 		return -EBUSY;
 	/* Resource temporary unavailable - trying again might succeed */
 	case SCAN_PAGE_COUNT:
 	case SCAN_PAGE_LOCK:
 	case SCAN_PAGE_LRU:
 	case SCAN_DEL_PAGE_LRU:
+	case SCAN_PAGE_FILLED:
 		return -EAGAIN;
 	/*
 	 * Other: Trying again likely not to succeed / error intrinsic to
-- 
cgit v1.2.3


From ddc65971bb677aa9f6a4c21f76d3133e106f88eb Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Tue, 4 Apr 2023 21:31:48 +0900
Subject: prctl: add PR_GET_AUXV to copy auxv to userspace

If a library wants to get information from auxv (for instance,
AT_HWCAP/AT_HWCAP2), it has a few options, none of them perfectly reliable
or ideal:

- Be main or the pre-main startup code, and grub through the stack above
  main. Doesn't work for a library.
- Call libc getauxval. Not ideal for libraries that are trying to be
  libc-independent and/or don't otherwise require anything from other
  libraries.
- Open and read /proc/self/auxv. Doesn't work for libraries that may run
  in arbitrarily constrained environments that may not have /proc
  mounted (e.g. libraries that might be used by an init program or a
  container setup tool).
- Assume you're on the main thread and still on the original stack, and
  try to walk the stack upwards, hoping to find auxv. Extremely bad
  idea.
- Ask the caller to pass auxv in for you. Not ideal for a user-friendly
  library, and then your caller may have the same problem.

Add a prctl that copies current->mm->saved_auxv to a userspace buffer.

Link: https://lkml.kernel.org/r/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/prctl.h |  2 ++
 kernel/sys.c               | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1312a137f7fb..b99c0be72577 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -290,4 +290,6 @@ struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+#define PR_GET_AUXV			0x41555856
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 351de7916302..26c1399e0654 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2388,6 +2388,16 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
 		PR_MDWE_REFUSE_EXEC_GAIN : 0;
 }
 
+static int prctl_get_auxv(void __user *addr, unsigned long len)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len);
+
+	if (size && copy_to_user(addr, mm->saved_auxv, size))
+		return -EFAULT;
+	return sizeof(mm->saved_auxv);
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2518,6 +2528,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				return -EINVAL;
 			break;
+	case PR_GET_AUXV:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = prctl_get_auxv((void __user *)arg2, arg3);
+		break;
 		default:
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From b4aca54792e76556bb9f8661bab07b4bf2eb4e5f Mon Sep 17 00:00:00 2001
From: Steven Price <steven.price@arm.com>
Date: Wed, 5 Apr 2023 11:38:19 +0100
Subject: smaps: fix defined but not used smaps_shmem_walk_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When !CONFIG_SHMEM smaps_shmem_walk_ops is defined but not used,
triggering a compiler warning.  To avoid the warning remove the #ifdef
around the usage.  This has no effect because shmem_mapping() is a stub
returning false when !CONFIG_SHMEM so the code will be compiled out,
however we now need to also provide a stub for shmem_swap_usage().

Link: https://lkml.kernel.org/r/20230405103819.151246-1-steven.price@arm.com
Fixes: 7b86ac3371b7 ("pagewalk: separate function pointers from iterator data")
Signed-off-by: Steven Price <steven.price@arm.com>
Reported-by: kernel test robot <lkp@intel.com>
  Link: https://lore.kernel.org/oe-kbuild-all/202304031749.UiyJpxzF-lkp@intel.com/
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c       | 3 +--
 include/linux/shmem_fs.h | 7 +++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..cb49479acd2e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 	if (start >= vma->vm_end)
 		return;
 
-#ifdef CONFIG_SHMEM
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 		/*
 		 * For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 			ops = &smaps_shmem_walk_ops;
 		}
 	}
-#endif
+
 	/* mmap_lock is held in m_start */
 	if (!start)
 		walk_page_vma(vma, ops, mss);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 3bb8d21edbb3..b9516a9802bf 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -96,7 +96,14 @@ int shmem_unuse(unsigned int type);
 
 extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
 			  struct mm_struct *mm, unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
+#else
+static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+	return 0;
+}
+#endif
 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 						pgoff_t start, pgoff_t end);
 
-- 
cgit v1.2.3


From e87340ca5c9cecc8a11daf1a2dcabf23f06a4e10 Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Mon, 10 Apr 2023 21:39:29 +0800
Subject: userfaultfd: convert copy_huge_page_from_user() to
 copy_folio_from_user()

Replace copy_huge_page_from_user() with copy_folio_from_user().
copy_folio_from_user() does the same as copy_huge_page_from_user(), but
takes in a folio instead of a page.

Convert page_kaddr to kaddr in copy_folio_from_user() to do indenting
cleanup.

Link: https://lkml.kernel.org/r/20230410133932.32288-4-zhangpeng362@huawei.com
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  7 +++----
 mm/hugetlb.c       |  5 ++---
 mm/memory.c        | 23 +++++++++++------------
 mm/userfaultfd.c   |  6 ++----
 4 files changed, 18 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5e5ef6ee4003..01a009efc2cb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3681,10 +3681,9 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
 				unsigned long addr_hint,
 				struct vm_area_struct *vma,
 				unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
-				const void __user *usr_src,
-				unsigned int pages_per_huge_page,
-				bool allow_pagefault);
+long copy_folio_from_user(struct folio *dst_folio,
+			   const void __user *usr_src,
+			   bool allow_pagefault);
 
 /**
  * vma_is_special_huge - Are transhuge page-table entries considered special?
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7e4a80769c9e..aade1b513474 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6217,9 +6217,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			goto out;
 		}
 
-		ret = copy_huge_page_from_user(&folio->page,
-						(const void __user *) src_addr,
-						pages_per_huge_page(h), false);
+		ret = copy_folio_from_user(folio, (const void __user *) src_addr,
+					   false);
 
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
diff --git a/mm/memory.c b/mm/memory.c
index 808f354bce65..021cab989703 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5868,26 +5868,25 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
 }
 
-long copy_huge_page_from_user(struct page *dst_page,
-				const void __user *usr_src,
-				unsigned int pages_per_huge_page,
-				bool allow_pagefault)
+long copy_folio_from_user(struct folio *dst_folio,
+			   const void __user *usr_src,
+			   bool allow_pagefault)
 {
-	void *page_kaddr;
+	void *kaddr;
 	unsigned long i, rc = 0;
-	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+	unsigned int nr_pages = folio_nr_pages(dst_folio);
+	unsigned long ret_val = nr_pages * PAGE_SIZE;
 	struct page *subpage;
 
-	for (i = 0; i < pages_per_huge_page; i++) {
-		subpage = nth_page(dst_page, i);
-		page_kaddr = kmap_local_page(subpage);
+	for (i = 0; i < nr_pages; i++) {
+		subpage = folio_page(dst_folio, i);
+		kaddr = kmap_local_page(subpage);
 		if (!allow_pagefault)
 			pagefault_disable();
-		rc = copy_from_user(page_kaddr,
-				usr_src + i * PAGE_SIZE, PAGE_SIZE);
+		rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
 		if (!allow_pagefault)
 			pagefault_enable();
-		kunmap_local(page_kaddr);
+		kunmap_local(kaddr);
 
 		ret_val -= (PAGE_SIZE - rc);
 		if (rc)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 313bc683c2b6..1e7dba6c4c5f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -421,10 +421,8 @@ retry:
 			mmap_read_unlock(dst_mm);
 			BUG_ON(!page);
 
-			err = copy_huge_page_from_user(page,
-						(const void __user *)src_addr,
-						vma_hpagesize / PAGE_SIZE,
-						true);
+			err = copy_folio_from_user(page_folio(page),
+						   (const void __user *)src_addr, true);
 			if (unlikely(err)) {
 				err = -EFAULT;
 				goto out;
-- 
cgit v1.2.3


From 0169fd518a8934d8d723659752b07589ecc9f692 Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Mon, 10 Apr 2023 21:39:30 +0800
Subject: userfaultfd: convert mfill_atomic_hugetlb() to use a folio

Convert hugetlb_mfill_atomic_pte() to take in a folio pointer instead of
a page pointer.

Convert mfill_atomic_hugetlb() to use a folio.

Link: https://lkml.kernel.org/r/20230410133932.32288-5-zhangpeng362@huawei.com
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 26 +++++++++++++-------------
 mm/userfaultfd.c        | 16 ++++++++--------
 3 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2a758bcd6719..28703fe22386 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -163,7 +163,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
 			     uffd_flags_t flags,
-			     struct page **pagep);
+			     struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
@@ -397,7 +397,7 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 					   unsigned long dst_addr,
 					   unsigned long src_addr,
 					   uffd_flags_t flags,
-					   struct page **pagep)
+					   struct folio **foliop)
 {
 	BUG();
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aade1b513474..c88f856ec2e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6178,7 +6178,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     unsigned long dst_addr,
 			     unsigned long src_addr,
 			     uffd_flags_t flags,
-			     struct page **pagep)
+			     struct folio **foliop)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
@@ -6201,8 +6201,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 		if (IS_ERR(folio))
 			goto out;
 		folio_in_pagecache = true;
-	} else if (!*pagep) {
-		/* If a page already exists, then it's UFFDIO_COPY for
+	} else if (!*foliop) {
+		/* If a folio already exists, then it's UFFDIO_COPY for
 		 * a non-missing case. Return -EEXIST.
 		 */
 		if (vm_shared &&
@@ -6237,33 +6237,33 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 				ret = -ENOMEM;
 				goto out;
 			}
-			*pagep = &folio->page;
-			/* Set the outparam pagep and return to the caller to
+			*foliop = folio;
+			/* Set the outparam foliop and return to the caller to
 			 * copy the contents outside the lock. Don't free the
-			 * page.
+			 * folio.
 			 */
 			goto out;
 		}
 	} else {
 		if (vm_shared &&
 		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
-			put_page(*pagep);
+			folio_put(*foliop);
 			ret = -EEXIST;
-			*pagep = NULL;
+			*foliop = NULL;
 			goto out;
 		}
 
 		folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
 		if (IS_ERR(folio)) {
-			put_page(*pagep);
+			folio_put(*foliop);
 			ret = -ENOMEM;
-			*pagep = NULL;
+			*foliop = NULL;
 			goto out;
 		}
-		copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma,
+		copy_user_huge_page(&folio->page, &(*foliop)->page, dst_addr, dst_vma,
 				    pages_per_huge_page(h));
-		put_page(*pagep);
-		*pagep = NULL;
+		folio_put(*foliop);
+		*foliop = NULL;
 	}
 
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 1e7dba6c4c5f..2f263afb823d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -321,7 +321,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	pte_t *dst_pte;
 	unsigned long src_addr, dst_addr;
 	long copied;
-	struct page *page;
+	struct folio *folio;
 	unsigned long vma_hpagesize;
 	pgoff_t idx;
 	u32 hash;
@@ -341,7 +341,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
-	page = NULL;
+	folio = NULL;
 	vma_hpagesize = vma_kernel_pagesize(dst_vma);
 
 	/*
@@ -410,7 +410,7 @@ retry:
 		}
 
 		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
-					       src_addr, flags, &page);
+					       src_addr, flags, &folio);
 
 		hugetlb_vma_unlock_read(dst_vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -419,9 +419,9 @@ retry:
 
 		if (unlikely(err == -ENOENT)) {
 			mmap_read_unlock(dst_mm);
-			BUG_ON(!page);
+			BUG_ON(!folio);
 
-			err = copy_folio_from_user(page_folio(page),
+			err = copy_folio_from_user(folio,
 						   (const void __user *)src_addr, true);
 			if (unlikely(err)) {
 				err = -EFAULT;
@@ -432,7 +432,7 @@ retry:
 			dst_vma = NULL;
 			goto retry;
 		} else
-			BUG_ON(page);
+			BUG_ON(folio);
 
 		if (!err) {
 			dst_addr += vma_hpagesize;
@@ -449,8 +449,8 @@ retry:
 out_unlock:
 	mmap_read_unlock(dst_mm);
 out:
-	if (page)
-		put_page(page);
+	if (folio)
+		folio_put(folio);
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);
-- 
cgit v1.2.3


From c0e8150e144b62ae467520d0b51c4707c09e897b Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Mon, 10 Apr 2023 21:39:31 +0800
Subject: mm: convert copy_user_huge_page() to copy_user_large_folio()

Replace copy_user_huge_page() with copy_user_large_folio().
copy_user_large_folio() does the same as copy_user_huge_page(), but takes
in folios instead of pages.  Remove pages_per_huge_page from
copy_user_large_folio(), because we can get that from folio_nr_pages(dst).

Convert copy_user_gigantic_page() to take in folios.

Link: https://lkml.kernel.org/r/20230410133932.32288-6-zhangpeng362@huawei.com
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  7 +++----
 mm/hugetlb.c       | 11 +++++------
 mm/memory.c        | 28 ++++++++++++++--------------
 3 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 01a009efc2cb..5a3eaa9a1f8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3677,10 +3677,9 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
-				unsigned long addr_hint,
-				struct vm_area_struct *vma,
-				unsigned int pages_per_huge_page);
+void copy_user_large_folio(struct folio *dst, struct folio *src,
+			   unsigned long addr_hint,
+			   struct vm_area_struct *vma);
 long copy_folio_from_user(struct folio *dst_folio,
 			   const void __user *usr_src,
 			   bool allow_pagefault);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c88f856ec2e2..f16b25b1a6b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5097,8 +5097,9 @@ again:
 					ret = PTR_ERR(new_folio);
 					break;
 				}
-				copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
-						    npages);
+				copy_user_large_folio(new_folio,
+						      page_folio(ptepage),
+						      addr, dst_vma);
 				put_page(ptepage);
 
 				/* Install the new hugetlb folio if src pte stable */
@@ -5616,8 +5617,7 @@ retry_avoidcopy:
 		goto out_release_all;
 	}
 
-	copy_user_huge_page(&new_folio->page, old_page, address, vma,
-			    pages_per_huge_page(h));
+	copy_user_large_folio(new_folio, page_folio(old_page), address, vma);
 	__folio_mark_uptodate(new_folio);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
@@ -6260,8 +6260,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			*foliop = NULL;
 			goto out;
 		}
-		copy_user_huge_page(&folio->page, &(*foliop)->page, dst_addr, dst_vma,
-				    pages_per_huge_page(h));
+		copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
 		folio_put(*foliop);
 		*foliop = NULL;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 021cab989703..f315c2198098 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5815,21 +5815,21 @@ void clear_huge_page(struct page *page,
 	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
 }
 
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
-				    unsigned long addr,
-				    struct vm_area_struct *vma,
-				    unsigned int pages_per_huge_page)
+static void copy_user_gigantic_page(struct folio *dst, struct folio *src,
+				     unsigned long addr,
+				     struct vm_area_struct *vma,
+				     unsigned int pages_per_huge_page)
 {
 	int i;
-	struct page *dst_base = dst;
-	struct page *src_base = src;
+	struct page *dst_page;
+	struct page *src_page;
 
 	for (i = 0; i < pages_per_huge_page; i++) {
-		dst = nth_page(dst_base, i);
-		src = nth_page(src_base, i);
+		dst_page = folio_page(dst, i);
+		src_page = folio_page(src, i);
 
 		cond_resched();
-		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+		copy_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -5847,15 +5847,15 @@ static void copy_subpage(unsigned long addr, int idx, void *arg)
 			   addr, copy_arg->vma);
 }
 
-void copy_user_huge_page(struct page *dst, struct page *src,
-			 unsigned long addr_hint, struct vm_area_struct *vma,
-			 unsigned int pages_per_huge_page)
+void copy_user_large_folio(struct folio *dst, struct folio *src,
+			   unsigned long addr_hint, struct vm_area_struct *vma)
 {
+	unsigned int pages_per_huge_page = folio_nr_pages(dst);
 	unsigned long addr = addr_hint &
 		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 	struct copy_subpage_arg arg = {
-		.dst = dst,
-		.src = src,
+		.dst = &dst->page,
+		.src = &src->page,
 		.vma = vma,
 	};
 
-- 
cgit v1.2.3


From d7be6d7eee1bbf98671d7a2c95654322241e2ae4 Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Mon, 10 Apr 2023 21:39:32 +0800
Subject: userfaultfd: convert mfill_atomic() to use a folio

Convert mfill_atomic_pte_copy(), shmem_mfill_atomic_pte() and
mfill_atomic_pte() to take in a folio pointer.

Convert mfill_atomic() to use a folio.  Convert page_kaddr to kaddr in
mfill_atomic().

Link: https://lkml.kernel.org/r/20230410133932.32288-7-zhangpeng362@huawei.com
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h |  4 ++--
 mm/shmem.c               | 16 ++++++++--------
 mm/userfaultfd.c         | 40 ++++++++++++++++++++--------------------
 3 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index b9516a9802bf..9029abd29b1c 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -165,10 +165,10 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
 				  uffd_flags_t flags,
-				  struct page **pagep);
+				  struct folio **foliop);
 #else /* !CONFIG_SHMEM */
 #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
-			       src_addr, flags, pagep) ({ BUG(); 0; })
+			       src_addr, flags, foliop) ({ BUG(); 0; })
 #endif /* CONFIG_SHMEM */
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/mm/shmem.c b/mm/shmem.c
index b185c1db3009..3531ebb3494e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2433,7 +2433,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
 			   uffd_flags_t flags,
-			   struct page **pagep)
+			   struct folio **foliop)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2451,14 +2451,14 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 		 * and now we find ourselves with -ENOMEM. Release the page, to
 		 * avoid a BUG_ON in our caller.
 		 */
-		if (unlikely(*pagep)) {
-			put_page(*pagep);
-			*pagep = NULL;
+		if (unlikely(*foliop)) {
+			folio_put(*foliop);
+			*foliop = NULL;
 		}
 		return -ENOMEM;
 	}
 
-	if (!*pagep) {
+	if (!*foliop) {
 		ret = -ENOMEM;
 		folio = shmem_alloc_folio(gfp, info, pgoff);
 		if (!folio)
@@ -2490,7 +2490,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 
 			/* fallback to copy_from_user outside mmap_lock */
 			if (unlikely(ret)) {
-				*pagep = &folio->page;
+				*foliop = folio;
 				ret = -ENOENT;
 				/* don't free the page */
 				goto out_unacct_blocks;
@@ -2501,9 +2501,9 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			clear_user_highpage(&folio->page, dst_addr);
 		}
 	} else {
-		folio = page_folio(*pagep);
+		folio = *foliop;
 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-		*pagep = NULL;
+		*foliop = NULL;
 	}
 
 	VM_BUG_ON(folio_test_locked(folio));
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 2f263afb823d..11cfd82c6726 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -133,13 +133,13 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 				 unsigned long dst_addr,
 				 unsigned long src_addr,
 				 uffd_flags_t flags,
-				 struct page **pagep)
+				 struct folio **foliop)
 {
 	void *kaddr;
 	int ret;
 	struct folio *folio;
 
-	if (!*pagep) {
+	if (!*foliop) {
 		ret = -ENOMEM;
 		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
 					dst_addr, false);
@@ -171,15 +171,15 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
 			ret = -ENOENT;
-			*pagep = &folio->page;
+			*foliop = folio;
 			/* don't free the page */
 			goto out;
 		}
 
 		flush_dcache_folio(folio);
 	} else {
-		folio = page_folio(*pagep);
-		*pagep = NULL;
+		folio = *foliop;
+		*foliop = NULL;
 	}
 
 	/*
@@ -470,7 +470,7 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						uffd_flags_t flags,
-						struct page **pagep)
+						struct folio **foliop)
 {
 	ssize_t err;
 
@@ -493,14 +493,14 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
 			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
 						    dst_addr, src_addr,
-						    flags, pagep);
+						    flags, foliop);
 		else
 			err = mfill_atomic_pte_zeropage(dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
 		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
 					     dst_addr, src_addr,
-					     flags, pagep);
+					     flags, foliop);
 	}
 
 	return err;
@@ -518,7 +518,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	pmd_t *dst_pmd;
 	unsigned long src_addr, dst_addr;
 	long copied;
-	struct page *page;
+	struct folio *folio;
 
 	/*
 	 * Sanitize the command parameters:
@@ -533,7 +533,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
-	page = NULL;
+	folio = NULL;
 retry:
 	mmap_read_lock(dst_mm);
 
@@ -629,28 +629,28 @@ retry:
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
-				       src_addr, flags, &page);
+				       src_addr, flags, &folio);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
-			void *page_kaddr;
+			void *kaddr;
 
 			mmap_read_unlock(dst_mm);
-			BUG_ON(!page);
+			BUG_ON(!folio);
 
-			page_kaddr = kmap_local_page(page);
-			err = copy_from_user(page_kaddr,
+			kaddr = kmap_local_folio(folio, 0);
+			err = copy_from_user(kaddr,
 					     (const void __user *) src_addr,
 					     PAGE_SIZE);
-			kunmap_local(page_kaddr);
+			kunmap_local(kaddr);
 			if (unlikely(err)) {
 				err = -EFAULT;
 				goto out;
 			}
-			flush_dcache_page(page);
+			flush_dcache_folio(folio);
 			goto retry;
 		} else
-			BUG_ON(page);
+			BUG_ON(folio);
 
 		if (!err) {
 			dst_addr += PAGE_SIZE;
@@ -667,8 +667,8 @@ retry:
 out_unlock:
 	mmap_read_unlock(dst_mm);
 out:
-	if (page)
-		put_page(page);
+	if (folio)
+		folio_put(folio);
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);
-- 
cgit v1.2.3


From 87a7ae75d7383afa998f57656d1d14e2a730cc47 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 11 Apr 2023 19:52:13 +0530
Subject: mm/vmemmap/devdax: fix kernel crash when probing devdax devices

commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for
compound devmaps") added support for using optimized vmmemap for devdax
devices.  But how vmemmap mappings are created are architecture specific.
For example, powerpc with hash translation doesn't have vmemmap mappings
in init_mm page table instead they are bolted table entries in the
hardware page table

vmemmap_populate_compound_pages() used by vmemmap optimization code is not
aware of these architecture-specific mapping.  Hence allow architecture to
opt for this feature.  I selected architectures supporting
HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature.

This patch fixes the below crash on ppc64.

BUG: Unable to handle kernel data access on write at 0xc00c000100400038
Faulting instruction address: 0xc000000001269d90
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 7 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc5-150500.34-default+ #2 5c90a668b6bbd142599890245c2fb5de19d7d28a
Hardware name: IBM,9009-42G POWER9 (raw) 0x4e0202 0xf000005 of:IBM,FW950.40 (VL950_099) hv:phyp pSeries
NIP:  c000000001269d90 LR: c0000000004c57d4 CTR: 0000000000000000
REGS: c000000003632c30 TRAP: 0300   Not tainted  (6.3.0-rc5-150500.34-default+)
MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 24842228  XER: 00000000
CFAR: c0000000004c57d0 DAR: c00c000100400038 DSISR: 42000000 IRQMASK: 0
....
NIP [c000000001269d90] __init_single_page.isra.74+0x14/0x4c
LR [c0000000004c57d4] __init_zone_device_page+0x44/0xd0
Call Trace:
[c000000003632ed0] [c000000003632f60] 0xc000000003632f60 (unreliable)
[c000000003632f10] [c0000000004c5ca0] memmap_init_zone_device+0x170/0x250
[c000000003632fe0] [c0000000005575f8] memremap_pages+0x2c8/0x7f0
[c0000000036330c0] [c000000000557b5c] devm_memremap_pages+0x3c/0xa0
[c000000003633100] [c000000000d458a8] dev_dax_probe+0x108/0x3e0
[c0000000036331a0] [c000000000d41430] dax_bus_probe+0xb0/0x140
[c0000000036331d0] [c000000000cef27c] really_probe+0x19c/0x520
[c000000003633260] [c000000000cef6b4] __driver_probe_device+0xb4/0x230
[c0000000036332e0] [c000000000cef888] driver_probe_device+0x58/0x120
[c000000003633320] [c000000000cefa6c] __device_attach_driver+0x11c/0x1e0
[c0000000036333a0] [c000000000cebc58] bus_for_each_drv+0xa8/0x130
[c000000003633400] [c000000000ceefcc] __device_attach+0x15c/0x250
[c0000000036334a0] [c000000000ced458] bus_probe_device+0x108/0x110
[c0000000036334f0] [c000000000ce92dc] device_add+0x7fc/0xa10
[c0000000036335b0] [c000000000d447c8] devm_create_dev_dax+0x1d8/0x530
[c000000003633640] [c000000000d46b60] __dax_pmem_probe+0x200/0x270
[c0000000036337b0] [c000000000d46bf0] dax_pmem_probe+0x20/0x70
[c0000000036337d0] [c000000000d2279c] nvdimm_bus_probe+0xac/0x2b0
[c000000003633860] [c000000000cef27c] really_probe+0x19c/0x520
[c0000000036338f0] [c000000000cef6b4] __driver_probe_device+0xb4/0x230
[c000000003633970] [c000000000cef888] driver_probe_device+0x58/0x120
[c0000000036339b0] [c000000000cefd08] __driver_attach+0x1d8/0x240
[c000000003633a30] [c000000000cebb04] bus_for_each_dev+0xb4/0x130
[c000000003633a90] [c000000000cee564] driver_attach+0x34/0x50
[c000000003633ab0] [c000000000ced878] bus_add_driver+0x218/0x300
[c000000003633b40] [c000000000cf1144] driver_register+0xa4/0x1b0
[c000000003633bb0] [c000000000d21a0c] __nd_driver_register+0x5c/0x100
[c000000003633c10] [c00000000206a2e8] dax_pmem_init+0x34/0x48
[c000000003633c30] [c0000000000132d0] do_one_initcall+0x60/0x320
[c000000003633d00] [c0000000020051b0] kernel_init_freeable+0x360/0x400
[c000000003633de0] [c000000000013764] kernel_init+0x34/0x1d0
[c000000003633e50] [c00000000000de14] ret_from_kernel_thread+0x5c/0x64

Link: https://lkml.kernel.org/r/20230411142214.64464-1-aneesh.kumar@linux.ibm.com
Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reported-by: Tarun Sahu <tsahu@linux.ibm.com>
Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  | 16 ++++++++++++++++
 mm/mm_init.c        | 10 ++++++----
 mm/sparse-vmemmap.c |  3 +--
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5a3eaa9a1f8c..21a7e2460084 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3560,6 +3560,22 @@ void vmemmap_populate_print_last(void);
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
 #endif
+
+#ifdef CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+					   struct dev_pagemap *pgmap)
+{
+	return is_power_of_2(sizeof(struct page)) &&
+		pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+					   struct dev_pagemap *pgmap)
+{
+	return false;
+}
+#endif
+
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
 				  unsigned long nr_pages);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a0ec3b3acb5e..7f7f9c677854 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1015,10 +1015,12 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
  * of an altmap. See vmemmap_populate_compound_pages().
  */
 static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
-					      unsigned long nr_pages)
+					      struct dev_pagemap *pgmap)
 {
-	return is_power_of_2(sizeof(struct page)) &&
-		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
+	if (!vmemmap_can_optimize(altmap, pgmap))
+		return pgmap_vmemmap_nr(pgmap);
+
+	return 2 * (PAGE_SIZE / sizeof(struct page));
 }
 
 static void __ref memmap_init_compound(struct page *head,
@@ -1083,7 +1085,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 			continue;
 
 		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
-				     compound_nr_pages(altmap, pfns_per_compound));
+				     compound_nr_pages(altmap, pgmap));
 	}
 
 	pr_debug("%s initialised %lu pages in %ums\n", __func__,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c5398a5960d0..10d73a0dfcec 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -458,8 +458,7 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (is_power_of_2(sizeof(struct page)) &&
-	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+	if (vmemmap_can_optimize(altmap, pgmap))
 		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
 	else
 		r = vmemmap_populate(start, end, nid, altmap);
-- 
cgit v1.2.3


From 0b376f1e0ff555435597fa16823ae0f30b2883e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 12 Apr 2023 10:30:25 +0530
Subject: mm/hugetlb_vmemmap: rename ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP

Now we use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP config option to
indicate devdax and hugetlb vmemmap optimization support.  Hence rename
that to a generic ARCH_WANT_OPTIMIZE_VMEMMAP

Link: https://lkml.kernel.org/r/20230412050025.84346-2-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Tarun Sahu <tsahu@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/Kconfig | 2 +-
 arch/s390/Kconfig      | 2 +-
 arch/x86/Kconfig       | 2 +-
 fs/Kconfig             | 9 +--------
 include/linux/mm.h     | 2 +-
 mm/Kconfig             | 6 ++++++
 6 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e1e3a3828962..5abd13093c1a 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -53,8 +53,8 @@ config LOONGARCH
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select ARCH_WANTS_NO_INSTR
 	select BUILDTIME_TABLE_SORT
 	select COMMON_CLK
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 548b5b587003..61d778397720 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -126,8 +126,8 @@ config S390
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_DEFAULT_BPF_JIT
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_OPTIMIZE_VMEMMAP
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
 	select DMA_OPS if PCI
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df21fba77db1..4c123315c440 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -126,8 +126,8 @@ config X86
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_GENERAL_HUGETLB
 	select ARCH_WANT_HUGE_PMD_SHARE
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANT_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select BUILDTIME_TABLE_SORT
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c65033..cc07a0cd3172 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,16 +250,9 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
-#
-# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
-#
-config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	bool
-
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	depends on ARCH_WANT_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21a7e2460084..0b514de43143 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3561,7 +3561,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
 #endif
 
-#ifdef CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
 static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
 					   struct dev_pagemap *pgmap)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 6ee3b48ed298..5ca8fcfae243 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -479,6 +479,12 @@ config SPARSEMEM_VMEMMAP
 	  SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
+#
+config ARCH_WANT_OPTIMIZE_VMEMMAP
+	bool
 
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
-- 
cgit v1.2.3


From 1cb9dc4b475c7418f925ab0c97b6750007d9f52e Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Thu, 13 Apr 2023 21:13:49 +0800
Subject: mm: hwpoison: support recovery from HugePage copy-on-write faults

copy-on-write of hugetlb user pages with uncorrectable errors will result
in a kernel crash.  This is because the copy is performed in kernel mode
and in general we can not handle accessing memory with such errors while
in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
copy-on write faults") introduced the routine copy_user_highpage_mc() to
gracefully handle copying of user pages with uncorrectable errors.
However, the separate hugetlb copy-on-write code paths were not modified
as part of commit a873dfe1032a.

Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage() so
that they can also gracefully handle uncorrectable errors in user pages.
This involves changing the hugetlb specific routine
copy_user_large_folio() from type void to int so that it can return an
error.  Modify the hugetlb userfaultfd code in the same way so that it can
return -EHWPOISON if it encounters an uncorrectable error.

Link: https://lkml.kernel.org/r/20230413131349.2524210-1-liushixin2@huawei.com
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  6 +++---
 mm/hugetlb.c       | 17 +++++++++++++---
 mm/memory.c        | 59 +++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 55 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b514de43143..2772a1800cf3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3693,9 +3693,9 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
-void copy_user_large_folio(struct folio *dst, struct folio *src,
-			   unsigned long addr_hint,
-			   struct vm_area_struct *vma);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+			  unsigned long addr_hint,
+			  struct vm_area_struct *vma);
 long copy_folio_from_user(struct folio *dst_folio,
 			   const void __user *usr_src,
 			   bool allow_pagefault);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f16b25b1a6b9..a08fb47fb200 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5097,10 +5097,14 @@ again:
 					ret = PTR_ERR(new_folio);
 					break;
 				}
-				copy_user_large_folio(new_folio,
+				ret = copy_user_large_folio(new_folio,
 						      page_folio(ptepage),
 						      addr, dst_vma);
 				put_page(ptepage);
+				if (ret) {
+					folio_put(new_folio);
+					break;
+				}
 
 				/* Install the new hugetlb folio if src pte stable */
 				dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5617,7 +5621,10 @@ retry_avoidcopy:
 		goto out_release_all;
 	}
 
-	copy_user_large_folio(new_folio, page_folio(old_page), address, vma);
+	if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) {
+		ret = VM_FAULT_HWPOISON_LARGE;
+		goto out_release_all;
+	}
 	__folio_mark_uptodate(new_folio);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
@@ -6260,9 +6267,13 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			*foliop = NULL;
 			goto out;
 		}
-		copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
+		ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
 		folio_put(*foliop);
 		*foliop = NULL;
+		if (ret) {
+			folio_put(folio);
+			goto out;
+		}
 	}
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index f315c2198098..f7510a06a2d0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5733,12 +5733,12 @@ EXPORT_SYMBOL(__might_fault);
  * operation.  The target subpage will be processed last to keep its
  * cache lines hot.
  */
-static inline void process_huge_page(
+static inline int process_huge_page(
 	unsigned long addr_hint, unsigned int pages_per_huge_page,
-	void (*process_subpage)(unsigned long addr, int idx, void *arg),
+	int (*process_subpage)(unsigned long addr, int idx, void *arg),
 	void *arg)
 {
-	int i, n, base, l;
+	int i, n, base, l, ret;
 	unsigned long addr = addr_hint &
 		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
@@ -5752,7 +5752,9 @@ static inline void process_huge_page(
 		/* Process subpages at the end of huge page */
 		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
 			cond_resched();
-			process_subpage(addr + i * PAGE_SIZE, i, arg);
+			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+			if (ret)
+				return ret;
 		}
 	} else {
 		/* If target subpage in second half of huge page */
@@ -5761,7 +5763,9 @@ static inline void process_huge_page(
 		/* Process subpages at the begin of huge page */
 		for (i = 0; i < base; i++) {
 			cond_resched();
-			process_subpage(addr + i * PAGE_SIZE, i, arg);
+			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+			if (ret)
+				return ret;
 		}
 	}
 	/*
@@ -5773,10 +5777,15 @@ static inline void process_huge_page(
 		int right_idx = base + 2 * l - 1 - i;
 
 		cond_resched();
-		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+		if (ret)
+			return ret;
 		cond_resched();
-		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+		if (ret)
+			return ret;
 	}
+	return 0;
 }
 
 static void clear_gigantic_page(struct page *page,
@@ -5794,11 +5803,12 @@ static void clear_gigantic_page(struct page *page,
 	}
 }
 
-static void clear_subpage(unsigned long addr, int idx, void *arg)
+static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct page *page = arg;
 
 	clear_user_highpage(page + idx, addr);
+	return 0;
 }
 
 void clear_huge_page(struct page *page,
@@ -5815,7 +5825,7 @@ void clear_huge_page(struct page *page,
 	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
 }
 
-static void copy_user_gigantic_page(struct folio *dst, struct folio *src,
+static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
 				     unsigned long addr,
 				     struct vm_area_struct *vma,
 				     unsigned int pages_per_huge_page)
@@ -5829,8 +5839,13 @@ static void copy_user_gigantic_page(struct folio *dst, struct folio *src,
 		src_page = folio_page(src, i);
 
 		cond_resched();
-		copy_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma);
+		if (copy_mc_user_highpage(dst_page, src_page,
+					  addr + i*PAGE_SIZE, vma)) {
+			memory_failure_queue(page_to_pfn(src_page), 0);
+			return -EHWPOISON;
+		}
 	}
+	return 0;
 }
 
 struct copy_subpage_arg {
@@ -5839,16 +5854,20 @@ struct copy_subpage_arg {
 	struct vm_area_struct *vma;
 };
 
-static void copy_subpage(unsigned long addr, int idx, void *arg)
+static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct copy_subpage_arg *copy_arg = arg;
 
-	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-			   addr, copy_arg->vma);
+	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+				  addr, copy_arg->vma)) {
+		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+		return -EHWPOISON;
+	}
+	return 0;
 }
 
-void copy_user_large_folio(struct folio *dst, struct folio *src,
-			   unsigned long addr_hint, struct vm_area_struct *vma)
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+			  unsigned long addr_hint, struct vm_area_struct *vma)
 {
 	unsigned int pages_per_huge_page = folio_nr_pages(dst);
 	unsigned long addr = addr_hint &
@@ -5859,13 +5878,11 @@ void copy_user_large_folio(struct folio *dst, struct folio *src,
 		.vma = vma,
 	};
 
-	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
-		copy_user_gigantic_page(dst, src, addr, vma,
-					pages_per_huge_page);
-		return;
-	}
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
+		return copy_user_gigantic_page(dst, src, addr, vma,
+					       pages_per_huge_page);
 
-	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+	return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
 }
 
 long copy_folio_from_user(struct folio *dst_folio,
-- 
cgit v1.2.3


From bb1508c24c9c361e6344308c8de2cb81d7f228ba Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 13 Apr 2023 15:12:22 +0200
Subject: mm: kmsan: apply __must_check to non-void functions

Non-void KMSAN hooks may return error codes that indicate that KMSAN
failed to reflect the changed memory state in the metadata (e.g.  it could
not create the necessary memory mappings).  In such cases the callers
should handle the errors to prevent the tool from using the inconsistent
metadata in the future.

We mark non-void hooks with __must_check so that error handling is not
skipped.

Link: https://lkml.kernel.org/r/20230413131223.4135168-3-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dipanjan Das <mail.dipanjan.das@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmsan.h | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index 30b17647ce3c..e0c23a32cdf0 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -54,7 +54,8 @@ void __init kmsan_init_runtime(void);
  * Freed pages are either returned to buddy allocator or held back to be used
  * as metadata pages.
  */
-bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order);
+bool __init __must_check kmsan_memblock_free_pages(struct page *page,
+						   unsigned int order);
 
 /**
  * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call.
@@ -137,9 +138,11 @@ void kmsan_kfree_large(const void *ptr);
  * vmalloc metadata address range. Returns 0 on success, callers must check
  * for non-zero return value.
  */
-int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
-				   pgprot_t prot, struct page **pages,
-				   unsigned int page_shift);
+int __must_check kmsan_vmap_pages_range_noflush(unsigned long start,
+						unsigned long end,
+						pgprot_t prot,
+						struct page **pages,
+						unsigned int page_shift);
 
 /**
  * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -163,9 +166,9 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
  * virtual memory. Returns 0 on success, callers must check for non-zero return
  * value.
  */
-int kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
-			     phys_addr_t phys_addr, pgprot_t prot,
-			     unsigned int page_shift);
+int __must_check kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+					  phys_addr_t phys_addr, pgprot_t prot,
+					  unsigned int page_shift);
 
 /**
  * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
@@ -237,8 +240,8 @@ static inline void kmsan_init_runtime(void)
 {
 }
 
-static inline bool kmsan_memblock_free_pages(struct page *page,
-					     unsigned int order)
+static inline bool __must_check kmsan_memblock_free_pages(struct page *page,
+							  unsigned int order)
 {
 	return true;
 }
@@ -251,10 +254,9 @@ static inline void kmsan_task_exit(struct task_struct *task)
 {
 }
 
-static inline int kmsan_alloc_page(struct page *page, unsigned int order,
-				   gfp_t flags)
+static inline void kmsan_alloc_page(struct page *page, unsigned int order,
+				    gfp_t flags)
 {
-	return 0;
 }
 
 static inline void kmsan_free_page(struct page *page, unsigned int order)
@@ -283,11 +285,9 @@ static inline void kmsan_kfree_large(const void *ptr)
 {
 }
 
-static inline int kmsan_vmap_pages_range_noflush(unsigned long start,
-						 unsigned long end,
-						 pgprot_t prot,
-						 struct page **pages,
-						 unsigned int page_shift)
+static inline int __must_check kmsan_vmap_pages_range_noflush(
+	unsigned long start, unsigned long end, pgprot_t prot,
+	struct page **pages, unsigned int page_shift)
 {
 	return 0;
 }
@@ -297,10 +297,11 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start,
 {
 }
 
-static inline int kmsan_ioremap_page_range(unsigned long start,
-					   unsigned long end,
-					   phys_addr_t phys_addr, pgprot_t prot,
-					   unsigned int page_shift)
+static inline int __must_check kmsan_ioremap_page_range(unsigned long start,
+							unsigned long end,
+							phys_addr_t phys_addr,
+							pgprot_t prot,
+							unsigned int page_shift)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From c7b23b68e2aa93f86a206222d23ccd9a21f5982a Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 13 Apr 2023 10:40:34 +0000
Subject: mm: vmscan: refactor updating current->reclaim_state

During reclaim, we keep track of pages reclaimed from other means than
LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab,
which we stash a pointer to in current task_struct.

However, we keep track of more than just reclaimed slab pages through
this.  We also use it for clean file pages dropped through pruned inodes,
and xfs buffer pages freed.  Rename reclaimed_slab to reclaimed, and add a
helper function that wraps updating it through current, so that future
changes to this logic are contained within include/linux/swap.h.

Link: https://lkml.kernel.org/r/20230413104034.1086717-4-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/inode.c           |  3 +--
 fs/xfs/xfs_buf.c     |  3 +--
 include/linux/swap.h | 17 ++++++++++++++++-
 mm/slab.c            |  3 +--
 mm/slob.c            |  6 ++----
 mm/slub.c            |  5 ++---
 6 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index 4558dc2f1355..e60fcc41faf1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 				__count_vm_events(KSWAPD_INODESTEAL, reap);
 			else
 				__count_vm_events(PGINODESTEAL, reap);
-			if (current->reclaim_state)
-				current->reclaim_state->reclaimed_slab += reap;
+			mm_account_reclaimed_pages(reap);
 		}
 		iput(inode);
 		spin_lock(lru_lock);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c..15d1e5a7c2d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -286,8 +286,7 @@ xfs_buf_free_pages(
 		if (bp->b_pages[i])
 			__free_page(bp->b_pages[i]);
 	}
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += bp->b_page_count;
+	mm_account_reclaimed_pages(bp->b_page_count);
 
 	if (bp->b_pages != bp->b_page_array)
 		kmem_free(bp->b_pages);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bfc3b06b5f8f..7f7d5b9ddf7e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -153,13 +153,28 @@ union swap_header {
  * memory reclaim
  */
 struct reclaim_state {
-	unsigned long reclaimed_slab;
+	/* pages reclaimed outside of LRU-based reclaim */
+	unsigned long reclaimed;
 #ifdef CONFIG_LRU_GEN
 	/* per-thread mm walk data */
 	struct lru_gen_mm_walk *mm_walk;
 #endif
 };
 
+/*
+ * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
+ * reclaim
+ * @pages: number of pages reclaimed
+ *
+ * If the current process is undergoing a reclaim operation, increment the
+ * number of reclaimed pages by @pages.
+ */
+static inline void mm_account_reclaimed_pages(unsigned long pages)
+{
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed += pages;
+}
+
 #ifdef __KERNEL__
 
 struct address_space;
diff --git a/mm/slab.c b/mm/slab.c
index 6b7c172158e5..bb57f7fdbae1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
 	smp_wmb();
 	__folio_clear_slab(folio);
 
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += 1 << order;
+	mm_account_reclaimed_pages(1 << order);
 	unaccount_slab(slab, order, cachep);
 	__free_pages(&folio->page, order);
 }
diff --git a/mm/slob.c b/mm/slob.c
index fe567fcfa3a3..79cc8680c973 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -61,7 +61,7 @@
 #include <linux/slab.h>
 
 #include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/export.h>
@@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
 {
 	struct page *sp = virt_to_page(b);
 
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += 1 << order;
-
+	mm_account_reclaimed_pages(1 << order);
 	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
 			    -(PAGE_SIZE << order));
 	__free_pages(sp, order);
diff --git a/mm/slub.c b/mm/slub.c
index f49d669ff604..2728d5ae4dc0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
 #include <linux/module.h>
 #include <linux/bit_spinlock.h>
 #include <linux/interrupt.h>
@@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
 	/* Make the mapping reset visible before clearing the flag */
 	smp_wmb();
 	__folio_clear_slab(folio);
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += pages;
+	mm_account_reclaimed_pages(pages);
 	unaccount_slab(slab, order, s);
 	__free_pages(&folio->page, order);
 }
-- 
cgit v1.2.3


From 7f63cf2d9b9bbe7b90f808927558a66ff737d399 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Thu, 13 Apr 2023 14:43:26 -0700
Subject: mm: Multi-gen LRU: remove wait_event_killable()

Android 14 and later default to MGLRU [1] and field telemetry showed
occasional long tail latency (>100ms) in the reclaim path.

Tracing revealed priority inversion in the reclaim path.  In
try_to_inc_max_seq(), when high priority tasks were blocked on
wait_event_killable(), the preemption of the low priority task to call
wake_up_all() caused those high priority tasks to wait longer than
necessary.  In general, this problem is not different from others of its
kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU
because it introduced the new wait queue lruvec->mm_state.wait.

The purpose of this new wait queue is to avoid the thundering herd
problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only
one can succeed, i.e., the one to wake up the rest, and the rest who
failed might cause premature OOM kills if they do not wait.  So far there
is no evidence supporting this scenario, based on how often the wait has
been hit.  And this begs the question how useful the wait queue is in
practice.

Based on Minchan's recommendation, which is in line with his commit
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
rest of the MGLRU code which also uses trylock when possible, remove the
wait queue.

[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf

Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Reported-by: Wei Wang <wvw@google.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   8 +---
 mm/vmscan.c            | 112 ++++++++++++++++++-------------------------------
 2 files changed, 42 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 337956748976..a4889c9d4055 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -443,18 +443,14 @@ enum {
 struct lru_gen_mm_state {
 	/* set to max_seq after each iteration */
 	unsigned long seq;
-	/* where the current iteration continues (inclusive) */
+	/* where the current iteration continues after */
 	struct list_head *head;
-	/* where the last iteration ended (exclusive) */
+	/* where the last iteration ended before */
 	struct list_head *tail;
-	/* to wait for the last page table walker to finish */
-	struct wait_queue_head wait;
 	/* Bloom filters flip after each iteration */
 	unsigned long *filters[NR_BLOOM_FILTERS];
 	/* the mm stats for debugging */
 	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
-	/* the number of concurrent page table walkers */
-	int nr_walkers;
 };
 
 struct lru_gen_mm_walk {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2eb5862e99ec..8c05f4c288ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3450,18 +3450,13 @@ void lru_gen_del_mm(struct mm_struct *mm)
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-		/* where the last iteration ended (exclusive) */
+		/* where the current iteration continues after */
+		if (lruvec->mm_state.head == &mm->lru_gen.list)
+			lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+		/* where the last iteration ended before */
 		if (lruvec->mm_state.tail == &mm->lru_gen.list)
 			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
-
-		/* where the current iteration continues (inclusive) */
-		if (lruvec->mm_state.head != &mm->lru_gen.list)
-			continue;
-
-		lruvec->mm_state.head = lruvec->mm_state.head->next;
-		/* the deletion ends the current iteration */
-		if (lruvec->mm_state.head == &mm_list->fifo)
-			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
 	}
 
 	list_del_init(&mm->lru_gen.list);
@@ -3557,68 +3552,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 			    struct mm_struct **iter)
 {
 	bool first = false;
-	bool last = true;
+	bool last = false;
 	struct mm_struct *mm = NULL;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 
 	/*
-	 * There are four interesting cases for this page table walker:
-	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
-	 *    there is nothing left to do.
-	 * 2. It's the first of the current generation, and it needs to reset
-	 *    the Bloom filter for the next generation.
-	 * 3. It reaches the end of mm_list, and it needs to increment
-	 *    mm_state->seq; the iteration is done.
-	 * 4. It's the last of the current generation, and it needs to reset the
-	 *    mm stats counters for the next generation.
+	 * mm_state->seq is incremented after each iteration of mm_list. There
+	 * are three interesting cases for this page table walker:
+	 * 1. It tries to start a new iteration with a stale max_seq: there is
+	 *    nothing left to do.
+	 * 2. It started the next iteration: it needs to reset the Bloom filter
+	 *    so that a fresh set of PTE tables can be recorded.
+	 * 3. It ended the current iteration: it needs to reset the mm stats
+	 *    counters and tell its caller to increment max_seq.
 	 */
 	spin_lock(&mm_list->lock);
 
 	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
-	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
-	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 
-	if (walk->max_seq <= mm_state->seq) {
-		if (!*iter)
-			last = false;
+	if (walk->max_seq <= mm_state->seq)
 		goto done;
-	}
 
-	if (!mm_state->nr_walkers) {
-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+	if (!mm_state->head)
+		mm_state->head = &mm_list->fifo;
 
-		mm_state->head = mm_list->fifo.next;
+	if (mm_state->head == &mm_list->fifo)
 		first = true;
-	}
-
-	while (!mm && mm_state->head != &mm_list->fifo) {
-		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 
+	do {
 		mm_state->head = mm_state->head->next;
+		if (mm_state->head == &mm_list->fifo) {
+			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+			last = true;
+			break;
+		}
 
 		/* force scan for those added after the last iteration */
-		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
-			mm_state->tail = mm_state->head;
+		if (!mm_state->tail || mm_state->tail == mm_state->head) {
+			mm_state->tail = mm_state->head->next;
 			walk->force_scan = true;
 		}
 
+		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 		if (should_skip_mm(mm, walk))
 			mm = NULL;
-	}
-
-	if (mm_state->head == &mm_list->fifo)
-		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+	} while (!mm);
 done:
-	if (*iter && !mm)
-		mm_state->nr_walkers--;
-	if (!*iter && mm)
-		mm_state->nr_walkers++;
-
-	if (mm_state->nr_walkers)
-		last = false;
-
 	if (*iter || last)
 		reset_mm_stats(lruvec, walk, last);
 
@@ -3646,9 +3627,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 
 	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 
-	if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
-
+	if (max_seq > mm_state->seq) {
+		mm_state->head = NULL;
+		mm_state->tail = NULL;
 		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 		reset_mm_stats(lruvec, NULL, true);
 		success = true;
@@ -4248,10 +4229,6 @@ restart:
 
 		walk_pmd_range(&val, addr, next, args);
 
-		/* a racy check to curtail the waiting time */
-		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
-			return 1;
-
 		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
 			end = (addr | ~PUD_MASK) + 1;
 			goto done;
@@ -4284,8 +4261,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
 	walk->next_addr = FIRST_USER_ADDRESS;
 
 	do {
+		DEFINE_MAX_SEQ(lruvec);
+
 		err = -EBUSY;
 
+		/* another thread might have called inc_max_seq() */
+		if (walk->max_seq != max_seq)
+			break;
+
 		/* folio_update_gen() requires stable folio_memcg() */
 		if (!mem_cgroup_trylock_pages(memcg))
 			break;
@@ -4518,25 +4501,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 		success = iterate_mm_list(lruvec, walk, &mm);
 		if (mm)
 			walk_mm(lruvec, mm, walk);
-
-		cond_resched();
 	} while (mm);
 done:
-	if (!success) {
-		if (sc->priority <= DEF_PRIORITY - 2)
-			wait_event_killable(lruvec->mm_state.wait,
-					    max_seq < READ_ONCE(lrugen->max_seq));
-		return false;
-	}
+	if (success)
+		inc_max_seq(lruvec, can_swap, force_scan);
 
-	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
-
-	inc_max_seq(lruvec, can_swap, force_scan);
-	/* either this sees any waiters or they will see updated max_seq */
-	if (wq_has_sleeper(&lruvec->mm_state.wait))
-		wake_up_all(&lruvec->mm_state.wait);
-
-	return true;
+	return success;
 }
 
 /******************************************************************************
@@ -6173,7 +6143,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
 
 	lruvec->mm_state.seq = MIN_NR_GENS;
-	init_waitqueue_head(&lruvec->mm_state.wait);
 }
 
 #ifdef CONFIG_MEMCG
@@ -6206,7 +6175,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-		VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
 					   sizeof(lruvec->lrugen.nr_pages)));
 
-- 
cgit v1.2.3


From f7b8f70ba44fb63df47b6fc3204d49ac2885de64 Mon Sep 17 00:00:00 2001
From: Luca Vizzarro <Luca.Vizzarro@arm.com>
Date: Fri, 14 Apr 2023 16:24:58 +0100
Subject: memfd: pass argument of memfd_fcntl as int

The interface for fcntl expects the argument passed for the command
F_ADD_SEALS to be of type int.  The current code wrongly treats it as a
long.  In order to avoid access to undefined bits, we should explicitly
cast the argument to int.

This commit changes the signature of all the related and helper functions
so that they treat the argument as int instead of long.

Link: https://lkml.kernel.org/r/20230414152459.816046-5-Luca.Vizzarro@arm.com
Signed-off-by: Luca Vizzarro <Luca.Vizzarro@arm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Kevin Brodsky <Kevin.Brodsky@arm.com>
Cc: Vincenzo Frascino <Vincenzo.Frascino@arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: David Laight <David.Laight@ACULAB.com>
Cc: Mark Rutland <Mark.Rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memfd.h | 4 ++--
 mm/memfd.c            | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 4f1600413f91..e7abf6fa4c52 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -5,9 +5,9 @@
 #include <linux/file.h>
 
 #ifdef CONFIG_MEMFD_CREATE
-extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
 #else
-static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
+static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
 {
 	return -EINVAL;
 }
diff --git a/mm/memfd.c b/mm/memfd.c
index a0a7a37e8177..69b90c31d38c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -243,16 +243,12 @@ static int memfd_get_seals(struct file *file)
 	return seals ? *seals : -EINVAL;
 }
 
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 {
 	long error;
 
 	switch (cmd) {
 	case F_ADD_SEALS:
-		/* disallow upper 32bit */
-		if (arg > UINT_MAX)
-			return -EINVAL;
-
 		error = memfd_add_seals(file, arg);
 		break;
 	case F_GET_SEALS:
-- 
cgit v1.2.3


From b0687c1119b4e8c88a651b6e876b7eae28d076e3 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 4 Apr 2023 17:13:51 -0500
Subject: lib/rbtree: use '+' instead of '|' for setting color.

This has a slight benefit for x86 and has no effect on other targets.

The benefit to x86 is it change the codegen for setting a node to block
from `mov %r0, %r1; or $RB_BLACK, %r1` to `lea RB_BLACK(%r0), %r1` which
saves an instructions.

In all other cases it just replace ALU with ALU (or -> and) which
perform the same on all machines I am aware of.

Total instructions in rbtree.o:
    Before  - 802
    After   - 782

so it saves about 20 `mov` instructions.

Link: https://lkml.kernel.org/r/20230404221350.3806566-1-goldstein.w.n@gmail.com
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: Michel Lespinasse <michel@lespinasse.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree_augmented.h | 4 ++--
 lib/rbtree.c                     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index d1c53e9d8c75..7ee7ed5de722 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -156,13 +156,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
 
 static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
 {
-	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+	rb->__rb_parent_color = rb_color(rb) + (unsigned long)p;
 }
 
 static inline void rb_set_parent_color(struct rb_node *rb,
 				       struct rb_node *p, int color)
 {
-	rb->__rb_parent_color = (unsigned long)p | color;
+	rb->__rb_parent_color = (unsigned long)p + color;
 }
 
 static inline void
diff --git a/lib/rbtree.c b/lib/rbtree.c
index c4ac5c2421f2..5114eda6309c 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -58,7 +58,7 @@
 
 static inline void rb_set_black(struct rb_node *rb)
 {
-	rb->__rb_parent_color |= RB_BLACK;
+	rb->__rb_parent_color += RB_BLACK;
 }
 
 static inline struct rb_node *rb_red_parent(struct rb_node *red)
-- 
cgit v1.2.3


From a3b2aeac9d154e5e15ddbf19de934c0c606b6acd Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang19@zte.com.cn>
Date: Sat, 8 Apr 2023 17:28:35 +0800
Subject: delayacct: track delays from IRQ/SOFTIRQ

Delay accounting does not track the delay of IRQ/SOFTIRQ.  While
IRQ/SOFTIRQ could have obvious impact on some workloads productivity, such
as when workloads are running on system which is busy handling network
IRQ/SOFTIRQ.

Get the delay of IRQ/SOFTIRQ could help users to reduce such delay.  Such
as setting interrupt affinity or task affinity, using kernel thread for
NAPI etc.  This is inspired by "sched/psi: Add PSI_IRQ to track
IRQ/SOFTIRQ pressure"[1].  Also fix some code indent problems of older
code.

And update tools/accounting/getdelays.c:
    / # ./getdelays -p 156 -di
    print delayacct stats ON
    printing IO accounting
    PID     156

    CPU             count     real total  virtual total    delay total  delay average
                       15       15836008       16218149      275700790         18.380ms
    IO              count    delay total  delay average
                        0              0          0.000ms
    SWAP            count    delay total  delay average
                        0              0          0.000ms
    RECLAIM         count    delay total  delay average
                        0              0          0.000ms
    THRASHING       count    delay total  delay average
                        0              0          0.000ms
    COMPACT         count    delay total  delay average
                        0              0          0.000ms
    WPCOPY          count    delay total  delay average
                       36        7586118          0.211ms
    IRQ             count    delay total  delay average
                       42         929161          0.022ms

[1] commit 52b1364ba0b1("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure")

Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: junhua huang <huang.junhua@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/accounting/delay-accounting.rst |  7 +++++--
 include/linux/delayacct.h                     | 15 ++++++++++++++
 include/uapi/linux/taskstats.h                |  6 +++++-
 kernel/delayacct.c                            | 14 +++++++++++++
 kernel/sched/core.c                           |  1 +
 tools/accounting/getdelays.c                  | 30 ++++++++++++++++-----------
 6 files changed, 58 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 79f537c9f160..f61c01fc376e 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -16,6 +16,7 @@ d) memory reclaim
 e) thrashing
 f) direct compact
 g) write-protect copy
+h) IRQ/SOFTIRQ
 
 and makes these statistics available to userspace through
 the taskstats interface.
@@ -49,7 +50,7 @@ this structure. See
 for a description of the fields pertaining to delay accounting.
 It will generally be in the form of counters returning the cumulative
 delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page
-cache, direct compact, write-protect copy etc.
+cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
 
 Taking the difference of two successive readings of a given
 counter (say cpu_delay_total) for a task will give the delay
@@ -118,7 +119,9 @@ Get sum of delays, since system boot, for all pids with tgid 5::
                        0              0          0.000ms
 	COMPACT         count    delay total  delay average
                        0              0          0.000ms
-   WPCOPY          count    delay total  delay average
+	WPCOPY          count    delay total  delay average
+                       0              0          0.000ms
+	IRQ             count    delay total  delay average
                        0              0          0.000ms
 
 Get IO accounting for pid 1, it works only with -p::
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 0da97dba9ef8..6639f48dac36 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -48,10 +48,13 @@ struct task_delay_info {
 	u64 wpcopy_start;
 	u64 wpcopy_delay;	/* wait for write-protect copy */
 
+	u64 irq_delay;	/* wait for IRQ/SOFTIRQ */
+
 	u32 freepages_count;	/* total count of memory reclaim */
 	u32 thrashing_count;	/* total count of thrash waits */
 	u32 compact_count;	/* total count of memory compact */
 	u32 wpcopy_count;	/* total count of write-protect copy */
+	u32 irq_count;	/* total count of IRQ/SOFTIRQ */
 };
 #endif
 
@@ -81,6 +84,7 @@ extern void __delayacct_compact_start(void);
 extern void __delayacct_compact_end(void);
 extern void __delayacct_wpcopy_start(void);
 extern void __delayacct_wpcopy_end(void);
+extern void __delayacct_irq(struct task_struct *task, u32 delta);
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
@@ -215,6 +219,15 @@ static inline void delayacct_wpcopy_end(void)
 		__delayacct_wpcopy_end();
 }
 
+static inline void delayacct_irq(struct task_struct *task, u32 delta)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (task->delays)
+		__delayacct_irq(task, delta);
+}
+
 #else
 static inline void delayacct_init(void)
 {}
@@ -253,6 +266,8 @@ static inline void delayacct_wpcopy_start(void)
 {}
 static inline void delayacct_wpcopy_end(void)
 {}
+static inline void delayacct_irq(struct task_struct *task, u32 delta)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index a7f5b11a8f1b..b50b2eb257a0 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
  */
 
 
-#define TASKSTATS_VERSION	13
+#define TASKSTATS_VERSION	14
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -198,6 +198,10 @@ struct taskstats {
 	/* v13: Delay waiting for write-protect copy */
 	__u64    wpcopy_count;
 	__u64    wpcopy_delay_total;
+
+	/* v14: Delay waiting for IRQ/SOFTIRQ */
+	__u64    irq_count;
+	__u64    irq_delay_total;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e39cb696cfbd..6f0c358e73d8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
 	tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
 	d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
+	tmp = d->irq_delay_total + tsk->delays->irq_delay;
+	d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
 	d->thrashing_count += tsk->delays->thrashing_count;
 	d->compact_count += tsk->delays->compact_count;
 	d->wpcopy_count += tsk->delays->wpcopy_count;
+	d->irq_count += tsk->delays->irq_count;
 	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
@@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void)
 		      &current->delays->wpcopy_delay,
 		      &current->delays->wpcopy_count);
 }
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&task->delays->lock, flags);
+	task->delays->irq_delay += delta;
+	task->delays->irq_count++;
+	raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+}
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..5473e831daf3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -704,6 +704,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
 	psi_account_irqtime(rq->curr, irq_delta);
+	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 23a15d8f2bf4..1334214546d7 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -198,17 +198,19 @@ static void print_delayacct(struct taskstats *t)
 	printf("\n\nCPU   %15s%15s%15s%15s%15s\n"
 	       "      %15llu%15llu%15llu%15llu%15.3fms\n"
 	       "IO    %15s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n"
+	       "      %15llu%15llu%15.3fms\n"
 	       "SWAP  %15s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n"
+	       "      %15llu%15llu%15.3fms\n"
 	       "RECLAIM  %12s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n"
+	       "      %15llu%15llu%15.3fms\n"
 	       "THRASHING%12s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n"
+	       "      %15llu%15llu%15.3fms\n"
 	       "COMPACT  %12s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n"
+	       "      %15llu%15llu%15.3fms\n"
 	       "WPCOPY   %12s%15s%15s\n"
-          "      %15llu%15llu%15.3fms\n",
+	       "      %15llu%15llu%15.3fms\n"
+	       "IRQ   %15s%15s%15s\n"
+	       "      %15llu%15llu%15.3fms\n",
 	       "count", "real total", "virtual total",
 	       "delay total", "delay average",
 	       (unsigned long long)t->cpu_count,
@@ -219,27 +221,31 @@ static void print_delayacct(struct taskstats *t)
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->blkio_count,
 	       (unsigned long long)t->blkio_delay_total,
-          average_ms((double)t->blkio_delay_total, t->blkio_count),
+	       average_ms((double)t->blkio_delay_total, t->blkio_count),
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->swapin_count,
 	       (unsigned long long)t->swapin_delay_total,
-          average_ms((double)t->swapin_delay_total, t->swapin_count),
+	       average_ms((double)t->swapin_delay_total, t->swapin_count),
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->freepages_count,
 	       (unsigned long long)t->freepages_delay_total,
-          average_ms((double)t->freepages_delay_total, t->freepages_count),
+	       average_ms((double)t->freepages_delay_total, t->freepages_count),
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->thrashing_count,
 	       (unsigned long long)t->thrashing_delay_total,
-          average_ms((double)t->thrashing_delay_total, t->thrashing_count),
+	       average_ms((double)t->thrashing_delay_total, t->thrashing_count),
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->compact_count,
 	       (unsigned long long)t->compact_delay_total,
-          average_ms((double)t->compact_delay_total, t->compact_count),
+	       average_ms((double)t->compact_delay_total, t->compact_count),
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->wpcopy_count,
 	       (unsigned long long)t->wpcopy_delay_total,
-          average_ms((double)t->wpcopy_delay_total, t->wpcopy_count));
+	       average_ms((double)t->wpcopy_delay_total, t->wpcopy_count),
+	       "count", "delay total", "delay average",
+	       (unsigned long long)t->irq_count,
+	       (unsigned long long)t->irq_delay_total,
+	       average_ms((double)t->irq_delay_total, t->irq_count));
 }
 
 static void task_context_switch_counts(struct taskstats *t)
-- 
cgit v1.2.3


From 31088f6f7906253ef4577f6a9b84e2d42447dba0 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Tue, 11 Apr 2023 10:27:47 +0100
Subject: uapi/linux/const.h: prefer ISO-friendly __typeof__

typeof is (still) a GNU extension, which means that it cannot be used when
building ISO C (e.g.  -std=c99).  It should therefore be avoided in uapi
headers in favour of the ISO-friendly __typeof__.

Unfortunately this issue could not be detected by
CONFIG_UAPI_HEADER_TEST=y as the __ALIGN_KERNEL() macro is not expanded in
any uapi header.

This matters from a userspace perspective, not a kernel one. uapi
headers and their contents are expected to be usable in a variety of
situations, and in particular when building ISO C applications (with
-std=c99 or similar).

This particular problem can be reproduced by trying to use the
__ALIGN_KERNEL macro directly in application code, say:

#include <linux/const.h>

int align(int x, int a)
{
	return __KERNEL_ALIGN(x, a);
}

and trying to build that with -std=c99.

Link: https://lkml.kernel.org/r/20230411092747.3759032-1-kevin.brodsky@arm.com
Fixes: a79ff731a1b2 ("netfilter: xtables: make XT_ALIGN() usable in exported headers by exporting __ALIGN_KERNEL()")
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Reported-by: Ruben Ayrapetyan <ruben.ayrapetyan@arm.com>
Tested-by: Ruben Ayrapetyan <ruben.ayrapetyan@arm.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Tested-by: Petr Vorel <pvorel@suse.cz>
Reviewed-by: Masahiro Yamada <masahiroy@kernel.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/const.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h
index af2a44c08683..a429381e7ca5 100644
--- a/include/uapi/linux/const.h
+++ b/include/uapi/linux/const.h
@@ -28,7 +28,7 @@
 #define _BITUL(x)	(_UL(1) << (x))
 #define _BITULL(x)	(_ULL(1) << (x))
 
-#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
 #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
-- 
cgit v1.2.3


From 4248d0083ec5817eebfb916c54950d100b3468ee Mon Sep 17 00:00:00 2001
From: Longlong Xia <xialonglong1@huawei.com>
Date: Fri, 14 Apr 2023 10:17:41 +0800
Subject: mm: ksm: support hwpoison for ksm page

hwpoison_user_mappings() is updated to support ksm pages, and add
collect_procs_ksm() to collect processes when the error hit an ksm page.
The difference from collect_procs_anon() is that it also needs to traverse
the rmap-item list on the stable node of the ksm page.  At the same time,
add_to_kill_ksm() is added to handle ksm pages.  And
task_in_to_kill_list() is added to avoid duplicate addition of tsk to the
to_kill list.  This is because when scanning the list, if the pages that
make up the ksm page all come from the same process, they may be added
repeatedly.

Link: https://lkml.kernel.org/r/20230414021741.2597273-3-xialonglong1@huawei.com
Signed-off-by: Longlong Xia <xialonglong1@huawei.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h | 11 +++++++++++
 include/linux/mm.h  |  7 +++++++
 mm/ksm.c            | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/memory-failure.c | 34 +++++++++++++++++++++++++---------
 4 files changed, 88 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..d9e701326a88 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -51,6 +51,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
 
+#ifdef CONFIG_MEMORY_FAILURE
+void collect_procs_ksm(struct page *page, struct list_head *to_kill,
+		       int force_early);
+#endif
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
@@ -62,6 +66,13 @@ static inline void ksm_exit(struct mm_struct *mm)
 {
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+static inline void collect_procs_ksm(struct page *page,
+				     struct list_head *to_kill, int force_early)
+{
+}
+#endif
+
 #ifdef CONFIG_MMU
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2772a1800cf3..37554b08bb28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3604,6 +3604,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 					bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
 #else
 static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
@@ -3624,6 +3625,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 }
 #endif
 
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+		     struct vm_area_struct *vma, struct list_head *to_kill,
+		     unsigned long ksm_addr);
+#endif
+
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 extern void memblk_nr_poison_inc(unsigned long pfn);
 extern void memblk_nr_poison_sub(unsigned long pfn, long i);
diff --git a/mm/ksm.c b/mm/ksm.c
index 290a3eb6d8de..37c63310bc4e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2738,6 +2738,51 @@ again:
 		goto again;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Collect processes when the error hit an ksm page.
+ */
+void collect_procs_ksm(struct page *page, struct list_head *to_kill,
+		       int force_early)
+{
+	struct ksm_stable_node *stable_node;
+	struct ksm_rmap_item *rmap_item;
+	struct folio *folio = page_folio(page);
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	stable_node = folio_stable_node(folio);
+	if (!stable_node)
+		return;
+	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+		struct anon_vma *av = rmap_item->anon_vma;
+
+		anon_vma_lock_read(av);
+		read_lock(&tasklist_lock);
+		for_each_process(tsk) {
+			struct anon_vma_chain *vmac;
+			unsigned long addr;
+			struct task_struct *t =
+				task_early_kill(tsk, force_early);
+			if (!t)
+				continue;
+			anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
+						       ULONG_MAX)
+			{
+				vma = vmac->vma;
+				if (vma->vm_mm == t->mm) {
+					addr = rmap_item->address & PAGE_MASK;
+					add_to_kill_ksm(t, page, vma, to_kill,
+							addr);
+				}
+			}
+		}
+		read_unlock(&tasklist_lock);
+		anon_vma_unlock_read(av);
+	}
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
 {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bb09999108ef..f69c410fb9f9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -455,6 +455,27 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
 	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
 }
 
+#ifdef CONFIG_KSM
+static bool task_in_to_kill_list(struct list_head *to_kill,
+				 struct task_struct *tsk)
+{
+	struct to_kill *tk, *next;
+
+	list_for_each_entry_safe(tk, next, to_kill, nd) {
+		if (tk->tsk == tsk)
+			return true;
+	}
+
+	return false;
+}
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+		     struct vm_area_struct *vma, struct list_head *to_kill,
+		     unsigned long ksm_addr)
+{
+	if (!task_in_to_kill_list(to_kill, tsk))
+		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
+}
+#endif
 /*
  * Kill the processes that have been collected earlier.
  *
@@ -534,8 +555,7 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
  * processes sharing the same error page,if the process is "early kill", the
  * task_struct of the dedicated thread will also be returned.
  */
-static struct task_struct *task_early_kill(struct task_struct *tsk,
-					   int force_early)
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
 {
 	if (!tsk->mm)
 		return NULL;
@@ -666,8 +686,9 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 {
 	if (!page->mapping)
 		return;
-
-	if (PageAnon(page))
+	if (unlikely(PageKsm(page)))
+		collect_procs_ksm(page, tokill, force_early);
+	else if (PageAnon(page))
 		collect_procs_anon(page, tokill, force_early);
 	else
 		collect_procs_file(page, tokill, force_early);
@@ -1522,11 +1543,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (!page_mapped(hpage))
 		return true;
 
-	if (PageKsm(p)) {
-		pr_err("%#lx: can't handle KSM pages.\n", pfn);
-		return false;
-	}
-
 	if (PageSwapCache(p)) {
 		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
 		ttu &= ~TTU_HWPOISON;
-- 
cgit v1.2.3


From ea97f6c8558e83cb457c3b5f53351e4fd8519ab1 Mon Sep 17 00:00:00 2001
From: David Wei <davidhwei@meta.com>
Date: Tue, 18 Apr 2023 15:58:18 -0700
Subject: io_uring: add support for multishot timeouts

A multishot timeout submission will repeatedly generate completions with
the IORING_CQE_F_MORE cflag set. Depending on the value of the `off'
field in the submission, these timeouts can either repeat indefinitely
until cancelled (`off' = 0) or for a fixed number of times (`off' > 0).

Only noseq timeouts (i.e. not dependent on the number of I/O
completions) are supported.

An indefinite timer will be cancelled if the CQ ever overflows.

Signed-off-by: David Wei <davidhwei@meta.com>
Link: https://lore.kernel.org/r/20230418225817.1905027-1-davidhwei@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/timeout.c            | 57 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f8d14d1c58d3..0716cb17e436 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -250,6 +250,7 @@ enum io_uring_op {
 #define IORING_TIMEOUT_REALTIME		(1U << 3)
 #define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
 #define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
+#define IORING_TIMEOUT_MULTISHOT	(1U << 6)
 #define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 5c6c6f720809..fc950177e2e1 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -17,6 +17,7 @@ struct io_timeout {
 	struct file			*file;
 	u32				off;
 	u32				target_seq;
+	u32				repeats;
 	struct list_head		list;
 	/* head of the link, used by linked timeouts only */
 	struct io_kiocb			*head;
@@ -37,8 +38,9 @@ struct io_timeout_rem {
 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 {
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
+	struct io_timeout_data *data = req->async_data;
 
-	return !timeout->off;
+	return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT;
 }
 
 static inline void io_put_req(struct io_kiocb *req)
@@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req)
 	}
 }
 
+static inline bool io_timeout_finish(struct io_timeout *timeout,
+				     struct io_timeout_data *data)
+{
+	if (!(data->flags & IORING_TIMEOUT_MULTISHOT))
+		return true;
+
+	if (!timeout->off || (timeout->repeats && --timeout->repeats))
+		return false;
+
+	return true;
+}
+
+static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer);
+
+static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
+	struct io_timeout_data *data = req->async_data;
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!io_timeout_finish(timeout, data)) {
+		bool filled;
+		filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME,
+				    IORING_CQE_F_MORE, false);
+		if (filled) {
+			/* re-arm timer */
+			spin_lock_irq(&ctx->timeout_lock);
+			list_add(&timeout->list, ctx->timeout_list.prev);
+			data->timer.function = io_timeout_fn;
+			hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
+			spin_unlock_irq(&ctx->timeout_lock);
+			return;
+		}
+	}
+
+	io_req_task_complete(req, ts);
+}
+
 static bool io_kill_timeout(struct io_kiocb *req, int status)
 	__must_hold(&req->ctx->timeout_lock)
 {
@@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 		req_set_fail(req);
 
 	io_req_set_res(req, -ETIME, 0);
-	req->io_task_work.func = io_req_task_complete;
+	req->io_task_work.func = io_timeout_complete;
 	io_req_task_work_add(req);
 	return HRTIMER_NORESTART;
 }
@@ -470,16 +510,27 @@ static int __io_timeout_prep(struct io_kiocb *req,
 		return -EINVAL;
 	flags = READ_ONCE(sqe->timeout_flags);
 	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
-		      IORING_TIMEOUT_ETIME_SUCCESS))
+		      IORING_TIMEOUT_ETIME_SUCCESS |
+		      IORING_TIMEOUT_MULTISHOT))
 		return -EINVAL;
 	/* more than one clock specified is invalid, obviously */
 	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 		return -EINVAL;
+	/* multishot requests only make sense with rel values */
+	if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS)))
+		return -EINVAL;
 
 	INIT_LIST_HEAD(&timeout->list);
 	timeout->off = off;
 	if (unlikely(off && !req->ctx->off_timeout_used))
 		req->ctx->off_timeout_used = true;
+	/*
+	 * for multishot reqs w/ fixed nr of repeats, repeats tracks the
+	 * remaining nr
+	 */
+	timeout->repeats = 0;
+	if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0)
+		timeout->repeats = off;
 
 	if (WARN_ON_ONCE(req_has_async_data(req)))
 		return -EFAULT;
-- 
cgit v1.2.3


From 2d786e66c9662d84cbeab981ce3a371d2fb5a4bb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 18 Apr 2023 21:18:10 +0800
Subject: block: ublk: switch to ioctl command encoding

All ublk commands(control, IO) should have taken ioctl command encoding
from the beginning, because ioctl command encoding defines each code
uniquely, so driver can figure out wrong command sent from userspace
easily; 2) it might help security subsystem for audit uring cmd[1].

Unfortunately we didn't do that way, and it could be one lesson for
ublk driver.

So switch to ioctl command encoding now, we still support commands encoded
in old way, but they become legacy definition. Any new command should take
ioctl encoding.

See ublksrv code for switching to ioctl command encoding in [2].

[1] https://lore.kernel.org/io-uring/CAHC9VhSVzujW9LOj5Km80AjU0EfAuukoLrxO6BEfnXeK_s6bAg@mail.gmail.com/
[2] https://github.com/ming1/ubdsrv/commits/ioctl_cmd_encoding

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ken Kurematsu <k.kurematsu@nskint.co.jp>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230418131810.855959-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig         | 17 ++++++++++++++++
 drivers/block/ublk_drv.c      | 47 +++++++++++++++++++++++++++++++------------
 include/uapi/linux/ublk_cmd.h | 43 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f79f20430ef7..5b9d4aaebb81 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -385,6 +385,23 @@ config BLK_DEV_UBLK
 	  can handle batch more effectively, but task_work_add() isn't exported
 	  for module, so ublk has to be built to kernel.
 
+config BLKDEV_UBLK_LEGACY_OPCODES
+	bool "Support legacy command opcode"
+	depends on BLK_DEV_UBLK
+	default y
+	help
+	  ublk driver started to take plain command encoding, which turns out
+	  one bad way. The traditional ioctl command opcode encodes more
+	  info and basically defines each code uniquely, so opcode conflict
+	  is avoided, and driver can handle wrong command easily, meantime it
+	  may help security subsystem to audit io_uring command.
+
+	  Say Y if your application still uses legacy command opcode.
+
+	  Say N if you don't want to support legacy command opcode. It is
+	  suggested to enable N if your application(ublk server) switches to
+	  ioctl command encoding.
+
 source "drivers/block/rnbd/Kconfig"
 
 endif # BLK_DEV
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1223fcbfc6c9..5da5876a4443 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -53,7 +53,8 @@
 		| UBLK_F_NEED_GET_DATA \
 		| UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
-		| UBLK_F_UNPRIVILEGED_DEV)
+		| UBLK_F_UNPRIVILEGED_DEV \
+		| UBLK_F_CMD_IOCTL_ENCODE)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
@@ -1253,6 +1254,19 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
 	ublk_queue_cmd(ubq, req);
 }
 
+static inline int ublk_check_cmd_op(u32 cmd_op)
+{
+	u32 ioc_type = _IOC_TYPE(cmd_op);
+
+	if (IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
+		return -EOPNOTSUPP;
+
+	if (ioc_type != 'u' && ioc_type != 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -1294,10 +1308,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
 	 */
 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
-			^ (cmd_op == UBLK_IO_NEED_GET_DATA))
+			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
+		goto out;
+
+	ret = ublk_check_cmd_op(cmd_op);
+	if (ret)
 		goto out;
 
-	switch (cmd_op) {
+	switch (_IOC_NR(cmd_op)) {
 	case UBLK_IO_FETCH_REQ:
 		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
 		if (ublk_queue_ready(ubq)) {
@@ -1743,6 +1761,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
 		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
 
+	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE;
+
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
@@ -2099,7 +2119,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 		 * know if the specified device is created as unprivileged
 		 * mode.
 		 */
-		if (cmd->cmd_op != UBLK_CMD_GET_DEV_INFO2)
+		if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
 			return 0;
 	}
 
@@ -2125,7 +2145,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 	dev_path[header->dev_path_len] = 0;
 
 	ret = -EINVAL;
-	switch (cmd->cmd_op) {
+	switch (_IOC_NR(cmd->cmd_op)) {
 	case UBLK_CMD_GET_DEV_INFO:
 	case UBLK_CMD_GET_DEV_INFO2:
 	case UBLK_CMD_GET_QUEUE_AFFINITY:
@@ -2164,6 +2184,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 {
 	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
 	struct ublk_device *ub = NULL;
+	u32 cmd_op = cmd->cmd_op;
 	int ret = -EINVAL;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -2174,22 +2195,22 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	if (!(issue_flags & IO_URING_F_SQE128))
 		goto out;
 
-	if (cmd->cmd_op != UBLK_CMD_ADD_DEV) {
+	ret = ublk_check_cmd_op(cmd_op);
+	if (ret)
+		goto out;
+
+	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
 		ret = -ENODEV;
 		ub = ublk_get_device_from_id(header->dev_id);
 		if (!ub)
 			goto out;
 
 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
-	} else {
-		/* ADD_DEV permission check is done in command handler */
-		ret = 0;
+		if (ret)
+			goto put_dev;
 	}
 
-	if (ret)
-		goto put_dev;
-
-	switch (cmd->cmd_op) {
+	switch (_IOC_NR(cmd_op)) {
 	case UBLK_CMD_START_DEV:
 		ret = ublk_ctrl_start_dev(ub, cmd);
 		break;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index f6238ccc7800..640bf687b94a 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -8,6 +8,9 @@
 
 /*
  * Admin commands, issued by ublk server, and handled by ublk driver.
+ *
+ * Legacy command definition, don't use in new application, and don't
+ * add new such definition any more
  */
 #define	UBLK_CMD_GET_QUEUE_AFFINITY	0x01
 #define	UBLK_CMD_GET_DEV_INFO	0x02
@@ -21,6 +24,30 @@
 #define	UBLK_CMD_END_USER_RECOVERY	0x11
 #define	UBLK_CMD_GET_DEV_INFO2		0x12
 
+/* Any new ctrl command should encode by __IO*() */
+#define UBLK_U_CMD_GET_QUEUE_AFFINITY	\
+	_IOR('u', UBLK_CMD_GET_QUEUE_AFFINITY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_GET_DEV_INFO		\
+	_IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_ADD_DEV		\
+	_IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_DEL_DEV		\
+	_IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_START_DEV		\
+	_IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_STOP_DEV		\
+	_IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_SET_PARAMS		\
+	_IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_GET_PARAMS		\
+	_IOR('u', UBLK_CMD_GET_PARAMS, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_START_USER_RECOVERY	\
+	_IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_END_USER_RECOVERY	\
+	_IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_GET_DEV_INFO2	\
+	_IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd)
+
 /*
  * IO commands, issued by ublk server, and handled by ublk driver.
  *
@@ -41,10 +68,23 @@
  *      It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
  *      while starting a ublk device.
  */
+
+/*
+ * Legacy IO command definition, don't use in new application, and don't
+ * add new such definition any more
+ */
 #define	UBLK_IO_FETCH_REQ		0x20
 #define	UBLK_IO_COMMIT_AND_FETCH_REQ	0x21
 #define	UBLK_IO_NEED_GET_DATA	0x22
 
+/* Any new IO command should encode by __IOWR() */
+#define	UBLK_U_IO_FETCH_REQ		\
+	_IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd)
+#define	UBLK_U_IO_COMMIT_AND_FETCH_REQ	\
+	_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
+#define	UBLK_U_IO_NEED_GET_DATA		\
+	_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
 #define UBLK_IO_RES_NEED_GET_DATA	1
@@ -102,6 +142,9 @@
  */
 #define UBLK_F_UNPRIVILEGED_DEV	(1UL << 5)
 
+/* use ioctl encoding for uring command */
+#define UBLK_F_CMD_IOCTL_ENCODE	(1UL << 6)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit v1.2.3


From 980f0799a15c75403f1f9284a32b6056b9660144 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 18 Apr 2023 11:48:41 +0800
Subject: bonding: add software tx timestamping support

Currently, bonding only obtain the timestamp (ts) information of
the active slave, which is available only for modes 1, 5, and 6.
For other modes, bonding only has software rx timestamping support.

However, some users who use modes such as LACP also want tx timestamp
support. To address this issue, let's check the ts information of each
slave. If all slaves support tx timestamping, we can enable tx
timestamping support for the bond.

Add a note that the get_ts_info may be called with RCU, or rtnl or
reference on the device in ethtool.h>

Suggested-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Link: https://lore.kernel.org/r/20230418034841.2566262-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 30 ++++++++++++++++++++++++++++++
 include/linux/ethtool.h         |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8cc9a74789b7..db7e650d9ebb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5696,9 +5696,13 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 				    struct ethtool_ts_info *info)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct ethtool_ts_info ts_info;
 	const struct ethtool_ops *ops;
 	struct net_device *real_dev;
+	bool sw_tx_support = false;
 	struct phy_device *phydev;
+	struct list_head *iter;
+	struct slave *slave;
 	int ret = 0;
 
 	rcu_read_lock();
@@ -5717,10 +5721,36 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 			ret = ops->get_ts_info(real_dev, info);
 			goto out;
 		}
+	} else {
+		/* Check if all slaves support software tx timestamping */
+		rcu_read_lock();
+		bond_for_each_slave_rcu(bond, slave, iter) {
+			ret = -1;
+			ops = slave->dev->ethtool_ops;
+			phydev = slave->dev->phydev;
+
+			if (phy_has_tsinfo(phydev))
+				ret = phy_ts_info(phydev, &ts_info);
+			else if (ops->get_ts_info)
+				ret = ops->get_ts_info(slave->dev, &ts_info);
+
+			if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) {
+				sw_tx_support = true;
+				continue;
+			}
+
+			sw_tx_support = false;
+			break;
+		}
+		rcu_read_unlock();
 	}
 
+	ret = 0;
 	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
 				SOF_TIMESTAMPING_SOFTWARE;
+	if (sw_tx_support)
+		info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+
 	info->phc_index = -1;
 
 out:
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 798d35890118..62b61527bcc4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -711,6 +711,7 @@ struct ethtool_mm_stats {
  * @get_dump_data: Get dump data.
  * @set_dump: Set dump specific flags to the device.
  * @get_ts_info: Get the time stamping and PTP hardware clock capabilities.
+ *	It may be called with RCU, or rtnl or reference on the device.
  *	Drivers supporting transmit time stamps in software should set this to
  *	ethtool_op_get_ts_info().
  * @get_module_info: Get the size and type of the eeprom contained within
-- 
cgit v1.2.3


From 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1 Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Wed, 19 Apr 2023 13:15:26 +0800
Subject: netfilter: conntrack: fix wrong ct->timeout value

(struct nf_conn)->timeout is an interval before the conntrack
confirmed.  After confirmed, it becomes a timestamp.

It is observed that timeout of an unconfirmed conntrack:
- Set by calling ctnetlink_change_timeout(). As a result,
  `nfct_time_stamp` was wrongly added to `ct->timeout` twice.
- Get by calling ctnetlink_dump_timeout(). As a result,
  `nfct_time_stamp` was wrongly subtracted.

Call Trace:
 <TASK>
 dump_stack_lvl
 ctnetlink_dump_timeout
 __ctnetlink_glue_build
 ctnetlink_glue_build
 __nfqnl_enqueue_packet
 nf_queue
 nf_hook_slow
 ip_mc_output
 ? __pfx_ip_finish_output
 ip_send_skb
 ? __pfx_dst_output
 udp_send_skb
 udp_sendmsg
 ? __pfx_ip_generic_getfrag
 sock_sendmsg

Separate the 2 cases in:
- Setting `ct->timeout` in __nf_ct_set_timeout().
- Getting `ct->timeout` in ctnetlink_dump_timeout().

Pablo appends:

Update ctnetlink to set up the timeout _after_ the IPS_CONFIRMED flag is
set on, otherwise conntrack creation via ctnetlink breaks.

Note that the problem described in this patch occurs since the
introduction of the nfnetlink_queue conntrack support, select a
sufficiently old Fixes: tag for -stable kernel to pick up this fix.

Fixes: a4b4766c3ceb ("netfilter: nfnetlink_queue: rename related to nfqueue attaching conntrack info")
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h |  6 +++++-
 net/netfilter/nf_conntrack_netlink.c      | 13 +++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 71d1269fe4d4..3384859a8921 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -89,7 +89,11 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
 {
 	if (timeout > INT_MAX)
 		timeout = INT_MAX;
-	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+
+	if (nf_ct_is_confirmed(ct))
+		WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+	else
+		ct->timeout = (u32)timeout;
 }
 
 int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d3ee18854698..6f3b23a6653c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -176,7 +176,12 @@ nla_put_failure:
 static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
 				  bool skip_zero)
 {
-	long timeout = nf_ct_expires(ct) / HZ;
+	long timeout;
+
+	if (nf_ct_is_confirmed(ct))
+		timeout = nf_ct_expires(ct) / HZ;
+	else
+		timeout = ct->timeout / HZ;
 
 	if (skip_zero && timeout == 0)
 		return 0;
@@ -2253,9 +2258,6 @@ ctnetlink_create_conntrack(struct net *net,
 	if (!cda[CTA_TIMEOUT])
 		goto err1;
 
-	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-	__nf_ct_set_timeout(ct, timeout);
-
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {
 		char *helpname = NULL;
@@ -2319,6 +2321,9 @@ ctnetlink_create_conntrack(struct net *net,
 	/* we must add conntrack extensions before confirmation. */
 	ct->status |= IPS_CONFIRMED;
 
+	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+	__nf_ct_set_timeout(ct, timeout);
+
 	if (cda[CTA_STATUS]) {
 		err = ctnetlink_change_status(ct, cda);
 		if (err < 0)
-- 
cgit v1.2.3


From e5029edd53937a29801ef507cee12e657ff31ea9 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:26 +0200
Subject: leds: Provide stubs for when CLASS_LED & NEW_LEDS are disabled

Provide stubs for devm_led_classdev_register_ext() and
led_init_default_state_get() so that LED drivers embedded within other
drivers such as PHYs and Ethernet switches still build when LEDS_CLASS
or NEW_LEDS are disabled. This also helps with Kconfig dependencies,
which are somewhat hairy for phylib and mdio and only get worse when
adding a dependency on LED_CLASS.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/leds.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index d71201a968b6..aa48e643f655 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -82,7 +82,15 @@ struct led_init_data {
 	bool devname_mandatory;
 };
 
+#if IS_ENABLED(CONFIG_NEW_LEDS)
 enum led_default_state led_init_default_state_get(struct fwnode_handle *fwnode);
+#else
+static inline enum led_default_state
+led_init_default_state_get(struct fwnode_handle *fwnode)
+{
+	return LEDS_DEFSTATE_OFF;
+}
+#endif
 
 struct led_hw_trigger_type {
 	int dummy;
@@ -217,9 +225,19 @@ static inline int led_classdev_register(struct device *parent,
 	return led_classdev_register_ext(parent, led_cdev, NULL);
 }
 
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 int devm_led_classdev_register_ext(struct device *parent,
 					  struct led_classdev *led_cdev,
 					  struct led_init_data *init_data);
+#else
+static inline int
+devm_led_classdev_register_ext(struct device *parent,
+			       struct led_classdev *led_cdev,
+			       struct led_init_data *init_data)
+{
+	return 0;
+}
+#endif
 
 static inline int devm_led_classdev_register(struct device *parent,
 					     struct led_classdev *led_cdev)
-- 
cgit v1.2.3


From 01e5b728e9e43ae444e0369695a5f72209906464 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:27 +0200
Subject: net: phy: Add a binding for PHY LEDs

Define common binding parsing for all PHY drivers with LEDs using
phylib. Parse the DT as part of the phy_probe and add LEDs to the
linux LED class infrastructure. For the moment, provide a dummy
brightness function, which will later be replaced with a call into the
PHY driver. This allows testing since the LED core might otherwise
reject an LED whose brightness cannot be set.

Add a dependency on LED_CLASS. It either needs to be built in, or not
enabled, since a modular build can result in linker errors.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig      |  1 +
 drivers/net/phy/phy_device.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          | 16 ++++++++++
 3 files changed, 93 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 6b9525def973..b8cc49820ced 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -18,6 +18,7 @@ menuconfig PHYLIB
 	depends on NETDEVICES
 	select MDIO_DEVICE
 	select MDIO_DEVRES
+	depends on LEDS_CLASS || LEDS_CLASS=n
 	help
 	  Ethernet controllers are usually attached to PHY
 	  devices.  This option provides infrastructure for
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 917ba84105fc..61b971251de5 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -19,10 +19,12 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
 #include <linux/mdio.h>
 #include <linux/mii.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/netdevice.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
@@ -674,6 +676,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 	device_initialize(&mdiodev->dev);
 
 	dev->state = PHY_DOWN;
+	INIT_LIST_HEAD(&dev->leds);
 
 	mutex_init(&dev->lock);
 	INIT_DELAYED_WORK(&dev->state_queue, phy_state_machine);
@@ -2988,6 +2991,74 @@ static bool phy_drv_supports_irq(struct phy_driver *phydrv)
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
 
+/* Dummy implementation until calls into PHY driver are added */
+static int phy_led_set_brightness(struct led_classdev *led_cdev,
+				  enum led_brightness value)
+{
+	return 0;
+}
+
+static int of_phy_led(struct phy_device *phydev,
+		      struct device_node *led)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct led_init_data init_data = {};
+	struct led_classdev *cdev;
+	struct phy_led *phyled;
+	int err;
+
+	phyled = devm_kzalloc(dev, sizeof(*phyled), GFP_KERNEL);
+	if (!phyled)
+		return -ENOMEM;
+
+	cdev = &phyled->led_cdev;
+
+	err = of_property_read_u8(led, "reg", &phyled->index);
+	if (err)
+		return err;
+
+	cdev->brightness_set_blocking = phy_led_set_brightness;
+	cdev->max_brightness = 1;
+	init_data.devicename = dev_name(&phydev->mdio.dev);
+	init_data.fwnode = of_fwnode_handle(led);
+	init_data.devname_mandatory = true;
+
+	err = devm_led_classdev_register_ext(dev, cdev, &init_data);
+	if (err)
+		return err;
+
+	list_add(&phyled->list, &phydev->leds);
+
+	return 0;
+}
+
+static int of_phy_leds(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	struct device_node *leds, *led;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_OF_MDIO))
+		return 0;
+
+	if (!node)
+		return 0;
+
+	leds = of_get_child_by_name(node, "leds");
+	if (!leds)
+		return 0;
+
+	for_each_available_child_of_node(leds, led) {
+		err = of_phy_led(phydev, led);
+		if (err) {
+			of_node_put(led);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * fwnode_mdio_find_device - Given a fwnode, find the mdio_device
  * @fwnode: pointer to the mdio_device's fwnode
@@ -3183,6 +3254,11 @@ static int phy_probe(struct device *dev)
 	/* Set the state to READY by default */
 	phydev->state = PHY_READY;
 
+	/* Get the LEDs from the device tree, and instantiate standard
+	 * LEDs for them.
+	 */
+	err = of_phy_leds(phydev);
+
 out:
 	/* Re-assert the reset signal on error */
 	if (err)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 2f83cfc206e5..bd6b5e9bb729 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -14,6 +14,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
+#include <linux/leds.h>
 #include <linux/linkmode.h>
 #include <linux/netlink.h>
 #include <linux/mdio.h>
@@ -600,6 +601,7 @@ struct macsec_ops;
  * @phy_num_led_triggers: Number of triggers in @phy_led_triggers
  * @led_link_trigger: LED trigger for link up/down
  * @last_triggered: last LED trigger for link speed
+ * @leds: list of PHY LED structures
  * @master_slave_set: User requested master/slave configuration
  * @master_slave_get: Current master/slave advertisement
  * @master_slave_state: Current master/slave configuration
@@ -699,6 +701,7 @@ struct phy_device {
 
 	struct phy_led_trigger *led_link_trigger;
 #endif
+	struct list_head leds;
 
 	/*
 	 * Interrupt number for this PHY
@@ -834,6 +837,19 @@ struct phy_plca_status {
 	bool pst;
 };
 
+/**
+ * struct phy_led: An LED driven by the PHY
+ *
+ * @list: List of LEDs
+ * @led_cdev: Standard LED class structure
+ * @index: Number of the LED
+ */
+struct phy_led {
+	struct list_head list;
+	struct led_classdev led_cdev;
+	u8 index;
+};
+
 /**
  * struct phy_driver - Driver structure for a particular PHY type
  *
-- 
cgit v1.2.3


From 684818189b04b095b34964ed4a3ea5249a840eab Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:28 +0200
Subject: net: phy: phy_device: Call into the PHY driver to set LED brightness

Linux LEDs can be software controlled via the brightness file in /sys.
LED drivers need to implement a brightness_set function which the core
will call. Implement an intermediary in phy_device, which will call
into the phy driver if it implements the necessary function.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 15 ++++++++++++---
 include/linux/phy.h          | 13 +++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 61b971251de5..5c1200160c51 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2991,11 +2991,18 @@ static bool phy_drv_supports_irq(struct phy_driver *phydrv)
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
 
-/* Dummy implementation until calls into PHY driver are added */
 static int phy_led_set_brightness(struct led_classdev *led_cdev,
 				  enum led_brightness value)
 {
-	return 0;
+	struct phy_led *phyled = to_phy_led(led_cdev);
+	struct phy_device *phydev = phyled->phydev;
+	int err;
+
+	mutex_lock(&phydev->lock);
+	err = phydev->drv->led_brightness_set(phydev, phyled->index, value);
+	mutex_unlock(&phydev->lock);
+
+	return err;
 }
 
 static int of_phy_led(struct phy_device *phydev,
@@ -3012,12 +3019,14 @@ static int of_phy_led(struct phy_device *phydev,
 		return -ENOMEM;
 
 	cdev = &phyled->led_cdev;
+	phyled->phydev = phydev;
 
 	err = of_property_read_u8(led, "reg", &phyled->index);
 	if (err)
 		return err;
 
-	cdev->brightness_set_blocking = phy_led_set_brightness;
+	if (phydev->drv->led_brightness_set)
+		cdev->brightness_set_blocking = phy_led_set_brightness;
 	cdev->max_brightness = 1;
 	init_data.devicename = dev_name(&phydev->mdio.dev);
 	init_data.fwnode = of_fwnode_handle(led);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bd6b5e9bb729..f3c7e3c99f24 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -841,15 +841,19 @@ struct phy_plca_status {
  * struct phy_led: An LED driven by the PHY
  *
  * @list: List of LEDs
+ * @phydev: PHY this LED is attached to
  * @led_cdev: Standard LED class structure
  * @index: Number of the LED
  */
 struct phy_led {
 	struct list_head list;
+	struct phy_device *phydev;
 	struct led_classdev led_cdev;
 	u8 index;
 };
 
+#define to_phy_led(d) container_of(d, struct phy_led, led_cdev)
+
 /**
  * struct phy_driver - Driver structure for a particular PHY type
  *
@@ -1072,6 +1076,15 @@ struct phy_driver {
 	/** @get_plca_status: Return the current PLCA status info */
 	int (*get_plca_status)(struct phy_device *dev,
 			       struct phy_plca_status *plca_st);
+
+	/**
+	 * @led_brightness_set: Set a PHY LED brightness. Index
+	 * indicates which of the PHYs led should be set. Value
+	 * follows the standard LED class meaning, e.g. LED_OFF,
+	 * LED_HALF, LED_FULL.
+	 */
+	int (*led_brightness_set)(struct phy_device *dev,
+				  u8 index, enum led_brightness value);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From 4e901018432e38eab35d2a352661ce4727795be1 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:30 +0200
Subject: net: phy: phy_device: Call into the PHY driver to set LED blinking

Linux LEDs can be requested to perform hardware accelerated
blinking. Pass this to the PHY driver, if it implements the op.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 18 ++++++++++++++++++
 include/linux/phy.h          | 12 ++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 5c1200160c51..538523a7cd51 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3005,6 +3005,22 @@ static int phy_led_set_brightness(struct led_classdev *led_cdev,
 	return err;
 }
 
+static int phy_led_blink_set(struct led_classdev *led_cdev,
+			     unsigned long *delay_on,
+			     unsigned long *delay_off)
+{
+	struct phy_led *phyled = to_phy_led(led_cdev);
+	struct phy_device *phydev = phyled->phydev;
+	int err;
+
+	mutex_lock(&phydev->lock);
+	err = phydev->drv->led_blink_set(phydev, phyled->index,
+					 delay_on, delay_off);
+	mutex_unlock(&phydev->lock);
+
+	return err;
+}
+
 static int of_phy_led(struct phy_device *phydev,
 		      struct device_node *led)
 {
@@ -3027,6 +3043,8 @@ static int of_phy_led(struct phy_device *phydev,
 
 	if (phydev->drv->led_brightness_set)
 		cdev->brightness_set_blocking = phy_led_set_brightness;
+	if (phydev->drv->led_blink_set)
+		cdev->blink_set = phy_led_blink_set;
 	cdev->max_brightness = 1;
 	init_data.devicename = dev_name(&phydev->mdio.dev);
 	init_data.fwnode = of_fwnode_handle(led);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f3c7e3c99f24..c5a0dc829714 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1085,6 +1085,18 @@ struct phy_driver {
 	 */
 	int (*led_brightness_set)(struct phy_device *dev,
 				  u8 index, enum led_brightness value);
+
+	/**
+	 * @led_blink_set: Set a PHY LED brightness.  Index indicates
+	 * which of the PHYs led should be configured to blink. Delays
+	 * are in milliseconds and if both are zero then a sensible
+	 * default should be chosen.  The call should adjust the
+	 * timings in that case and if it can't match the values
+	 * specified exactly.
+	 */
+	int (*led_blink_set)(struct phy_device *dev, u8 index,
+			     unsigned long *delay_on,
+			     unsigned long *delay_off);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From eb6fba7555a812c07aa984fb9e8e9b151a65ca16 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:46 -0700
Subject: net: skbuff: hide wifi_acked when CONFIG_WIRELESS not set

Datacenter kernel builds will very likely not include WIRELESS,
so let them shave 2 bits off the skb by hiding the wifi fields.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 11 +++++++++++
 include/net/sock.h     |  2 +-
 net/core/skbuff.c      |  2 ++
 net/socket.c           |  2 ++
 4 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a823ec3aa326..513f03b23a73 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -953,8 +953,10 @@ struct sk_buff {
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
+#ifdef CONFIG_WIRELESS
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
+#endif
 	__u8			no_fcs:1;
 	/* Indicates the inner headers are valid in the skbuff. */
 	__u8			encapsulation:1;
@@ -1187,6 +1189,15 @@ static inline unsigned int skb_napi_id(const struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_WIRELESS
+	return skb->wifi_acked_valid;
+#else
+	return 0;
+#endif
+}
+
 /**
  * skb_unref - decrement the skb's reference count
  * @skb: buffer
diff --git a/include/net/sock.h b/include/net/sock.h
index 5edf0038867c..8b7ed7167243 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2697,7 +2697,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 	else
 		sock_write_timestamp(sk, kt);
 
-	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
+	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
 		__sock_recv_wifi_status(msg, sk, skb);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ef81452759be..768f9d04911f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5189,6 +5189,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 }
 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
 
+#ifdef CONFIG_WIRELESS
 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 {
 	struct sock *sk = skb->sk;
@@ -5214,6 +5215,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 		kfree_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
 
 /**
  * skb_partial_csum_set - set up and verify partial csum values for packet
diff --git a/net/socket.c b/net/socket.c
index 73e493da4589..a7b4b37d86df 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -957,6 +957,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
+#ifdef CONFIG_WIRELESS
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 	struct sk_buff *skb)
 {
@@ -972,6 +973,7 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
 }
 EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
+#endif
 
 static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
 				   struct sk_buff *skb)
-- 
cgit v1.2.3


From c24831a13ba2e472f874483525084da2f2feb890 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:47 -0700
Subject: net: skbuff: hide csum_not_inet when CONFIG_IP_SCTP not set

SCTP is not universally deployed, allow hiding its bit
from the skb.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 14 ++++++++++++++
 net/core/dev.c         |  3 +--
 net/sched/act_csum.c   |  3 +--
 3 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 513f03b23a73..98d6b48f4dcf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -983,7 +983,9 @@ struct sk_buff {
 	__u8			decrypted:1;
 #endif
 	__u8			slow_gro:1;
+#if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -5060,7 +5062,19 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
 {
+#if IS_ENABLED(CONFIG_IP_SCTP)
 	return skb->csum_not_inet;
+#else
+	return 0;
+#endif
+}
+
+static inline void skb_reset_csum_not_inet(struct sk_buff *skb)
+{
+	skb->ip_summed = CHECKSUM_NONE;
+#if IS_ENABLED(CONFIG_IP_SCTP)
+	skb->csum_not_inet = 0;
+#endif
 }
 
 static inline void skb_set_kcov_handle(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8aea68275172..3fc4dba71f9d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3315,8 +3315,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
 						  skb->len - start, ~(__u32)0,
 						  crc32c_csum_stub));
 	*(__le32 *)(skb->data + offset) = crc32c_csum;
-	skb->ip_summed = CHECKSUM_NONE;
-	skb->csum_not_inet = 0;
+	skb_reset_csum_not_inet(skb);
 out:
 	return ret;
 }
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 95e9304024b7..8ed285023a40 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -376,8 +376,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
 
 	sctph->checksum = sctp_compute_cksum(skb,
 					     skb_network_offset(skb) + ihl);
-	skb->ip_summed = CHECKSUM_NONE;
-	skb->csum_not_inet = 0;
+	skb_reset_csum_not_inet(skb);
 
 	return 1;
 }
-- 
cgit v1.2.3


From 4398f3f6d1380d9f71a484e1e5d869ba0eaf23d5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:48 -0700
Subject: net: skbuff: move alloc_cpu into a potential hole

alloc_cpu is currently between 4 byte fields, so it's almost
guaranteed to create a 2B hole. It has a knock on effect of
creating a 4B hole after @end (and @end and @tail being in
different cachelines).

None of this matters hugely, but for kernel configs which
don't enable all the features there may well be a 2B hole
after the bitfield. Move alloc_cpu there.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 98d6b48f4dcf..2595b2cfba0d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -991,6 +991,8 @@ struct sk_buff {
 	__u16			tc_index;	/* traffic control index */
 #endif
 
+	u16			alloc_cpu;
+
 	union {
 		__wsum		csum;
 		struct {
@@ -1014,7 +1016,6 @@ struct sk_buff {
 		unsigned int	sender_cpu;
 	};
 #endif
-	u16			alloc_cpu;
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32		secmark;
 #endif
-- 
cgit v1.2.3


From 4c60d04c2888a542f96fe77ecbdfc242b484f943 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:49 -0700
Subject: net: skbuff: push nf_trace down the bitfield

nf_trace is a debug feature, AFAIU, and yet it sits oddly
high in the sk_buff bitfield. Move it down, pushing up
dst_pending_confirm and inner_protocol_type.

Next change will make nf_trace optional (under Kconfig)
and all optional fields should be placed after 2b fields
to avoid 2b fields straddling bytes.

dst_pending_confirm is L3, so it makes sense next to ignore_df.
inner_protocol_type goes up just to keep the balance.

Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2595b2cfba0d..3ae9e8868afa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -934,7 +934,7 @@ struct sk_buff {
 	/* public: */
 	__u8			pkt_type:3; /* see PKT_TYPE_MAX */
 	__u8			ignore_df:1;
-	__u8			nf_trace:1;
+	__u8			dst_pending_confirm:1;
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;
 
@@ -949,7 +949,7 @@ struct sk_buff {
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
+	__u8			inner_protocol_type:1;
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -967,7 +967,7 @@ struct sk_buff {
 #endif
 
 	__u8			ipvs_property:1;
-	__u8			inner_protocol_type:1;
+	__u8			nf_trace:1;
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 	__u8			offload_l3_fwd_mark:1;
-- 
cgit v1.2.3


From 48d80c394d3d1afcb49d26398917f5be27bf44cb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:50 -0700
Subject: net: skbuff: hide nf_trace and ipvs_property

Accesses to nf_trace and ipvs_property are already wrapped
by ifdefs where necessary. Don't allocate the bits for those
fields at all if possible.

Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3ae9e8868afa..5f120bbab9da 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -966,8 +966,12 @@ struct sk_buff {
 	__u8			ndisc_nodetype:2;
 #endif
 
+#if IS_ENABLED(CONFIG_IP_VS)
 	__u8			ipvs_property:1;
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	__u8			nf_trace:1;
+#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 	__u8			offload_l3_fwd_mark:1;
-- 
cgit v1.2.3


From 9e05a2599a37295eb2dc5c03441daa6741abed4b Mon Sep 17 00:00:00 2001
From: Ondrej Kozina <okozina@redhat.com>
Date: Tue, 11 Apr 2023 11:09:31 +0200
Subject: sed-opal: geometry feature reporting command

Locking range start and locking range length
attributes may be require to satisfy restrictions
exposed by OPAL2 geometry feature reporting.

Geometry reporting feature is described in TCG OPAL SSC,
section 3.1.1.4 (ALIGN, LogicalBlockSize, AlignmentGranularity
and LowestAlignedLBA).

4.3.5.2.1.1 RangeStart Behavior:

[ StartAlignment = (RangeStart modulo AlignmentGranularity) - LowestAlignedLBA ]

When processing a Set method or CreateRow method on the Locking
table for a non-Global Range row, if:

a) the AlignmentRequired (ALIGN above) column in the LockingInfo
   table is TRUE;
b) RangeStart is non-zero; and
c) StartAlignment is non-zero, then the method SHALL fail and
   return an error status code INVALID_PARAMETER.

4.3.5.2.1.2 RangeLength Behavior:

If RangeStart is zero, then
	[ LengthAlignment = (RangeLength modulo AlignmentGranularity) - LowestAlignedLBA ]

If RangeStart is non-zero, then
	[ LengthAlignment = (RangeLength modulo AlignmentGranularity) ]

When processing a Set method or CreateRow method on the Locking
table for a non-Global Range row, if:

a) the AlignmentRequired (ALIGN above) column in the LockingInfo
   table is TRUE;
b) RangeLength is non-zero; and
c) LengthAlignment is non-zero, then the method SHALL fail and
   return an error status code INVALID_PARAMETER

In userspace we stuck to logical block size reported by general
block device (via sysfs or ioctl), but we can not read
'AlignmentGranularity' or 'LowestAlignedLBA' anywhere else and
we need to get those values from sed-opal interface otherwise
we will not be able to report or avoid locking range setup
INVALID_PARAMETER errors above.

Signed-off-by: Ondrej Kozina <okozina@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Tested-by: Milan Broz <gmazyland@gmail.com>
Link: https://lore.kernel.org/r/20230411090931.9193-2-okozina@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 29 ++++++++++++++++++++++++++++-
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h | 13 +++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 3fc4e65db111..c18339446ef3 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -83,8 +83,10 @@ struct opal_dev {
 	u16 comid;
 	u32 hsn;
 	u32 tsn;
-	u64 align;
+	u64 align; /* alignment granularity */
 	u64 lowest_lba;
+	u32 logical_block_size;
+	u8  align_required; /* ALIGN: 0 or 1 */
 
 	size_t pos;
 	u8 *cmd;
@@ -409,6 +411,8 @@ static void check_geometry(struct opal_dev *dev, const void *data)
 
 	dev->align = be64_to_cpu(geo->alignment_granularity);
 	dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba);
+	dev->logical_block_size = be32_to_cpu(geo->logical_block_size);
+	dev->align_required = geo->reserved01 & 1;
 }
 
 static int execute_step(struct opal_dev *dev,
@@ -2956,6 +2960,26 @@ static int opal_get_status(struct opal_dev *dev, void __user *data)
 	return 0;
 }
 
+static int opal_get_geometry(struct opal_dev *dev, void __user *data)
+{
+	struct opal_geometry geo = {0};
+
+	if (check_opal_support(dev))
+		return -EINVAL;
+
+	geo.align = dev->align_required;
+	geo.logical_block_size = dev->logical_block_size;
+	geo.alignment_granularity =  dev->align;
+	geo.lowest_aligned_lba = dev->lowest_lba;
+
+	if (copy_to_user(data, &geo, sizeof(geo))) {
+		pr_debug("Error copying geometry data to userspace\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 {
 	void *p;
@@ -3029,6 +3053,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_GET_LR_STATUS:
 		ret = opal_locking_range_status(dev, p, arg);
 		break;
+	case IOC_OPAL_GET_GEOMETRY:
+		ret = opal_get_geometry(dev, arg);
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 042c1e2cb0ce..bbae1e52ab4f 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -46,6 +46,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_GENERIC_TABLE_RW:
 	case IOC_OPAL_GET_STATUS:
 	case IOC_OPAL_GET_LR_STATUS:
+	case IOC_OPAL_GET_GEOMETRY:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 3905c8ffedbf..dc2efd345133 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -161,6 +161,18 @@ struct opal_status {
 	__u32 reserved;
 };
 
+/*
+ * Geometry Reporting per TCG Storage OPAL SSC
+ * section 3.1.1.4
+ */
+struct opal_geometry {
+	__u8 align;
+	__u32 logical_block_size;
+	__u64 alignment_granularity;
+	__u64 lowest_aligned_lba;
+	__u8  __align[3];
+};
+
 #define IOC_OPAL_SAVE		    _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK	    _IOW('p', 221, struct opal_lock_unlock)
 #define IOC_OPAL_TAKE_OWNERSHIP	    _IOW('p', 222, struct opal_key)
@@ -179,5 +191,6 @@ struct opal_status {
 #define IOC_OPAL_GENERIC_TABLE_RW   _IOW('p', 235, struct opal_read_write_table)
 #define IOC_OPAL_GET_STATUS         _IOR('p', 236, struct opal_status)
 #define IOC_OPAL_GET_LR_STATUS      _IOW('p', 237, struct opal_lr_status)
+#define IOC_OPAL_GET_GEOMETRY       _IOR('p', 238, struct opal_geometry)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From 691d0b782066a6eeeecbfceb7910a8f6184e6105 Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Tue, 18 Apr 2023 13:19:02 -0700
Subject: SUNRPC: remove the maximum number of retries in call_bind_status

Currently call_bind_status places a hard limit of 3 to the number of
retries on EACCES error. This limit was done to prevent NLM unlock
requests from being hang forever when the server keeps returning garbage.
However this change causes problem for cases when NLM service takes
longer than 9 seconds to register with the port mapper after a restart.

This patch removes this hard coded limit and let the RPC handles
the retry based on the standard hard/soft task semantics.

Fixes: 0b760113a3a1 ("NLM: Don't hang forever on NLM unlock requests")
Reported-by: Helen Chao <helen.chao@oracle.com>
Tested-by: Helen Chao <helen.chao@oracle.com>
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/sched.h | 3 +--
 net/sunrpc/clnt.c            | 3 ---
 net/sunrpc/sched.c           | 1 -
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index b8ca3ecaf8d7..8ada7dc802d3 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -90,8 +90,7 @@ struct rpc_task {
 #endif
 	unsigned char		tk_priority : 2,/* Task priority */
 				tk_garb_retry : 2,
-				tk_cred_retry : 2,
-				tk_rebind_retry : 2;
+				tk_cred_retry : 2;
 };
 
 typedef void			(*rpc_action)(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fd7e1c630493..d2ee56634308 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2050,9 +2050,6 @@ call_bind_status(struct rpc_task *task)
 			status = -EOPNOTSUPP;
 			break;
 		}
-		if (task->tk_rebind_retry == 0)
-			break;
-		task->tk_rebind_retry--;
 		rpc_delay(task, 3*HZ);
 		goto retry_timeout;
 	case -ENOBUFS:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index be587a308e05..c8321de341ee 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -817,7 +817,6 @@ rpc_init_task_statistics(struct rpc_task *task)
 	/* Initialize retry counters */
 	task->tk_garb_retry = 2;
 	task->tk_cred_retry = 2;
-	task->tk_rebind_retry = 2;
 
 	/* starting timestamp */
 	task->tk_start = ktime_get();
-- 
cgit v1.2.3


From 3b3009ea8abb713b022d94fba95ec270cf6e7eae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 17 Apr 2023 10:32:26 -0400
Subject: net/handshake: Create a NETLINK service for handling handshake
 requests

When a kernel consumer needs a transport layer security session, it
first needs a handshake to negotiate and establish a session. This
negotiation can be done in user space via one of the several
existing library implementations, or it can be done in the kernel.

No in-kernel handshake implementations yet exist. In their absence,
we add a netlink service that can:

a. Notify a user space daemon that a handshake is needed.

b. Once notified, the daemon calls the kernel back via this
   netlink service to get the handshake parameters, including an
   open socket on which to establish the session.

c. Once the handshake is complete, the daemon reports the
   session status and other information via a second netlink
   operation. This operation marks that it is safe for the
   kernel to use the open socket and the security session
   established there.

The notification service uses a multicast group. Each handshake
mechanism (eg, tlshd) adopts its own group number so that the
handshake services are completely independent of one another. The
kernel can then tell via netlink_has_listeners() whether a handshake
service is active and prepared to handle a handshake request.

A new netlink operation, ACCEPT, acts like accept(2) in that it
instantiates a file descriptor in the user space daemon's fd table.
If this operation is successful, the reply carries the fd number,
which can be treated as an open and ready file descriptor.

While user space is performing the handshake, the kernel keeps its
muddy paws off the open socket. A second new netlink operation,
DONE, indicates that the user space daemon is finished with the
socket and it is safe for the kernel to use again. The operation
also indicates whether a session was established successfully.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/handshake.yaml | 122 +++++++++++
 MAINTAINERS                                |   9 +
 include/trace/events/handshake.h           | 159 ++++++++++++++
 include/uapi/linux/handshake.h             |  71 ++++++
 net/Kconfig                                |   5 +
 net/Makefile                               |   1 +
 net/handshake/Makefile                     |  11 +
 net/handshake/genl.c                       |  57 +++++
 net/handshake/genl.h                       |  23 ++
 net/handshake/handshake.h                  |  82 +++++++
 net/handshake/netlink.c                    | 312 ++++++++++++++++++++++++++
 net/handshake/request.c                    | 339 +++++++++++++++++++++++++++++
 net/handshake/trace.c                      |  20 ++
 13 files changed, 1211 insertions(+)
 create mode 100644 Documentation/netlink/specs/handshake.yaml
 create mode 100644 include/trace/events/handshake.h
 create mode 100644 include/uapi/linux/handshake.h
 create mode 100644 net/handshake/Makefile
 create mode 100644 net/handshake/genl.c
 create mode 100644 net/handshake/genl.h
 create mode 100644 net/handshake/handshake.h
 create mode 100644 net/handshake/netlink.c
 create mode 100644 net/handshake/request.c
 create mode 100644 net/handshake/trace.c

(limited to 'include')

diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml
new file mode 100644
index 000000000000..0333d92b1438
--- /dev/null
+++ b/Documentation/netlink/specs/handshake.yaml
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+name: handshake
+
+protocol: genetlink
+
+doc: Netlink protocol to request a transport layer security handshake.
+
+definitions:
+  -
+    type: enum
+    name: handler-class
+    value-start: 0
+    entries: [ none, max ]
+  -
+    type: enum
+    name: msg-type
+    value-start: 0
+    entries: [ unspec, clienthello, serverhello ]
+  -
+    type: enum
+    name: auth
+    value-start: 0
+    entries: [ unspec, unauth, psk, x509 ]
+
+attribute-sets:
+  -
+    name: x509
+    attributes:
+      -
+        name: cert
+        type: u32
+      -
+        name: privkey
+        type: u32
+  -
+    name: accept
+    attributes:
+      -
+        name: sockfd
+        type: u32
+      -
+        name: handler-class
+        type: u32
+        enum: handler-class
+      -
+        name: message-type
+        type: u32
+        enum: msg-type
+      -
+        name: timeout
+        type: u32
+      -
+        name: auth-mode
+        type: u32
+        enum: auth
+      -
+        name: peer-identity
+        type: u32
+        multi-attr: true
+      -
+        name: certificate
+        type: nest
+        nested-attributes: x509
+        multi-attr: true
+  -
+    name: done
+    attributes:
+      -
+        name: status
+        type: u32
+      -
+        name: sockfd
+        type: u32
+      -
+        name: remote-auth
+        type: u32
+        multi-attr: true
+
+operations:
+  list:
+    -
+      name: ready
+      doc: Notify handlers that a new handshake request is waiting
+      notify: accept
+    -
+      name: accept
+      doc: Handler retrieves next queued handshake request
+      attribute-set: accept
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - handler-class
+        reply:
+          attributes:
+            - sockfd
+            - message-type
+            - timeout
+            - auth-mode
+            - peer-identity
+            - certificate
+    -
+      name: done
+      doc: Handler reports handshake completion
+      attribute-set: done
+      do:
+        request:
+          attributes:
+            - status
+            - sockfd
+            - remote-auth
+
+mcast-groups:
+  list:
+    -
+      name: none
diff --git a/MAINTAINERS b/MAINTAINERS
index 4fc57dfd5fd0..cdc7748d15b8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8947,6 +8947,15 @@ Q:	http://patchwork.linuxtv.org/project/linux-media/list/
 T:	git git://linuxtv.org/anttip/media_tree.git
 F:	drivers/media/usb/hackrf/
 
+HANDSHAKE UPCALL FOR TRANSPORT LAYER SECURITY
+M:	Chuck Lever <chuck.lever@oracle.com>
+L:	kernel-tls-handshake@lists.linux.dev
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	Documentation/netlink/specs/handshake.yaml
+F:	include/trace/events/handshake.h
+F:	net/handshake/
+
 HANTRO VPU CODEC DRIVER
 M:	Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
 M:	Philipp Zabel <p.zabel@pengutronix.de>
diff --git a/include/trace/events/handshake.h b/include/trace/events/handshake.h
new file mode 100644
index 000000000000..8dadcab5f12a
--- /dev/null
+++ b/include/trace/events/handshake.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM handshake
+
+#if !defined(_TRACE_HANDSHAKE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HANDSHAKE_H
+
+#include <linux/net.h>
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(handshake_event_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk
+	),
+	TP_ARGS(net, req, sk),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p",
+		__entry->req, __entry->sk
+	)
+);
+#define DEFINE_HANDSHAKE_EVENT(name)				\
+	DEFINE_EVENT(handshake_event_class, name,		\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk			\
+		),						\
+		TP_ARGS(net, req, sk))
+
+DECLARE_EVENT_CLASS(handshake_fd_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int fd
+	),
+	TP_ARGS(net, req, sk, fd),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, fd)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = req->hr_sk;
+		__entry->fd = fd;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p fd=%d",
+		__entry->req, __entry->sk, __entry->fd
+	)
+);
+#define DEFINE_HANDSHAKE_FD_EVENT(name)				\
+	DEFINE_EVENT(handshake_fd_class, name,			\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk,			\
+			int fd					\
+		),						\
+		TP_ARGS(net, req, sk, fd))
+
+DECLARE_EVENT_CLASS(handshake_error_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int err
+	),
+	TP_ARGS(net, req, sk, err),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, err)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->err = err;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p err=%d",
+		__entry->req, __entry->sk, __entry->err
+	)
+);
+#define DEFINE_HANDSHAKE_ERROR(name)				\
+	DEFINE_EVENT(handshake_error_class, name,		\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk,			\
+			int err					\
+		),						\
+		TP_ARGS(net, req, sk, err))
+
+
+/*
+ * Request lifetime events
+ */
+
+DEFINE_HANDSHAKE_EVENT(handshake_submit);
+DEFINE_HANDSHAKE_ERROR(handshake_submit_err);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel_none);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel_busy);
+DEFINE_HANDSHAKE_EVENT(handshake_destruct);
+
+
+TRACE_EVENT(handshake_complete,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int status
+	),
+	TP_ARGS(net, req, sk, status),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, status)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->status = status;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p status=%d",
+		__entry->req, __entry->sk, __entry->status
+	)
+);
+
+/*
+ * Netlink events
+ */
+
+DEFINE_HANDSHAKE_ERROR(handshake_notify_err);
+DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_accept);
+DEFINE_HANDSHAKE_ERROR(handshake_cmd_accept_err);
+DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_done);
+DEFINE_HANDSHAKE_ERROR(handshake_cmd_done_err);
+
+#endif /* _TRACE_HANDSHAKE_H */
+
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
new file mode 100644
index 000000000000..7f66ff489b87
--- /dev/null
+++ b/include/uapi/linux/handshake.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_HANDSHAKE_H
+#define _UAPI_LINUX_HANDSHAKE_H
+
+#define HANDSHAKE_FAMILY_NAME		"handshake"
+#define HANDSHAKE_FAMILY_VERSION	1
+
+enum handshake_handler_class {
+	HANDSHAKE_HANDLER_CLASS_NONE,
+	HANDSHAKE_HANDLER_CLASS_MAX,
+};
+
+enum handshake_msg_type {
+	HANDSHAKE_MSG_TYPE_UNSPEC,
+	HANDSHAKE_MSG_TYPE_CLIENTHELLO,
+	HANDSHAKE_MSG_TYPE_SERVERHELLO,
+};
+
+enum handshake_auth {
+	HANDSHAKE_AUTH_UNSPEC,
+	HANDSHAKE_AUTH_UNAUTH,
+	HANDSHAKE_AUTH_PSK,
+	HANDSHAKE_AUTH_X509,
+};
+
+enum {
+	HANDSHAKE_A_X509_CERT = 1,
+	HANDSHAKE_A_X509_PRIVKEY,
+
+	__HANDSHAKE_A_X509_MAX,
+	HANDSHAKE_A_X509_MAX = (__HANDSHAKE_A_X509_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_A_ACCEPT_SOCKFD = 1,
+	HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+	HANDSHAKE_A_ACCEPT_MESSAGE_TYPE,
+	HANDSHAKE_A_ACCEPT_TIMEOUT,
+	HANDSHAKE_A_ACCEPT_AUTH_MODE,
+	HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+	HANDSHAKE_A_ACCEPT_CERTIFICATE,
+
+	__HANDSHAKE_A_ACCEPT_MAX,
+	HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_A_DONE_STATUS = 1,
+	HANDSHAKE_A_DONE_SOCKFD,
+	HANDSHAKE_A_DONE_REMOTE_AUTH,
+
+	__HANDSHAKE_A_DONE_MAX,
+	HANDSHAKE_A_DONE_MAX = (__HANDSHAKE_A_DONE_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_CMD_READY = 1,
+	HANDSHAKE_CMD_ACCEPT,
+	HANDSHAKE_CMD_DONE,
+
+	__HANDSHAKE_CMD_MAX,
+	HANDSHAKE_CMD_MAX = (__HANDSHAKE_CMD_MAX - 1)
+};
+
+#define HANDSHAKE_MCGRP_NONE	"none"
+
+#endif /* _UAPI_LINUX_HANDSHAKE_H */
diff --git a/net/Kconfig b/net/Kconfig
index f806722bccf4..4b800706cc76 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -68,6 +68,11 @@ source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
 source "net/xdp/Kconfig"
 
+config NET_HANDSHAKE
+	bool
+	depends on SUNRPC || NVME_TARGET_TCP || NVME_TCP
+	default y
+
 config INET
 	bool "TCP/IP networking"
 	help
diff --git a/net/Makefile b/net/Makefile
index 87592009366f..4c4dc535453d 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_NET_NCSI)		+= ncsi/
 obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
 obj-$(CONFIG_MPTCP)		+= mptcp/
 obj-$(CONFIG_MCTP)		+= mctp/
+obj-$(CONFIG_NET_HANDSHAKE)	+= handshake/
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
new file mode 100644
index 000000000000..d38736de45da
--- /dev/null
+++ b/net/handshake/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Generic HANDSHAKE service
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+obj-y += handshake.o
+handshake-y := genl.o netlink.o request.o trace.o
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
new file mode 100644
index 000000000000..652f37d19bd6
--- /dev/null
+++ b/net/handshake/genl.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "genl.h"
+
+#include <linux/handshake.h>
+
+/* HANDSHAKE_CMD_ACCEPT - do */
+static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
+	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
+/* HANDSHAKE_CMD_DONE - do */
+static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
+	[HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+	[HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_U32, },
+	[HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
+};
+
+/* Ops table for handshake */
+static const struct genl_split_ops handshake_nl_ops[] = {
+	{
+		.cmd		= HANDSHAKE_CMD_ACCEPT,
+		.doit		= handshake_nl_accept_doit,
+		.policy		= handshake_accept_nl_policy,
+		.maxattr	= HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= HANDSHAKE_CMD_DONE,
+		.doit		= handshake_nl_done_doit,
+		.policy		= handshake_done_nl_policy,
+		.maxattr	= HANDSHAKE_A_DONE_REMOTE_AUTH,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+};
+
+static const struct genl_multicast_group handshake_nl_mcgrps[] = {
+	[HANDSHAKE_NLGRP_NONE] = { "none", },
+};
+
+struct genl_family handshake_nl_family __ro_after_init = {
+	.name		= HANDSHAKE_FAMILY_NAME,
+	.version	= HANDSHAKE_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= handshake_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(handshake_nl_ops),
+	.mcgrps		= handshake_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(handshake_nl_mcgrps),
+};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
new file mode 100644
index 000000000000..a1eb7ccccc7f
--- /dev/null
+++ b/net/handshake/genl.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_HANDSHAKE_GEN_H
+#define _LINUX_HANDSHAKE_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <linux/handshake.h>
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+	HANDSHAKE_NLGRP_NONE,
+};
+
+extern struct genl_family handshake_nl_family;
+
+#endif /* _LINUX_HANDSHAKE_GEN_H */
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
new file mode 100644
index 000000000000..52568dbe24f1
--- /dev/null
+++ b/net/handshake/handshake.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _INTERNAL_HANDSHAKE_H
+#define _INTERNAL_HANDSHAKE_H
+
+/* Per-net namespace context */
+struct handshake_net {
+	spinlock_t		hn_lock;	/* protects next 3 fields */
+	int			hn_pending;
+	int			hn_pending_max;
+	struct list_head	hn_requests;
+
+	unsigned long		hn_flags;
+};
+
+enum hn_flags_bits {
+	HANDSHAKE_F_NET_DRAINING,
+};
+
+struct handshake_proto;
+
+/* One handshake request */
+struct handshake_req {
+	struct list_head		hr_list;
+	struct rhash_head		hr_rhash;
+	unsigned long			hr_flags;
+	const struct handshake_proto	*hr_proto;
+	struct sock			*hr_sk;
+	void				(*hr_odestruct)(struct sock *sk);
+
+	/* Always the last field */
+	char				hr_priv[];
+};
+
+enum hr_flags_bits {
+	HANDSHAKE_F_REQ_COMPLETED,
+};
+
+/* Invariants for all handshake requests for one transport layer
+ * security protocol
+ */
+struct handshake_proto {
+	int			hp_handler_class;
+	size_t			hp_privsize;
+
+	int			(*hp_accept)(struct handshake_req *req,
+					     struct genl_info *info, int fd);
+	void			(*hp_done)(struct handshake_req *req,
+					   unsigned int status,
+					   struct genl_info *info);
+	void			(*hp_destroy)(struct handshake_req *req);
+};
+
+/* netlink.c */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+			  gfp_t flags);
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+				    struct genl_info *info);
+struct handshake_net *handshake_pernet(struct net *net);
+
+/* request.c */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+					  gfp_t flags);
+int handshake_req_hash_init(void);
+void handshake_req_hash_destroy(void);
+void *handshake_req_private(struct handshake_req *req);
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+			 gfp_t flags);
+void handshake_complete(struct handshake_req *req, unsigned int status,
+			struct genl_info *info);
+bool handshake_req_cancel(struct sock *sk);
+
+#endif /* _INTERNAL_HANDSHAKE_H */
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
new file mode 100644
index 000000000000..7264cac04047
--- /dev/null
+++ b/net/handshake/netlink.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+#include "genl.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * handshake_genl_notify - Notify handlers that a request is waiting
+ * @net: target network namespace
+ * @proto: handshake protocol
+ * @flags: memory allocation control flags
+ *
+ * Returns zero on success or a negative errno if notification failed.
+ */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+			  gfp_t flags)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!genl_has_listeners(&handshake_nl_family, net,
+				proto->hp_handler_class))
+		return -ESRCH;
+
+	msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0,
+			  HANDSHAKE_CMD_READY);
+	if (!hdr)
+		goto out_free;
+
+	if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+			proto->hp_handler_class) < 0) {
+		genlmsg_cancel(msg, hdr);
+		goto out_free;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_multicast_netns(&handshake_nl_family, net, msg,
+				       0, proto->hp_handler_class, flags);
+
+out_free:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+/**
+ * handshake_genl_put - Create a generic netlink message header
+ * @msg: buffer in which to create the header
+ * @info: generic netlink message context
+ *
+ * Returns a ready-to-use header, or NULL.
+ */
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+				    struct genl_info *info)
+{
+	return genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			   &handshake_nl_family, 0, info->genlhdr->cmd);
+}
+EXPORT_SYMBOL(handshake_genl_put);
+
+/*
+ * dup() a kernel socket for use as a user space file descriptor
+ * in the current process. The kernel socket must have an
+ * instatiated struct file.
+ *
+ * Implicit argument: "current()"
+ */
+static int handshake_dup(struct socket *sock)
+{
+	struct file *file;
+	int newfd;
+
+	if (!sock->file)
+		return -EBADF;
+
+	file = get_file(sock->file);
+	newfd = get_unused_fd_flags(O_CLOEXEC);
+	if (newfd < 0) {
+		fput(file);
+		return newfd;
+	}
+
+	fd_install(newfd, file);
+	return newfd;
+}
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct handshake_net *hn = handshake_pernet(net);
+	struct handshake_req *req = NULL;
+	struct socket *sock;
+	int class, fd, err;
+
+	err = -EOPNOTSUPP;
+	if (!hn)
+		goto out_status;
+
+	err = -EINVAL;
+	if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS))
+		goto out_status;
+	class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]);
+
+	err = -EAGAIN;
+	req = handshake_req_next(hn, class);
+	if (!req)
+		goto out_status;
+
+	sock = req->hr_sk->sk_socket;
+	fd = handshake_dup(sock);
+	if (fd < 0) {
+		err = fd;
+		goto out_complete;
+	}
+	err = req->hr_proto->hp_accept(req, info, fd);
+	if (err)
+		goto out_complete;
+
+	trace_handshake_cmd_accept(net, req, req->hr_sk, fd);
+	return 0;
+
+out_complete:
+	handshake_complete(req, -EIO, NULL);
+	fput(sock->file);
+out_status:
+	trace_handshake_cmd_accept_err(net, req, NULL, err);
+	return err;
+}
+
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct socket *sock = NULL;
+	struct handshake_req *req;
+	int fd, status, err;
+
+	if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
+		return -EINVAL;
+	fd = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+
+	err = 0;
+	sock = sockfd_lookup(fd, &err);
+	if (err) {
+		err = -EBADF;
+		goto out_status;
+	}
+
+	req = handshake_req_hash_lookup(sock->sk);
+	if (!req) {
+		err = -EBUSY;
+		fput(sock->file);
+		goto out_status;
+	}
+
+	trace_handshake_cmd_done(net, req, sock->sk, fd);
+
+	status = -EIO;
+	if (info->attrs[HANDSHAKE_A_DONE_STATUS])
+		status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+
+	handshake_complete(req, status, info);
+	fput(sock->file);
+	return 0;
+
+out_status:
+	trace_handshake_cmd_done_err(net, req, sock->sk, err);
+	return err;
+}
+
+static unsigned int handshake_net_id;
+
+static int __net_init handshake_net_init(struct net *net)
+{
+	struct handshake_net *hn = net_generic(net, handshake_net_id);
+	unsigned long tmp;
+	struct sysinfo si;
+
+	/*
+	 * Arbitrary limit to prevent handshakes that do not make
+	 * progress from clogging up the system. The cap scales up
+	 * with the amount of physical memory on the system.
+	 */
+	si_meminfo(&si);
+	tmp = si.totalram / (25 * si.mem_unit);
+	hn->hn_pending_max = clamp(tmp, 3UL, 50UL);
+
+	spin_lock_init(&hn->hn_lock);
+	hn->hn_pending = 0;
+	hn->hn_flags = 0;
+	INIT_LIST_HEAD(&hn->hn_requests);
+	return 0;
+}
+
+static void __net_exit handshake_net_exit(struct net *net)
+{
+	struct handshake_net *hn = net_generic(net, handshake_net_id);
+	struct handshake_req *req;
+	LIST_HEAD(requests);
+
+	/*
+	 * Drain the net's pending list. Requests that have been
+	 * accepted and are in progress will be destroyed when
+	 * the socket is closed.
+	 */
+	spin_lock(&hn->hn_lock);
+	set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
+	list_splice_init(&requests, &hn->hn_requests);
+	spin_unlock(&hn->hn_lock);
+
+	while (!list_empty(&requests)) {
+		req = list_first_entry(&requests, struct handshake_req, hr_list);
+		list_del(&req->hr_list);
+
+		/*
+		 * Requests on this list have not yet been
+		 * accepted, so they do not have an fd to put.
+		 */
+
+		handshake_complete(req, -ETIMEDOUT, NULL);
+	}
+}
+
+static struct pernet_operations __net_initdata handshake_genl_net_ops = {
+	.init		= handshake_net_init,
+	.exit		= handshake_net_exit,
+	.id		= &handshake_net_id,
+	.size		= sizeof(struct handshake_net),
+};
+
+/**
+ * handshake_pernet - Get the handshake private per-net structure
+ * @net: network namespace
+ *
+ * Returns a pointer to the net's private per-net structure for the
+ * handshake module, or NULL if handshake_init() failed.
+ */
+struct handshake_net *handshake_pernet(struct net *net)
+{
+	return handshake_net_id ?
+		net_generic(net, handshake_net_id) : NULL;
+}
+
+static int __init handshake_init(void)
+{
+	int ret;
+
+	ret = handshake_req_hash_init();
+	if (ret) {
+		pr_warn("handshake: hash initialization failed (%d)\n", ret);
+		return ret;
+	}
+
+	ret = genl_register_family(&handshake_nl_family);
+	if (ret) {
+		pr_warn("handshake: netlink registration failed (%d)\n", ret);
+		handshake_req_hash_destroy();
+		return ret;
+	}
+
+	/*
+	 * ORDER: register_pernet_subsys must be done last.
+	 *
+	 *	If initialization does not make it past pernet_subsys
+	 *	registration, then handshake_net_id will remain 0. That
+	 *	shunts the handshake consumer API to return ENOTSUPP
+	 *	to prevent it from dereferencing something that hasn't
+	 *	been allocated.
+	 */
+	ret = register_pernet_subsys(&handshake_genl_net_ops);
+	if (ret) {
+		pr_warn("handshake: pernet registration failed (%d)\n", ret);
+		genl_unregister_family(&handshake_nl_family);
+		handshake_req_hash_destroy();
+	}
+
+	return ret;
+}
+
+static void __exit handshake_exit(void)
+{
+	unregister_pernet_subsys(&handshake_genl_net_ops);
+	handshake_net_id = 0;
+
+	handshake_req_hash_destroy();
+	genl_unregister_family(&handshake_nl_family);
+}
+
+module_init(handshake_init);
+module_exit(handshake_exit);
diff --git a/net/handshake/request.c b/net/handshake/request.c
new file mode 100644
index 000000000000..d5b2bc6de057
--- /dev/null
+++ b/net/handshake/request.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handshake request lifetime events
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/fdtable.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/*
+ * We need both a handshake_req -> sock mapping, and a sock ->
+ * handshake_req mapping. Both are one-to-one.
+ *
+ * To avoid adding another pointer field to struct sock, net/handshake
+ * maintains a hash table, indexed by the memory address of @sock, to
+ * find the struct handshake_req outstanding for that socket. The
+ * reverse direction uses a simple pointer field in the handshake_req
+ * struct.
+ */
+
+static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp;
+
+static const struct rhashtable_params handshake_rhash_params = {
+	.key_len		= sizeof_field(struct handshake_req, hr_sk),
+	.key_offset		= offsetof(struct handshake_req, hr_sk),
+	.head_offset		= offsetof(struct handshake_req, hr_rhash),
+	.automatic_shrinking	= true,
+};
+
+int handshake_req_hash_init(void)
+{
+	return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params);
+}
+
+void handshake_req_hash_destroy(void)
+{
+	rhashtable_destroy(&handshake_rhashtbl);
+}
+
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk)
+{
+	return rhashtable_lookup_fast(&handshake_rhashtbl, &sk,
+				      handshake_rhash_params);
+}
+
+static bool handshake_req_hash_add(struct handshake_req *req)
+{
+	int ret;
+
+	ret = rhashtable_lookup_insert_fast(&handshake_rhashtbl,
+					    &req->hr_rhash,
+					    handshake_rhash_params);
+	return ret == 0;
+}
+
+static void handshake_req_destroy(struct handshake_req *req)
+{
+	if (req->hr_proto->hp_destroy)
+		req->hr_proto->hp_destroy(req);
+	rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash,
+			       handshake_rhash_params);
+	kfree(req);
+}
+
+static void handshake_sk_destruct(struct sock *sk)
+{
+	void (*sk_destruct)(struct sock *sk);
+	struct handshake_req *req;
+
+	req = handshake_req_hash_lookup(sk);
+	if (!req)
+		return;
+
+	trace_handshake_destruct(sock_net(sk), req, sk);
+	sk_destruct = req->hr_odestruct;
+	handshake_req_destroy(req);
+	if (sk_destruct)
+		sk_destruct(sk);
+}
+
+/**
+ * handshake_req_alloc - Allocate a handshake request
+ * @proto: security protocol
+ * @flags: memory allocation flags
+ *
+ * Returns an initialized handshake_req or NULL.
+ */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+					  gfp_t flags)
+{
+	struct handshake_req *req;
+
+	if (!proto)
+		return NULL;
+	if (proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE)
+		return NULL;
+	if (proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX)
+		return NULL;
+	if (!proto->hp_accept || !proto->hp_done)
+		return NULL;
+
+	req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags);
+	if (!req)
+		return NULL;
+
+	INIT_LIST_HEAD(&req->hr_list);
+	req->hr_proto = proto;
+	return req;
+}
+EXPORT_SYMBOL(handshake_req_alloc);
+
+/**
+ * handshake_req_private - Get per-handshake private data
+ * @req: handshake arguments
+ *
+ */
+void *handshake_req_private(struct handshake_req *req)
+{
+	return (void *)&req->hr_priv;
+}
+EXPORT_SYMBOL(handshake_req_private);
+
+static bool __add_pending_locked(struct handshake_net *hn,
+				 struct handshake_req *req)
+{
+	if (WARN_ON_ONCE(!list_empty(&req->hr_list)))
+		return false;
+	hn->hn_pending++;
+	list_add_tail(&req->hr_list, &hn->hn_requests);
+	return true;
+}
+
+static void __remove_pending_locked(struct handshake_net *hn,
+				    struct handshake_req *req)
+{
+	hn->hn_pending--;
+	list_del_init(&req->hr_list);
+}
+
+/*
+ * Returns %true if the request was found on @net's pending list,
+ * otherwise %false.
+ *
+ * If @req was on a pending list, it has not yet been accepted.
+ */
+static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
+{
+	bool ret = false;
+
+	spin_lock(&hn->hn_lock);
+	if (!list_empty(&req->hr_list)) {
+		__remove_pending_locked(hn, req);
+		ret = true;
+	}
+	spin_unlock(&hn->hn_lock);
+
+	return ret;
+}
+
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
+{
+	struct handshake_req *req, *pos;
+
+	req = NULL;
+	spin_lock(&hn->hn_lock);
+	list_for_each_entry(pos, &hn->hn_requests, hr_list) {
+		if (pos->hr_proto->hp_handler_class != class)
+			continue;
+		__remove_pending_locked(hn, pos);
+		req = pos;
+		break;
+	}
+	spin_unlock(&hn->hn_lock);
+
+	return req;
+}
+
+/**
+ * handshake_req_submit - Submit a handshake request
+ * @sock: open socket on which to perform the handshake
+ * @req: handshake arguments
+ * @flags: memory allocation flags
+ *
+ * Return values:
+ *   %0: Request queued
+ *   %-EINVAL: Invalid argument
+ *   %-EBUSY: A handshake is already under way for this socket
+ *   %-ESRCH: No handshake agent is available
+ *   %-EAGAIN: Too many pending handshake requests
+ *   %-ENOMEM: Failed to allocate memory
+ *   %-EMSGSIZE: Failed to construct notification message
+ *   %-EOPNOTSUPP: Handshake module not initialized
+ *
+ * A zero return value from handshake_req_submit() means that
+ * exactly one subsequent completion callback is guaranteed.
+ *
+ * A negative return value from handshake_req_submit() means that
+ * no completion callback will be done and that @req has been
+ * destroyed.
+ */
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+			 gfp_t flags)
+{
+	struct handshake_net *hn;
+	struct net *net;
+	int ret;
+
+	if (!sock || !req || !sock->file) {
+		kfree(req);
+		return -EINVAL;
+	}
+
+	req->hr_sk = sock->sk;
+	if (!req->hr_sk) {
+		kfree(req);
+		return -EINVAL;
+	}
+	req->hr_odestruct = req->hr_sk->sk_destruct;
+	req->hr_sk->sk_destruct = handshake_sk_destruct;
+
+	ret = -EOPNOTSUPP;
+	net = sock_net(req->hr_sk);
+	hn = handshake_pernet(net);
+	if (!hn)
+		goto out_err;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
+		goto out_err;
+
+	spin_lock(&hn->hn_lock);
+	ret = -EOPNOTSUPP;
+	if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
+		goto out_unlock;
+	ret = -EBUSY;
+	if (!handshake_req_hash_add(req))
+		goto out_unlock;
+	if (!__add_pending_locked(hn, req))
+		goto out_unlock;
+	spin_unlock(&hn->hn_lock);
+
+	ret = handshake_genl_notify(net, req->hr_proto, flags);
+	if (ret) {
+		trace_handshake_notify_err(net, req, req->hr_sk, ret);
+		if (remove_pending(hn, req))
+			goto out_err;
+	}
+
+	/* Prevent socket release while a handshake request is pending */
+	sock_hold(req->hr_sk);
+
+	trace_handshake_submit(net, req, req->hr_sk);
+	return 0;
+
+out_unlock:
+	spin_unlock(&hn->hn_lock);
+out_err:
+	trace_handshake_submit_err(net, req, req->hr_sk, ret);
+	handshake_req_destroy(req);
+	return ret;
+}
+EXPORT_SYMBOL(handshake_req_submit);
+
+void handshake_complete(struct handshake_req *req, unsigned int status,
+			struct genl_info *info)
+{
+	struct sock *sk = req->hr_sk;
+	struct net *net = sock_net(sk);
+
+	if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+		trace_handshake_complete(net, req, sk, status);
+		req->hr_proto->hp_done(req, status, info);
+
+		/* Handshake request is no longer pending */
+		sock_put(sk);
+	}
+}
+
+/**
+ * handshake_req_cancel - Cancel an in-progress handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ *   %true - Uncompleted handshake request was canceled
+ *   %false - Handshake request already completed or not found
+ */
+bool handshake_req_cancel(struct sock *sk)
+{
+	struct handshake_req *req;
+	struct handshake_net *hn;
+	struct net *net;
+
+	net = sock_net(sk);
+	req = handshake_req_hash_lookup(sk);
+	if (!req) {
+		trace_handshake_cancel_none(net, req, sk);
+		return false;
+	}
+
+	hn = handshake_pernet(net);
+	if (hn && remove_pending(hn, req)) {
+		/* Request hadn't been accepted */
+		goto out_true;
+	}
+	if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+		/* Request already completed */
+		trace_handshake_cancel_busy(net, req, sk);
+		return false;
+	}
+
+out_true:
+	trace_handshake_cancel(net, req, sk);
+
+	/* Handshake request is no longer pending */
+	sock_put(sk);
+	return true;
+}
+EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/trace.c b/net/handshake/trace.c
new file mode 100644
index 000000000000..1c4d8e27e17a
--- /dev/null
+++ b/net/handshake/trace.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace points for transport security layer handshakes.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "handshake.h"
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/handshake.h>
-- 
cgit v1.2.3


From 2fd5532044a89d2403b543520b4902e196f7d165 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 17 Apr 2023 10:32:33 -0400
Subject: net/handshake: Add a kernel API for requesting a TLSv1.3 handshake

To enable kernel consumers of TLS to request a TLS handshake, add
support to net/handshake/ to request a handshake upcall.

This patch also acts as a template for adding handshake upcall
support for other kernel transport layer security providers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/handshake.yaml |   4 +-
 Documentation/networking/index.rst         |   1 +
 Documentation/networking/tls-handshake.rst | 217 +++++++++++++++
 MAINTAINERS                                |   2 +
 include/net/handshake.h                    |  43 +++
 include/uapi/linux/handshake.h             |   2 +
 net/handshake/Makefile                     |   2 +-
 net/handshake/genl.c                       |   3 +-
 net/handshake/genl.h                       |   1 +
 net/handshake/tlshd.c                      | 417 +++++++++++++++++++++++++++++
 10 files changed, 689 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/tls-handshake.rst
 create mode 100644 include/net/handshake.h
 create mode 100644 net/handshake/tlshd.c

(limited to 'include')

diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml
index 0333d92b1438..614f1a585511 100644
--- a/Documentation/netlink/specs/handshake.yaml
+++ b/Documentation/netlink/specs/handshake.yaml
@@ -16,7 +16,7 @@ definitions:
     type: enum
     name: handler-class
     value-start: 0
-    entries: [ none, max ]
+    entries: [ none, tlshd, max ]
   -
     type: enum
     name: msg-type
@@ -120,3 +120,5 @@ mcast-groups:
   list:
     -
       name: none
+    -
+      name: tlshd
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 24bb256d6d53..a164ff074356 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -36,6 +36,7 @@ Contents:
    scaling
    tls
    tls-offload
+   tls-handshake
    nfc
    6lowpan
    6pack
diff --git a/Documentation/networking/tls-handshake.rst b/Documentation/networking/tls-handshake.rst
new file mode 100644
index 000000000000..a2817a88e905
--- /dev/null
+++ b/Documentation/networking/tls-handshake.rst
@@ -0,0 +1,217 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+In-Kernel TLS Handshake
+=======================
+
+Overview
+========
+
+Transport Layer Security (TLS) is a Upper Layer Protocol (ULP) that runs
+over TCP. TLS provides end-to-end data integrity and confidentiality in
+addition to peer authentication.
+
+The kernel's kTLS implementation handles the TLS record subprotocol, but
+does not handle the TLS handshake subprotocol which is used to establish
+a TLS session. Kernel consumers can use the API described here to
+request TLS session establishment.
+
+There are several possible ways to provide a handshake service in the
+kernel. The API described here is designed to hide the details of those
+implementations so that in-kernel TLS consumers do not need to be
+aware of how the handshake gets done.
+
+
+User handshake agent
+====================
+
+As of this writing, there is no TLS handshake implementation in the
+Linux kernel. To provide a handshake service, a handshake agent
+(typically in user space) is started in each network namespace where a
+kernel consumer might require a TLS handshake. Handshake agents listen
+for events sent from the kernel that indicate a handshake request is
+waiting.
+
+An open socket is passed to a handshake agent via a netlink operation,
+which creates a socket descriptor in the agent's file descriptor table.
+If the handshake completes successfully, the handshake agent promotes
+the socket to use the TLS ULP and sets the session information using the
+SOL_TLS socket options. The handshake agent returns the socket to the
+kernel via a second netlink operation.
+
+
+Kernel Handshake API
+====================
+
+A kernel TLS consumer initiates a client-side TLS handshake on an open
+socket by invoking one of the tls_client_hello() functions. First, it
+fills in a structure that contains the parameters of the request:
+
+.. code-block:: c
+
+  struct tls_handshake_args {
+        struct socket   *ta_sock;
+        tls_done_func_t ta_done;
+        void            *ta_data;
+        unsigned int    ta_timeout_ms;
+        key_serial_t    ta_keyring;
+        key_serial_t    ta_my_cert;
+        key_serial_t    ta_my_privkey;
+        unsigned int    ta_num_peerids;
+        key_serial_t    ta_my_peerids[5];
+  };
+
+The @ta_sock field references an open and connected socket. The consumer
+must hold a reference on the socket to prevent it from being destroyed
+while the handshake is in progress. The consumer must also have
+instantiated a struct file in sock->file.
+
+
+@ta_done contains a callback function that is invoked when the handshake
+has completed. Further explanation of this function is in the "Handshake
+Completion" sesction below.
+
+The consumer can fill in the @ta_timeout_ms field to force the servicing
+handshake agent to exit after a number of milliseconds. This enables the
+socket to be fully closed once both the kernel and the handshake agent
+have closed their endpoints.
+
+Authentication material such as x.509 certificates, private certificate
+keys, and pre-shared keys are provided to the handshake agent in keys
+that are instantiated by the consumer before making the handshake
+request. The consumer can provide a private keyring that is linked into
+the handshake agent's process keyring in the @ta_keyring field to prevent
+access of those keys by other subsystems.
+
+To request an x.509-authenticated TLS session, the consumer fills in
+the @ta_my_cert and @ta_my_privkey fields with the serial numbers of
+keys containing an x.509 certificate and the private key for that
+certificate. Then, it invokes this function:
+
+.. code-block:: c
+
+  ret = tls_client_hello_x509(args, gfp_flags);
+
+The function returns zero when the handshake request is under way. A
+zero return guarantees the callback function @ta_done will be invoked
+for this socket. The function returns a negative errno if the handshake
+could not be started. A negative errno guarantees the callback function
+@ta_done will not be invoked on this socket.
+
+
+To initiate a client-side TLS handshake with a pre-shared key, use:
+
+.. code-block:: c
+
+  ret = tls_client_hello_psk(args, gfp_flags);
+
+However, in this case, the consumer fills in the @ta_my_peerids array
+with serial numbers of keys containing the peer identities it wishes
+to offer, and the @ta_num_peerids field with the number of array
+entries it has filled in. The other fields are filled in as above.
+
+
+To initiate an anonymous client-side TLS handshake use:
+
+.. code-block:: c
+
+  ret = tls_client_hello_anon(args, gfp_flags);
+
+The handshake agent presents no peer identity information to the remote
+during this type of handshake. Only server authentication (ie the client
+verifies the server's identity) is performed during the handshake. Thus
+the established session uses encryption only.
+
+
+Consumers that are in-kernel servers use:
+
+.. code-block:: c
+
+  ret = tls_server_hello_x509(args, gfp_flags);
+
+or
+
+.. code-block:: c
+
+  ret = tls_server_hello_psk(args, gfp_flags);
+
+The argument structure is filled in as above.
+
+
+If the consumer needs to cancel the handshake request, say, due to a ^C
+or other exigent event, the consumer can invoke:
+
+.. code-block:: c
+
+  bool tls_handshake_cancel(sock);
+
+This function returns true if the handshake request associated with
+@sock has been canceled. The consumer's handshake completion callback
+will not be invoked. If this function returns false, then the consumer's
+completion callback has already been invoked.
+
+
+Handshake Completion
+====================
+
+When the handshake agent has completed processing, it notifies the
+kernel that the socket may be used by the consumer again. At this point,
+the consumer's handshake completion callback, provided in the @ta_done
+field in the tls_handshake_args structure, is invoked.
+
+The synopsis of this function is:
+
+.. code-block:: c
+
+  typedef void	(*tls_done_func_t)(void *data, int status,
+                                   key_serial_t peerid);
+
+The consumer provides a cookie in the @ta_data field of the
+tls_handshake_args structure that is returned in the @data parameter of
+this callback. The consumer uses the cookie to match the callback to the
+thread waiting for the handshake to complete.
+
+The success status of the handshake is returned via the @status
+parameter:
+
++------------+----------------------------------------------+
+|  status    |  meaning                                     |
++============+==============================================+
+|  0         |  TLS session established successfully        |
++------------+----------------------------------------------+
+|  -EACCESS  |  Remote peer rejected the handshake or       |
+|            |  authentication failed                       |
++------------+----------------------------------------------+
+|  -ENOMEM   |  Temporary resource allocation failure       |
++------------+----------------------------------------------+
+|  -EINVAL   |  Consumer provided an invalid argument       |
++------------+----------------------------------------------+
+|  -ENOKEY   |  Missing authentication material             |
++------------+----------------------------------------------+
+|  -EIO      |  An unexpected fault occurred                |
++------------+----------------------------------------------+
+
+The @peerid parameter contains the serial number of a key containing the
+remote peer's identity or the value TLS_NO_PEERID if the session is not
+authenticated.
+
+A best practice is to close and destroy the socket immediately if the
+handshake failed.
+
+
+Other considerations
+--------------------
+
+While a handshake is under way, the kernel consumer must alter the
+socket's sk_data_ready callback function to ignore all incoming data.
+Once the handshake completion callback function has been invoked, normal
+receive operation can be resumed.
+
+Once a TLS session is established, the consumer must provide a buffer
+for and then examine the control message (CMSG) that is part of every
+subsequent sock_recvmsg(). Each control message indicates whether the
+received message data is TLS record data or session metadata.
+
+See tls.rst for details on how a kTLS consumer recognizes incoming
+(decrypted) application data, alerts, and handshake packets once the
+socket has been promoted to use the TLS ULP.
diff --git a/MAINTAINERS b/MAINTAINERS
index cdc7748d15b8..04ebde8ccb75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8953,6 +8953,8 @@ L:	kernel-tls-handshake@lists.linux.dev
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/netlink/specs/handshake.yaml
+F:	Documentation/networking/tls-handshake.rst
+F:	include/net/handshake.h
 F:	include/trace/events/handshake.h
 F:	net/handshake/
 
diff --git a/include/net/handshake.h b/include/net/handshake.h
new file mode 100644
index 000000000000..3352b1ab43b3
--- /dev/null
+++ b/include/net/handshake.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink HANDSHAKE service.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _NET_HANDSHAKE_H
+#define _NET_HANDSHAKE_H
+
+enum {
+	TLS_NO_KEYRING = 0,
+	TLS_NO_PEERID = 0,
+	TLS_NO_CERT = 0,
+	TLS_NO_PRIVKEY = 0,
+};
+
+typedef void	(*tls_done_func_t)(void *data, int status,
+				   key_serial_t peerid);
+
+struct tls_handshake_args {
+	struct socket		*ta_sock;
+	tls_done_func_t		ta_done;
+	void			*ta_data;
+	unsigned int		ta_timeout_ms;
+	key_serial_t		ta_keyring;
+	key_serial_t		ta_my_cert;
+	key_serial_t		ta_my_privkey;
+	unsigned int		ta_num_peerids;
+	key_serial_t		ta_my_peerids[5];
+};
+
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags);
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags);
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags);
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags);
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags);
+
+bool tls_handshake_cancel(struct sock *sk);
+
+#endif /* _NET_HANDSHAKE_H */
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
index 7f66ff489b87..1de4d0b95325 100644
--- a/include/uapi/linux/handshake.h
+++ b/include/uapi/linux/handshake.h
@@ -11,6 +11,7 @@
 
 enum handshake_handler_class {
 	HANDSHAKE_HANDLER_CLASS_NONE,
+	HANDSHAKE_HANDLER_CLASS_TLSHD,
 	HANDSHAKE_HANDLER_CLASS_MAX,
 };
 
@@ -67,5 +68,6 @@ enum {
 };
 
 #define HANDSHAKE_MCGRP_NONE	"none"
+#define HANDSHAKE_MCGRP_TLSHD	"tlshd"
 
 #endif /* _UAPI_LINUX_HANDSHAKE_H */
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
index d38736de45da..a089f7e3df24 100644
--- a/net/handshake/Makefile
+++ b/net/handshake/Makefile
@@ -8,4 +8,4 @@
 #
 
 obj-y += handshake.o
-handshake-y := genl.o netlink.o request.o trace.o
+handshake-y := genl.o netlink.o request.o tlshd.o trace.o
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
index 652f37d19bd6..9f29efb1493e 100644
--- a/net/handshake/genl.c
+++ b/net/handshake/genl.c
@@ -12,7 +12,7 @@
 
 /* HANDSHAKE_CMD_ACCEPT - do */
 static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
-	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 1),
+	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 2),
 };
 
 /* HANDSHAKE_CMD_DONE - do */
@@ -42,6 +42,7 @@ static const struct genl_split_ops handshake_nl_ops[] = {
 
 static const struct genl_multicast_group handshake_nl_mcgrps[] = {
 	[HANDSHAKE_NLGRP_NONE] = { "none", },
+	[HANDSHAKE_NLGRP_TLSHD] = { "tlshd", },
 };
 
 struct genl_family handshake_nl_family __ro_after_init = {
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
index a1eb7ccccc7f..2c1f1aa6a02a 100644
--- a/net/handshake/genl.h
+++ b/net/handshake/genl.h
@@ -16,6 +16,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	HANDSHAKE_NLGRP_NONE,
+	HANDSHAKE_NLGRP_TLSHD,
 };
 
 extern struct genl_family handshake_nl_family;
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
new file mode 100644
index 000000000000..1b8353296060
--- /dev/null
+++ b/net/handshake/tlshd.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Establish a TLS session for a kernel socket consumer
+ * using the tlshd user space handler.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2021-2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/keyctl.h>
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+struct tls_handshake_req {
+	void			(*th_consumer_done)(void *data, int status,
+						    key_serial_t peerid);
+	void			*th_consumer_data;
+
+	int			th_type;
+	unsigned int		th_timeout_ms;
+	int			th_auth_mode;
+	key_serial_t		th_keyring;
+	key_serial_t		th_certificate;
+	key_serial_t		th_privkey;
+
+	unsigned int		th_num_peerids;
+	key_serial_t		th_peerid[5];
+};
+
+static struct tls_handshake_req *
+tls_handshake_req_init(struct handshake_req *req,
+		       const struct tls_handshake_args *args)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+
+	treq->th_timeout_ms = args->ta_timeout_ms;
+	treq->th_consumer_done = args->ta_done;
+	treq->th_consumer_data = args->ta_data;
+	treq->th_keyring = args->ta_keyring;
+	treq->th_num_peerids = 0;
+	treq->th_certificate = TLS_NO_CERT;
+	treq->th_privkey = TLS_NO_PRIVKEY;
+	return treq;
+}
+
+static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
+					 struct genl_info *info)
+{
+	struct nlattr *head = nlmsg_attrdata(info->nlhdr, GENL_HDRLEN);
+	int rem, len = nlmsg_attrlen(info->nlhdr, GENL_HDRLEN);
+	struct nlattr *nla;
+	unsigned int i;
+
+	i = 0;
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+			i++;
+	}
+	if (!i)
+		return;
+	treq->th_num_peerids = min_t(unsigned int, i,
+				     ARRAY_SIZE(treq->th_peerid));
+
+	i = 0;
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+			treq->th_peerid[i++] = nla_get_u32(nla);
+		if (i >= treq->th_num_peerids)
+			break;
+	}
+}
+
+/**
+ * tls_handshake_done - callback to handle a CMD_DONE request
+ * @req: socket on which the handshake was performed
+ * @status: session status code
+ * @info: full results of session establishment
+ *
+ */
+static void tls_handshake_done(struct handshake_req *req,
+			       unsigned int status, struct genl_info *info)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+
+	treq->th_peerid[0] = TLS_NO_PEERID;
+	if (info)
+		tls_handshake_remote_peerids(treq, info);
+
+	treq->th_consumer_done(treq->th_consumer_data, -status,
+			       treq->th_peerid[0]);
+}
+
+#if IS_ENABLED(CONFIG_KEYS)
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+	key_ref_t process_keyring_ref, keyring_ref;
+	int ret;
+
+	if (treq->th_keyring == TLS_NO_KEYRING)
+		return 0;
+
+	process_keyring_ref = lookup_user_key(KEY_SPEC_PROCESS_KEYRING,
+					      KEY_LOOKUP_CREATE,
+					      KEY_NEED_WRITE);
+	if (IS_ERR(process_keyring_ref)) {
+		ret = PTR_ERR(process_keyring_ref);
+		goto out;
+	}
+
+	keyring_ref = lookup_user_key(treq->th_keyring, KEY_LOOKUP_CREATE,
+				      KEY_NEED_LINK);
+	if (IS_ERR(keyring_ref)) {
+		ret = PTR_ERR(keyring_ref);
+		goto out_put_key;
+	}
+
+	ret = key_link(key_ref_to_ptr(process_keyring_ref),
+		       key_ref_to_ptr(keyring_ref));
+
+	key_ref_put(keyring_ref);
+out_put_key:
+	key_ref_put(process_keyring_ref);
+out:
+	return ret;
+}
+#else
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+	return 0;
+}
+#endif
+
+static int tls_handshake_put_peer_identity(struct sk_buff *msg,
+					   struct tls_handshake_req *treq)
+{
+	unsigned int i;
+
+	for (i = 0; i < treq->th_num_peerids; i++)
+		if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+				treq->th_peerid[i]) < 0)
+			return -EMSGSIZE;
+	return 0;
+}
+
+static int tls_handshake_put_certificate(struct sk_buff *msg,
+					 struct tls_handshake_req *treq)
+{
+	struct nlattr *entry_attr;
+
+	if (treq->th_certificate == TLS_NO_CERT &&
+	    treq->th_privkey == TLS_NO_PRIVKEY)
+		return 0;
+
+	entry_attr = nla_nest_start(msg, HANDSHAKE_A_ACCEPT_CERTIFICATE);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, HANDSHAKE_A_X509_CERT,
+			treq->th_certificate) ||
+	    nla_put_u32(msg, HANDSHAKE_A_X509_PRIVKEY,
+			treq->th_privkey)) {
+		nla_nest_cancel(msg, entry_attr);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+}
+
+/**
+ * tls_handshake_accept - callback to construct a CMD_ACCEPT response
+ * @req: handshake parameters to return
+ * @info: generic netlink message context
+ * @fd: file descriptor to be returned
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int tls_handshake_accept(struct handshake_req *req,
+				struct genl_info *info, int fd)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+	struct nlmsghdr *hdr;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = tls_handshake_private_keyring(treq);
+	if (ret < 0)
+		goto out;
+
+	ret = -ENOMEM;
+	msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		goto out;
+	hdr = handshake_genl_put(msg, info);
+	if (!hdr)
+		goto out_cancel;
+
+	ret = -EMSGSIZE;
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+	if (ret < 0)
+		goto out_cancel;
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
+	if (ret < 0)
+		goto out_cancel;
+	if (treq->th_timeout_ms) {
+		ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms);
+		if (ret < 0)
+			goto out_cancel;
+	}
+
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE,
+			  treq->th_auth_mode);
+	if (ret < 0)
+		goto out_cancel;
+	switch (treq->th_auth_mode) {
+	case HANDSHAKE_AUTH_PSK:
+		ret = tls_handshake_put_peer_identity(msg, treq);
+		if (ret < 0)
+			goto out_cancel;
+		break;
+	case HANDSHAKE_AUTH_X509:
+		ret = tls_handshake_put_certificate(msg, treq);
+		if (ret < 0)
+			goto out_cancel;
+		break;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+out_cancel:
+	genlmsg_cancel(msg, hdr);
+out:
+	return ret;
+}
+
+static const struct handshake_proto tls_handshake_proto = {
+	.hp_handler_class	= HANDSHAKE_HANDLER_CLASS_TLSHD,
+	.hp_privsize		= sizeof(struct tls_handshake_req),
+
+	.hp_accept		= tls_handshake_accept,
+	.hp_done		= tls_handshake_done,
+};
+
+/**
+ * tls_client_hello_anon - request an anonymous TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueue; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_UNAUTH;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_anon);
+
+/**
+ * tls_client_hello_x509 - request an x.509-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueue; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+	treq->th_certificate = args->ta_my_cert;
+	treq->th_privkey = args->ta_my_privkey;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_x509);
+
+/**
+ * tls_client_hello_psk - request a PSK-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueue; ->done will be called when complete
+ *   %-EINVAL: Wrong number of local peer IDs
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+	unsigned int i;
+
+	if (!args->ta_num_peerids ||
+	    args->ta_num_peerids > ARRAY_SIZE(treq->th_peerid))
+		return -EINVAL;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+	treq->th_num_peerids = args->ta_num_peerids;
+	for (i = 0; i < args->ta_num_peerids; i++)
+		treq->th_peerid[i] = args->ta_my_peerids[i];
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_psk);
+
+/**
+ * tls_server_hello_x509 - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueue; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+	treq->th_certificate = args->ta_my_cert;
+	treq->th_privkey = args->ta_my_privkey;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_x509);
+
+/**
+ * tls_server_hello_psk - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueue; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+	treq->th_num_peerids = 1;
+	treq->th_peerid[0] = args->ta_my_peerids[0];
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_psk);
+
+/**
+ * tls_handshake_cancel - cancel a pending handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ *   %true - Uncompleted handshake request was canceled
+ *   %false - Handshake request already completed or not found
+ */
+bool tls_handshake_cancel(struct sock *sk)
+{
+	return handshake_req_cancel(sk);
+}
+EXPORT_SYMBOL(tls_handshake_cancel);
-- 
cgit v1.2.3


From f52cc627b832e08a7bcf1b7e81e650ec308fe1d8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 13 Apr 2023 15:25:47 -0700
Subject: Revert "net/mlx5: Enable management PF initialization"

This reverts commit fe998a3c77b9f989a30a2a01fb00d3729a6d53a4.

Paul reports that it causes a regression with IB on CX4
and FW 12.18.1000. In addition I think that the concept
of "management PF" is not fully accepted and requires
a discussion.

Fixes: fe998a3c77b9 ("net/mlx5: Enable management PF initialization")
Reported-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/all/CAHC9VhQ7A4+msL38WpbOMYjAqLp0EtOjeLh4Dc6SQtD6OUvCQg@mail.gmail.com/
Link: https://lore.kernel.org/r/20230413222547.56901-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c     | 6 ------
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.c    | 8 --------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +-
 include/linux/mlx5/driver.h                       | 5 -----
 4 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 445fe30c3d0b..2e7806001fdc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -59,9 +59,6 @@ bool mlx5_eth_supported(struct mlx5_core_dev *dev)
 	if (!IS_ENABLED(CONFIG_MLX5_CORE_EN))
 		return false;
 
-	if (mlx5_core_is_management_pf(dev))
-		return false;
-
 	if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return false;
 
@@ -201,9 +198,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev)
 	if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND))
 		return false;
 
-	if (mlx5_core_is_management_pf(dev))
-		return false;
-
 	if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV)
 		return false;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
index 7c9c4e40c019..d000236ddbac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
@@ -75,10 +75,6 @@ int mlx5_ec_init(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return 0;
 
-	/* Management PF don't have a peer PF */
-	if (mlx5_core_is_management_pf(dev))
-		return 0;
-
 	return mlx5_host_pf_init(dev);
 }
 
@@ -89,10 +85,6 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return;
 
-	/* Management PF don't have a peer PF */
-	if (mlx5_core_is_management_pf(dev))
-		return;
-
 	mlx5_host_pf_cleanup(dev);
 
 	err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 8bdf28762f41..19fed514fc17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1488,7 +1488,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *
 	void *hca_caps;
 	int err;
 
-	if (!mlx5_core_is_ecpf(dev) || mlx5_core_is_management_pf(dev)) {
+	if (!mlx5_core_is_ecpf(dev)) {
 		*max_sfs = 0;
 		return 0;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209..7e225e41d55b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1211,11 +1211,6 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev)
 	return dev->coredev_type == MLX5_COREDEV_VF;
 }
 
-static inline bool mlx5_core_is_management_pf(const struct mlx5_core_dev *dev)
-{
-	return MLX5_CAP_GEN(dev, num_ports) == 1 && !MLX5_CAP_GEN(dev, native_port_num);
-}
-
 static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
 {
 	return dev->caps.embedded_cpu;
-- 
cgit v1.2.3


From 519fe1bae7e20fc4e7f179d50b6102b49980e85d Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Sun, 2 Apr 2023 11:37:42 +0900
Subject: ext4: Add a uapi header for ext4 userspace APIs

Create a uapi header include/uapi/linux/ext4.h, move the ioctls and
associated data structures to the uapi header, and include it from
fs/ext4/ext4.h.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Link: https://lore.kernel.org/r/680175260970d977d16b5cc7e7606483ec99eb63.1680402881.git.josh@joshtriplett.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 MAINTAINERS               |   1 +
 fs/ext4/ext4.h            |  91 +-----------------------------------
 include/uapi/linux/ext4.h | 117 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 90 deletions(-)
 create mode 100644 include/uapi/linux/ext4.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index d8ebab595b2a..364719171ba7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7728,6 +7728,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git
 F:	Documentation/filesystems/ext4/
 F:	fs/ext4/
 F:	include/trace/events/ext4.h
+F:	include/uapi/linux/ext4.h
 
 Extended Verification Module (EVM)
 M:	Mimi Zohar <zohar@linux.ibm.com>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 271d9661ce82..18cb2680dc39 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,6 +40,7 @@
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
+#include <uapi/linux/ext4.h>
 
 #include <linux/fscrypt.h>
 #include <linux/fsverity.h>
@@ -591,17 +592,6 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(RESERVED);
 }
 
-/* Used to pass group descriptor data when online resize is done */
-struct ext4_new_group_input {
-	__u32 group;		/* Group number for this data */
-	__u64 block_bitmap;	/* Absolute block number of block bitmap */
-	__u64 inode_bitmap;	/* Absolute block number of inode bitmap */
-	__u64 inode_table;	/* Absolute block number of inode table start */
-	__u32 blocks_count;	/* Total number of blocks in this group */
-	__u16 reserved_blocks;	/* Number of reserved blocks in this group */
-	__u16 unused;
-};
-
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 struct compat_ext4_new_group_input {
 	u32 group;
@@ -698,70 +688,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040
 
-/*
- * ioctl commands
- */
-#define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
-#define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
-#define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
-#define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
-#define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
-#define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
-#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
-#define EXT4_IOC_MIGRATE		_IO('f', 9)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
-#define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
-#define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
-#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
-#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
-/* ioctl codes 19--39 are reserved for fscrypt */
-#define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
-#define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
-#define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
-#define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
-#define EXT4_IOC_GETFSUUID		_IOR('f', 44, struct fsuuid)
-#define EXT4_IOC_SETFSUUID		_IOW('f', 44, struct fsuuid)
-
-#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
-
-/*
- * Flags for going down operation
- */
-#define EXT4_GOING_FLAGS_DEFAULT		0x0	/* going down */
-#define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
-#define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
-
-/*
- * Flags returned by EXT4_IOC_GETSTATE
- *
- * We only expose to userspace a subset of the state flags in
- * i_state_flags
- */
-#define EXT4_STATE_FLAG_EXT_PRECACHED	0x00000001
-#define EXT4_STATE_FLAG_NEW		0x00000002
-#define EXT4_STATE_FLAG_NEWENTRY	0x00000004
-#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008
-
-/* flags for ioctl EXT4_IOC_CHECKPOINT */
-#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD	0x1
-#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT	0x2
-#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN	0x4
-#define EXT4_IOC_CHECKPOINT_FLAG_VALID		(EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
-						EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
-						EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
-
-/*
- * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
- */
-struct fsuuid {
-	__u32       fsu_len;
-	__u32       fsu_flags;
-	__u8        fsu_uuid[];
-};
-
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -776,12 +702,6 @@ struct fsuuid {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-/*
- * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
- * It indicates that the entry in extent status cache is for a hole.
- */
-#define EXT4_FIEMAP_EXTENT_HOLE		0x08000000
-
 /* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
@@ -852,15 +772,6 @@ struct ext4_inode {
 	__le32	i_projid;	/* Project ID */
 };
 
-struct move_extent {
-	__u32 reserved;		/* should be zero */
-	__u32 donor_fd;		/* donor file descriptor */
-	__u64 orig_start;	/* logical start offset in block for orig */
-	__u64 donor_start;	/* logical start offset in block for donor */
-	__u64 len;		/* block length to be moved */
-	__u64 moved_len;	/* moved block length */
-};
-
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h
new file mode 100644
index 000000000000..1c4c2dd29112
--- /dev/null
+++ b/include/uapi/linux/ext4.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_EXT4_H
+#define _UAPI_LINUX_EXT4_H
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * ext4-specific ioctl commands
+ */
+#define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
+#define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
+#define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
+#define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
+#define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
+#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
+#define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
+#define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
+#define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
+#define EXT4_IOC_GETFSUUID		_IOR('f', 44, struct fsuuid)
+#define EXT4_IOC_SETFSUUID		_IOW('f', 44, struct fsuuid)
+
+#define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT4_IOC32_GETVERSION		_IOR('f', 3, int)
+#define EXT4_IOC32_SETVERSION		_IOW('f', 4, int)
+#define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
+#define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
+#define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
+#define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
+#define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED	0x00000001
+#define EXT4_STATE_FLAG_NEW		0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY	0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008
+
+/*
+ * Flags for ioctl EXT4_IOC_CHECKPOINT
+ */
+#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD	0x1
+#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT	0x2
+#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN	0x4
+#define EXT4_IOC_CHECKPOINT_FLAG_VALID		(EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
+						EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
+						EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+
+/*
+ * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
+ */
+struct fsuuid {
+	__u32       fsu_len;
+	__u32       fsu_flags;
+	__u8        fsu_uuid[];
+};
+
+/*
+ * Structure for EXT4_IOC_MOVE_EXT
+ */
+struct move_extent {
+	__u32 reserved;		/* should be zero */
+	__u32 donor_fd;		/* donor file descriptor */
+	__u64 orig_start;	/* logical start offset in block for orig */
+	__u64 donor_start;	/* logical start offset in block for donor */
+	__u64 len;		/* block length to be moved */
+	__u64 moved_len;	/* moved block length */
+};
+
+/*
+ * Flags used by EXT4_IOC_SHUTDOWN
+ */
+#define EXT4_GOING_FLAGS_DEFAULT		0x0	/* going down */
+#define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
+#define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext4_new_group_input {
+	__u32 group;		/* Group number for this data */
+	__u64 block_bitmap;	/* Absolute block number of block bitmap */
+	__u64 inode_bitmap;	/* Absolute block number of inode bitmap */
+	__u64 inode_table;	/* Absolute block number of inode table start */
+	__u32 blocks_count;	/* Total number of blocks in this group */
+	__u16 reserved_blocks;	/* Number of reserved blocks in this group */
+	__u16 unused;
+};
+
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE		0x08000000
+
+#endif /* _UAPI_LINUX_EXT4_H */
-- 
cgit v1.2.3


From 13890626501ffda22b18213ddaf7930473da5792 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 10 Apr 2023 15:37:07 -0400
Subject: USB: core: Add routines for endpoint checks in old drivers

Many of the older USB drivers in the Linux USB stack were written
based simply on a vendor's device specification.  They use the
endpoint information in the spec and assume these endpoints will
always be present, with the properties listed, in any device matching
the given vendor and product IDs.

While that may have been true back then, with spoofing and fuzzing it
is not true any more.  More and more we are finding that those old
drivers need to perform at least a minimum of checking before they try
to use any endpoint other than ep0.

To make this checking as simple as possible, we now add a couple of
utility routines to the USB core.  usb_check_bulk_endpoints() and
usb_check_int_endpoints() take an interface pointer together with a
list of endpoint addresses (numbers and directions).  They check that
the interface's current alternate setting includes endpoints with
those addresses and that each of these endpoints has the right type:
bulk or interrupt, respectively.

Although we already have usb_find_common_endpoints() and related
routines meant for a similar purpose, they are not well suited for
this kind of checking.  Those routines find endpoints of various
kinds, but only one (either the first or the last) of each kind, and
they don't verify that the endpoints' addresses agree with what the
caller expects.

In theory the new routines could be more general: They could take a
particular altsetting as their argument instead of always using the
interface's current altsetting.  In practice I think this won't matter
too much; multiple altsettings tend to be used for transferring media
(audio or visual) over isochronous endpoints, not bulk or interrupt.
Drivers for such devices will generally require more sophisticated
checking than these simplistic routines provide.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/dd2c8e8c-2c87-44ea-ba17-c64b97e201c9@rowland.harvard.edu
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/usb.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb.h    |  5 ++++
 2 files changed, 81 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 34742fbbd84d..901ec732321c 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -206,6 +206,82 @@ int usb_find_common_endpoints_reverse(struct usb_host_interface *alt,
 }
 EXPORT_SYMBOL_GPL(usb_find_common_endpoints_reverse);
 
+/**
+ * usb_find_endpoint() - Given an endpoint address, search for the endpoint's
+ * usb_host_endpoint structure in an interface's current altsetting.
+ * @intf: the interface whose current altsetting should be searched
+ * @ep_addr: the endpoint address (number and direction) to find
+ *
+ * Search the altsetting's list of endpoints for one with the specified address.
+ *
+ * Return: Pointer to the usb_host_endpoint if found, %NULL otherwise.
+ */
+static const struct usb_host_endpoint *usb_find_endpoint(
+		const struct usb_interface *intf, unsigned int ep_addr)
+{
+	int n;
+	const struct usb_host_endpoint *ep;
+
+	n = intf->cur_altsetting->desc.bNumEndpoints;
+	ep = intf->cur_altsetting->endpoint;
+	for (; n > 0; (--n, ++ep)) {
+		if (ep->desc.bEndpointAddress == ep_addr)
+			return ep;
+	}
+	return NULL;
+}
+
+/**
+ * usb_check_bulk_endpoints - Check whether an interface's current altsetting
+ * contains a set of bulk endpoints with the given addresses.
+ * @intf: the interface whose current altsetting should be searched
+ * @ep_addrs: 0-terminated array of the endpoint addresses (number and
+ * direction) to look for
+ *
+ * Search for endpoints with the specified addresses and check their types.
+ *
+ * Return: %true if all the endpoints are found and are bulk, %false otherwise.
+ */
+bool usb_check_bulk_endpoints(
+		const struct usb_interface *intf, const u8 *ep_addrs)
+{
+	const struct usb_host_endpoint *ep;
+
+	for (; *ep_addrs; ++ep_addrs) {
+		ep = usb_find_endpoint(intf, *ep_addrs);
+		if (!ep || !usb_endpoint_xfer_bulk(&ep->desc))
+			return false;
+	}
+	return true;
+}
+EXPORT_SYMBOL_GPL(usb_check_bulk_endpoints);
+
+/**
+ * usb_check_int_endpoints - Check whether an interface's current altsetting
+ * contains a set of interrupt endpoints with the given addresses.
+ * @intf: the interface whose current altsetting should be searched
+ * @ep_addrs: 0-terminated array of the endpoint addresses (number and
+ * direction) to look for
+ *
+ * Search for endpoints with the specified addresses and check their types.
+ *
+ * Return: %true if all the endpoints are found and are interrupt,
+ * %false otherwise.
+ */
+bool usb_check_int_endpoints(
+		const struct usb_interface *intf, const u8 *ep_addrs)
+{
+	const struct usb_host_endpoint *ep;
+
+	for (; *ep_addrs; ++ep_addrs) {
+		ep = usb_find_endpoint(intf, *ep_addrs);
+		if (!ep || !usb_endpoint_xfer_int(&ep->desc))
+			return false;
+	}
+	return true;
+}
+EXPORT_SYMBOL_GPL(usb_check_int_endpoints);
+
 /**
  * usb_find_alt_setting() - Given a configuration, find the alternate setting
  * for the given interface.
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d510fabcafa2..4ae0466c846c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -291,6 +291,11 @@ void usb_put_intf(struct usb_interface *intf);
 #define USB_MAXINTERFACES	32
 #define USB_MAXIADS		(USB_MAXINTERFACES/2)
 
+bool usb_check_bulk_endpoints(
+		const struct usb_interface *intf, const u8 *ep_addrs);
+bool usb_check_int_endpoints(
+		const struct usb_interface *intf, const u8 *ep_addrs);
+
 /*
  * USB Resume Timer: Every Host controller driver should drive the resume
  * signalling on the bus for the amount of time defined by this macro.
-- 
cgit v1.2.3


From ae131f4970f0778f35ed06aeb15bde2fbc1d9619 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Apr 2023 14:24:15 +0800
Subject: crypto: api - Add crypto_tfm_get

Add a crypto_tfm_get interface to allow tfm objects to be shared.
They can still be freed in the usual way.

This should only be done with tfm objects with no keys.  You must
also not modify the tfm flags in any way once it becomes shared.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 4 ++++
 crypto/internal.h      | 6 ++++++
 include/linux/crypto.h | 1 +
 3 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/crypto/api.c b/crypto/api.c
index e67cc63368ed..f509d73fa682 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -408,6 +408,7 @@ struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
 		goto out_err;
 
 	tfm->__crt_alg = alg;
+	refcount_set(&tfm->refcnt, 1);
 
 	err = crypto_init_ops(tfm, type, mask);
 	if (err)
@@ -507,6 +508,7 @@ void *crypto_create_tfm_node(struct crypto_alg *alg,
 	tfm = (struct crypto_tfm *)(mem + tfmsize);
 	tfm->__crt_alg = alg;
 	tfm->node = node;
+	refcount_set(&tfm->refcnt, 1);
 
 	err = frontend->init_tfm(tfm);
 	if (err)
@@ -619,6 +621,8 @@ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
 	if (IS_ERR_OR_NULL(mem))
 		return;
 
+	if (!refcount_dec_and_test(&tfm->refcnt))
+		return;
 	alg = tfm->__crt_alg;
 
 	if (!tfm->exit && alg->cra_exit)
diff --git a/crypto/internal.h b/crypto/internal.h
index f84dfe6491e5..5eee009ee494 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -10,6 +10,7 @@
 
 #include <crypto/algapi.h>
 #include <linux/completion.h>
+#include <linux/err.h>
 #include <linux/jump_label.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -186,5 +187,10 @@ static inline int crypto_is_test_larval(struct crypto_larval *larval)
 	return larval->alg.cra_driver_name[0];
 }
 
+static inline struct crypto_tfm *crypto_tfm_get(struct crypto_tfm *tfm)
+{
+	return refcount_inc_not_zero(&tfm->refcnt) ? tfm : ERR_PTR(-EOVERFLOW);
+}
+
 #endif	/* _CRYPTO_INTERNAL_H */
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index fdfa3e8eda43..fa310ac1db59 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -419,6 +419,7 @@ int crypto_has_alg(const char *name, u32 type, u32 mask);
  */
 
 struct crypto_tfm {
+	refcount_t refcnt;
 
 	u32 crt_flags;
 
-- 
cgit v1.2.3


From ed3630b83e9394acef27041de7a2223f1e875e9a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Apr 2023 14:24:19 +0800
Subject: crypto: hash - Add crypto_clone_ahash/shash

This patch adds the helpers crypto_clone_ahash and crypto_clone_shash.
They are the hash-specific counterparts of crypto_clone_tfm.

This allows code paths that cannot otherwise allocate a hash tfm
object to do so.  Once a new tfm has been obtained its key could
then be changed without impacting other users.

Note that only algorithms that implement clone_tfm can be cloned.
However, all keyless hashes can be cloned by simply reusing the
tfm object.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c                 | 51 +++++++++++++++++++++++++++++++++++++++++
 crypto/hash.h                  |  4 ++++
 crypto/shash.c                 | 52 ++++++++++++++++++++++++++++++++++++++++++
 include/crypto/hash.h          |  8 +++++++
 include/crypto/internal/hash.h |  2 --
 5 files changed, 115 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 2d858d7fd1bb..b8a607928e72 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -543,6 +543,57 @@ int crypto_has_ahash(const char *alg_name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_has_ahash);
 
+struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *hash)
+{
+	struct hash_alg_common *halg = crypto_hash_alg_common(hash);
+	struct crypto_tfm *tfm = crypto_ahash_tfm(hash);
+	struct crypto_ahash *nhash;
+	struct ahash_alg *alg;
+	int err;
+
+	if (!crypto_hash_alg_has_setkey(halg)) {
+		tfm = crypto_tfm_get(tfm);
+		if (IS_ERR(tfm))
+			return ERR_CAST(tfm);
+
+		return hash;
+	}
+
+	nhash = crypto_clone_tfm(&crypto_ahash_type, tfm);
+
+	if (IS_ERR(nhash))
+		return nhash;
+
+	nhash->init = hash->init;
+	nhash->update = hash->update;
+	nhash->final = hash->final;
+	nhash->finup = hash->finup;
+	nhash->digest = hash->digest;
+	nhash->export = hash->export;
+	nhash->import = hash->import;
+	nhash->setkey = hash->setkey;
+	nhash->reqsize = hash->reqsize;
+
+	if (tfm->__crt_alg->cra_type != &crypto_ahash_type)
+		return crypto_clone_shash_ops_async(nhash, hash);
+
+	err = -ENOSYS;
+	alg = crypto_ahash_alg(hash);
+	if (!alg->clone_tfm)
+		goto out_free_nhash;
+
+	err = alg->clone_tfm(nhash, hash);
+	if (err)
+		goto out_free_nhash;
+
+	return nhash;
+
+out_free_nhash:
+	crypto_free_ahash(nhash);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(crypto_clone_ahash);
+
 static int ahash_prepare_alg(struct ahash_alg *alg)
 {
 	struct crypto_alg *base = &alg->halg.base;
diff --git a/crypto/hash.h b/crypto/hash.h
index 57b28a986d69..7e6c1a948692 100644
--- a/crypto/hash.h
+++ b/crypto/hash.h
@@ -31,6 +31,10 @@ static inline int crypto_hash_report_stat(struct sk_buff *skb,
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
 
+int crypto_init_shash_ops_async(struct crypto_tfm *tfm);
+struct crypto_ahash *crypto_clone_shash_ops_async(struct crypto_ahash *nhash,
+						  struct crypto_ahash *hash);
+
 int hash_prepare_alg(struct hash_alg_common *alg);
 
 #endif	/* _LOCAL_CRYPTO_HASH_H */
diff --git a/crypto/shash.c b/crypto/shash.c
index 4cefa614dbbd..5845b7d59b2f 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -445,6 +445,24 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
 	return 0;
 }
 
+struct crypto_ahash *crypto_clone_shash_ops_async(struct crypto_ahash *nhash,
+						  struct crypto_ahash *hash)
+{
+	struct crypto_shash **nctx = crypto_ahash_ctx(nhash);
+	struct crypto_shash **ctx = crypto_ahash_ctx(hash);
+	struct crypto_shash *shash;
+
+	shash = crypto_clone_shash(*ctx);
+	if (IS_ERR(shash)) {
+		crypto_free_ahash(nhash);
+		return ERR_CAST(shash);
+	}
+
+	*nctx = shash;
+
+	return nhash;
+}
+
 static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_shash *hash = __crypto_shash_cast(tfm);
@@ -564,6 +582,40 @@ int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_has_shash);
 
+struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash)
+{
+	struct crypto_tfm *tfm = crypto_shash_tfm(hash);
+	struct shash_alg *alg = crypto_shash_alg(hash);
+	struct crypto_shash *nhash;
+	int err;
+
+	if (!crypto_shash_alg_has_setkey(alg)) {
+		tfm = crypto_tfm_get(tfm);
+		if (IS_ERR(tfm))
+			return ERR_CAST(tfm);
+
+		return hash;
+	}
+
+	if (!alg->clone_tfm)
+		return ERR_PTR(-ENOSYS);
+
+	nhash = crypto_clone_tfm(&crypto_shash_type, tfm);
+	if (IS_ERR(nhash))
+		return nhash;
+
+	nhash->descsize = hash->descsize;
+
+	err = alg->clone_tfm(nhash, hash);
+	if (err) {
+		crypto_free_shash(nhash);
+		return ERR_PTR(err);
+	}
+
+	return nhash;
+}
+EXPORT_SYMBOL_GPL(crypto_clone_shash);
+
 int hash_prepare_alg(struct hash_alg_common *alg)
 {
 	struct crypto_istat_hash *istat = hash_get_stat(alg);
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 3a04e601ad6a..e69542d86a2b 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -152,6 +152,7 @@ struct ahash_request {
  * @exit_tfm: Deinitialize the cryptographic transformation object.
  *	      This is a counterpart to @init_tfm, used to remove
  *	      various changes set in @init_tfm.
+ * @clone_tfm: Copy transform into new object, may allocate memory.
  * @halg: see struct hash_alg_common
  */
 struct ahash_alg {
@@ -166,6 +167,7 @@ struct ahash_alg {
 		      unsigned int keylen);
 	int (*init_tfm)(struct crypto_ahash *tfm);
 	void (*exit_tfm)(struct crypto_ahash *tfm);
+	int (*clone_tfm)(struct crypto_ahash *dst, struct crypto_ahash *src);
 
 	struct hash_alg_common halg;
 };
@@ -209,6 +211,7 @@ struct shash_desc {
  * @exit_tfm: Deinitialize the cryptographic transformation object.
  *	      This is a counterpart to @init_tfm, used to remove
  *	      various changes set in @init_tfm.
+ * @clone_tfm: Copy transform into new object, may allocate memory.
  * @digestsize: see struct ahash_alg
  * @statesize: see struct ahash_alg
  * @descsize: Size of the operational state for the message digest. This state
@@ -234,6 +237,7 @@ struct shash_alg {
 		      unsigned int keylen);
 	int (*init_tfm)(struct crypto_shash *tfm);
 	void (*exit_tfm)(struct crypto_shash *tfm);
+	int (*clone_tfm)(struct crypto_shash *dst, struct crypto_shash *src);
 
 	unsigned int descsize;
 
@@ -297,6 +301,8 @@ static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
 struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
 					u32 mask);
 
+struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *tfm);
+
 static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm)
 {
 	return &tfm->base;
@@ -761,6 +767,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
 struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
 					u32 mask);
 
+struct crypto_shash *crypto_clone_shash(struct crypto_shash *tfm);
+
 int crypto_has_shash(const char *alg_name, u32 type, u32 mask);
 
 static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 0b259dbb97af..37edf3f4e8af 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -133,8 +133,6 @@ int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc);
 int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc);
 int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc);
 
-int crypto_init_shash_ops_async(struct crypto_tfm *tfm);
-
 static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm)
 {
 	return crypto_tfm_ctx(crypto_ahash_tfm(tfm));
-- 
cgit v1.2.3


From 440da737cf8d35a1b2205678cc1879fa90948f7a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 14 Apr 2023 09:40:07 -0500
Subject: i2c: designware: Use PCI PSP driver for communication

Currently the PSP semaphore communication base address is discovered
by using an MSR that is not architecturally guaranteed for future
platforms.  Also the mailbox that is utilized for communication with
the PSP may have other consumers in the kernel, so it's better to
make all communication go through a single driver.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Mark Hasemeyer <markhas@chromium.org>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Tested-by: Mark Hasemeyer <markhas@chromium.org>
Acked-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/i2c/busses/Kconfig                  |   4 +-
 drivers/i2c/busses/i2c-designware-amdpsp.c  | 175 ++++------------------------
 drivers/i2c/busses/i2c-designware-core.h    |   1 -
 drivers/i2c/busses/i2c-designware-platdrv.c |   1 -
 include/linux/psp-platform-access.h         |   1 +
 5 files changed, 29 insertions(+), 153 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 25eb4e8fd22f..89f8b75043d0 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -566,9 +566,11 @@ config I2C_DESIGNWARE_PLATFORM
 
 config I2C_DESIGNWARE_AMDPSP
 	bool "AMD PSP I2C semaphore support"
-	depends on X86_MSR
 	depends on ACPI
+	depends on CRYPTO_DEV_SP_PSP
 	depends on I2C_DESIGNWARE_PLATFORM
+	depends on (I2C_DESIGNWARE_PLATFORM=y && CRYPTO_DEV_CCP_DD=y) || \
+		   (I2C_DESIGNWARE_PLATFORM=m && CRYPTO_DEV_CCP_DD)
 	help
 	  This driver enables managed host access to the selected I2C bus shared
 	  between AMD CPU and AMD PSP.
diff --git a/drivers/i2c/busses/i2c-designware-amdpsp.c b/drivers/i2c/busses/i2c-designware-amdpsp.c
index 652e6b64bd5f..12870dc44bdb 100644
--- a/drivers/i2c/busses/i2c-designware-amdpsp.c
+++ b/drivers/i2c/busses/i2c-designware-amdpsp.c
@@ -1,35 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <linux/bitfield.h>
-#include <linux/bits.h>
 #include <linux/i2c.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/psp-platform-access.h>
 #include <linux/psp.h>
-#include <linux/types.h>
 #include <linux/workqueue.h>
 
-#include <asm/msr.h>
-
 #include "i2c-designware-core.h"
 
-#define MSR_AMD_PSP_ADDR	0xc00110a2
-#define PSP_MBOX_OFFSET		0x10570
-#define PSP_CMD_TIMEOUT_US	(500 * USEC_PER_MSEC)
-
 #define PSP_I2C_RESERVATION_TIME_MS 100
 
-#define PSP_I2C_REQ_BUS_CMD		0x64
 #define PSP_I2C_REQ_RETRY_CNT		400
 #define PSP_I2C_REQ_RETRY_DELAY_US	(25 * USEC_PER_MSEC)
 #define PSP_I2C_REQ_STS_OK		0x0
 #define PSP_I2C_REQ_STS_BUS_BUSY	0x1
 #define PSP_I2C_REQ_STS_INV_PARAM	0x3
 
-struct psp_req_buffer_hdr {
-	u32 total_size;
-	u32 status;
-};
-
 enum psp_i2c_req_type {
 	PSP_I2C_REQ_ACQUIRE,
 	PSP_I2C_REQ_RELEASE,
@@ -41,119 +26,12 @@ struct psp_i2c_req {
 	enum psp_i2c_req_type type;
 };
 
-struct psp_mbox {
-	u32 cmd_fields;
-	u64 i2c_req_addr;
-} __packed;
-
 static DEFINE_MUTEX(psp_i2c_access_mutex);
 static unsigned long psp_i2c_sem_acquired;
-static void __iomem *mbox_iomem;
 static u32 psp_i2c_access_count;
 static bool psp_i2c_mbox_fail;
 static struct device *psp_i2c_dev;
 
-/*
- * Implementation of PSP-x86 i2c-arbitration mailbox introduced for AMD Cezanne
- * family of SoCs.
- */
-
-static int psp_get_mbox_addr(unsigned long *mbox_addr)
-{
-	unsigned long long psp_mmio;
-
-	if (rdmsrl_safe(MSR_AMD_PSP_ADDR, &psp_mmio))
-		return -EIO;
-
-	*mbox_addr = (unsigned long)(psp_mmio + PSP_MBOX_OFFSET);
-
-	return 0;
-}
-
-static int psp_mbox_probe(void)
-{
-	unsigned long mbox_addr;
-	int ret;
-
-	ret = psp_get_mbox_addr(&mbox_addr);
-	if (ret)
-		return ret;
-
-	mbox_iomem = ioremap(mbox_addr, sizeof(struct psp_mbox));
-	if (!mbox_iomem)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/* Recovery field should be equal 0 to start sending commands */
-static int psp_check_mbox_recovery(struct psp_mbox __iomem *mbox)
-{
-	u32 tmp;
-
-	tmp = readl(&mbox->cmd_fields);
-
-	return FIELD_GET(PSP_CMDRESP_RECOVERY, tmp);
-}
-
-static int psp_wait_cmd(struct psp_mbox __iomem *mbox)
-{
-	u32 tmp, expected;
-
-	/* Expect mbox_cmd to be cleared and the response bit to be set by PSP */
-	expected = FIELD_PREP(PSP_CMDRESP_RESP, 1);
-
-	/*
-	 * Check for readiness of PSP mailbox in a tight loop in order to
-	 * process further as soon as command was consumed.
-	 */
-	return readl_poll_timeout(&mbox->cmd_fields, tmp, (tmp == expected),
-				  0, PSP_CMD_TIMEOUT_US);
-}
-
-/* Status equal to 0 means that PSP succeed processing command */
-static u32 psp_check_mbox_sts(struct psp_mbox __iomem *mbox)
-{
-	u32 cmd_reg;
-
-	cmd_reg = readl(&mbox->cmd_fields);
-
-	return FIELD_GET(PSP_CMDRESP_STS, cmd_reg);
-}
-
-static int psp_send_cmd(struct psp_i2c_req *req)
-{
-	struct psp_mbox __iomem *mbox = mbox_iomem;
-	phys_addr_t req_addr;
-	u32 cmd_reg;
-
-	if (psp_check_mbox_recovery(mbox))
-		return -EIO;
-
-	if (psp_wait_cmd(mbox))
-		return -EBUSY;
-
-	/*
-	 * Fill mailbox with address of command-response buffer, which will be
-	 * used for sending i2c requests as well as reading status returned by
-	 * PSP. Use physical address of buffer, since PSP will map this region.
-	 */
-	req_addr = __psp_pa((void *)req);
-	writeq(req_addr, &mbox->i2c_req_addr);
-
-	/* Write command register to trigger processing */
-	cmd_reg = FIELD_PREP(PSP_CMDRESP_CMD, PSP_I2C_REQ_BUS_CMD);
-	writel(cmd_reg, &mbox->cmd_fields);
-
-	if (psp_wait_cmd(mbox))
-		return -ETIMEDOUT;
-
-	if (psp_check_mbox_sts(mbox))
-		return -EIO;
-
-	return 0;
-}
-
 /* Helper to verify status returned by PSP */
 static int check_i2c_req_sts(struct psp_i2c_req *req)
 {
@@ -173,22 +51,25 @@ static int check_i2c_req_sts(struct psp_i2c_req *req)
 	}
 }
 
-static int psp_send_check_i2c_req(struct psp_i2c_req *req)
+/*
+ * Errors in x86-PSP i2c-arbitration protocol may occur at two levels:
+ * 1. mailbox communication - PSP is not operational or some IO errors with
+ *    basic communication had happened.
+ * 2. i2c-requests - PSP refuses to grant i2c arbitration to x86 for too long.
+ *
+ * In order to distinguish between these in error handling code all mailbox
+ * communication errors on the first level (from CCP symbols) will be passed
+ * up and if -EIO is returned the second level will be checked.
+ */
+static int psp_send_i2c_req_cezanne(struct psp_i2c_req *req)
 {
-	/*
-	 * Errors in x86-PSP i2c-arbitration protocol may occur at two levels:
-	 * 1. mailbox communication - PSP is not operational or some IO errors
-	 * with basic communication had happened;
-	 * 2. i2c-requests - PSP refuses to grant i2c arbitration to x86 for too
-	 * long.
-	 * In order to distinguish between these two in error handling code, all
-	 * errors on the first level (returned by psp_send_cmd) are shadowed by
-	 * -EIO.
-	 */
-	if (psp_send_cmd(req))
-		return -EIO;
+	int ret;
 
-	return check_i2c_req_sts(req);
+	ret = psp_send_platform_access_msg(PSP_I2C_REQ_BUS_CMD, (struct psp_request *)req);
+	if (ret == -EIO)
+		return check_i2c_req_sts(req);
+
+	return ret;
 }
 
 static int psp_send_i2c_req(enum psp_i2c_req_type i2c_req_type)
@@ -202,11 +83,11 @@ static int psp_send_i2c_req(enum psp_i2c_req_type i2c_req_type)
 	if (!req)
 		return -ENOMEM;
 
-	req->hdr.total_size = sizeof(*req);
+	req->hdr.payload_size = sizeof(*req);
 	req->type = i2c_req_type;
 
 	start = jiffies;
-	ret = read_poll_timeout(psp_send_check_i2c_req, status,
+	ret = read_poll_timeout(psp_send_i2c_req_cezanne, status,
 				(status != -EBUSY),
 				PSP_I2C_REQ_RETRY_DELAY_US,
 				PSP_I2C_REQ_RETRY_CNT * PSP_I2C_REQ_RETRY_DELAY_US,
@@ -381,7 +262,8 @@ static const struct i2c_lock_operations i2c_dw_psp_lock_ops = {
 
 int i2c_dw_amdpsp_probe_lock_support(struct dw_i2c_dev *dev)
 {
-	int ret;
+	if (!IS_REACHABLE(CONFIG_CRYPTO_DEV_CCP_DD))
+		return -ENODEV;
 
 	if (!dev)
 		return -ENODEV;
@@ -393,11 +275,10 @@ int i2c_dw_amdpsp_probe_lock_support(struct dw_i2c_dev *dev)
 	if (psp_i2c_dev)
 		return -EEXIST;
 
-	psp_i2c_dev = dev->dev;
+	if (psp_check_platform_access_status())
+		return -EPROBE_DEFER;
 
-	ret = psp_mbox_probe();
-	if (ret)
-		return ret;
+	psp_i2c_dev = dev->dev;
 
 	dev_info(psp_i2c_dev, "I2C bus managed by AMD PSP\n");
 
@@ -411,9 +292,3 @@ int i2c_dw_amdpsp_probe_lock_support(struct dw_i2c_dev *dev)
 
 	return 0;
 }
-
-/* Unmap area used as a mailbox with PSP */
-void i2c_dw_amdpsp_remove_lock_support(struct dw_i2c_dev *dev)
-{
-	iounmap(mbox_iomem);
-}
diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h
index 050d8c63ad3c..c5d87aae39c6 100644
--- a/drivers/i2c/busses/i2c-designware-core.h
+++ b/drivers/i2c/busses/i2c-designware-core.h
@@ -383,7 +383,6 @@ int i2c_dw_baytrail_probe_lock_support(struct dw_i2c_dev *dev);
 
 #if IS_ENABLED(CONFIG_I2C_DESIGNWARE_AMDPSP)
 int i2c_dw_amdpsp_probe_lock_support(struct dw_i2c_dev *dev);
-void i2c_dw_amdpsp_remove_lock_support(struct dw_i2c_dev *dev);
 #endif
 
 int i2c_dw_validate_speed(struct dw_i2c_dev *dev);
diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index 74182db03a88..89ad88c54754 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -214,7 +214,6 @@ static const struct i2c_dw_semaphore_callbacks i2c_dw_semaphore_cb_table[] = {
 #ifdef CONFIG_I2C_DESIGNWARE_AMDPSP
 	{
 		.probe = i2c_dw_amdpsp_probe_lock_support,
-		.remove = i2c_dw_amdpsp_remove_lock_support,
 	},
 #endif
 	{}
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
index 1b661341d8f3..75da8f5f7ad8 100644
--- a/include/linux/psp-platform-access.h
+++ b/include/linux/psp-platform-access.h
@@ -7,6 +7,7 @@
 
 enum psp_platform_access_msg {
 	PSP_CMD_NONE = 0x0,
+	PSP_I2C_REQ_BUS_CMD = 0x64,
 };
 
 struct psp_req_buffer_hdr {
-- 
cgit v1.2.3


From 046b6a171009e1ed9ede02194025e9ccd709beb2 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 19 Apr 2023 09:41:27 -0700
Subject: device property: make device_property functions take const device *

device_property functions do not modify the device pointer passed to them.
The underlying of_device and fwnode_ functions actually already take
const * arguments. Mark the parameter constant to simplify conversion
from of_property to device_property functions, and to let the calling code
use const device pointers where possible.

Cc: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20230419164127.3773278-1-linux@roeck-us.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/property.c  | 16 ++++++++--------
 include/linux/property.h | 36 ++++++++++++++++++------------------
 2 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 6a908e9c30f8..f6117ec9805c 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(__dev_fwnode_const);
  *
  * Return: true if property @propname is present. Otherwise, returns false.
  */
-bool device_property_present(struct device *dev, const char *propname)
+bool device_property_present(const struct device *dev, const char *propname)
 {
 	return fwnode_property_present(dev_fwnode(dev), propname);
 }
@@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_present);
  *	   %-EOVERFLOW if the size of the property is not as expected.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_u8_array(struct device *dev, const char *propname,
+int device_property_read_u8_array(const struct device *dev, const char *propname,
 				  u8 *val, size_t nval)
 {
 	return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval);
@@ -118,7 +118,7 @@ EXPORT_SYMBOL_GPL(device_property_read_u8_array);
  *	   %-EOVERFLOW if the size of the property is not as expected.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_u16_array(struct device *dev, const char *propname,
+int device_property_read_u16_array(const struct device *dev, const char *propname,
 				   u16 *val, size_t nval)
 {
 	return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval);
@@ -146,7 +146,7 @@ EXPORT_SYMBOL_GPL(device_property_read_u16_array);
  *	   %-EOVERFLOW if the size of the property is not as expected.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_u32_array(struct device *dev, const char *propname,
+int device_property_read_u32_array(const struct device *dev, const char *propname,
 				   u32 *val, size_t nval)
 {
 	return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval);
@@ -174,7 +174,7 @@ EXPORT_SYMBOL_GPL(device_property_read_u32_array);
  *	   %-EOVERFLOW if the size of the property is not as expected.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_u64_array(struct device *dev, const char *propname,
+int device_property_read_u64_array(const struct device *dev, const char *propname,
 				   u64 *val, size_t nval)
 {
 	return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval);
@@ -202,7 +202,7 @@ EXPORT_SYMBOL_GPL(device_property_read_u64_array);
  *	   %-EOVERFLOW if the size of the property is not as expected.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_string_array(struct device *dev, const char *propname,
+int device_property_read_string_array(const struct device *dev, const char *propname,
 				      const char **val, size_t nval)
 {
 	return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval);
@@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(device_property_read_string_array);
  *	   %-EPROTO or %-EILSEQ if the property type is not a string.
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_read_string(struct device *dev, const char *propname,
+int device_property_read_string(const struct device *dev, const char *propname,
 				const char **val)
 {
 	return fwnode_property_read_string(dev_fwnode(dev), propname, val);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(device_property_read_string);
  *	   %-EPROTO if the property is not an array of strings,
  *	   %-ENXIO if no suitable firmware interface is present.
  */
-int device_property_match_string(struct device *dev, const char *propname,
+int device_property_match_string(const struct device *dev, const char *propname,
 				 const char *string)
 {
 	return fwnode_property_match_string(dev_fwnode(dev), propname, string);
diff --git a/include/linux/property.h b/include/linux/property.h
index 59f452198c64..66df1a15d518 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -39,20 +39,20 @@ struct fwnode_handle *__dev_fwnode(struct device *dev);
 		 const struct device *: __dev_fwnode_const,	\
 		 struct device *: __dev_fwnode)(dev)
 
-bool device_property_present(struct device *dev, const char *propname);
-int device_property_read_u8_array(struct device *dev, const char *propname,
+bool device_property_present(const struct device *dev, const char *propname);
+int device_property_read_u8_array(const struct device *dev, const char *propname,
 				  u8 *val, size_t nval);
-int device_property_read_u16_array(struct device *dev, const char *propname,
+int device_property_read_u16_array(const struct device *dev, const char *propname,
 				   u16 *val, size_t nval);
-int device_property_read_u32_array(struct device *dev, const char *propname,
+int device_property_read_u32_array(const struct device *dev, const char *propname,
 				   u32 *val, size_t nval);
-int device_property_read_u64_array(struct device *dev, const char *propname,
+int device_property_read_u64_array(const struct device *dev, const char *propname,
 				   u64 *val, size_t nval);
-int device_property_read_string_array(struct device *dev, const char *propname,
+int device_property_read_string_array(const struct device *dev, const char *propname,
 				      const char **val, size_t nval);
-int device_property_read_string(struct device *dev, const char *propname,
+int device_property_read_string(const struct device *dev, const char *propname,
 				const char **val);
-int device_property_match_string(struct device *dev,
+int device_property_match_string(const struct device *dev,
 				 const char *propname, const char *string);
 
 bool fwnode_property_present(const struct fwnode_handle *fwnode,
@@ -142,57 +142,57 @@ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name);
 
 unsigned int device_get_child_node_count(const struct device *dev);
 
-static inline bool device_property_read_bool(struct device *dev,
+static inline bool device_property_read_bool(const struct device *dev,
 					     const char *propname)
 {
 	return device_property_present(dev, propname);
 }
 
-static inline int device_property_read_u8(struct device *dev,
+static inline int device_property_read_u8(const struct device *dev,
 					  const char *propname, u8 *val)
 {
 	return device_property_read_u8_array(dev, propname, val, 1);
 }
 
-static inline int device_property_read_u16(struct device *dev,
+static inline int device_property_read_u16(const struct device *dev,
 					   const char *propname, u16 *val)
 {
 	return device_property_read_u16_array(dev, propname, val, 1);
 }
 
-static inline int device_property_read_u32(struct device *dev,
+static inline int device_property_read_u32(const struct device *dev,
 					   const char *propname, u32 *val)
 {
 	return device_property_read_u32_array(dev, propname, val, 1);
 }
 
-static inline int device_property_read_u64(struct device *dev,
+static inline int device_property_read_u64(const struct device *dev,
 					   const char *propname, u64 *val)
 {
 	return device_property_read_u64_array(dev, propname, val, 1);
 }
 
-static inline int device_property_count_u8(struct device *dev, const char *propname)
+static inline int device_property_count_u8(const struct device *dev, const char *propname)
 {
 	return device_property_read_u8_array(dev, propname, NULL, 0);
 }
 
-static inline int device_property_count_u16(struct device *dev, const char *propname)
+static inline int device_property_count_u16(const struct device *dev, const char *propname)
 {
 	return device_property_read_u16_array(dev, propname, NULL, 0);
 }
 
-static inline int device_property_count_u32(struct device *dev, const char *propname)
+static inline int device_property_count_u32(const struct device *dev, const char *propname)
 {
 	return device_property_read_u32_array(dev, propname, NULL, 0);
 }
 
-static inline int device_property_count_u64(struct device *dev, const char *propname)
+static inline int device_property_count_u64(const struct device *dev, const char *propname)
 {
 	return device_property_read_u64_array(dev, propname, NULL, 0);
 }
 
-static inline int device_property_string_array_count(struct device *dev,
+static inline int device_property_string_array_count(const struct device *dev,
 						     const char *propname)
 {
 	return device_property_read_string_array(dev, propname, NULL, 0);
-- 
cgit v1.2.3


From ec274aff21b6a94c7973384ca80a503c1bc3b173 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik.ext@huawei.com>
Date: Thu, 20 Apr 2023 11:58:58 +0200
Subject: swiotlb: Omit total_used and used_hiwater if !CONFIG_DEBUG_FS

The tracking of used_hiwater adds an atomic operation to the hot
path. This is acceptable only when debugging the kernel. To make
sure that the fields can never be used by mistake, do not even
include them in struct io_tlb_mem if CONFIG_DEBUG_FS is not set.

The build fails after doing that. To fix it, it is necessary to
remove all code specific to debugfs and instead provide a stub
implementation of swiotlb_create_debugfs_files(). As a bonus, this
change allows to remove one __maybe_unused attribute.

Signed-off-by: Petr Tesarik <petr.tesarik.ext@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h |  2 ++
 kernel/dma/swiotlb.c    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 6dc4598d2260..44767844e12b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -107,8 +107,10 @@ struct io_tlb_mem {
 	unsigned int area_nslabs;
 	struct io_tlb_area *areas;
 	struct io_tlb_slot *slots;
+#ifdef CONFIG_DEBUG_FS
 	atomic_long_t total_used;
 	atomic_long_t used_hiwater;
+#endif
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9bbc2802a444..e67dafc255f2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -966,6 +966,8 @@ bool is_swiotlb_active(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
+#ifdef CONFIG_DEBUG_FS
+
 static int io_tlb_used_get(void *data, u64 *val)
 {
 	struct io_tlb_mem *mem = data;
@@ -1015,15 +1017,22 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
 			&fops_io_tlb_hiwater);
 }
 
-static int __init __maybe_unused swiotlb_create_default_debugfs(void)
+static int __init swiotlb_create_default_debugfs(void)
 {
 	swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_FS
 late_initcall(swiotlb_create_default_debugfs);
-#endif
+
+#else  /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+						const char *dirname)
+{
+}
+
+#endif	/* CONFIG_DEBUG_FS */
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 
-- 
cgit v1.2.3


From 038585573d0544bc717cdc5b3be4e95206d3ee3d Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 18 Apr 2023 15:49:47 +0200
Subject: efi/pe: Import new BTI/IBT header flags from the spec

The latest version of your favorite fork of the PE/COFF spec includes a
new type of header flag that is intended to be used in the context of
EFI firmware to indicate to the image loader that the executable regions
of an image can be mapped with BTI/IBT enforcement enabled.

So let's import these definitions so we can use them in subsequent
patches.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/pe.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/pe.h b/include/linux/pe.h
index 6ffabf1e6d03..5e1e11540870 100644
--- a/include/linux/pe.h
+++ b/include/linux/pe.h
@@ -118,6 +118,9 @@
 #define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER             0x2000
 #define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE  0x8000
 
+#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT		0x0001
+#define IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT	0x0040
+
 /* they actually defined 0x00000000 as well, but I think we'll skip that one. */
 #define IMAGE_SCN_RESERVED_0	0x00000001
 #define IMAGE_SCN_RESERVED_1	0x00000002
@@ -165,6 +168,7 @@
 #define IMAGE_SCN_MEM_WRITE	0x80000000 /* writeable */
 
 #define IMAGE_DEBUG_TYPE_CODEVIEW	2
+#define IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS	20
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.2.3


From b52124a78ab34eb0754e32edc0c9996937779176 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 17 Apr 2023 10:27:05 -0500
Subject: PM: Add sysfs files to represent time spent in hardware sleep state

Userspace can't easily discover how much of a sleep cycle was spent in a
hardware sleep state without using kernel tracing and vendor specific sysfs
or debugfs files.

To make this information more discoverable, introduce 3 new sysfs files:
1) The time spent in a hw sleep state for last cycle.
2) The time spent in a hw sleep state since the kernel booted
3) The maximum time that the hardware can report for a sleep cycle.
All of these files will be present only if the system supports s2idle.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-power | 29 +++++++++++++++++
 include/linux/suspend.h               |  8 +++++
 kernel/power/main.c                   | 59 ++++++++++++++++++++++++++++-------
 3 files changed, 84 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index f99d433ff311..a3942b1036e2 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -413,6 +413,35 @@ Description:
 		The /sys/power/suspend_stats/last_failed_step file contains
 		the last failed step in the suspend/resume path.
 
+What:		/sys/power/suspend_stats/last_hw_sleep
+Date:		June 2023
+Contact:	Mario Limonciello <mario.limonciello@amd.com>
+Description:
+		The /sys/power/suspend_stats/last_hw_sleep file
+		contains the duration of time spent in a hardware sleep
+		state in the most recent system suspend-resume cycle.
+		This number is measured in microseconds.
+
+What:		/sys/power/suspend_stats/total_hw_sleep
+Date:		June 2023
+Contact:	Mario Limonciello <mario.limonciello@amd.com>
+Description:
+		The /sys/power/suspend_stats/total_hw_sleep file
+		contains the aggregate of time spent in a hardware sleep
+		state since the kernel was booted. This number
+		is measured in microseconds.
+
+What:		/sys/power/suspend_stats/max_hw_sleep
+Date:		June 2023
+Contact:	Mario Limonciello <mario.limonciello@amd.com>
+Description:
+		The /sys/power/suspend_stats/max_hw_sleep file
+		contains the maximum amount of time that the hardware can
+		report for time spent in a hardware sleep state. When sleep
+		cycles are longer than this time, the values for
+		'total_hw_sleep' and 'last_hw_sleep' may not be accurate.
+		This number is measured in microseconds.
+
 What:		/sys/power/sync_on_suspend
 Date:		October 2019
 Contact:	Jonas Meurer <jonas@freesources.org>
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index cfe19a028918..d0d4598a7b3f 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -68,6 +68,9 @@ struct suspend_stats {
 	int	last_failed_errno;
 	int	errno[REC_FAILED_NUM];
 	int	last_failed_step;
+	u64	last_hw_sleep;
+	u64	total_hw_sleep;
+	u64	max_hw_sleep;
 	enum suspend_stat_step	failed_steps[REC_FAILED_NUM];
 };
 
@@ -489,6 +492,8 @@ void restore_processor_state(void);
 extern int register_pm_notifier(struct notifier_block *nb);
 extern int unregister_pm_notifier(struct notifier_block *nb);
 extern void ksys_sync_helper(void);
+extern void pm_report_hw_sleep_time(u64 t);
+extern void pm_report_max_hw_sleep(u64 t);
 
 #define pm_notifier(fn, pri) {				\
 	static struct notifier_block fn##_nb =			\
@@ -526,6 +531,9 @@ static inline int unregister_pm_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline void pm_report_hw_sleep_time(u64 t) {};
+static inline void pm_report_max_hw_sleep(u64 t) {};
+
 static inline void ksys_sync_helper(void) {}
 
 #define pm_notifier(fn, pri)	do { (void)(fn); } while (0)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 31ec4a9b9d70..3113ec2f1db4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2003 Open Source Development Lab
  */
 
+#include <linux/acpi.h>
 #include <linux/export.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
@@ -83,6 +84,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_pm_notifier);
 
+void pm_report_hw_sleep_time(u64 t)
+{
+	suspend_stats.last_hw_sleep = t;
+	suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+	suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
 int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
 {
 	int ret;
@@ -314,24 +328,27 @@ static char *suspend_step_name(enum suspend_stat_step step)
 	}
 }
 
-#define suspend_attr(_name)					\
+#define suspend_attr(_name, format_str)				\
 static ssize_t _name##_show(struct kobject *kobj,		\
 		struct kobj_attribute *attr, char *buf)		\
 {								\
-	return sprintf(buf, "%d\n", suspend_stats._name);	\
+	return sprintf(buf, format_str, suspend_stats._name);	\
 }								\
 static struct kobj_attribute _name = __ATTR_RO(_name)
 
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_attr(success, "%d\n");
+suspend_attr(fail, "%d\n");
+suspend_attr(failed_freeze, "%d\n");
+suspend_attr(failed_prepare, "%d\n");
+suspend_attr(failed_suspend, "%d\n");
+suspend_attr(failed_suspend_late, "%d\n");
+suspend_attr(failed_suspend_noirq, "%d\n");
+suspend_attr(failed_resume, "%d\n");
+suspend_attr(failed_resume_early, "%d\n");
+suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
 
 static ssize_t last_failed_dev_show(struct kobject *kobj,
 		struct kobj_attribute *attr, char *buf)
@@ -391,12 +408,30 @@ static struct attribute *suspend_attrs[] = {
 	&last_failed_dev.attr,
 	&last_failed_errno.attr,
 	&last_failed_step.attr,
+	&last_hw_sleep.attr,
+	&total_hw_sleep.attr,
+	&max_hw_sleep.attr,
 	NULL,
 };
 
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+	if (attr != &last_hw_sleep.attr &&
+	    attr != &total_hw_sleep.attr &&
+	    attr != &max_hw_sleep.attr)
+		return 0444;
+
+#ifdef CONFIG_ACPI
+	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+		return 0444;
+#endif
+	return 0;
+}
+
 static const struct attribute_group suspend_attr_group = {
 	.name = "suspend_stats",
 	.attrs = suspend_attrs,
+	.is_visible = suspend_attr_is_visible,
 };
 
 #ifdef CONFIG_DEBUG_FS
-- 
cgit v1.2.3


From b0bc615df488abd0e95107e4a9ecefb9bf8c250a Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Tue, 21 Mar 2023 00:10:16 +0200
Subject: net/mlx5: Add vnic devlink health reporter to PFs/VFs

Create a vnic devlink health reporter for PFs/VFs interfaces.
The reporter's diagnose callback displays the values of vNIC/vport
transport debug counters of PFs/VFs, as follows:

$ devlink health diagnose pci/0000:08:00.0 reporter vnic
 vNIC env counters:
    total_error_queues: 0 send_queue_priority_update_flow: 0
    comp_eq_overrun: 0 async_eq_overrun: 0 cq_overrun: 0
    invalid_command: 0 quota_exceeded_command: 0
    nic_receive_steering_discard: 0

Moreover, add documentation on the reporter functionality and the
counters description.

While at it, expose the vNIC counters diagnose function to be used by
the downstream patch, which will reveal the counters for representor
interfaces.

Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/devlink.rst             |  30 +++++
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../mellanox/mlx5/core/diag/reporter_vnic.c        | 125 +++++++++++++++++++++
 .../mellanox/mlx5/core/diag/reporter_vnic.h        |  16 +++
 drivers/net/ethernet/mellanox/mlx5/core/health.c   |   4 +
 include/linux/mlx5/driver.h                        |   1 +
 6 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
index 0995e4e5acd7..ceab18e46456 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
@@ -257,3 +257,33 @@ User commands examples:
     $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
 
 NOTE: This command can run only on PF.
+
+vnic reporter
+-------------
+The vnic reporter implements only the `diagnose` callback.
+It is responsible for querying the vnic diagnostic counters from fw and displaying
+them in realtime.
+
+Description of the vnic counters:
+total_q_under_processor_handle: number of queues in an error state due to
+an async error or errored command.
+send_queue_priority_update_flow: number of QP/SQ priority/SL update
+events.
+cq_overrun: number of times CQ entered an error state due to an
+overflow.
+async_eq_overrun: number of times an EQ mapped to async events was
+overrun.
+comp_eq_overrun: number of times an EQ mapped to completion events was
+overrun.
+quota_exceeded_command: number of commands issued and failed due to quota
+exceeded.
+invalid_command: number of commands issued and failed dues to any reason
+other than quota exceeded.
+nic_receive_steering_discard: number of packets that completed RX flow
+steering but were discarded due to a mismatch in flow table.
+
+User commands examples:
+- Diagnose PF/VF vnic counters
+        $ devlink health diagnose pci/0000:82:00.1 reporter vnic
+
+NOTE: This command can run only on PF/VF ports.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 68f6a4544f7e..ddf1e352f51d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -16,7 +16,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
 		fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
-		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
+		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
 		fw_reset.o qos.o lib/tout.o lib/aso.o
 
 #
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
new file mode 100644
index 000000000000..9114661cd967
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
+
+#include "reporter_vnic.h"
+#include "devlink.h"
+
+#define VNIC_ENV_GET64(vnic_env_stats, c) \
+	MLX5_GET64(query_vnic_env_out, (vnic_env_stats)->query_vnic_env_out, \
+		 vport_env.c)
+
+struct mlx5_vnic_diag_stats {
+	__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
+};
+
+int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
+					 struct devlink_fmsg *fmsg,
+					 u16 vport_num, bool other_vport)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
+	struct mlx5_vnic_diag_stats vnic;
+	int err;
+
+	MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV);
+	MLX5_SET(query_vnic_env_in, in, vport_number, vport_num);
+	MLX5_SET(query_vnic_env_in, in, other_vport, !!other_vport);
+
+	err = mlx5_cmd_exec_inout(dev, query_vnic_env, in, &vnic.query_vnic_env_out);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, "vNIC env counters");
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_start(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
+					VNIC_ENV_GET64(&vnic, total_error_queues));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
+					VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
+					VNIC_ENV_GET64(&vnic, comp_eq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
+					VNIC_ENV_GET64(&vnic, async_eq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
+					VNIC_ENV_GET64(&vnic, cq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
+					VNIC_ENV_GET64(&vnic, invalid_command));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
+					VNIC_ENV_GET64(&vnic, quota_exceeded_command));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
+					VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int mlx5_reporter_vnic_diagnose(struct devlink_health_reporter *reporter,
+				       struct devlink_fmsg *fmsg,
+				       struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+
+	return mlx5_reporter_vnic_diagnose_counters(dev, fmsg, 0, false);
+}
+
+static const struct devlink_health_reporter_ops mlx5_reporter_vnic_ops = {
+	.name = "vnic",
+	.diagnose = mlx5_reporter_vnic_diagnose,
+};
+
+void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	health->vnic_reporter =
+		devlink_health_reporter_create(devlink,
+					       &mlx5_reporter_vnic_ops,
+					       0, dev);
+	if (IS_ERR(health->vnic_reporter))
+		mlx5_core_warn(dev,
+			       "Failed to create vnic reporter, err = %ld\n",
+			       PTR_ERR(health->vnic_reporter));
+}
+
+void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (!IS_ERR_OR_NULL(health->vnic_reporter))
+		devlink_health_reporter_destroy(health->vnic_reporter);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h
new file mode 100644
index 000000000000..eba87a39e9b1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __MLX5_REPORTER_VNIC_H
+#define __MLX5_REPORTER_VNIC_H
+
+#include "mlx5_core.h"
+
+void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev);
+void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev);
+
+int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
+					 struct devlink_fmsg *fmsg,
+					 u16 vport_num, bool other_vport);
+
+#endif /* __MLX5_REPORTER_VNIC_H */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 016c5f99c470..871c32dda66e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -42,6 +42,7 @@
 #include "lib/pci_vsc.h"
 #include "lib/tout.h"
 #include "diag/fw_tracer.h"
+#include "diag/reporter_vnic.h"
 
 enum {
 	MAX_MISSES			= 3,
@@ -898,6 +899,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 
 	cancel_delayed_work_sync(&health->update_fw_log_ts_work);
 	destroy_workqueue(health->wq);
+	mlx5_reporter_vnic_destroy(dev);
 	mlx5_fw_reporters_destroy(dev);
 }
 
@@ -907,6 +909,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	char *name;
 
 	mlx5_fw_reporters_create(dev);
+	mlx5_reporter_vnic_create(dev);
 
 	health = &dev->priv.health;
 	name = kmalloc(64, GFP_KERNEL);
@@ -926,6 +929,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	return 0;
 
 out_err:
+	mlx5_reporter_vnic_destroy(dev);
 	mlx5_fw_reporters_destroy(dev);
 	return -ENOMEM;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 135a3c8d8237..5d25c4c73046 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -439,6 +439,7 @@ struct mlx5_core_health {
 	struct work_struct		report_work;
 	struct devlink_health_reporter *fw_reporter;
 	struct devlink_health_reporter *fw_fatal_reporter;
+	struct devlink_health_reporter *vnic_reporter;
 	struct delayed_work		update_fw_log_ts_work;
 };
 
-- 
cgit v1.2.3


From f9c895a72a390656f9582e048fdcc3d2cec1dd7c Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Wed, 22 Mar 2023 10:21:46 +0200
Subject: net/mlx5: Update op_mode to op_mod for port selection

To be consistent with the other enum keys use OP_MOD
instead of OP_MODE.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +-
 include/linux/mlx5/mlx5_ifc.h                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a95d1218def9..89a65779494e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -717,7 +717,7 @@ static int handle_hca_cap_port_selection(struct mlx5_core_dev *dev,
 	       MLX5_ST_SZ_BYTES(port_selection_cap));
 	MLX5_SET(port_selection_cap, set_hca_cap, port_select_flow_table_bypass, 1);
 
-	err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION);
+	err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_PORT_SELECTION);
 
 	return err;
 }
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 20d00e09b168..b42696d74c9f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -69,7 +69,7 @@ enum {
 	MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
 	MLX5_SET_HCA_CAP_OP_MOD_ROCE                  = 0x4,
 	MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2       = 0x20,
-	MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION       = 0x25,
+	MLX5_SET_HCA_CAP_OP_MOD_PORT_SELECTION        = 0x25,
 };
 
 enum {
-- 
cgit v1.2.3


From dd64b232deb8d48812a2ea739d1fedaeaffb59ed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 Apr 2023 11:20:06 -0700
Subject: page_pool: unlink from napi during destroy

Jesper points out that we must prevent recycling into cache
after page_pool_destroy() is called, because page_pool_destroy()
is not synchronized with recycling (some pages may still be
outstanding when destroy() gets called).

I assumed this will not happen because NAPI can't be scheduled
if its page pool is being destroyed. But I missed the fact that
NAPI may get reused. For instance when user changes ring configuration
driver may allocate a new page pool, stop NAPI, swap, start NAPI,
and then destroy the old pool. The NAPI is running so old page
pool will think it can recycle to the cache, but the consumer
at that point is the destroy() path, not NAPI.

To avoid extra synchronization let the drivers do "unlinking"
during the "swap" stage while NAPI is indeed disabled.

Fixes: 8c48eea3adf3 ("page_pool: allow caching from safely localized NAPI")
Reported-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Link: https://lore.kernel.org/all/e8df2654-6a5b-3c92-489d-2fe5e444135f@redhat.com/
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/20230419182006.719923-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool.h |  5 +++++
 net/core/page_pool.c    | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 91b808dade82..c8ec2f34722b 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -247,6 +247,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct xdp_mem_info;
 
 #ifdef CONFIG_PAGE_POOL
+void page_pool_unlink_napi(struct page_pool *pool);
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem);
@@ -254,6 +255,10 @@ void page_pool_release_page(struct page_pool *pool, struct page *page);
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count);
 #else
+static inline void page_pool_unlink_napi(struct page_pool *pool)
+{
+}
+
 static inline void page_pool_destroy(struct page_pool *pool)
 {
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 97f20f7ff4fc..e212e9d7edcb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -839,6 +839,21 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 	pool->xdp_mem_id = mem->id;
 }
 
+void page_pool_unlink_napi(struct page_pool *pool)
+{
+	if (!pool->p.napi)
+		return;
+
+	/* To avoid races with recycling and additional barriers make sure
+	 * pool and NAPI are unlinked when NAPI is disabled.
+	 */
+	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
+		READ_ONCE(pool->p.napi->list_owner) != -1);
+
+	WRITE_ONCE(pool->p.napi, NULL);
+}
+EXPORT_SYMBOL(page_pool_unlink_napi);
+
 void page_pool_destroy(struct page_pool *pool)
 {
 	if (!pool)
@@ -847,6 +862,7 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_unlink_napi(pool);
 	page_pool_free_frag(pool);
 
 	if (!page_pool_release(pool))
@@ -900,7 +916,7 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 	 * in the same context as the consumer would run, so there's
 	 * no possible race.
 	 */
-	napi = pp->p.napi;
+	napi = READ_ONCE(pp->p.napi);
 	allow_direct = napi_safe && napi &&
 		READ_ONCE(napi->list_owner) == smp_processor_id();
 
-- 
cgit v1.2.3


From 8c966a10eb8478dea333a0f7f77b11c3cb3f7dcb Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Wed, 19 Apr 2023 13:34:35 +0200
Subject: flow_dissector: Address kdoc warnings

Address a number of warnings flagged by
./scripts/kernel-doc -none include/net/flow_dissector.h

 include/net/flow_dissector.h:23: warning: Function parameter or member 'addr_type' not described in 'flow_dissector_key_control'
 include/net/flow_dissector.h:23: warning: Function parameter or member 'flags' not described in 'flow_dissector_key_control'
 include/net/flow_dissector.h:46: warning: Function parameter or member 'padding' not described in 'flow_dissector_key_basic'
 include/net/flow_dissector.h:145: warning: Function parameter or member 'tipckey' not described in 'flow_dissector_key_addrs'
 include/net/flow_dissector.h:157: warning: cannot understand function prototype: 'struct flow_dissector_key_arp '
 include/net/flow_dissector.h:171: warning: cannot understand function prototype: 'struct flow_dissector_key_ports '
 include/net/flow_dissector.h:203: warning: cannot understand function prototype: 'struct flow_dissector_key_icmp '

Also improve indentation on adjacent lines to those changed
to address the above.

No functional changes intended.

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230419-flow-dissector-kdoc-v1-1-1aa0cca1118b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow_dissector.h | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 5ccf52ef8809..85b2281576ed 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -14,7 +14,9 @@ struct sk_buff;
 
 /**
  * struct flow_dissector_key_control:
- * @thoff: Transport header offset
+ * @thoff:     Transport header offset
+ * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_*
+ * @flags:     Key flags. Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAGENCAPSULATION)
  */
 struct flow_dissector_key_control {
 	u16	thoff;
@@ -36,8 +38,9 @@ enum flow_dissect_ret {
 
 /**
  * struct flow_dissector_key_basic:
- * @n_proto: Network header protocol (eg. IPv4/IPv6)
+ * @n_proto:  Network header protocol (eg. IPv4/IPv6)
  * @ip_proto: Transport header protocol (eg. TCP/UDP)
+ * @padding:  Unused
  */
 struct flow_dissector_key_basic {
 	__be16	n_proto;
@@ -135,6 +138,7 @@ struct flow_dissector_key_tipc {
  * struct flow_dissector_key_addrs:
  * @v4addrs: IPv4 addresses
  * @v6addrs: IPv6 addresses
+ * @tipckey: TIPC key
  */
 struct flow_dissector_key_addrs {
 	union {
@@ -145,14 +149,12 @@ struct flow_dissector_key_addrs {
 };
 
 /**
- * flow_dissector_key_arp:
- *	@ports: Operation, source and target addresses for an ARP header
- *              for Ethernet hardware addresses and IPv4 protocol addresses
- *		sip: Sender IP address
- *		tip: Target IP address
- *		op:  Operation
- *		sha: Sender hardware address
- *		tpa: Target hardware address
+ * struct flow_dissector_key_arp:
+ * @sip: Sender IP address
+ * @tip: Target IP address
+ * @op:  Operation
+ * @sha: Sender hardware address
+ * @tha: Target hardware address
  */
 struct flow_dissector_key_arp {
 	__u32 sip;
@@ -163,10 +165,10 @@ struct flow_dissector_key_arp {
 };
 
 /**
- * flow_dissector_key_tp_ports:
- *	@ports: port numbers of Transport header
- *		src: source port number
- *		dst: destination port number
+ * struct flow_dissector_key_ports:
+ * @ports: port numbers of Transport header
+ * @src: source port number
+ * @dst: destination port number
  */
 struct flow_dissector_key_ports {
 	union {
@@ -195,10 +197,10 @@ struct flow_dissector_key_ports_range {
 };
 
 /**
- * flow_dissector_key_icmp:
- *		type: ICMP type
- *		code: ICMP code
- *		id:   session identifier
+ * struct flow_dissector_key_icmp:
+ * @type: ICMP type
+ * @code: ICMP code
+ * @id:   Session identifier
  */
 struct flow_dissector_key_icmp {
 	struct {
-- 
cgit v1.2.3


From 8fa66e4a1bdd41d55d7842928e60a40fed65715d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 Apr 2023 19:00:05 -0700
Subject: net: skbuff: update and rename __kfree_skb_defer()

__kfree_skb_defer() uses the old naming where "defer" meant
slab bulk free/alloc APIs. In the meantime we also made
__kfree_skb_defer() feed the per-NAPI skb cache, which
implies bulk APIs. So take away the 'defer' and add 'napi'.

While at it add a drop reason. This only matters on the
tx_action path, if the skb has a frag_list. But getting
rid of a SKB_DROP_REASON_NOT_SPECIFIED seems like a net
benefit so why not.

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230420020005.815854-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 +-
 net/core/dev.c         | 3 ++-
 net/core/gro.c         | 2 +-
 net/core/skbuff.c      | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index db5973559042..03aa7ed076f0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3253,7 +3253,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void napi_skb_free_stolen_head(struct sk_buff *skb);
-void __kfree_skb_defer(struct sk_buff *skb);
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 3fc4dba71f9d..1551aabac343 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5040,7 +5040,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
 			else
-				__kfree_skb_defer(skb);
+				__napi_kfree_skb(skb,
+						 get_kfree_skb_cb(skb)->reason);
 		}
 	}
 
diff --git a/net/core/gro.c b/net/core/gro.c
index a606705a0859..2d84165cb4f1 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -633,7 +633,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 		else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 			__kfree_skb(skb);
 		else
-			__kfree_skb_defer(skb);
+			__napi_kfree_skb(skb, SKB_CONSUMED);
 		break;
 
 	case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 768f9d04911f..8764653bede7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1226,9 +1226,9 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	}
 }
 
-void __kfree_skb_defer(struct sk_buff *skb)
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
+	skb_release_all(skb, reason, true);
 	napi_skb_cache_put(skb);
 }
 
-- 
cgit v1.2.3


From 211db0ac9e3dc6c46f2dd53395b34d76af929faf Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@kernel.org>
Date: Thu, 16 Mar 2023 07:34:33 +0900
Subject: ksmbd: remove internal.h include

Since vfs_path_lookup is exported, It should not be internal.
Move vfs_path_lookup prototype in internal.h to linux/namei.h.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h         | 2 --
 fs/ksmbd/vfs.c        | 2 --
 include/linux/namei.h | 2 ++
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/internal.h b/fs/internal.h
index dc4eb91a577a..071a7517f1a7 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
  */
 extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
 			   struct path *path, struct path *root);
-extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
-			   const char *, unsigned int, struct path *);
 int do_rmdir(int dfd, struct filename *name);
 int do_unlinkat(int dfd, struct filename *name);
 int may_linkat(struct mnt_idmap *idmap, const struct path *link);
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 5ea9229dad2c..cef07d7fb7dc 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -19,8 +19,6 @@
 #include <linux/sched/xacct.h>
 #include <linux/crc32c.h>
 
-#include "../internal.h"	/* for vfs_path_lookup */
-
 #include "glob.h"
 #include "oplock.h"
 #include "connection.h"
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 0d797f3367ca..ba9b32b4d1b0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -63,6 +63,8 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
+int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
+		    unsigned int, struct path *);
 
 extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
-- 
cgit v1.2.3


From 9bc37e04823b5280dd0f22b6680fc23fe81ca325 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 16 Mar 2023 07:34:34 +0900
Subject: fs: introduce lock_rename_child() helper

Pass the dentry of a source file and the dentry of a destination directory
to lock parent inodes for rename. As soon as this function returns,
->d_parent of the source file dentry is stable and inodes are properly
locked for calling vfs-rename. This helper is needed for ksmbd server.
rename request of SMB protocol has to rename an opened file, no matter
which directory it's in.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 68 ++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/namei.h |  1 +
 2 files changed, 58 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index edfedfbccaef..6bc1964e2214 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2980,20 +2980,10 @@ static inline int may_create(struct mnt_idmap *idmap,
 	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
 }
 
-/*
- * p1 and p2 should be directories on the same fs.
- */
-struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
-	if (p1 == p2) {
-		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
-		return NULL;
-	}
-
-	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
-
 	p = d_ancestor(p2, p1);
 	if (p) {
 		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
@@ -3012,8 +3002,64 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
 	return NULL;
 }
+
+/*
+ * p1 and p2 should be directories on the same fs.
+ */
+struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+{
+	if (p1 == p2) {
+		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+		return NULL;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+	return lock_two_directories(p1, p2);
+}
 EXPORT_SYMBOL(lock_rename);
 
+/*
+ * c1 and p2 should be on the same fs.
+ */
+struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
+{
+	if (READ_ONCE(c1->d_parent) == p2) {
+		/*
+		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
+		 */
+		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+		/*
+		 * now that p2 is locked, nobody can move in or out of it,
+		 * so the test below is safe.
+		 */
+		if (likely(c1->d_parent == p2))
+			return NULL;
+
+		/*
+		 * c1 got moved out of p2 while we'd been taking locks;
+		 * unlock and fall back to slow case.
+		 */
+		inode_unlock(p2->d_inode);
+	}
+
+	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
+	/*
+	 * nobody can move out of any directories on this fs.
+	 */
+	if (likely(c1->d_parent != p2))
+		return lock_two_directories(c1->d_parent, p2);
+
+	/*
+	 * c1 got moved into p2 while we were taking locks;
+	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
+	 * for consistency with lock_rename().
+	 */
+	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
+	return NULL;
+}
+EXPORT_SYMBOL(lock_rename_child);
+
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
 	inode_unlock(p1->d_inode);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index ba9b32b4d1b0..5864e4d82e56 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -83,6 +83,7 @@ extern int follow_down(struct path *path, unsigned int flags);
 extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
+extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
 extern int __must_check nd_jump_link(const struct path *path);
-- 
cgit v1.2.3


From 38e124086282715e3b9b3d193d9308bb3499b46c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 May 2022 18:16:54 -0400
Subject: kill the last remaining user of proc_ns_fget()

lookups by descriptor are better off closer to syscall surface...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nsfs.c                | 18 ------------------
 include/linux/proc_ns.h  |  1 -
 net/core/net_namespace.c | 23 +++++++++++------------
 3 files changed, 11 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index f8df60b3b901..f602a96a1afe 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -235,24 +235,6 @@ bool proc_ns_file(const struct file *file)
 	return file->f_op == &ns_file_operations;
 }
 
-struct file *proc_ns_fget(int fd)
-{
-	struct file *file;
-
-	file = fget(fd);
-	if (!file)
-		return ERR_PTR(-EBADF);
-
-	if (file->f_op != &ns_file_operations)
-		goto out_invalid;
-
-	return file;
-
-out_invalid:
-	fput(file);
-	return ERR_PTR(-EINVAL);
-}
-
 /**
  * ns_match() - Returns true if current namespace matches dev/ino provided.
  * @ns: current namespace
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 75807ecef880..49539bc416ce 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -72,7 +72,6 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 
 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
 
-extern struct file *proc_ns_fget(int fd);
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern int ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7b69cf882b8e..3e3598cd49f2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task.h>
 #include <linux/uidgid.h>
 #include <linux/cookie.h>
+#include <linux/proc_fs.h>
 
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -676,21 +677,19 @@ EXPORT_SYMBOL_GPL(get_net_ns);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct file *file;
-	struct ns_common *ns;
-	struct net *net;
+	struct fd f = fdget(fd);
+	struct net *net = ERR_PTR(-EINVAL);
 
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return ERR_CAST(file);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
 
-	ns = get_proc_ns(file_inode(file));
-	if (ns->ops == &netns_operations)
-		net = get_net(container_of(ns, struct net, ns));
-	else
-		net = ERR_PTR(-EINVAL);
+	if (proc_ns_file(f.file)) {
+		struct ns_common *ns = get_proc_ns(file_inode(f.file));
+		if (ns->ops == &netns_operations)
+			net = get_net(container_of(ns, struct net, ns));
+	}
+	fdput(f);
 
-	fput(file);
 	return net;
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
-- 
cgit v1.2.3


From 7ab75456be144a354fbb3df1516d82fc24d3d67d Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Tue, 18 Apr 2023 18:32:38 -0700
Subject: ipv6: add icmpv6_error_anycast_as_unicast for ICMPv6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ICMPv6 error packets are not sent to the anycast destinations and this
prevents things like traceroute from working. So create a setting similar
to ECHO when dealing with Anycast sources (icmpv6_echo_ignore_anycast).

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Link: https://lore.kernel.org/r/20230419013238.2691167-1-maheshb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  7 +++++++
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/af_inet6.c                    |  1 +
 net/ipv6/icmp.c                        | 15 +++++++++++++--
 4 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 58a78a316697..6ec06a33688a 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2721,6 +2721,13 @@ echo_ignore_anycast - BOOLEAN
 
 	Default: 0
 
+error_anycast_as_unicast - BOOLEAN
+	If set to 1, then the kernel will respond with ICMP Errors
+	resulting from requests sent to it over the IPv6 protocol destined
+	to anycast address essentially treating anycast as unicast.
+
+	Default: 0
+
 xfrm6_gc_thresh - INTEGER
 	(Obsolete since linux-4.14)
 	The threshold at which we will start garbage collecting for IPv6
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b4af4837d80b..3cceb3e9320b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -55,6 +55,7 @@ struct netns_sysctl_ipv6 {
 	u64 ioam6_id_wide;
 	bool skip_notify_on_dev_down;
 	u8 fib_notify_on_flag_change;
+	u8 icmpv6_error_anycast_as_unicast;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e1b679a590c9..2bbf13216a3d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -952,6 +952,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
 	net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
 	net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+	net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
 
 	/* By default, rate limit error messages.
 	 * Except for pmtu discovery, it would break it.
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 1f53f2a74480..9edf1f45b1ed 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -362,9 +362,10 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
 
 	/*
 	 * We won't send icmp if the destination is known
-	 * anycast.
+	 * anycast unless we need to treat anycast as unicast.
 	 */
-	if (ipv6_anycast_destination(dst, &fl6->daddr)) {
+	if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
+	    ipv6_anycast_destination(dst, &fl6->daddr)) {
 		net_dbg_ratelimited("icmp6_send: acast source\n");
 		dst_release(dst);
 		return ERR_PTR(-EINVAL);
@@ -1195,6 +1196,15 @@ static struct ctl_table ipv6_icmp_table_template[] = {
 		.mode		= 0644,
 		.proc_handler = proc_do_large_bitmap,
 	},
+	{
+		.procname	= "error_anycast_as_unicast",
+		.data		= &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{ },
 };
 
@@ -1212,6 +1222,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
 		table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
 		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
 		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
+		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
 	}
 	return table;
 }
-- 
cgit v1.2.3


From 5b8285cca6fed9bc5baabe2e5699a5a5c0d96371 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:52 +0200
Subject: net: move dropreason.h to dropreason-core.h

This will, after the next patch, hold only the core
drop reasons and minimal infrastructure. Fix a small
kernel-doc issue while at it, to avoid the move
triggering a checker.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h     |   2 +-
 include/linux/skbuff.h        |   2 +-
 include/net/dropreason-core.h | 364 ++++++++++++++++++++++++++++++++++++++++++
 include/net/dropreason.h      | 363 -----------------------------------------
 include/net/inet_frag.h       |   2 +-
 5 files changed, 367 insertions(+), 366 deletions(-)
 create mode 100644 include/net/dropreason-core.h
 delete mode 100644 include/net/dropreason.h

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 203c0df2046c..a6a3e9457d6c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,7 +52,7 @@
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 struct netpoll_info;
 struct device;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03aa7ed076f0..eb9e7bb76fa6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,7 +37,7 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #endif
 #include <net/net_debug.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 /**
  * DOC: skb checksums
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
new file mode 100644
index 000000000000..ade6d5b9186c
--- /dev/null
+++ b/include/net/dropreason-core.h
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_CORE_H
+#define _LINUX_DROPREASON_CORE_H
+
+#define DEFINE_DROP_REASON(FN, FNe)	\
+	FN(NOT_SPECIFIED)		\
+	FN(NO_SOCKET)			\
+	FN(PKT_TOO_SMALL)		\
+	FN(TCP_CSUM)			\
+	FN(SOCKET_FILTER)		\
+	FN(UDP_CSUM)			\
+	FN(NETFILTER_DROP)		\
+	FN(OTHERHOST)			\
+	FN(IP_CSUM)			\
+	FN(IP_INHDR)			\
+	FN(IP_RPFILTER)			\
+	FN(UNICAST_IN_L2_MULTICAST)	\
+	FN(XFRM_POLICY)			\
+	FN(IP_NOPROTO)			\
+	FN(SOCKET_RCVBUFF)		\
+	FN(PROTO_MEM)			\
+	FN(TCP_MD5NOTFOUND)		\
+	FN(TCP_MD5UNEXPECTED)		\
+	FN(TCP_MD5FAILURE)		\
+	FN(SOCKET_BACKLOG)		\
+	FN(TCP_FLAGS)			\
+	FN(TCP_ZEROWINDOW)		\
+	FN(TCP_OLD_DATA)		\
+	FN(TCP_OVERWINDOW)		\
+	FN(TCP_OFOMERGE)		\
+	FN(TCP_RFC7323_PAWS)		\
+	FN(TCP_INVALID_SEQUENCE)	\
+	FN(TCP_RESET)			\
+	FN(TCP_INVALID_SYN)		\
+	FN(TCP_CLOSE)			\
+	FN(TCP_FASTOPEN)		\
+	FN(TCP_OLD_ACK)			\
+	FN(TCP_TOO_OLD_ACK)		\
+	FN(TCP_ACK_UNSENT_DATA)		\
+	FN(TCP_OFO_QUEUE_PRUNE)		\
+	FN(TCP_OFO_DROP)		\
+	FN(IP_OUTNOROUTES)		\
+	FN(BPF_CGROUP_EGRESS)		\
+	FN(IPV6DISABLED)		\
+	FN(NEIGH_CREATEFAIL)		\
+	FN(NEIGH_FAILED)		\
+	FN(NEIGH_QUEUEFULL)		\
+	FN(NEIGH_DEAD)			\
+	FN(TC_EGRESS)			\
+	FN(QDISC_DROP)			\
+	FN(CPU_BACKLOG)			\
+	FN(XDP)				\
+	FN(TC_INGRESS)			\
+	FN(UNHANDLED_PROTO)		\
+	FN(SKB_CSUM)			\
+	FN(SKB_GSO_SEG)			\
+	FN(SKB_UCOPY_FAULT)		\
+	FN(DEV_HDR)			\
+	FN(DEV_READY)			\
+	FN(FULL_RING)			\
+	FN(NOMEM)			\
+	FN(HDR_TRUNC)			\
+	FN(TAP_FILTER)			\
+	FN(TAP_TXFILTER)		\
+	FN(ICMP_CSUM)			\
+	FN(INVALID_PROTO)		\
+	FN(IP_INADDRERRORS)		\
+	FN(IP_INNOROUTES)		\
+	FN(PKT_TOO_BIG)			\
+	FN(DUP_FRAG)			\
+	FN(FRAG_REASM_TIMEOUT)		\
+	FN(FRAG_TOO_FAR)		\
+	FN(TCP_MINTTL)			\
+	FN(IPV6_BAD_EXTHDR)		\
+	FN(IPV6_NDISC_FRAG)		\
+	FN(IPV6_NDISC_HOP_LIMIT)	\
+	FN(IPV6_NDISC_BAD_CODE)		\
+	FN(IPV6_NDISC_BAD_OPTIONS)	\
+	FN(IPV6_NDISC_NS_OTHERHOST)	\
+	FNe(MAX)
+
+/**
+ * enum skb_drop_reason - the reasons of skb drops
+ *
+ * The reason of skb drop, which is used in kfree_skb_reason().
+ */
+enum skb_drop_reason {
+	/**
+	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
+	 */
+	SKB_NOT_DROPPED_YET = 0,
+	/** @SKB_CONSUMED: packet has been consumed */
+	SKB_CONSUMED,
+	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
+	SKB_DROP_REASON_NOT_SPECIFIED,
+	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
+	SKB_DROP_REASON_NO_SOCKET,
+	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
+	SKB_DROP_REASON_PKT_TOO_SMALL,
+	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
+	SKB_DROP_REASON_TCP_CSUM,
+	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
+	SKB_DROP_REASON_SOCKET_FILTER,
+	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
+	SKB_DROP_REASON_UDP_CSUM,
+	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
+	SKB_DROP_REASON_NETFILTER_DROP,
+	/**
+	 * @SKB_DROP_REASON_OTHERHOST: packet don't belong to current host
+	 * (interface is in promisc mode)
+	 */
+	SKB_DROP_REASON_OTHERHOST,
+	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
+	SKB_DROP_REASON_IP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with IP header (see
+	 * IPSTATS_MIB_INHDRERRORS)
+	 */
+	SKB_DROP_REASON_IP_INHDR,
+	/**
+	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validate failed. see the
+	 * document for rp_filter in ip-sysctl.rst for more information
+	 */
+	SKB_DROP_REASON_IP_RPFILTER,
+	/**
+	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: destination address of L2 is
+	 * multicast, but L3 is unicast.
+	 */
+	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
+	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
+	SKB_DROP_REASON_XFRM_POLICY,
+	/** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */
+	SKB_DROP_REASON_IP_NOPROTO,
+	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */
+	SKB_DROP_REASON_SOCKET_RCVBUFF,
+	/**
+	 * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet
+	 * drop out of udp_memory_allocated.
+	 */
+	SKB_DROP_REASON_PROTO_MEM,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
+	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_MD5NOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
+	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
+	 */
+	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash and its wrong, corresponding
+	 * to LINUX_MIB_TCPMD5FAILURE
+	 */
+	SKB_DROP_REASON_TCP_MD5FAILURE,
+	/**
+	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
+	 * see LINUX_MIB_TCPBACKLOGDROP)
+	 */
+	SKB_DROP_REASON_SOCKET_BACKLOG,
+	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
+	SKB_DROP_REASON_TCP_FLAGS,
+	/**
+	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
+	 * see LINUX_MIB_TCPZEROWINDOWDROP
+	 */
+	SKB_DROP_REASON_TCP_ZEROWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already
+	 * received before (spurious retrans may happened), see
+	 * LINUX_MIB_DELAYEDACKLOST
+	 */
+	SKB_DROP_REASON_TCP_OLD_DATA,
+	/**
+	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window,
+	 * the seq of the first byte exceed the right edges of receive
+	 * window
+	 */
+	SKB_DROP_REASON_TCP_OVERWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OFOMERGE: the data of skb is already in the ofo
+	 * queue, corresponding to LINUX_MIB_TCPOFOMERGE
+	 */
+	SKB_DROP_REASON_TCP_OFOMERGE,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
+	 * LINUX_MIB_PAWSESTABREJECTED
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS,
+	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
+	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
+	SKB_DROP_REASON_TCP_RESET,
+	/**
+	 * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected
+	 * SYN flag
+	 */
+	SKB_DROP_REASON_TCP_INVALID_SYN,
+	/** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */
+	SKB_DROP_REASON_TCP_CLOSE,
+	/** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */
+	SKB_DROP_REASON_TCP_FASTOPEN,
+	/** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
+	SKB_DROP_REASON_TCP_OLD_ACK,
+	/** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
+	SKB_DROP_REASON_TCP_TOO_OLD_ACK,
+	/**
+	 * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
+	 * sent yet
+	 */
+	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,
+	/** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */
+	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,
+	/** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */
+	SKB_DROP_REASON_TCP_OFO_DROP,
+	/** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */
+	SKB_DROP_REASON_IP_OUTNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB
+	 * eBPF program
+	 */
+	SKB_DROP_REASON_BPF_CGROUP_EGRESS,
+	/** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */
+	SKB_DROP_REASON_IPV6DISABLED,
+	/** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */
+	SKB_DROP_REASON_NEIGH_CREATEFAIL,
+	/** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */
+	SKB_DROP_REASON_NEIGH_FAILED,
+	/** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */
+	SKB_DROP_REASON_NEIGH_QUEUEFULL,
+	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
+	SKB_DROP_REASON_NEIGH_DEAD,
+	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
+	SKB_DROP_REASON_TC_EGRESS,
+	/**
+	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
+	 * failed to enqueue to current qdisc)
+	 */
+	SKB_DROP_REASON_QDISC_DROP,
+	/**
+	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
+	 * backlog queue. This can be caused by backlog queue full (see
+	 * netdev_max_backlog in net.rst) or RPS flow limit
+	 */
+	SKB_DROP_REASON_CPU_BACKLOG,
+	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
+	SKB_DROP_REASON_XDP,
+	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	SKB_DROP_REASON_TC_INGRESS,
+	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
+	SKB_DROP_REASON_UNHANDLED_PROTO,
+	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
+	SKB_DROP_REASON_SKB_CSUM,
+	/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
+	SKB_DROP_REASON_SKB_GSO_SEG,
+	/**
+	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
+	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
+	 */
+	SKB_DROP_REASON_SKB_UCOPY_FAULT,
+	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
+	SKB_DROP_REASON_DEV_HDR,
+	/**
+	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv due to
+	 * any of its data structure that is not up/ready/initialized,
+	 * e.g., the IFF_UP is not set, or driver specific tun->tfiles[txq]
+	 * is not initialized
+	 */
+	SKB_DROP_REASON_DEV_READY,
+	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
+	SKB_DROP_REASON_FULL_RING,
+	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
+	SKB_DROP_REASON_NOMEM,
+	/**
+	 * @SKB_DROP_REASON_HDR_TRUNC: failed to trunc/extract the header from
+	 * networking data, e.g., failed to pull the protocol header from
+	 * frags via pskb_may_pull()
+	 */
+	SKB_DROP_REASON_HDR_TRUNC,
+	/**
+	 * @SKB_DROP_REASON_TAP_FILTER: dropped by (ebpf) filter directly attached
+	 * to tun/tap, e.g., via TUNSETFILTEREBPF
+	 */
+	SKB_DROP_REASON_TAP_FILTER,
+	/**
+	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by tx filter implemented at
+	 * tun/tap, e.g., check_filter()
+	 */
+	SKB_DROP_REASON_TAP_TXFILTER,
+	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
+	SKB_DROP_REASON_ICMP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
+	 * such as a broadcasts ICMP_TIMESTAMP
+	 */
+	SKB_DROP_REASON_INVALID_PROTO,
+	/**
+	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INADDRERRORS,
+	/**
+	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the
+	 * MTU)
+	 */
+	SKB_DROP_REASON_PKT_TOO_BIG,
+	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
+	SKB_DROP_REASON_DUP_FRAG,
+	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
+	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
+	/**
+	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
+	 * (/proc/sys/net/ipv4/ipfrag_max_dist)
+	 */
+	SKB_DROP_REASON_FRAG_TOO_FAR,
+	/**
+	 * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below
+	 * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT).
+	 */
+	SKB_DROP_REASON_TCP_MINTTL,
+	/** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */
+	SKB_DROP_REASON_IPV6_BAD_EXTHDR,
+	/** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */
+	SKB_DROP_REASON_IPV6_NDISC_FRAG,
+	/** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */
+	SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT,
+	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */
+	SKB_DROP_REASON_IPV6_NDISC_BAD_CODE,
+	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */
+	SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS,
+	/**
+	 * @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION
+	 * for another host.
+	 */
+	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
+	/**
+	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
+	 * used as a real 'reason'
+	 */
+	SKB_DROP_REASON_MAX,
+};
+
+#define SKB_DR_INIT(name, reason)				\
+	enum skb_drop_reason name = SKB_DROP_REASON_##reason
+#define SKB_DR(name)						\
+	SKB_DR_INIT(name, NOT_SPECIFIED)
+#define SKB_DR_SET(name, reason)				\
+	(name = SKB_DROP_REASON_##reason)
+#define SKB_DR_OR(name, reason)					\
+	do {							\
+		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
+		    name == SKB_NOT_DROPPED_YET)		\
+			SKB_DR_SET(name, reason);		\
+	} while (0)
+
+extern const char * const drop_reasons[];
+
+#endif
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
deleted file mode 100644
index c0a3ea806cd5..000000000000
--- a/include/net/dropreason.h
+++ /dev/null
@@ -1,363 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef _LINUX_DROPREASON_H
-#define _LINUX_DROPREASON_H
-
-#define DEFINE_DROP_REASON(FN, FNe)	\
-	FN(NOT_SPECIFIED)		\
-	FN(NO_SOCKET)			\
-	FN(PKT_TOO_SMALL)		\
-	FN(TCP_CSUM)			\
-	FN(SOCKET_FILTER)		\
-	FN(UDP_CSUM)			\
-	FN(NETFILTER_DROP)		\
-	FN(OTHERHOST)			\
-	FN(IP_CSUM)			\
-	FN(IP_INHDR)			\
-	FN(IP_RPFILTER)			\
-	FN(UNICAST_IN_L2_MULTICAST)	\
-	FN(XFRM_POLICY)			\
-	FN(IP_NOPROTO)			\
-	FN(SOCKET_RCVBUFF)		\
-	FN(PROTO_MEM)			\
-	FN(TCP_MD5NOTFOUND)		\
-	FN(TCP_MD5UNEXPECTED)		\
-	FN(TCP_MD5FAILURE)		\
-	FN(SOCKET_BACKLOG)		\
-	FN(TCP_FLAGS)			\
-	FN(TCP_ZEROWINDOW)		\
-	FN(TCP_OLD_DATA)		\
-	FN(TCP_OVERWINDOW)		\
-	FN(TCP_OFOMERGE)		\
-	FN(TCP_RFC7323_PAWS)		\
-	FN(TCP_INVALID_SEQUENCE)	\
-	FN(TCP_RESET)			\
-	FN(TCP_INVALID_SYN)		\
-	FN(TCP_CLOSE)			\
-	FN(TCP_FASTOPEN)		\
-	FN(TCP_OLD_ACK)			\
-	FN(TCP_TOO_OLD_ACK)		\
-	FN(TCP_ACK_UNSENT_DATA)		\
-	FN(TCP_OFO_QUEUE_PRUNE)		\
-	FN(TCP_OFO_DROP)		\
-	FN(IP_OUTNOROUTES)		\
-	FN(BPF_CGROUP_EGRESS)		\
-	FN(IPV6DISABLED)		\
-	FN(NEIGH_CREATEFAIL)		\
-	FN(NEIGH_FAILED)		\
-	FN(NEIGH_QUEUEFULL)		\
-	FN(NEIGH_DEAD)			\
-	FN(TC_EGRESS)			\
-	FN(QDISC_DROP)			\
-	FN(CPU_BACKLOG)			\
-	FN(XDP)				\
-	FN(TC_INGRESS)			\
-	FN(UNHANDLED_PROTO)		\
-	FN(SKB_CSUM)			\
-	FN(SKB_GSO_SEG)			\
-	FN(SKB_UCOPY_FAULT)		\
-	FN(DEV_HDR)			\
-	FN(DEV_READY)			\
-	FN(FULL_RING)			\
-	FN(NOMEM)			\
-	FN(HDR_TRUNC)			\
-	FN(TAP_FILTER)			\
-	FN(TAP_TXFILTER)		\
-	FN(ICMP_CSUM)			\
-	FN(INVALID_PROTO)		\
-	FN(IP_INADDRERRORS)		\
-	FN(IP_INNOROUTES)		\
-	FN(PKT_TOO_BIG)			\
-	FN(DUP_FRAG)			\
-	FN(FRAG_REASM_TIMEOUT)		\
-	FN(FRAG_TOO_FAR)		\
-	FN(TCP_MINTTL)			\
-	FN(IPV6_BAD_EXTHDR)		\
-	FN(IPV6_NDISC_FRAG)		\
-	FN(IPV6_NDISC_HOP_LIMIT)	\
-	FN(IPV6_NDISC_BAD_CODE)		\
-	FN(IPV6_NDISC_BAD_OPTIONS)	\
-	FN(IPV6_NDISC_NS_OTHERHOST)	\
-	FNe(MAX)
-
-/**
- * enum skb_drop_reason - the reasons of skb drops
- *
- * The reason of skb drop, which is used in kfree_skb_reason().
- */
-enum skb_drop_reason {
-	/**
-	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
-	 */
-	SKB_NOT_DROPPED_YET = 0,
-	/** @SKB_CONSUMED: packet has been consumed */
-	SKB_CONSUMED,
-	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
-	SKB_DROP_REASON_NOT_SPECIFIED,
-	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
-	SKB_DROP_REASON_NO_SOCKET,
-	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
-	SKB_DROP_REASON_PKT_TOO_SMALL,
-	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
-	SKB_DROP_REASON_TCP_CSUM,
-	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
-	SKB_DROP_REASON_SOCKET_FILTER,
-	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
-	SKB_DROP_REASON_UDP_CSUM,
-	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
-	SKB_DROP_REASON_NETFILTER_DROP,
-	/**
-	 * @SKB_DROP_REASON_OTHERHOST: packet don't belong to current host
-	 * (interface is in promisc mode)
-	 */
-	SKB_DROP_REASON_OTHERHOST,
-	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
-	SKB_DROP_REASON_IP_CSUM,
-	/**
-	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with IP header (see
-	 * IPSTATS_MIB_INHDRERRORS)
-	 */
-	SKB_DROP_REASON_IP_INHDR,
-	/**
-	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validate failed. see the
-	 * document for rp_filter in ip-sysctl.rst for more information
-	 */
-	SKB_DROP_REASON_IP_RPFILTER,
-	/**
-	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: destination address of L2 is
-	 * multicast, but L3 is unicast.
-	 */
-	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
-	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
-	SKB_DROP_REASON_XFRM_POLICY,
-	/** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */
-	SKB_DROP_REASON_IP_NOPROTO,
-	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */
-	SKB_DROP_REASON_SOCKET_RCVBUFF,
-	/**
-	 * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet
-	 * drop out of udp_memory_allocated.
-	 */
-	SKB_DROP_REASON_PROTO_MEM,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
-	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
-	 */
-	SKB_DROP_REASON_TCP_MD5NOTFOUND,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
-	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
-	 */
-	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash and its wrong, corresponding
-	 * to LINUX_MIB_TCPMD5FAILURE
-	 */
-	SKB_DROP_REASON_TCP_MD5FAILURE,
-	/**
-	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
-	 * see LINUX_MIB_TCPBACKLOGDROP)
-	 */
-	SKB_DROP_REASON_SOCKET_BACKLOG,
-	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
-	SKB_DROP_REASON_TCP_FLAGS,
-	/**
-	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
-	 * see LINUX_MIB_TCPZEROWINDOWDROP
-	 */
-	SKB_DROP_REASON_TCP_ZEROWINDOW,
-	/**
-	 * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already
-	 * received before (spurious retrans may happened), see
-	 * LINUX_MIB_DELAYEDACKLOST
-	 */
-	SKB_DROP_REASON_TCP_OLD_DATA,
-	/**
-	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window,
-	 * the seq of the first byte exceed the right edges of receive
-	 * window
-	 */
-	SKB_DROP_REASON_TCP_OVERWINDOW,
-	/**
-	 * @SKB_DROP_REASON_TCP_OFOMERGE: the data of skb is already in the ofo
-	 * queue, corresponding to LINUX_MIB_TCPOFOMERGE
-	 */
-	SKB_DROP_REASON_TCP_OFOMERGE,
-	/**
-	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
-	 * LINUX_MIB_PAWSESTABREJECTED
-	 */
-	SKB_DROP_REASON_TCP_RFC7323_PAWS,
-	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
-	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
-	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
-	SKB_DROP_REASON_TCP_RESET,
-	/**
-	 * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected
-	 * SYN flag
-	 */
-	SKB_DROP_REASON_TCP_INVALID_SYN,
-	/** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */
-	SKB_DROP_REASON_TCP_CLOSE,
-	/** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */
-	SKB_DROP_REASON_TCP_FASTOPEN,
-	/** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
-	SKB_DROP_REASON_TCP_OLD_ACK,
-	/** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
-	SKB_DROP_REASON_TCP_TOO_OLD_ACK,
-	/**
-	 * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
-	 * sent yet
-	 */
-	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,
-	/** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */
-	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,
-	/** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */
-	SKB_DROP_REASON_TCP_OFO_DROP,
-	/** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */
-	SKB_DROP_REASON_IP_OUTNOROUTES,
-	/**
-	 * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB
-	 * eBPF program
-	 */
-	SKB_DROP_REASON_BPF_CGROUP_EGRESS,
-	/** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */
-	SKB_DROP_REASON_IPV6DISABLED,
-	/** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */
-	SKB_DROP_REASON_NEIGH_CREATEFAIL,
-	/** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */
-	SKB_DROP_REASON_NEIGH_FAILED,
-	/** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */
-	SKB_DROP_REASON_NEIGH_QUEUEFULL,
-	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
-	SKB_DROP_REASON_NEIGH_DEAD,
-	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
-	SKB_DROP_REASON_TC_EGRESS,
-	/**
-	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
-	 * failed to enqueue to current qdisc)
-	 */
-	SKB_DROP_REASON_QDISC_DROP,
-	/**
-	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
-	 * backlog queue. This can be caused by backlog queue full (see
-	 * netdev_max_backlog in net.rst) or RPS flow limit
-	 */
-	SKB_DROP_REASON_CPU_BACKLOG,
-	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
-	SKB_DROP_REASON_XDP,
-	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
-	SKB_DROP_REASON_TC_INGRESS,
-	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
-	SKB_DROP_REASON_UNHANDLED_PROTO,
-	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
-	SKB_DROP_REASON_SKB_CSUM,
-	/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
-	SKB_DROP_REASON_SKB_GSO_SEG,
-	/**
-	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
-	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
-	 */
-	SKB_DROP_REASON_SKB_UCOPY_FAULT,
-	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
-	SKB_DROP_REASON_DEV_HDR,
-	/**
-	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv due to
-	 * any of its data structure that is not up/ready/initialized,
-	 * e.g., the IFF_UP is not set, or driver specific tun->tfiles[txq]
-	 * is not initialized
-	 */
-	SKB_DROP_REASON_DEV_READY,
-	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
-	SKB_DROP_REASON_FULL_RING,
-	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
-	SKB_DROP_REASON_NOMEM,
-	/**
-	 * @SKB_DROP_REASON_HDR_TRUNC: failed to trunc/extract the header from
-	 * networking data, e.g., failed to pull the protocol header from
-	 * frags via pskb_may_pull()
-	 */
-	SKB_DROP_REASON_HDR_TRUNC,
-	/**
-	 * @SKB_DROP_REASON_TAP_FILTER: dropped by (ebpf) filter directly attached
-	 * to tun/tap, e.g., via TUNSETFILTEREBPF
-	 */
-	SKB_DROP_REASON_TAP_FILTER,
-	/**
-	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by tx filter implemented at
-	 * tun/tap, e.g., check_filter()
-	 */
-	SKB_DROP_REASON_TAP_TXFILTER,
-	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
-	SKB_DROP_REASON_ICMP_CSUM,
-	/**
-	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
-	 * such as a broadcasts ICMP_TIMESTAMP
-	 */
-	SKB_DROP_REASON_INVALID_PROTO,
-	/**
-	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
-	 * IPSTATS_MIB_INADDRERRORS
-	 */
-	SKB_DROP_REASON_IP_INADDRERRORS,
-	/**
-	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
-	 * IPSTATS_MIB_INADDRERRORS
-	 */
-	SKB_DROP_REASON_IP_INNOROUTES,
-	/**
-	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the
-	 * MTU)
-	 */
-	SKB_DROP_REASON_PKT_TOO_BIG,
-	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
-	SKB_DROP_REASON_DUP_FRAG,
-	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
-	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
-	/**
-	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
-	 * (/proc/sys/net/ipv4/ipfrag_max_dist)
-	 */
-	SKB_DROP_REASON_FRAG_TOO_FAR,
-	/**
-	 * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below
-	 * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT).
-	 */
-	SKB_DROP_REASON_TCP_MINTTL,
-	/** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */
-	SKB_DROP_REASON_IPV6_BAD_EXTHDR,
-	/** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */
-	SKB_DROP_REASON_IPV6_NDISC_FRAG,
-	/** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */
-	SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT,
-	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */
-	SKB_DROP_REASON_IPV6_NDISC_BAD_CODE,
-	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */
-	SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS,
-	/** @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION
-	 * for another host.
-	 */
-	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
-	/**
-	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
-	 * used as a real 'reason'
-	 */
-	SKB_DROP_REASON_MAX,
-};
-
-#define SKB_DR_INIT(name, reason)				\
-	enum skb_drop_reason name = SKB_DROP_REASON_##reason
-#define SKB_DR(name)						\
-	SKB_DR_INIT(name, NOT_SPECIFIED)
-#define SKB_DR_SET(name, reason)				\
-	(name = SKB_DROP_REASON_##reason)
-#define SKB_DR_OR(name, reason)					\
-	do {							\
-		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
-		    name == SKB_NOT_DROPPED_YET)		\
-			SKB_DR_SET(name, reason);		\
-	} while (0)
-
-extern const char * const drop_reasons[];
-
-#endif
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b23ddec3cd5c..325ad893f624 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -7,7 +7,7 @@
 #include <linux/in6.h>
 #include <linux/rbtree_types.h>
 #include <linux/refcount.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 /* Per netns frag queues directory */
 struct fqdir {
-- 
cgit v1.2.3


From 071c0fc6fb919dcf29c676a842dda08a674877d7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:53 +0200
Subject: net: extend drop reasons for multiple subsystems

Extend drop reasons to make them usable by subsystems
other than core by reserving the high 16 bits for a
new subsystem ID, of which 0 of course is used for the
existing reasons immediately.

To still be able to have string reasons, restructure
that code a bit to make the loopup under RCU, the only
user of this (right now) is drop_monitor.

Link: https://lore.kernel.org/netdev/00659771ed54353f92027702c5bbb84702da62ce.camel@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h | 14 +++++++---
 include/net/dropreason.h      | 31 +++++++++++++++++++++++
 net/core/drop_monitor.c       | 33 +++++++++++++++++-------
 net/core/skbuff.c             | 59 ++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 include/net/dropreason.h

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index ade6d5b9186c..a2b953b57689 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -340,12 +340,20 @@ enum skb_drop_reason {
 	 */
 	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
 	/**
-	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
-	 * used as a real 'reason'
+	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
+	 * shouldn't be used as a real 'reason' - only for tracing code gen
 	 */
 	SKB_DROP_REASON_MAX,
+
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MASK: subsystem mask in drop reasons,
+	 * see &enum skb_drop_reason_subsys
+	 */
+	SKB_DROP_REASON_SUBSYS_MASK = 0xffff0000,
 };
 
+#define SKB_DROP_REASON_SUBSYS_SHIFT	16
+
 #define SKB_DR_INIT(name, reason)				\
 	enum skb_drop_reason name = SKB_DROP_REASON_##reason
 #define SKB_DR(name)						\
@@ -359,6 +367,4 @@ enum skb_drop_reason {
 			SKB_DR_SET(name, reason);		\
 	} while (0)
 
-extern const char * const drop_reasons[];
-
 #endif
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
new file mode 100644
index 000000000000..f0f2378dbed0
--- /dev/null
+++ b/include/net/dropreason.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_H
+#define _LINUX_DROPREASON_H
+#include <net/dropreason-core.h>
+
+/**
+ * enum skb_drop_reason_subsys - subsystem tag for (extended) drop reasons
+ */
+enum skb_drop_reason_subsys {
+	/** @SKB_DROP_REASON_SUBSYS_CORE: core drop reasons defined above */
+	SKB_DROP_REASON_SUBSYS_CORE,
+
+	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
+	SKB_DROP_REASON_SUBSYS_NUM
+};
+
+struct drop_reason_list {
+	const char * const *reasons;
+	size_t n_reasons;
+};
+
+/* Note: due to dynamic registrations, access must be under RCU */
+extern const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM];
+
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+				  const struct drop_reason_list *list);
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys);
+
+#endif
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 5a782d1d8fd3..aff31cd944c2 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -21,6 +21,7 @@
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
+#include <linux/bitfield.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <linux/bitops.h>
@@ -29,6 +30,7 @@
 #include <net/genetlink.h>
 #include <net/netevent.h>
 #include <net/flow_offload.h>
+#include <net/dropreason.h>
 #include <net/devlink.h>
 
 #include <trace/events/skb.h>
@@ -504,8 +506,6 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
 	if (!nskb)
 		return;
 
-	if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0))
-		reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	cb = NET_DM_SKB_CB(nskb);
 	cb->reason = reason;
 	cb->pc = location;
@@ -552,9 +552,9 @@ static size_t net_dm_in_port_size(void)
 }
 
 #define NET_DM_MAX_SYMBOL_LEN 40
+#define NET_DM_MAX_REASON_LEN 50
 
-static size_t net_dm_packet_report_size(size_t payload_len,
-					enum skb_drop_reason reason)
+static size_t net_dm_packet_report_size(size_t payload_len)
 {
 	size_t size;
 
@@ -576,7 +576,7 @@ static size_t net_dm_packet_report_size(size_t payload_len,
 	       /* NET_DM_ATTR_PROTO */
 	       nla_total_size(sizeof(u16)) +
 	       /* NET_DM_ATTR_REASON */
-	       nla_total_size(strlen(drop_reasons[reason]) + 1) +
+	       nla_total_size(NET_DM_MAX_REASON_LEN + 1) +
 	       /* NET_DM_ATTR_PAYLOAD */
 	       nla_total_size(payload_len);
 }
@@ -610,6 +610,8 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 				     size_t payload_len)
 {
 	struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
+	const struct drop_reason_list *list = NULL;
+	unsigned int subsys, subsys_reason;
 	char buf[NET_DM_MAX_SYMBOL_LEN];
 	struct nlattr *attr;
 	void *hdr;
@@ -627,9 +629,24 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 			      NET_DM_ATTR_PAD))
 		goto nla_put_failure;
 
+	rcu_read_lock();
+	subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK);
+	if (subsys < SKB_DROP_REASON_SUBSYS_NUM)
+		list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+	subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+	if (!list ||
+	    subsys_reason >= list->n_reasons ||
+	    !list->reasons[subsys_reason] ||
+	    strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) {
+		list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]);
+		subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	}
 	if (nla_put_string(msg, NET_DM_ATTR_REASON,
-			   drop_reasons[cb->reason]))
+			   list->reasons[subsys_reason])) {
+		rcu_read_unlock();
 		goto nla_put_failure;
+	}
+	rcu_read_unlock();
 
 	snprintf(buf, sizeof(buf), "%pS", cb->pc);
 	if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
@@ -687,9 +704,7 @@ static void net_dm_packet_report(struct sk_buff *skb)
 	if (net_dm_trunc_len)
 		payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
 
-	msg = nlmsg_new(net_dm_packet_report_size(payload_len,
-						  NET_DM_SKB_CB(skb)->reason),
-			GFP_KERNEL);
+	msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
 	if (!msg)
 		goto out;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8764653bede7..0d998806b377 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -58,6 +58,7 @@
 #include <linux/scatterlist.h>
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
+#include <linux/bitfield.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/kcov.h>
@@ -72,6 +73,7 @@
 #include <net/mptcp.h>
 #include <net/mctp.h>
 #include <net/page_pool.h>
+#include <net/dropreason.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -122,11 +124,59 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 #undef FN
 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
-const char * const drop_reasons[] = {
+static const char * const drop_reasons[] = {
 	[SKB_CONSUMED] = "CONSUMED",
 	DEFINE_DROP_REASON(FN, FN)
 };
-EXPORT_SYMBOL(drop_reasons);
+
+static const struct drop_reason_list drop_reasons_core = {
+	.reasons = drop_reasons,
+	.n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
+
+/**
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ *	a statically initialized list
+ */
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+				  const struct drop_reason_list *list)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	/* must point to statically allocated memory, so INIT is OK */
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
+}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
+
+/**
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
+ *
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
+ */
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
 
 /**
  *	skb_panic - private function for out-of-line support
@@ -986,7 +1036,10 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 	if (unlikely(!skb_unref(skb)))
 		return false;
 
-	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+			       u32_get_bits(reason,
+					    SKB_DROP_REASON_SUBSYS_MASK) >=
+				SKB_DROP_REASON_SUBSYS_NUM);
 
 	if (reason == SKB_CONSUMED)
 		trace_consume_skb(skb, __builtin_return_address(0));
-- 
cgit v1.2.3


From baa951a1c1771810f3a378a95fc93e81953027d5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:54 +0200
Subject: mac80211: use the new drop reasons infrastructure

It can be really hard to analyse or debug why packets are
going missing in mac80211, so add the needed infrastructure
to use use the new per-subsystem drop reasons.

We actually use two drop reason subsystems here because of
the different handling of frames that are dropped but still
go to monitor for old versions of hostapd, and those that
are just completely unusable (e.g. crypto failed.)

Annotate a few reasons here just to illustrate this, we'll
need to go through and annotate more of them later.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason.h   | 12 ++++++++++
 net/mac80211/drop.h        | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  8 +------
 net/mac80211/main.c        | 31 +++++++++++++++++++++++++
 net/mac80211/rx.c          | 55 +++++++++++++++++++++------------------------
 net/mac80211/wpa.c         | 24 ++++++++++----------
 6 files changed, 138 insertions(+), 48 deletions(-)
 create mode 100644 net/mac80211/drop.h

(limited to 'include')

diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index f0f2378dbed0..685fb37df8e8 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -11,6 +11,18 @@ enum skb_drop_reason_subsys {
 	/** @SKB_DROP_REASON_SUBSYS_CORE: core drop reasons defined above */
 	SKB_DROP_REASON_SUBSYS_CORE,
 
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE: mac80211 drop reasons
+	 * for unusable frames, see net/mac80211/drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
+
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons
+	 * for frames still going to monitor, see net/mac80211/drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
+
 	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
 	SKB_DROP_REASON_SUBSYS_NUM
 };
diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h
new file mode 100644
index 000000000000..49dc809cab29
--- /dev/null
+++ b/net/mac80211/drop.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mac80211 drop reason list
+ *
+ * Copyright (C) 2023 Intel Corporation
+ */
+
+#ifndef MAC80211_DROP_H
+#define MAC80211_DROP_H
+#include <net/dropreason.h>
+
+typedef unsigned int __bitwise ieee80211_rx_result;
+
+#define MAC80211_DROP_REASONS_MONITOR(R)	\
+	R(RX_DROP_M_UNEXPECTED_4ADDR_FRAME)	\
+	R(RX_DROP_M_BAD_BCN_KEYIDX)		\
+	R(RX_DROP_M_BAD_MGMT_KEYIDX)		\
+/* this line for the trailing \ - add before this */
+
+#define MAC80211_DROP_REASONS_UNUSABLE(R)	\
+	R(RX_DROP_U_MIC_FAIL)			\
+	R(RX_DROP_U_REPLAY)			\
+	R(RX_DROP_U_BAD_MMIE)			\
+/* this line for the trailing \ - add before this */
+
+/* having two enums allows for checking ieee80211_rx_result use with sparse */
+enum ___mac80211_drop_reason {
+/* if we get to the end of handlers with RX_CONTINUE this will be the reason */
+	___RX_CONTINUE	= SKB_CONSUMED,
+
+/* this never gets used as an argument to kfree_skb_reason() */
+	___RX_QUEUED	= SKB_NOT_DROPPED_YET,
+
+#define ENUM(x) ___ ## x,
+	___RX_DROP_MONITOR = SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+	MAC80211_DROP_REASONS_MONITOR(ENUM)
+
+	___RX_DROP_UNUSABLE = SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+	MAC80211_DROP_REASONS_UNUSABLE(ENUM)
+#undef ENUM
+};
+
+enum mac80211_drop_reason {
+	RX_CONTINUE	 = (__force ieee80211_rx_result)___RX_CONTINUE,
+	RX_QUEUED	 = (__force ieee80211_rx_result)___RX_QUEUED,
+	RX_DROP_MONITOR	 = (__force ieee80211_rx_result)___RX_DROP_MONITOR,
+	RX_DROP_UNUSABLE = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE,
+#define DEF(x) x = (__force ieee80211_rx_result)___ ## x,
+	MAC80211_DROP_REASONS_MONITOR(DEF)
+	MAC80211_DROP_REASONS_UNUSABLE(DEF)
+#undef DEF
+};
+
+#endif /* MAC80211_DROP_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9b7e184430b8..a0a7839cb961 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -33,6 +33,7 @@
 #include "key.h"
 #include "sta_info.h"
 #include "debug.h"
+#include "drop.h"
 
 extern const struct cfg80211_ops mac80211_config_ops;
 
@@ -170,13 +171,6 @@ struct ieee80211_tx_data {
 	unsigned int flags;
 };
 
-
-typedef unsigned __bitwise ieee80211_rx_result;
-#define RX_CONTINUE		((__force ieee80211_rx_result) 0u)
-#define RX_DROP_UNUSABLE	((__force ieee80211_rx_result) 1u)
-#define RX_DROP_MONITOR		((__force ieee80211_rx_result) 2u)
-#define RX_QUEUED		((__force ieee80211_rx_result) 3u)
-
 /**
  * enum ieee80211_packet_rx_flags - packet RX flags
  * @IEEE80211_RX_AMSDU: a-MSDU packet
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ddf2b7811c55..55cdfaef0f5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -22,6 +22,7 @@
 #include <linux/bitmap.h>
 #include <linux/inetdevice.h>
 #include <net/net_namespace.h>
+#include <net/dropreason.h>
 #include <net/cfg80211.h>
 #include <net/addrconf.h>
 
@@ -1542,6 +1543,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
 }
 EXPORT_SYMBOL(ieee80211_free_hw);
 
+static const char * const drop_reasons_monitor[] = {
+#define V(x)	#x,
+	[0] = "RX_DROP_MONITOR",
+	MAC80211_DROP_REASONS_MONITOR(V)
+};
+
+static struct drop_reason_list drop_reason_list_monitor = {
+	.reasons = drop_reasons_monitor,
+	.n_reasons = ARRAY_SIZE(drop_reasons_monitor),
+};
+
+static const char * const drop_reasons_unusable[] = {
+	[0] = "RX_DROP_UNUSABLE",
+	MAC80211_DROP_REASONS_UNUSABLE(V)
+#undef V
+};
+
+static struct drop_reason_list drop_reason_list_unusable = {
+	.reasons = drop_reasons_unusable,
+	.n_reasons = ARRAY_SIZE(drop_reasons_unusable),
+};
+
 static int __init ieee80211_init(void)
 {
 	struct sk_buff *skb;
@@ -1559,6 +1582,11 @@ static int __init ieee80211_init(void)
 	if (ret)
 		goto err_netdev;
 
+	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
+				     &drop_reason_list_monitor);
+	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
+				     &drop_reason_list_unusable);
+
 	return 0;
  err_netdev:
 	rc80211_minstrel_exit();
@@ -1574,6 +1602,9 @@ static void __exit ieee80211_exit(void)
 
 	ieee80211_iface_exit();
 
+	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR);
+	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE);
+
 	rcu_barrier();
 }
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index db3451f5f2fb..58222c077898 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1826,7 +1826,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 				cfg80211_rx_unexpected_4addr_frame(
 					rx->sdata->dev, sta->sta.addr,
 					GFP_ATOMIC);
-			return RX_DROP_MONITOR;
+			return RX_DROP_M_UNEXPECTED_4ADDR_FRAME;
 		}
 		/*
 		 * Update counter and free packet here to avoid
@@ -1961,7 +1961,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 				cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
 							     skb->data,
 							     skb->len);
-			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+			return RX_DROP_M_BAD_BCN_KEYIDX;
 		}
 
 		rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx);
@@ -1975,7 +1975,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 
 		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
 		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
-			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+			return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */
 		if (rx->link_sta) {
 			if (ieee80211_is_group_privacy_action(skb) &&
 			    test_sta_flag(rx->sta, WLAN_STA_MFP))
@@ -3960,7 +3960,8 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 }
 
 static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
-					struct ieee80211_rate *rate)
+					struct ieee80211_rate *rate,
+					ieee80211_rx_result reason)
 {
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_local *local = rx->local;
@@ -4024,42 +4025,38 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 	}
 
  out_free_skb:
-	dev_kfree_skb(skb);
+	kfree_skb_reason(skb, (__force u32)reason);
 }
 
 static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
 					 ieee80211_rx_result res)
 {
-	switch (res) {
-	case RX_DROP_MONITOR:
-		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
-		if (rx->sta)
-			rx->link_sta->rx_stats.dropped++;
-		fallthrough;
-	case RX_CONTINUE: {
-		struct ieee80211_rate *rate = NULL;
-		struct ieee80211_supported_band *sband;
-		struct ieee80211_rx_status *status;
-
-		status = IEEE80211_SKB_RXCB((rx->skb));
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *rate = NULL;
 
-		sband = rx->local->hw.wiphy->bands[status->band];
-		if (status->encoding == RX_ENC_LEGACY)
-			rate = &sband->bitrates[status->rate_idx];
+	if (res == RX_QUEUED) {
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+		return;
+	}
 
-		ieee80211_rx_cooked_monitor(rx, rate);
-		break;
-		}
-	case RX_DROP_UNUSABLE:
+	if (res != RX_CONTINUE) {
 		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
 		if (rx->sta)
 			rx->link_sta->rx_stats.dropped++;
-		dev_kfree_skb(rx->skb);
-		break;
-	case RX_QUEUED:
-		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
-		break;
 	}
+
+	if (u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK) ==
+			SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) {
+		kfree_skb_reason(rx->skb, (__force u32)res);
+		return;
+	}
+
+	sband = rx->local->hw.wiphy->bands[status->band];
+	if (status->encoding == RX_ENC_LEGACY)
+		rate = &sband->bitrates[status->rate_idx];
+
+	ieee80211_rx_cooked_monitor(rx, rate, res);
 }
 
 static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 20f742b5503b..4133496da378 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -550,7 +550,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 		if (res < 0 ||
 		    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
 			key->u.ccmp.replays++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_REPLAY;
 		}
 
 		if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -564,7 +564,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 				    skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
 				    data_len,
 				    skb->data + skb->len - mic_len))
-				return RX_DROP_UNUSABLE;
+				return RX_DROP_U_MIC_FAIL;
 		}
 
 		memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
@@ -746,7 +746,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 		if (res < 0 ||
 		    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
 			key->u.gcmp.replays++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_REPLAY;
 		}
 
 		if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -761,7 +761,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 				    data_len,
 				    skb->data + skb->len -
 				    IEEE80211_GCMP_MIC_LEN))
-				return RX_DROP_UNUSABLE;
+				return RX_DROP_U_MIC_FAIL;
 		}
 
 		memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
@@ -930,13 +930,13 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
 		(skb->data + skb->len - sizeof(*mmie));
 	if (mmie->element_id != WLAN_EID_MMIE ||
 	    mmie->length != sizeof(*mmie) - 2)
-		return RX_DROP_UNUSABLE; /* Invalid MMIE */
+		return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
 
 	bip_ipn_swap(ipn, mmie->sequence_number);
 
 	if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
 		key->u.aes_cmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -946,7 +946,7 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
 				   skb->data + 24, skb->len - 24, mic);
 		if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_cmac.icverrors++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 	}
 
@@ -986,7 +986,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
 
 	if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
 		key->u.aes_cmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -996,7 +996,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
 				       skb->data + 24, skb->len - 24, mic);
 		if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_cmac.icverrors++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 	}
 
@@ -1079,13 +1079,13 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
 		(skb->data + skb->len - sizeof(*mmie));
 	if (mmie->element_id != WLAN_EID_MMIE ||
 	    mmie->length != sizeof(*mmie) - 2)
-		return RX_DROP_UNUSABLE; /* Invalid MMIE */
+		return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
 
 	bip_ipn_swap(ipn, mmie->sequence_number);
 
 	if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) {
 		key->u.aes_gmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -1104,7 +1104,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
 		    crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_gmac.icverrors++;
 			kfree(mic);
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 		kfree(mic);
 	}
-- 
cgit v1.2.3


From 48cd6bc5b22d68b8bbc8601f3c7ddeed99541a0b Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 18 Feb 2023 09:10:31 +0100
Subject: virtio: Reorder fields in 'struct virtqueue'

Group some variables based on their sizes to reduce hole and avoid padding.
On x86_64, this shrinks the size of 'struct virtqueue'
from 72 to 68 bytes.

It saves a few bytes of memory.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Message-Id: <8f3d2e49270a2158717e15008e7ed7228196ba02.1676707807.git.christophe.jaillet@wanadoo.fr>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Peter Lafreniere <peter@n8pjl.ca>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 include/linux/virtio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 2b472514c49b..d6e161aa532e 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -34,8 +34,8 @@ struct virtqueue {
 	unsigned int index;
 	unsigned int num_free;
 	unsigned int num_max;
-	void *priv;
 	bool reset;
+	void *priv;
 };
 
 int virtqueue_add_outbuf(struct virtqueue *vq,
-- 
cgit v1.2.3


From 4b6ec919b84830949313c805c586fb34f141ce0c Mon Sep 17 00:00:00 2001
From: Feng Liu <feliu@nvidia.com>
Date: Fri, 10 Mar 2023 07:34:28 +0200
Subject: virtio_ring: Use const to annotate read-only pointer params

Add const to make the read-only pointer parameters clear, similar to
many existing functions.

To implement this change, the commit also introduces the use of
`container_of_const` to implement `to_vvq`, which ensures the const-ness
of read-only parameters and avoids accidental modification of their
members.

Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Bodong Wang <bodong@nvidia.com>

Message-Id: <20230310053428.3376-4-feliu@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 36 ++++++++++++++++++------------------
 include/linux/virtio.h       | 14 +++++++-------
 2 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a26fab91c59f..4c3bb0ddeb9b 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -231,9 +231,9 @@ static void vring_free(struct virtqueue *_vq);
  * Helpers.
  */
 
-#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
 
-static bool virtqueue_use_indirect(struct vring_virtqueue *vq,
+static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
 				   unsigned int total_sg)
 {
 	/*
@@ -269,7 +269,7 @@ static bool virtqueue_use_indirect(struct vring_virtqueue *vq,
  * unconditionally on data path.
  */
 
-static bool vring_use_dma_api(struct virtio_device *vdev)
+static bool vring_use_dma_api(const struct virtio_device *vdev)
 {
 	if (!virtio_has_dma_quirk(vdev))
 		return true;
@@ -289,7 +289,7 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 	return false;
 }
 
-size_t virtio_max_dma_size(struct virtio_device *vdev)
+size_t virtio_max_dma_size(const struct virtio_device *vdev)
 {
 	size_t max_segment_size = SIZE_MAX;
 
@@ -423,7 +423,7 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num)
  */
 
 static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
-					   struct vring_desc *desc)
+					   const struct vring_desc *desc)
 {
 	u16 flags;
 
@@ -1183,7 +1183,7 @@ static u16 packed_last_used(u16 last_used_idx)
 }
 
 static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
-				     struct vring_desc_extra *extra)
+				     const struct vring_desc_extra *extra)
 {
 	u16 flags;
 
@@ -1206,7 +1206,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 }
 
 static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
-				   struct vring_packed_desc *desc)
+				    const struct vring_packed_desc *desc)
 {
 	u16 flags;
 
@@ -2786,10 +2786,10 @@ EXPORT_SYMBOL_GPL(vring_transport_features);
  * Returns the size of the vring.  This is mainly used for boasting to
  * userspace.  Unlike other operations, this need not be serialized.
  */
-unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
+unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq)
 {
 
-	struct vring_virtqueue *vq = to_vvq(_vq);
+	const struct vring_virtqueue *vq = to_vvq(_vq);
 
 	return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num;
 }
@@ -2819,9 +2819,9 @@ void __virtqueue_unbreak(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(__virtqueue_unbreak);
 
-bool virtqueue_is_broken(struct virtqueue *_vq)
+bool virtqueue_is_broken(const struct virtqueue *_vq)
 {
-	struct vring_virtqueue *vq = to_vvq(_vq);
+	const struct vring_virtqueue *vq = to_vvq(_vq);
 
 	return READ_ONCE(vq->broken);
 }
@@ -2868,9 +2868,9 @@ void __virtio_unbreak_device(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(__virtio_unbreak_device);
 
-dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
+dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq)
 {
-	struct vring_virtqueue *vq = to_vvq(_vq);
+	const struct vring_virtqueue *vq = to_vvq(_vq);
 
 	BUG_ON(!vq->we_own_ring);
 
@@ -2881,9 +2881,9 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
 
-dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
+dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq)
 {
-	struct vring_virtqueue *vq = to_vvq(_vq);
+	const struct vring_virtqueue *vq = to_vvq(_vq);
 
 	BUG_ON(!vq->we_own_ring);
 
@@ -2895,9 +2895,9 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
 
-dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
+dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq)
 {
-	struct vring_virtqueue *vq = to_vvq(_vq);
+	const struct vring_virtqueue *vq = to_vvq(_vq);
 
 	BUG_ON(!vq->we_own_ring);
 
@@ -2910,7 +2910,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
 
 /* Only available for split ring */
-const struct vring *virtqueue_get_vring(struct virtqueue *vq)
+const struct vring *virtqueue_get_vring(const struct virtqueue *vq)
 {
 	return &to_vvq(vq)->split.vring;
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d6e161aa532e..b93238db94e3 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -84,14 +84,14 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
 
 void *virtqueue_detach_unused_buf(struct virtqueue *vq);
 
-unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
+unsigned int virtqueue_get_vring_size(const struct virtqueue *vq);
 
-bool virtqueue_is_broken(struct virtqueue *vq);
+bool virtqueue_is_broken(const struct virtqueue *vq);
 
-const struct vring *virtqueue_get_vring(struct virtqueue *vq);
-dma_addr_t virtqueue_get_desc_addr(struct virtqueue *vq);
-dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq);
-dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
+const struct vring *virtqueue_get_vring(const struct virtqueue *vq);
+dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *vq);
+dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq);
+dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq);
 
 int virtqueue_resize(struct virtqueue *vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf));
@@ -147,7 +147,7 @@ int virtio_device_restore(struct virtio_device *dev);
 #endif
 void virtio_reset_device(struct virtio_device *dev);
 
-size_t virtio_max_dma_size(struct virtio_device *vdev);
+size_t virtio_max_dma_size(const struct virtio_device *vdev);
 
 #define virtio_device_for_each_vq(vdev, vq) \
 	list_for_each_entry(vq, &vdev->vqs, list)
-- 
cgit v1.2.3


From 1d24692732fb299c94b0dcc032b48ac8fa85c854 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Thu, 23 Mar 2023 13:30:34 +0800
Subject: vdpa: Add set/get_vq_affinity callbacks in vdpa_config_ops

This introduces set/get_vq_affinity callbacks in
vdpa_config_ops to support virtqueue affinity
management for vdpa device drivers.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20230323053043.35-3-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_vdpa.c | 28 ++++++++++++++++++++++++++++
 include/linux/vdpa.h         | 13 +++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index d7f5af62ddaa..f72696b4c1c2 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -337,6 +337,32 @@ static const char *virtio_vdpa_bus_name(struct virtio_device *vdev)
 	return dev_name(&vdpa->dev);
 }
 
+static int virtio_vdpa_set_vq_affinity(struct virtqueue *vq,
+				       const struct cpumask *cpu_mask)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+	unsigned int index = vq->index;
+
+	if (ops->set_vq_affinity)
+		return ops->set_vq_affinity(vdpa, index, cpu_mask);
+
+	return 0;
+}
+
+static const struct cpumask *
+virtio_vdpa_get_vq_affinity(struct virtio_device *vdev, int index)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (ops->get_vq_affinity)
+		return ops->get_vq_affinity(vdpa, index);
+
+	return NULL;
+}
+
 static const struct virtio_config_ops virtio_vdpa_config_ops = {
 	.get		= virtio_vdpa_get,
 	.set		= virtio_vdpa_set,
@@ -349,6 +375,8 @@ static const struct virtio_config_ops virtio_vdpa_config_ops = {
 	.get_features	= virtio_vdpa_get_features,
 	.finalize_features = virtio_vdpa_finalize_features,
 	.bus_name	= virtio_vdpa_bus_name,
+	.set_vq_affinity = virtio_vdpa_set_vq_affinity,
+	.get_vq_affinity = virtio_vdpa_get_vq_affinity,
 };
 
 static void virtio_vdpa_release_dev(struct device *_d)
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 43f59ef10cc9..2313db735773 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -250,6 +250,15 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				Returns the iova range supported by
  *				the device.
+ * @set_vq_affinity:		Set the affinity of virtqueue (optional)
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@cpu_mask: the affinity mask
+ *				Returns integer: success (0) or error (< 0)
+ * @get_vq_affinity:		Get the affinity of virtqueue (optional)
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				Returns the affinity mask
  * @set_group_asid:		Set address space identifier for a
  *				virtqueue group (optional)
  *				@vdev: vdpa device
@@ -340,6 +349,10 @@ struct vdpa_config_ops {
 			   const void *buf, unsigned int len);
 	u32 (*get_generation)(struct vdpa_device *vdev);
 	struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev);
+	int (*set_vq_affinity)(struct vdpa_device *vdev, u16 idx,
+			       const struct cpumask *cpu_mask);
+	const struct cpumask *(*get_vq_affinity)(struct vdpa_device *vdev,
+						 u16 idx);
 
 	/* DMA ops */
 	int (*set_map)(struct vdpa_device *vdev, unsigned int asid,
-- 
cgit v1.2.3


From 5e68470f4e80a4120e9ecec408f6ab4ad386bd4a Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Thu, 23 Mar 2023 13:30:40 +0800
Subject: vdpa: Add eventfd for the vdpa callback

Add eventfd for the vdpa callback so that user
can signal it directly instead of triggering the
callback. It will be used for vhost-vdpa case.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20230323053043.35-9-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vdpa.c         | 2 ++
 drivers/virtio/virtio_vdpa.c | 1 +
 include/linux/vdpa.h         | 6 ++++++
 3 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 7be9d9d8f01c..9cd878e25cff 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -599,9 +599,11 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		if (vq->call_ctx.ctx) {
 			cb.callback = vhost_vdpa_virtqueue_cb;
 			cb.private = vq;
+			cb.trigger = vq->call_ctx.ctx;
 		} else {
 			cb.callback = NULL;
 			cb.private = NULL;
+			cb.trigger = NULL;
 		}
 		ops->set_vq_cb(vdpa, idx, &cb);
 		vhost_vdpa_setup_vq_irq(v, idx);
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f3826f42b704..2a095f37f26b 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -196,6 +196,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 	/* Setup virtqueue callback */
 	cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
 	cb.private = info;
+	cb.trigger = NULL;
 	ops->set_vq_cb(vdpa, index, &cb);
 	ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
 
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 2313db735773..f2064fa2d2da 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -13,10 +13,16 @@
  * struct vdpa_calllback - vDPA callback definition.
  * @callback: interrupt callback function
  * @private: the data passed to the callback function
+ * @trigger: the eventfd for the callback (Optional).
+ *           When it is set, the vDPA driver must guarantee that
+ *           signaling it is functional equivalent to triggering
+ *           the callback. Then vDPA parent can signal it directly
+ *           instead of triggering the callback.
  */
 struct vdpa_callback {
 	irqreturn_t (*callback)(void *data);
 	void *private;
+	struct eventfd_ctx *trigger;
 };
 
 /**
-- 
cgit v1.2.3


From c618c84d4ccc6268c3da7609c7388b6cb305c639 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Tue, 4 Apr 2023 15:13:18 +0200
Subject: vdpa: add bind_mm/unbind_mm callbacks

These new optional callbacks is used to bind/unbind the device to
a specific address space so the vDPA framework can use VA when
these callbacks are implemented.

Suggested-by: Jason Wang <jasowang@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20230404131326.44403-2-sgarzare@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index f2064fa2d2da..2f1af73b3a1a 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -305,6 +305,14 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				Returns pointer to structure device or error (NULL)
+ * @bind_mm:			Bind the device to a specific address space
+ *				so the vDPA framework can use VA when this
+ *				callback is implemented. (optional)
+ *				@vdev: vdpa device
+ *				@mm: address space to bind
+ * @unbind_mm:			Unbind the device from the address space
+ *				bound using the bind_mm callback. (optional)
+ *				@vdev: vdpa device
  * @free:			Free resources that belongs to vDPA (optional)
  *				@vdev: vdpa device
  */
@@ -370,6 +378,8 @@ struct vdpa_config_ops {
 	int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group,
 			      unsigned int asid);
 	struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx);
+	int (*bind_mm)(struct vdpa_device *vdev, struct mm_struct *mm);
+	void (*unbind_mm)(struct vdpa_device *vdev);
 
 	/* Free device resources */
 	void (*free)(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From 42823a871fd4e17b34034f43b36ae57bd2ed8a67 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Tue, 4 Apr 2023 15:17:16 +0200
Subject: vringh: support VA with iotlb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vDPA supports the possibility to use user VA in the iotlb messages.
So, let's add support for user VA in vringh to use it in the vDPA
simulators.

Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20230404131716.45855-1-sgarzare@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vringh.c | 171 ++++++++++++++++++++++++++++++++++++++++---------
 include/linux/vringh.h |   9 +++
 2 files changed, 148 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 3d5f6bb4ac31..955d938eb663 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -1094,10 +1094,17 @@ EXPORT_SYMBOL(vringh_need_notify_kern);
 
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 
+struct iotlb_vec {
+	union {
+		struct iovec *iovec;
+		struct bio_vec *bvec;
+	} iov;
+	size_t count;
+};
+
 static int iotlb_translate(const struct vringh *vrh,
 			   u64 addr, u64 len, u64 *translated,
-			   struct bio_vec iov[],
-			   int iov_size, u32 perm)
+			   struct iotlb_vec *ivec, u32 perm)
 {
 	struct vhost_iotlb_map *map;
 	struct vhost_iotlb *iotlb = vrh->iotlb;
@@ -1107,9 +1114,11 @@ static int iotlb_translate(const struct vringh *vrh,
 	spin_lock(vrh->iotlb_lock);
 
 	while (len > s) {
-		u64 size, pa, pfn;
+		uintptr_t io_addr;
+		size_t io_len;
+		u64 size;
 
-		if (unlikely(ret >= iov_size)) {
+		if (unlikely(ret >= ivec->count)) {
 			ret = -ENOBUFS;
 			break;
 		}
@@ -1124,10 +1133,22 @@ static int iotlb_translate(const struct vringh *vrh,
 		}
 
 		size = map->size - addr + map->start;
-		pa = map->addr + addr - map->start;
-		pfn = pa >> PAGE_SHIFT;
-		bvec_set_page(&iov[ret], pfn_to_page(pfn), min(len - s, size),
-			      pa & (PAGE_SIZE - 1));
+		io_len = min(len - s, size);
+		io_addr = map->addr - map->start + addr;
+
+		if (vrh->use_va) {
+			struct iovec *iovec = ivec->iov.iovec;
+
+			iovec[ret].iov_len = io_len;
+			iovec[ret].iov_base = (void __user *)io_addr;
+		} else {
+			u64 pfn = io_addr >> PAGE_SHIFT;
+			struct bio_vec *bvec = ivec->iov.bvec;
+
+			bvec_set_page(&bvec[ret], pfn_to_page(pfn), io_len,
+				      io_addr & (PAGE_SIZE - 1));
+		}
+
 		s += size;
 		addr += size;
 		++ret;
@@ -1146,23 +1167,36 @@ static int iotlb_translate(const struct vringh *vrh,
 static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
 				  void *src, size_t len)
 {
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[IOTLB_IOV_STRIDE];
+		struct bio_vec bvec[IOTLB_IOV_STRIDE];
+	} iov;
 	u64 total_translated = 0;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = IOTLB_IOV_STRIDE;
+
 	while (total_translated < len) {
-		struct bio_vec iov[IOTLB_IOV_STRIDE];
 		struct iov_iter iter;
 		u64 translated;
 		int ret;
 
 		ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
 				      len - total_translated, &translated,
-				      iov, ARRAY_SIZE(iov), VHOST_MAP_RO);
+				      &ivec, VHOST_MAP_RO);
 		if (ret == -ENOBUFS)
-			ret = ARRAY_SIZE(iov);
+			ret = IOTLB_IOV_STRIDE;
 		else if (ret < 0)
 			return ret;
 
-		iov_iter_bvec(&iter, ITER_SOURCE, iov, ret, translated);
+		if (vrh->use_va) {
+			iov_iter_init(&iter, ITER_SOURCE, ivec.iov.iovec, ret,
+				      translated);
+		} else {
+			iov_iter_bvec(&iter, ITER_SOURCE, ivec.iov.bvec, ret,
+				      translated);
+		}
 
 		ret = copy_from_iter(dst, translated, &iter);
 		if (ret < 0)
@@ -1179,23 +1213,36 @@ static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
 static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
 				void *src, size_t len)
 {
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[IOTLB_IOV_STRIDE];
+		struct bio_vec bvec[IOTLB_IOV_STRIDE];
+	} iov;
 	u64 total_translated = 0;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = IOTLB_IOV_STRIDE;
+
 	while (total_translated < len) {
-		struct bio_vec iov[IOTLB_IOV_STRIDE];
 		struct iov_iter iter;
 		u64 translated;
 		int ret;
 
 		ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
 				      len - total_translated, &translated,
-				      iov, ARRAY_SIZE(iov), VHOST_MAP_WO);
+				      &ivec, VHOST_MAP_WO);
 		if (ret == -ENOBUFS)
-			ret = ARRAY_SIZE(iov);
+			ret = IOTLB_IOV_STRIDE;
 		else if (ret < 0)
 			return ret;
 
-		iov_iter_bvec(&iter, ITER_DEST, iov, ret, translated);
+		if (vrh->use_va) {
+			iov_iter_init(&iter, ITER_DEST, ivec.iov.iovec, ret,
+				      translated);
+		} else {
+			iov_iter_bvec(&iter, ITER_DEST, ivec.iov.bvec, ret,
+				      translated);
+		}
 
 		ret = copy_to_iter(src, translated, &iter);
 		if (ret < 0)
@@ -1212,20 +1259,36 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
 static inline int getu16_iotlb(const struct vringh *vrh,
 			       u16 *val, const __virtio16 *p)
 {
-	struct bio_vec iov;
-	void *kaddr, *from;
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[1];
+		struct bio_vec bvec[1];
+	} iov;
+	__virtio16 tmp;
 	int ret;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = 1;
+
 	/* Atomic read is needed for getu16 */
-	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL,
-			      &iov, 1, VHOST_MAP_RO);
+	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+			      NULL, &ivec, VHOST_MAP_RO);
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_local_page(iov.bv_page);
-	from = kaddr + iov.bv_offset;
-	*val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from));
-	kunmap_local(kaddr);
+	if (vrh->use_va) {
+		ret = __get_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
+		if (ret)
+			return ret;
+	} else {
+		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
+		void *from = kaddr + ivec.iov.bvec[0].bv_offset;
+
+		tmp = READ_ONCE(*(__virtio16 *)from);
+		kunmap_local(kaddr);
+	}
+
+	*val = vringh16_to_cpu(vrh, tmp);
 
 	return 0;
 }
@@ -1233,20 +1296,36 @@ static inline int getu16_iotlb(const struct vringh *vrh,
 static inline int putu16_iotlb(const struct vringh *vrh,
 			       __virtio16 *p, u16 val)
 {
-	struct bio_vec iov;
-	void *kaddr, *to;
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec;
+		struct bio_vec bvec;
+	} iov;
+	__virtio16 tmp;
 	int ret;
 
+	ivec.iov.iovec = &iov.iovec;
+	ivec.count = 1;
+
 	/* Atomic write is needed for putu16 */
-	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL,
-			      &iov, 1, VHOST_MAP_WO);
+	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+			      NULL, &ivec, VHOST_MAP_RO);
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_local_page(iov.bv_page);
-	to = kaddr + iov.bv_offset;
-	WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val));
-	kunmap_local(kaddr);
+	tmp = cpu_to_vringh16(vrh, val);
+
+	if (vrh->use_va) {
+		ret = __put_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
+		if (ret)
+			return ret;
+	} else {
+		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
+		void *to = kaddr + ivec.iov.bvec[0].bv_offset;
+
+		WRITE_ONCE(*(__virtio16 *)to, tmp);
+		kunmap_local(kaddr);
+	}
 
 	return 0;
 }
@@ -1320,11 +1399,39 @@ int vringh_init_iotlb(struct vringh *vrh, u64 features,
 		      struct vring_avail *avail,
 		      struct vring_used *used)
 {
+	vrh->use_va = false;
+
 	return vringh_init_kern(vrh, features, num, weak_barriers,
 				desc, avail, used);
 }
 EXPORT_SYMBOL(vringh_init_iotlb);
 
+/**
+ * vringh_init_iotlb_va - initialize a vringh for a ring with IOTLB containing
+ *                        user VA.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_iotlb_va(struct vringh *vrh, u64 features,
+			 unsigned int num, bool weak_barriers,
+			 struct vring_desc *desc,
+			 struct vring_avail *avail,
+			 struct vring_used *used)
+{
+	vrh->use_va = true;
+
+	return vringh_init_kern(vrh, features, num, weak_barriers,
+				desc, avail, used);
+}
+EXPORT_SYMBOL(vringh_init_iotlb_va);
+
 /**
  * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
  * @vrh: the vring
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 1991a02c6431..b4edfadf5479 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -32,6 +32,9 @@ struct vringh {
 	/* Can we get away with weak barriers? */
 	bool weak_barriers;
 
+	/* Use user's VA */
+	bool use_va;
+
 	/* Last available index we saw (ie. where we're up to). */
 	u16 last_avail_idx;
 
@@ -284,6 +287,12 @@ int vringh_init_iotlb(struct vringh *vrh, u64 features,
 		      struct vring_avail *avail,
 		      struct vring_used *used);
 
+int vringh_init_iotlb_va(struct vringh *vrh, u64 features,
+			 unsigned int num, bool weak_barriers,
+			 struct vring_desc *desc,
+			 struct vring_avail *avail,
+			 struct vring_used *used);
+
 int vringh_getdesc_iotlb(struct vringh *vrh,
 			 struct vringh_kiov *riov,
 			 struct vringh_kiov *wiov,
-- 
cgit v1.2.3


From 9be5d2d424a19a0c6d643a1baecba3d58e725012 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Fri, 31 Mar 2023 10:56:55 +0200
Subject: vdpa: address kdoc warnings

This patch addresses the following minor kdoc problems.

* Incorrect spelling of 'callback' and 'notification'
* Unrecognised kdoc format for 'struct vdpa_map_file'
* Missing documentation of 'get_vendor_vq_stats' member of
  'struct vdpa_config_ops'
* Missing documentation of 'max_supported_vqs' and 'supported_features'
  members of 'struct vdpa_mgmt_dev'

Most of these problems were flagged by:

 $ ./scripts/kernel-doc -Werror -none  include/linux/vdpa.h
 include/linux/vdpa.h:20: warning: expecting prototype for struct vdpa_calllback. Prototype was for struct vdpa_callback instead
 include/linux/vdpa.h:117: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
  * Corresponding file area for device memory mapping
 include/linux/vdpa.h:357: warning: Function parameter or member 'get_vendor_vq_stats' not described in 'vdpa_config_ops'
 include/linux/vdpa.h:518: warning: Function parameter or member 'supported_features' not described in 'vdpa_mgmt_dev'
 include/linux/vdpa.h:518: warning: Function parameter or member 'max_supported_vqs' not described in 'vdpa_mgmt_dev'

The misspelling of 'notification' was flagged by:
 $ ./scripts/checkpatch.pl --codespell --showfile --strict -f include/linux/vdpa.h
 include/linux/vdpa.h:171: CHECK: 'notifcation' may be misspelled - perhaps 'notification'?
 ...

Signed-off-by: Simon Horman <horms@kernel.org>
Message-Id: <20230331-vhost-fixes-v1-1-1f046e735b9e@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/vdpa.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 2f1af73b3a1a..c0d3f5433af7 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -10,7 +10,7 @@
 #include <linux/if_ether.h>
 
 /**
- * struct vdpa_calllback - vDPA callback definition.
+ * struct vdpa_callback - vDPA callback definition.
  * @callback: interrupt callback function
  * @private: the data passed to the callback function
  * @trigger: the eventfd for the callback (Optional).
@@ -120,7 +120,7 @@ struct vdpa_dev_set_config {
 };
 
 /**
- * Corresponding file area for device memory mapping
+ * struct vdpa_map_file - file area for device memory mapping
  * @file: vma->vm_file for the mapping
  * @offset: mapping offset in the vm_file
  */
@@ -171,10 +171,16 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				@state: pointer to returned state (last_avail_idx)
+ * @get_vendor_vq_stats:	Get the vendor statistics of a device.
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@msg: socket buffer holding stats message
+ *				@extack: extack for reporting error messages
+ *				Returns integer: success (0) or error (< 0)
  * @get_vq_notification:	Get the notification area for a virtqueue (optional)
  *				@vdev: vdpa device
  *				@idx: virtqueue index
- *				Returns the notifcation area
+ *				Returns the notification area
  * @get_vq_irq:			Get the irq number of a virtqueue (optional,
  *				but must implemented if require vq irq offloading)
  *				@vdev: vdpa device
@@ -535,6 +541,8 @@ struct vdpa_mgmtdev_ops {
  * @config_attr_mask: bit mask of attributes of type enum vdpa_attr that
  *		      management device support during dev_add callback
  * @list: list entry
+ * @supported_features: features supported by device
+ * @max_supported_vqs: maximum number of virtqueues supported by device
  */
 struct vdpa_mgmt_dev {
 	struct device *device;
-- 
cgit v1.2.3


From b2ffaa672edaf1a681537a9f47bc14ab6c9e8845 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Fri, 31 Mar 2023 10:56:56 +0200
Subject: vringh: address kdoc warnings

Address some minor kdoc warnings in vring.h.
* Place kdoc for 'struct vringh_config_ops' immediately before the structure
* Add missing documentation of members of 'vringh_iov' and 'vringh_kiov'

Warnings flagged by:
 $ ./scripts/kernel-doc -none include/linux/vringh.h
 include/linux/vringh.h:68: error: Cannot parse struct or union!
 include/linux/vringh.h:92: warning: Function parameter or member 'iov' not described in 'vringh_iov'
 include/linux/vringh.h:92: warning: Function parameter or member 'consumed' not described in 'vringh_iov'
 include/linux/vringh.h:92: warning: Function parameter or member 'i' not described in 'vringh_iov'
 include/linux/vringh.h:92: warning: Function parameter or member 'used' not described in 'vringh_iov'
 include/linux/vringh.h:92: warning: Function parameter or member 'max_num' not described in 'vringh_iov'
 include/linux/vringh.h:104: warning: Function parameter or member 'iov' not described in 'vringh_kiov'
 include/linux/vringh.h:104: warning: Function parameter or member 'consumed' not described in 'vringh_kiov'
 include/linux/vringh.h:104: warning: Function parameter or member 'i' not described in 'vringh_kiov'
 include/linux/vringh.h:104: warning: Function parameter or member 'used' not described in 'vringh_kiov'
 include/linux/vringh.h:104: warning: Function parameter or member 'max_num' not described in 'vringh_kiov'

Signed-off-by: Simon Horman <horms@kernel.org>
Message-Id: <20230331-vhost-fixes-v1-2-1f046e735b9e@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vringh.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index b4edfadf5479..c3a8117dabe8 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -57,6 +57,9 @@ struct vringh {
 	void (*notify)(struct vringh *);
 };
 
+struct virtio_device;
+typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
+
 /**
  * struct vringh_config_ops - ops for creating a host vring from a virtio driver
  * @find_vrhs: find the host vrings and instantiate them
@@ -68,8 +71,6 @@ struct vringh {
  *	Returns 0 on success or error status
  * @del_vrhs: free the host vrings found by find_vrhs().
  */
-struct virtio_device;
-typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
 struct vringh_config_ops {
 	int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
 			 struct vringh *vrhs[], vrh_callback_t *callbacks[]);
@@ -84,6 +85,12 @@ struct vringh_range {
 
 /**
  * struct vringh_iov - iovec mangler.
+ * @iov: array of iovecs to operate on
+ * @consumed: number of bytes consumed within iov[i]
+ * @i: index of current iovec
+ * @used: number of iovecs present in @iov
+ * @max_num: maximum number of iovecs.
+ *           corresponds to allocated memory of @iov
  *
  * Mangles iovec in place, and restores it.
  * Remaining data is iov + i, of used - i elements.
@@ -96,6 +103,12 @@ struct vringh_iov {
 
 /**
  * struct vringh_kiov - kvec mangler.
+ * @iov: array of iovecs to operate on
+ * @consumed: number of bytes consumed within iov[i]
+ * @i: index of current iovec
+ * @used: number of iovecs present in @iov
+ * @max_num: maximum number of iovecs.
+ *           corresponds to allocated memory of @iov
  *
  * Mangles kvec in place, and restores it.
  * Remaining data is iov + i, of used - i elements.
-- 
cgit v1.2.3


From af8ececda185078c096852edb4e1d7a2349e6856 Mon Sep 17 00:00:00 2001
From: Viktor Prutyanov <viktor@daynix.com>
Date: Thu, 13 Apr 2023 11:18:54 +0300
Subject: virtio: add VIRTIO_F_NOTIFICATION_DATA feature support

According to VirtIO spec v1.2, VIRTIO_F_NOTIFICATION_DATA feature
indicates that the driver passes extra data along with the queue
notifications.

In a split queue case, the extra data is 16-bit available index. In a
packed queue case, the extra data is 1-bit wrap counter and 15-bit
available index.

Add support for this feature for MMIO, channel I/O and modern PCI
transports.

Signed-off-by: Viktor Prutyanov <viktor@daynix.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Message-Id: <20230413081855.36643-2-alvaro.karsz@solid-run.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/s390/virtio/virtio_ccw.c   | 22 +++++++++++++++++++---
 drivers/virtio/virtio_mmio.c       | 18 +++++++++++++++++-
 drivers/virtio/virtio_pci_modern.c | 17 ++++++++++++++++-
 drivers/virtio/virtio_ring.c       | 19 +++++++++++++++++++
 include/linux/virtio_ring.h        |  2 ++
 include/uapi/linux/virtio_config.h |  6 ++++++
 6 files changed, 79 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 954fc31b4bc7..02922768b129 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -391,7 +391,7 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev,
 	ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area));
 }
 
-static bool virtio_ccw_kvm_notify(struct virtqueue *vq)
+static inline bool virtio_ccw_do_kvm_notify(struct virtqueue *vq, u32 data)
 {
 	struct virtio_ccw_vq_info *info = vq->priv;
 	struct virtio_ccw_device *vcdev;
@@ -402,12 +402,22 @@ static bool virtio_ccw_kvm_notify(struct virtqueue *vq)
 	BUILD_BUG_ON(sizeof(struct subchannel_id) != sizeof(unsigned int));
 	info->cookie = kvm_hypercall3(KVM_S390_VIRTIO_CCW_NOTIFY,
 				      *((unsigned int *)&schid),
-				      vq->index, info->cookie);
+				      data, info->cookie);
 	if (info->cookie < 0)
 		return false;
 	return true;
 }
 
+static bool virtio_ccw_kvm_notify(struct virtqueue *vq)
+{
+	return virtio_ccw_do_kvm_notify(vq, vq->index);
+}
+
+static bool virtio_ccw_kvm_notify_with_data(struct virtqueue *vq)
+{
+	return virtio_ccw_do_kvm_notify(vq, vring_notification_data(vq));
+}
+
 static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
 				   struct ccw1 *ccw, int index)
 {
@@ -495,6 +505,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 					     struct ccw1 *ccw)
 {
 	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+	bool (*notify)(struct virtqueue *vq);
 	int err;
 	struct virtqueue *vq = NULL;
 	struct virtio_ccw_vq_info *info;
@@ -502,6 +513,11 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 	unsigned long flags;
 	bool may_reduce;
 
+	if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA))
+		notify = virtio_ccw_kvm_notify_with_data;
+	else
+		notify = virtio_ccw_kvm_notify;
+
 	/* Allocate queue. */
 	info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL);
 	if (!info) {
@@ -524,7 +540,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 	may_reduce = vcdev->revision > 0;
 	vq = vring_create_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN,
 				    vdev, true, may_reduce, ctx,
-				    virtio_ccw_kvm_notify, callback, name);
+				    notify, callback, name);
 
 	if (!vq) {
 		/* For now, we fail if we can't get the requested size. */
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 3ff746e3f24a..dd4424c14233 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -285,6 +285,16 @@ static bool vm_notify(struct virtqueue *vq)
 	return true;
 }
 
+static bool vm_notify_with_data(struct virtqueue *vq)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
+	u32 data = vring_notification_data(vq);
+
+	writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+
+	return true;
+}
+
 /* Notify all virtqueues on an interrupt. */
 static irqreturn_t vm_interrupt(int irq, void *opaque)
 {
@@ -363,12 +373,18 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 				  const char *name, bool ctx)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	bool (*notify)(struct virtqueue *vq);
 	struct virtio_mmio_vq_info *info;
 	struct virtqueue *vq;
 	unsigned long flags;
 	unsigned int num;
 	int err;
 
+	if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA))
+		notify = vm_notify_with_data;
+	else
+		notify = vm_notify;
+
 	if (!name)
 		return NULL;
 
@@ -397,7 +413,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 
 	/* Create the vring */
 	vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev,
-				 true, true, ctx, vm_notify, callback, name);
+				 true, true, ctx, notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto error_new_virtqueue;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 6e713904d8e8..d6bb68ba84e5 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -288,6 +288,15 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 	return vp_modern_config_vector(&vp_dev->mdev, vector);
 }
 
+static bool vp_notify_with_data(struct virtqueue *vq)
+{
+	u32 data = vring_notification_data(vq);
+
+	iowrite32(data, (void __iomem *)vq->priv);
+
+	return true;
+}
+
 static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 				  struct virtio_pci_vq_info *info,
 				  unsigned int index,
@@ -298,10 +307,16 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 {
 
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	bool (*notify)(struct virtqueue *vq);
 	struct virtqueue *vq;
 	u16 num;
 	int err;
 
+	if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA))
+		notify = vp_notify_with_data;
+	else
+		notify = vp_notify;
+
 	if (index >= vp_modern_get_num_queues(mdev))
 		return ERR_PTR(-EINVAL);
 
@@ -316,7 +331,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	vq = vring_create_virtqueue(index, num,
 				    SMP_CACHE_BYTES, &vp_dev->vdev,
 				    true, true, ctx,
-				    vp_notify, callback, name);
+				    notify, callback, name);
 	if (!vq)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 34e3849629da..c5310eaf8b46 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2762,6 +2762,23 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(vring_del_virtqueue);
 
+u32 vring_notification_data(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 next;
+
+	if (vq->packed_ring)
+		next = (vq->packed.next_avail_idx &
+				~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) |
+			vq->packed.avail_wrap_counter <<
+				VRING_PACKED_EVENT_F_WRAP_CTR;
+	else
+		next = vq->split.avail_idx_shadow;
+
+	return next << 16 | _vq->index;
+}
+EXPORT_SYMBOL_GPL(vring_notification_data);
+
 /* Manipulates transport-specific feature bits. */
 void vring_transport_features(struct virtio_device *vdev)
 {
@@ -2781,6 +2798,8 @@ void vring_transport_features(struct virtio_device *vdev)
 			break;
 		case VIRTIO_F_ORDER_PLATFORM:
 			break;
+		case VIRTIO_F_NOTIFICATION_DATA:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 8b95b69ef694..2550c9170f4f 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -117,4 +117,6 @@ void vring_del_virtqueue(struct virtqueue *vq);
 void vring_transport_features(struct virtio_device *vdev);
 
 irqreturn_t vring_interrupt(int irq, void *_vq);
+
+u32 vring_notification_data(struct virtqueue *_vq);
 #endif /* _LINUX_VIRTIO_RING_H */
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 3c05162bc988..2c712c654165 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -99,6 +99,12 @@
  */
 #define VIRTIO_F_SR_IOV			37
 
+/*
+ * This feature indicates that the driver passes extra data (besides
+ * identifying the virtqueue) in its device notifications.
+ */
+#define VIRTIO_F_NOTIFICATION_DATA	38
+
 /*
  * This feature indicates that the driver can reset a queue individually.
  */
-- 
cgit v1.2.3


From 2c4e4a22a3b090e06191f8544d21e0e1d72ce518 Mon Sep 17 00:00:00 2001
From: Alvaro Karsz <alvaro.karsz@solid-run.com>
Date: Thu, 13 Apr 2023 11:18:55 +0300
Subject: virtio-vdpa: add VIRTIO_F_NOTIFICATION_DATA feature support

Add VIRTIO_F_NOTIFICATION_DATA support for vDPA transport.
If this feature is negotiated, the driver passes extra data when kicking
a virtqueue.

A device that offers this feature needs to implement the
kick_vq_with_data callback.

kick_vq_with_data receives the vDPA device and data.
data includes:
16 bits vqn and 16 bits next available index for split virtqueues.
16 bits vqs, 15 least significant bits of next available index
and 1 bit next_wrap for packed virtqueues.

This patch follows a patch [1] by Viktor Prutyanov which adds support
for the MMIO, channel I/O and modern PCI transports.

Signed-off-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
Message-Id: <20230413081855.36643-3-alvaro.karsz@solid-run.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/virtio/virtio_vdpa.c | 23 +++++++++++++++++++++--
 include/linux/vdpa.h         |  9 +++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 2a095f37f26b..eb6aee8c06b2 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -113,6 +113,17 @@ static bool virtio_vdpa_notify(struct virtqueue *vq)
 	return true;
 }
 
+static bool virtio_vdpa_notify_with_data(struct virtqueue *vq)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+	u32 data = vring_notification_data(vq);
+
+	ops->kick_vq_with_data(vdpa, data);
+
+	return true;
+}
+
 static irqreturn_t virtio_vdpa_config_cb(void *private)
 {
 	struct virtio_vdpa_device *vd_dev = private;
@@ -139,6 +150,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 	struct device *dma_dev;
 	const struct vdpa_config_ops *ops = vdpa->config;
 	struct virtio_vdpa_vq_info *info;
+	bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify;
 	struct vdpa_callback cb;
 	struct virtqueue *vq;
 	u64 desc_addr, driver_addr, device_addr;
@@ -155,6 +167,14 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 	if (index >= vdpa->nvqs)
 		return ERR_PTR(-ENOENT);
 
+	/* We cannot accept VIRTIO_F_NOTIFICATION_DATA without kick_vq_with_data */
+	if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+		if (ops->kick_vq_with_data)
+			notify = virtio_vdpa_notify_with_data;
+		else
+			__virtio_clear_bit(vdev, VIRTIO_F_NOTIFICATION_DATA);
+	}
+
 	/* Queue shouldn't already be set up. */
 	if (ops->get_vq_ready(vdpa, index))
 		return ERR_PTR(-ENOENT);
@@ -184,8 +204,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 		dma_dev = vdpa_get_dma_dev(vdpa);
 	vq = vring_create_virtqueue_dma(index, max_num, align, vdev,
 					true, may_reduce_num, ctx,
-					virtio_vdpa_notify, callback,
-					name, dma_dev);
+					notify, callback, name, dma_dev);
 	if (!vq) {
 		err = -ENOMEM;
 		goto error_new_virtqueue;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index c0d3f5433af7..db1b0eaef4eb 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -149,6 +149,14 @@ struct vdpa_map_file {
  * @kick_vq:			Kick the virtqueue
  *				@vdev: vdpa device
  *				@idx: virtqueue index
+ * @kick_vq_with_data:		Kick the virtqueue and supply extra data
+ *				(only if VIRTIO_F_NOTIFICATION_DATA is negotiated)
+ *				@vdev: vdpa device
+ *				@data for split virtqueue:
+ *				16 bits vqn and 16 bits next available index.
+ *				@data for packed virtqueue:
+ *				16 bits vqn, 15 least significant bits of
+ *				next available index and 1 bit next_wrap.
  * @set_vq_cb:			Set the interrupt callback function for
  *				a virtqueue
  *				@vdev: vdpa device
@@ -329,6 +337,7 @@ struct vdpa_config_ops {
 			      u64 device_area);
 	void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
 	void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
+	void (*kick_vq_with_data)(struct vdpa_device *vdev, u32 data);
 	void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
 			  struct vdpa_callback *cb);
 	void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
-- 
cgit v1.2.3


From 38fc29ea754711e9a8c4e9ca2678ab41353a4662 Mon Sep 17 00:00:00 2001
From: Shunsuke Mie <mie@igel.co.jp>
Date: Mon, 17 Apr 2023 11:20:36 +0900
Subject: virtio_ring: add a struct device forward declaration

The virtio_ring header file uses the struct device without a forward
declaration.

Signed-off-by: Shunsuke Mie <mie@igel.co.jp>
Message-Id: <20230417022037.917668-1-mie@igel.co.jp>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_ring.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 2550c9170f4f..9b33df741b63 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -58,6 +58,7 @@ do { \
 
 struct virtio_device;
 struct virtqueue;
+struct device;
 
 /*
  * Creates a virtqueue and allocates the descriptor ring.  If
-- 
cgit v1.2.3


From add7370a398930077c6bc257ef5016b040d476eb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:28 -0400
Subject: sctp: delete the nested flexible array params

This patch deletes the flexible-array params[] from the structure
sctp_inithdr, sctp_addiphdr and sctp_reconf_chunk to avoid some
sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/input.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:278:29: warning: nested flexible array
  ./include/linux/sctp.h:675:30: warning: nested flexible array

This warning is reported if a structure having a flexible array
member is included by other structures.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  6 +++---
 include/net/sctp/sctp.h  |  8 ++++----
 net/sctp/input.c         |  2 +-
 net/sctp/sm_make_chunk.c | 18 +++++++++---------
 net/sctp/sm_statefuns.c  |  2 +-
 net/sctp/stream.c        |  2 +-
 6 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 358dc08e0831..0ff36a2737a3 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -270,7 +270,7 @@ struct sctp_inithdr {
 	__be16 num_outbound_streams;
 	__be16 num_inbound_streams;
 	__be32 initial_tsn;
-	__u8  params[];
+	/* __u8  params[]; */
 };
 
 struct sctp_init_chunk {
@@ -667,7 +667,7 @@ struct sctp_addip_param {
 
 struct sctp_addiphdr {
 	__be32	serial;
-	__u8	params[];
+	/* __u8	params[]; */
 };
 
 struct sctp_addip_chunk {
@@ -742,7 +742,7 @@ struct sctp_infox {
 
 struct sctp_reconf_chunk {
 	struct sctp_chunkhdr chunk_hdr;
-	__u8 params[];
+	/* __u8 params[]; */
 };
 
 struct sctp_strreset_outreq {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index c335dd01a597..74fae532b944 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -425,11 +425,11 @@ static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk)
  * the chunk length to indicate when to stop.  Make sure
  * there is room for a param header too.
  */
-#define sctp_walk_params(pos, chunk, member)\
-_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member)
+#define sctp_walk_params(pos, chunk)\
+_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length))
 
-#define _sctp_walk_params(pos, chunk, end, member)\
-for (pos.v = chunk->member;\
+#define _sctp_walk_params(pos, chunk, end)\
+for (pos.v = (u8 *)(chunk + 1);\
      (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\
       (void *)chunk + end) &&\
      pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 127bf28a6033..2613c4d74b16 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1150,7 +1150,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
 	init = (struct sctp_init_chunk *)skb->data;
 
 	/* Walk the parameters looking for embedded addresses. */
-	sctp_walk_params(params, init, init_hdr.params) {
+	sctp_walk_params(params, init) {
 
 		/* Note: Ignoring hostname addresses. */
 		af = sctp_get_af_specific(param_type2af(params.p->type));
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c8f4ec5d5f98..4dbbbc2a7742 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2306,7 +2306,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
 	    ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
 		return sctp_process_inv_mandatory(asoc, chunk, errp);
 
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		if (param.p->type == SCTP_PARAM_STATE_COOKIE)
 			has_cookie = true;
 	}
@@ -2329,7 +2329,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
 						  chunk, errp);
 
 	/* Verify all the variable length parameters */
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		result = sctp_verify_param(net, ep, asoc, param, cid,
 					   chunk, errp);
 		switch (result) {
@@ -2381,7 +2381,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 		src_match = 1;
 
 	/* Process the initialization parameters.  */
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		if (!src_match &&
 		    (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
 		     param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
@@ -3202,7 +3202,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
 	union sctp_params param;
 
 	addip = (struct sctp_addip_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, addip, addip_hdr.params) {
+	sctp_walk_params(param, addip) {
 		size_t length = ntohs(param.p->length);
 
 		*errp = param.p;
@@ -3215,14 +3215,14 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
 			/* ensure there is only one addr param and it's in the
 			 * beginning of addip_hdr params, or we reject it.
 			 */
-			if (param.v != addip->addip_hdr.params)
+			if (param.v != (addip + 1))
 				return false;
 			addr_param_seen = true;
 			break;
 		case SCTP_PARAM_IPV6_ADDRESS:
 			if (length != sizeof(struct sctp_ipv6addr_param))
 				return false;
-			if (param.v != addip->addip_hdr.params)
+			if (param.v != (addip + 1))
 				return false;
 			addr_param_seen = true;
 			break;
@@ -3302,7 +3302,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 		goto done;
 
 	/* Process the TLVs contained within the ASCONF chunk. */
-	sctp_walk_params(param, addip, addip_hdr.params) {
+	sctp_walk_params(param, addip) {
 		/* Skip preceeding address parameters. */
 		if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
 		    param.p->type == SCTP_PARAM_IPV6_ADDRESS)
@@ -3636,7 +3636,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc,
 		return NULL;
 
 	reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
-	retval->param_hdr.v = reconf->params;
+	retval->param_hdr.v = (u8 *)(reconf + 1);
 
 	return retval;
 }
@@ -3878,7 +3878,7 @@ bool sctp_verify_reconf(const struct sctp_association *asoc,
 	__u16 cnt = 0;
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		__u16 length = ntohs(param.p->length);
 
 		*errp = param.p;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index ce5426171206..39d416e7f795 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4142,7 +4142,7 @@ enum sctp_disposition sctp_sf_do_reconf(struct net *net,
 						  (void *)err_param, commands);
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		struct sctp_chunk *reply = NULL;
 		struct sctp_ulpevent *ev = NULL;
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ee6514af830f..c241cc552e8d 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -491,7 +491,7 @@ static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param(
 		return NULL;
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		/* sctp_strreset_tsnreq is actually the basic structure
 		 * of all stream reconf params, so it's safe to use it
 		 * to access request_seq.
-- 
cgit v1.2.3


From 73175a042955e531ec355a8708585befa67a22db Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:29 -0400
Subject: sctp: delete the nested flexible array skip

This patch deletes the flexible-array skip[] from the structure
sctp_ifwdtsn/fwdtsn_hdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/stream_interleave.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:611:32: warning: nested flexible array
  ./include/linux/sctp.h:628:33: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h         | 4 ++--
 include/net/sctp/sctp.h      | 4 ++--
 net/sctp/stream_interleave.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 0ff36a2737a3..9815b801fec0 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -603,7 +603,7 @@ struct sctp_fwdtsn_skip {
 
 struct sctp_fwdtsn_hdr {
 	__be32 new_cum_tsn;
-	struct sctp_fwdtsn_skip skip[];
+	/* struct sctp_fwdtsn_skip skip[]; */
 };
 
 struct sctp_fwdtsn_chunk {
@@ -620,7 +620,7 @@ struct sctp_ifwdtsn_skip {
 
 struct sctp_ifwdtsn_hdr {
 	__be32 new_cum_tsn;
-	struct sctp_ifwdtsn_skip skip[];
+	/* struct sctp_ifwdtsn_skip skip[]; */
 };
 
 struct sctp_ifwdtsn_chunk {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 74fae532b944..2a67100b2a17 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -452,8 +452,8 @@ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \
 _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk))
 
 #define _sctp_walk_fwdtsn(pos, chunk, end)\
-for (pos = chunk->subh.fwdtsn_hdr->skip;\
-     (void *)pos <= (void *)chunk->subh.fwdtsn_hdr->skip + end - sizeof(struct sctp_fwdtsn_skip);\
+for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\
+     (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\
      pos++)
 
 /* External references. */
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index b046b11200c9..840f24045ae2 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1153,8 +1153,8 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
 }
 
 #define _sctp_walk_ifwdtsn(pos, chunk, end) \
-	for (pos = chunk->subh.ifwdtsn_hdr->skip; \
-	     (void *)pos <= (void *)chunk->subh.ifwdtsn_hdr->skip + (end) - \
+	for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \
+	     (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \
 			    sizeof(struct sctp_ifwdtsn_skip); pos++)
 
 #define sctp_walk_ifwdtsn(pos, ch) \
-- 
cgit v1.2.3


From 9789c1c6619e0a5eccfc31abe49b1ce5ca3cd11f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:30 -0400
Subject: sctp: delete the nested flexible array variable

This patch deletes the flexible-array variable[] from the structure
sctp_sackhdr and sctp_errhdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/sm_statefuns.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:451:28: warning: nested flexible array
  ./include/linux/sctp.h:393:29: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  4 ++--
 net/sctp/outqueue.c      | 11 +++++++----
 net/sctp/sm_sideeffect.c |  3 +--
 net/sctp/sm_statefuns.c  |  2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 9815b801fec0..01a0eb7e9fa1 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -385,7 +385,7 @@ struct sctp_sackhdr {
 	__be32 a_rwnd;
 	__be16 num_gap_ack_blocks;
 	__be16 num_dup_tsns;
-	union sctp_sack_variable variable[];
+	/* union sctp_sack_variable variable[]; */
 };
 
 struct sctp_sack_chunk {
@@ -443,7 +443,7 @@ struct sctp_shutdown_chunk {
 struct sctp_errhdr {
 	__be16 cause;
 	__be16 length;
-	__u8  variable[];
+	/* __u8  variable[]; */
 };
 
 struct sctp_operr_chunk {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 20831079fb09..0dc6b8ab9963 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1231,7 +1231,7 @@ static void sctp_sack_update_unack_data(struct sctp_association *assoc,
 
 	unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1;
 
-	frags = sack->variable;
+	frags = (union sctp_sack_variable *)(sack + 1);
 	for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) {
 		unack_data -= ((ntohs(frags[i].gab.end) -
 				ntohs(frags[i].gab.start) + 1));
@@ -1252,7 +1252,6 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 	struct sctp_transport *transport;
 	struct sctp_chunk *tchunk = NULL;
 	struct list_head *lchunk, *transport_list, *temp;
-	union sctp_sack_variable *frags = sack->variable;
 	__u32 sack_ctsn, ctsn, tsn;
 	__u32 highest_tsn, highest_new_tsn;
 	__u32 sack_a_rwnd;
@@ -1313,8 +1312,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 
 	/* Get the highest TSN in the sack. */
 	highest_tsn = sack_ctsn;
-	if (gap_ack_blocks)
+	if (gap_ack_blocks) {
+		union sctp_sack_variable *frags =
+			(union sctp_sack_variable *)(sack + 1);
+
 		highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
+	}
 
 	if (TSN_lt(asoc->highest_sacked, highest_tsn))
 		asoc->highest_sacked = highest_tsn;
@@ -1789,7 +1792,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
 	 *  Block are assumed to have been received correctly.
 	 */
 
-	frags = sack->variable;
+	frags = (union sctp_sack_variable *)(sack + 1);
 	blocks = ntohs(sack->num_gap_ack_blocks);
 	tsn_offset = tsn - ctsn;
 	for (i = 0; i < blocks; ++i) {
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 463c4a58d2c3..7fbeb99d8d32 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -984,8 +984,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 		{
 			struct sctp_chunkhdr *unk_chunk_hdr;
 
-			unk_chunk_hdr = (struct sctp_chunkhdr *)
-							err_hdr->variable;
+			unk_chunk_hdr = (struct sctp_chunkhdr *)(err_hdr + 1);
 			switch (unk_chunk_hdr->type) {
 			/* ADDIP 4.1 A9) If the peer responds to an ASCONF with
 			 * an ERROR chunk reporting that it did not recognized
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 39d416e7f795..8d0cfd689b20 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1337,7 +1337,7 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa,
 	 * throughout the code today.
 	 */
 	errhdr = (struct sctp_errhdr *)buffer;
-	addrparm = (union sctp_addr_param *)errhdr->variable;
+	addrparm = (union sctp_addr_param *)(errhdr + 1);
 
 	/* Copy into a parm format. */
 	len = af->to_addr_param(ssa, addrparm);
-- 
cgit v1.2.3


From f97278ff346a5f11c68d0ac499999d5ad06a6db2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:31 -0400
Subject: sctp: delete the nested flexible array peer_init

This patch deletes the flexible-array peer_init[] from the structure
sctp_cookie to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/sm_make_chunk.c: note: in included file (through include/net/sctp/sctp.h):
  ./include/net/sctp/structs.h:1588:28: warning: nested flexible array
  ./include/net/sctp/structs.h:343:28: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 2 +-
 net/sctp/associola.c       | 5 +++--
 net/sctp/sm_make_chunk.c   | 4 ++--
 net/sctp/sm_statefuns.c    | 8 +++-----
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 070c9458fff4..5c72d1864dd6 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -332,7 +332,7 @@ struct sctp_cookie {
 	 * the association TCB is re-constructed from the cookie.
 	 */
 	__u32 raw_addr_list_len;
-	struct sctp_init_chunk peer_init[];
+	/* struct sctp_init_chunk peer_init[]; */
 };
 
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 63ba5551c13f..796529167e8d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1597,9 +1597,10 @@ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
 					 struct sctp_cookie *cookie,
 					 gfp_t gfp)
 {
-	int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
+	struct sctp_init_chunk *peer_init = (struct sctp_init_chunk *)(cookie + 1);
+	int var_size2 = ntohs(peer_init->chunk_hdr.length);
 	int var_size3 = cookie->raw_addr_list_len;
-	__u8 *raw = (__u8 *)cookie->peer_init + var_size2;
+	__u8 *raw = (__u8 *)peer_init + var_size2;
 
 	return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3,
 				      asoc->ep->base.bind_addr.port, gfp);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4dbbbc2a7742..08527d882e56 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1707,11 +1707,11 @@ static struct sctp_cookie_param *sctp_pack_cookie(
 					 ktime_get_real());
 
 	/* Copy the peer's init packet.  */
-	memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
+	memcpy(cookie + 1, init_chunk->chunk_hdr,
 	       ntohs(init_chunk->chunk_hdr->length));
 
 	/* Copy the raw local address list of the association. */
-	memcpy((__u8 *)&cookie->c.peer_init[0] +
+	memcpy((__u8 *)(cookie + 1) +
 	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
 
 	if (sctp_sk(ep->base.sk)->hmac) {
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8d0cfd689b20..7b8eb735fa88 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -794,8 +794,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 	/* This is a brand-new association, so these are not yet side
 	 * effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk,
 			       &chunk->subh.cookie_hdr->c.peer_addr,
 			       peer_init, GFP_ATOMIC))
@@ -1869,8 +1868,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
 	/* new_asoc is a brand-new association, so these are not yet
 	 * side effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
 			       GFP_ATOMIC))
 		goto nomem;
@@ -1990,7 +1988,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
 	/* new_asoc is a brand-new association, so these are not yet
 	 * side effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
 			       GFP_ATOMIC))
 		goto nomem;
-- 
cgit v1.2.3


From 2ab399a931dddacdf7202cd4b49a5187154623d1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:32 -0400
Subject: sctp: delete the nested flexible array hmac

This patch deletes the flexible-array hmac[] from the structure
sctp_authhdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/auth.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:735:29: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h    | 2 +-
 net/sctp/auth.c         | 2 +-
 net/sctp/sm_statefuns.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 01a0eb7e9fa1..d182e8c41985 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -727,7 +727,7 @@ struct sctp_addip_chunk {
 struct sctp_authhdr {
 	__be16 shkey_id;
 	__be16 hmac_id;
-	__u8   hmac[];
+	/* __u8   hmac[]; */
 };
 
 struct sctp_auth_chunk {
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 34964145514e..c58fffc86a0c 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -738,7 +738,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 
 	tfm = asoc->ep->auth_hmacs[hmac_id];
 
-	digest = auth->auth_hdr.hmac;
+	digest = (u8 *)(&auth->auth_hdr + 1);
 	if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
 		goto free;
 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 7b8eb735fa88..97f1155a2045 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4391,7 +4391,7 @@ static enum sctp_ierror sctp_sf_authenticate(
 	 *  3. Compute the new digest
 	 *  4. Compare saved and new digests.
 	 */
-	digest = auth_hdr->hmac;
+	digest = (u8 *)(auth_hdr + 1);
 	skb_pull(chunk->skb, sig_len);
 
 	save_digest = kmemdup(digest, sig_len, GFP_ATOMIC);
-- 
cgit v1.2.3


From dbda0fba7a14f14835c34d59fd329cb90a887862 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:33 -0400
Subject: sctp: delete the nested flexible array payload

This patch deletes the flexible-array payload[] from the structure
sctp_datahdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/socket.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:230:29: warning: nested flexible array

This member is not even used anywhere.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index d182e8c41985..836a7e200f39 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -222,7 +222,7 @@ struct sctp_datahdr {
 	__be16 stream;
 	__be16 ssn;
 	__u32 ppid;
-	__u8  payload[];
+	/* __u8  payload[]; */
 };
 
 struct sctp_data_chunk {
-- 
cgit v1.2.3


From a714e3ec230892039b5d5ae6902b58bb084a15c1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:54 +0300
Subject: bridge: Add internal flags for per-{Port, VLAN} neighbor suppression

Add two internal flags that will be used to enable / disable per-{Port,
VLAN} neighbor suppression:

1. 'BR_NEIGH_VLAN_SUPPRESS': A per-port flag used to indicate that
per-{Port, VLAN} neighbor suppression is enabled on the bridge port.
When set, 'BR_NEIGH_SUPPRESS' has no effect.

2. 'BR_VLFLAG_NEIGH_SUPPRESS_ENABLED': A per-VLAN flag used to indicate
that neighbor suppression is enabled on the given VLAN.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 1 +
 net/bridge/br_private.h   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 1668ac4d7adc..3ff96ae31bf6 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -60,6 +60,7 @@ struct br_ip_list {
 #define BR_TX_FWD_OFFLOAD	BIT(20)
 #define BR_PORT_LOCKED		BIT(21)
 #define BR_PORT_MAB		BIT(22)
+#define BR_NEIGH_VLAN_SUPPRESS	BIT(23)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1ff4d64ab584..b17fc821ecc8 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -178,6 +178,7 @@ enum {
 	BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
 	BR_VLFLAG_MCAST_ENABLED = BIT(2),
 	BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
+	BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
 };
 
 /**
-- 
cgit v1.2.3


From 83f6d600796c65ab34b08dbddb5795099dfda4d1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:58 +0300
Subject: bridge: vlan: Allow setting VLAN neighbor suppression state

Add a new VLAN attribute that allows user space to set the neighbor
suppression state of the port VLAN. Example:

 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 false
 # bridge vlan set vid 10 dev swp1 neigh_suppress on
 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 true
 # bridge vlan set vid 10 dev swp1 neigh_suppress off
 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 false

 # bridge vlan set vid 10 dev br0 neigh_suppress on
 Error: bridge: Can't set neigh_suppress for non-port vlans.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_vlan.c           |  1 +
 net/bridge/br_vlan_options.c   | 20 +++++++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index c9d624f528c5..f95326fce6bb 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -525,6 +525,7 @@ enum {
 	BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
 	BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS,
 	BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS,
+	BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
 	__BRIDGE_VLANDB_ENTRY_MAX,
 };
 #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8a3dbc09ba38..15f44d026e75 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -2134,6 +2134,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
 	[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS]	= { .type = NLA_REJECT },
 	[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]	= { .type = NLA_U32 },
+	[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]	= NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index e378c2f3a9e2..8fa89b04ee94 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -52,7 +52,9 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
 		       const struct net_bridge_port *p)
 {
 	if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
-	    !__vlan_tun_put(skb, v))
+	    !__vlan_tun_put(skb, v) ||
+	    nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
+		       !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED)))
 		return false;
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -80,6 +82,7 @@ size_t br_vlan_opts_nl_size(void)
 	       + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
 	       + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
 #endif
+	       + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */
 	       + 0;
 }
 
@@ -239,6 +242,21 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
 	}
 #endif
 
+	if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) {
+		bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+		bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]);
+
+		if (!p) {
+			NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans");
+			return -EINVAL;
+		}
+
+		if (val != enabled) {
+			v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+			*changed = true;
+		}
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 160656d7201d861a1f2a0bf279a765e8cda2317a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:59 +0300
Subject: bridge: Allow setting per-{Port, VLAN} neighbor suppression state

Add a new bridge port attribute that allows user space to enable
per-{Port, VLAN} neighbor suppression. Example:

 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 false
 # bridge link set dev swp1 neigh_vlan_suppress on
 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 true
 # bridge link set dev swp1 neigh_vlan_suppress off
 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 false

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_netlink.c      | 8 +++++++-
 net/core/rtnetlink.c         | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d679688efe0..4ac1000b0ef2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -569,6 +569,7 @@ enum {
 	IFLA_BRPORT_MAB,
 	IFLA_BRPORT_MCAST_N_GROUPS,
 	IFLA_BRPORT_MCAST_MAX_GROUPS,
+	IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fefb1c0e248b..05c5863d2e20 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -189,6 +189,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LOCKED */
 		+ nla_total_size(1)	/* IFLA_BRPORT_MAB */
+		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -278,7 +279,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)))
+	    nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+		       !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -891,6 +894,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
 	[IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
 	[IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+	[IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -957,6 +961,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
 	br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
 	br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
 	br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+			 BR_NEIGH_VLAN_SUPPRESS);
 
 	if ((p->flags & BR_PORT_MAB) &&
 	    (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e844d75220fb..653901a1bf75 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -61,7 +61,7 @@
 #include "dev.h"
 
 #define RTNL_MAX_TYPE		50
-#define RTNL_SLAVE_MAX_TYPE	42
+#define RTNL_SLAVE_MAX_TYPE	43
 
 struct rtnl_link {
 	rtnl_doit_func		doit;
-- 
cgit v1.2.3


From 55435ea7729accb5b8a330de751836c4be524834 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:14 -0700
Subject: pds_core: initial framework for pds_core PF driver

This is the initial PCI driver framework for the new pds_core device
driver and its family of devices.  This does the very basics of
registering for the new PF PCI device 1dd8:100c, setting up debugfs
entries, and registering with devlink.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../device_drivers/ethernet/amd/pds_core.rst       |  35 ++
 .../networking/device_drivers/ethernet/index.rst   |   1 +
 drivers/net/ethernet/amd/pds_core/Makefile         |   8 +
 drivers/net/ethernet/amd/pds_core/core.h           |  56 ++
 drivers/net/ethernet/amd/pds_core/debugfs.c        |  31 ++
 drivers/net/ethernet/amd/pds_core/main.c           | 277 ++++++++++
 include/linux/pds/pds_common.h                     |  14 +
 include/linux/pds/pds_core_if.h                    | 571 +++++++++++++++++++++
 8 files changed, 993 insertions(+)
 create mode 100644 Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
 create mode 100644 drivers/net/ethernet/amd/pds_core/Makefile
 create mode 100644 drivers/net/ethernet/amd/pds_core/core.h
 create mode 100644 drivers/net/ethernet/amd/pds_core/debugfs.c
 create mode 100644 drivers/net/ethernet/amd/pds_core/main.c
 create mode 100644 include/linux/pds/pds_common.h
 create mode 100644 include/linux/pds/pds_core_if.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
new file mode 100644
index 000000000000..99a70026f1bc
--- /dev/null
+++ b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
@@ -0,0 +1,35 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+========================================================
+Linux Driver for the AMD/Pensando(R) DSC adapter family
+========================================================
+
+Copyright(c) 2023 Advanced Micro Devices, Inc
+
+Identifying the Adapter
+=======================
+
+To find if one or more AMD/Pensando PCI Core devices are installed on the
+host, check for the PCI devices::
+
+  # lspci -d 1dd8:100c
+  b5:00.0 Processing accelerators: Pensando Systems Device 100c
+  b6:00.0 Processing accelerators: Pensando Systems Device 100c
+
+If such devices are listed as above, then the pds_core.ko driver should find
+and configure them for use.  There should be log entries in the kernel
+messages such as these::
+
+  $ dmesg | grep pds_core
+  pds_core 0000:b5:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
+  pds_core 0000:b5:00.0: FW: 1.60.0-73
+  pds_core 0000:b6:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
+  pds_core 0000:b6:00.0: FW: 1.60.0-73
+
+Support
+=======
+
+For general Linux networking support, please use the netdev mailing
+list, which is monitored by AMD/Pensando personnel::
+
+  netdev@vger.kernel.org
diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst
index 6e9e7012d000..417ca514a4d0 100644
--- a/Documentation/networking/device_drivers/ethernet/index.rst
+++ b/Documentation/networking/device_drivers/ethernet/index.rst
@@ -14,6 +14,7 @@ Contents:
    3com/vortex
    amazon/ena
    altera/altera_tse
+   amd/pds_core
    aquantia/atlantic
    chelsio/cxgb
    cirrus/cs89x0
diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
new file mode 100644
index 000000000000..de3bf1d1886c
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_CORE) := pds_core.o
+
+pds_core-y := main.o
+
+pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
new file mode 100644
index 000000000000..34ef837e8cfe
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDSC_H_
+#define _PDSC_H_
+
+#include <linux/debugfs.h>
+#include <net/devlink.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+
+#define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
+
+struct pdsc_dev_bar {
+	void __iomem *vaddr;
+	phys_addr_t bus_addr;
+	unsigned long len;
+	int res_index;
+};
+
+/* No state flags set means we are in a steady running state */
+enum pdsc_state_flags {
+	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
+	PDSC_S_INITING_DRIVER,	    /* initial startup from probe */
+	PDSC_S_STOPPING_DRIVER,	    /* driver remove */
+
+	/* leave this as last */
+	PDSC_S_STATE_SIZE
+};
+
+struct pdsc {
+	struct pci_dev *pdev;
+	struct dentry *dentry;
+	struct device *dev;
+	struct pdsc_dev_bar bars[PDS_CORE_BARS_MAX];
+	int hw_index;
+	int uid;
+
+	unsigned long state;
+
+	struct pds_core_dev_info_regs __iomem *info_regs;
+	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
+	struct pds_core_intr __iomem *intr_ctrl;
+	u64 __iomem *intr_status;
+	u64 __iomem *db_pages;
+	dma_addr_t phy_db_pages;
+	u64 __iomem *kern_dbpage;
+};
+
+void pdsc_debugfs_create(void);
+void pdsc_debugfs_destroy(void);
+void pdsc_debugfs_add_dev(struct pdsc *pdsc);
+void pdsc_debugfs_del_dev(struct pdsc *pdsc);
+
+#endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
new file mode 100644
index 000000000000..b2f7cb795c20
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+
+#include "core.h"
+
+static struct dentry *pdsc_dir;
+
+void pdsc_debugfs_create(void)
+{
+	pdsc_dir = debugfs_create_dir(PDS_CORE_DRV_NAME, NULL);
+}
+
+void pdsc_debugfs_destroy(void)
+{
+	debugfs_remove_recursive(pdsc_dir);
+}
+
+void pdsc_debugfs_add_dev(struct pdsc *pdsc)
+{
+	pdsc->dentry = debugfs_create_dir(pci_name(pdsc->pdev), pdsc_dir);
+
+	debugfs_create_ulong("state", 0400, pdsc->dentry, &pdsc->state);
+}
+
+void pdsc_debugfs_del_dev(struct pdsc *pdsc)
+{
+	debugfs_remove_recursive(pdsc->dentry);
+	pdsc->dentry = NULL;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
new file mode 100644
index 000000000000..c2b12f226959
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/pci.h>
+
+#include <linux/pds/pds_common.h>
+
+#include "core.h"
+
+MODULE_DESCRIPTION(PDSC_DRV_DESCRIPTION);
+MODULE_AUTHOR("Advanced Micro Devices, Inc");
+MODULE_LICENSE("GPL");
+
+/* Supported devices */
+static const struct pci_device_id pdsc_id_table[] = {
+	{ PCI_VDEVICE(PENSANDO, PCI_DEVICE_ID_PENSANDO_CORE_PF) },
+	{ 0, }	/* end of table */
+};
+MODULE_DEVICE_TABLE(pci, pdsc_id_table);
+
+static void pdsc_unmap_bars(struct pdsc *pdsc)
+{
+	struct pdsc_dev_bar *bars = pdsc->bars;
+	unsigned int i;
+
+	for (i = 0; i < PDS_CORE_BARS_MAX; i++) {
+		if (bars[i].vaddr)
+			pci_iounmap(pdsc->pdev, bars[i].vaddr);
+	}
+}
+
+static int pdsc_map_bars(struct pdsc *pdsc)
+{
+	struct pdsc_dev_bar *bar = pdsc->bars;
+	struct pci_dev *pdev = pdsc->pdev;
+	struct device *dev = pdsc->dev;
+	struct pdsc_dev_bar *bars;
+	unsigned int i, j;
+	int num_bars = 0;
+	int err;
+	u32 sig;
+
+	bars = pdsc->bars;
+
+	/* Since the PCI interface in the hardware is configurable,
+	 * we need to poke into all the bars to find the set we're
+	 * expecting.
+	 */
+	for (i = 0, j = 0; i < PDS_CORE_BARS_MAX; i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+
+		bars[j].len = pci_resource_len(pdev, i);
+		bars[j].bus_addr = pci_resource_start(pdev, i);
+		bars[j].res_index = i;
+
+		/* only map the whole bar 0 */
+		if (j > 0) {
+			bars[j].vaddr = NULL;
+		} else {
+			bars[j].vaddr = pci_iomap(pdev, i, bars[j].len);
+			if (!bars[j].vaddr) {
+				dev_err(dev, "Cannot map BAR %d, aborting\n", i);
+				return -ENODEV;
+			}
+		}
+
+		j++;
+	}
+	num_bars = j;
+
+	/* BAR0: dev_cmd and interrupts */
+	if (num_bars < 1) {
+		dev_err(dev, "No bars found\n");
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	if (bar->len < PDS_CORE_BAR0_SIZE) {
+		dev_err(dev, "Resource bar size %lu too small\n", bar->len);
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	pdsc->info_regs = bar->vaddr + PDS_CORE_BAR0_DEV_INFO_REGS_OFFSET;
+	pdsc->cmd_regs = bar->vaddr + PDS_CORE_BAR0_DEV_CMD_REGS_OFFSET;
+	pdsc->intr_status = bar->vaddr + PDS_CORE_BAR0_INTR_STATUS_OFFSET;
+	pdsc->intr_ctrl = bar->vaddr + PDS_CORE_BAR0_INTR_CTRL_OFFSET;
+
+	sig = ioread32(&pdsc->info_regs->signature);
+	if (sig != PDS_CORE_DEV_INFO_SIGNATURE) {
+		dev_err(dev, "Incompatible firmware signature %x", sig);
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	/* BAR1: doorbells */
+	bar++;
+	if (num_bars < 2) {
+		dev_err(dev, "Doorbell bar missing\n");
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	pdsc->db_pages = bar->vaddr;
+	pdsc->phy_db_pages = bar->bus_addr;
+
+	return 0;
+
+err_out:
+	pdsc_unmap_bars(pdsc);
+	return err;
+}
+
+static int pdsc_init_vf(struct pdsc *vf)
+{
+	return -1;
+}
+
+static int pdsc_init_pf(struct pdsc *pdsc)
+{
+	struct devlink *dl;
+	int err;
+
+	pcie_print_link_status(pdsc->pdev);
+
+	err = pci_request_regions(pdsc->pdev, PDS_CORE_DRV_NAME);
+	if (err) {
+		dev_err(pdsc->dev, "Cannot request PCI regions: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	err = pdsc_map_bars(pdsc);
+	if (err)
+		goto err_out_release_regions;
+
+	dl = priv_to_devlink(pdsc);
+	devl_lock(dl);
+	devl_register(dl);
+	devl_unlock(dl);
+
+	return 0;
+
+err_out_release_regions:
+	pci_release_regions(pdsc->pdev);
+
+	return err;
+}
+
+static const struct devlink_ops pdsc_dl_ops = {
+};
+
+static const struct devlink_ops pdsc_dl_vf_ops = {
+};
+
+static DEFINE_IDA(pdsc_ida);
+
+static int pdsc_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct device *dev = &pdev->dev;
+	const struct devlink_ops *ops;
+	struct devlink *dl;
+	struct pdsc *pdsc;
+	bool is_pf;
+	int err;
+
+	is_pf = !pdev->is_virtfn;
+	ops = is_pf ? &pdsc_dl_ops : &pdsc_dl_vf_ops;
+	dl = devlink_alloc(ops, sizeof(struct pdsc), dev);
+	if (!dl)
+		return -ENOMEM;
+	pdsc = devlink_priv(dl);
+
+	pdsc->pdev = pdev;
+	pdsc->dev = &pdev->dev;
+	set_bit(PDSC_S_INITING_DRIVER, &pdsc->state);
+	pci_set_drvdata(pdev, pdsc);
+	pdsc_debugfs_add_dev(pdsc);
+
+	err = ida_alloc(&pdsc_ida, GFP_KERNEL);
+	if (err < 0) {
+		dev_err(pdsc->dev, "%s: id alloc failed: %pe\n",
+			__func__, ERR_PTR(err));
+		goto err_out_free_devlink;
+	}
+	pdsc->uid = err;
+
+	/* Query system for DMA addressing limitation for the device. */
+	err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(PDS_CORE_ADDR_LEN));
+	if (err) {
+		dev_err(dev, "Unable to obtain 64-bit DMA for consistent allocations, aborting: %pe\n",
+			ERR_PTR(err));
+		goto err_out_free_ida;
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev, "Cannot enable PCI device: %pe\n", ERR_PTR(err));
+		goto err_out_free_ida;
+	}
+	pci_set_master(pdev);
+
+	if (is_pf)
+		err = pdsc_init_pf(pdsc);
+	else
+		err = pdsc_init_vf(pdsc);
+	if (err) {
+		dev_err(dev, "Cannot init device: %pe\n", ERR_PTR(err));
+		goto err_out_clear_master;
+	}
+
+	clear_bit(PDSC_S_INITING_DRIVER, &pdsc->state);
+	return 0;
+
+err_out_clear_master:
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+err_out_free_ida:
+	ida_free(&pdsc_ida, pdsc->uid);
+err_out_free_devlink:
+	pdsc_debugfs_del_dev(pdsc);
+	devlink_free(dl);
+
+	return err;
+}
+
+static void pdsc_remove(struct pci_dev *pdev)
+{
+	struct pdsc *pdsc = pci_get_drvdata(pdev);
+	struct devlink *dl;
+
+	/* Unhook the registrations first to be sure there
+	 * are no requests while we're stopping.
+	 */
+	dl = priv_to_devlink(pdsc);
+	devl_lock(dl);
+	devl_unregister(dl);
+	devl_unlock(dl);
+
+	pdsc_unmap_bars(pdsc);
+	pci_release_regions(pdev);
+
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+
+	ida_free(&pdsc_ida, pdsc->uid);
+	pdsc_debugfs_del_dev(pdsc);
+	devlink_free(dl);
+}
+
+static struct pci_driver pdsc_driver = {
+	.name = PDS_CORE_DRV_NAME,
+	.id_table = pdsc_id_table,
+	.probe = pdsc_probe,
+	.remove = pdsc_remove,
+};
+
+static int __init pdsc_init_module(void)
+{
+	if (strcmp(KBUILD_MODNAME, PDS_CORE_DRV_NAME))
+		return -EINVAL;
+
+	pdsc_debugfs_create();
+	return pci_register_driver(&pdsc_driver);
+}
+
+static void __exit pdsc_cleanup_module(void)
+{
+	pci_unregister_driver(&pdsc_driver);
+	pdsc_debugfs_destroy();
+}
+
+module_init(pdsc_init_module);
+module_exit(pdsc_cleanup_module);
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
new file mode 100644
index 000000000000..bd041a5170a6
--- /dev/null
+++ b/include/linux/pds/pds_common.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_COMMON_H_
+#define _PDS_COMMON_H_
+
+#define PDS_CORE_DRV_NAME			"pds_core"
+
+/* the device's internal addressing uses up to 52 bits */
+#define PDS_CORE_ADDR_LEN	52
+#define PDS_CORE_ADDR_MASK	(BIT_ULL(PDS_ADDR_LEN) - 1)
+#define PDS_PAGE_SIZE		4096
+
+#endif /* _PDS_COMMON_H_ */
diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h
new file mode 100644
index 000000000000..e838a2b90440
--- /dev/null
+++ b/include/linux/pds/pds_core_if.h
@@ -0,0 +1,571 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_CORE_IF_H_
+#define _PDS_CORE_IF_H_
+
+#define PCI_VENDOR_ID_PENSANDO			0x1dd8
+#define PCI_DEVICE_ID_PENSANDO_CORE_PF		0x100c
+#define PCI_DEVICE_ID_VIRTIO_NET_TRANS		0x1000
+#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF	0x1003
+#define PCI_DEVICE_ID_PENSANDO_VDPA_VF		0x100b
+#define PDS_CORE_BARS_MAX			4
+#define PDS_CORE_PCI_BAR_DBELL			1
+
+/* Bar0 */
+#define PDS_CORE_DEV_INFO_SIGNATURE		0x44455649 /* 'DEVI' */
+#define PDS_CORE_BAR0_SIZE			0x8000
+#define PDS_CORE_BAR0_DEV_INFO_REGS_OFFSET	0x0000
+#define PDS_CORE_BAR0_DEV_CMD_REGS_OFFSET	0x0800
+#define PDS_CORE_BAR0_DEV_CMD_DATA_REGS_OFFSET	0x0c00
+#define PDS_CORE_BAR0_INTR_STATUS_OFFSET	0x1000
+#define PDS_CORE_BAR0_INTR_CTRL_OFFSET		0x2000
+#define PDS_CORE_DEV_CMD_DONE			0x00000001
+
+#define PDS_CORE_DEVCMD_TIMEOUT			5
+
+#define PDS_CORE_CLIENT_ID			0
+#define PDS_CORE_ASIC_TYPE_CAPRI		0
+
+/*
+ * enum pds_core_cmd_opcode - Device commands
+ */
+enum pds_core_cmd_opcode {
+	/* Core init */
+	PDS_CORE_CMD_NOP		= 0,
+	PDS_CORE_CMD_IDENTIFY		= 1,
+	PDS_CORE_CMD_RESET		= 2,
+	PDS_CORE_CMD_INIT		= 3,
+
+	PDS_CORE_CMD_FW_DOWNLOAD	= 4,
+	PDS_CORE_CMD_FW_CONTROL		= 5,
+
+	/* SR/IOV commands */
+	PDS_CORE_CMD_VF_GETATTR		= 60,
+	PDS_CORE_CMD_VF_SETATTR		= 61,
+	PDS_CORE_CMD_VF_CTRL		= 62,
+
+	/* Add commands before this line */
+	PDS_CORE_CMD_MAX,
+	PDS_CORE_CMD_COUNT
+};
+
+/*
+ * enum pds_core_status_code - Device command return codes
+ */
+enum pds_core_status_code {
+	PDS_RC_SUCCESS	= 0,	/* Success */
+	PDS_RC_EVERSION	= 1,	/* Incorrect version for request */
+	PDS_RC_EOPCODE	= 2,	/* Invalid cmd opcode */
+	PDS_RC_EIO	= 3,	/* I/O error */
+	PDS_RC_EPERM	= 4,	/* Permission denied */
+	PDS_RC_EQID	= 5,	/* Bad qid */
+	PDS_RC_EQTYPE	= 6,	/* Bad qtype */
+	PDS_RC_ENOENT	= 7,	/* No such element */
+	PDS_RC_EINTR	= 8,	/* operation interrupted */
+	PDS_RC_EAGAIN	= 9,	/* Try again */
+	PDS_RC_ENOMEM	= 10,	/* Out of memory */
+	PDS_RC_EFAULT	= 11,	/* Bad address */
+	PDS_RC_EBUSY	= 12,	/* Device or resource busy */
+	PDS_RC_EEXIST	= 13,	/* object already exists */
+	PDS_RC_EINVAL	= 14,	/* Invalid argument */
+	PDS_RC_ENOSPC	= 15,	/* No space left or alloc failure */
+	PDS_RC_ERANGE	= 16,	/* Parameter out of range */
+	PDS_RC_BAD_ADDR	= 17,	/* Descriptor contains a bad ptr */
+	PDS_RC_DEV_CMD	= 18,	/* Device cmd attempted on AdminQ */
+	PDS_RC_ENOSUPP	= 19,	/* Operation not supported */
+	PDS_RC_ERROR	= 29,	/* Generic error */
+	PDS_RC_ERDMA	= 30,	/* Generic RDMA error */
+	PDS_RC_EVFID	= 31,	/* VF ID does not exist */
+	PDS_RC_BAD_FW	= 32,	/* FW file is invalid or corrupted */
+	PDS_RC_ECLIENT	= 33,   /* No such client id */
+};
+
+/**
+ * struct pds_core_drv_identity - Driver identity information
+ * @drv_type:         Driver type (enum pds_core_driver_type)
+ * @os_dist:          OS distribution, numeric format
+ * @os_dist_str:      OS distribution, string format
+ * @kernel_ver:       Kernel version, numeric format
+ * @kernel_ver_str:   Kernel version, string format
+ * @driver_ver_str:   Driver version, string format
+ */
+struct pds_core_drv_identity {
+	__le32 drv_type;
+	__le32 os_dist;
+	char   os_dist_str[128];
+	__le32 kernel_ver;
+	char   kernel_ver_str[32];
+	char   driver_ver_str[32];
+};
+
+#define PDS_DEV_TYPE_MAX	16
+/**
+ * struct pds_core_dev_identity - Device identity information
+ * @version:	      Version of device identify
+ * @type:	      Identify type (0 for now)
+ * @state:	      Device state
+ * @rsvd:	      Word boundary padding
+ * @nlifs:	      Number of LIFs provisioned
+ * @nintrs:	      Number of interrupts provisioned
+ * @ndbpgs_per_lif:   Number of doorbell pages per LIF
+ * @intr_coal_mult:   Interrupt coalescing multiplication factor
+ *		      Scale user-supplied interrupt coalescing
+ *		      value in usecs to device units using:
+ *		      device units = usecs * mult / div
+ * @intr_coal_div:    Interrupt coalescing division factor
+ *		      Scale user-supplied interrupt coalescing
+ *		      value in usecs to device units using:
+ *		      device units = usecs * mult / div
+ * @vif_types:        How many of each VIF device type is supported
+ */
+struct pds_core_dev_identity {
+	u8     version;
+	u8     type;
+	u8     state;
+	u8     rsvd;
+	__le32 nlifs;
+	__le32 nintrs;
+	__le32 ndbpgs_per_lif;
+	__le32 intr_coal_mult;
+	__le32 intr_coal_div;
+	__le16 vif_types[PDS_DEV_TYPE_MAX];
+};
+
+#define PDS_CORE_IDENTITY_VERSION_1	1
+
+/**
+ * struct pds_core_dev_identify_cmd - Driver/device identify command
+ * @opcode:	Opcode PDS_CORE_CMD_IDENTIFY
+ * @ver:	Highest version of identify supported by driver
+ *
+ * Expects to find driver identification info (struct pds_core_drv_identity)
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while preparing the driver info.
+ */
+struct pds_core_dev_identify_cmd {
+	u8 opcode;
+	u8 ver;
+};
+
+/**
+ * struct pds_core_dev_identify_comp - Device identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @ver:	Version of identify returned by device
+ *
+ * Device identification info (struct pds_core_dev_identity) can be found
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while reading the results.
+ */
+struct pds_core_dev_identify_comp {
+	u8 status;
+	u8 ver;
+};
+
+/**
+ * struct pds_core_dev_reset_cmd - Device reset command
+ * @opcode:	Opcode PDS_CORE_CMD_RESET
+ *
+ * Resets and clears all LIFs, VDevs, and VIFs on the device.
+ */
+struct pds_core_dev_reset_cmd {
+	u8 opcode;
+};
+
+/**
+ * struct pds_core_dev_reset_comp - Reset command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_dev_reset_comp {
+	u8 status;
+};
+
+/*
+ * struct pds_core_dev_init_data - Pointers and info needed for the Core
+ * initialization PDS_CORE_CMD_INIT command.  The in and out structs are
+ * overlays on the pds_core_dev_cmd_regs.data space for passing data down
+ * to the firmware on init, and then returning initialization results.
+ */
+struct pds_core_dev_init_data_in {
+	__le64 adminq_q_base;
+	__le64 adminq_cq_base;
+	__le64 notifyq_cq_base;
+	__le32 flags;
+	__le16 intr_index;
+	u8     adminq_ring_size;
+	u8     notifyq_ring_size;
+};
+
+struct pds_core_dev_init_data_out {
+	__le32 core_hw_index;
+	__le32 adminq_hw_index;
+	__le32 notifyq_hw_index;
+	u8     adminq_hw_type;
+	u8     notifyq_hw_type;
+};
+
+/**
+ * struct pds_core_dev_init_cmd - Core device initialize
+ * @opcode:          opcode PDS_CORE_CMD_INIT
+ *
+ * Initializes the core device and sets up the AdminQ and NotifyQ.
+ * Expects to find initialization data (struct pds_core_dev_init_data_in)
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while preparing the driver info.
+ */
+struct pds_core_dev_init_cmd {
+	u8     opcode;
+};
+
+/**
+ * struct pds_core_dev_init_comp - Core init completion
+ * @status:     Status of the command (enum pds_core_status_code)
+ *
+ * Initialization result data (struct pds_core_dev_init_data_in)
+ * is found in cmd_regs->data.
+ */
+struct pds_core_dev_init_comp {
+	u8     status;
+};
+
+/**
+ * struct pds_core_fw_download_cmd - Firmware download command
+ * @opcode:     opcode
+ * @rsvd:	Word boundary padding
+ * @addr:       DMA address of the firmware buffer
+ * @offset:     offset of the firmware buffer within the full image
+ * @length:     number of valid bytes in the firmware buffer
+ */
+struct pds_core_fw_download_cmd {
+	u8     opcode;
+	u8     rsvd[3];
+	__le32 offset;
+	__le64 addr;
+	__le32 length;
+};
+
+/**
+ * struct pds_core_fw_download_comp - Firmware download completion
+ * @status:     Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_fw_download_comp {
+	u8     status;
+};
+
+/**
+ * enum pds_core_fw_control_oper - FW control operations
+ * @PDS_CORE_FW_INSTALL_ASYNC:     Install firmware asynchronously
+ * @PDS_CORE_FW_INSTALL_STATUS:    Firmware installation status
+ * @PDS_CORE_FW_ACTIVATE_ASYNC:    Activate firmware asynchronously
+ * @PDS_CORE_FW_ACTIVATE_STATUS:   Firmware activate status
+ * @PDS_CORE_FW_UPDATE_CLEANUP:    Cleanup any firmware update leftovers
+ * @PDS_CORE_FW_GET_BOOT:          Return current active firmware slot
+ * @PDS_CORE_FW_SET_BOOT:          Set active firmware slot for next boot
+ * @PDS_CORE_FW_GET_LIST:          Return list of installed firmware images
+ */
+enum pds_core_fw_control_oper {
+	PDS_CORE_FW_INSTALL_ASYNC          = 0,
+	PDS_CORE_FW_INSTALL_STATUS         = 1,
+	PDS_CORE_FW_ACTIVATE_ASYNC         = 2,
+	PDS_CORE_FW_ACTIVATE_STATUS        = 3,
+	PDS_CORE_FW_UPDATE_CLEANUP         = 4,
+	PDS_CORE_FW_GET_BOOT               = 5,
+	PDS_CORE_FW_SET_BOOT               = 6,
+	PDS_CORE_FW_GET_LIST               = 7,
+};
+
+enum pds_core_fw_slot {
+	PDS_CORE_FW_SLOT_INVALID    = 0,
+	PDS_CORE_FW_SLOT_A	    = 1,
+	PDS_CORE_FW_SLOT_B          = 2,
+	PDS_CORE_FW_SLOT_GOLD       = 3,
+};
+
+/**
+ * struct pds_core_fw_control_cmd - Firmware control command
+ * @opcode:    opcode
+ * @rsvd:      Word boundary padding
+ * @oper:      firmware control operation (enum pds_core_fw_control_oper)
+ * @slot:      slot to operate on (enum pds_core_fw_slot)
+ */
+struct pds_core_fw_control_cmd {
+	u8  opcode;
+	u8  rsvd[3];
+	u8  oper;
+	u8  slot;
+};
+
+/**
+ * struct pds_core_fw_control_comp - Firmware control copletion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:	Word alignment space
+ * @slot:	Slot number (enum pds_core_fw_slot)
+ * @rsvd1:	Struct padding
+ * @color:	Color bit
+ */
+struct pds_core_fw_control_comp {
+	u8     status;
+	u8     rsvd[3];
+	u8     slot;
+	u8     rsvd1[10];
+	u8     color;
+};
+
+struct pds_core_fw_name_info {
+#define PDS_CORE_FWSLOT_BUFLEN		8
+#define PDS_CORE_FWVERS_BUFLEN		32
+	char   slotname[PDS_CORE_FWSLOT_BUFLEN];
+	char   fw_version[PDS_CORE_FWVERS_BUFLEN];
+};
+
+struct pds_core_fw_list_info {
+#define PDS_CORE_FWVERS_LIST_LEN	16
+	u8 num_fw_slots;
+	struct pds_core_fw_name_info fw_names[PDS_CORE_FWVERS_LIST_LEN];
+} __packed;
+
+enum pds_core_vf_attr {
+	PDS_CORE_VF_ATTR_SPOOFCHK	= 1,
+	PDS_CORE_VF_ATTR_TRUST		= 2,
+	PDS_CORE_VF_ATTR_MAC		= 3,
+	PDS_CORE_VF_ATTR_LINKSTATE	= 4,
+	PDS_CORE_VF_ATTR_VLAN		= 5,
+	PDS_CORE_VF_ATTR_RATE		= 6,
+	PDS_CORE_VF_ATTR_STATSADDR	= 7,
+};
+
+/**
+ * enum pds_core_vf_link_status - Virtual Function link status
+ * @PDS_CORE_VF_LINK_STATUS_AUTO:   Use link state of the uplink
+ * @PDS_CORE_VF_LINK_STATUS_UP:     Link always up
+ * @PDS_CORE_VF_LINK_STATUS_DOWN:   Link always down
+ */
+enum pds_core_vf_link_status {
+	PDS_CORE_VF_LINK_STATUS_AUTO = 0,
+	PDS_CORE_VF_LINK_STATUS_UP   = 1,
+	PDS_CORE_VF_LINK_STATUS_DOWN = 2,
+};
+
+/**
+ * struct pds_core_vf_setattr_cmd - Set VF attributes on the NIC
+ * @opcode:     Opcode
+ * @attr:       Attribute type (enum pds_core_vf_attr)
+ * @vf_index:   VF index
+ * @macaddr:	mac address
+ * @vlanid:	vlan ID
+ * @maxrate:	max Tx rate in Mbps
+ * @spoofchk:	enable address spoof checking
+ * @trust:	enable VF trust
+ * @linkstate:	set link up or down
+ * @stats:	stats addr struct
+ * @stats.pa:	set DMA address for VF stats
+ * @stats.len:	length of VF stats space
+ * @pad:	force union to specific size
+ */
+struct pds_core_vf_setattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 vf_index;
+	union {
+		u8     macaddr[6];
+		__le16 vlanid;
+		__le32 maxrate;
+		u8     spoofchk;
+		u8     trust;
+		u8     linkstate;
+		struct {
+			__le64 pa;
+			__le32 len;
+		} stats;
+		u8     pad[60];
+	} __packed;
+};
+
+struct pds_core_vf_setattr_comp {
+	u8     status;
+	u8     attr;
+	__le16 vf_index;
+	__le16 comp_index;
+	u8     rsvd[9];
+	u8     color;
+};
+
+/**
+ * struct pds_core_vf_getattr_cmd - Get VF attributes from the NIC
+ * @opcode:     Opcode
+ * @attr:       Attribute type (enum pds_core_vf_attr)
+ * @vf_index:   VF index
+ */
+struct pds_core_vf_getattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 vf_index;
+};
+
+struct pds_core_vf_getattr_comp {
+	u8     status;
+	u8     attr;
+	__le16 vf_index;
+	union {
+		u8     macaddr[6];
+		__le16 vlanid;
+		__le32 maxrate;
+		u8     spoofchk;
+		u8     trust;
+		u8     linkstate;
+		__le64 stats_pa;
+		u8     pad[11];
+	} __packed;
+	u8     color;
+};
+
+enum pds_core_vf_ctrl_opcode {
+	PDS_CORE_VF_CTRL_START_ALL	= 0,
+	PDS_CORE_VF_CTRL_START		= 1,
+};
+
+/**
+ * struct pds_core_vf_ctrl_cmd - VF control command
+ * @opcode:         Opcode for the command
+ * @ctrl_opcode:    VF control operation type
+ * @vf_index:       VF Index. It is unused if op START_ALL is used.
+ */
+
+struct pds_core_vf_ctrl_cmd {
+	u8	opcode;
+	u8	ctrl_opcode;
+	__le16	vf_index;
+};
+
+/**
+ * struct pds_core_vf_ctrl_comp - VF_CTRL command completion.
+ * @status:     Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_vf_ctrl_comp {
+	u8	status;
+};
+
+/*
+ * union pds_core_dev_cmd - Overlay of core device command structures
+ */
+union pds_core_dev_cmd {
+	u8     opcode;
+	u32    words[16];
+
+	struct pds_core_dev_identify_cmd identify;
+	struct pds_core_dev_init_cmd     init;
+	struct pds_core_dev_reset_cmd    reset;
+	struct pds_core_fw_download_cmd  fw_download;
+	struct pds_core_fw_control_cmd   fw_control;
+
+	struct pds_core_vf_setattr_cmd   vf_setattr;
+	struct pds_core_vf_getattr_cmd   vf_getattr;
+	struct pds_core_vf_ctrl_cmd      vf_ctrl;
+};
+
+/*
+ * union pds_core_dev_comp - Overlay of core device completion structures
+ */
+union pds_core_dev_comp {
+	u8                                status;
+	u8                                bytes[16];
+
+	struct pds_core_dev_identify_comp identify;
+	struct pds_core_dev_reset_comp    reset;
+	struct pds_core_dev_init_comp     init;
+	struct pds_core_fw_download_comp  fw_download;
+	struct pds_core_fw_control_comp   fw_control;
+
+	struct pds_core_vf_setattr_comp   vf_setattr;
+	struct pds_core_vf_getattr_comp   vf_getattr;
+	struct pds_core_vf_ctrl_comp      vf_ctrl;
+};
+
+/**
+ * struct pds_core_dev_hwstamp_regs - Hardware current timestamp registers
+ * @tick_low:        Low 32 bits of hardware timestamp
+ * @tick_high:       High 32 bits of hardware timestamp
+ */
+struct pds_core_dev_hwstamp_regs {
+	u32    tick_low;
+	u32    tick_high;
+};
+
+/**
+ * struct pds_core_dev_info_regs - Device info register format (read-only)
+ * @signature:       Signature value of 0x44455649 ('DEVI')
+ * @version:         Current version of info
+ * @asic_type:       Asic type
+ * @asic_rev:        Asic revision
+ * @fw_status:       Firmware status
+ *			bit 0   - 1 = fw running
+ *			bit 4-7 - 4 bit generation number, changes on fw restart
+ * @fw_heartbeat:    Firmware heartbeat counter
+ * @serial_num:      Serial number
+ * @fw_version:      Firmware version
+ * @oprom_regs:      oprom_regs to store oprom debug enable/disable and bmp
+ * @rsvd_pad1024:    Struct padding
+ * @hwstamp:         Hardware current timestamp registers
+ * @rsvd_pad2048:    Struct padding
+ */
+struct pds_core_dev_info_regs {
+#define PDS_CORE_DEVINFO_FWVERS_BUFLEN 32
+#define PDS_CORE_DEVINFO_SERIAL_BUFLEN 32
+	u32    signature;
+	u8     version;
+	u8     asic_type;
+	u8     asic_rev;
+#define PDS_CORE_FW_STS_F_STOPPED	0x00
+#define PDS_CORE_FW_STS_F_RUNNING	0x01
+#define PDS_CORE_FW_STS_F_GENERATION	0xF0
+	u8     fw_status;
+	__le32 fw_heartbeat;
+	char   fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN];
+	char   serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN];
+	u8     oprom_regs[32];     /* reserved */
+	u8     rsvd_pad1024[916];
+	struct pds_core_dev_hwstamp_regs hwstamp;   /* on 1k boundary */
+	u8     rsvd_pad2048[1016];
+} __packed;
+
+/**
+ * struct pds_core_dev_cmd_regs - Device command register format (read-write)
+ * @doorbell:	Device Cmd Doorbell, write-only
+ *              Write a 1 to signal device to process cmd
+ * @done:	Command completed indicator, poll for completion
+ *              bit 0 == 1 when command is complete
+ * @cmd:	Opcode-specific command bytes
+ * @comp:	Opcode-specific response bytes
+ * @rsvd:	Struct padding
+ * @data:	Opcode-specific side-data
+ */
+struct pds_core_dev_cmd_regs {
+	u32                     doorbell;
+	u32                     done;
+	union pds_core_dev_cmd  cmd;
+	union pds_core_dev_comp comp;
+	u8                      rsvd[48];
+	u32                     data[478];
+} __packed;
+
+/**
+ * struct pds_core_dev_regs - Device register format for bar 0 page 0
+ * @info:            Device info registers
+ * @devcmd:          Device command registers
+ */
+struct pds_core_dev_regs {
+	struct pds_core_dev_info_regs info;
+	struct pds_core_dev_cmd_regs  devcmd;
+} __packed;
+
+#ifndef __CHECKER__
+static_assert(sizeof(struct pds_core_drv_identity) <= 1912);
+static_assert(sizeof(struct pds_core_dev_identity) <= 1912);
+static_assert(sizeof(union pds_core_dev_cmd) == 64);
+static_assert(sizeof(union pds_core_dev_comp) == 16);
+static_assert(sizeof(struct pds_core_dev_info_regs) == 2048);
+static_assert(sizeof(struct pds_core_dev_cmd_regs) == 2048);
+static_assert(sizeof(struct pds_core_dev_regs) == 4096);
+#endif /* __CHECKER__ */
+
+#endif /* _PDS_CORE_IF_H_ */
-- 
cgit v1.2.3


From 523847df1b3718d6286dce0ed1c83742fe0ffa94 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:15 -0700
Subject: pds_core: add devcmd device interfaces

The devcmd interface is the basic connection to the device through the
PCI BAR for low level identification and command services.  This does
the early device initialization and finds the identity data, and adds
devcmd routines to be used by later driver bits.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile  |   4 +-
 drivers/net/ethernet/amd/pds_core/core.c    |  36 +++
 drivers/net/ethernet/amd/pds_core/core.h    |  50 ++++
 drivers/net/ethernet/amd/pds_core/debugfs.c |  38 +++
 drivers/net/ethernet/amd/pds_core/dev.c     | 348 ++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/main.c    |  33 ++-
 include/linux/pds/pds_common.h              |  30 +++
 include/linux/pds/pds_intr.h                | 163 +++++++++++++
 8 files changed, 699 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/core.c
 create mode 100644 drivers/net/ethernet/amd/pds_core/dev.c
 create mode 100644 include/linux/pds/pds_intr.h

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index de3bf1d1886c..95a6c31e92d2 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -3,6 +3,8 @@
 
 obj-$(CONFIG_PDS_CORE) := pds_core.o
 
-pds_core-y := main.o
+pds_core-y := main.o \
+	      dev.o \
+	      core.o
 
 pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
new file mode 100644
index 000000000000..80d2ecb045df
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include "core.h"
+
+int pdsc_setup(struct pdsc *pdsc, bool init)
+{
+	int err = 0;
+
+	if (init)
+		err = pdsc_dev_init(pdsc);
+	else
+		err = pdsc_dev_reinit(pdsc);
+	if (err)
+		return err;
+
+	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
+	return 0;
+}
+
+void pdsc_teardown(struct pdsc *pdsc, bool removing)
+{
+	pdsc_devcmd_reset(pdsc);
+
+	if (removing) {
+		kfree(pdsc->intr_info);
+		pdsc->intr_info = NULL;
+	}
+
+	if (pdsc->kern_dbpage) {
+		iounmap(pdsc->kern_dbpage);
+		pdsc->kern_dbpage = NULL;
+	}
+
+	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
+}
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 34ef837e8cfe..fcf6c6545c49 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -9,8 +9,13 @@
 
 #include <linux/pds/pds_common.h>
 #include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_intr.h>
 
 #define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
+#define PDSC_TEARDOWN_RECOVERY	false
+#define PDSC_TEARDOWN_REMOVING	true
+#define PDSC_SETUP_RECOVERY	false
+#define PDSC_SETUP_INIT		true
 
 struct pdsc_dev_bar {
 	void __iomem *vaddr;
@@ -19,6 +24,22 @@ struct pdsc_dev_bar {
 	int res_index;
 };
 
+struct pdsc_devinfo {
+	u8 asic_type;
+	u8 asic_rev;
+	char fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN + 1];
+	char serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN + 1];
+};
+
+#define PDSC_INTR_NAME_MAX_SZ		32
+
+struct pdsc_intr_info {
+	char name[PDSC_INTR_NAME_MAX_SZ];
+	unsigned int index;
+	unsigned int vector;
+	void *data;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -38,7 +59,19 @@ struct pdsc {
 	int uid;
 
 	unsigned long state;
+	u8 fw_status;
+	u8 fw_generation;
+	unsigned long last_fw_time;
+	u32 last_hb;
+
+	struct pdsc_devinfo dev_info;
+	struct pds_core_dev_identity dev_ident;
+	unsigned int nintrs;
+	struct pdsc_intr_info *intr_info;	/* array of nintrs elements */
 
+	unsigned int devcmd_timeout;
+	struct mutex devcmd_lock;	/* lock for dev_cmd operations */
+	struct mutex config_lock;	/* lock for configuration operations */
 	struct pds_core_dev_info_regs __iomem *info_regs;
 	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
 	struct pds_core_intr __iomem *intr_ctrl;
@@ -52,5 +85,22 @@ void pdsc_debugfs_create(void);
 void pdsc_debugfs_destroy(void);
 void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
+void pdsc_debugfs_add_ident(struct pdsc *pdsc);
+void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
+
+int pdsc_err_to_errno(enum pds_core_status_code code);
+bool pdsc_is_fw_running(struct pdsc *pdsc);
+bool pdsc_is_fw_good(struct pdsc *pdsc);
+int pdsc_devcmd(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		union pds_core_dev_comp *comp, int max_seconds);
+int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		       union pds_core_dev_comp *comp, int max_seconds);
+int pdsc_devcmd_init(struct pdsc *pdsc);
+int pdsc_devcmd_reset(struct pdsc *pdsc);
+int pdsc_dev_reinit(struct pdsc *pdsc);
+int pdsc_dev_init(struct pdsc *pdsc);
+
+int pdsc_setup(struct pdsc *pdsc, bool init);
+void pdsc_teardown(struct pdsc *pdsc, bool removing);
 
 #endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index b2f7cb795c20..601431b41abb 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -29,3 +29,41 @@ void pdsc_debugfs_del_dev(struct pdsc *pdsc)
 	debugfs_remove_recursive(pdsc->dentry);
 	pdsc->dentry = NULL;
 }
+
+static int identity_show(struct seq_file *seq, void *v)
+{
+	struct pdsc *pdsc = seq->private;
+	struct pds_core_dev_identity *ident;
+	int vt;
+
+	ident = &pdsc->dev_ident;
+
+	seq_printf(seq, "fw_heartbeat:     0x%x\n",
+		   ioread32(&pdsc->info_regs->fw_heartbeat));
+
+	seq_printf(seq, "nlifs:            %d\n",
+		   le32_to_cpu(ident->nlifs));
+	seq_printf(seq, "nintrs:           %d\n",
+		   le32_to_cpu(ident->nintrs));
+	seq_printf(seq, "ndbpgs_per_lif:   %d\n",
+		   le32_to_cpu(ident->ndbpgs_per_lif));
+	seq_printf(seq, "intr_coal_mult:   %d\n",
+		   le32_to_cpu(ident->intr_coal_mult));
+	seq_printf(seq, "intr_coal_div:    %d\n",
+		   le32_to_cpu(ident->intr_coal_div));
+
+	seq_puts(seq, "vif_types:        ");
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++)
+		seq_printf(seq, "%d ",
+			   le16_to_cpu(pdsc->dev_ident.vif_types[vt]));
+	seq_puts(seq, "\n");
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(identity);
+
+void pdsc_debugfs_add_ident(struct pdsc *pdsc)
+{
+	debugfs_create_file("identity", 0400, pdsc->dentry,
+			    pdsc, &identity_fops);
+}
diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c
new file mode 100644
index 000000000000..f082d69c5128
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/dev.c
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+
+#include "core.h"
+
+int pdsc_err_to_errno(enum pds_core_status_code code)
+{
+	switch (code) {
+	case PDS_RC_SUCCESS:
+		return 0;
+	case PDS_RC_EVERSION:
+	case PDS_RC_EQTYPE:
+	case PDS_RC_EQID:
+	case PDS_RC_EINVAL:
+	case PDS_RC_ENOSUPP:
+		return -EINVAL;
+	case PDS_RC_EPERM:
+		return -EPERM;
+	case PDS_RC_ENOENT:
+		return -ENOENT;
+	case PDS_RC_EAGAIN:
+		return -EAGAIN;
+	case PDS_RC_ENOMEM:
+		return -ENOMEM;
+	case PDS_RC_EFAULT:
+		return -EFAULT;
+	case PDS_RC_EBUSY:
+		return -EBUSY;
+	case PDS_RC_EEXIST:
+		return -EEXIST;
+	case PDS_RC_EVFID:
+		return -ENODEV;
+	case PDS_RC_ECLIENT:
+		return -ECHILD;
+	case PDS_RC_ENOSPC:
+		return -ENOSPC;
+	case PDS_RC_ERANGE:
+		return -ERANGE;
+	case PDS_RC_BAD_ADDR:
+		return -EFAULT;
+	case PDS_RC_EOPCODE:
+	case PDS_RC_EINTR:
+	case PDS_RC_DEV_CMD:
+	case PDS_RC_ERROR:
+	case PDS_RC_ERDMA:
+	case PDS_RC_EIO:
+	default:
+		return -EIO;
+	}
+}
+
+bool pdsc_is_fw_running(struct pdsc *pdsc)
+{
+	pdsc->fw_status = ioread8(&pdsc->info_regs->fw_status);
+	pdsc->last_fw_time = jiffies;
+	pdsc->last_hb = ioread32(&pdsc->info_regs->fw_heartbeat);
+
+	/* Firmware is useful only if the running bit is set and
+	 * fw_status != 0xff (bad PCI read)
+	 */
+	return (pdsc->fw_status != 0xff) &&
+		(pdsc->fw_status & PDS_CORE_FW_STS_F_RUNNING);
+}
+
+bool pdsc_is_fw_good(struct pdsc *pdsc)
+{
+	u8 gen = pdsc->fw_status & PDS_CORE_FW_STS_F_GENERATION;
+
+	return pdsc_is_fw_running(pdsc) && gen == pdsc->fw_generation;
+}
+
+static u8 pdsc_devcmd_status(struct pdsc *pdsc)
+{
+	return ioread8(&pdsc->cmd_regs->comp.status);
+}
+
+static bool pdsc_devcmd_done(struct pdsc *pdsc)
+{
+	return ioread32(&pdsc->cmd_regs->done) & PDS_CORE_DEV_CMD_DONE;
+}
+
+static void pdsc_devcmd_dbell(struct pdsc *pdsc)
+{
+	iowrite32(0, &pdsc->cmd_regs->done);
+	iowrite32(1, &pdsc->cmd_regs->doorbell);
+}
+
+static void pdsc_devcmd_clean(struct pdsc *pdsc)
+{
+	iowrite32(0, &pdsc->cmd_regs->doorbell);
+	memset_io(&pdsc->cmd_regs->cmd, 0, sizeof(pdsc->cmd_regs->cmd));
+}
+
+static const char *pdsc_devcmd_str(int opcode)
+{
+	switch (opcode) {
+	case PDS_CORE_CMD_NOP:
+		return "PDS_CORE_CMD_NOP";
+	case PDS_CORE_CMD_IDENTIFY:
+		return "PDS_CORE_CMD_IDENTIFY";
+	case PDS_CORE_CMD_RESET:
+		return "PDS_CORE_CMD_RESET";
+	case PDS_CORE_CMD_INIT:
+		return "PDS_CORE_CMD_INIT";
+	case PDS_CORE_CMD_FW_DOWNLOAD:
+		return "PDS_CORE_CMD_FW_DOWNLOAD";
+	case PDS_CORE_CMD_FW_CONTROL:
+		return "PDS_CORE_CMD_FW_CONTROL";
+	default:
+		return "PDS_CORE_CMD_UNKNOWN";
+	}
+}
+
+static int pdsc_devcmd_wait(struct pdsc *pdsc, int max_seconds)
+{
+	struct device *dev = pdsc->dev;
+	unsigned long start_time;
+	unsigned long max_wait;
+	unsigned long duration;
+	int timeout = 0;
+	int done = 0;
+	int err = 0;
+	int status;
+	int opcode;
+
+	opcode = ioread8(&pdsc->cmd_regs->cmd.opcode);
+
+	start_time = jiffies;
+	max_wait = start_time + (max_seconds * HZ);
+
+	while (!done && !timeout) {
+		done = pdsc_devcmd_done(pdsc);
+		if (done)
+			break;
+
+		timeout = time_after(jiffies, max_wait);
+		if (timeout)
+			break;
+
+		usleep_range(100, 200);
+	}
+	duration = jiffies - start_time;
+
+	if (done && duration > HZ)
+		dev_dbg(dev, "DEVCMD %d %s after %ld secs\n",
+			opcode, pdsc_devcmd_str(opcode), duration / HZ);
+
+	if (!done || timeout) {
+		dev_err(dev, "DEVCMD %d %s timeout, done %d timeout %d max_seconds=%d\n",
+			opcode, pdsc_devcmd_str(opcode), done, timeout,
+			max_seconds);
+		err = -ETIMEDOUT;
+		pdsc_devcmd_clean(pdsc);
+	}
+
+	status = pdsc_devcmd_status(pdsc);
+	err = pdsc_err_to_errno(status);
+	if (err && err != -EAGAIN)
+		dev_err(dev, "DEVCMD %d %s failed, status=%d err %d %pe\n",
+			opcode, pdsc_devcmd_str(opcode), status, err,
+			ERR_PTR(err));
+
+	return err;
+}
+
+int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		       union pds_core_dev_comp *comp, int max_seconds)
+{
+	int err;
+
+	memcpy_toio(&pdsc->cmd_regs->cmd, cmd, sizeof(*cmd));
+	pdsc_devcmd_dbell(pdsc);
+	err = pdsc_devcmd_wait(pdsc, max_seconds);
+	memcpy_fromio(comp, &pdsc->cmd_regs->comp, sizeof(*comp));
+
+	return err;
+}
+
+int pdsc_devcmd(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		union pds_core_dev_comp *comp, int max_seconds)
+{
+	int err;
+
+	mutex_lock(&pdsc->devcmd_lock);
+	err = pdsc_devcmd_locked(pdsc, cmd, comp, max_seconds);
+	mutex_unlock(&pdsc->devcmd_lock);
+
+	return err;
+}
+
+int pdsc_devcmd_init(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.opcode = PDS_CORE_CMD_INIT,
+	};
+
+	return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+int pdsc_devcmd_reset(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.reset.opcode = PDS_CORE_CMD_RESET,
+	};
+
+	return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+static int pdsc_devcmd_identify_locked(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.identify.opcode = PDS_CORE_CMD_IDENTIFY,
+		.identify.ver = PDS_CORE_IDENTITY_VERSION_1,
+	};
+
+	return pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+static void pdsc_init_devinfo(struct pdsc *pdsc)
+{
+	pdsc->dev_info.asic_type = ioread8(&pdsc->info_regs->asic_type);
+	pdsc->dev_info.asic_rev = ioread8(&pdsc->info_regs->asic_rev);
+	pdsc->fw_generation = PDS_CORE_FW_STS_F_GENERATION &
+			      ioread8(&pdsc->info_regs->fw_status);
+
+	memcpy_fromio(pdsc->dev_info.fw_version,
+		      pdsc->info_regs->fw_version,
+		      PDS_CORE_DEVINFO_FWVERS_BUFLEN);
+	pdsc->dev_info.fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN] = 0;
+
+	memcpy_fromio(pdsc->dev_info.serial_num,
+		      pdsc->info_regs->serial_num,
+		      PDS_CORE_DEVINFO_SERIAL_BUFLEN);
+	pdsc->dev_info.serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN] = 0;
+
+	dev_dbg(pdsc->dev, "fw_version %s\n", pdsc->dev_info.fw_version);
+}
+
+static int pdsc_identify(struct pdsc *pdsc)
+{
+	struct pds_core_drv_identity drv = {};
+	size_t sz;
+	int err;
+
+	drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX);
+	snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str),
+		 "%s %s", PDS_CORE_DRV_NAME, utsname()->release);
+
+	/* Next let's get some info about the device
+	 * We use the devcmd_lock at this level in order to
+	 * get safe access to the cmd_regs->data before anyone
+	 * else can mess it up
+	 */
+	mutex_lock(&pdsc->devcmd_lock);
+
+	sz = min_t(size_t, sizeof(drv), sizeof(pdsc->cmd_regs->data));
+	memcpy_toio(&pdsc->cmd_regs->data, &drv, sz);
+
+	err = pdsc_devcmd_identify_locked(pdsc);
+	if (!err) {
+		sz = min_t(size_t, sizeof(pdsc->dev_ident),
+			   sizeof(pdsc->cmd_regs->data));
+		memcpy_fromio(&pdsc->dev_ident, &pdsc->cmd_regs->data, sz);
+	}
+	mutex_unlock(&pdsc->devcmd_lock);
+
+	if (err) {
+		dev_err(pdsc->dev, "Cannot identify device: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	if (isprint(pdsc->dev_info.fw_version[0]) &&
+	    isascii(pdsc->dev_info.fw_version[0]))
+		dev_info(pdsc->dev, "FW: %.*s\n",
+			 (int)(sizeof(pdsc->dev_info.fw_version) - 1),
+			 pdsc->dev_info.fw_version);
+	else
+		dev_info(pdsc->dev, "FW: (invalid string) 0x%02x 0x%02x 0x%02x 0x%02x ...\n",
+			 (u8)pdsc->dev_info.fw_version[0],
+			 (u8)pdsc->dev_info.fw_version[1],
+			 (u8)pdsc->dev_info.fw_version[2],
+			 (u8)pdsc->dev_info.fw_version[3]);
+
+	return 0;
+}
+
+int pdsc_dev_reinit(struct pdsc *pdsc)
+{
+	pdsc_init_devinfo(pdsc);
+
+	return pdsc_identify(pdsc);
+}
+
+int pdsc_dev_init(struct pdsc *pdsc)
+{
+	unsigned int nintrs;
+	int err;
+
+	/* Initial init and reset of device */
+	pdsc_init_devinfo(pdsc);
+	pdsc->devcmd_timeout = PDS_CORE_DEVCMD_TIMEOUT;
+
+	err = pdsc_devcmd_reset(pdsc);
+	if (err)
+		return err;
+
+	err = pdsc_identify(pdsc);
+	if (err)
+		return err;
+
+	pdsc_debugfs_add_ident(pdsc);
+
+	/* Now we can reserve interrupts */
+	nintrs = le32_to_cpu(pdsc->dev_ident.nintrs);
+	nintrs = min_t(unsigned int, num_online_cpus(), nintrs);
+
+	/* Get intr_info struct array for tracking */
+	pdsc->intr_info = kcalloc(nintrs, sizeof(*pdsc->intr_info), GFP_KERNEL);
+	if (!pdsc->intr_info) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	err = pci_alloc_irq_vectors(pdsc->pdev, nintrs, nintrs, PCI_IRQ_MSIX);
+	if (err != nintrs) {
+		dev_err(pdsc->dev, "Can't get %d intrs from OS: %pe\n",
+			nintrs, ERR_PTR(err));
+		err = -ENOSPC;
+		goto err_out;
+	}
+	pdsc->nintrs = nintrs;
+
+	return 0;
+
+err_out:
+	kfree(pdsc->intr_info);
+	pdsc->intr_info = NULL;
+
+	return err;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index c2b12f226959..09afb069dcb3 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -137,6 +137,18 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 	if (err)
 		goto err_out_release_regions;
 
+	mutex_init(&pdsc->devcmd_lock);
+	mutex_init(&pdsc->config_lock);
+
+	mutex_lock(&pdsc->config_lock);
+	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
+
+	err = pdsc_setup(pdsc, PDSC_SETUP_INIT);
+	if (err)
+		goto err_out_unmap_bars;
+
+	mutex_unlock(&pdsc->config_lock);
+
 	dl = priv_to_devlink(pdsc);
 	devl_lock(dl);
 	devl_register(dl);
@@ -144,6 +156,12 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	return 0;
 
+err_out_unmap_bars:
+	mutex_unlock(&pdsc->config_lock);
+	mutex_destroy(&pdsc->config_lock);
+	mutex_destroy(&pdsc->devcmd_lock);
+	pci_free_irq_vectors(pdsc->pdev);
+	pdsc_unmap_bars(pdsc);
 err_out_release_regions:
 	pci_release_regions(pdsc->pdev);
 
@@ -240,8 +258,19 @@ static void pdsc_remove(struct pci_dev *pdev)
 	devl_unregister(dl);
 	devl_unlock(dl);
 
-	pdsc_unmap_bars(pdsc);
-	pci_release_regions(pdev);
+	if (!pdev->is_virtfn) {
+		mutex_lock(&pdsc->config_lock);
+		set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state);
+
+		pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
+		mutex_unlock(&pdsc->config_lock);
+		mutex_destroy(&pdsc->config_lock);
+		mutex_destroy(&pdsc->devcmd_lock);
+
+		pci_free_irq_vectors(pdev);
+		pdsc_unmap_bars(pdsc);
+		pci_release_regions(pdev);
+	}
 
 	pci_clear_master(pdev);
 	pci_disable_device(pdev);
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index bd041a5170a6..f0798ce01acf 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -11,4 +11,34 @@
 #define PDS_CORE_ADDR_MASK	(BIT_ULL(PDS_ADDR_LEN) - 1)
 #define PDS_PAGE_SIZE		4096
 
+enum pds_core_driver_type {
+	PDS_DRIVER_LINUX   = 1,
+	PDS_DRIVER_WIN     = 2,
+	PDS_DRIVER_DPDK    = 3,
+	PDS_DRIVER_FREEBSD = 4,
+	PDS_DRIVER_IPXE    = 5,
+	PDS_DRIVER_ESXI    = 6,
+};
+
+#define PDS_CORE_IFNAMSIZ		16
+
+/**
+ * enum pds_core_logical_qtype - Logical Queue Types
+ * @PDS_CORE_QTYPE_ADMINQ:    Administrative Queue
+ * @PDS_CORE_QTYPE_NOTIFYQ:   Notify Queue
+ * @PDS_CORE_QTYPE_RXQ:       Receive Queue
+ * @PDS_CORE_QTYPE_TXQ:       Transmit Queue
+ * @PDS_CORE_QTYPE_EQ:        Event Queue
+ * @PDS_CORE_QTYPE_MAX:       Max queue type supported
+ */
+enum pds_core_logical_qtype {
+	PDS_CORE_QTYPE_ADMINQ  = 0,
+	PDS_CORE_QTYPE_NOTIFYQ = 1,
+	PDS_CORE_QTYPE_RXQ     = 2,
+	PDS_CORE_QTYPE_TXQ     = 3,
+	PDS_CORE_QTYPE_EQ      = 4,
+
+	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
+};
+
 #endif /* _PDS_COMMON_H_ */
diff --git a/include/linux/pds/pds_intr.h b/include/linux/pds/pds_intr.h
new file mode 100644
index 000000000000..56277c37248c
--- /dev/null
+++ b/include/linux/pds/pds_intr.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_INTR_H_
+#define _PDS_INTR_H_
+
+/*
+ * Interrupt control register
+ * @coal_init:        Coalescing timer initial value, in
+ *                    device units.  Use @identity->intr_coal_mult
+ *                    and @identity->intr_coal_div to convert from
+ *                    usecs to device units:
+ *
+ *                      coal_init = coal_usecs * coal_mutl / coal_div
+ *
+ *                    When an interrupt is sent the interrupt
+ *                    coalescing timer current value
+ *                    (@coalescing_curr) is initialized with this
+ *                    value and begins counting down.  No more
+ *                    interrupts are sent until the coalescing
+ *                    timer reaches 0.  When @coalescing_init=0
+ *                    interrupt coalescing is effectively disabled
+ *                    and every interrupt assert results in an
+ *                    interrupt.  Reset value: 0
+ * @mask:             Interrupt mask.  When @mask=1 the interrupt
+ *                    resource will not send an interrupt.  When
+ *                    @mask=0 the interrupt resource will send an
+ *                    interrupt if an interrupt event is pending
+ *                    or on the next interrupt assertion event.
+ *                    Reset value: 1
+ * @credits:          Interrupt credits.  This register indicates
+ *                    how many interrupt events the hardware has
+ *                    sent.  When written by software this
+ *                    register atomically decrements @int_credits
+ *                    by the value written.  When @int_credits
+ *                    becomes 0 then the "pending interrupt" bit
+ *                    in the Interrupt Status register is cleared
+ *                    by the hardware and any pending but unsent
+ *                    interrupts are cleared.
+ *                    !!!IMPORTANT!!! This is a signed register.
+ * @flags:            Interrupt control flags
+ *                       @unmask -- When this bit is written with a 1
+ *                       the interrupt resource will set mask=0.
+ *                       @coal_timer_reset -- When this
+ *                       bit is written with a 1 the
+ *                       @coalescing_curr will be reloaded with
+ *                       @coalescing_init to reset the coalescing
+ *                       timer.
+ * @mask_on_assert:   Automatically mask on assertion.  When
+ *                    @mask_on_assert=1 the interrupt resource
+ *                    will set @mask=1 whenever an interrupt is
+ *                    sent.  When using interrupts in Legacy
+ *                    Interrupt mode the driver must select
+ *                    @mask_on_assert=0 for proper interrupt
+ *                    operation.
+ * @coalescing_curr:  Coalescing timer current value, in
+ *                    microseconds.  When this value reaches 0
+ *                    the interrupt resource is again eligible to
+ *                    send an interrupt.  If an interrupt event
+ *                    is already pending when @coalescing_curr
+ *                    reaches 0 the pending interrupt will be
+ *                    sent, otherwise an interrupt will be sent
+ *                    on the next interrupt assertion event.
+ */
+struct pds_core_intr {
+	u32 coal_init;
+	u32 mask;
+	u16 credits;
+	u16 flags;
+#define PDS_CORE_INTR_F_UNMASK		0x0001
+#define PDS_CORE_INTR_F_TIMER_RESET	0x0002
+	u32 mask_on_assert;
+	u32 coalescing_curr;
+	u32 rsvd6[3];
+};
+
+#ifndef __CHECKER__
+static_assert(sizeof(struct pds_core_intr) == 32);
+#endif /* __CHECKER__ */
+
+#define PDS_CORE_INTR_CTRL_REGS_MAX		2048
+#define PDS_CORE_INTR_CTRL_COAL_MAX		0x3F
+#define PDS_CORE_INTR_INDEX_NOT_ASSIGNED	-1
+
+struct pds_core_intr_status {
+	u32 status[2];
+};
+
+/**
+ * enum pds_core_intr_mask_vals - valid values for mask and mask_assert.
+ * @PDS_CORE_INTR_MASK_CLEAR:	unmask interrupt.
+ * @PDS_CORE_INTR_MASK_SET:	mask interrupt.
+ */
+enum pds_core_intr_mask_vals {
+	PDS_CORE_INTR_MASK_CLEAR	= 0,
+	PDS_CORE_INTR_MASK_SET		= 1,
+};
+
+/**
+ * enum pds_core_intr_credits_bits - Bitwise composition of credits values.
+ * @PDS_CORE_INTR_CRED_COUNT:	bit mask of credit count, no shift needed.
+ * @PDS_CORE_INTR_CRED_COUNT_SIGNED: bit mask of credit count, including sign bit.
+ * @PDS_CORE_INTR_CRED_UNMASK:	unmask the interrupt.
+ * @PDS_CORE_INTR_CRED_RESET_COALESCE: reset the coalesce timer.
+ * @PDS_CORE_INTR_CRED_REARM:	unmask the and reset the timer.
+ */
+enum pds_core_intr_credits_bits {
+	PDS_CORE_INTR_CRED_COUNT		= 0x7fffu,
+	PDS_CORE_INTR_CRED_COUNT_SIGNED		= 0xffffu,
+	PDS_CORE_INTR_CRED_UNMASK		= 0x10000u,
+	PDS_CORE_INTR_CRED_RESET_COALESCE	= 0x20000u,
+	PDS_CORE_INTR_CRED_REARM		= (PDS_CORE_INTR_CRED_UNMASK |
+					   PDS_CORE_INTR_CRED_RESET_COALESCE),
+};
+
+static inline void
+pds_core_intr_coal_init(struct pds_core_intr __iomem *intr_ctrl, u32 coal)
+{
+	iowrite32(coal, &intr_ctrl->coal_init);
+}
+
+static inline void
+pds_core_intr_mask(struct pds_core_intr __iomem *intr_ctrl, u32 mask)
+{
+	iowrite32(mask, &intr_ctrl->mask);
+}
+
+static inline void
+pds_core_intr_credits(struct pds_core_intr __iomem *intr_ctrl,
+		      u32 cred, u32 flags)
+{
+	if (WARN_ON_ONCE(cred > PDS_CORE_INTR_CRED_COUNT)) {
+		cred = ioread32(&intr_ctrl->credits);
+		cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED;
+	}
+
+	iowrite32(cred | flags, &intr_ctrl->credits);
+}
+
+static inline void
+pds_core_intr_clean_flags(struct pds_core_intr __iomem *intr_ctrl, u32 flags)
+{
+	u32 cred;
+
+	cred = ioread32(&intr_ctrl->credits);
+	cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED;
+	cred |= flags;
+	iowrite32(cred, &intr_ctrl->credits);
+}
+
+static inline void
+pds_core_intr_clean(struct pds_core_intr __iomem *intr_ctrl)
+{
+	pds_core_intr_clean_flags(intr_ctrl, PDS_CORE_INTR_CRED_RESET_COALESCE);
+}
+
+static inline void
+pds_core_intr_mask_assert(struct pds_core_intr __iomem *intr_ctrl, u32 mask)
+{
+	iowrite32(mask, &intr_ctrl->mask_on_assert);
+}
+
+#endif /* _PDS_INTR_H_ */
-- 
cgit v1.2.3


From 45d76f492938cdc27ddadc16e1e75103f4cfbf56 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:18 -0700
Subject: pds_core: set up device and adminq

Set up the basic adminq and notifyq queue structures.  These are
used mostly by the client drivers for feature configuration.
These are essentially the same adminq and notifyq as in the
ionic driver.

Part of this includes querying for device identity and FW
information, so we can make that available to devlink dev info.

  $ devlink dev info pci/0000:b5:00.0
  pci/0000:b5:00.0:
    driver pds_core
    serial_number FLM18420073
    versions:
        fixed:
          asic.id 0x0
          asic.rev 0x0
        running:
          fw 1.51.0-73
        stored:
          fw.goldfw 1.15.9-C-22
          fw.mainfwa 1.60.0-73
          fw.mainfwb 1.60.0-57

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../device_drivers/ethernet/amd/pds_core.rst       |  47 ++
 drivers/net/ethernet/amd/pds_core/core.c           | 431 +++++++++++++-
 drivers/net/ethernet/amd/pds_core/core.h           | 151 +++++
 drivers/net/ethernet/amd/pds_core/debugfs.c        |  77 +++
 drivers/net/ethernet/amd/pds_core/devlink.c        |  61 ++
 drivers/net/ethernet/amd/pds_core/main.c           |  17 +-
 include/linux/pds/pds_adminq.h                     | 638 +++++++++++++++++++++
 7 files changed, 1418 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/pds/pds_adminq.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
index 5b88173a20ff..a48eafb3d0d3 100644
--- a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
+++ b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
@@ -26,6 +26,53 @@ messages such as these::
   pds_core 0000:b6:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
   pds_core 0000:b6:00.0: FW: 1.60.0-73
 
+Driver and firmware version information can be gathered with devlink::
+
+  $ devlink dev info pci/0000:b5:00.0
+  pci/0000:b5:00.0:
+    driver pds_core
+    serial_number FLM18420073
+    versions:
+        fixed:
+          asic.id 0x0
+          asic.rev 0x0
+        running:
+          fw 1.51.0-73
+        stored:
+          fw.goldfw 1.15.9-C-22
+          fw.mainfwa 1.60.0-73
+          fw.mainfwb 1.60.0-57
+
+Info versions
+=============
+
+The ``pds_core`` driver reports the following versions
+
+.. list-table:: devlink info versions implemented
+   :widths: 5 5 90
+
+   * - Name
+     - Type
+     - Description
+   * - ``fw``
+     - running
+     - Version of firmware running on the device
+   * - ``fw.goldfw``
+     - stored
+     - Version of firmware stored in the goldfw slot
+   * - ``fw.mainfwa``
+     - stored
+     - Version of firmware stored in the mainfwa slot
+   * - ``fw.mainfwb``
+     - stored
+     - Version of firmware stored in the mainfwb slot
+   * - ``asic.id``
+     - fixed
+     - The ASIC type for this device
+   * - ``asic.rev``
+     - fixed
+     - The revision of the ASIC for this device
+
 Health Reporters
 ================
 
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index ab8531386226..8c0dbdb5efc5 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -1,10 +1,365 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2023 Advanced Micro Devices, Inc */
 
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
 #include "core.h"
 
+void pdsc_work_thread(struct work_struct *work)
+{
+	/* stub */
+}
+
+irqreturn_t pdsc_adminq_isr(int irq, void *data)
+{
+	/* stub */
+	return IRQ_HANDLED;
+}
+
+void pdsc_intr_free(struct pdsc *pdsc, int index)
+{
+	struct pdsc_intr_info *intr_info;
+
+	if (index >= pdsc->nintrs || index < 0) {
+		WARN(true, "bad intr index %d\n", index);
+		return;
+	}
+
+	intr_info = &pdsc->intr_info[index];
+	if (!intr_info->vector)
+		return;
+	dev_dbg(pdsc->dev, "%s: idx %d vec %d name %s\n",
+		__func__, index, intr_info->vector, intr_info->name);
+
+	pds_core_intr_mask(&pdsc->intr_ctrl[index], PDS_CORE_INTR_MASK_SET);
+	pds_core_intr_clean(&pdsc->intr_ctrl[index]);
+
+	free_irq(intr_info->vector, intr_info->data);
+
+	memset(intr_info, 0, sizeof(*intr_info));
+}
+
+int pdsc_intr_alloc(struct pdsc *pdsc, char *name,
+		    irq_handler_t handler, void *data)
+{
+	struct pdsc_intr_info *intr_info;
+	unsigned int index;
+	int err;
+
+	/* Find the first available interrupt */
+	for (index = 0; index < pdsc->nintrs; index++)
+		if (!pdsc->intr_info[index].vector)
+			break;
+	if (index >= pdsc->nintrs) {
+		dev_warn(pdsc->dev, "%s: no intr, index=%d nintrs=%d\n",
+			 __func__, index, pdsc->nintrs);
+		return -ENOSPC;
+	}
+
+	pds_core_intr_clean_flags(&pdsc->intr_ctrl[index],
+				  PDS_CORE_INTR_CRED_RESET_COALESCE);
+
+	intr_info = &pdsc->intr_info[index];
+
+	intr_info->index = index;
+	intr_info->data = data;
+	strscpy(intr_info->name, name, sizeof(intr_info->name));
+
+	/* Get the OS vector number for the interrupt */
+	err = pci_irq_vector(pdsc->pdev, index);
+	if (err < 0) {
+		dev_err(pdsc->dev, "failed to get intr vector index %d: %pe\n",
+			index, ERR_PTR(err));
+		goto err_out_free_intr;
+	}
+	intr_info->vector = err;
+
+	/* Init the device's intr mask */
+	pds_core_intr_clean(&pdsc->intr_ctrl[index]);
+	pds_core_intr_mask_assert(&pdsc->intr_ctrl[index], 1);
+	pds_core_intr_mask(&pdsc->intr_ctrl[index], PDS_CORE_INTR_MASK_SET);
+
+	/* Register the isr with a name */
+	err = request_irq(intr_info->vector, handler, 0, intr_info->name, data);
+	if (err) {
+		dev_err(pdsc->dev, "failed to get intr irq vector %d: %pe\n",
+			intr_info->vector, ERR_PTR(err));
+		goto err_out_free_intr;
+	}
+
+	return index;
+
+err_out_free_intr:
+	pdsc_intr_free(pdsc, index);
+	return err;
+}
+
+static void pdsc_qcq_intr_free(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	if (!(qcq->flags & PDS_CORE_QCQ_F_INTR) ||
+	    qcq->intx == PDS_CORE_INTR_INDEX_NOT_ASSIGNED)
+		return;
+
+	pdsc_intr_free(pdsc, qcq->intx);
+	qcq->intx = PDS_CORE_INTR_INDEX_NOT_ASSIGNED;
+}
+
+static int pdsc_qcq_intr_alloc(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	char name[PDSC_INTR_NAME_MAX_SZ];
+	int index;
+
+	if (!(qcq->flags & PDS_CORE_QCQ_F_INTR)) {
+		qcq->intx = PDS_CORE_INTR_INDEX_NOT_ASSIGNED;
+		return 0;
+	}
+
+	snprintf(name, sizeof(name), "%s-%d-%s",
+		 PDS_CORE_DRV_NAME, pdsc->pdev->bus->number, qcq->q.name);
+	index = pdsc_intr_alloc(pdsc, name, pdsc_adminq_isr, qcq);
+	if (index < 0)
+		return index;
+	qcq->intx = index;
+
+	return 0;
+}
+
+void pdsc_qcq_free(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	struct device *dev = pdsc->dev;
+
+	if (!(qcq && qcq->pdsc))
+		return;
+
+	pdsc_debugfs_del_qcq(qcq);
+
+	pdsc_qcq_intr_free(pdsc, qcq);
+
+	if (qcq->q_base)
+		dma_free_coherent(dev, qcq->q_size,
+				  qcq->q_base, qcq->q_base_pa);
+
+	if (qcq->cq_base)
+		dma_free_coherent(dev, qcq->cq_size,
+				  qcq->cq_base, qcq->cq_base_pa);
+
+	if (qcq->cq.info)
+		vfree(qcq->cq.info);
+
+	if (qcq->q.info)
+		vfree(qcq->q.info);
+
+	memset(qcq, 0, sizeof(*qcq));
+}
+
+static void pdsc_q_map(struct pdsc_queue *q, void *base, dma_addr_t base_pa)
+{
+	struct pdsc_q_info *cur;
+	unsigned int i;
+
+	q->base = base;
+	q->base_pa = base_pa;
+
+	for (i = 0, cur = q->info; i < q->num_descs; i++, cur++)
+		cur->desc = base + (i * q->desc_size);
+}
+
+static void pdsc_cq_map(struct pdsc_cq *cq, void *base, dma_addr_t base_pa)
+{
+	struct pdsc_cq_info *cur;
+	unsigned int i;
+
+	cq->base = base;
+	cq->base_pa = base_pa;
+
+	for (i = 0, cur = cq->info; i < cq->num_descs; i++, cur++)
+		cur->comp = base + (i * cq->desc_size);
+}
+
+int pdsc_qcq_alloc(struct pdsc *pdsc, unsigned int type, unsigned int index,
+		   const char *name, unsigned int flags, unsigned int num_descs,
+		   unsigned int desc_size, unsigned int cq_desc_size,
+		   unsigned int pid, struct pdsc_qcq *qcq)
+{
+	struct device *dev = pdsc->dev;
+	void *q_base, *cq_base;
+	dma_addr_t cq_base_pa;
+	dma_addr_t q_base_pa;
+	int err;
+
+	qcq->q.info = vzalloc(num_descs * sizeof(*qcq->q.info));
+	if (!qcq->q.info) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	qcq->pdsc = pdsc;
+	qcq->flags = flags;
+	INIT_WORK(&qcq->work, pdsc_work_thread);
+
+	qcq->q.type = type;
+	qcq->q.index = index;
+	qcq->q.num_descs = num_descs;
+	qcq->q.desc_size = desc_size;
+	qcq->q.tail_idx = 0;
+	qcq->q.head_idx = 0;
+	qcq->q.pid = pid;
+	snprintf(qcq->q.name, sizeof(qcq->q.name), "%s%u", name, index);
+
+	err = pdsc_qcq_intr_alloc(pdsc, qcq);
+	if (err)
+		goto err_out_free_q_info;
+
+	qcq->cq.info = vzalloc(num_descs * sizeof(*qcq->cq.info));
+	if (!qcq->cq.info) {
+		err = -ENOMEM;
+		goto err_out_free_irq;
+	}
+
+	qcq->cq.bound_intr = &pdsc->intr_info[qcq->intx];
+	qcq->cq.num_descs = num_descs;
+	qcq->cq.desc_size = cq_desc_size;
+	qcq->cq.tail_idx = 0;
+	qcq->cq.done_color = 1;
+
+	if (flags & PDS_CORE_QCQ_F_NOTIFYQ) {
+		/* q & cq need to be contiguous in case of notifyq */
+		qcq->q_size = PDS_PAGE_SIZE +
+			      ALIGN(num_descs * desc_size, PDS_PAGE_SIZE) +
+			      ALIGN(num_descs * cq_desc_size, PDS_PAGE_SIZE);
+		qcq->q_base = dma_alloc_coherent(dev,
+						 qcq->q_size + qcq->cq_size,
+						 &qcq->q_base_pa,
+						 GFP_KERNEL);
+		if (!qcq->q_base) {
+			err = -ENOMEM;
+			goto err_out_free_cq_info;
+		}
+		q_base = PTR_ALIGN(qcq->q_base, PDS_PAGE_SIZE);
+		q_base_pa = ALIGN(qcq->q_base_pa, PDS_PAGE_SIZE);
+		pdsc_q_map(&qcq->q, q_base, q_base_pa);
+
+		cq_base = PTR_ALIGN(q_base +
+				    ALIGN(num_descs * desc_size, PDS_PAGE_SIZE),
+				    PDS_PAGE_SIZE);
+		cq_base_pa = ALIGN(qcq->q_base_pa +
+				   ALIGN(num_descs * desc_size, PDS_PAGE_SIZE),
+				   PDS_PAGE_SIZE);
+
+	} else {
+		/* q DMA descriptors */
+		qcq->q_size = PDS_PAGE_SIZE + (num_descs * desc_size);
+		qcq->q_base = dma_alloc_coherent(dev, qcq->q_size,
+						 &qcq->q_base_pa,
+						 GFP_KERNEL);
+		if (!qcq->q_base) {
+			err = -ENOMEM;
+			goto err_out_free_cq_info;
+		}
+		q_base = PTR_ALIGN(qcq->q_base, PDS_PAGE_SIZE);
+		q_base_pa = ALIGN(qcq->q_base_pa, PDS_PAGE_SIZE);
+		pdsc_q_map(&qcq->q, q_base, q_base_pa);
+
+		/* cq DMA descriptors */
+		qcq->cq_size = PDS_PAGE_SIZE + (num_descs * cq_desc_size);
+		qcq->cq_base = dma_alloc_coherent(dev, qcq->cq_size,
+						  &qcq->cq_base_pa,
+						  GFP_KERNEL);
+		if (!qcq->cq_base) {
+			err = -ENOMEM;
+			goto err_out_free_q;
+		}
+		cq_base = PTR_ALIGN(qcq->cq_base, PDS_PAGE_SIZE);
+		cq_base_pa = ALIGN(qcq->cq_base_pa, PDS_PAGE_SIZE);
+	}
+
+	pdsc_cq_map(&qcq->cq, cq_base, cq_base_pa);
+	qcq->cq.bound_q = &qcq->q;
+
+	pdsc_debugfs_add_qcq(pdsc, qcq);
+
+	return 0;
+
+err_out_free_q:
+	dma_free_coherent(dev, qcq->q_size, qcq->q_base, qcq->q_base_pa);
+err_out_free_cq_info:
+	vfree(qcq->cq.info);
+err_out_free_irq:
+	pdsc_qcq_intr_free(pdsc, qcq);
+err_out_free_q_info:
+	vfree(qcq->q.info);
+	memset(qcq, 0, sizeof(*qcq));
+err_out:
+	dev_err(dev, "qcq alloc of %s%d failed %d\n", name, index, err);
+	return err;
+}
+
+static int pdsc_core_init(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.init.opcode = PDS_CORE_CMD_INIT,
+	};
+	struct pds_core_dev_init_data_out cido;
+	struct pds_core_dev_init_data_in cidi;
+	u32 dbid_count;
+	u32 dbpage_num;
+	size_t sz;
+	int err;
+
+	cidi.adminq_q_base = cpu_to_le64(pdsc->adminqcq.q_base_pa);
+	cidi.adminq_cq_base = cpu_to_le64(pdsc->adminqcq.cq_base_pa);
+	cidi.notifyq_cq_base = cpu_to_le64(pdsc->notifyqcq.cq.base_pa);
+	cidi.flags = cpu_to_le32(PDS_CORE_QINIT_F_IRQ | PDS_CORE_QINIT_F_ENA);
+	cidi.intr_index = cpu_to_le16(pdsc->adminqcq.intx);
+	cidi.adminq_ring_size = ilog2(pdsc->adminqcq.q.num_descs);
+	cidi.notifyq_ring_size = ilog2(pdsc->notifyqcq.q.num_descs);
+
+	mutex_lock(&pdsc->devcmd_lock);
+
+	sz = min_t(size_t, sizeof(cidi), sizeof(pdsc->cmd_regs->data));
+	memcpy_toio(&pdsc->cmd_regs->data, &cidi, sz);
+
+	err = pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+	if (!err) {
+		sz = min_t(size_t, sizeof(cido), sizeof(pdsc->cmd_regs->data));
+		memcpy_fromio(&cido, &pdsc->cmd_regs->data, sz);
+	}
+
+	mutex_unlock(&pdsc->devcmd_lock);
+	if (err) {
+		dev_err(pdsc->dev, "Device init command failed: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	pdsc->hw_index = le32_to_cpu(cido.core_hw_index);
+
+	dbid_count = le32_to_cpu(pdsc->dev_ident.ndbpgs_per_lif);
+	dbpage_num = pdsc->hw_index * dbid_count;
+	pdsc->kern_dbpage = pdsc_map_dbpage(pdsc, dbpage_num);
+	if (!pdsc->kern_dbpage) {
+		dev_err(pdsc->dev, "Cannot map dbpage, aborting\n");
+		return -ENOMEM;
+	}
+
+	pdsc->adminqcq.q.hw_type = cido.adminq_hw_type;
+	pdsc->adminqcq.q.hw_index = le32_to_cpu(cido.adminq_hw_index);
+	pdsc->adminqcq.q.dbval = PDS_CORE_DBELL_QID(pdsc->adminqcq.q.hw_index);
+
+	pdsc->notifyqcq.q.hw_type = cido.notifyq_hw_type;
+	pdsc->notifyqcq.q.hw_index = le32_to_cpu(cido.notifyq_hw_index);
+	pdsc->notifyqcq.q.dbval = PDS_CORE_DBELL_QID(pdsc->notifyqcq.q.hw_index);
+
+	pdsc->last_eid = 0;
+
+	return err;
+}
+
 int pdsc_setup(struct pdsc *pdsc, bool init)
 {
+	int numdescs;
 	int err;
 
 	if (init)
@@ -14,17 +369,60 @@ int pdsc_setup(struct pdsc *pdsc, bool init)
 	if (err)
 		return err;
 
+	/* Scale the descriptor ring length based on number of CPUs and VFs */
+	numdescs = max_t(int, PDSC_ADMINQ_MIN_LENGTH, num_online_cpus());
+	numdescs += 2 * pci_sriov_get_totalvfs(pdsc->pdev);
+	numdescs = roundup_pow_of_two(numdescs);
+	err = pdsc_qcq_alloc(pdsc, PDS_CORE_QTYPE_ADMINQ, 0, "adminq",
+			     PDS_CORE_QCQ_F_CORE | PDS_CORE_QCQ_F_INTR,
+			     numdescs,
+			     sizeof(union pds_core_adminq_cmd),
+			     sizeof(union pds_core_adminq_comp),
+			     0, &pdsc->adminqcq);
+	if (err)
+		goto err_out_teardown;
+
+	err = pdsc_qcq_alloc(pdsc, PDS_CORE_QTYPE_NOTIFYQ, 0, "notifyq",
+			     PDS_CORE_QCQ_F_NOTIFYQ,
+			     PDSC_NOTIFYQ_LENGTH,
+			     sizeof(struct pds_core_notifyq_cmd),
+			     sizeof(union pds_core_notifyq_comp),
+			     0, &pdsc->notifyqcq);
+	if (err)
+		goto err_out_teardown;
+
+	/* NotifyQ rides on the AdminQ interrupt */
+	pdsc->notifyqcq.intx = pdsc->adminqcq.intx;
+
+	/* Set up the Core with the AdminQ and NotifyQ info */
+	err = pdsc_core_init(pdsc);
+	if (err)
+		goto err_out_teardown;
+
 	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
 	return 0;
+
+err_out_teardown:
+	pdsc_teardown(pdsc, init);
+	return err;
 }
 
 void pdsc_teardown(struct pdsc *pdsc, bool removing)
 {
+	int i;
+
 	pdsc_devcmd_reset(pdsc);
+	pdsc_qcq_free(pdsc, &pdsc->notifyqcq);
+	pdsc_qcq_free(pdsc, &pdsc->adminqcq);
+
+	if (pdsc->intr_info) {
+		for (i = 0; i < pdsc->nintrs; i++)
+			pdsc_intr_free(pdsc, i);
 
-	if (removing) {
-		kfree(pdsc->intr_info);
-		pdsc->intr_info = NULL;
+		if (removing) {
+			kfree(pdsc->intr_info);
+			pdsc->intr_info = NULL;
+		}
 	}
 
 	if (pdsc->kern_dbpage) {
@@ -35,6 +433,28 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing)
 	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
 }
 
+int pdsc_start(struct pdsc *pdsc)
+{
+	pds_core_intr_mask(&pdsc->intr_ctrl[pdsc->adminqcq.intx],
+			   PDS_CORE_INTR_MASK_CLEAR);
+
+	return 0;
+}
+
+void pdsc_stop(struct pdsc *pdsc)
+{
+	int i;
+
+	if (!pdsc->intr_info)
+		return;
+
+	/* Mask interrupts that are in use */
+	for (i = 0; i < pdsc->nintrs; i++)
+		if (pdsc->intr_info[i].vector)
+			pds_core_intr_mask(&pdsc->intr_ctrl[i],
+					   PDS_CORE_INTR_MASK_SET);
+}
+
 static void pdsc_fw_down(struct pdsc *pdsc)
 {
 	if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
@@ -44,6 +464,7 @@ static void pdsc_fw_down(struct pdsc *pdsc)
 
 	devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc);
 
+	pdsc_stop(pdsc);
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY);
 }
 
@@ -60,6 +481,10 @@ static void pdsc_fw_up(struct pdsc *pdsc)
 	if (err)
 		goto err_out;
 
+	err = pdsc_start(pdsc);
+	if (err)
+		goto err_out;
+
 	pdsc->fw_recoveries++;
 	devlink_health_reporter_state_update(pdsc->fw_reporter,
 					     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 32aa38c40024..7adbd10cb6d9 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -9,11 +9,15 @@
 
 #include <linux/pds/pds_common.h>
 #include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
 #include <linux/pds/pds_intr.h>
 
 #define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
 
 #define PDSC_WATCHDOG_SECS	5
+#define PDSC_QUEUE_NAME_MAX_SZ  32
+#define PDSC_ADMINQ_MIN_LENGTH	16	/* must be a power of two */
+#define PDSC_NOTIFYQ_LENGTH	64	/* must be a power of two */
 #define PDSC_TEARDOWN_RECOVERY	false
 #define PDSC_TEARDOWN_REMOVING	true
 #define PDSC_SETUP_RECOVERY	false
@@ -33,6 +37,28 @@ struct pdsc_devinfo {
 	char serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN + 1];
 };
 
+struct pdsc_queue {
+	struct pdsc_q_info *info;
+	u64 dbval;
+	u16 head_idx;
+	u16 tail_idx;
+	u8 hw_type;
+	unsigned int index;
+	unsigned int num_descs;
+	u64 dbell_count;
+	u64 features;
+	unsigned int type;
+	unsigned int hw_index;
+	union {
+		void *base;
+		struct pds_core_admin_cmd *adminq;
+	};
+	dma_addr_t base_pa;	/* must be page aligned */
+	unsigned int desc_size;
+	unsigned int pid;
+	char name[PDSC_QUEUE_NAME_MAX_SZ];
+};
+
 #define PDSC_INTR_NAME_MAX_SZ		32
 
 struct pdsc_intr_info {
@@ -42,6 +68,61 @@ struct pdsc_intr_info {
 	void *data;
 };
 
+struct pdsc_cq_info {
+	void *comp;
+};
+
+struct pdsc_buf_info {
+	struct page *page;
+	dma_addr_t dma_addr;
+	u32 page_offset;
+	u32 len;
+};
+
+struct pdsc_q_info {
+	union {
+		void *desc;
+		struct pdsc_admin_cmd *adminq_desc;
+	};
+	unsigned int bytes;
+	unsigned int nbufs;
+	struct pdsc_buf_info bufs[PDS_CORE_MAX_FRAGS];
+	struct pdsc_wait_context *wc;
+	void *dest;
+};
+
+struct pdsc_cq {
+	struct pdsc_cq_info *info;
+	struct pdsc_queue *bound_q;
+	struct pdsc_intr_info *bound_intr;
+	u16 tail_idx;
+	bool done_color;
+	unsigned int num_descs;
+	unsigned int desc_size;
+	void *base;
+	dma_addr_t base_pa;	/* must be page aligned */
+} ____cacheline_aligned_in_smp;
+
+struct pdsc_qcq {
+	struct pdsc *pdsc;
+	void *q_base;
+	dma_addr_t q_base_pa;	/* might not be page aligned */
+	void *cq_base;
+	dma_addr_t cq_base_pa;	/* might not be page aligned */
+	u32 q_size;
+	u32 cq_size;
+	bool armed;
+	unsigned int flags;
+
+	struct work_struct work;
+	struct pdsc_queue q;
+	struct pdsc_cq cq;
+	int intx;
+
+	u32 accum_work;
+	struct dentry *dentry;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -81,6 +162,7 @@ struct pdsc {
 	unsigned int devcmd_timeout;
 	struct mutex devcmd_lock;	/* lock for dev_cmd operations */
 	struct mutex config_lock;	/* lock for configuration operations */
+	spinlock_t adminq_lock;		/* lock for adminq operations */
 	struct pds_core_dev_info_regs __iomem *info_regs;
 	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
 	struct pds_core_intr __iomem *intr_ctrl;
@@ -88,11 +170,64 @@ struct pdsc {
 	u64 __iomem *db_pages;
 	dma_addr_t phy_db_pages;
 	u64 __iomem *kern_dbpage;
+
+	struct pdsc_qcq adminqcq;
+	struct pdsc_qcq notifyqcq;
+	u64 last_eid;
 };
 
+/** enum pds_core_dbell_bits - bitwise composition of dbell values.
+ *
+ * @PDS_CORE_DBELL_QID_MASK:	unshifted mask of valid queue id bits.
+ * @PDS_CORE_DBELL_QID_SHIFT:	queue id shift amount in dbell value.
+ * @PDS_CORE_DBELL_QID:		macro to build QID component of dbell value.
+ *
+ * @PDS_CORE_DBELL_RING_MASK:	unshifted mask of valid ring bits.
+ * @PDS_CORE_DBELL_RING_SHIFT:	ring shift amount in dbell value.
+ * @PDS_CORE_DBELL_RING:	macro to build ring component of dbell value.
+ *
+ * @PDS_CORE_DBELL_RING_0:	ring zero dbell component value.
+ * @PDS_CORE_DBELL_RING_1:	ring one dbell component value.
+ * @PDS_CORE_DBELL_RING_2:	ring two dbell component value.
+ * @PDS_CORE_DBELL_RING_3:	ring three dbell component value.
+ *
+ * @PDS_CORE_DBELL_INDEX_MASK:	bit mask of valid index bits, no shift needed.
+ */
+enum pds_core_dbell_bits {
+	PDS_CORE_DBELL_QID_MASK		= 0xffffff,
+	PDS_CORE_DBELL_QID_SHIFT		= 24,
+
+#define PDS_CORE_DBELL_QID(n) \
+	(((u64)(n) & PDS_CORE_DBELL_QID_MASK) << PDS_CORE_DBELL_QID_SHIFT)
+
+	PDS_CORE_DBELL_RING_MASK		= 0x7,
+	PDS_CORE_DBELL_RING_SHIFT		= 16,
+
+#define PDS_CORE_DBELL_RING(n) \
+	(((u64)(n) & PDS_CORE_DBELL_RING_MASK) << PDS_CORE_DBELL_RING_SHIFT)
+
+	PDS_CORE_DBELL_RING_0		= 0,
+	PDS_CORE_DBELL_RING_1		= PDS_CORE_DBELL_RING(1),
+	PDS_CORE_DBELL_RING_2		= PDS_CORE_DBELL_RING(2),
+	PDS_CORE_DBELL_RING_3		= PDS_CORE_DBELL_RING(3),
+
+	PDS_CORE_DBELL_INDEX_MASK		= 0xffff,
+};
+
+static inline void pds_core_dbell_ring(u64 __iomem *db_page,
+				       enum pds_core_logical_qtype qtype,
+				       u64 val)
+{
+	writeq(val, &db_page[qtype]);
+}
+
 int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 			      struct devlink_fmsg *fmsg,
 			      struct netlink_ext_ack *extack);
+int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
+		     struct netlink_ext_ack *extack);
+
+void __iomem *pdsc_map_dbpage(struct pdsc *pdsc, int page_num);
 
 void pdsc_debugfs_create(void);
 void pdsc_debugfs_destroy(void);
@@ -100,6 +235,8 @@ void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
 void pdsc_debugfs_add_ident(struct pdsc *pdsc);
 void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
+void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq);
+void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq);
 
 int pdsc_err_to_errno(enum pds_core_status_code code);
 bool pdsc_is_fw_running(struct pdsc *pdsc);
@@ -113,8 +250,22 @@ int pdsc_devcmd_reset(struct pdsc *pdsc);
 int pdsc_dev_reinit(struct pdsc *pdsc);
 int pdsc_dev_init(struct pdsc *pdsc);
 
+int pdsc_intr_alloc(struct pdsc *pdsc, char *name,
+		    irq_handler_t handler, void *data);
+void pdsc_intr_free(struct pdsc *pdsc, int index);
+void pdsc_qcq_free(struct pdsc *pdsc, struct pdsc_qcq *qcq);
+int pdsc_qcq_alloc(struct pdsc *pdsc, unsigned int type, unsigned int index,
+		   const char *name, unsigned int flags, unsigned int num_descs,
+		   unsigned int desc_size, unsigned int cq_desc_size,
+		   unsigned int pid, struct pdsc_qcq *qcq);
 int pdsc_setup(struct pdsc *pdsc, bool init);
 void pdsc_teardown(struct pdsc *pdsc, bool removing);
+int pdsc_start(struct pdsc *pdsc);
+void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+void pdsc_process_adminq(struct pdsc_qcq *qcq);
+void pdsc_work_thread(struct work_struct *work);
+irqreturn_t pdsc_adminq_isr(int irq, void *data);
+
 #endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index 601431b41abb..b83e5016644b 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -67,3 +67,80 @@ void pdsc_debugfs_add_ident(struct pdsc *pdsc)
 	debugfs_create_file("identity", 0400, pdsc->dentry,
 			    pdsc, &identity_fops);
 }
+
+static const struct debugfs_reg32 intr_ctrl_regs[] = {
+	{ .name = "coal_init", .offset = 0, },
+	{ .name = "mask", .offset = 4, },
+	{ .name = "credits", .offset = 8, },
+	{ .name = "mask_on_assert", .offset = 12, },
+	{ .name = "coal_timer", .offset = 16, },
+};
+
+void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	struct dentry *qcq_dentry, *q_dentry, *cq_dentry;
+	struct dentry *intr_dentry;
+	struct debugfs_regset32 *intr_ctrl_regset;
+	struct pdsc_intr_info *intr = &pdsc->intr_info[qcq->intx];
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc_cq *cq = &qcq->cq;
+
+	qcq_dentry = debugfs_create_dir(q->name, pdsc->dentry);
+	if (IS_ERR_OR_NULL(qcq_dentry))
+		return;
+	qcq->dentry = qcq_dentry;
+
+	debugfs_create_x64("q_base_pa", 0400, qcq_dentry, &qcq->q_base_pa);
+	debugfs_create_x32("q_size", 0400, qcq_dentry, &qcq->q_size);
+	debugfs_create_x64("cq_base_pa", 0400, qcq_dentry, &qcq->cq_base_pa);
+	debugfs_create_x32("cq_size", 0400, qcq_dentry, &qcq->cq_size);
+	debugfs_create_x32("accum_work", 0400, qcq_dentry, &qcq->accum_work);
+
+	q_dentry = debugfs_create_dir("q", qcq->dentry);
+	if (IS_ERR_OR_NULL(q_dentry))
+		return;
+
+	debugfs_create_u32("index", 0400, q_dentry, &q->index);
+	debugfs_create_u32("num_descs", 0400, q_dentry, &q->num_descs);
+	debugfs_create_u32("desc_size", 0400, q_dentry, &q->desc_size);
+	debugfs_create_u32("pid", 0400, q_dentry, &q->pid);
+
+	debugfs_create_u16("tail", 0400, q_dentry, &q->tail_idx);
+	debugfs_create_u16("head", 0400, q_dentry, &q->head_idx);
+
+	cq_dentry = debugfs_create_dir("cq", qcq->dentry);
+	if (IS_ERR_OR_NULL(cq_dentry))
+		return;
+
+	debugfs_create_x64("base_pa", 0400, cq_dentry, &cq->base_pa);
+	debugfs_create_u32("num_descs", 0400, cq_dentry, &cq->num_descs);
+	debugfs_create_u32("desc_size", 0400, cq_dentry, &cq->desc_size);
+	debugfs_create_bool("done_color", 0400, cq_dentry, &cq->done_color);
+	debugfs_create_u16("tail", 0400, cq_dentry, &cq->tail_idx);
+
+	if (qcq->flags & PDS_CORE_QCQ_F_INTR) {
+		intr_dentry = debugfs_create_dir("intr", qcq->dentry);
+		if (IS_ERR_OR_NULL(intr_dentry))
+			return;
+
+		debugfs_create_u32("index", 0400, intr_dentry, &intr->index);
+		debugfs_create_u32("vector", 0400, intr_dentry, &intr->vector);
+
+		intr_ctrl_regset = kzalloc(sizeof(*intr_ctrl_regset),
+					   GFP_KERNEL);
+		if (!intr_ctrl_regset)
+			return;
+		intr_ctrl_regset->regs = intr_ctrl_regs;
+		intr_ctrl_regset->nregs = ARRAY_SIZE(intr_ctrl_regs);
+		intr_ctrl_regset->base = &pdsc->intr_ctrl[intr->index];
+
+		debugfs_create_regset32("intr_ctrl", 0400, intr_dentry,
+					intr_ctrl_regset);
+	}
+};
+
+void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq)
+{
+	debugfs_remove_recursive(qcq->dentry);
+	qcq->dentry = NULL;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c
index 3b05b1af65d1..89f23d620d1d 100644
--- a/drivers/net/ethernet/amd/pds_core/devlink.c
+++ b/drivers/net/ethernet/amd/pds_core/devlink.c
@@ -3,6 +3,67 @@
 
 #include "core.h"
 
+static char *fw_slotnames[] = {
+	"fw.goldfw",
+	"fw.mainfwa",
+	"fw.mainfwb",
+};
+
+int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
+		     struct netlink_ext_ack *extack)
+{
+	union pds_core_dev_cmd cmd = {
+		.fw_control.opcode = PDS_CORE_CMD_FW_CONTROL,
+		.fw_control.oper = PDS_CORE_FW_GET_LIST,
+	};
+	struct pds_core_fw_list_info fw_list;
+	struct pdsc *pdsc = devlink_priv(dl);
+	union pds_core_dev_comp comp;
+	char buf[16];
+	int listlen;
+	int err;
+	int i;
+
+	mutex_lock(&pdsc->devcmd_lock);
+	err = pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout * 2);
+	memcpy_fromio(&fw_list, pdsc->cmd_regs->data, sizeof(fw_list));
+	mutex_unlock(&pdsc->devcmd_lock);
+	if (err && err != -EIO)
+		return err;
+
+	listlen = fw_list.num_fw_slots;
+	for (i = 0; i < listlen; i++) {
+		if (i < ARRAY_SIZE(fw_slotnames))
+			strscpy(buf, fw_slotnames[i], sizeof(buf));
+		else
+			snprintf(buf, sizeof(buf), "fw.slot_%d", i);
+		err = devlink_info_version_stored_put(req, buf,
+						      fw_list.fw_names[i].fw_version);
+	}
+
+	err = devlink_info_version_running_put(req,
+					       DEVLINK_INFO_VERSION_GENERIC_FW,
+					       pdsc->dev_info.fw_version);
+	if (err)
+		return err;
+
+	snprintf(buf, sizeof(buf), "0x%x", pdsc->dev_info.asic_type);
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_ASIC_ID,
+					     buf);
+	if (err)
+		return err;
+
+	snprintf(buf, sizeof(buf), "0x%x", pdsc->dev_info.asic_rev);
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_ASIC_REV,
+					     buf);
+	if (err)
+		return err;
+
+	return devlink_info_serial_number_put(req, pdsc->dev_info.serial_num);
+}
+
 int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 			      struct devlink_fmsg *fmsg,
 			      struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index 54f3aed7adb1..eaff311d4a10 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -125,6 +125,13 @@ err_out:
 	return err;
 }
 
+void __iomem *pdsc_map_dbpage(struct pdsc *pdsc, int page_num)
+{
+	return pci_iomap_range(pdsc->pdev,
+			       pdsc->bars[PDS_CORE_PCI_BAR_DBELL].res_index,
+			       (u64)page_num << PAGE_SHIFT, PAGE_SIZE);
+}
+
 static int pdsc_init_vf(struct pdsc *vf)
 {
 	return -1;
@@ -166,6 +173,7 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	mutex_init(&pdsc->devcmd_lock);
 	mutex_init(&pdsc->config_lock);
+	spin_lock_init(&pdsc->adminq_lock);
 
 	mutex_lock(&pdsc->config_lock);
 	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
@@ -173,6 +181,9 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 	err = pdsc_setup(pdsc, PDSC_SETUP_INIT);
 	if (err)
 		goto err_out_unmap_bars;
+	err = pdsc_start(pdsc);
+	if (err)
+		goto err_out_teardown;
 
 	mutex_unlock(&pdsc->config_lock);
 
@@ -184,7 +195,7 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 		dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr);
 		err = PTR_ERR(hr);
 		devl_unlock(dl);
-		goto err_out_teardown;
+		goto err_out_stop;
 	}
 	pdsc->fw_reporter = hr;
 
@@ -196,6 +207,8 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	return 0;
 
+err_out_stop:
+	pdsc_stop(pdsc);
 err_out_teardown:
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
 err_out_unmap_bars:
@@ -214,6 +227,7 @@ err_out_release_regions:
 }
 
 static const struct devlink_ops pdsc_dl_ops = {
+	.info_get	= pdsc_dl_info_get,
 };
 
 static const struct devlink_ops pdsc_dl_vf_ops = {
@@ -315,6 +329,7 @@ static void pdsc_remove(struct pci_dev *pdev)
 		mutex_lock(&pdsc->config_lock);
 		set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state);
 
+		pdsc_stop(pdsc);
 		pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
 		mutex_unlock(&pdsc->config_lock);
 		mutex_destroy(&pdsc->config_lock);
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
new file mode 100644
index 000000000000..dd5fbe3ee141
--- /dev/null
+++ b/include/linux/pds/pds_adminq.h
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDS_CORE_ADMINQ_H_
+#define _PDS_CORE_ADMINQ_H_
+
+enum pds_core_adminq_flags {
+	PDS_AQ_FLAG_FASTPOLL	= BIT(1),	/* completion poll at 1ms */
+};
+
+/*
+ * enum pds_core_adminq_opcode - AdminQ command opcodes
+ * These commands are only processed on AdminQ, not available in devcmd
+ */
+enum pds_core_adminq_opcode {
+	PDS_AQ_CMD_NOP			= 0,
+
+	/* Client control */
+	PDS_AQ_CMD_CLIENT_REG		= 6,
+	PDS_AQ_CMD_CLIENT_UNREG		= 7,
+	PDS_AQ_CMD_CLIENT_CMD		= 8,
+
+	/* LIF commands */
+	PDS_AQ_CMD_LIF_IDENTIFY		= 20,
+	PDS_AQ_CMD_LIF_INIT		= 21,
+	PDS_AQ_CMD_LIF_RESET		= 22,
+	PDS_AQ_CMD_LIF_GETATTR		= 23,
+	PDS_AQ_CMD_LIF_SETATTR		= 24,
+	PDS_AQ_CMD_LIF_SETPHC		= 25,
+
+	PDS_AQ_CMD_RX_MODE_SET		= 30,
+	PDS_AQ_CMD_RX_FILTER_ADD	= 31,
+	PDS_AQ_CMD_RX_FILTER_DEL	= 32,
+
+	/* Queue commands */
+	PDS_AQ_CMD_Q_IDENTIFY		= 39,
+	PDS_AQ_CMD_Q_INIT		= 40,
+	PDS_AQ_CMD_Q_CONTROL		= 41,
+
+	/* SR/IOV commands */
+	PDS_AQ_CMD_VF_GETATTR		= 60,
+	PDS_AQ_CMD_VF_SETATTR		= 61,
+};
+
+/*
+ * enum pds_core_notifyq_opcode - NotifyQ event codes
+ */
+enum pds_core_notifyq_opcode {
+	PDS_EVENT_LINK_CHANGE		= 1,
+	PDS_EVENT_RESET			= 2,
+	PDS_EVENT_XCVR			= 5,
+	PDS_EVENT_CLIENT		= 6,
+};
+
+#define PDS_COMP_COLOR_MASK  0x80
+
+/**
+ * struct pds_core_notifyq_event - Generic event reporting structure
+ * @eid:   event number
+ * @ecode: event code
+ *
+ * This is the generic event report struct from which the other
+ * actual events will be formed.
+ */
+struct pds_core_notifyq_event {
+	__le64 eid;
+	__le16 ecode;
+};
+
+/**
+ * struct pds_core_link_change_event - Link change event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_LINK_CHANGE
+ * @link_status:	link up/down, with error bits
+ * @link_speed:		speed of the network link
+ *
+ * Sent when the network link state changes between UP and DOWN
+ */
+struct pds_core_link_change_event {
+	__le64 eid;
+	__le16 ecode;
+	__le16 link_status;
+	__le32 link_speed;	/* units of 1Mbps: e.g. 10000 = 10Gbps */
+};
+
+/**
+ * struct pds_core_reset_event - Reset event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_RESET
+ * @reset_code:		reset type
+ * @state:		0=pending, 1=complete, 2=error
+ *
+ * Sent when the NIC or some subsystem is going to be or
+ * has been reset.
+ */
+struct pds_core_reset_event {
+	__le64 eid;
+	__le16 ecode;
+	u8     reset_code;
+	u8     state;
+};
+
+/**
+ * struct pds_core_client_event - Client event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_CLIENT
+ * @client_id:          client to sent event to
+ * @client_event:       wrapped event struct for the client
+ *
+ * Sent when an event needs to be passed on to a client
+ */
+struct pds_core_client_event {
+	__le64 eid;
+	__le16 ecode;
+	__le16 client_id;
+	u8     client_event[54];
+};
+
+/**
+ * struct pds_core_notifyq_cmd - Placeholder for building qcq
+ * @data:      anonymous field for building the qcq
+ */
+struct pds_core_notifyq_cmd {
+	__le32 data;	/* Not used but needed for qcq structure */
+};
+
+/*
+ * union pds_core_notifyq_comp - Overlay of notifyq event structures
+ */
+union pds_core_notifyq_comp {
+	struct {
+		__le64 eid;
+		__le16 ecode;
+	};
+	struct pds_core_notifyq_event     event;
+	struct pds_core_link_change_event link_change;
+	struct pds_core_reset_event       reset;
+	u8     data[64];
+};
+
+#define PDS_DEVNAME_LEN		32
+/**
+ * struct pds_core_client_reg_cmd - Register a new client with DSC
+ * @opcode:         opcode PDS_AQ_CMD_CLIENT_REG
+ * @rsvd:           word boundary padding
+ * @devname:        text name of client device
+ * @vif_type:       what type of device (enum pds_core_vif_types)
+ *
+ * Tell the DSC of the new client, and receive a client_id from DSC.
+ */
+struct pds_core_client_reg_cmd {
+	u8     opcode;
+	u8     rsvd[3];
+	char   devname[PDS_DEVNAME_LEN];
+	u8     vif_type;
+};
+
+/**
+ * struct pds_core_client_reg_comp - Client registration completion
+ * @status:     Status of the command (enum pdc_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @client_id:  New id assigned by DSC
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_client_reg_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	__le16 client_id;
+	u8     rsvd1[9];
+	u8     color;
+};
+
+/**
+ * struct pds_core_client_unreg_cmd - Unregister a client from DSC
+ * @opcode:     opcode PDS_AQ_CMD_CLIENT_UNREG
+ * @rsvd:       word boundary padding
+ * @client_id:  id of client being removed
+ *
+ * Tell the DSC this client is going away and remove its context
+ * This uses the generic completion.
+ */
+struct pds_core_client_unreg_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+};
+
+/**
+ * struct pds_core_client_request_cmd - Pass along a wrapped client AdminQ cmd
+ * @opcode:     opcode PDS_AQ_CMD_CLIENT_CMD
+ * @rsvd:       word boundary padding
+ * @client_id:  id of client being removed
+ * @client_cmd: the wrapped client command
+ *
+ * Proxy post an adminq command for the client.
+ * This uses the generic completion.
+ */
+struct pds_core_client_request_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+	u8     client_cmd[60];
+};
+
+#define PDS_CORE_MAX_FRAGS		16
+
+#define PDS_CORE_QCQ_F_INITED		BIT(0)
+#define PDS_CORE_QCQ_F_SG		BIT(1)
+#define PDS_CORE_QCQ_F_INTR		BIT(2)
+#define PDS_CORE_QCQ_F_TX_STATS		BIT(3)
+#define PDS_CORE_QCQ_F_RX_STATS		BIT(4)
+#define PDS_CORE_QCQ_F_NOTIFYQ		BIT(5)
+#define PDS_CORE_QCQ_F_CMB_RINGS	BIT(6)
+#define PDS_CORE_QCQ_F_CORE		BIT(7)
+
+enum pds_core_lif_type {
+	PDS_CORE_LIF_TYPE_DEFAULT = 0,
+};
+
+/**
+ * union pds_core_lif_config - LIF configuration
+ * @state:	    LIF state (enum pds_core_lif_state)
+ * @rsvd:           Word boundary padding
+ * @name:	    LIF name
+ * @rsvd2:          Word boundary padding
+ * @features:	    LIF features active (enum pds_core_hw_features)
+ * @queue_count:    Queue counts per queue-type
+ * @words:          Full union buffer size
+ */
+union pds_core_lif_config {
+	struct {
+		u8     state;
+		u8     rsvd[3];
+		char   name[PDS_CORE_IFNAMSIZ];
+		u8     rsvd2[12];
+		__le64 features;
+		__le32 queue_count[PDS_CORE_QTYPE_MAX];
+	} __packed;
+	__le32 words[64];
+};
+
+/**
+ * struct pds_core_lif_status - LIF status register
+ * @eid:	     most recent NotifyQ event id
+ * @rsvd:            full struct size
+ */
+struct pds_core_lif_status {
+	__le64 eid;
+	u8     rsvd[56];
+};
+
+/**
+ * struct pds_core_lif_info - LIF info structure
+ * @config:	LIF configuration structure
+ * @status:	LIF status structure
+ */
+struct pds_core_lif_info {
+	union pds_core_lif_config config;
+	struct pds_core_lif_status status;
+};
+
+/**
+ * struct pds_core_lif_identity - LIF identity information (type-specific)
+ * @features:		LIF features (see enum pds_core_hw_features)
+ * @version:		Identify structure version
+ * @hw_index:		LIF hardware index
+ * @rsvd:		Word boundary padding
+ * @max_nb_sessions:	Maximum number of sessions supported
+ * @rsvd2:		buffer padding
+ * @config:		LIF config struct with features, q counts
+ */
+struct pds_core_lif_identity {
+	__le64 features;
+	u8     version;
+	u8     hw_index;
+	u8     rsvd[2];
+	__le32 max_nb_sessions;
+	u8     rsvd2[120];
+	union pds_core_lif_config config;
+};
+
+/**
+ * struct pds_core_lif_identify_cmd - Get LIF identity info command
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_IDENTIFY
+ * @type:	LIF type (enum pds_core_lif_type)
+ * @client_id:	Client identifier
+ * @ver:	Version of identify returned by device
+ * @rsvd:       Word boundary padding
+ * @ident_pa:	DMA address to receive identity info
+ *
+ * Firmware will copy LIF identity data (struct pds_core_lif_identity)
+ * into the buffer address given.
+ */
+struct pds_core_lif_identify_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le64 ident_pa;
+};
+
+/**
+ * struct pds_core_lif_identify_comp - LIF identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @ver:	Version of identify returned by device
+ * @bytes:	Bytes copied into the buffer
+ * @rsvd:       Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_lif_identify_comp {
+	u8     status;
+	u8     ver;
+	__le16 bytes;
+	u8     rsvd[11];
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_init_cmd - LIF init command
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_INIT
+ * @type:	LIF type (enum pds_core_lif_type)
+ * @client_id:	Client identifier
+ * @rsvd:       Word boundary padding
+ * @info_pa:	Destination address for LIF info (struct pds_core_lif_info)
+ */
+struct pds_core_lif_init_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	__le32 rsvd;
+	__le64 info_pa;
+};
+
+/**
+ * struct pds_core_lif_init_comp - LIF init command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @hw_index:	Hardware index of the initialized LIF
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_lif_init_comp {
+	u8 status;
+	u8 rsvd;
+	__le16 hw_index;
+	u8     rsvd1[11];
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_reset_cmd - LIF reset command
+ * Will reset only the specified LIF.
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_RESET
+ * @rsvd:       Word boundary padding
+ * @client_id:	Client identifier
+ */
+struct pds_core_lif_reset_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+};
+
+/**
+ * enum pds_core_lif_attr - List of LIF attributes
+ * @PDS_CORE_LIF_ATTR_STATE:		LIF state attribute
+ * @PDS_CORE_LIF_ATTR_NAME:		LIF name attribute
+ * @PDS_CORE_LIF_ATTR_FEATURES:		LIF features attribute
+ * @PDS_CORE_LIF_ATTR_STATS_CTRL:	LIF statistics control attribute
+ */
+enum pds_core_lif_attr {
+	PDS_CORE_LIF_ATTR_STATE		= 0,
+	PDS_CORE_LIF_ATTR_NAME		= 1,
+	PDS_CORE_LIF_ATTR_FEATURES	= 4,
+	PDS_CORE_LIF_ATTR_STATS_CTRL	= 6,
+};
+
+/**
+ * struct pds_core_lif_setattr_cmd - Set LIF attributes on the NIC
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_SETATTR
+ * @attr:	Attribute type (enum pds_core_lif_attr)
+ * @client_id:	Client identifier
+ * @state:	LIF state (enum pds_core_lif_state)
+ * @name:	The name string, 0 terminated
+ * @features:	Features (enum pds_core_hw_features)
+ * @stats_ctl:	Stats control commands (enum pds_core_stats_ctl_cmd)
+ * @rsvd:       Command Buffer padding
+ */
+struct pds_core_lif_setattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 client_id;
+	union {
+		u8      state;
+		char    name[PDS_CORE_IFNAMSIZ];
+		__le64  features;
+		u8      stats_ctl;
+		u8      rsvd[60];
+	} __packed;
+};
+
+/**
+ * struct pds_core_lif_setattr_comp - LIF set attr command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @features:	Features (enum pds_core_hw_features)
+ * @rsvd2:      Word boundary padding
+ * @color:	Color bit
+ */
+struct pds_core_lif_setattr_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		__le64  features;
+		u8      rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_getattr_cmd - Get LIF attributes from the NIC
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_GETATTR
+ * @attr:	Attribute type (enum pds_core_lif_attr)
+ * @client_id:	Client identifier
+ */
+struct pds_core_lif_getattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 client_id;
+};
+
+/**
+ * struct pds_core_lif_getattr_comp - LIF get attr command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @state:	LIF state (enum pds_core_lif_state)
+ * @name:	LIF name string, 0 terminated
+ * @features:	Features (enum pds_core_hw_features)
+ * @rsvd2:      Word boundary padding
+ * @color:	Color bit
+ */
+struct pds_core_lif_getattr_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		u8      state;
+		__le64  features;
+		u8      rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * union pds_core_q_identity - Queue identity information
+ * @version:	Queue type version that can be used with FW
+ * @supported:	Bitfield of queue versions, first bit = ver 0
+ * @rsvd:       Word boundary padding
+ * @features:	Queue features
+ * @desc_sz:	Descriptor size
+ * @comp_sz:	Completion descriptor size
+ * @rsvd2:      Word boundary padding
+ */
+struct pds_core_q_identity {
+	u8      version;
+	u8      supported;
+	u8      rsvd[6];
+#define PDS_CORE_QIDENT_F_CQ	0x01	/* queue has completion ring */
+	__le64  features;
+	__le16  desc_sz;
+	__le16  comp_sz;
+	u8      rsvd2[6];
+};
+
+/**
+ * struct pds_core_q_identify_cmd - queue identify command
+ * @opcode:	Opcode PDS_AQ_CMD_Q_IDENTIFY
+ * @type:	Logical queue type (enum pds_core_logical_qtype)
+ * @client_id:	Client identifier
+ * @ver:	Highest queue type version that the driver supports
+ * @rsvd:       Word boundary padding
+ * @ident_pa:   DMA address to receive the data (struct pds_core_q_identity)
+ */
+struct pds_core_q_identify_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le64 ident_pa;
+};
+
+/**
+ * struct pds_core_q_identify_comp - queue identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index:	Index in the descriptor ring for which this is the completion
+ * @ver:	Queue type version that can be used with FW
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_q_identify_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	u8     ver;
+	u8     rsvd1[10];
+	u8     color;
+};
+
+/**
+ * struct pds_core_q_init_cmd - Queue init command
+ * @opcode:	  Opcode PDS_AQ_CMD_Q_INIT
+ * @type:	  Logical queue type
+ * @client_id:	  Client identifier
+ * @ver:	  Queue type version
+ * @rsvd:         Word boundary padding
+ * @index:	  (LIF, qtype) relative admin queue index
+ * @intr_index:	  Interrupt control register index, or Event queue index
+ * @pid:	  Process ID
+ * @flags:
+ *    IRQ:	  Interrupt requested on completion
+ *    ENA:	  Enable the queue.  If ENA=0 the queue is initialized
+ *		  but remains disabled, to be later enabled with the
+ *		  Queue Enable command. If ENA=1, then queue is
+ *		  initialized and then enabled.
+ * @cos:	  Class of service for this queue
+ * @ring_size:	  Queue ring size, encoded as a log2(size), in
+ *		  number of descriptors.  The actual ring size is
+ *		  (1 << ring_size).  For example, to select a ring size
+ *		  of 64 descriptors write ring_size = 6. The minimum
+ *		  ring_size value is 2 for a ring of 4 descriptors.
+ *		  The maximum ring_size value is 12 for a ring of 4k
+ *		  descriptors. Values of ring_size <2 and >12 are
+ *		  reserved.
+ * @ring_base:	  Queue ring base address
+ * @cq_ring_base: Completion queue ring base address
+ */
+struct pds_core_q_init_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le32 index;
+	__le16 pid;
+	__le16 intr_index;
+	__le16 flags;
+#define PDS_CORE_QINIT_F_IRQ	0x01	/* Request interrupt on completion */
+#define PDS_CORE_QINIT_F_ENA	0x02	/* Enable the queue */
+	u8     cos;
+#define PDS_CORE_QSIZE_MIN_LG2	2
+#define PDS_CORE_QSIZE_MAX_LG2	12
+	u8     ring_size;
+	__le64 ring_base;
+	__le64 cq_ring_base;
+} __packed;
+
+/**
+ * struct pds_core_q_init_comp - Queue init command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index:	Index in the descriptor ring for which this is the completion
+ * @hw_index:	Hardware Queue ID
+ * @hw_type:	Hardware Queue type
+ * @rsvd2:      Word boundary padding
+ * @color:	Color
+ */
+struct pds_core_q_init_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	__le32 hw_index;
+	u8     hw_type;
+	u8     rsvd2[6];
+	u8     color;
+};
+
+union pds_core_adminq_cmd {
+	u8     opcode;
+	u8     bytes[64];
+
+	struct pds_core_client_reg_cmd     client_reg;
+	struct pds_core_client_unreg_cmd   client_unreg;
+	struct pds_core_client_request_cmd client_request;
+
+	struct pds_core_lif_identify_cmd  lif_ident;
+	struct pds_core_lif_init_cmd      lif_init;
+	struct pds_core_lif_reset_cmd     lif_reset;
+	struct pds_core_lif_setattr_cmd   lif_setattr;
+	struct pds_core_lif_getattr_cmd   lif_getattr;
+
+	struct pds_core_q_identify_cmd    q_ident;
+	struct pds_core_q_init_cmd        q_init;
+};
+
+union pds_core_adminq_comp {
+	struct {
+		u8     status;
+		u8     rsvd;
+		__le16 comp_index;
+		u8     rsvd2[11];
+		u8     color;
+	};
+	u32    words[4];
+
+	struct pds_core_client_reg_comp   client_reg;
+
+	struct pds_core_lif_identify_comp lif_ident;
+	struct pds_core_lif_init_comp     lif_init;
+	struct pds_core_lif_setattr_comp  lif_setattr;
+	struct pds_core_lif_getattr_comp  lif_getattr;
+
+	struct pds_core_q_identify_comp   q_ident;
+	struct pds_core_q_init_comp       q_init;
+};
+
+#ifndef __CHECKER__
+static_assert(sizeof(union pds_core_adminq_cmd) == 64);
+static_assert(sizeof(union pds_core_adminq_comp) == 16);
+static_assert(sizeof(union pds_core_notifyq_comp) == 64);
+#endif /* __CHECKER__ */
+
+/* The color bit is a 'done' bit for the completion descriptors
+ * where the meaning alternates between '1' and '0' for alternating
+ * passes through the completion descriptor ring.
+ */
+static inline u8 pdsc_color_match(u8 color, u8 done_color)
+{
+	return (!!(color & PDS_COMP_COLOR_MASK)) == done_color;
+}
+#endif /* _PDS_CORE_ADMINQ_H_ */
-- 
cgit v1.2.3


From 01ba61b55b2041a39c54aefb3153c770dd59a0ef Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:19 -0700
Subject: pds_core: Add adminq processing and commands

Add the service routines for submitting and processing
the adminq messages and for handling notifyq events.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile |   1 +
 drivers/net/ethernet/amd/pds_core/adminq.c | 288 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.c   |  11 --
 include/linux/pds/pds_adminq.h             |  11 +-
 4 files changed, 299 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/adminq.c

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index eaca8557ba66..ef76dcd7fccd 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_PDS_CORE) := pds_core.o
 pds_core-y := main.o \
 	      devlink.o \
 	      dev.o \
+	      adminq.o \
 	      core.o
 
 pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c
new file mode 100644
index 000000000000..fb2ba3f62480
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/adminq.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/dynamic_debug.h>
+
+#include "core.h"
+
+struct pdsc_wait_context {
+	struct pdsc_qcq *qcq;
+	struct completion wait_completion;
+};
+
+static int pdsc_process_notifyq(struct pdsc_qcq *qcq)
+{
+	union pds_core_notifyq_comp *comp;
+	struct pdsc *pdsc = qcq->pdsc;
+	struct pdsc_cq *cq = &qcq->cq;
+	struct pdsc_cq_info *cq_info;
+	int nq_work = 0;
+	u64 eid;
+
+	cq_info = &cq->info[cq->tail_idx];
+	comp = cq_info->comp;
+	eid = le64_to_cpu(comp->event.eid);
+	while (eid > pdsc->last_eid) {
+		u16 ecode = le16_to_cpu(comp->event.ecode);
+
+		switch (ecode) {
+		case PDS_EVENT_LINK_CHANGE:
+			dev_info(pdsc->dev, "NotifyQ LINK_CHANGE ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		case PDS_EVENT_RESET:
+			dev_info(pdsc->dev, "NotifyQ RESET ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		case PDS_EVENT_XCVR:
+			dev_info(pdsc->dev, "NotifyQ XCVR ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		default:
+			dev_info(pdsc->dev, "NotifyQ ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+		}
+
+		pdsc->last_eid = eid;
+		cq->tail_idx = (cq->tail_idx + 1) & (cq->num_descs - 1);
+		cq_info = &cq->info[cq->tail_idx];
+		comp = cq_info->comp;
+		eid = le64_to_cpu(comp->event.eid);
+
+		nq_work++;
+	}
+
+	qcq->accum_work += nq_work;
+
+	return nq_work;
+}
+
+void pdsc_process_adminq(struct pdsc_qcq *qcq)
+{
+	union pds_core_adminq_comp *comp;
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc *pdsc = qcq->pdsc;
+	struct pdsc_cq *cq = &qcq->cq;
+	struct pdsc_q_info *q_info;
+	unsigned long irqflags;
+	int nq_work = 0;
+	int aq_work = 0;
+	int credits;
+
+	/* Don't process AdminQ when shutting down */
+	if (pdsc->state & BIT_ULL(PDSC_S_STOPPING_DRIVER)) {
+		dev_err(pdsc->dev, "%s: called while PDSC_S_STOPPING_DRIVER\n",
+			__func__);
+		return;
+	}
+
+	/* Check for NotifyQ event */
+	nq_work = pdsc_process_notifyq(&pdsc->notifyqcq);
+
+	/* Check for empty queue, which can happen if the interrupt was
+	 * for a NotifyQ event and there are no new AdminQ completions.
+	 */
+	if (q->tail_idx == q->head_idx)
+		goto credits;
+
+	/* Find the first completion to clean,
+	 * run the callback in the related q_info,
+	 * and continue while we still match done color
+	 */
+	spin_lock_irqsave(&pdsc->adminq_lock, irqflags);
+	comp = cq->info[cq->tail_idx].comp;
+	while (pdsc_color_match(comp->color, cq->done_color)) {
+		q_info = &q->info[q->tail_idx];
+		q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
+
+		/* Copy out the completion data */
+		memcpy(q_info->dest, comp, sizeof(*comp));
+
+		complete_all(&q_info->wc->wait_completion);
+
+		if (cq->tail_idx == cq->num_descs - 1)
+			cq->done_color = !cq->done_color;
+		cq->tail_idx = (cq->tail_idx + 1) & (cq->num_descs - 1);
+		comp = cq->info[cq->tail_idx].comp;
+
+		aq_work++;
+	}
+	spin_unlock_irqrestore(&pdsc->adminq_lock, irqflags);
+
+	qcq->accum_work += aq_work;
+
+credits:
+	/* Return the interrupt credits, one for each completion */
+	credits = nq_work + aq_work;
+	if (credits)
+		pds_core_intr_credits(&pdsc->intr_ctrl[qcq->intx],
+				      credits,
+				      PDS_CORE_INTR_CRED_REARM);
+}
+
+void pdsc_work_thread(struct work_struct *work)
+{
+	struct pdsc_qcq *qcq = container_of(work, struct pdsc_qcq, work);
+
+	pdsc_process_adminq(qcq);
+}
+
+irqreturn_t pdsc_adminq_isr(int irq, void *data)
+{
+	struct pdsc_qcq *qcq = data;
+	struct pdsc *pdsc = qcq->pdsc;
+
+	/* Don't process AdminQ when shutting down */
+	if (pdsc->state & BIT_ULL(PDSC_S_STOPPING_DRIVER)) {
+		dev_err(pdsc->dev, "%s: called while PDSC_S_STOPPING_DRIVER\n",
+			__func__);
+		return IRQ_HANDLED;
+	}
+
+	queue_work(pdsc->wq, &qcq->work);
+	pds_core_intr_mask(&pdsc->intr_ctrl[irq], PDS_CORE_INTR_MASK_CLEAR);
+
+	return IRQ_HANDLED;
+}
+
+static int __pdsc_adminq_post(struct pdsc *pdsc,
+			      struct pdsc_qcq *qcq,
+			      union pds_core_adminq_cmd *cmd,
+			      union pds_core_adminq_comp *comp,
+			      struct pdsc_wait_context *wc)
+{
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc_q_info *q_info;
+	unsigned long irqflags;
+	unsigned int avail;
+	int index;
+	int ret;
+
+	spin_lock_irqsave(&pdsc->adminq_lock, irqflags);
+
+	/* Check for space in the queue */
+	avail = q->tail_idx;
+	if (q->head_idx >= avail)
+		avail += q->num_descs - q->head_idx - 1;
+	else
+		avail -= q->head_idx + 1;
+	if (!avail) {
+		ret = -ENOSPC;
+		goto err_out_unlock;
+	}
+
+	/* Check that the FW is running */
+	if (!pdsc_is_fw_running(pdsc)) {
+		u8 fw_status = ioread8(&pdsc->info_regs->fw_status);
+
+		dev_info(pdsc->dev, "%s: post failed - fw not running %#02x:\n",
+			 __func__, fw_status);
+		ret = -ENXIO;
+
+		goto err_out_unlock;
+	}
+
+	/* Post the request */
+	index = q->head_idx;
+	q_info = &q->info[index];
+	q_info->wc = wc;
+	q_info->dest = comp;
+	memcpy(q_info->desc, cmd, sizeof(*cmd));
+
+	dev_dbg(pdsc->dev, "head_idx %d tail_idx %d\n",
+		q->head_idx, q->tail_idx);
+	dev_dbg(pdsc->dev, "post admin queue command:\n");
+	dynamic_hex_dump("cmd ", DUMP_PREFIX_OFFSET, 16, 1,
+			 cmd, sizeof(*cmd), true);
+
+	q->head_idx = (q->head_idx + 1) & (q->num_descs - 1);
+
+	pds_core_dbell_ring(pdsc->kern_dbpage,
+			    q->hw_type, q->dbval | q->head_idx);
+	ret = index;
+
+err_out_unlock:
+	spin_unlock_irqrestore(&pdsc->adminq_lock, irqflags);
+	return ret;
+}
+
+int pdsc_adminq_post(struct pdsc *pdsc,
+		     union pds_core_adminq_cmd *cmd,
+		     union pds_core_adminq_comp *comp,
+		     bool fast_poll)
+{
+	struct pdsc_wait_context wc = {
+		.wait_completion =
+			COMPLETION_INITIALIZER_ONSTACK(wc.wait_completion),
+	};
+	unsigned long poll_interval = 1;
+	unsigned long poll_jiffies;
+	unsigned long time_limit;
+	unsigned long time_start;
+	unsigned long time_done;
+	unsigned long remaining;
+	int err = 0;
+	int index;
+
+	wc.qcq = &pdsc->adminqcq;
+	index = __pdsc_adminq_post(pdsc, &pdsc->adminqcq, cmd, comp, &wc);
+	if (index < 0) {
+		err = index;
+		goto err_out;
+	}
+
+	time_start = jiffies;
+	time_limit = time_start + HZ * pdsc->devcmd_timeout;
+	do {
+		/* Timeslice the actual wait to catch IO errors etc early */
+		poll_jiffies = msecs_to_jiffies(poll_interval);
+		remaining = wait_for_completion_timeout(&wc.wait_completion,
+							poll_jiffies);
+		if (remaining)
+			break;
+
+		if (!pdsc_is_fw_running(pdsc)) {
+			u8 fw_status = ioread8(&pdsc->info_regs->fw_status);
+
+			dev_dbg(pdsc->dev, "%s: post wait failed - fw not running %#02x:\n",
+				__func__, fw_status);
+			err = -ENXIO;
+			break;
+		}
+
+		/* When fast_poll is not requested, prevent aggressive polling
+		 * on failures due to timeouts by doing exponential back off.
+		 */
+		if (!fast_poll && poll_interval < PDSC_ADMINQ_MAX_POLL_INTERVAL)
+			poll_interval <<= 1;
+	} while (time_before(jiffies, time_limit));
+	time_done = jiffies;
+	dev_dbg(pdsc->dev, "%s: elapsed %d msecs\n",
+		__func__, jiffies_to_msecs(time_done - time_start));
+
+	/* Check the results */
+	if (time_after_eq(time_done, time_limit))
+		err = -ETIMEDOUT;
+
+	dev_dbg(pdsc->dev, "read admin queue completion idx %d:\n", index);
+	dynamic_hex_dump("comp ", DUMP_PREFIX_OFFSET, 16, 1,
+			 comp, sizeof(*comp), true);
+
+	if (remaining && comp->status)
+		err = pdsc_err_to_errno(comp->status);
+
+err_out:
+	if (err) {
+		dev_dbg(pdsc->dev, "%s: opcode %d status %d err %pe\n",
+			__func__, cmd->opcode, comp->status, ERR_PTR(err));
+		if (err == -ENXIO || err == -ETIMEDOUT)
+			queue_work(pdsc->wq, &pdsc->health_work);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pdsc_adminq_post);
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index 8c0dbdb5efc5..59daf8a67ac6 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -6,17 +6,6 @@
 
 #include "core.h"
 
-void pdsc_work_thread(struct work_struct *work)
-{
-	/* stub */
-}
-
-irqreturn_t pdsc_adminq_isr(int irq, void *data)
-{
-	/* stub */
-	return IRQ_HANDLED;
-}
-
 void pdsc_intr_free(struct pdsc *pdsc, int index)
 {
 	struct pdsc_intr_info *intr_info;
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index dd5fbe3ee141..98a60ce87b92 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -4,6 +4,8 @@
 #ifndef _PDS_CORE_ADMINQ_H_
 #define _PDS_CORE_ADMINQ_H_
 
+#define PDSC_ADMINQ_MAX_POLL_INTERVAL	256
+
 enum pds_core_adminq_flags {
 	PDS_AQ_FLAG_FASTPOLL	= BIT(1),	/* completion poll at 1ms */
 };
@@ -631,8 +633,15 @@ static_assert(sizeof(union pds_core_notifyq_comp) == 64);
  * where the meaning alternates between '1' and '0' for alternating
  * passes through the completion descriptor ring.
  */
-static inline u8 pdsc_color_match(u8 color, u8 done_color)
+static inline bool pdsc_color_match(u8 color, bool done_color)
 {
 	return (!!(color & PDS_COMP_COLOR_MASK)) == done_color;
 }
+
+struct pdsc;
+int pdsc_adminq_post(struct pdsc *pdsc,
+		     union pds_core_adminq_cmd *cmd,
+		     union pds_core_adminq_comp *comp,
+		     bool fast_poll);
+
 #endif /* _PDS_CORE_ADMINQ_H_ */
-- 
cgit v1.2.3


From 65e0185ad764d2801811bb2e7c122e92557208c4 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:21 -0700
Subject: pds_core: set up the VIF definitions and defaults

The Virtual Interfaces (VIFs) supported by the DSC's
configuration (vDPA, Eth, RDMA, etc) are reported in the
dev_ident struct and made visible in debugfs.  At this point
only vDPA is supported in this driver so we only setup
devices for that feature.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/core.c    | 48 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h    | 11 +++++++
 drivers/net/ethernet/amd/pds_core/debugfs.c | 24 +++++++++++++++
 include/linux/pds/pds_common.h              | 19 ++++++++++++
 4 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index 59daf8a67ac6..b2fca3b99f02 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -346,6 +346,43 @@ static int pdsc_core_init(struct pdsc *pdsc)
 	return err;
 }
 
+static struct pdsc_viftype pdsc_viftype_defaults[] = {
+	[PDS_DEV_TYPE_VDPA] = { .name = PDS_DEV_TYPE_VDPA_STR,
+				.vif_id = PDS_DEV_TYPE_VDPA,
+				.dl_id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET },
+	[PDS_DEV_TYPE_MAX] = {}
+};
+
+static int pdsc_viftypes_init(struct pdsc *pdsc)
+{
+	enum pds_core_vif_types vt;
+
+	pdsc->viftype_status = kzalloc(sizeof(pdsc_viftype_defaults),
+				       GFP_KERNEL);
+	if (!pdsc->viftype_status)
+		return -ENOMEM;
+
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++) {
+		bool vt_support;
+
+		if (!pdsc_viftype_defaults[vt].name)
+			continue;
+
+		/* Grab the defaults */
+		pdsc->viftype_status[vt] = pdsc_viftype_defaults[vt];
+
+		/* See what the Core device has for support */
+		vt_support = !!le16_to_cpu(pdsc->dev_ident.vif_types[vt]);
+		dev_dbg(pdsc->dev, "VIF %s is %ssupported\n",
+			pdsc->viftype_status[vt].name,
+			vt_support ? "" : "not ");
+
+		pdsc->viftype_status[vt].supported = vt_support;
+	}
+
+	return 0;
+}
+
 int pdsc_setup(struct pdsc *pdsc, bool init)
 {
 	int numdescs;
@@ -388,6 +425,14 @@ int pdsc_setup(struct pdsc *pdsc, bool init)
 	if (err)
 		goto err_out_teardown;
 
+	/* Set up the VIFs */
+	err = pdsc_viftypes_init(pdsc);
+	if (err)
+		goto err_out_teardown;
+
+	if (init)
+		pdsc_debugfs_add_viftype(pdsc);
+
 	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
 	return 0;
 
@@ -404,6 +449,9 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing)
 	pdsc_qcq_free(pdsc, &pdsc->notifyqcq);
 	pdsc_qcq_free(pdsc, &pdsc->adminqcq);
 
+	kfree(pdsc->viftype_status);
+	pdsc->viftype_status = NULL;
+
 	if (pdsc->intr_info) {
 		for (i = 0; i < pdsc->nintrs; i++)
 			pdsc_intr_free(pdsc, i);
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 7eb02b359f3a..ac0480d7f0f1 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -123,6 +123,15 @@ struct pdsc_qcq {
 	struct dentry *dentry;
 };
 
+struct pdsc_viftype {
+	char *name;
+	bool supported;
+	bool enabled;
+	int dl_id;
+	int vif_id;
+	struct pds_auxiliary_dev *padev;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -174,6 +183,7 @@ struct pdsc {
 	struct pdsc_qcq adminqcq;
 	struct pdsc_qcq notifyqcq;
 	u64 last_eid;
+	struct pdsc_viftype *viftype_status;
 };
 
 /** enum pds_core_dbell_bits - bitwise composition of dbell values.
@@ -237,6 +247,7 @@ void pdsc_debugfs_destroy(void);
 void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
 void pdsc_debugfs_add_ident(struct pdsc *pdsc);
+void pdsc_debugfs_add_viftype(struct pdsc *pdsc);
 void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
 void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq);
 void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq);
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index b83e5016644b..8ec392299b7d 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -68,6 +68,30 @@ void pdsc_debugfs_add_ident(struct pdsc *pdsc)
 			    pdsc, &identity_fops);
 }
 
+static int viftype_show(struct seq_file *seq, void *v)
+{
+	struct pdsc *pdsc = seq->private;
+	int vt;
+
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++) {
+		if (!pdsc->viftype_status[vt].name)
+			continue;
+
+		seq_printf(seq, "%s\t%d supported %d enabled\n",
+			   pdsc->viftype_status[vt].name,
+			   pdsc->viftype_status[vt].supported,
+			   pdsc->viftype_status[vt].enabled);
+	}
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(viftype);
+
+void pdsc_debugfs_add_viftype(struct pdsc *pdsc)
+{
+	debugfs_create_file("viftypes", 0400, pdsc->dentry,
+			    pdsc, &viftype_fops);
+}
+
 static const struct debugfs_reg32 intr_ctrl_regs[] = {
 	{ .name = "coal_init", .offset = 0, },
 	{ .name = "mask", .offset = 4, },
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index f0798ce01acf..b2be14ebadb6 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -20,6 +20,25 @@ enum pds_core_driver_type {
 	PDS_DRIVER_ESXI    = 6,
 };
 
+enum pds_core_vif_types {
+	PDS_DEV_TYPE_CORE	= 0,
+	PDS_DEV_TYPE_VDPA	= 1,
+	PDS_DEV_TYPE_VFIO	= 2,
+	PDS_DEV_TYPE_ETH	= 3,
+	PDS_DEV_TYPE_RDMA	= 4,
+	PDS_DEV_TYPE_LM		= 5,
+
+	/* new ones added before this line */
+	PDS_DEV_TYPE_MAX	= 16   /* don't change - used in struct size */
+};
+
+#define PDS_DEV_TYPE_CORE_STR	"Core"
+#define PDS_DEV_TYPE_VDPA_STR	"vDPA"
+#define PDS_DEV_TYPE_VFIO_STR	"VFio"
+#define PDS_DEV_TYPE_ETH_STR	"Eth"
+#define PDS_DEV_TYPE_RDMA_STR	"RDMA"
+#define PDS_DEV_TYPE_LM_STR	"LM"
+
 #define PDS_CORE_IFNAMSIZ		16
 
 /**
-- 
cgit v1.2.3


From 4569cce43bc61e4cdd76597a1cf9b608846c18cc Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:23 -0700
Subject: pds_core: add auxiliary_bus devices

An auxiliary_bus device is created for each vDPA type VF at VF
probe and destroyed at VF remove.  The aux device name comes
from the driver name + VIF type + the unique id assigned at PCI
probe.  The VFs are always removed on PF remove, so there should
be no issues with VFs trying to access missing PF structures.

The auxiliary_device names will look like "pds_core.vDPA.nn"
where 'nn' is the VF's uid.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile |   1 +
 drivers/net/ethernet/amd/pds_core/auxbus.c | 115 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h   |   6 ++
 drivers/net/ethernet/amd/pds_core/main.c   |  36 ++++++++-
 include/linux/pds/pds_auxbus.h             |  14 ++++
 include/linux/pds/pds_common.h             |   1 +
 6 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/auxbus.c
 create mode 100644 include/linux/pds/pds_auxbus.h

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index 6d1d6c58a1fa..0abc33ce826c 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_PDS_CORE) := pds_core.o
 
 pds_core-y := main.o \
 	      devlink.o \
+	      auxbus.o \
 	      dev.o \
 	      adminq.o \
 	      core.o \
diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c
new file mode 100644
index 000000000000..adee516b3f0c
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/auxbus.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+
+#include "core.h"
+#include <linux/pds/pds_auxbus.h>
+
+static void pdsc_auxbus_dev_release(struct device *dev)
+{
+	struct pds_auxiliary_dev *padev =
+		container_of(dev, struct pds_auxiliary_dev, aux_dev.dev);
+
+	kfree(padev);
+}
+
+static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
+							  struct pdsc *pf,
+							  char *name)
+{
+	struct auxiliary_device *aux_dev;
+	struct pds_auxiliary_dev *padev;
+	int err;
+
+	padev = kzalloc(sizeof(*padev), GFP_KERNEL);
+	if (!padev)
+		return ERR_PTR(-ENOMEM);
+
+	padev->vf_pdev = cf->pdev;
+
+	aux_dev = &padev->aux_dev;
+	aux_dev->name = name;
+	aux_dev->id = cf->uid;
+	aux_dev->dev.parent = cf->dev;
+	aux_dev->dev.release = pdsc_auxbus_dev_release;
+
+	err = auxiliary_device_init(aux_dev);
+	if (err < 0) {
+		dev_warn(cf->dev, "auxiliary_device_init of %s failed: %pe\n",
+			 name, ERR_PTR(err));
+		goto err_out;
+	}
+
+	err = auxiliary_device_add(aux_dev);
+	if (err) {
+		dev_warn(cf->dev, "auxiliary_device_add of %s failed: %pe\n",
+			 name, ERR_PTR(err));
+		goto err_out_uninit;
+	}
+
+	return padev;
+
+err_out_uninit:
+	auxiliary_device_uninit(aux_dev);
+err_out:
+	kfree(padev);
+	return ERR_PTR(err);
+}
+
+int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf)
+{
+	struct pds_auxiliary_dev *padev;
+	int err = 0;
+
+	mutex_lock(&pf->config_lock);
+
+	padev = pf->vfs[cf->vf_id].padev;
+	if (padev) {
+		auxiliary_device_delete(&padev->aux_dev);
+		auxiliary_device_uninit(&padev->aux_dev);
+	}
+	pf->vfs[cf->vf_id].padev = NULL;
+
+	mutex_unlock(&pf->config_lock);
+	return err;
+}
+
+int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
+{
+	struct pds_auxiliary_dev *padev;
+	enum pds_core_vif_types vt;
+	u16 vt_support;
+	int err = 0;
+
+	mutex_lock(&pf->config_lock);
+
+	/* We only support vDPA so far, so it is the only one to
+	 * be verified that it is available in the Core device and
+	 * enabled in the devlink param.  In the future this might
+	 * become a loop for several VIF types.
+	 */
+
+	/* Verify that the type is supported and enabled.  It is not
+	 * an error if there is no auxbus device support for this
+	 * VF, it just means something else needs to happen with it.
+	 */
+	vt = PDS_DEV_TYPE_VDPA;
+	vt_support = !!le16_to_cpu(pf->dev_ident.vif_types[vt]);
+	if (!(vt_support &&
+	      pf->viftype_status[vt].supported &&
+	      pf->viftype_status[vt].enabled))
+		goto out_unlock;
+
+	padev = pdsc_auxbus_dev_register(cf, pf,
+					 pf->viftype_status[vt].name);
+	if (IS_ERR(padev)) {
+		err = PTR_ERR(padev);
+		goto out_unlock;
+	}
+	pf->vfs[cf->vf_id].padev = padev;
+
+out_unlock:
+	mutex_unlock(&pf->config_lock);
+	return err;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 1ec8784773f1..36099d3ac3dd 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -30,8 +30,11 @@ struct pdsc_dev_bar {
 	int res_index;
 };
 
+struct pdsc;
+
 struct pdsc_vf {
 	struct pds_auxiliary_dev *padev;
+	struct pdsc *vf;
 	u16     index;
 	__le16  vif_types[PDS_DEV_TYPE_MAX];
 };
@@ -287,6 +290,9 @@ int pdsc_start(struct pdsc *pdsc);
 void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf);
+int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf);
+
 void pdsc_process_adminq(struct pdsc_qcq *qcq);
 void pdsc_work_thread(struct work_struct *work);
 irqreturn_t pdsc_adminq_isr(int irq, void *data);
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index 511cb91a295e..b848f3360fe2 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -169,6 +169,12 @@ no_vfs:
 static int pdsc_init_vf(struct pdsc *vf)
 {
 	struct devlink *dl;
+	struct pdsc *pf;
+	int err;
+
+	pf = pdsc_get_pf_struct(vf->pdev);
+	if (IS_ERR_OR_NULL(pf))
+		return PTR_ERR(pf) ?: -1;
 
 	vf->vf_id = pci_iov_vf_id(vf->pdev);
 
@@ -177,7 +183,15 @@ static int pdsc_init_vf(struct pdsc *vf)
 	devl_register(dl);
 	devl_unlock(dl);
 
-	return 0;
+	pf->vfs[vf->vf_id].vf = vf;
+	err = pdsc_auxbus_dev_add(vf, pf);
+	if (err) {
+		devl_lock(dl);
+		devl_unregister(dl);
+		devl_unlock(dl);
+	}
+
+	return err;
 }
 
 static const struct devlink_health_reporter_ops pdsc_fw_reporter_ops = {
@@ -365,7 +379,19 @@ static void pdsc_remove(struct pci_dev *pdev)
 	}
 	devl_unlock(dl);
 
-	if (!pdev->is_virtfn) {
+	if (pdev->is_virtfn) {
+		struct pdsc *pf;
+
+		pf = pdsc_get_pf_struct(pdsc->pdev);
+		if (!IS_ERR(pf)) {
+			pdsc_auxbus_dev_del(pdsc, pf);
+			pf->vfs[pdsc->vf_id].vf = NULL;
+		}
+	} else {
+		/* Remove the VFs and their aux_bus connections before other
+		 * cleanup so that the clients can use the AdminQ to cleanly
+		 * shut themselves down.
+		 */
 		pdsc_sriov_configure(pdev, 0);
 
 		del_timer_sync(&pdsc->wdtimer);
@@ -402,6 +428,12 @@ static struct pci_driver pdsc_driver = {
 	.sriov_configure = pdsc_sriov_configure,
 };
 
+void *pdsc_get_pf_struct(struct pci_dev *vf_pdev)
+{
+	return pci_iov_get_pf_drvdata(vf_pdev, &pdsc_driver);
+}
+EXPORT_SYMBOL_GPL(pdsc_get_pf_struct);
+
 static int __init pdsc_init_module(void)
 {
 	if (strcmp(KBUILD_MODNAME, PDS_CORE_DRV_NAME))
diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h
new file mode 100644
index 000000000000..493f75b1995e
--- /dev/null
+++ b/include/linux/pds/pds_auxbus.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDSC_AUXBUS_H_
+#define _PDSC_AUXBUS_H_
+
+#include <linux/auxiliary_bus.h>
+
+struct pds_auxiliary_dev {
+	struct auxiliary_device aux_dev;
+	struct pci_dev *vf_pdev;
+	u16 client_id;
+};
+#endif /* _PDSC_AUXBUS_H_ */
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index b2be14ebadb6..961b3d02c69f 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -60,4 +60,5 @@ enum pds_core_logical_qtype {
 	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
 };
 
+void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
 #endif /* _PDS_COMMON_H_ */
-- 
cgit v1.2.3


From 10659034c622738bc1bfab8a76fc576c52d5acce Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:25 -0700
Subject: pds_core: add the aux client API

Add the client API operations for running adminq commands.
The core registers the client with the FW, then the client
has a context for requesting adminq services.  We expect
to add additional operations for other clients, including
requesting additional private adminqs and IRQs, but don't have
the need yet.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/auxbus.c | 151 ++++++++++++++++++++++++++++-
 include/linux/pds/pds_auxbus.h             |   6 ++
 include/linux/pds/pds_common.h             |   2 +
 3 files changed, 158 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c
index adee516b3f0c..561af8e5b3ea 100644
--- a/drivers/net/ethernet/amd/pds_core/auxbus.c
+++ b/drivers/net/ethernet/amd/pds_core/auxbus.c
@@ -6,6 +6,136 @@
 #include "core.h"
 #include <linux/pds/pds_auxbus.h>
 
+/**
+ * pds_client_register - Link the client to the firmware
+ * @pf_pdev:	ptr to the PF driver struct
+ * @devname:	name that includes service into, e.g. pds_core.vDPA
+ *
+ * Return: 0 on success, or
+ *         negative for error
+ */
+int pds_client_register(struct pci_dev *pf_pdev, char *devname)
+{
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pf;
+	int err;
+	u16 ci;
+
+	pf = pci_get_drvdata(pf_pdev);
+	if (pf->state)
+		return -ENXIO;
+
+	cmd.client_reg.opcode = PDS_AQ_CMD_CLIENT_REG;
+	strscpy(cmd.client_reg.devname, devname,
+		sizeof(cmd.client_reg.devname));
+
+	err = pdsc_adminq_post(pf, &cmd, &comp, false);
+	if (err) {
+		dev_info(pf->dev, "register dev_name %s with DSC failed, status %d: %pe\n",
+			 devname, comp.status, ERR_PTR(err));
+		return err;
+	}
+
+	ci = le16_to_cpu(comp.client_reg.client_id);
+	if (!ci) {
+		dev_err(pf->dev, "%s: device returned null client_id\n",
+			__func__);
+		return -EIO;
+	}
+
+	dev_dbg(pf->dev, "%s: device returned client_id %d for %s\n",
+		__func__, ci, devname);
+
+	return ci;
+}
+EXPORT_SYMBOL_GPL(pds_client_register);
+
+/**
+ * pds_client_unregister - Unlink the client from the firmware
+ * @pf_pdev:	ptr to the PF driver struct
+ * @client_id:	id returned from pds_client_register()
+ *
+ * Return: 0 on success, or
+ *         negative for error
+ */
+int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id)
+{
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pf;
+	int err;
+
+	pf = pci_get_drvdata(pf_pdev);
+	if (pf->state)
+		return -ENXIO;
+
+	cmd.client_unreg.opcode = PDS_AQ_CMD_CLIENT_UNREG;
+	cmd.client_unreg.client_id = cpu_to_le16(client_id);
+
+	err = pdsc_adminq_post(pf, &cmd, &comp, false);
+	if (err)
+		dev_info(pf->dev, "unregister client_id %d failed, status %d: %pe\n",
+			 client_id, comp.status, ERR_PTR(err));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pds_client_unregister);
+
+/**
+ * pds_client_adminq_cmd - Process an adminq request for the client
+ * @padev:   ptr to the client device
+ * @req:     ptr to buffer with request
+ * @req_len: length of actual struct used for request
+ * @resp:    ptr to buffer where answer is to be copied
+ * @flags:   optional flags from pds_core_adminq_flags
+ *
+ * Return: 0 on success, or
+ *         negative for error
+ *
+ * Client sends pointers to request and response buffers
+ * Core copies request data into pds_core_client_request_cmd
+ * Core sets other fields as needed
+ * Core posts to AdminQ
+ * Core copies completion data into response buffer
+ */
+int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev,
+			  union pds_core_adminq_cmd *req,
+			  size_t req_len,
+			  union pds_core_adminq_comp *resp,
+			  u64 flags)
+{
+	union pds_core_adminq_cmd cmd = {};
+	struct pci_dev *pf_pdev;
+	struct pdsc *pf;
+	size_t cp_len;
+	int err;
+
+	pf_pdev = pci_physfn(padev->vf_pdev);
+	pf = pci_get_drvdata(pf_pdev);
+
+	dev_dbg(pf->dev, "%s: %s opcode %d\n",
+		__func__, dev_name(&padev->aux_dev.dev), req->opcode);
+
+	if (pf->state)
+		return -ENXIO;
+
+	/* Wrap the client's request */
+	cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+	cmd.client_request.client_id = cpu_to_le16(padev->client_id);
+	cp_len = min_t(size_t, req_len, sizeof(cmd.client_request.client_cmd));
+	memcpy(cmd.client_request.client_cmd, req, cp_len);
+
+	err = pdsc_adminq_post(pf, &cmd, resp,
+			       !!(flags & PDS_AQ_FLAG_FASTPOLL));
+	if (err && err != -EAGAIN)
+		dev_info(pf->dev, "client admin cmd failed: %pe\n",
+			 ERR_PTR(err));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pds_client_adminq_cmd);
+
 static void pdsc_auxbus_dev_release(struct device *dev)
 {
 	struct pds_auxiliary_dev *padev =
@@ -16,6 +146,7 @@ static void pdsc_auxbus_dev_release(struct device *dev)
 
 static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
 							  struct pdsc *pf,
+							  u16 client_id,
 							  char *name)
 {
 	struct auxiliary_device *aux_dev;
@@ -27,6 +158,7 @@ static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
 		return ERR_PTR(-ENOMEM);
 
 	padev->vf_pdev = cf->pdev;
+	padev->client_id = client_id;
 
 	aux_dev = &padev->aux_dev;
 	aux_dev->name = name;
@@ -66,8 +198,10 @@ int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf)
 
 	padev = pf->vfs[cf->vf_id].padev;
 	if (padev) {
+		pds_client_unregister(pf->pdev, padev->client_id);
 		auxiliary_device_delete(&padev->aux_dev);
 		auxiliary_device_uninit(&padev->aux_dev);
+		padev->client_id = 0;
 	}
 	pf->vfs[cf->vf_id].padev = NULL;
 
@@ -79,7 +213,9 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
 {
 	struct pds_auxiliary_dev *padev;
 	enum pds_core_vif_types vt;
+	char devname[PDS_DEVNAME_LEN];
 	u16 vt_support;
+	int client_id;
 	int err = 0;
 
 	mutex_lock(&pf->config_lock);
@@ -101,9 +237,22 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
 	      pf->viftype_status[vt].enabled))
 		goto out_unlock;
 
-	padev = pdsc_auxbus_dev_register(cf, pf,
+	/* Need to register with FW and get the client_id before
+	 * creating the aux device so that the aux client can run
+	 * adminq commands as part its probe
+	 */
+	snprintf(devname, sizeof(devname), "%s.%s.%d",
+		 PDS_CORE_DRV_NAME, pf->viftype_status[vt].name, cf->uid);
+	client_id = pds_client_register(pf->pdev, devname);
+	if (client_id < 0) {
+		err = client_id;
+		goto out_unlock;
+	}
+
+	padev = pdsc_auxbus_dev_register(cf, pf, client_id,
 					 pf->viftype_status[vt].name);
 	if (IS_ERR(padev)) {
+		pds_client_unregister(pf->pdev, client_id);
 		err = PTR_ERR(padev);
 		goto out_unlock;
 	}
diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h
index 493f75b1995e..214ef12302d0 100644
--- a/include/linux/pds/pds_auxbus.h
+++ b/include/linux/pds/pds_auxbus.h
@@ -11,4 +11,10 @@ struct pds_auxiliary_dev {
 	struct pci_dev *vf_pdev;
 	u16 client_id;
 };
+
+int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev,
+			  union pds_core_adminq_cmd *req,
+			  size_t req_len,
+			  union pds_core_adminq_comp *resp,
+			  u64 flags);
 #endif /* _PDSC_AUXBUS_H_ */
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 961b3d02c69f..4b37675fde3e 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -61,4 +61,6 @@ enum pds_core_logical_qtype {
 };
 
 void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
+int pds_client_register(struct pci_dev *pf_pdev, char *devname);
+int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id);
 #endif /* _PDS_COMMON_H_ */
-- 
cgit v1.2.3


From d24c28278a01dc4c80d1470533c667cf406f0e88 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:26 -0700
Subject: pds_core: publish events to the clients

When the Core device gets an event from the device, or notices
the device FW to be up or down, it needs to send those events
on to the clients that have an event handler.  Add the code to
pass along the events to the clients.

The entry points pdsc_register_notify() and pdsc_unregister_notify()
are EXPORTed for other drivers that want to listen for these events.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/adminq.c |  2 ++
 drivers/net/ethernet/amd/pds_core/core.c   | 32 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h   |  3 +++
 include/linux/pds/pds_common.h             |  2 ++
 4 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c
index fb2ba3f62480..045fe133f6ee 100644
--- a/drivers/net/ethernet/amd/pds_core/adminq.c
+++ b/drivers/net/ethernet/amd/pds_core/adminq.c
@@ -29,11 +29,13 @@ static int pdsc_process_notifyq(struct pdsc_qcq *qcq)
 		case PDS_EVENT_LINK_CHANGE:
 			dev_info(pdsc->dev, "NotifyQ LINK_CHANGE ecode %d eid %lld\n",
 				 ecode, eid);
+			pdsc_notify(PDS_EVENT_LINK_CHANGE, comp);
 			break;
 
 		case PDS_EVENT_RESET:
 			dev_info(pdsc->dev, "NotifyQ RESET ecode %d eid %lld\n",
 				 ecode, eid);
+			pdsc_notify(PDS_EVENT_RESET, comp);
 			break;
 
 		case PDS_EVENT_XCVR:
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index b2fca3b99f02..483a070d96fa 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -6,6 +6,25 @@
 
 #include "core.h"
 
+static BLOCKING_NOTIFIER_HEAD(pds_notify_chain);
+
+int pdsc_register_notify(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&pds_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(pdsc_register_notify);
+
+void pdsc_unregister_notify(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&pds_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(pdsc_unregister_notify);
+
+void pdsc_notify(unsigned long event, void *data)
+{
+	blocking_notifier_call_chain(&pds_notify_chain, event, data);
+}
+
 void pdsc_intr_free(struct pdsc *pdsc, int index)
 {
 	struct pdsc_intr_info *intr_info;
@@ -494,12 +513,19 @@ void pdsc_stop(struct pdsc *pdsc)
 
 static void pdsc_fw_down(struct pdsc *pdsc)
 {
+	union pds_core_notifyq_comp reset_event = {
+		.reset.ecode = cpu_to_le16(PDS_EVENT_RESET),
+		.reset.state = 0,
+	};
+
 	if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
 		dev_err(pdsc->dev, "%s: already happening\n", __func__);
 		return;
 	}
 
+	/* Notify clients of fw_down */
 	devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc);
+	pdsc_notify(PDS_EVENT_RESET, &reset_event);
 
 	pdsc_stop(pdsc);
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY);
@@ -507,6 +533,10 @@ static void pdsc_fw_down(struct pdsc *pdsc)
 
 static void pdsc_fw_up(struct pdsc *pdsc)
 {
+	union pds_core_notifyq_comp reset_event = {
+		.reset.ecode = cpu_to_le16(PDS_EVENT_RESET),
+		.reset.state = 1,
+	};
 	int err;
 
 	if (!test_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
@@ -522,9 +552,11 @@ static void pdsc_fw_up(struct pdsc *pdsc)
 	if (err)
 		goto err_out;
 
+	/* Notify clients of fw_up */
 	pdsc->fw_recoveries++;
 	devlink_health_reporter_state_update(pdsc->fw_reporter,
 					     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
+	pdsc_notify(PDS_EVENT_RESET, &reset_event);
 
 	return;
 
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 9e01a9ee6868..e545fafc4819 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -297,6 +297,9 @@ int pdsc_start(struct pdsc *pdsc);
 void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+int pdsc_register_notify(struct notifier_block *nb);
+void pdsc_unregister_notify(struct notifier_block *nb);
+void pdsc_notify(unsigned long event, void *data);
 int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf);
 int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf);
 
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 4b37675fde3e..060331486d50 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -60,6 +60,8 @@ enum pds_core_logical_qtype {
 	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
 };
 
+int pdsc_register_notify(struct notifier_block *nb);
+void pdsc_unregister_notify(struct notifier_block *nb);
 void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
 int pds_client_register(struct pci_dev *pf_pdev, char *devname);
 int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id);
-- 
cgit v1.2.3


From 9f656705c5faa18afb26d922cfc64f9fd103c38d Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Thu, 20 Apr 2023 13:33:23 +0200
Subject: ALSA: pcm: rewrite snd_pcm_playback_silence()

The auto-silencer supports two modes: "thresholded" to fill up "just
enough", and "top-up" to fill up "as much as possible". The two modes
used rather distinct code paths, which this patch unifies. The only
remaining distinction is how much we actually want to fill.

This fixes a bug in thresholded mode, where we failed to use new_hw_ptr,
resulting in under-fill.

Top-up mode is now more well-behaved and much easier to understand in
corner cases.

This also updates comments in the proximity of silencing-related data
structures.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Link: https://lore.kernel.org/r/20230420113324.877164-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 .../sound/kernel-api/writing-an-alsa-driver.rst    | 17 +++--
 include/sound/pcm.h                                | 14 ++--
 include/uapi/sound/asound.h                        | 11 ++-
 sound/core/pcm_lib.c                               | 86 +++++++++-------------
 sound/core/pcm_local.h                             |  3 +-
 sound/core/pcm_native.c                            |  6 +-
 6 files changed, 66 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst
index a368529e8ed3..e37d9dba320d 100644
--- a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst
+++ b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst
@@ -1577,14 +1577,19 @@ are the contents of this file:
           unsigned int period_step;
           unsigned int sleep_min;		/* min ticks to sleep */
           snd_pcm_uframes_t start_threshold;
-          snd_pcm_uframes_t stop_threshold;
-          snd_pcm_uframes_t silence_threshold; /* Silence filling happens when
-                                                  noise is nearest than this */
-          snd_pcm_uframes_t silence_size;	/* Silence filling size */
+          /*
+           * The following two thresholds alleviate playback buffer underruns; when
+           * hw_avail drops below the threshold, the respective action is triggered:
+           */
+          snd_pcm_uframes_t stop_threshold;	/* - stop playback */
+          snd_pcm_uframes_t silence_threshold;	/* - pre-fill buffer with silence */
+          snd_pcm_uframes_t silence_size;       /* max size of silence pre-fill; when >= boundary,
+                                                 * fill played area with silence immediately */
           snd_pcm_uframes_t boundary;	/* pointers wrap point */
   
-          snd_pcm_uframes_t silenced_start;
-          snd_pcm_uframes_t silenced_size;
+          /* internal data of auto-silencer */
+          snd_pcm_uframes_t silence_start; /* starting pointer to silence area */
+          snd_pcm_uframes_t silence_filled; /* size filled with silence */
   
           snd_pcm_sync_id_t sync;		/* hardware synchronization ID */
   
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 27040b472a4f..19f564606ac4 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -378,18 +378,18 @@ struct snd_pcm_runtime {
 	unsigned int rate_den;
 	unsigned int no_period_wakeup: 1;
 
-	/* -- SW params -- */
-	int tstamp_mode;		/* mmap timestamp is updated */
+	/* -- SW params; see struct snd_pcm_sw_params for comments -- */
+	int tstamp_mode;
   	unsigned int period_step;
 	snd_pcm_uframes_t start_threshold;
 	snd_pcm_uframes_t stop_threshold;
-	snd_pcm_uframes_t silence_threshold; /* Silence filling happens when
-						noise is nearest than this */
-	snd_pcm_uframes_t silence_size;	/* Silence filling size */
-	snd_pcm_uframes_t boundary;	/* pointers wrap point */
+	snd_pcm_uframes_t silence_threshold;
+	snd_pcm_uframes_t silence_size;
+	snd_pcm_uframes_t boundary;
 
+	/* internal data of auto-silencer */
 	snd_pcm_uframes_t silence_start; /* starting pointer to silence area */
-	snd_pcm_uframes_t silence_filled; /* size filled with silence */
+	snd_pcm_uframes_t silence_filled; /* already filled part of silence area */
 
 	union snd_pcm_sync_id sync;	/* hardware synchronization ID */
 
diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index 7eecc99ddef7..0aa955aa8246 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -429,9 +429,14 @@ struct snd_pcm_sw_params {
 	snd_pcm_uframes_t avail_min;		/* min avail frames for wakeup */
 	snd_pcm_uframes_t xfer_align;		/* obsolete: xfer size need to be a multiple */
 	snd_pcm_uframes_t start_threshold;	/* min hw_avail frames for automatic start */
-	snd_pcm_uframes_t stop_threshold;	/* min avail frames for automatic stop */
-	snd_pcm_uframes_t silence_threshold;	/* min distance from noise for silence filling */
-	snd_pcm_uframes_t silence_size;		/* silence block size */
+	/*
+	 * The following two thresholds alleviate playback buffer underruns; when
+	 * hw_avail drops below the threshold, the respective action is triggered:
+	 */
+	snd_pcm_uframes_t stop_threshold;	/* - stop playback */
+	snd_pcm_uframes_t silence_threshold;	/* - pre-fill buffer with silence */
+	snd_pcm_uframes_t silence_size;		/* max size of silence pre-fill; when >= boundary,
+						 * fill played area with silence immediately */
 	snd_pcm_uframes_t boundary;		/* pointers wrap point */
 	unsigned int proto;			/* protocol version */
 	unsigned int tstamp_type;		/* timestamp type (req. proto >= 2.0.12) */
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index af1eb136feb0..d21c73944efd 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -42,70 +42,56 @@ static int fill_silence_frames(struct snd_pcm_substream *substream,
  *
  * when runtime->silence_size >= runtime->boundary - fill processed area with silence immediately
  */
-void snd_pcm_playback_silence(struct snd_pcm_substream *substream, snd_pcm_uframes_t new_hw_ptr)
+void snd_pcm_playback_silence(struct snd_pcm_substream *substream)
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
-	snd_pcm_uframes_t frames, ofs, transfer;
+	snd_pcm_uframes_t appl_ptr = READ_ONCE(runtime->control->appl_ptr);
+	snd_pcm_sframes_t added, hw_avail, frames;
+	snd_pcm_uframes_t noise_dist, ofs, transfer;
 	int err;
 
+	added = appl_ptr - runtime->silence_start;
+	if (added) {
+		if (added < 0)
+			added += runtime->boundary;
+		if (added < runtime->silence_filled)
+			runtime->silence_filled -= added;
+		else
+			runtime->silence_filled = 0;
+		runtime->silence_start = appl_ptr;
+	}
+
+	// This will "legitimately" turn negative on underrun, and will be mangled
+	// into a huge number by the boundary crossing handling. The initial state
+	// might also be not quite sane. The code below MUST account for these cases.
+	hw_avail = appl_ptr - runtime->status->hw_ptr;
+	if (hw_avail < 0)
+		hw_avail += runtime->boundary;
+
+	noise_dist = hw_avail + runtime->silence_filled;
 	if (runtime->silence_size < runtime->boundary) {
-		snd_pcm_sframes_t noise_dist, n;
-		snd_pcm_uframes_t appl_ptr = READ_ONCE(runtime->control->appl_ptr);
-		if (runtime->silence_start != appl_ptr) {
-			n = appl_ptr - runtime->silence_start;
-			if (n < 0)
-				n += runtime->boundary;
-			if ((snd_pcm_uframes_t)n < runtime->silence_filled)
-				runtime->silence_filled -= n;
-			else
-				runtime->silence_filled = 0;
-			runtime->silence_start = appl_ptr;
-		}
-		if (runtime->silence_filled >= runtime->buffer_size)
-			return;
-		noise_dist = snd_pcm_playback_hw_avail(runtime) + runtime->silence_filled;
-		if (noise_dist >= (snd_pcm_sframes_t) runtime->silence_threshold)
-			return;
 		frames = runtime->silence_threshold - noise_dist;
+		if (frames <= 0)
+			return;
 		if (frames > runtime->silence_size)
 			frames = runtime->silence_size;
 	} else {
-		if (new_hw_ptr == ULONG_MAX) {	/* initialization */
-			snd_pcm_sframes_t avail = snd_pcm_playback_hw_avail(runtime);
-			if (avail > runtime->buffer_size)
-				avail = runtime->buffer_size;
-			runtime->silence_filled = avail > 0 ? avail : 0;
-			runtime->silence_start = (runtime->status->hw_ptr +
-						  runtime->silence_filled) %
-						 runtime->boundary;
-		} else {
-			ofs = runtime->status->hw_ptr;
-			frames = new_hw_ptr - ofs;
-			if ((snd_pcm_sframes_t)frames < 0)
-				frames += runtime->boundary;
-			runtime->silence_filled -= frames;
-			if ((snd_pcm_sframes_t)runtime->silence_filled < 0) {
-				runtime->silence_filled = 0;
-				runtime->silence_start = new_hw_ptr;
-			} else {
-				runtime->silence_start = ofs;
-			}
-		}
-		frames = runtime->buffer_size - runtime->silence_filled;
+		frames = runtime->buffer_size - noise_dist;
+		if (frames <= 0)
+			return;
 	}
+
 	if (snd_BUG_ON(frames > runtime->buffer_size))
 		return;
-	if (frames == 0)
-		return;
-	ofs = runtime->silence_start % runtime->buffer_size;
-	while (frames > 0) {
+	ofs = (runtime->silence_start + runtime->silence_filled) % runtime->buffer_size;
+	do {
 		transfer = ofs + frames > runtime->buffer_size ? runtime->buffer_size - ofs : frames;
 		err = fill_silence_frames(substream, ofs, transfer);
 		snd_BUG_ON(err < 0);
 		runtime->silence_filled += transfer;
 		frames -= transfer;
 		ofs = 0;
-	}
+	} while (frames > 0);
 	snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE);
 }
 
@@ -439,10 +425,6 @@ static int snd_pcm_update_hw_ptr0(struct snd_pcm_substream *substream,
 		return 0;
 	}
 
-	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
-	    runtime->silence_size > 0)
-		snd_pcm_playback_silence(substream, new_hw_ptr);
-
 	if (in_interrupt) {
 		delta = new_hw_ptr - runtime->hw_ptr_interrupt;
 		if (delta < 0)
@@ -460,6 +442,10 @@ static int snd_pcm_update_hw_ptr0(struct snd_pcm_substream *substream,
 		runtime->hw_ptr_wrap += runtime->boundary;
 	}
 
+	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
+	    runtime->silence_size > 0)
+		snd_pcm_playback_silence(substream);
+
 	update_audio_tstamp(substream, &curr_tstamp, &audio_tstamp);
 
 	return snd_pcm_update_state(substream, runtime);
diff --git a/sound/core/pcm_local.h b/sound/core/pcm_local.h
index ecb21697ae3a..42fe3a4e9154 100644
--- a/sound/core/pcm_local.h
+++ b/sound/core/pcm_local.h
@@ -29,8 +29,7 @@ int snd_pcm_update_state(struct snd_pcm_substream *substream,
 			 struct snd_pcm_runtime *runtime);
 int snd_pcm_update_hw_ptr(struct snd_pcm_substream *substream);
 
-void snd_pcm_playback_silence(struct snd_pcm_substream *substream,
-			      snd_pcm_uframes_t new_hw_ptr);
+void snd_pcm_playback_silence(struct snd_pcm_substream *substream);
 
 static inline snd_pcm_uframes_t
 snd_pcm_avail(struct snd_pcm_substream *substream)
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 94185267a7b9..3d0c4a5b701b 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -958,7 +958,7 @@ static int snd_pcm_sw_params(struct snd_pcm_substream *substream,
 	if (snd_pcm_running(substream)) {
 		if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
 		    runtime->silence_size > 0)
-			snd_pcm_playback_silence(substream, ULONG_MAX);
+			snd_pcm_playback_silence(substream);
 		err = snd_pcm_update_state(substream, runtime);
 	}
 	snd_pcm_stream_unlock_irq(substream);
@@ -1455,7 +1455,7 @@ static void snd_pcm_post_start(struct snd_pcm_substream *substream,
 	__snd_pcm_set_state(runtime, state);
 	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
 	    runtime->silence_size > 0)
-		snd_pcm_playback_silence(substream, ULONG_MAX);
+		snd_pcm_playback_silence(substream);
 	snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTART);
 }
 
@@ -1916,7 +1916,7 @@ static void snd_pcm_post_reset(struct snd_pcm_substream *substream,
 	runtime->control->appl_ptr = runtime->status->hw_ptr;
 	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
 	    runtime->silence_size > 0)
-		snd_pcm_playback_silence(substream, ULONG_MAX);
+		snd_pcm_playback_silence(substream);
 	snd_pcm_stream_unlock_irq(substream);
 }
 
-- 
cgit v1.2.3


From dfc39d4026fb2432363c0f77543c4cf3adca4c7b Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antgroup.com>
Date: Wed, 19 Apr 2023 15:24:16 +0800
Subject: net/packet: support mergeable feature of virtio

Packet sockets, like tap, can be used as the backend for kernel vhost.
In packet sockets, virtio net header size is currently hardcoded to be
the size of struct virtio_net_hdr, which is 10 bytes; however, it is not
always the case: some virtio features, such as mrg_rxbuf, need virtio
net header to be 12-byte long.

Mergeable buffers, as a virtio feature, is worthy of supporting: packets
that are larger than one-mbuf size will be dropped in vhost worker's
handle_rx if mrg_rxbuf feature is not used, but large packets
cannot be avoided and increasing mbuf's size is not economical.

With this virtio feature enabled by virtio-user, packet sockets with
hardcoded 10-byte virtio net header will parse mac head incorrectly in
packet_snd by taking the last two bytes of virtio net header as part of
mac header.
This incorrect mac header parsing will cause packet to be dropped due to
invalid ether head checking in later under-layer device packet receiving.

By adding extra field vnet_hdr_sz with utilizing holes in struct
packet_sock to record currently used virtio net header size and supporting
extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet
sockets can know the exact length of virtio net header that virtio user
gives.
In packet_snd, tpacket_snd and packet_recvmsg, instead of using
hardcoded virtio net header size, it can get the exact vnet_hdr_sz from
corresponding packet_sock, and parse mac header correctly based on this
information to avoid the packets being mistakenly dropped.

Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h |  1 +
 net/packet/af_packet.c         | 95 +++++++++++++++++++++++++-----------------
 net/packet/diag.c              |  2 +-
 net/packet/internal.h          |  2 +-
 4 files changed, 60 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 78c981d6a9d4..9efc42382fdb 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -59,6 +59,7 @@ struct sockaddr_ll {
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
 #define PACKET_IGNORE_OUTGOING		23
+#define PACKET_VNET_HDR_SZ		24
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 568f8d76e3c1..6080c0db0814 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb,
 }
 
 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
-			   size_t *len)
+			   size_t *len, int vnet_hdr_sz)
 {
-	struct virtio_net_hdr vnet_hdr;
+	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
 
-	if (*len < sizeof(vnet_hdr))
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(vnet_hdr);
+	*len -= vnet_hdr_sz;
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
 		return -EINVAL;
 
-	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
 }
 
 /*
@@ -2250,7 +2250,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	__u32 ts_status;
 	bool is_drop_n_account = false;
 	unsigned int slot_id = 0;
-	bool do_vnet = false;
+	int vnet_hdr_sz = 0;
 
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 	 * We may add members to them until current aligned size without forcing
@@ -2308,10 +2308,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
 				       (maclen < 16 ? 16 : maclen)) +
 				       po->tp_reserve;
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
-			netoff += sizeof(struct virtio_net_hdr);
-			do_vnet = true;
-		}
+		vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
+		if (vnet_hdr_sz)
+			netoff += vnet_hdr_sz;
 		macoff = netoff - maclen;
 	}
 	if (netoff > USHRT_MAX) {
@@ -2337,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			snaplen = po->rx_ring.frame_size - macoff;
 			if ((int)snaplen < 0) {
 				snaplen = 0;
-				do_vnet = false;
+				vnet_hdr_sz = 0;
 			}
 		}
 	} else if (unlikely(macoff + snaplen >
@@ -2351,7 +2350,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely((int)snaplen < 0)) {
 			snaplen = 0;
 			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
-			do_vnet = false;
+			vnet_hdr_sz = 0;
 		}
 	}
 	spin_lock(&sk->sk_receive_queue.lock);
@@ -2367,7 +2366,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
 	}
 
-	if (do_vnet &&
+	if (vnet_hdr_sz &&
 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
 				    sizeof(struct virtio_net_hdr),
 				    vio_le(), true, 0)) {
@@ -2551,16 +2550,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
 }
 
 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
-				 struct virtio_net_hdr *vnet_hdr)
+				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
 {
-	if (*len < sizeof(*vnet_hdr))
+	int ret;
+
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(*vnet_hdr);
+	*len -= vnet_hdr_sz;
 
 	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
 		return -EFAULT;
 
-	return __packet_snd_vnet_parse(vnet_hdr, *len);
+	ret = __packet_snd_vnet_parse(vnet_hdr, *len);
+	if (ret)
+		return ret;
+
+	/* move iter to point to the start of mac header */
+	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
+		iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
+
+	return 0;
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -2722,6 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	void *ph;
 	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	unsigned char *addr = NULL;
 	int tp_len, size_max;
 	void *data;
@@ -2779,8 +2789,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if ((size_max > dev->mtu + reserve + VLAN_HLEN) &&
-	    !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	reinit_completion(&po->skb_completion);
@@ -2809,10 +2818,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		status = TP_STATUS_SEND_REQUEST;
 		hlen = LL_RESERVED_SPACE(dev);
 		tlen = dev->needed_tailroom;
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+		if (vnet_hdr_sz) {
 			vnet_hdr = data;
-			data += sizeof(*vnet_hdr);
-			tp_len -= sizeof(*vnet_hdr);
+			data += vnet_hdr_sz;
+			tp_len -= vnet_hdr_sz;
 			if (tp_len < 0 ||
 			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
 				tp_len = -EINVAL;
@@ -2837,7 +2846,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 					  addr, hlen, copylen, &sockc);
 		if (likely(tp_len >= 0) &&
 		    tp_len > dev->mtu + reserve &&
-		    !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) &&
+		    !vnet_hdr_sz &&
 		    !packet_extra_vlan_len_allowed(dev, skb))
 			tp_len = -EMSGSIZE;
 
@@ -2856,7 +2865,7 @@ tpacket_error:
 			}
 		}
 
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+		if (vnet_hdr_sz) {
 			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
 				tp_len = -EINVAL;
 				goto tpacket_error;
@@ -2946,7 +2955,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
 	struct packet_sock *po = pkt_sk(sk);
-	bool has_vnet_hdr = false;
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	int hlen, tlen, linear;
 	int extra_len = 0;
 
@@ -2990,11 +2999,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	if (sock->type == SOCK_RAW)
 		reserve = dev->hard_header_len;
-	if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
-		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+	if (vnet_hdr_sz) {
+		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
 		if (err)
 			goto out_unlock;
-		has_vnet_hdr = true;
 	}
 
 	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -3064,11 +3072,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	packet_parse_headers(skb, sock);
 
-	if (has_vnet_hdr) {
+	if (vnet_hdr_sz) {
 		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
 		if (err)
 			goto out_free;
-		len += sizeof(vnet_hdr);
+		len += vnet_hdr_sz;
 		virtio_net_hdr_set_proto(skb, &vnet_hdr);
 	}
 
@@ -3408,7 +3416,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
 	int copied, err;
-	int vnet_hdr_len = 0;
+	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
 	unsigned int origlen = 0;
 
 	err = -EINVAL;
@@ -3449,11 +3457,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	packet_rcv_try_clear_pressure(pkt_sk(sk));
 
-	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) {
-		err = packet_rcv_vnet(msg, skb, &len);
+	if (vnet_hdr_len) {
+		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
 		if (err)
 			goto out_free;
-		vnet_hdr_len = sizeof(struct virtio_net_hdr);
 	}
 
 	/* You lose any data beyond the buffer you gave. If it worries
@@ -3915,8 +3922,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		return 0;
 	}
 	case PACKET_VNET_HDR:
+	case PACKET_VNET_HDR_SZ:
 	{
-		int val;
+		int val, hdr_len;
 
 		if (sock->type != SOCK_RAW)
 			return -EINVAL;
@@ -3925,11 +3933,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
+		if (optname == PACKET_VNET_HDR_SZ) {
+			if (val && val != sizeof(struct virtio_net_hdr) &&
+			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
+				return -EINVAL;
+			hdr_len = val;
+		} else {
+			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
+		}
 		lock_sock(sk);
 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
 			ret = -EBUSY;
 		} else {
-			packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val);
+			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
 			ret = 0;
 		}
 		release_sock(sk);
@@ -4062,7 +4078,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
 		break;
 	case PACKET_VNET_HDR:
-		val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR);
+		val = !!READ_ONCE(po->vnet_hdr_sz);
+		break;
+	case PACKET_VNET_HDR_SZ:
+		val = READ_ONCE(po->vnet_hdr_sz);
 		break;
 	case PACKET_VERSION:
 		val = po->tp_version;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index de4ced5cf3e8..d0c4eda4cdc6 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
 		pinfo.pdi_flags |= PDI_AUXDATA;
 	if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV))
 		pinfo.pdi_flags |= PDI_ORIGDEV;
-	if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+	if (READ_ONCE(po->vnet_hdr_sz))
 		pinfo.pdi_flags |= PDI_VNETHDR;
 	if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS))
 		pinfo.pdi_flags |= PDI_LOSS;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 27930f69f368..63f4865202c1 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -118,6 +118,7 @@ struct packet_sock {
 	struct mutex		pg_vec_lock;
 	unsigned long		flags;
 	int			ifindex;	/* bound device		*/
+	u8			vnet_hdr_sz;
 	__be16			num;
 	struct packet_rollover	*rollover;
 	struct packet_mclist	*mclist;
@@ -139,7 +140,6 @@ enum packet_sock_flags {
 	PACKET_SOCK_AUXDATA,
 	PACKET_SOCK_TX_HAS_OFF,
 	PACKET_SOCK_TP_LOSS,
-	PACKET_SOCK_HAS_VNET_HDR,
 	PACKET_SOCK_RUNNING,
 	PACKET_SOCK_PRESSURE,
 	PACKET_SOCK_QDISC_BYPASS,
-- 
cgit v1.2.3


From 223baf9d17f25e2608dbdff7232c095c1e612268 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 20 Apr 2023 10:55:48 -0400
Subject: sched: Fix performance regression introduced by mm_cid

Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
sysbench regression reported by Aaron Lu.

Keep track of the currently allocated mm_cid for each mm/cpu rather than
freeing them immediately on context switch. This eliminates most atomic
operations when context switching back and forth between threads
belonging to different memory spaces in multi-threaded scenarios (many
processes, each with many threads). The per-mm/per-cpu mm_cid values are
serialized by their respective runqueue locks.

Thread migration is handled by introducing invocation to
sched_mm_cid_migrate_to() (with destination runqueue lock held) in
activate_task() for migrating tasks. If the destination cpu's mm_cid is
unset, and if the source runqueue is not actively using its mm_cid, then
the source cpu's mm_cid is moved to the destination cpu on migration.

Introduce a task-work executed periodically, similarly to NUMA work,
which delays reclaim of cid values when they are unused for a period of
time.

Keep track of the allocation time for each per-cpu cid, and let the task
work clear them when they are observed to be older than
SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all
mm_cids which are greater or equal to the Hamming weight of the mm
cidmask to keep concurrency ids compact.

Because we want to ensure the mm_cid converges towards the smaller
values as migrations happen, the prior optimization that was done when
context switching between threads belonging to the same mm is removed,
because it could delay the lazy release of the destination runqueue
mm_cid after it has been replaced by a migration. Removing this prior
optimization is not an issue performance-wise because the introduced
per-mm/per-cpu mm_cid tracking also covers this more specific case.

Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID")
Reported-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
---
 include/linux/mm_types.h |  82 +++++++-
 include/linux/sched.h    |   3 +
 include/linux/sched/mm.h |   5 +
 kernel/fork.c            |   9 +-
 kernel/sched/core.c      | 523 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h     | 239 +++++++++++++++++++---
 6 files changed, 804 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a57e6ae78e65..5eab61156f0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -550,6 +550,13 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
+#ifdef CONFIG_SCHED_MM_CID
+struct mm_cid {
+	u64 time;
+	int cid;
+};
+#endif
+
 struct kioctx_table;
 struct mm_struct {
 	struct {
@@ -600,15 +607,19 @@ struct mm_struct {
 		atomic_t mm_count;
 #ifdef CONFIG_SCHED_MM_CID
 		/**
-		 * @cid_lock: Protect cid bitmap updates vs lookups.
+		 * @pcpu_cid: Per-cpu current cid.
 		 *
-		 * Prevent situations where updates to the cid bitmap happen
-		 * concurrently with lookups. Those can lead to situations
-		 * where a lookup cannot find a free bit simply because it was
-		 * unlucky enough to load, non-atomically, bitmap words as they
-		 * were being concurrently updated by the updaters.
+		 * Keep track of the currently allocated mm_cid for each cpu.
+		 * The per-cpu mm_cid values are serialized by their respective
+		 * runqueue locks.
 		 */
-		raw_spinlock_t cid_lock;
+		struct mm_cid __percpu *pcpu_cid;
+		/*
+		 * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
+		 *
+		 * When the next mm_cid scan is due (in jiffies).
+		 */
+		unsigned long mm_cid_next_scan;
 #endif
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
@@ -873,6 +884,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
 }
 
 #ifdef CONFIG_SCHED_MM_CID
+
+enum mm_cid_state {
+	MM_CID_UNSET = -1U,		/* Unset state has lazy_put flag set. */
+	MM_CID_LAZY_PUT = (1U << 31),
+};
+
+static inline bool mm_cid_is_unset(int cid)
+{
+	return cid == MM_CID_UNSET;
+}
+
+static inline bool mm_cid_is_lazy_put(int cid)
+{
+	return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
+}
+
+static inline bool mm_cid_is_valid(int cid)
+{
+	return !(cid & MM_CID_LAZY_PUT);
+}
+
+static inline int mm_cid_set_lazy_put(int cid)
+{
+	return cid | MM_CID_LAZY_PUT;
+}
+
+static inline int mm_cid_clear_lazy_put(int cid)
+{
+	return cid & ~MM_CID_LAZY_PUT;
+}
+
 /* Accessor for struct mm_struct's cidmask. */
 static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 {
@@ -886,16 +928,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 
 static inline void mm_init_cid(struct mm_struct *mm)
 {
-	raw_spin_lock_init(&mm->cid_lock);
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
+
+		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu_cid->time = 0;
+	}
 	cpumask_clear(mm_cidmask(mm));
 }
 
+static inline int mm_alloc_cid(struct mm_struct *mm)
+{
+	mm->pcpu_cid = alloc_percpu(struct mm_cid);
+	if (!mm->pcpu_cid)
+		return -ENOMEM;
+	mm_init_cid(mm);
+	return 0;
+}
+
+static inline void mm_destroy_cid(struct mm_struct *mm)
+{
+	free_percpu(mm->pcpu_cid);
+	mm->pcpu_cid = NULL;
+}
+
 static inline unsigned int mm_cid_size(void)
 {
 	return cpumask_size();
 }
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_cid(struct mm_struct *mm) { }
 static inline unsigned int mm_cid_size(void)
 {
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d654eb4cabd..675298d6eb36 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1314,7 +1314,10 @@ struct task_struct {
 
 #ifdef CONFIG_SCHED_MM_CID
 	int				mm_cid;		/* Current cid in mm */
+	int				last_mm_cid;	/* Most recent cid in mm */
+	int				migrate_from_cpu;
 	int				mm_cid_active;	/* Whether cid bitmap is active */
+	struct callback_head		cid_work;
 #endif
 
 	struct tlbflush_unmap_batch	tlb_ubc;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..f20fc0600fcc 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
 	atomic_inc(&mm->mm_count);
 }
 
+static inline void smp_mb__after_mmgrab(void)
+{
+	smp_mb__after_atomic();
+}
+
 extern void __mmdrop(struct mm_struct *mm);
 
 static inline void mmdrop(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..ad2ee22272a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm)
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
+	mm_destroy_cid(mm);
 
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_SCHED_MM_CID
 	tsk->mm_cid = -1;
+	tsk->last_mm_cid = -1;
 	tsk->mm_cid_active = 0;
+	tsk->migrate_from_cpu = -1;
 #endif
 	return tsk;
 
@@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (init_new_context(p, mm))
 		goto fail_nocontext;
 
+	if (mm_alloc_cid(mm))
+		goto fail_cid;
+
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
 			goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
-	mm_init_cid(mm);
 	return mm;
 
 fail_pcpu:
 	while (i > 0)
 		percpu_counter_destroy(&mm->rss_stat[--i]);
+	mm_destroy_cid(mm);
+fail_cid:
 fail_nocontext:
 	mm_free_pgd(mm);
 fail_nopgd:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5a97ceb37c13..898fa3bc2765 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2101,6 +2101,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_on_rq_migrating(p))
 		flags |= ENQUEUE_MIGRATED;
+	if (flags & ENQUEUE_MIGRATED)
+		sched_mm_cid_migrate_to(rq, p);
 
 	enqueue_task(rq, p, flags);
 
@@ -3210,6 +3212,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		rseq_migrate(p);
+		sched_mm_cid_migrate_from(p);
 		perf_event_task_migrate(p);
 	}
 
@@ -4483,6 +4486,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
 #endif
+	init_sched_mm_cid(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5129,7 +5133,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
-	switch_mm_cid(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
@@ -5285,6 +5288,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 *
 	 * kernel ->   user   switch + mmdrop() active
 	 *   user ->   user   switch
+	 *
+	 * switch_mm_cid() needs to be updated if the barriers provided
+	 * by context_switch() are modified.
 	 */
 	if (!next->mm) {                                // to kernel
 		enter_lazy_tlb(prev->active_mm, next);
@@ -5314,6 +5320,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		}
 	}
 
+	/* switch_mm_cid() requires the memory barriers above. */
+	switch_mm_cid(rq, prev, next);
+
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
 	prepare_lock_switch(rq, next, rf);
@@ -5602,6 +5611,7 @@ void scheduler_tick(void)
 		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
 	sched_core_tick(rq);
+	task_tick_mm_cid(rq, curr);
 
 	rq_unlock(rq, &rf);
 
@@ -11469,45 +11479,524 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
 }
 
 #ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_exit_signals(struct task_struct *t)
+
+/**
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/**
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic properties we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ *      X = Y = 0
+ *
+ *      w[X]=1          w[Y]=1
+ *      MB              MB
+ *      r[Y]=y          r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0                                      CPU1
+ *
+ * Context switch CS-1                       Remote-clear
+ *   - store to rq->curr: (N)->(Y) (TSA)     - cmpxchg to *pcpu_id to LAZY (TMA)
+ *                                             (implied barrier after cmpxchg)
+ *   - switch_mm_cid()
+ *     - memory barrier (see switch_mm_cid()
+ *       comment explaining how this barrier
+ *       is combined with other scheduler
+ *       barriers)
+ *     - mm_cid_get (next)
+ *       - READ_ONCE(*pcpu_cid)              - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+	t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+					  struct task_struct *t,
+					  struct mm_cid *src_pcpu_cid)
 {
 	struct mm_struct *mm = t->mm;
-	unsigned long flags;
+	struct task_struct *src_task;
+	int src_cid, last_mm_cid;
 
 	if (!mm)
+		return -1;
+
+	last_mm_cid = t->last_mm_cid;
+	/*
+	 * If the migrated task has no last cid, or if the current
+	 * task on src rq uses the cid, it means the source cid does not need
+	 * to be moved to the destination cpu.
+	 */
+	if (last_mm_cid == -1)
+		return -1;
+	src_cid = READ_ONCE(src_pcpu_cid->cid);
+	if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+		return -1;
+
+	/*
+	 * If we observe an active task using the mm on this rq, it means we
+	 * are not the last task to be migrated from this cpu for this mm, so
+	 * there is no need to move src_cid to the destination cpu.
+	 */
+	rcu_read_lock();
+	src_task = rcu_dereference(src_rq->curr);
+	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+		rcu_read_unlock();
+		t->last_mm_cid = -1;
+		return -1;
+	}
+	rcu_read_unlock();
+
+	return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+					      struct task_struct *t,
+					      struct mm_cid *src_pcpu_cid,
+					      int src_cid)
+{
+	struct task_struct *src_task;
+	struct mm_struct *mm = t->mm;
+	int lazy_cid;
+
+	if (src_cid == -1)
+		return -1;
+
+	/*
+	 * Attempt to clear the source cpu cid to move it to the destination
+	 * cpu.
+	 */
+	lazy_cid = mm_cid_set_lazy_put(src_cid);
+	if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+		return -1;
+
+	/*
+	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+	 * rq->curr->mm matches the scheduler barrier in context_switch()
+	 * between store to rq->curr and load of prev and next task's
+	 * per-mm/cpu cid.
+	 *
+	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+	 * rq->curr->mm_cid_active matches the barrier in
+	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+	 * load of per-mm/cpu cid.
+	 */
+
+	/*
+	 * If we observe an active task using the mm on this rq after setting
+	 * the lazy-put flag, this task will be responsible for transitioning
+	 * from lazy-put flag set to MM_CID_UNSET.
+	 */
+	rcu_read_lock();
+	src_task = rcu_dereference(src_rq->curr);
+	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+		rcu_read_unlock();
+		/*
+		 * We observed an active task for this mm, there is therefore
+		 * no point in moving this cid to the destination cpu.
+		 */
+		t->last_mm_cid = -1;
+		return -1;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * The src_cid is unused, so it can be unset.
+	 */
+	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+		return -1;
+	return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+	struct mm_struct *mm = t->mm;
+	int src_cid, dst_cid, src_cpu;
+	struct rq *src_rq;
+
+	lockdep_assert_rq_held(dst_rq);
+
+	if (!mm)
+		return;
+	src_cpu = t->migrate_from_cpu;
+	if (src_cpu == -1) {
+		t->last_mm_cid = -1;
+		return;
+	}
+	/*
+	 * Move the src cid if the dst cid is unset. This keeps id
+	 * allocation closest to 0 in cases where few threads migrate around
+	 * many cpus.
+	 *
+	 * If destination cid is already set, we may have to just clear
+	 * the src cid to ensure compactness in frequent migrations
+	 * scenarios.
+	 *
+	 * It is not useful to clear the src cid when the number of threads is
+	 * greater or equal to the number of allowed cpus, because user-space
+	 * can expect that the number of allowed cids can reach the number of
+	 * allowed cpus.
+	 */
+	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+	dst_cid = READ_ONCE(dst_pcpu_cid->cid);
+	if (!mm_cid_is_unset(dst_cid) &&
+	    atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+		return;
+	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+	src_rq = cpu_rq(src_cpu);
+	src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+	if (src_cid == -1)
+		return;
+	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+							    src_cid);
+	if (src_cid == -1)
+		return;
+	if (!mm_cid_is_unset(dst_cid)) {
+		__mm_cid_put(mm, src_cid);
+		return;
+	}
+	/* Move src_cid to dst cpu. */
+	mm_cid_snapshot_time(dst_rq, mm);
+	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+				      int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *t;
+	unsigned long flags;
+	int cid, lazy_cid;
+
+	cid = READ_ONCE(pcpu_cid->cid);
+	if (!mm_cid_is_valid(cid))
 		return;
+
+	/*
+	 * Clear the cpu cid if it is set to keep cid allocation compact.  If
+	 * there happens to be other tasks left on the source cpu using this
+	 * mm, the next task using this mm will reallocate its cid on context
+	 * switch.
+	 */
+	lazy_cid = mm_cid_set_lazy_put(cid);
+	if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+		return;
+
+	/*
+	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+	 * rq->curr->mm matches the scheduler barrier in context_switch()
+	 * between store to rq->curr and load of prev and next task's
+	 * per-mm/cpu cid.
+	 *
+	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+	 * rq->curr->mm_cid_active matches the barrier in
+	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+	 * load of per-mm/cpu cid.
+	 */
+
+	/*
+	 * If we observe an active task using the mm on this rq after setting
+	 * the lazy-put flag, that task will be responsible for transitioning
+	 * from lazy-put flag set to MM_CID_UNSET.
+	 */
+	rcu_read_lock();
+	t = rcu_dereference(rq->curr);
+	if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * The cid is unused, so it can be unset.
+	 * Disable interrupts to keep the window of cid ownership without rq
+	 * lock small.
+	 */
 	local_irq_save(flags);
-	mm_cid_put(mm, t->mm_cid);
-	t->mm_cid = -1;
-	t->mm_cid_active = 0;
+	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+		__mm_cid_put(mm, cid);
 	local_irq_restore(flags);
 }
 
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct mm_cid *pcpu_cid;
+	struct task_struct *curr;
+	u64 rq_clock;
+
+	/*
+	 * rq->clock load is racy on 32-bit but one spurious clear once in a
+	 * while is irrelevant.
+	 */
+	rq_clock = READ_ONCE(rq->clock);
+	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+	/*
+	 * In order to take care of infrequently scheduled tasks, bump the time
+	 * snapshot associated with this cid if an active task using the mm is
+	 * observed on this rq.
+	 */
+	rcu_read_lock();
+	curr = rcu_dereference(rq->curr);
+	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+		WRITE_ONCE(pcpu_cid->time, rq_clock);
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+
+	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+		return;
+	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+					     int weight)
+{
+	struct mm_cid *pcpu_cid;
+	int cid;
+
+	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+	cid = READ_ONCE(pcpu_cid->cid);
+	if (!mm_cid_is_valid(cid) || cid < weight)
+		return;
+	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void task_mm_cid_work(struct callback_head *work)
+{
+	unsigned long now = jiffies, old_scan, next_scan;
+	struct task_struct *t = current;
+	struct cpumask *cidmask;
+	struct mm_struct *mm;
+	int weight, cpu;
+
+	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+
+	work->next = work;	/* Prevent double-add */
+	if (t->flags & PF_EXITING)
+		return;
+	mm = t->mm;
+	if (!mm)
+		return;
+	old_scan = READ_ONCE(mm->mm_cid_next_scan);
+	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+	if (!old_scan) {
+		unsigned long res;
+
+		res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
+		if (res != old_scan)
+			old_scan = res;
+		else
+			old_scan = next_scan;
+	}
+	if (time_before(now, old_scan))
+		return;
+	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
+		return;
+	cidmask = mm_cidmask(mm);
+	/* Clear cids that were not recently used. */
+	for_each_possible_cpu(cpu)
+		sched_mm_cid_remote_clear_old(mm, cpu);
+	weight = cpumask_weight(cidmask);
+	/*
+	 * Clear cids that are greater or equal to the cidmask weight to
+	 * recompact it.
+	 */
+	for_each_possible_cpu(cpu)
+		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+}
+
+void init_sched_mm_cid(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	int mm_users = 0;
+
+	if (mm) {
+		mm_users = atomic_read(&mm->mm_users);
+		if (mm_users == 1)
+			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+	}
+	t->cid_work.next = &t->cid_work;	/* Protect against double add */
+	init_task_work(&t->cid_work, task_mm_cid_work);
+}
+
+void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->cid_work;
+	unsigned long now = jiffies;
+
+	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+	    work->next != work)
+		return;
+	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+		return;
+	task_work_add(curr, work, TWA_RESUME);
+}
+
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (!mm)
+		return;
+
+	preempt_disable();
+	rq = this_rq();
+	rq_lock_irqsave(rq, &rf);
+	preempt_enable_no_resched();	/* holding spinlock */
+	WRITE_ONCE(t->mm_cid_active, 0);
+	/*
+	 * Store t->mm_cid_active before loading per-mm/cpu cid.
+	 * Matches barrier in sched_mm_cid_remote_clear_old().
+	 */
+	smp_mb();
+	mm_cid_put(mm);
+	t->last_mm_cid = t->mm_cid = -1;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
 void sched_mm_cid_before_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	unsigned long flags;
+	struct rq_flags rf;
+	struct rq *rq;
 
 	if (!mm)
 		return;
-	local_irq_save(flags);
-	mm_cid_put(mm, t->mm_cid);
-	t->mm_cid = -1;
-	t->mm_cid_active = 0;
-	local_irq_restore(flags);
+
+	preempt_disable();
+	rq = this_rq();
+	rq_lock_irqsave(rq, &rf);
+	preempt_enable_no_resched();	/* holding spinlock */
+	WRITE_ONCE(t->mm_cid_active, 0);
+	/*
+	 * Store t->mm_cid_active before loading per-mm/cpu cid.
+	 * Matches barrier in sched_mm_cid_remote_clear_old().
+	 */
+	smp_mb();
+	mm_cid_put(mm);
+	t->last_mm_cid = t->mm_cid = -1;
+	rq_unlock_irqrestore(rq, &rf);
 }
 
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	unsigned long flags;
+	struct rq_flags rf;
+	struct rq *rq;
 
 	if (!mm)
 		return;
-	local_irq_save(flags);
-	t->mm_cid = mm_cid_get(mm);
-	t->mm_cid_active = 1;
-	local_irq_restore(flags);
+
+	preempt_disable();
+	rq = this_rq();
+	rq_lock_irqsave(rq, &rf);
+	preempt_enable_no_resched();	/* holding spinlock */
+	WRITE_ONCE(t->mm_cid_active, 1);
+	/*
+	 * Store t->mm_cid_active before loading per-mm/cpu cid.
+	 * Matches barrier in sched_mm_cid_remote_clear_old().
+	 */
+	smp_mb();
+	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+	rq_unlock_irqrestore(rq, &rf);
 	rseq_set_notify_resume(t);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 060616944d7a..ec7b3e0a2b20 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3253,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
 }
 
 #ifdef CONFIG_SCHED_MM_CID
-static inline int __mm_cid_get(struct mm_struct *mm)
+
+#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
+#define MM_CID_SCAN_DELAY	100			/* 100ms */
+
+extern raw_spinlock_t cid_lock;
+extern int use_cid_lock;
+
+extern void sched_mm_cid_migrate_from(struct task_struct *t);
+extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
+extern void init_sched_mm_cid(struct task_struct *t);
+
+static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+{
+	if (cid < 0)
+		return;
+	cpumask_clear_cpu(cid, mm_cidmask(mm));
+}
+
+/*
+ * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
+ * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
+ * be held to transition to other states.
+ *
+ * State transitions synchronized with cmpxchg or try_cmpxchg need to be
+ * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ */
+static inline void mm_cid_put_lazy(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	int cid;
+
+	lockdep_assert_irqs_disabled();
+	cid = __this_cpu_read(pcpu_cid->cid);
+	if (!mm_cid_is_lazy_put(cid) ||
+	    !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+		return;
+	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+{
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	int cid, res;
+
+	lockdep_assert_irqs_disabled();
+	cid = __this_cpu_read(pcpu_cid->cid);
+	for (;;) {
+		if (mm_cid_is_unset(cid))
+			return MM_CID_UNSET;
+		/*
+		 * Attempt transition from valid or lazy-put to unset.
+		 */
+		res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
+		if (res == cid)
+			break;
+		cid = res;
+	}
+	return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm)
+{
+	int cid;
+
+	lockdep_assert_irqs_disabled();
+	cid = mm_cid_pcpu_unset(mm);
+	if (cid == MM_CID_UNSET)
+		return;
+	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int __mm_cid_try_get(struct mm_struct *mm)
 {
 	struct cpumask *cpumask;
 	int cid;
 
 	cpumask = mm_cidmask(mm);
-	cid = cpumask_first_zero(cpumask);
-	if (cid >= nr_cpu_ids)
+	/*
+	 * Retry finding first zero bit if the mask is temporarily
+	 * filled. This only happens during concurrent remote-clear
+	 * which owns a cid without holding a rq lock.
+	 */
+	for (;;) {
+		cid = cpumask_first_zero(cpumask);
+		if (cid < nr_cpu_ids)
+			break;
+		cpu_relax();
+	}
+	if (cpumask_test_and_set_cpu(cid, cpumask))
 		return -1;
-	__cpumask_set_cpu(cid, cpumask);
 	return cid;
 }
 
-static inline void mm_cid_put(struct mm_struct *mm, int cid)
+/*
+ * Save a snapshot of the current runqueue time of this cpu
+ * with the per-cpu cid value, allowing to estimate how recently it was used.
+ */
+static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
 {
-	lockdep_assert_irqs_disabled();
-	if (cid < 0)
-		return;
-	raw_spin_lock(&mm->cid_lock);
-	__cpumask_clear_cpu(cid, mm_cidmask(mm));
-	raw_spin_unlock(&mm->cid_lock);
+	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+
+	lockdep_assert_rq_held(rq);
+	WRITE_ONCE(pcpu_cid->time, rq->clock);
 }
 
-static inline int mm_cid_get(struct mm_struct *mm)
+static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
 {
-	int ret;
+	int cid;
 
-	lockdep_assert_irqs_disabled();
-	raw_spin_lock(&mm->cid_lock);
-	ret = __mm_cid_get(mm);
-	raw_spin_unlock(&mm->cid_lock);
-	return ret;
+	/*
+	 * All allocations (even those using the cid_lock) are lock-free. If
+	 * use_cid_lock is set, hold the cid_lock to perform cid allocation to
+	 * guarantee forward progress.
+	 */
+	if (!READ_ONCE(use_cid_lock)) {
+		cid = __mm_cid_try_get(mm);
+		if (cid >= 0)
+			goto end;
+		raw_spin_lock(&cid_lock);
+	} else {
+		raw_spin_lock(&cid_lock);
+		cid = __mm_cid_try_get(mm);
+		if (cid >= 0)
+			goto unlock;
+	}
+
+	/*
+	 * cid concurrently allocated. Retry while forcing following
+	 * allocations to use the cid_lock to ensure forward progress.
+	 */
+	WRITE_ONCE(use_cid_lock, 1);
+	/*
+	 * Set use_cid_lock before allocation. Only care about program order
+	 * because this is only required for forward progress.
+	 */
+	barrier();
+	/*
+	 * Retry until it succeeds. It is guaranteed to eventually succeed once
+	 * all newcoming allocations observe the use_cid_lock flag set.
+	 */
+	do {
+		cid = __mm_cid_try_get(mm);
+		cpu_relax();
+	} while (cid < 0);
+	/*
+	 * Allocate before clearing use_cid_lock. Only care about
+	 * program order because this is for forward progress.
+	 */
+	barrier();
+	WRITE_ONCE(use_cid_lock, 0);
+unlock:
+	raw_spin_unlock(&cid_lock);
+end:
+	mm_cid_snapshot_time(rq, mm);
+	return cid;
 }
 
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
 {
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	struct cpumask *cpumask;
+	int cid;
+
+	lockdep_assert_rq_held(rq);
+	cpumask = mm_cidmask(mm);
+	cid = __this_cpu_read(pcpu_cid->cid);
+	if (mm_cid_is_valid(cid)) {
+		mm_cid_snapshot_time(rq, mm);
+		return cid;
+	}
+	if (mm_cid_is_lazy_put(cid)) {
+		if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+	}
+	cid = __mm_cid_get(rq, mm);
+	__this_cpu_write(pcpu_cid->cid, cid);
+	return cid;
+}
+
+static inline void switch_mm_cid(struct rq *rq,
+				 struct task_struct *prev,
+				 struct task_struct *next)
+{
+	/*
+	 * Provide a memory barrier between rq->curr store and load of
+	 * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
+	 *
+	 * Should be adapted if context_switch() is modified.
+	 */
+	if (!next->mm) {                                // to kernel
+		/*
+		 * user -> kernel transition does not guarantee a barrier, but
+		 * we can use the fact that it performs an atomic operation in
+		 * mmgrab().
+		 */
+		if (prev->mm)                           // from user
+			smp_mb__after_mmgrab();
+		/*
+		 * kernel -> kernel transition does not change rq->curr->mm
+		 * state. It stays NULL.
+		 */
+	} else {                                        // to user
+		/*
+		 * kernel -> user transition does not provide a barrier
+		 * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
+		 * Provide it here.
+		 */
+		if (!prev->mm)                          // from kernel
+			smp_mb();
+		/*
+		 * user -> user transition guarantees a memory barrier through
+		 * switch_mm() when current->mm changes. If current->mm is
+		 * unchanged, no barrier is needed.
+		 */
+	}
 	if (prev->mm_cid_active) {
-		if (next->mm_cid_active && next->mm == prev->mm) {
-			/*
-			 * Context switch between threads in same mm, hand over
-			 * the mm_cid from prev to next.
-			 */
-			next->mm_cid = prev->mm_cid;
-			prev->mm_cid = -1;
-			return;
-		}
-		mm_cid_put(prev->mm, prev->mm_cid);
+		mm_cid_snapshot_time(rq, prev->mm);
+		mm_cid_put_lazy(prev);
 		prev->mm_cid = -1;
 	}
 	if (next->mm_cid_active)
-		next->mm_cid = mm_cid_get(next->mm);
+		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
 }
 
 #else
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
+static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif
 
 #endif /* _KERNEL_SCHED_SCHED_H */
-- 
cgit v1.2.3


From f7abf14f0001a5a47539d9f60bbdca649e43536b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 17 Apr 2023 15:37:55 +0200
Subject: posix-cpu-timers: Implement the missing timer_wait_running callback

For some unknown reason the introduction of the timer_wait_running callback
missed to fixup posix CPU timers, which went unnoticed for almost four years.
Marco reported recently that the WARN_ON() in timer_wait_running()
triggers with a posix CPU timer test case.

Posix CPU timers have two execution models for expiring timers depending on
CONFIG_POSIX_CPU_TIMERS_TASK_WORK:

1) If not enabled, the expiry happens in hard interrupt context so
   spin waiting on the remote CPU is reasonably time bound.

   Implement an empty stub function for that case.

2) If enabled, the expiry happens in task work before returning to user
   space or guest mode. The expired timers are marked as firing and moved
   from the timer queue to a local list head with sighand lock held. Once
   the timers are moved, sighand lock is dropped and the expiry happens in
   fully preemptible context. That means the expiring task can be scheduled
   out, migrated, interrupted etc. So spin waiting on it is more than
   suboptimal.

   The timer wheel has a timer_wait_running() mechanism for RT, which uses
   a per CPU timer-base expiry lock which is held by the expiry code and the
   task waiting for the timer function to complete blocks on that lock.

   This does not work in the same way for posix CPU timers as there is no
   timer base and expiry for process wide timers can run on any task
   belonging to that process, but the concept of waiting on an expiry lock
   can be used too in a slightly different way:

    - Add a mutex to struct posix_cputimers_work. This struct is per task
      and used to schedule the expiry task work from the timer interrupt.

    - Add a task_struct pointer to struct cpu_timer which is used to store
      a the task which runs the expiry. That's filled in when the task
      moves the expired timers to the local expiry list. That's not
      affecting the size of the k_itimer union as there are bigger union
      members already

    - Let the task take the expiry mutex around the expiry function

    - Let the waiter acquire a task reference with rcu_read_lock() held and
      block on the expiry mutex

   This avoids spin-waiting on a task which might not even be on a CPU and
   works nicely for RT too.

Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT")
Reported-by: Marco Elver <elver@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marco Elver <elver@google.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/87zg764ojw.ffs@tglx
---
 include/linux/posix-timers.h   | 17 +++++----
 kernel/time/posix-cpu-timers.c | 81 ++++++++++++++++++++++++++++++++++--------
 kernel/time/posix-timers.c     |  4 +++
 3 files changed, 82 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 2c6e99ca48af..d607f51404fc 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
 
@@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk)
  * cpu_timer - Posix CPU timer representation for k_itimer
  * @node:	timerqueue node to queue in the task/sig
  * @head:	timerqueue head on which this timer is queued
- * @task:	Pointer to target task
+ * @pid:	Pointer to target task PID
  * @elist:	List head for the expiry list
  * @firing:	Timer is currently firing
+ * @handling:	Pointer to the task which handles expiry
  */
 struct cpu_timer {
-	struct timerqueue_node	node;
-	struct timerqueue_head	*head;
-	struct pid		*pid;
-	struct list_head	elist;
-	int			firing;
+	struct timerqueue_node		node;
+	struct timerqueue_head		*head;
+	struct pid			*pid;
+	struct list_head		elist;
+	int				firing;
+	struct task_struct __rcu	*handling;
 };
 
 static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
@@ -135,10 +138,12 @@ struct posix_cputimers {
 /**
  * posix_cputimers_work - Container for task work based posix CPU timer expiry
  * @work:	The task work to be scheduled
+ * @mutex:	Mutex held around expiry in context of this task work
  * @scheduled:  @work has been scheduled already, no further processing
  */
 struct posix_cputimers_work {
 	struct callback_head	work;
+	struct mutex		mutex;
 	unsigned int		scheduled;
 };
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
 			return expires;
 
 		ctmr->firing = 1;
+		/* See posix_cpu_timer_wait_running() */
+		rcu_assign_pointer(ctmr->handling, current);
 		cpu_timer_dequeue(ctmr);
 		list_add_tail(&ctmr->elist, firing);
 	}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
 static void posix_cpu_timers_work(struct callback_head *work)
 {
+	struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+	mutex_lock(&cw->mutex);
 	handle_posix_cpu_timers(current);
+	mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+	/* Has the handling task completed expiry already? */
+	if (!tsk)
+		return;
+
+	/* Ensure that the task cannot go away */
+	get_task_struct(tsk);
+	/* Now drop the RCU protection so the mutex can be locked */
+	rcu_read_unlock();
+	/* Wait on the expiry mutex */
+	mutex_lock(&tsk->posix_cputimers_work.mutex);
+	/* Release it immediately again. */
+	mutex_unlock(&tsk->posix_cputimers_work.mutex);
+	/* Drop the task reference. */
+	put_task_struct(tsk);
+	/* Relock RCU so the callsite is balanced */
+	rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+	/* Ensure that timr->it.cpu.handling task cannot go away */
+	rcu_read_lock();
+	spin_unlock_irq(&timr->it_lock);
+	posix_cpu_timer_wait_running(timr);
+	rcu_read_unlock();
+	/* @timr is on stack and is valid */
+	spin_lock_irq(&timr->it_lock);
 }
 
 /*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
 	       sizeof(p->posix_cputimers_work.work));
 	init_task_work(&p->posix_cputimers_work.work,
 		       posix_cpu_timers_work);
+	mutex_init(&p->posix_cputimers_work.mutex);
 	p->posix_cputimers_work.scheduled = false;
 }
 
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
 	lockdep_posixtimer_exit();
 }
 
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+	cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+	spin_unlock_irq(&timr->it_lock);
+	cpu_relax();
+	spin_lock_irq(&timr->it_lock);
+}
+
 static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
 {
 	return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
 		 */
 		if (likely(cpu_firing >= 0))
 			cpu_timer_fire(timer);
+		/* See posix_cpu_timer_wait_running() */
+		rcu_assign_pointer(timer->it.cpu.handling, NULL);
 		spin_unlock(&timer->it_lock);
 	}
 }
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		expires = cpu_timer_getexpires(&timer.it.cpu);
 		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
 		if (!error) {
-			/*
-			 * Timer is now unarmed, deletion can not fail.
-			 */
+			/* Timer is now unarmed, deletion can not fail. */
 			posix_cpu_timer_del(&timer);
+		} else {
+			while (error == TIMER_RETRY) {
+				posix_cpu_timer_wait_running_nsleep(&timer);
+				error = posix_cpu_timer_del(&timer);
+			}
 		}
-		spin_unlock_irq(&timer.it_lock);
 
-		while (error == TIMER_RETRY) {
-			/*
-			 * We need to handle case when timer was or is in the
-			 * middle of firing. In other cases we already freed
-			 * resources.
-			 */
-			spin_lock_irq(&timer.it_lock);
-			error = posix_cpu_timer_del(&timer);
-			spin_unlock_irq(&timer.it_lock);
-		}
+		spin_unlock_irq(&timer.it_lock);
 
 		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
 			/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
 	.timer_del		= posix_cpu_timer_del,
 	.timer_get		= posix_cpu_timer_get,
 	.timer_rearm		= posix_cpu_timer_rearm,
+	.timer_wait_running	= posix_cpu_timer_wait_running,
 };
 
 const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
 	rcu_read_lock();
 	unlock_timer(timer, *flags);
 
+	/*
+	 * kc->timer_wait_running() might drop RCU lock. So @timer
+	 * cannot be touched anymore after the function returns!
+	 */
 	if (!WARN_ON_ONCE(!kc->timer_wait_running))
 		kc->timer_wait_running(timer);
 
-- 
cgit v1.2.3


From 67a142dc9eb96a5cc018e5db62390665eb5f038c Mon Sep 17 00:00:00 2001
From: Krishna Yarlagadda <kyarlagadda@nvidia.com>
Date: Fri, 21 Apr 2023 14:43:07 +0530
Subject: spi: Add TPM HW flow flag

TPM specification [1] defines flow control over SPI. Client device can
insert a wait state on MISO when address is transmitted by controller
on MOSI. Detecting the wait state in software is only possible for
full duplex controllers. For controllers that support only half-
duplex, the wait state detection needs to be implemented in hardware.

Add a flag SPI_TPM_HW_FLOW for TPM device to set when software flow
control is not possible and hardware flow control is expected from
SPI controller.

Reference:
[1] https://trustedcomputinggroup.org/resource/pc-client-platform-tpm
-profile-ptp-specification/

Signed-off-by: Krishna Yarlagadda <kyarlagadda@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Link: https://lore.kernel.org/r/20230421091309.2672-2-kyarlagadda@nvidia.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 873ced6ae4ca..cfe42f8cd7a4 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -184,8 +184,18 @@ struct spi_device {
 	u8			chip_select;
 	u8			bits_per_word;
 	bool			rt;
-#define SPI_NO_TX	BIT(31)		/* No transmit wire */
-#define SPI_NO_RX	BIT(30)		/* No receive wire */
+#define SPI_NO_TX		BIT(31)		/* No transmit wire */
+#define SPI_NO_RX		BIT(30)		/* No receive wire */
+	/*
+	 * TPM specification defines flow control over SPI. Client device
+	 * can insert a wait state on MISO when address is transmitted by
+	 * controller on MOSI. Detecting the wait state in software is only
+	 * possible for full duplex controllers. For controllers that support
+	 * only half-duplex, the wait state detection needs to be implemented
+	 * in hardware. TPM devices would set this flag when hardware flow
+	 * control is expected from SPI controller.
+	 */
+#define SPI_TPM_HW_FLOW		BIT(29)		/* TPM HW flow control */
 	/*
 	 * All bits defined above should be covered by SPI_MODE_KERNEL_MASK.
 	 * The SPI_MODE_KERNEL_MASK has the SPI_MODE_USER_MASK counterpart,
@@ -195,7 +205,7 @@ struct spi_device {
 	 * These bits must not overlap. A static assert check should make sure of that.
 	 * If adding extra bits, make sure to decrease the bit index below as well.
 	 */
-#define SPI_MODE_KERNEL_MASK	(~(BIT(30) - 1))
+#define SPI_MODE_KERNEL_MASK	(~(BIT(29) - 1))
 	u32			mode;
 	int			irq;
 	void			*controller_state;
-- 
cgit v1.2.3


From e81995a81e25e8fab286db6539198dd97b140807 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:06 +0200
Subject: ALSA: emu10k1: clarify various fx8010.*_mask fields

extin_mask and extout_mask are used only by the SbLive! microcode, so
they have no effect on Audigy.

Eliminate fxbus_mask entirely, as it wasn't actually used for anything.

As a drive-by, remove the pointless pad1 field from struct
snd_emu10k1_fx8010 - it is not visible to user space, so it has no
binary compatibility constraints.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230421141006.1005509-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          | 6 ++----
 sound/pci/emu10k1/emu10k1_main.c | 7 ++++---
 sound/pci/emu10k1/emufx.c        | 9 ++++-----
 3 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 39787fecc8d9..3407ca4a1210 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1599,10 +1599,8 @@ struct snd_emu10k1_fx8010_pcm {
 };
 
 struct snd_emu10k1_fx8010 {
-	unsigned short fxbus_mask;	/* used FX buses (bitmask) */
-	unsigned short extin_mask;	/* used external inputs (bitmask) */
-	unsigned short extout_mask;	/* used external outputs (bitmask) */
-	unsigned short pad1;
+	unsigned short extin_mask;	/* used external inputs (bitmask); not used for Audigy */
+	unsigned short extout_mask;	/* used external outputs (bitmask); not used for Audigy */
 	unsigned int itram_size;	/* internal TRAM size in samples */
 	struct snd_dma_buffer etram_pages; /* external TRAM pages and size */
 	unsigned int dbg;		/* FX debugger register */
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 3880f359e688..65fd6b62bc9c 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -1901,11 +1901,12 @@ int snd_emu10k1_create(struct snd_card *card,
 
 	pci_set_master(pci);
 
-	emu->fx8010.fxbus_mask = 0x303f;
+	// The masks are not used for Audigy.
+	// FIXME: these should come from the card_capabilites table.
 	if (extin_mask == 0)
-		extin_mask = 0x3fcf;
+		extin_mask = 0x3fcf;  // EXTIN_*
 	if (extout_mask == 0)
-		extout_mask = 0x7fff;
+		extout_mask = 0x7fff;  // EXTOUT_*
 	emu->fx8010.extin_mask = extin_mask;
 	emu->fx8010.extout_mask = extout_mask;
 	emu->enable_ir = enable_ir;
diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c
index 6cf7c8b1de47..c74e66e03ae0 100644
--- a/sound/pci/emu10k1/emufx.c
+++ b/sound/pci/emu10k1/emufx.c
@@ -2523,7 +2523,7 @@ static void snd_emu10k1_fx8010_info(struct snd_emu10k1 *emu,
 				   struct snd_emu10k1_fx8010_info *info)
 {
 	const char * const *fxbus, * const *extin, * const *extout;
-	unsigned short fxbus_mask, extin_mask, extout_mask;
+	unsigned short extin_mask, extout_mask;
 	int res;
 
 	info->internal_tram_size = emu->fx8010.itram_size;
@@ -2531,11 +2531,10 @@ static void snd_emu10k1_fx8010_info(struct snd_emu10k1 *emu,
 	fxbus = fxbuses;
 	extin = emu->audigy ? audigy_ins : creative_ins;
 	extout = emu->audigy ? audigy_outs : creative_outs;
-	fxbus_mask = emu->fx8010.fxbus_mask;
-	extin_mask = emu->fx8010.extin_mask;
-	extout_mask = emu->fx8010.extout_mask;
+	extin_mask = emu->audigy ? ~0 : emu->fx8010.extin_mask;
+	extout_mask = emu->audigy ? ~0 : emu->fx8010.extout_mask;
 	for (res = 0; res < 16; res++, fxbus++, extin++, extout++) {
-		copy_string(info->fxbus_names[res], fxbus_mask & (1 << res) ? *fxbus : NULL, "FXBUS", res);
+		copy_string(info->fxbus_names[res], *fxbus, "FXBUS", res);
 		copy_string(info->extin_names[res], extin_mask & (1 << res) ? *extin : NULL, "Unused", res);
 		copy_string(info->extout_names[res], extout_mask & (1 << res) ? *extout : NULL, "Unused", res);
 	}
-- 
cgit v1.2.3


From f6c73a11133ef991284311387ca707b48bf53912 Mon Sep 17 00:00:00 2001
From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Date: Fri, 21 Apr 2023 08:52:42 -0700
Subject: fs.h: Add TRACE_IOCB_STRINGS for use in trace points

Add TRACE_IOCB_STRINGS macro which can be used in the trace point patch to
print different flag values with meaningful string output.

Tested-by: Disha Goel <disgoel@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: line up strings all prettylike]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/fs.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..75d24fa082e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -340,6 +340,20 @@ enum rw_hint {
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
 
+/* for use in trace events */
+#define TRACE_IOCB_STRINGS \
+	{ IOCB_HIPRI,		"HIPRI" }, \
+	{ IOCB_DSYNC,		"DSYNC" }, \
+	{ IOCB_SYNC,		"SYNC" }, \
+	{ IOCB_NOWAIT,		"NOWAIT" }, \
+	{ IOCB_APPEND,		"APPEND" }, \
+	{ IOCB_EVENTFD,		"EVENTFD"}, \
+	{ IOCB_DIRECT,		"DIRECT" }, \
+	{ IOCB_WRITE,		"WRITE" }, \
+	{ IOCB_WAITQ,		"WAITQ" }, \
+	{ IOCB_NOIO,		"NOIO" }, \
+	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }
+
 struct kiocb {
 	struct file		*ki_filp;
 	loff_t			ki_pos;
-- 
cgit v1.2.3


From d3bff1fc50d4fcaccddbd63917dd94172e80c40e Mon Sep 17 00:00:00 2001
From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Date: Fri, 21 Apr 2023 08:52:43 -0700
Subject: iomap: Remove IOMAP_DIO_NOSYNC unused dio flag

IOMAP_DIO_NOSYNC earlier was added for use in btrfs. But it seems for
aio dsync writes this is not useful anyway. For aio dsync case, we
we queue the request and return -EIOCBQUEUED. Now, since IOMAP_DIO_NOSYNC
doesn't let iomap_dio_complete() to call generic_write_sync(),
hence we may lose the sync write.

Hence kill this flag as it is not in use by any FS now.

Tested-by: Disha Goel <disgoel@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/direct-io.c  | 2 +-
 include/linux/iomap.h | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f771001574d0..36ab1152dbea 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -541,7 +541,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		/* for data sync or sync, we need sync completion processing */
-		if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
+		if (iocb_is_dsync(iocb)) {
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
 
 		       /*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0f8123504e5e..e2b836c2e119 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -377,12 +377,6 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_PARTIAL		(1 << 2)
 
-/*
- * The caller will sync the write if needed; do not sync it within
- * iomap_dio_rw.  Overrides IOMAP_DIO_FORCE_WAIT.
- */
-#define IOMAP_DIO_NOSYNC		(1 << 3)
-
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
-- 
cgit v1.2.3


From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:54 +0200
Subject: bpf: add bpf_link support for BPF_NETFILTER programs

Add bpf_link support skeleton.  To keep this reviewable, no bpf program
can be invoked yet, if a program is attached only a c-stub is called and
not the actual bpf program.

Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig.

Uapi example usage:
	union bpf_attr attr = { };

	attr.link_create.prog_fd = progfd;
	attr.link_create.attach_type = 0; /* unused */
	attr.link_create.netfilter.pf = PF_INET;
	attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN;
	attr.link_create.netfilter.priority = -128;

	err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr));

... this would attach progfd to ipv4:input hook.

Such hook gets removed automatically if the calling program exits.

BPF_NETFILTER program invocation is added in followup change.

NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it
allows to tell userspace which program is attached at the given hook
when user runs 'nft hook list' command rather than just the priority
and not-very-helpful 'this hook runs a bpf prog but I can't tell which
one'.

Will also be used to disallow registration of two bpf programs with
same priority in a followup patch.

v4: arm32 cmpxchg only supports 32bit operand
    s/prio/priority/
v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if
    more use cases pop up (arptables, ebtables, netdev ingress/egress etc).

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netfilter.h           |   1 +
 include/net/netfilter/nf_bpf_link.h |  10 +++
 include/uapi/linux/bpf.h            |  14 ++++
 kernel/bpf/syscall.c                |   6 ++
 net/netfilter/Kconfig               |   3 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/nf_bpf_link.c         | 159 ++++++++++++++++++++++++++++++++++++
 7 files changed, 194 insertions(+)
 create mode 100644 include/net/netfilter/nf_bpf_link.h
 create mode 100644 net/netfilter/nf_bpf_link.c

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index c8e03bcaecaa..0762444e3767 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv,
 enum nf_hook_ops_type {
 	NF_HOOK_OP_UNDEFINED,
 	NF_HOOK_OP_NF_TABLES,
+	NF_HOOK_OP_BPF,
 };
 
 struct nf_hook_ops {
diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h
new file mode 100644
index 000000000000..eeaeaf3d15de
--- /dev/null
+++ b/include/net/netfilter/nf_bpf_link.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+#else
+static inline int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+#endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4b20a7269bee..1bb11a6ee667 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_NETFILTER,
 };
 
 enum bpf_attach_type {
@@ -1050,6 +1051,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_PERF_EVENT = 7,
 	BPF_LINK_TYPE_KPROBE_MULTI = 8,
 	BPF_LINK_TYPE_STRUCT_OPS = 9,
+	BPF_LINK_TYPE_NETFILTER = 10,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1560,6 +1562,12 @@ union bpf_attr {
 				 */
 				__u64		cookie;
 			} tracing;
+			struct {
+				__u32		pf;
+				__u32		hooknum;
+				__s32		priority;
+				__u32		flags;
+			} netfilter;
 		};
 	} link_create;
 
@@ -6410,6 +6418,12 @@ struct bpf_link_info {
 		struct {
 			__u32 map_id;
 		} struct_ops;
+		struct {
+			__u32 pf;
+			__u32 hooknum;
+			__s32 priority;
+			__u32 flags;
+		} netfilter;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bcf1a1920ddd..14f39c1e573e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
+#include <net/netfilter/nf_bpf_link.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -2462,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
+	case BPF_PROG_TYPE_NETFILTER:
 		return true;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		/* always unpriv */
@@ -4588,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	switch (prog->type) {
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_NETFILTER:
 		break;
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
@@ -4654,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_NETFILTER:
+		ret = bpf_nf_link_attach(attr, prog);
+		break;
 #endif
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index d0bf630482c1..441d1f134110 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE
 config NETFILTER_FAMILY_ARP
 	bool
 
+config NETFILTER_BPF_LINK
+	def_bool BPF_SYSCALL
+
 config NETFILTER_NETLINK_HOOK
 	tristate "Netfilter base hook dump support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5ffef1cd6143..d4958e7e7631 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
 endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..efa4f3390742
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <uapi/linux/netfilter_ipv4.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
+				    const struct nf_hook_state *s)
+{
+	return NF_ACCEPT;
+}
+
+struct bpf_nf_link {
+	struct bpf_link link;
+	struct nf_hook_ops hook_ops;
+	struct net *net;
+	u32 dead;
+};
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	if (nf_link->dead)
+		return;
+
+	/* prevent hook-not-found warning splat from netfilter core when
+	 * .detach was already called
+	 */
+	if (!cmpxchg(&nf_link->dead, 0, 1))
+		nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+	bpf_nf_link_release(link);
+	return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+				  struct seq_file *seq)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+		   nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+		   nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+				      struct bpf_link_info *info)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	info->netfilter.pf = nf_link->hook_ops.pf;
+	info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+	info->netfilter.priority = nf_link->hook_ops.priority;
+	info->netfilter.flags = 0;
+
+	return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+			      struct bpf_prog *old_prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+	.release = bpf_nf_link_release,
+	.dealloc = bpf_nf_link_dealloc,
+	.detach = bpf_nf_link_detach,
+	.show_fdinfo = bpf_nf_link_show_info,
+	.fill_link_info = bpf_nf_link_fill_link_info,
+	.update_prog = bpf_nf_link_update,
+};
+
+static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
+{
+	switch (attr->link_create.netfilter.pf) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+		if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS)
+			return -EPROTO;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (attr->link_create.netfilter.flags)
+		return -EOPNOTSUPP;
+
+	/* make sure conntrack confirm is always last.
+	 *
+	 * In the future, if userspace can e.g. request defrag, then
+	 * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG"
+	 * should fail.
+	 */
+	switch (attr->link_create.netfilter.priority) {
+	case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */
+	case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */
+	}
+
+	return 0;
+}
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_link_primer link_primer;
+	struct bpf_nf_link *link;
+	int err;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	err = bpf_nf_check_pf_and_hooks(attr);
+	if (err)
+		return err;
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog);
+
+	link->hook_ops.hook = nf_hook_run_bpf;
+	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+	link->hook_ops.priv = prog;
+
+	link->hook_ops.pf = attr->link_create.netfilter.pf;
+	link->hook_ops.priority = attr->link_create.netfilter.priority;
+	link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+	link->net = net;
+	link->dead = false;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	err = nf_register_net_hook(net, &link->hook_ops);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
-- 
cgit v1.2.3


From fd9c663b9ad67dedfc9a3fd3429ddd3e83782b4d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:55 +0200
Subject: bpf: minimal support for programs hooked into netfilter framework

This adds minimal support for BPF_PROG_TYPE_NETFILTER bpf programs
that will be invoked via the NF_HOOK() points in the ip stack.

Invocation incurs an indirect call.  This is not a necessity: Its
possible to add 'DEFINE_BPF_DISPATCHER(nf_progs)' and handle the
program invocation with the same method already done for xdp progs.

This isn't done here to keep the size of this chunk down.

Verifier restricts verdicts to either DROP or ACCEPT.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-3-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h           |  4 +++
 include/net/netfilter/nf_bpf_link.h |  5 +++
 kernel/bpf/btf.c                    |  6 ++++
 kernel/bpf/verifier.c               |  3 ++
 net/core/filter.c                   |  1 +
 net/netfilter/nf_bpf_link.c         | 70 ++++++++++++++++++++++++++++++++++++-
 6 files changed, 88 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d4ee3ccd3753..39a999abb0ce 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -79,6 +79,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 #endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 	      void *, void *)
+#ifdef CONFIG_NETFILTER
+BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
+	      struct bpf_nf_ctx, struct bpf_nf_ctx)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h
index eeaeaf3d15de..6c984b0ea838 100644
--- a/include/net/netfilter/nf_bpf_link.h
+++ b/include/net/netfilter/nf_bpf_link.h
@@ -1,5 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+struct bpf_nf_ctx {
+	const struct nf_hook_state *state;
+	struct sk_buff *skb;
+};
+
 #if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)
 int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7db4ec125fbd..6b682b8e4b50 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,9 @@
 #include <linux/bsearch.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
 #include <net/sock.h>
 #include "../tools/lib/bpf/relo_core.h"
 
@@ -212,6 +215,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_SK_SKB,
 	BTF_KFUNC_HOOK_SOCKET_FILTER,
 	BTF_KFUNC_HOOK_LWT,
+	BTF_KFUNC_HOOK_NETFILTER,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7802,6 +7806,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		return BTF_KFUNC_HOOK_LWT;
+	case BPF_PROG_TYPE_NETFILTER:
+		return BTF_KFUNC_HOOK_NETFILTER;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1e05355facdc..fc7281d39e46 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13816,6 +13816,9 @@ static int check_return_code(struct bpf_verifier_env *env)
 		}
 		break;
 
+	case BPF_PROG_TYPE_NETFILTER:
+		range = tnum_range(NF_DROP, NF_ACCEPT);
+		break;
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
 		 * depends on the to-be-replaced kernel func or bpf program.
diff --git a/net/core/filter.c b/net/core/filter.c
index 44fb997434ad..d9ce04ca22ce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11717,6 +11717,7 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 }
 late_initcall(bpf_kfunc_init);
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index efa4f3390742..49cfc5215386 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf.h>
+#include <linux/filter.h>
 #include <linux/netfilter.h>
 
 #include <net/netfilter/nf_bpf_link.h>
@@ -8,7 +9,13 @@
 static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
 				    const struct nf_hook_state *s)
 {
-	return NF_ACCEPT;
+	const struct bpf_prog *prog = bpf_prog;
+	struct bpf_nf_ctx ctx = {
+		.state = s,
+		.skb = skb,
+	};
+
+	return bpf_prog_run(prog, &ctx);
 }
 
 struct bpf_nf_link {
@@ -157,3 +164,64 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	return bpf_link_settle(&link_primer);
 }
+
+const struct bpf_prog_ops netfilter_prog_ops = {
+};
+
+static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
+{
+	struct btf *btf;
+	s32 type_id;
+
+	btf = bpf_get_btf_vmlinux();
+	if (IS_ERR_OR_NULL(btf))
+		return false;
+
+	type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+	if (WARN_ON_ONCE(type_id < 0))
+		return false;
+
+	info->btf = btf;
+	info->btf_id = type_id;
+	info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+	return true;
+}
+
+static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
+			       const struct bpf_prog *prog,
+			       struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_nf_ctx))
+		return false;
+
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_nf_ctx, skb):
+		if (size != sizeof_field(struct bpf_nf_ctx, skb))
+			return false;
+
+		return nf_ptr_to_btf_id(info, "sk_buff");
+	case bpf_ctx_range(struct bpf_nf_ctx, state):
+		if (size != sizeof_field(struct bpf_nf_ctx, state))
+			return false;
+
+		return nf_ptr_to_btf_id(info, "nf_hook_state");
+	default:
+		return false;
+	}
+
+	return false;
+}
+
+static const struct bpf_func_proto *
+bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+const struct bpf_verifier_ops netfilter_verifier_ops = {
+	.is_valid_access	= nf_is_valid_access,
+	.get_func_proto		= bpf_nf_func_proto,
+};
-- 
cgit v1.2.3


From 506a74db7e019a277e987fa65654bdd953859d5b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:56 +0200
Subject: netfilter: nfnetlink hook: dump bpf prog id

This allows userspace ("nft list hooks") to show which bpf program
is attached to which hook.

Without this, user only knows bpf prog is attached at prio
x, y, z at INPUT and FORWARD, but can't tell which program is where.

v4: kdoc fixups (Simon Horman)

Link: https://lore.kernel.org/bpf/ZEELzpNCnYJuZyod@corigine.com/
Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-4-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/netfilter/nfnetlink_hook.h | 24 +++++++-
 net/netfilter/nfnetlink_hook.c                | 81 ++++++++++++++++++++++-----
 2 files changed, 89 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h
index bbcd285b22e1..84a561a74b98 100644
--- a/include/uapi/linux/netfilter/nfnetlink_hook.h
+++ b/include/uapi/linux/netfilter/nfnetlink_hook.h
@@ -32,8 +32,12 @@ enum nfnl_hook_attributes {
 /**
  * enum nfnl_hook_chain_info_attributes - chain description
  *
- * NFNLA_HOOK_INFO_DESC: nft chain and table name (enum nft_table_attributes) (NLA_NESTED)
- * NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32)
+ * @NFNLA_HOOK_INFO_DESC: nft chain and table name (NLA_NESTED)
+ * @NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32)
+ *
+ * NFNLA_HOOK_INFO_DESC depends on NFNLA_HOOK_INFO_TYPE value:
+ *   NFNL_HOOK_TYPE_NFTABLES: enum nft_table_attributes
+ *   NFNL_HOOK_TYPE_BPF: enum nfnl_hook_bpf_attributes
  */
 enum nfnl_hook_chain_info_attributes {
 	NFNLA_HOOK_INFO_UNSPEC,
@@ -55,10 +59,24 @@ enum nfnl_hook_chain_desc_attributes {
 /**
  * enum nfnl_hook_chaintype - chain type
  *
- * @NFNL_HOOK_TYPE_NFTABLES nf_tables base chain
+ * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain
+ * @NFNL_HOOK_TYPE_BPF: bpf program
  */
 enum nfnl_hook_chaintype {
 	NFNL_HOOK_TYPE_NFTABLES = 0x1,
+	NFNL_HOOK_TYPE_BPF,
+};
+
+/**
+ * enum nfnl_hook_bpf_attributes - bpf prog description
+ *
+ * @NFNLA_HOOK_BPF_ID: bpf program id (NLA_U32)
+ */
+enum nfnl_hook_bpf_attributes {
+	NFNLA_HOOK_BPF_UNSPEC,
+	NFNLA_HOOK_BPF_ID,
+	__NFNLA_HOOK_BPF_MAX,
 };
+#define NFNLA_HOOK_BPF_MAX (__NFNLA_HOOK_BPF_MAX - 1)
 
 #endif /* _NFNL_HOOK_H */
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index 8120aadf6a0f..ade8ee1988b1 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -5,6 +5,7 @@
  * Author: Florian Westphal <fw@strlen.de>
  */
 
+#include <linux/bpf.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kernel.h>
@@ -57,35 +58,76 @@ struct nfnl_dump_hook_data {
 	u8 hook;
 };
 
+static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t)
+{
+	struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+	int ret;
+
+	if (!nest)
+		return NULL;
+
+	ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t));
+	if (ret == 0)
+		return nest;
+
+	nla_nest_cancel(nlskb, nest);
+	return NULL;
+}
+
+static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb,
+				       const struct nfnl_dump_hook_data *ctx,
+				       unsigned int seq,
+				       const struct bpf_prog *prog)
+{
+	struct nlattr *nest, *nest2;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK))
+		return 0;
+
+	if (WARN_ON_ONCE(!prog))
+		return 0;
+
+	nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF);
+	if (!nest)
+		return -EMSGSIZE;
+
+	nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+	if (!nest2)
+		goto cancel_nest;
+
+	ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id));
+	if (ret)
+		goto cancel_nest;
+
+	nla_nest_end(nlskb, nest2);
+	nla_nest_end(nlskb, nest);
+	return 0;
+
+cancel_nest:
+	nla_nest_cancel(nlskb, nest);
+	return -EMSGSIZE;
+}
+
 static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
 					const struct nfnl_dump_hook_data *ctx,
 					unsigned int seq,
-					const struct nf_hook_ops *ops)
+					struct nft_chain *chain)
 {
 	struct net *net = sock_net(nlskb->sk);
 	struct nlattr *nest, *nest2;
-	struct nft_chain *chain;
 	int ret = 0;
 
-	if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES)
-		return 0;
-
-	chain = ops->priv;
 	if (WARN_ON_ONCE(!chain))
 		return 0;
 
 	if (!nft_is_active(net, chain))
 		return 0;
 
-	nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+	nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES);
 	if (!nest)
 		return -EMSGSIZE;
 
-	ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE,
-			   htonl(NFNL_HOOK_TYPE_NFTABLES));
-	if (ret)
-		goto cancel_nest;
-
 	nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
 	if (!nest2)
 		goto cancel_nest;
@@ -171,7 +213,20 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb,
 	if (ret)
 		goto nla_put_failure;
 
-	ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops);
+	switch (ops->hook_ops_type) {
+	case NF_HOOK_OP_NF_TABLES:
+		ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv);
+		break;
+	case NF_HOOK_OP_BPF:
+		ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
+		break;
+	case NF_HOOK_OP_UNDEFINED:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
 	if (ret)
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 2b99ef22e0d237e08bfc437e7d051f78f352aeb2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:59 +0200
Subject: bpf: add test_run support for netfilter program type

add glue code so a bpf program can be run using userspace-provided
netfilter state and packet/skb.

Default is to use ipv4:output hook point, but this can be overridden by
userspace.  Userspace provided netfilter state is restricted, only hook and
protocol families can be overridden and only to ipv4/ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-7-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h         |   3 +
 net/bpf/test_run.c          | 158 ++++++++++++++++++++++++++++++++++++++++++++
 net/netfilter/nf_bpf_link.c |   1 +
 3 files changed, 162 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 18b592fde896..e53ceee1df37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2264,6 +2264,9 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+			 const union bpf_attr *kattr,
+			 union bpf_attr __user *uattr);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f170e8a17974..e79e3a415ca9 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -19,7 +19,9 @@
 #include <linux/error-injection.h>
 #include <linux/smp.h>
 #include <linux/sock_diag.h>
+#include <linux/netfilter.h>
 #include <net/xdp.h>
+#include <net/netfilter/nf_bpf_link.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
@@ -1691,6 +1693,162 @@ out:
 	return err;
 }
 
+static int verify_and_copy_hook_state(struct nf_hook_state *state,
+				      const struct nf_hook_state *user,
+				      struct net_device *dev)
+{
+	if (user->in || user->out)
+		return -EINVAL;
+
+	if (user->net || user->sk || user->okfn)
+		return -EINVAL;
+
+	switch (user->pf) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+		switch (state->hook) {
+		case NF_INET_PRE_ROUTING:
+			state->in = dev;
+			break;
+		case NF_INET_LOCAL_IN:
+			state->in = dev;
+			break;
+		case NF_INET_FORWARD:
+			state->in = dev;
+			state->out = dev;
+			break;
+		case NF_INET_LOCAL_OUT:
+			state->out = dev;
+			break;
+		case NF_INET_POST_ROUTING:
+			state->out = dev;
+			break;
+		}
+
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	state->pf = user->pf;
+	state->hook = user->hook;
+
+	return 0;
+}
+
+static __be16 nfproto_eth(int nfproto)
+{
+	switch (nfproto) {
+	case NFPROTO_IPV4:
+		return htons(ETH_P_IP);
+	case NFPROTO_IPV6:
+		break;
+	}
+
+	return htons(ETH_P_IPV6);
+}
+
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+			 const union bpf_attr *kattr,
+			 union bpf_attr __user *uattr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct net_device *dev = net->loopback_dev;
+	struct nf_hook_state *user_ctx, hook_state = {
+		.pf = NFPROTO_IPV4,
+		.hook = NF_INET_LOCAL_OUT,
+	};
+	u32 size = kattr->test.data_size_in;
+	u32 repeat = kattr->test.repeat;
+	struct bpf_nf_ctx ctx = {
+		.state = &hook_state,
+	};
+	struct sk_buff *skb = NULL;
+	u32 retval, duration;
+	void *data;
+	int ret;
+
+	if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+		return -EINVAL;
+
+	if (size < sizeof(struct iphdr))
+		return -EINVAL;
+
+	data = bpf_test_init(kattr, kattr->test.data_size_in, size,
+			     NET_SKB_PAD + NET_IP_ALIGN,
+			     SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	if (!repeat)
+		repeat = 1;
+
+	user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state));
+	if (IS_ERR(user_ctx)) {
+		kfree(data);
+		return PTR_ERR(user_ctx);
+	}
+
+	if (user_ctx) {
+		ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev);
+		if (ret)
+			goto out;
+	}
+
+	skb = slab_build_skb(data);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	data = NULL; /* data released via kfree_skb */
+
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+	__skb_put(skb, size);
+
+	ret = -EINVAL;
+
+	if (hook_state.hook != NF_INET_LOCAL_OUT) {
+		if (size < ETH_HLEN + sizeof(struct iphdr))
+			goto out;
+
+		skb->protocol = eth_type_trans(skb, dev);
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			if (hook_state.pf == NFPROTO_IPV4)
+				break;
+			goto out;
+		case htons(ETH_P_IPV6):
+			if (size < ETH_HLEN + sizeof(struct ipv6hdr))
+				goto out;
+			if (hook_state.pf == NFPROTO_IPV6)
+				break;
+			goto out;
+		default:
+			ret = -EPROTO;
+			goto out;
+		}
+
+		skb_reset_network_header(skb);
+	} else {
+		skb->protocol = nfproto_eth(hook_state.pf);
+	}
+
+	ctx.skb = skb;
+
+	ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false);
+	if (ret)
+		goto out;
+
+	ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
+
+out:
+	kfree(user_ctx);
+	kfree_skb(skb);
+	kfree(data);
+	return ret;
+}
+
 static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
 	.owner = THIS_MODULE,
 	.set   = &test_sk_check_kfunc_ids,
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 49cfc5215386..c36da56d756f 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -166,6 +166,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 }
 
 const struct bpf_prog_ops netfilter_prog_ops = {
+	.test_run = bpf_prog_test_run_nf,
 };
 
 static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
-- 
cgit v1.2.3


From 8e41e0a575664d26bb87e012c39435c4c3914ed9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 21 Apr 2023 13:39:10 -0700
Subject: Revert "ACPICA: Events: Support fixed PCIe wake event"

This reverts commit 5c62d5aab8752e5ee7bfbe75ed6060db1c787f98.

This broke wake-on-lan for multiple people, and for much too long.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217069
Link: https://lore.kernel.org/all/754225a2-95a9-2c36-1886-7da1a78308c2@loongson.cn/
Link: https://github.com/acpica/acpica/pull/866
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Jianmin Lv <lvjianmin@loongson.cn>
Cc: Huacai Chen <chenhuacai@loongson.cn>
Cc: Bob Moore <robert.moore@intel.com>
Cc: stable@kernel.org # 6.2
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/acpica/evevent.c  | 11 -----------
 drivers/acpi/acpica/hwsleep.c  | 14 --------------
 drivers/acpi/acpica/utglobal.c |  4 ----
 include/acpi/actypes.h         |  3 +--
 4 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c
index 82d1728b9bc6..df596d46dd97 100644
--- a/drivers/acpi/acpica/evevent.c
+++ b/drivers/acpi/acpica/evevent.c
@@ -142,9 +142,6 @@ static acpi_status acpi_ev_fixed_event_initialize(void)
 			status =
 			    acpi_write_bit_register(acpi_gbl_fixed_event_info
 						    [i].enable_register_id,
-						    (i ==
-						     ACPI_EVENT_PCIE_WAKE) ?
-						    ACPI_ENABLE_EVENT :
 						    ACPI_DISABLE_EVENT);
 			if (ACPI_FAILURE(status)) {
 				return (status);
@@ -188,11 +185,6 @@ u32 acpi_ev_fixed_event_detect(void)
 		return (int_status);
 	}
 
-	if (fixed_enable & ACPI_BITMASK_PCIEXP_WAKE_DISABLE)
-		fixed_enable &= ~ACPI_BITMASK_PCIEXP_WAKE_DISABLE;
-	else
-		fixed_enable |= ACPI_BITMASK_PCIEXP_WAKE_DISABLE;
-
 	ACPI_DEBUG_PRINT((ACPI_DB_INTERRUPTS,
 			  "Fixed Event Block: Enable %08X Status %08X\n",
 			  fixed_enable, fixed_status));
@@ -258,9 +250,6 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event)
 	if (!acpi_gbl_fixed_event_handlers[event].handler) {
 		(void)acpi_write_bit_register(acpi_gbl_fixed_event_info[event].
 					      enable_register_id,
-					      (event ==
-					       ACPI_EVENT_PCIE_WAKE) ?
-					      ACPI_ENABLE_EVENT :
 					      ACPI_DISABLE_EVENT);
 
 		ACPI_ERROR((AE_INFO,
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index 37b3f641feaa..bd936476dda9 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -311,20 +311,6 @@ acpi_status acpi_hw_legacy_wake(u8 sleep_state)
 				    [ACPI_EVENT_SLEEP_BUTTON].
 				    status_register_id, ACPI_CLEAR_STATUS);
 
-	/* Enable pcie wake event if support */
-	if ((acpi_gbl_FADT.flags & ACPI_FADT_PCI_EXPRESS_WAKE)) {
-		(void)
-		    acpi_write_bit_register(acpi_gbl_fixed_event_info
-					    [ACPI_EVENT_PCIE_WAKE].
-					    enable_register_id,
-					    ACPI_DISABLE_EVENT);
-		(void)
-		    acpi_write_bit_register(acpi_gbl_fixed_event_info
-					    [ACPI_EVENT_PCIE_WAKE].
-					    status_register_id,
-					    ACPI_CLEAR_STATUS);
-	}
-
 	acpi_hw_execute_sleep_method(METHOD_PATHNAME__SST, ACPI_SST_WORKING);
 	return_ACPI_STATUS(status);
 }
diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c
index 53afa5edb6ec..cda6e16dddf7 100644
--- a/drivers/acpi/acpica/utglobal.c
+++ b/drivers/acpi/acpica/utglobal.c
@@ -186,10 +186,6 @@ struct acpi_fixed_event_info acpi_gbl_fixed_event_info[ACPI_NUM_FIXED_EVENTS] =
 					ACPI_BITREG_RT_CLOCK_ENABLE,
 					ACPI_BITMASK_RT_CLOCK_STATUS,
 					ACPI_BITMASK_RT_CLOCK_ENABLE},
-	/* ACPI_EVENT_PCIE_WAKE     */ {ACPI_BITREG_PCIEXP_WAKE_STATUS,
-					ACPI_BITREG_PCIEXP_WAKE_DISABLE,
-					ACPI_BITMASK_PCIEXP_WAKE_STATUS,
-					ACPI_BITMASK_PCIEXP_WAKE_DISABLE},
 };
 #endif				/* !ACPI_REDUCED_HARDWARE */
 
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 95e4f56f9754..1b4f81f1ac5d 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -723,8 +723,7 @@ typedef u32 acpi_event_type;
 #define ACPI_EVENT_POWER_BUTTON         2
 #define ACPI_EVENT_SLEEP_BUTTON         3
 #define ACPI_EVENT_RTC                  4
-#define ACPI_EVENT_PCIE_WAKE            5
-#define ACPI_EVENT_MAX                  5
+#define ACPI_EVENT_MAX                  4
 #define ACPI_NUM_FIXED_EVENTS           ACPI_EVENT_MAX + 1
 
 /*
-- 
cgit v1.2.3


From 465e5e6a1698f3325f5d2262d2875779b06b50c7 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Mon, 17 Apr 2023 14:36:15 +0200
Subject: fs/buffer: add folio_set_bh helper

Patch series "convert create_page_buffers to folio_create_buffers".

One of the first kernel panic we hit when we try to increase the block
size > 4k is inside create_page_buffers()[1].  Even though buffer.c
function do not support large folios (folios > PAGE_SIZE) at the moment,
these changes are required when we want to remove that constraint.


This patch (of 4):

The folio version of set_bh_page().  This is required to convert
create_page_buffers() to folio_create_buffers() later in the series.

Link: https://lkml.kernel.org/r/20230417123618.22094-1-p.raghav@samsung.com
Link: https://lkml.kernel.org/r/20230417123618.22094-2-p.raghav@samsung.com
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 15 +++++++++++++++
 include/linux/buffer_head.h |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 9e1e2add541e..009d950fbf42 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1485,6 +1485,21 @@ void set_bh_page(struct buffer_head *bh,
 }
 EXPORT_SYMBOL(set_bh_page);
 
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+		  unsigned long offset)
+{
+	bh->b_folio = folio;
+	BUG_ON(offset >= folio_size(folio));
+	if (folio_test_highmem(folio))
+		/*
+		 * This catches illegal uses and preserves the offset:
+		 */
+		bh->b_data = (char *)(0 + offset);
+	else
+		bh->b_data = folio_address(folio) + offset;
+}
+EXPORT_SYMBOL(folio_set_bh);
+
 /*
  * Called when truncating a buffer on a page completely.
  */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 8f14dca5fed7..7e92d23f4782 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -196,6 +196,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh);
 void touch_buffer(struct buffer_head *bh);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+		  unsigned long offset);
 bool try_to_free_buffers(struct folio *);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
-- 
cgit v1.2.3


From c71124a8afa441af203d2001a92c91f114446687 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Mon, 17 Apr 2023 14:36:16 +0200
Subject: buffer: add folio_alloc_buffers() helper

Folio version of alloc_page_buffers() helper.  This is required to convert
create_page_buffers() to folio_create_buffers() later in the series.

alloc_page_buffers() has been modified to call folio_alloc_buffers() which
adds one call to compound_head() but folio_alloc_buffers() removes one
call to compound_head() compared to the existing alloc_page_buffers()
implementation.

Link: https://lkml.kernel.org/r/20230417123618.22094-3-p.raghav@samsung.com
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 23 +++++++++++++++--------
 include/linux/buffer_head.h |  2 ++
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 009d950fbf42..bbd4aa670262 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -843,7 +843,7 @@ int remove_inode_buffers(struct inode *inode)
 }
 
 /*
- * Create the appropriate buffers when given a page for data area and
+ * Create the appropriate buffers when given a folio for data area and
  * the size of each buffer.. Use the bh->b_this_page linked list to
  * follow the buffers created.  Return NULL if unable to create more
  * buffers.
@@ -851,8 +851,8 @@ int remove_inode_buffers(struct inode *inode)
  * The retry flag is used to differentiate async IO (paging, swapping)
  * which may not fail from ordinary buffer allocations.
  */
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-		bool retry)
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+					bool retry)
 {
 	struct buffer_head *bh, *head;
 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
@@ -862,12 +862,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
-	/* The page lock pins the memcg */
-	memcg = page_memcg(page);
+	/* The folio lock pins the memcg */
+	memcg = folio_memcg(folio);
 	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
-	offset = PAGE_SIZE;
+	offset = folio_size(folio);
 	while ((offset -= size) >= 0) {
 		bh = alloc_buffer_head(gfp);
 		if (!bh)
@@ -879,8 +879,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 
 		bh->b_size = size;
 
-		/* Link the buffer to its page */
-		set_bh_page(bh, page, offset);
+		/* Link the buffer to its folio */
+		folio_set_bh(bh, folio, offset);
 	}
 out:
 	set_active_memcg(old_memcg);
@@ -899,6 +899,13 @@ no_grow:
 
 	goto out;
 }
+EXPORT_SYMBOL_GPL(folio_alloc_buffers);
+
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
+				       bool retry)
+{
+	return folio_alloc_buffers(page_folio(page), size, retry);
+}
 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 
 static inline void
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7e92d23f4782..0b14eab41bd1 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -199,6 +199,8 @@ void set_bh_page(struct buffer_head *bh,
 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 bool try_to_free_buffers(struct folio *);
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+					bool retry);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
-- 
cgit v1.2.3


From 8e2e17560bed73c2a73c94cbe378719f6cf850ee Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Mon, 17 Apr 2023 14:36:17 +0200
Subject: fs/buffer: add folio_create_empty_buffers helper

Folio version of create_empty_buffers().  This is required to convert
create_page_buffers() to folio_create_buffers() later in the series.

It removes several calls to compound_head() as it works directly on folio
compared to create_empty_buffers().  Hence, create_empty_buffers() has
been modified to call folio_create_empty_buffers().

Link: https://lkml.kernel.org/r/20230417123618.22094-4-p.raghav@samsung.com
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 28 +++++++++++++++++-----------
 include/linux/buffer_head.h |  2 ++
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index bbd4aa670262..e6266bf7eeea 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1594,18 +1594,17 @@ out:
 }
 EXPORT_SYMBOL(block_invalidate_folio);
 
-
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * block_dirty_folio() via private_lock.  try_to_free_buffers
- * is already excluded via the page lock.
+ * is already excluded via the folio lock.
  */
-void create_empty_buffers(struct page *page,
-			unsigned long blocksize, unsigned long b_state)
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+				unsigned long b_state)
 {
 	struct buffer_head *bh, *head, *tail;
 
-	head = alloc_page_buffers(page, blocksize, true);
+	head = folio_alloc_buffers(folio, blocksize, true);
 	bh = head;
 	do {
 		bh->b_state |= b_state;
@@ -1614,19 +1613,26 @@ void create_empty_buffers(struct page *page,
 	} while (bh);
 	tail->b_this_page = head;
 
-	spin_lock(&page->mapping->private_lock);
-	if (PageUptodate(page) || PageDirty(page)) {
+	spin_lock(&folio->mapping->private_lock);
+	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
 		bh = head;
 		do {
-			if (PageDirty(page))
+			if (folio_test_dirty(folio))
 				set_buffer_dirty(bh);
-			if (PageUptodate(page))
+			if (folio_test_uptodate(folio))
 				set_buffer_uptodate(bh);
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
-	attach_page_private(page, head);
-	spin_unlock(&page->mapping->private_lock);
+	folio_attach_private(folio, head);
+	spin_unlock(&folio->mapping->private_lock);
+}
+EXPORT_SYMBOL(folio_create_empty_buffers);
+
+void create_empty_buffers(struct page *page,
+			unsigned long blocksize, unsigned long b_state)
+{
+	folio_create_empty_buffers(page_folio(page), blocksize, b_state);
 }
 EXPORT_SYMBOL(create_empty_buffers);
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 0b14eab41bd1..1520793c72da 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,6 +205,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+				unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_async_write(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From 4bf4f155bfbc77a12ad2c9e12d9f57718adb9a5c Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 17 Apr 2023 19:48:07 +0800
Subject: mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()

Both of them change the arg from page_list to folio_list when convert them
to use a folio, but not the declaration, let's correct it, also move the
reclaim_pages() from swap.h to internal.h as it only used in mm.

Link: https://lkml.kernel.org/r/20230417114807.186786-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviwed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 1 -
 mm/internal.h        | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7f7d5b9ddf7e..3c69cb653cb9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -442,7 +442,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
 
-extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/mm/internal.h b/mm/internal.h
index 6483db57a31f..68410c6d97ac 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -783,8 +783,9 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
+unsigned long reclaim_pages(struct list_head *folio_list);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
-					    struct list_head *page_list);
+					    struct list_head *folio_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
 #define ALLOC_WMARK_MIN		WMARK_MIN
 #define ALLOC_WMARK_LOW		WMARK_LOW
-- 
cgit v1.2.3


From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Mon, 17 Apr 2023 22:13:40 -0700
Subject: mm: add new api to enable ksm per process

Patch series "mm: process/cgroup ksm support", v9.

So far KSM can only be enabled by calling madvise for memory regions.  To
be able to use KSM for more workloads, KSM needs to have the ability to be
enabled / disabled at the process / cgroup level.

Use case 1:
  The madvise call is not available in the programming language.  An
  example for this are programs with forked workloads using a garbage
  collected language without pointers.  In such a language madvise cannot
  be made available.

  In addition the addresses of objects get moved around as they are
  garbage collected.  KSM sharing needs to be enabled "from the outside"
  for these type of workloads.

Use case 2:
  The same interpreter can also be used for workloads where KSM brings
  no benefit or even has overhead.  We'd like to be able to enable KSM on
  a workload by workload basis.

Use case 3:
  With the madvise call sharing opportunities are only enabled for the
  current process: it is a workload-local decision.  A considerable number
  of sharing opportunities may exist across multiple workloads or jobs (if
  they are part of the same security domain).  Only a higler level entity
  like a job scheduler or container can know for certain if its running
  one or more instances of a job.  That job scheduler however doesn't have
  the necessary internal workload knowledge to make targeted madvise
  calls.

Security concerns:

  In previous discussions security concerns have been brought up.  The
  problem is that an individual workload does not have the knowledge about
  what else is running on a machine.  Therefore it has to be very
  conservative in what memory areas can be shared or not.  However, if the
  system is dedicated to running multiple jobs within the same security
  domain, its the job scheduler that has the knowledge that sharing can be
  safely enabled and is even desirable.

Performance:

  Experiments with using UKSM have shown a capacity increase of around 20%.

  Here are the metrics from an instagram workload (taken from a machine
  with 64GB main memory):

   full_scans: 445
   general_profit: 20158298048
   max_page_sharing: 256
   merge_across_nodes: 1
   pages_shared: 129547
   pages_sharing: 5119146
   pages_to_scan: 4000
   pages_unshared: 1760924
   pages_volatile: 10761341
   run: 1
   sleep_millisecs: 20
   stable_node_chains: 167
   stable_node_chains_prune_millisecs: 2000
   stable_node_dups: 2751
   use_zero_pages: 0
   zero_pages_sharing: 0

After the service is running for 30 minutes to an hour, 4 to 5 million
shared pages are common for this workload when using KSM.


Detailed changes:

1. New options for prctl system command
   This patch series adds two new options to the prctl system call.
   The first one allows to enable KSM at the process level and the second
   one to query the setting.

The setting will be inherited by child processes.

With the above setting, KSM can be enabled for the seed process of a cgroup
and all processes in the cgroup will inherit the setting.

2. Changes to KSM processing
   When KSM is enabled at the process level, the KSM code will iterate
   over all the VMA's and enable KSM for the eligible VMA's.

   When forking a process that has KSM enabled, the setting will be
   inherited by the new child process.

3. Add general_profit metric
   The general_profit metric of KSM is specified in the documentation,
   but not calculated.  This adds the general profit metric to
   /sys/kernel/debug/mm/ksm.

4. Add more metrics to ksm_stat
   This adds the process profit metric to /proc/<pid>/ksm_stat.

5. Add more tests to ksm_tests and ksm_functional_tests
   This adds an option to specify the merge type to the ksm_tests.
   This allows to test madvise and prctl KSM.

   It also adds a two new tests to ksm_functional_tests: one to test
   the new prctl options and the other one is a fork test to verify that
   the KSM process setting is inherited by client processes.


This patch (of 3):

So far KSM can only be enabled by calling madvise for memory regions.  To
be able to use KSM for more workloads, KSM needs to have the ability to be
enabled / disabled at the process / cgroup level.

1. New options for prctl system command

   This patch series adds two new options to the prctl system call.
   The first one allows to enable KSM at the process level and the second
   one to query the setting.

   The setting will be inherited by child processes.

   With the above setting, KSM can be enabled for the seed process of a
   cgroup and all processes in the cgroup will inherit the setting.

2. Changes to KSM processing

   When KSM is enabled at the process level, the KSM code will iterate
   over all the VMA's and enable KSM for the eligible VMA's.

   When forking a process that has KSM enabled, the setting will be
   inherited by the new child process.

  1) Introduce new MMF_VM_MERGE_ANY flag

     This introduces the new flag MMF_VM_MERGE_ANY flag.  When this flag
     is set, kernel samepage merging (ksm) gets enabled for all vma's of a
     process.

  2) Setting VM_MERGEABLE on VMA creation

     When a VMA is created, if the MMF_VM_MERGE_ANY flag is set, the
     VM_MERGEABLE flag will be set for this VMA.

  3) support disabling of ksm for a process

     This adds the ability to disable ksm for a process if ksm has been
     enabled for the process with prctl.

  4) add new prctl option to get and set ksm for a process

     This adds two new options to the prctl system call
     - enable ksm for all vmas of a process (if the vmas support it).
     - query if ksm has been enabled for a process.

3. Disabling MMF_VM_MERGE_ANY for storage keys in s390

   In the s390 architecture when storage keys are used, the
   MMF_VM_MERGE_ANY will be disabled.

Link: https://lkml.kernel.org/r/20230418051342.1919757-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230418051342.1919757-2-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/gmap.c            |   7 +++
 include/linux/ksm.h            |  21 ++++++++-
 include/linux/sched/coredump.h |   1 +
 include/uapi/linux/prctl.h     |   2 +
 kernel/sys.c                   |  27 +++++++++++
 mm/ksm.c                       | 104 ++++++++++++++++++++++++++++++++++-------
 mm/mmap.c                      |   3 ++
 7 files changed, 146 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 5a716bdcba05..0949811761e6 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2591,6 +2591,13 @@ int gmap_mark_unmergeable(void)
 	int ret;
 	VMA_ITERATOR(vmi, mm, 0);
 
+	/*
+	 * Make sure to disable KSM (if enabled for the whole process or
+	 * individual VMAs). Note that nothing currently hinders user space
+	 * from re-enabling it.
+	 */
+	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+
 	for_each_vma(vmi, vma) {
 		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
 		vm_flags = vma->vm_flags;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index d9e701326a88..4647b0c70c12 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,13 +18,26 @@
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
+
+void ksm_add_vma(struct vm_area_struct *vma);
+int ksm_enable_merge_any(struct mm_struct *mm);
+
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-		return __ksm_enter(mm);
+	int ret;
+
+	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) {
+		ret = __ksm_enter(mm);
+		if (ret)
+			return ret;
+	}
+
+	if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
+		set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+
 	return 0;
 }
 
@@ -57,6 +70,10 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 #endif
 #else  /* !CONFIG_KSM */
 
+static inline void ksm_add_vma(struct vm_area_struct *vma)
+{
+}
+
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	return 0;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0e17ae7fbfd3..0ee96ea7a0e9 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
+#define MMF_VM_MERGE_ANY	29
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b99c0be72577..f23d9a16507f 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -292,4 +292,6 @@ struct prctl_mm_map {
 
 #define PR_GET_AUXV			0x41555856
 
+#define PR_SET_MEMORY_MERGE		67
+#define PR_GET_MEMORY_MERGE		68
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 26c1399e0654..72cdb16e2636 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -15,6 +15,7 @@
 #include <linux/highuid.h>
 #include <linux/fs.h>
 #include <linux/kmod.h>
+#include <linux/ksm.h>
 #include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
@@ -2687,6 +2688,32 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
+#ifdef CONFIG_KSM
+	case PR_SET_MEMORY_MERGE:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_write_lock_killable(me->mm))
+			return -EINTR;
+
+		if (arg2) {
+			error = ksm_enable_merge_any(me->mm);
+		} else {
+			/*
+			 * TODO: we might want disable KSM on all VMAs and
+			 * trigger unsharing to completely disable KSM.
+			 */
+			clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+			error = 0;
+		}
+		mmap_write_unlock(me->mm);
+		break;
+	case PR_GET_MEMORY_MERGE:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+		break;
+#endif
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/ksm.c b/mm/ksm.c
index 37c63310bc4e..35ac6c741572 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -515,6 +515,28 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 }
 
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE   | VM_PFNMAP  |
+			     VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
+			     VM_MIXEDMAP))
+		return false;		/* just ignore the advice */
+
+	if (vma_is_dax(vma))
+		return false;
+
+#ifdef VM_SAO
+	if (vma->vm_flags & VM_SAO)
+		return false;
+#endif
+#ifdef VM_SPARC_ADI
+	if (vma->vm_flags & VM_SPARC_ADI)
+		return false;
+#endif
+
+	return true;
+}
+
 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
 		unsigned long addr)
 {
@@ -1026,6 +1048,7 @@ mm_exiting:
 
 			mm_slot_free(mm_slot_cache, mm_slot);
 			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
 			mmdrop(mm);
 		} else
 			spin_unlock(&ksm_mmlist_lock);
@@ -2408,6 +2431,7 @@ no_vmas:
 
 		mm_slot_free(mm_slot_cache, mm_slot);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
 		mmap_read_unlock(mm);
 		mmdrop(mm);
 	} else {
@@ -2485,6 +2509,66 @@ static int ksm_scan_thread(void *nothing)
 	return 0;
 }
 
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+	unsigned long vm_flags = vma->vm_flags;
+
+	if (vm_flags & VM_MERGEABLE)
+		return;
+
+	if (vma_ksm_compatible(vma))
+		vm_flags_set(vma, VM_MERGEABLE);
+}
+
+/**
+ * ksm_add_vma - Mark vma as mergeable if compatible
+ *
+ * @vma:  Pointer to vma
+ */
+void ksm_add_vma(struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		__ksm_add_vma(vma);
+}
+
+static void ksm_add_vmas(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, 0);
+	for_each_vma(vmi, vma)
+		__ksm_add_vma(vma);
+}
+
+/**
+ * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
+ *                        compatible VMA's
+ *
+ * @mm:  Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_enable_merge_any(struct mm_struct *mm)
+{
+	int err;
+
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return 0;
+
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+		err = __ksm_enter(mm);
+		if (err)
+			return err;
+	}
+
+	set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+	ksm_add_vmas(mm);
+
+	return 0;
+}
+
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
@@ -2493,25 +2577,10 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 
 	switch (advice) {
 	case MADV_MERGEABLE:
-		/*
-		 * Be somewhat over-protective for now!
-		 */
-		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
-				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_MIXEDMAP))
-			return 0;		/* just ignore the advice */
-
-		if (vma_is_dax(vma))
+		if (vma->vm_flags & VM_MERGEABLE)
 			return 0;
-
-#ifdef VM_SAO
-		if (*vm_flags & VM_SAO)
+		if (!vma_ksm_compatible(vma))
 			return 0;
-#endif
-#ifdef VM_SPARC_ADI
-		if (*vm_flags & VM_SPARC_ADI)
-			return 0;
-#endif
 
 		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
 			err = __ksm_enter(mm);
@@ -2615,6 +2684,7 @@ void __ksm_exit(struct mm_struct *mm)
 
 	if (easy_to_free) {
 		mm_slot_free(mm_slot_cache, mm_slot);
+		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
 		mmdrop(mm);
 	} else if (mm_slot) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 790cc62c0038..51b6976fd525 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -46,6 +46,7 @@
 #include <linux/pkeys.h>
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
+#include <linux/ksm.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -2729,6 +2730,7 @@ unmap_writable:
 	if (file && vm_flags & VM_SHARED)
 		mapping_unmap_writable(file->f_mapping);
 	file = vma->vm_file;
+	ksm_add_vma(vma);
 expanded:
 	perf_event_mmap(vma);
 
@@ -3001,6 +3003,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		goto mas_store_fail;
 
 	mm->map_count++;
+	ksm_add_vma(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
-- 
cgit v1.2.3


From d21077fbc2fc987c2e593c34dc3b4d84e546dc9f Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Mon, 17 Apr 2023 22:13:41 -0700
Subject: mm: add new KSM process and sysfs knobs

This adds the general_profit KSM sysfs knob and the process profit metric
knobs to ksm_stat.

1) expose general_profit metric

   The documentation mentions a general profit metric, however this
   metric is not calculated.  In addition the formula depends on the size
   of internal structures, which makes it more difficult for an
   administrator to make the calculation.  Adding the metric for a better
   user experience.

2) document general_profit sysfs knob

3) calculate ksm process profit metric

   The ksm documentation mentions the process profit metric and how to
   calculate it.  This adds the calculation of the metric.

4) mm: expose ksm process profit metric in ksm_stat

   This exposes the ksm process profit metric in /proc/<pid>/ksm_stat.
   The documentation mentions the formula for the ksm process profit
   metric, however it does not calculate it.  In addition the formula
   depends on the size of internal structures.  So it makes sense to
   expose it.

5) document new procfs ksm knobs

Link: https://lkml.kernel.org/r/20230418051342.1919757-3-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-kernel-mm-ksm |  8 ++++++++
 Documentation/admin-guide/mm/ksm.rst          |  5 ++++-
 fs/proc/base.c                                |  3 +++
 include/linux/ksm.h                           |  5 +++++
 mm/ksm.c                                      | 21 +++++++++++++++++++++
 5 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index d244674a9480..6041a025b65a 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -51,3 +51,11 @@ Description:	Control merging pages across different NUMA nodes.
 
 		When it is set to 0 only pages from the same node are merged,
 		otherwise pages from all nodes can be merged together (default).
+
+What:		/sys/kernel/mm/ksm/general_profit
+Date:		April 2023
+KernelVersion:  6.4
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Measure how effective KSM is.
+		general_profit: how effective is KSM. The formula for the
+		calculation is in Documentation/admin-guide/mm/ksm.rst.
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index eed51a910c94..551083a396fb 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -157,6 +157,8 @@ stable_node_chains_prune_millisecs
 
 The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
 
+general_profit
+        how effective is KSM. The calculation is explained below.
 pages_shared
         how many shared pages are being used
 pages_sharing
@@ -207,7 +209,8 @@ several times, which are unprofitable memory consumed.
 			  ksm_rmap_items * sizeof(rmap_item).
 
    where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
-   and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``.
+   and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``. The process profit
+   is also shown in ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
 
 From the perspective of application, a high ratio of ``ksm_rmap_items`` to
 ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..96a6a08c8235 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
 #include <linux/time_namespace.h>
 #include <linux/resctrl.h>
 #include <linux/cn_proc.h>
+#include <linux/ksm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -3207,6 +3208,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
 	mm = get_task_mm(task);
 	if (mm) {
 		seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+		seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+		seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
 		mmput(mm);
 	}
 
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 4647b0c70c12..7a9b76fb6c3f 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -68,6 +68,11 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
 void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 		       int force_early);
 #endif
+
+#ifdef CONFIG_PROC_FS
+long ksm_process_profit(struct mm_struct *);
+#endif /* CONFIG_PROC_FS */
+
 #else  /* !CONFIG_KSM */
 
 static inline void ksm_add_vma(struct vm_area_struct *vma)
diff --git a/mm/ksm.c b/mm/ksm.c
index 35ac6c741572..9e48258985d2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3007,6 +3007,14 @@ static void wait_while_offlining(void)
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+#ifdef CONFIG_PROC_FS
+long ksm_process_profit(struct mm_struct *mm)
+{
+	return mm->ksm_merging_pages * PAGE_SIZE -
+		mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
+}
+#endif /* CONFIG_PROC_FS */
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -3271,6 +3279,18 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
 }
 KSM_ATTR_RO(pages_volatile);
 
+static ssize_t general_profit_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	long general_profit;
+
+	general_profit = ksm_pages_sharing * PAGE_SIZE -
+				ksm_rmap_items * sizeof(struct ksm_rmap_item);
+
+	return sysfs_emit(buf, "%ld\n", general_profit);
+}
+KSM_ATTR_RO(general_profit);
+
 static ssize_t stable_node_dups_show(struct kobject *kobj,
 				     struct kobj_attribute *attr, char *buf)
 {
@@ -3335,6 +3355,7 @@ static struct attribute *ksm_attrs[] = {
 	&stable_node_dups_attr.attr,
 	&stable_node_chains_prune_millisecs_attr.attr,
 	&use_zero_pages_attr.attr,
+	&general_profit_attr.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From f724392415b3e1ae844d2766669c0d955fe9a17b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 18 Apr 2023 22:22:02 -0700
Subject: hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()

Some architectures can have their hugetlb pages down at the lowest PTE
level: their huge_pte_alloc() using pte_alloc_map(), but without any
following pte_unmap().  Since none of these arches uses CONFIG_HIGHPTE,
this is not seen as a problem at present; but would become a problem if
forthcoming changes were to add an rcu_read_lock() into pte_offset_map(),
with the rcu_read_unlock() expected in pte_unmap().

Similarly in their huge_pte_offset(): pte_offset_kernel() is good enough
for that, but it's probably less confusing if we define pte_offset_huge()
along with pte_alloc_huge().  Only define them without CONFIG_HIGHPTE: so
there would be a build error to signal if ever more work is needed.

For ease of development, define these now for 6.4-rc1, ahead of any use:
then architectures can integrate patches using them, independent from mm.

Link: https://lkml.kernel.org/r/ae9e7d98-8a3a-cfd9-4762-bcddffdf96cf@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 28703fe22386..cbe1e97a15a1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -191,6 +191,23 @@ extern struct list_head huge_boot_pages;
 
 /* arch callbacks */
 
+#ifndef CONFIG_HIGHPTE
+/*
+ * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures
+ * which may go down to the lowest PTE level in their huge_pte_offset() and
+ * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap().
+ */
+static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address)
+{
+	return pte_offset_kernel(pmd, address);
+}
+static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
+				    unsigned long address)
+{
+	return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address);
+}
+#endif
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz);
 /*
-- 
cgit v1.2.3


From 531094dc7164718d28ebb581d729807d7e846363 Mon Sep 17 00:00:00 2001
From: Yonatan Nachum <ynachum@amazon.com>
Date: Tue, 4 Apr 2023 15:43:13 +0000
Subject: RDMA/efa: Add rdma write capability to device caps

Add rdma write capability that is propagated from the device to rdma-core.
Enable MR creation with remote write permissions according to this device
capability.

Link: https://lore.kernel.org/r/20230404154313.35194-1-ynachum@amazon.com
Reviewed-by: Firas Jahjah <firasj@amazon.com>
Reviewed-by: Michael Margolin <mrgolin@amazon.com>
Signed-off-by: Yonatan Nachum <ynachum@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 12 +++++--
 drivers/infiniband/hw/efa/efa_io_defs.h         | 42 +++++++++++++++++--------
 drivers/infiniband/hw/efa/efa_verbs.c           |  6 +++-
 include/uapi/rdma/efa-abi.h                     |  1 +
 4 files changed, 44 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index 3db791e6c030..4e93ef7f84ee 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -376,7 +376,9 @@ struct efa_admin_reg_mr_cmd {
 	 * 0 : local_write_enable - Local write permissions:
 	 *    must be set for RQ buffers and buffers posted for
 	 *    RDMA Read requests
-	 * 1 : reserved1 - MBZ
+	 * 1 : remote_write_enable - Remote write
+	 *    permissions: must be set to enable RDMA write to
+	 *    the region
 	 * 2 : remote_read_enable - Remote read permissions:
 	 *    must be set to enable RDMA read from the region
 	 * 7:3 : reserved2 - MBZ
@@ -620,7 +622,9 @@ struct efa_admin_feature_device_attr_desc {
 	 *    modify QP command
 	 * 2 : data_polling_128 - If set, 128 bytes data
 	 *    polling is supported
-	 * 31:3 : reserved - MBZ
+	 * 3 : rdma_write - If set, RDMA Write is supported
+	 *    on TX queues
+	 * 31:4 : reserved - MBZ
 	 */
 	u32 device_caps;
 
@@ -674,7 +678,7 @@ struct efa_admin_feature_queue_attr_desc {
 	/* The maximum size of LLQ in bytes */
 	u32 max_llq_size;
 
-	/* Maximum number of SGEs for a single RDMA read WQE */
+	/* Maximum number of SGEs for a single RDMA read/write WQE */
 	u16 max_wr_rdma_sges;
 
 	/*
@@ -979,6 +983,7 @@ struct efa_admin_host_info {
 #define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
 #define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
 #define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
+#define EFA_ADMIN_REG_MR_CMD_REMOTE_WRITE_ENABLE_MASK       BIT(1)
 #define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK        BIT(2)
 
 /* create_cq_cmd */
@@ -994,6 +999,7 @@ struct efa_admin_host_info {
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK   BIT(0)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK   BIT(1)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2)
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK  BIT(3)
 
 /* create_eq_cmd */
 #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK       GENMASK(4, 0)
diff --git a/drivers/infiniband/hw/efa/efa_io_defs.h b/drivers/infiniband/hw/efa/efa_io_defs.h
index 17ba8984b11e..2d8eb96eaa81 100644
--- a/drivers/infiniband/hw/efa/efa_io_defs.h
+++ b/drivers/infiniband/hw/efa/efa_io_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_IO_H_
@@ -23,6 +23,8 @@ enum efa_io_send_op_type {
 	EFA_IO_SEND                                 = 0,
 	/* RDMA read */
 	EFA_IO_RDMA_READ                            = 1,
+	/* RDMA write */
+	EFA_IO_RDMA_WRITE                           = 2,
 };
 
 enum efa_io_comp_status {
@@ -62,8 +64,7 @@ struct efa_io_tx_meta_desc {
 
 	/*
 	 * control flags
-	 * 3:0 : op_type - operation type: send/rdma/fast mem
-	 *    ops/etc
+	 * 3:0 : op_type - enum efa_io_send_op_type
 	 * 4 : has_imm - immediate_data field carries valid
 	 *    data.
 	 * 5 : inline_msg - inline mode - inline message data
@@ -219,21 +220,22 @@ struct efa_io_cdesc_common {
 	 * 2:1 : q_type - enum efa_io_queue_type: send/recv
 	 * 3 : has_imm - indicates that immediate data is
 	 *    present - for RX completions only
-	 * 7:4 : reserved28 - MBZ
+	 * 6:4 : op_type - enum efa_io_send_op_type
+	 * 7 : reserved31 - MBZ
 	 */
 	u8 flags;
 
 	/* local QP number */
 	u16 qp_num;
-
-	/* Transferred length */
-	u16 length;
 };
 
 /* Tx completion descriptor */
 struct efa_io_tx_cdesc {
 	/* Common completion info */
 	struct efa_io_cdesc_common common;
+
+	/* MBZ */
+	u16 reserved16;
 };
 
 /* Rx Completion Descriptor */
@@ -241,6 +243,9 @@ struct efa_io_rx_cdesc {
 	/* Common completion info */
 	struct efa_io_cdesc_common common;
 
+	/* Transferred length bits[15:0] */
+	u16 length;
+
 	/* Remote Address Handle FW index, 0xFFFF indicates invalid ah */
 	u16 ah;
 
@@ -250,16 +255,26 @@ struct efa_io_rx_cdesc {
 	u32 imm;
 };
 
+/* Rx Completion Descriptor RDMA write info */
+struct efa_io_rx_cdesc_rdma_write {
+	/* Transferred length bits[31:16] */
+	u16 length_hi;
+};
+
 /* Extended Rx Completion Descriptor */
 struct efa_io_rx_cdesc_ex {
 	/* Base RX completion info */
-	struct efa_io_rx_cdesc rx_cdesc_base;
+	struct efa_io_rx_cdesc base;
 
-	/*
-	 * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is
-	 * enabled.
-	 */
-	u8 src_addr[16];
+	union {
+		struct efa_io_rx_cdesc_rdma_write rdma_write;
+
+		/*
+		 * Valid only in case of unknown AH (0xFFFF) and CQ
+		 * set_src_addr is enabled.
+		 */
+		u8 src_addr[16];
+	} u;
 };
 
 /* tx_meta_desc */
@@ -285,5 +300,6 @@ struct efa_io_rx_cdesc_ex {
 #define EFA_IO_CDESC_COMMON_PHASE_MASK                      BIT(0)
 #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK                     GENMASK(2, 1)
 #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK                    BIT(3)
+#define EFA_IO_CDESC_COMMON_OP_TYPE_MASK                    GENMASK(6, 4)
 
 #endif /* _EFA_IO_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index c20136e9d3d1..8eca6c14d0cf 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -253,6 +253,9 @@ int efa_query_device(struct ib_device *ibdev,
 		if (EFA_DEV_CAP(dev, DATA_POLLING_128))
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128;
 
+		if (EFA_DEV_CAP(dev, RDMA_WRITE))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE;
+
 		if (dev->neqs)
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
 
@@ -1572,7 +1575,8 @@ static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
 
 	supp_access_flags =
 		IB_ACCESS_LOCAL_WRITE |
-		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0);
+		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) |
+		(EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0);
 
 	access_flags &= ~IB_ACCESS_OPTIONAL;
 	if (access_flags & ~supp_access_flags) {
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 74406b4817ce..d94c32f28804 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -121,6 +121,7 @@ enum {
 	EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
 	EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
 	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
+	EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
 };
 
 struct efa_ibv_ex_query_device_resp {
-- 
cgit v1.2.3


From 63e9bbbcca60333490e13744ae736d8f988e4950 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 11 Apr 2023 16:29:47 +0200
Subject: netfilter: nf_tables: don't store chain address on jump

Now that the rule trailer/end marker and the rcu head reside in the
same structure, we no longer need to save/restore the chain pointer
when performing/returning from a jump.

We can simply let the trace infra walk the evaluated rule until it
hits the end marker and then fetch the chain pointer from there.

When the rule is NULL (policy tracing), then chain and basechain
pointers were already identical, so just use the basechain.

This cuts size of jumpstack in half, from 256 to 128 bytes in 64bit,
scripts/stackusage says:

nf_tables_core.c:251 nft_do_chain    328     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 14 ++++++++++++--
 net/netfilter/nf_tables_api.c     |  7 -------
 net/netfilter/nf_tables_core.c    | 21 ++++++---------------
 net/netfilter/nf_tables_trace.c   | 30 ++++++++++++++++++++++++++----
 4 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 1b8e305bb54a..f476fd030626 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1046,6 +1046,18 @@ struct nft_rule_dp {
 		__attribute__((aligned(__alignof__(struct nft_expr))));
 };
 
+struct nft_rule_dp_last {
+	struct nft_rule_dp end;		/* end of nft_rule_blob marker */
+	struct rcu_head h;		/* call_rcu head */
+	struct nft_rule_blob *blob;	/* ptr to free via call_rcu */
+	const struct nft_chain *chain;	/* for nftables tracing */
+};
+
+static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule)
+{
+	return (void *)rule + sizeof(*rule) + rule->dlen;
+}
+
 struct nft_rule_blob {
 	unsigned long			size;
 	unsigned char			data[]
@@ -1396,7 +1408,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@pkt: pktinfo currently processed
  *	@basechain: base chain currently processed
- *	@chain: chain currently processed
  *	@rule:  rule that was evaluated
  *	@verdict: verdict given by rule
  */
@@ -1408,7 +1419,6 @@ struct nft_traceinfo {
 	u32				skbid;
 	const struct nft_pktinfo	*pkt;
 	const struct nft_base_chain	*basechain;
-	const struct nft_chain		*chain;
 	const struct nft_rule_dp	*rule;
 	const struct nft_verdict	*verdict;
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 79848a27e640..0e1c86bb51a2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2110,13 +2110,6 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
 	module_put(hook->type->owner);
 }
 
-struct nft_rule_dp_last {
-	struct nft_rule_dp end;	/* end of nft_rule_blob marker */
-	struct rcu_head h;
-	struct nft_rule_blob *blob;
-	const struct nft_chain *chain;	/* for tracing */
-};
-
 static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
 {
 	struct nft_rule_dp_last *lrule;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index ec3bab751092..89c05b64c2a2 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -42,13 +42,11 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
-					const struct nft_chain *chain,
 					enum nft_trace_types type)
 {
 	if (!info->trace || !info->nf_trace)
 		return;
 
-	info->chain = chain;
 	info->type = type;
 
 	nft_trace_notify(info);
@@ -56,14 +54,13 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 				    struct nft_traceinfo *info,
-				    const struct nft_chain *chain,
 				    const struct nft_rule_dp *rule,
 				    enum nft_trace_types type)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(info, chain, type);
+		__nft_trace_packet(info, type);
 	}
 }
 
@@ -111,7 +108,6 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 }
 
 static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
-					 const struct nft_chain *chain,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -133,17 +129,16 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
 		break;
 	}
 
-	__nft_trace_packet(info, chain, type);
+	__nft_trace_packet(info, type);
 }
 
 static inline void nft_trace_verdict(struct nft_traceinfo *info,
-				     const struct nft_chain *chain,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->rule = rule;
-		__nft_trace_verdict(info, chain, regs);
+		__nft_trace_verdict(info, regs);
 	}
 }
 
@@ -203,7 +198,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 }
 
 struct nft_jumpstack {
-	const struct nft_chain *chain;
 	const struct nft_rule_dp *rule;
 };
 
@@ -247,7 +241,6 @@ indirect_call:
 #define nft_rule_expr_first(rule)	(struct nft_expr *)&rule->data[0]
 #define nft_rule_expr_next(expr)	((void *)expr) + expr->ops->size
 #define nft_rule_expr_last(rule)	(struct nft_expr *)&rule->data[rule->dlen]
-#define nft_rule_next(rule)		(void *)rule + sizeof(*rule) + rule->dlen
 
 #define nft_rule_dp_for_each_expr(expr, last, rule) \
         for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
@@ -302,14 +295,14 @@ next_rule:
 			nft_trace_copy_nftrace(pkt, &info);
 			continue;
 		case NFT_CONTINUE:
-			nft_trace_packet(pkt, &info, chain, rule,
+			nft_trace_packet(pkt, &info, rule,
 					 NFT_TRACETYPE_RULE);
 			continue;
 		}
 		break;
 	}
 
-	nft_trace_verdict(&info, chain, rule, &regs);
+	nft_trace_verdict(&info, rule, &regs);
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
@@ -323,7 +316,6 @@ next_rule:
 	case NFT_JUMP:
 		if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
 			return NF_DROP;
-		jumpstack[stackptr].chain = chain;
 		jumpstack[stackptr].rule = nft_rule_next(rule);
 		stackptr++;
 		fallthrough;
@@ -339,12 +331,11 @@ next_rule:
 
 	if (stackptr > 0) {
 		stackptr--;
-		chain = jumpstack[stackptr].chain;
 		rule = jumpstack[stackptr].rule;
 		goto next_rule;
 	}
 
-	nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY);
+	nft_trace_packet(pkt, &info, NULL, NFT_TRACETYPE_POLICY);
 
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 1163ba9c1401..3d9b83d84a84 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -164,9 +164,29 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
 	return true;
 }
 
+static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *info)
+{
+	const struct nft_rule_dp *rule = info->rule;
+	const struct nft_rule_dp_last *last;
+
+	if (!rule)
+		return &info->basechain->chain;
+
+	while (!rule->is_last)
+		rule = nft_rule_next(rule);
+
+	last = (const struct nft_rule_dp_last *)rule;
+
+	if (WARN_ON_ONCE(!last->chain))
+		return &info->basechain->chain;
+
+	return last->chain;
+}
+
 void nft_trace_notify(struct nft_traceinfo *info)
 {
 	const struct nft_pktinfo *pkt = info->pkt;
+	const struct nft_chain *chain;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned int size;
@@ -176,9 +196,11 @@ void nft_trace_notify(struct nft_traceinfo *info)
 	if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
 		return;
 
+	chain = nft_trace_get_chain(info);
+
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
-		nla_total_size(strlen(info->chain->table->name)) +
-		nla_total_size(strlen(info->chain->name)) +
+		nla_total_size(strlen(chain->table->name)) +
+		nla_total_size(strlen(chain->name)) +
 		nla_total_size_64bit(sizeof(__be64)) +	/* rule handle */
 		nla_total_size(sizeof(__be32)) +	/* trace type */
 		nla_total_size(0) +			/* VERDICT, nested */
@@ -217,10 +239,10 @@ void nft_trace_notify(struct nft_traceinfo *info)
 	if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
 		goto nla_put_failure;
 
-	if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
+	if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name))
 		goto nla_put_failure;
 
-	if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name))
+	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
 	if (nf_trace_fill_rule_info(skb, info))
-- 
cgit v1.2.3


From 9a32e9850686599ed194ccdceb6cd3dd56b2d9b9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Apr 2023 17:13:19 +0200
Subject: netfilter: nf_tables: don't write table validation state without
 mutex

The ->cleanup callback needs to be removed, this doesn't work anymore as
the transaction mutex is already released in the ->abort function.

Just do it after a successful validation pass, this either happens
from commit or abort phases where transaction mutex is held.

Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h | 1 -
 net/netfilter/nf_tables_api.c       | 8 ++------
 net/netfilter/nfnetlink.c           | 2 --
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 241e005f290a..e9a9ab34a7cc 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -45,7 +45,6 @@ struct nfnetlink_subsystem {
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb,
 		     enum nfnl_abort_action action);
-	void (*cleanup)(struct net *net);
 	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0e1c86bb51a2..21eb273d2740 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8639,6 +8639,8 @@ static int nf_tables_validate(struct net *net)
 			if (nft_table_validate(net, table) < 0)
 				return -EAGAIN;
 		}
+
+		nft_validate_state_update(net, NFT_VALIDATE_SKIP);
 		break;
 	}
 
@@ -9578,11 +9580,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 	return 0;
 }
 
-static void nf_tables_cleanup(struct net *net)
-{
-	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
-}
-
 static int nf_tables_abort(struct net *net, struct sk_buff *skb,
 			   enum nfnl_abort_action action)
 {
@@ -9616,7 +9613,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
-	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 81c7737c803a..ae7146475d17 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -590,8 +590,6 @@ done:
 			goto replay_abort;
 		}
 	}
-	if (ss->cleanup)
-		ss->cleanup(net);
 
 	nfnl_err_deliver(&err_list, oskb);
 	kfree_skb(skb);
-- 
cgit v1.2.3


From 00c320f9b75560628e840bef027a27c746706759 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Apr 2023 17:13:20 +0200
Subject: netfilter: nf_tables: make validation state per table

We only need to validate tables that saw changes in the current
transaction.

The existing code revalidates all tables, but this isn't needed as
cross-table jumps are not allowed (chains have table scope).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  3 ++-
 net/netfilter/nf_tables_api.c     | 38 ++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index f476fd030626..ec347d9cff9e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1209,6 +1209,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@genmask: generation mask
  *	@afinfo: address family info
  *	@name: name of the table
+ *	@validate_state: internal, set when transaction adds jumps
  */
 struct nft_table {
 	struct list_head		list;
@@ -1227,6 +1228,7 @@ struct nft_table {
 	char				*name;
 	u16				udlen;
 	u8				*udata;
+	u8				validate_state;
 };
 
 static inline bool nft_table_has_owner(const struct nft_table *table)
@@ -1698,7 +1700,6 @@ struct nftables_pernet {
 	struct mutex		commit_mutex;
 	u64			table_handle;
 	unsigned int		base_seq;
-	u8			validate_state;
 };
 
 extern unsigned int nf_tables_net_id;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 21eb273d2740..44ebc5f9598e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -102,11 +102,9 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
 	[NFT_MSG_DELFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
 };
 
-static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-
-	switch (nft_net->validate_state) {
+	switch (table->validate_state) {
 	case NFT_VALIDATE_SKIP:
 		WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
 		break;
@@ -117,7 +115,7 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
 			return;
 	}
 
-	nft_net->validate_state = new_validate_state;
+	table->validate_state = new_validate_state;
 }
 static void nf_tables_trans_destroy_work(struct work_struct *w);
 static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
@@ -1224,6 +1222,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
 	if (table == NULL)
 		goto err_kzalloc;
 
+	table->validate_state = NFT_VALIDATE_SKIP;
 	table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
 	if (table->name == NULL)
 		goto err_strdup;
@@ -3660,7 +3659,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 		}
 
 		if (expr_info[i].ops->validate)
-			nft_validate_state_update(net, NFT_VALIDATE_NEED);
+			nft_validate_state_update(table, NFT_VALIDATE_NEED);
 
 		expr_info[i].ops = NULL;
 		expr = nft_expr_next(expr);
@@ -3710,7 +3709,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 	if (flow)
 		nft_trans_flow_rule(trans) = flow;
 
-	if (nft_net->validate_state == NFT_VALIDATE_DO)
+	if (table->validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
 	return 0;
@@ -6312,7 +6311,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			if (desc.type == NFT_DATA_VERDICT &&
 			    (elem.data.val.verdict.code == NFT_GOTO ||
 			     elem.data.val.verdict.code == NFT_JUMP))
-				nft_validate_state_update(ctx->net,
+				nft_validate_state_update(ctx->table,
 							  NFT_VALIDATE_NEED);
 		}
 
@@ -6437,7 +6436,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(info->net);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
 	u8 family = info->nfmsg->nfgen_family;
@@ -6476,7 +6474,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 		}
 	}
 
-	if (nft_net->validate_state == NFT_VALIDATE_DO)
+	if (table->validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
 	return 0;
@@ -8628,19 +8626,20 @@ static int nf_tables_validate(struct net *net)
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nft_table *table;
 
-	switch (nft_net->validate_state) {
-	case NFT_VALIDATE_SKIP:
-		break;
-	case NFT_VALIDATE_NEED:
-		nft_validate_state_update(net, NFT_VALIDATE_DO);
-		fallthrough;
-	case NFT_VALIDATE_DO:
-		list_for_each_entry(table, &nft_net->tables, list) {
+	list_for_each_entry(table, &nft_net->tables, list) {
+		switch (table->validate_state) {
+		case NFT_VALIDATE_SKIP:
+			continue;
+		case NFT_VALIDATE_NEED:
+			nft_validate_state_update(table, NFT_VALIDATE_DO);
+			fallthrough;
+		case NFT_VALIDATE_DO:
 			if (nft_table_validate(net, table) < 0)
 				return -EAGAIN;
+
+			nft_validate_state_update(table, NFT_VALIDATE_SKIP);
 		}
 
-		nft_validate_state_update(net, NFT_VALIDATE_SKIP);
 		break;
 	}
 
@@ -10355,7 +10354,6 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&nft_net->notify_list);
 	mutex_init(&nft_net->commit_mutex);
 	nft_net->base_seq = 1;
-	nft_net->validate_state = NFT_VALIDATE_SKIP;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 698bb828a6c20c86e30b307175be1827c071ce23 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:32 +0200
Subject: netfilter: nf_tables: do not store pktinfo in traceinfo structure

pass it as argument.  No change in object size.

stack usage decreases by 8 byte:
 nf_tables_core.c:254  nft_do_chain       320     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  5 ++---
 net/netfilter/nf_tables_core.c    | 21 ++++++++++++---------
 net/netfilter/nf_tables_trace.c   |  5 ++---
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ec347d9cff9e..cab351928cd2 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1408,7 +1408,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@type: event type (enum nft_trace_types)
  *	@skbid: hash of skb to be used as trace id
  *	@packet_dumped: packet headers sent in a previous traceinfo message
- *	@pkt: pktinfo currently processed
  *	@basechain: base chain currently processed
  *	@rule:  rule that was evaluated
  *	@verdict: verdict given by rule
@@ -1419,7 +1418,6 @@ struct nft_traceinfo {
 	bool				packet_dumped;
 	enum nft_trace_types		type:8;
 	u32				skbid;
-	const struct nft_pktinfo	*pkt;
 	const struct nft_base_chain	*basechain;
 	const struct nft_rule_dp	*rule;
 	const struct nft_verdict	*verdict;
@@ -1429,7 +1427,8 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 		    const struct nft_verdict *verdict,
 		    const struct nft_chain *basechain);
 
-void nft_trace_notify(struct nft_traceinfo *info);
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
 	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index bed855638050..776eb2b9f632 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -41,7 +41,8 @@ static inline bool nf_skip_indirect_calls(void) { return false; }
 static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
-static noinline void __nft_trace_packet(struct nft_traceinfo *info,
+static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
 	if (!info->trace || !info->nf_trace)
@@ -49,7 +50,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 
 	info->type = type;
 
-	nft_trace_notify(info);
+	nft_trace_notify(pkt, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
@@ -60,7 +61,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(info, type);
+		__nft_trace_packet(pkt, info, type);
 	}
 }
 
@@ -105,7 +106,8 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 	regs->verdict.code = NFT_BREAK;
 }
 
-static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
+static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
+					 struct nft_traceinfo *info,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -123,20 +125,21 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
 		type = NFT_TRACETYPE_RULE;
 
 		if (info->trace)
-			info->nf_trace = info->pkt->skb->nf_trace;
+			info->nf_trace = pkt->skb->nf_trace;
 		break;
 	}
 
-	__nft_trace_packet(info, type);
+	__nft_trace_packet(pkt, info, type);
 }
 
-static inline void nft_trace_verdict(struct nft_traceinfo *info,
+static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
+				     struct nft_traceinfo *info,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->rule = rule;
-		__nft_trace_verdict(info, regs);
+		__nft_trace_verdict(pkt, info, regs);
 	}
 }
 
@@ -300,7 +303,7 @@ next_rule:
 		break;
 	}
 
-	nft_trace_verdict(&info, rule, &regs);
+	nft_trace_verdict(pkt, &info, rule, &regs);
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 3d9b83d84a84..0a0dcf2587fd 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -183,9 +183,9 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 	return last->chain;
 }
 
-void nft_trace_notify(struct nft_traceinfo *info)
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      struct nft_traceinfo *info)
 {
-	const struct nft_pktinfo *pkt = info->pkt;
 	const struct nft_chain *chain;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
@@ -305,7 +305,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 	info->trace = true;
 	info->nf_trace = pkt->skb->nf_trace;
 	info->packet_dumped = false;
-	info->pkt = pkt;
 	info->verdict = verdict;
 
 	net_get_random_once(&trace_key, sizeof(trace_key));
-- 
cgit v1.2.3


From 0a202145d5f9277dd24474aca8245731d030f29e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:33 +0200
Subject: netfilter: nf_tables: do not store verdict in traceinfo structure

Just pass it as argument to nft_trace_notify. Stack is reduced by 8 bytes:

nf_tables_core.c:256 nft_do_chain    312     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +---
 net/netfilter/nf_tables_core.c    | 14 ++++++++------
 net/netfilter/nf_tables_trace.c   | 21 +++++++++++----------
 3 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cab351928cd2..693469ecfa54 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1410,7 +1410,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@basechain: base chain currently processed
  *	@rule:  rule that was evaluated
- *	@verdict: verdict given by rule
  */
 struct nft_traceinfo {
 	bool				trace;
@@ -1420,14 +1419,13 @@ struct nft_traceinfo {
 	u32				skbid;
 	const struct nft_base_chain	*basechain;
 	const struct nft_rule_dp	*rule;
-	const struct nft_verdict	*verdict;
 };
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
-		    const struct nft_verdict *verdict,
 		    const struct nft_chain *basechain);
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      const struct nft_verdict *verdict,
 		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 776eb2b9f632..6debe8b2623f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -42,6 +42,7 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
 static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+					const struct nft_verdict *verdict,
 					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
@@ -50,10 +51,11 @@ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 
 	info->type = type;
 
-	nft_trace_notify(pkt, info);
+	nft_trace_notify(pkt, verdict, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+				    struct nft_verdict *verdict,
 				    struct nft_traceinfo *info,
 				    const struct nft_rule_dp *rule,
 				    enum nft_trace_types type)
@@ -61,7 +63,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(pkt, info, type);
+		__nft_trace_packet(pkt, verdict, info, type);
 	}
 }
 
@@ -129,7 +131,7 @@ static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 		break;
 	}
 
-	__nft_trace_packet(pkt, info, type);
+	__nft_trace_packet(pkt, &regs->verdict, info, type);
 }
 
 static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
@@ -264,7 +266,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 
 	info.trace = false;
 	if (static_branch_unlikely(&nft_trace_enabled))
-		nft_trace_init(&info, pkt, &regs.verdict, basechain);
+		nft_trace_init(&info, pkt, basechain);
 do_chain:
 	if (genbit)
 		blob = rcu_dereference(chain->blob_gen_1);
@@ -296,7 +298,7 @@ next_rule:
 			nft_trace_copy_nftrace(pkt, &info);
 			continue;
 		case NFT_CONTINUE:
-			nft_trace_packet(pkt, &info, rule,
+			nft_trace_packet(pkt, &regs.verdict,  &info, rule,
 					 NFT_TRACETYPE_RULE);
 			continue;
 		}
@@ -336,7 +338,7 @@ next_rule:
 		goto next_rule;
 	}
 
-	nft_trace_packet(pkt, &info, NULL, NFT_TRACETYPE_POLICY);
+	nft_trace_packet(pkt, &regs.verdict, &info, NULL, NFT_TRACETYPE_POLICY);
 
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 0a0dcf2587fd..e635104a42be 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -124,6 +124,7 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
 }
 
 static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+				   const struct nft_verdict *verdict,
 				   const struct nft_traceinfo *info)
 {
 	if (!info->rule || info->rule->is_last)
@@ -135,7 +136,7 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 	 * Since no rule matched, the ->rule pointer is invalid.
 	 */
 	if (info->type == NFT_TRACETYPE_RETURN &&
-	    info->verdict->code == NFT_CONTINUE)
+	    verdict->code == NFT_CONTINUE)
 		return 0;
 
 	return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
@@ -143,7 +144,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 			    NFTA_TRACE_PAD);
 }
 
-static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
+					 struct nft_traceinfo *info)
 {
 	switch (info->type) {
 	case NFT_TRACETYPE_RETURN:
@@ -153,7 +155,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
 		return false;
 	}
 
-	switch (info->verdict->code) {
+	switch (verdict->code) {
 	case NFT_JUMP:
 	case NFT_GOTO:
 		break;
@@ -184,6 +186,7 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 }
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      const struct nft_verdict *verdict,
 		      struct nft_traceinfo *info)
 {
 	const struct nft_chain *chain;
@@ -217,8 +220,8 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 		nla_total_size(sizeof(u32)) +		/* nfproto */
 		nla_total_size(sizeof(u32));		/* policy */
 
-	if (nft_trace_have_verdict_chain(info))
-		size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+	if (nft_trace_have_verdict_chain(verdict, info))
+		size += nla_total_size(strlen(verdict->chain->name)); /* jump target */
 
 	skb = nlmsg_new(size, GFP_ATOMIC);
 	if (!skb)
@@ -245,7 +248,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
-	if (nf_trace_fill_rule_info(skb, info))
+	if (nf_trace_fill_rule_info(skb, verdict, info))
 		goto nla_put_failure;
 
 	switch (info->type) {
@@ -254,11 +257,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 		break;
 	case NFT_TRACETYPE_RETURN:
 	case NFT_TRACETYPE_RULE:
-		if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+		if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict))
 			goto nla_put_failure;
 
 		/* pkt->skb undefined iff NF_STOLEN, disable dump */
-		if (info->verdict->code == NF_STOLEN)
+		if (verdict->code == NF_STOLEN)
 			info->packet_dumped = true;
 		else
 			mark = pkt->skb->mark;
@@ -295,7 +298,6 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 }
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
-		    const struct nft_verdict *verdict,
 		    const struct nft_chain *chain)
 {
 	static siphash_key_t trace_key __read_mostly;
@@ -305,7 +307,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 	info->trace = true;
 	info->nf_trace = pkt->skb->nf_trace;
 	info->packet_dumped = false;
-	info->verdict = verdict;
 
 	net_get_random_once(&trace_key, sizeof(trace_key));
 
-- 
cgit v1.2.3


From 46df417544f4f7fa3438caada0dc0e534a694343 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:34 +0200
Subject: netfilter: nf_tables: do not store rule in traceinfo structure

pass it as argument instead.  This reduces size of traceinfo to
16 bytes.  Total stack usage:

 nf_tables_core.c:252 nft_do_chain    304     static

While its possible to also pass basechain as argument, doing so
increases nft_do_chaininfo function size.

Unlike pktinfo/verdict/rule the basechain info isn't used in
the expression evaluation path. gcc places it on the stack, which
results in extra push/pop when it gets passed to the trace helpers
as argument rather than as part of the traceinfo structure.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  3 +--
 net/netfilter/nf_tables_core.c    | 15 +++++++--------
 net/netfilter/nf_tables_trace.c   | 14 ++++++++------
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 693469ecfa54..58a4d217faaf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1409,7 +1409,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@skbid: hash of skb to be used as trace id
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@basechain: base chain currently processed
- *	@rule:  rule that was evaluated
  */
 struct nft_traceinfo {
 	bool				trace;
@@ -1418,7 +1417,6 @@ struct nft_traceinfo {
 	enum nft_trace_types		type:8;
 	u32				skbid;
 	const struct nft_base_chain	*basechain;
-	const struct nft_rule_dp	*rule;
 };
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
@@ -1426,6 +1424,7 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
 		      const struct nft_verdict *verdict,
+		      const struct nft_rule_dp *rule,
 		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 6debe8b2623f..4d0ce12221f6 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -43,6 +43,7 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 
 static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 					const struct nft_verdict *verdict,
+					const struct nft_rule_dp *rule,
 					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
@@ -51,7 +52,7 @@ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 
 	info->type = type;
 
-	nft_trace_notify(pkt, verdict, info);
+	nft_trace_notify(pkt, verdict, rule, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
@@ -62,8 +63,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
-		info->rule = rule;
-		__nft_trace_packet(pkt, verdict, info, type);
+		__nft_trace_packet(pkt, verdict, rule, info, type);
 	}
 }
 
@@ -110,6 +110,7 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 
 static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 					 struct nft_traceinfo *info,
+					 const struct nft_rule_dp *rule,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -131,7 +132,7 @@ static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 		break;
 	}
 
-	__nft_trace_packet(pkt, &regs->verdict, info, type);
+	__nft_trace_packet(pkt, &regs->verdict, rule, info, type);
 }
 
 static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
@@ -139,10 +140,8 @@ static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
-	if (static_branch_unlikely(&nft_trace_enabled)) {
-		info->rule = rule;
-		__nft_trace_verdict(pkt, info, regs);
-	}
+	if (static_branch_unlikely(&nft_trace_enabled))
+		__nft_trace_verdict(pkt, info, rule, regs);
 }
 
 static bool nft_payload_fast_eval(const struct nft_expr *expr,
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index e635104a42be..6d41c0bd3d78 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -125,9 +125,10 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
 
 static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 				   const struct nft_verdict *verdict,
+				   const struct nft_rule_dp *rule,
 				   const struct nft_traceinfo *info)
 {
-	if (!info->rule || info->rule->is_last)
+	if (!rule || rule->is_last)
 		return 0;
 
 	/* a continue verdict with ->type == RETURN means that this is
@@ -140,7 +141,7 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 		return 0;
 
 	return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
-			    cpu_to_be64(info->rule->handle),
+			    cpu_to_be64(rule->handle),
 			    NFTA_TRACE_PAD);
 }
 
@@ -166,9 +167,9 @@ static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
 	return true;
 }
 
-static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *info)
+static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule,
+						   const struct nft_traceinfo *info)
 {
-	const struct nft_rule_dp *rule = info->rule;
 	const struct nft_rule_dp_last *last;
 
 	if (!rule)
@@ -187,6 +188,7 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
 		      const struct nft_verdict *verdict,
+		      const struct nft_rule_dp *rule,
 		      struct nft_traceinfo *info)
 {
 	const struct nft_chain *chain;
@@ -199,7 +201,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
 		return;
 
-	chain = nft_trace_get_chain(info);
+	chain = nft_trace_get_chain(rule, info);
 
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
 		nla_total_size(strlen(chain->table->name)) +
@@ -248,7 +250,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
-	if (nf_trace_fill_rule_info(skb, verdict, info))
+	if (nf_trace_fill_rule_info(skb, verdict, rule, info))
 		goto nla_put_failure;
 
 	switch (info->type) {
-- 
cgit v1.2.3


From e3478c68f6704638d08f437cbc552ca5970c151a Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:45 +0200
Subject: ipvs: Update width of source for ip_vs_sync_conn_options

In ip_vs_sync_conn_v0() copy is made to struct ip_vs_sync_conn_options.
That structure looks like this:

struct ip_vs_sync_conn_options {
        struct ip_vs_seq        in_seq;
        struct ip_vs_seq        out_seq;
};

The source of the copy is the in_seq field of struct ip_vs_conn.  Whose
type is struct ip_vs_seq. Thus we can see that the source - is not as
wide as the amount of data copied, which is the width of struct
ip_vs_sync_conn_option.

The copy is safe because the next field in is another struct ip_vs_seq.
Make use of struct_group() to annotate this.

Flagged by gcc-13 as:

 In file included from ./include/linux/string.h:254,
                  from ./include/linux/bitmap.h:11,
                  from ./include/linux/cpumask.h:12,
                  from ./arch/x86/include/asm/paravirt.h:17,
                  from ./arch/x86/include/asm/cpuid.h:62,
                  from ./arch/x86/include/asm/processor.h:19,
                  from ./arch/x86/include/asm/timex.h:5,
                  from ./include/linux/timex.h:67,
                  from ./include/linux/time32.h:13,
                  from ./include/linux/time.h:60,
                  from ./include/linux/stat.h:19,
                  from ./include/linux/module.h:13,
                  from net/netfilter/ipvs/ip_vs_sync.c:38:
 In function 'fortify_memcpy_chk',
     inlined from 'ip_vs_sync_conn_v0' at net/netfilter/ipvs/ip_vs_sync.c:606:3:
 ./include/linux/fortify-string.h:529:25: error: call to '__read_overflow2_field' declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror=attribute-warning]
   529 |                         __read_overflow2_field(q_size_field, size);
       |

Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             | 6 ++++--
 net/netfilter/ipvs/ip_vs_sync.c | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 6d71a5ff52df..e20f1f92066d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -630,8 +630,10 @@ struct ip_vs_conn {
 	 */
 	struct ip_vs_app        *app;           /* bound ip_vs_app object */
 	void                    *app_data;      /* Application private data */
-	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
-	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+	struct_group(sync_conn_opt,
+		struct ip_vs_seq  in_seq;       /* incoming seq. struct */
+		struct ip_vs_seq  out_seq;      /* outgoing seq. struct */
+	);
 
 	const struct ip_vs_pe	*pe;
 	char			*pe_data;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 4963fec815da..d4fe7bb4f853 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -603,7 +603,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
 		struct ip_vs_sync_conn_options *opt =
 			(struct ip_vs_sync_conn_options *)&s[1];
-		memcpy(opt, &cp->in_seq, sizeof(*opt));
+		memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
 	}
 
 	m->nr_conns++;
-- 
cgit v1.2.3


From 210ffe4a74caead4d6790747d32b63c6152c70b7 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:47 +0200
Subject: ipvs: Remove {Enter,Leave}Function

Remove EnterFunction and LeaveFunction.

These debugging macros seem well past their use-by date.  And seem to
have little value these days. Removing them allows some trivial cleanup
of some exit paths for some functions. These are also included in this
patch. There is likely scope for further cleanup of both debugging and
unwind paths. But let's leave that for another day.

Only intended to change debug output, and only when CONFIG_IP_VS_DEBUG
is enabled. Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             | 20 -------------
 net/netfilter/ipvs/ip_vs_core.c |  8 ------
 net/netfilter/ipvs/ip_vs_ctl.c  | 26 +----------------
 net/netfilter/ipvs/ip_vs_sync.c |  5 ----
 net/netfilter/ipvs/ip_vs_xmit.c | 62 ++++++-----------------------------------
 5 files changed, 9 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e20f1f92066d..a3adc246ee31 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -265,26 +265,6 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 			pr_err(msg, ##__VA_ARGS__);			\
 	} while (0)
 
-#ifdef CONFIG_IP_VS_DEBUG
-#define EnterFunction(level)						\
-	do {								\
-		if (level <= ip_vs_get_debug_level())			\
-			printk(KERN_DEBUG				\
-			       pr_fmt("Enter: %s, %s line %i\n"),	\
-			       __func__, __FILE__, __LINE__);		\
-	} while (0)
-#define LeaveFunction(level)						\
-	do {								\
-		if (level <= ip_vs_get_debug_level())			\
-			printk(KERN_DEBUG				\
-			       pr_fmt("Leave: %s, %s line %i\n"),	\
-			       __func__, __FILE__, __LINE__);		\
-	} while (0)
-#else
-#define EnterFunction(level)   do {} while (0)
-#define LeaveFunction(level)   do {} while (0)
-#endif
-
 /* The port number of FTP service (in network order). */
 #define FTPPORT  cpu_to_be16(21)
 #define FTPDATA  cpu_to_be16(20)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2fcc26507d69..cb83ca506c5c 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1140,7 +1140,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
 	__be16 vport;
 	unsigned int flags;
 
-	EnterFunction(12);
 	vaddr = &svc->addr;
 	vport = svc->port;
 	daddr = &iph->saddr;
@@ -1208,7 +1207,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
 		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
 		      cp->flags, refcount_read(&cp->refcnt));
-	LeaveFunction(12);
 	return cp;
 }
 
@@ -1316,13 +1314,11 @@ after_nat:
 		ip_vs_update_conntrack(skb, cp, 0);
 	ip_vs_conn_put(cp);
 
-	LeaveFunction(11);
 	return NF_ACCEPT;
 
 drop:
 	ip_vs_conn_put(cp);
 	kfree_skb(skb);
-	LeaveFunction(11);
 	return NF_STOLEN;
 }
 
@@ -1341,8 +1337,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat
 	int af = state->pf;
 	struct sock *sk;
 
-	EnterFunction(11);
-
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
@@ -2365,7 +2359,6 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
 	struct netns_ipvs *ipvs;
 	struct net *net;
 
-	EnterFunction(2);
 	list_for_each_entry(net, net_list, exit_list) {
 		ipvs = net_ipvs(net);
 		ip_vs_unregister_hooks(ipvs, AF_INET);
@@ -2374,7 +2367,6 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
 		smp_wmb();
 		ip_vs_sync_net_cleanup(ipvs);
 	}
-	LeaveFunction(2);
 }
 
 static struct pernet_operations ipvs_core_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2a5ed71c82c3..62606fb44d02 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1061,8 +1061,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	unsigned int atype;
 	int ret;
 
-	EnterFunction(2);
-
 #ifdef CONFIG_IP_VS_IPV6
 	if (udest->af == AF_INET6) {
 		atype = ipv6_addr_type(&udest->addr.in6);
@@ -1111,7 +1109,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	spin_lock_init(&dest->dst_lock);
 	__ip_vs_update_dest(svc, dest, udest, 1);
 
-	LeaveFunction(2);
 	return 0;
 
 err_stats:
@@ -1134,8 +1131,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	__be16 dport = udest->port;
 	int ret;
 
-	EnterFunction(2);
-
 	if (udest->weight < 0) {
 		pr_err("%s(): server weight less than zero\n", __func__);
 		return -ERANGE;
@@ -1183,7 +1178,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
 		if (ret < 0)
-			goto err;
+			return ret;
 		__ip_vs_update_dest(svc, dest, udest, 1);
 	} else {
 		/*
@@ -1192,9 +1187,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		ret = ip_vs_new_dest(svc, udest);
 	}
 
-err:
-	LeaveFunction(2);
-
 	return ret;
 }
 
@@ -1209,8 +1201,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	union nf_inet_addr daddr;
 	__be16 dport = udest->port;
 
-	EnterFunction(2);
-
 	if (udest->weight < 0) {
 		pr_err("%s(): server weight less than zero\n", __func__);
 		return -ERANGE;
@@ -1242,7 +1232,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	}
 
 	__ip_vs_update_dest(svc, dest, udest, 0);
-	LeaveFunction(2);
 
 	return 0;
 }
@@ -1317,8 +1306,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	struct ip_vs_dest *dest;
 	__be16 dport = udest->port;
 
-	EnterFunction(2);
-
 	/* We use function that requires RCU lock */
 	rcu_read_lock();
 	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
@@ -1339,8 +1326,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	 */
 	__ip_vs_del_dest(svc->ipvs, dest, false);
 
-	LeaveFunction(2);
-
 	return 0;
 }
 
@@ -1746,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
 	struct netns_ipvs *ipvs;
 	struct net *net;
 
-	EnterFunction(2);
 	/* Check for "full" addressed entries */
 	mutex_lock(&__ip_vs_mutex);
 	list_for_each_entry(net, net_list, exit_list) {
@@ -1754,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
 		ip_vs_flush(ipvs, true);
 	}
 	mutex_unlock(&__ip_vs_mutex);
-	LeaveFunction(2);
 }
 
 /* Put all references for device (dst_cache) */
@@ -1792,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	if (event != NETDEV_DOWN || !ipvs)
 		return NOTIFY_DONE;
 	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
-	EnterFunction(2);
 	mutex_lock(&__ip_vs_mutex);
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
@@ -1821,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	}
 	spin_unlock_bh(&ipvs->dest_trash_lock);
 	mutex_unlock(&__ip_vs_mutex);
-	LeaveFunction(2);
 	return NOTIFY_DONE;
 }
 
@@ -4537,8 +4518,6 @@ int __init ip_vs_control_init(void)
 	int idx;
 	int ret;
 
-	EnterFunction(2);
-
 	/* Initialize svc_table, ip_vs_svc_fwm_table */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
@@ -4551,15 +4530,12 @@ int __init ip_vs_control_init(void)
 	if (ret < 0)
 		return ret;
 
-	LeaveFunction(2);
 	return 0;
 }
 
 
 void ip_vs_control_cleanup(void)
 {
-	EnterFunction(2);
 	unregister_netdevice_notifier(&ip_vs_dst_notifier);
 	/* relying on common rcu_barrier() in ip_vs_cleanup() */
-	LeaveFunction(2);
 }
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index d4fe7bb4f853..264f2f87a437 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1582,13 +1582,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
 	struct kvec	iov;
 	int		len;
 
-	EnterFunction(7);
 	iov.iov_base     = (void *)buffer;
 	iov.iov_len      = length;
 
 	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
 
-	LeaveFunction(7);
 	return len;
 }
 
@@ -1614,15 +1612,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
 	struct kvec		iov = {buffer, buflen};
 	int			len;
 
-	EnterFunction(7);
-
 	/* Receive a packet */
 	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
 	len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
 	if (len < 0)
 		return len;
 
-	LeaveFunction(7);
 	return len;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 99c349c0d968..feb1d7fcb09f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -706,8 +706,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct iphdr  *iph = ip_hdr(skb);
 
-	EnterFunction(10);
-
 	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
 			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
 		goto tx_error;
@@ -719,12 +717,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -735,8 +731,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 
-	EnterFunction(10);
-
 	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
 				  &iph->daddr, NULL,
 				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
@@ -747,12 +741,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 #endif
@@ -768,8 +760,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int local, rc, was_input;
 
-	EnterFunction(10);
-
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
@@ -839,12 +829,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 
-	LeaveFunction(10);
 	return rc;
 
   tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -856,8 +844,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info *rt;		/* Route to the other host */
 	int local, rc;
 
-	EnterFunction(10);
-
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
 		__be16 _pt, *p;
@@ -927,11 +913,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 
-	LeaveFunction(10);
 	return rc;
 
 tx_error:
-	LeaveFunction(10);
 	kfree_skb(skb);
 	return NF_STOLEN;
 }
@@ -1149,8 +1133,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int tun_type, gso_type;
 	int tun_flags;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
@@ -1199,7 +1181,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 					 &next_protocol, NULL, &dsfield,
 					 &ttl, dfp);
 	if (IS_ERR(skb))
-		goto tx_error;
+		return NF_STOLEN;
 
 	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1267,14 +1249,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
 
-	LeaveFunction(10);
-
 	return NF_STOLEN;
 
   tx_error:
-	if (!IS_ERR(skb))
-		kfree_skb(skb);
-	LeaveFunction(10);
+	kfree_skb(skb);
 	return NF_STOLEN;
 }
 
@@ -1298,8 +1276,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int tun_type, gso_type;
 	int tun_flags;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
 				      &cp->daddr.in6,
 				      &saddr, ipvsh, 1,
@@ -1347,7 +1323,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 					 &next_protocol, &payload_len,
 					 &dsfield, &ttl, NULL);
 	if (IS_ERR(skb))
-		goto tx_error;
+		return NF_STOLEN;
 
 	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1414,14 +1390,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
 
-	LeaveFunction(10);
-
 	return NF_STOLEN;
 
 tx_error:
-	if (!IS_ERR(skb))
-		kfree_skb(skb);
-	LeaveFunction(10);
+	kfree_skb(skb);
 	return NF_STOLEN;
 }
 #endif
@@ -1437,8 +1409,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	int local;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
@@ -1455,12 +1425,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
   tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -1471,8 +1439,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	int local;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
 				      &cp->daddr.in6,
 				      NULL, ipvsh, 0,
@@ -1489,12 +1455,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
 tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 #endif
@@ -1514,8 +1478,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int local;
 	int rt_mode, was_input;
 
-	EnterFunction(10);
-
 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 	   forwarded directly here, because there is no need to
 	   translate address/port back */
@@ -1526,7 +1488,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
 		atomic_inc(&cp->in_pkts);
-		goto out;
+		return rc;
 	}
 
 	/*
@@ -1582,14 +1544,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->ignore_df = 1;
 
-	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
-	goto out;
+	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 
   tx_error:
 	kfree_skb(skb);
 	rc = NF_STOLEN;
-  out:
-	LeaveFunction(10);
 	return rc;
 }
 
@@ -1604,8 +1563,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int local;
 	int rt_mode;
 
-	EnterFunction(10);
-
 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 	   forwarded directly here, because there is no need to
 	   translate address/port back */
@@ -1616,7 +1573,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
 		atomic_inc(&cp->in_pkts);
-		goto out;
+		return rc;
 	}
 
 	/*
@@ -1671,14 +1628,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->ignore_df = 1;
 
-	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
-	goto out;
+	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 
 tx_error:
 	kfree_skb(skb);
 	rc = NF_STOLEN;
-out:
-	LeaveFunction(10);
 	return rc;
 }
 #endif
-- 
cgit v1.2.3


From c7d15aaa105a9484b5385e5c391ea5203347d6b0 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:48 +0200
Subject: ipvs: Correct spelling in comments

Correct some spelling errors flagged by codespell and found by inspection.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a3adc246ee31..ff406ef4fd4a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -584,7 +584,7 @@ struct ip_vs_conn {
 	spinlock_t              lock;           /* lock for state transition */
 	volatile __u16          state;          /* state info */
 	volatile __u16          old_state;      /* old state, to be used for
-						 * state transition triggerd
+						 * state transition triggered
 						 * synchronization
 						 */
 	__u32			fwmark;		/* Fire wall mark from skb */
@@ -635,7 +635,7 @@ struct ip_vs_service_user_kern {
 	u16			protocol;
 	union nf_inet_addr	addr;		/* virtual ip address */
 	__be16			port;
-	u32			fwmark;		/* firwall mark of service */
+	u32			fwmark;		/* firewall mark of service */
 
 	/* virtual service options */
 	char			*sched_name;
@@ -1036,7 +1036,7 @@ struct netns_ipvs {
 	struct ipvs_sync_daemon_cfg	bcfg;	/* Backup Configuration */
 	/* net name space ptr */
 	struct net		*net;            /* Needed by timer routines */
-	/* Number of heterogeneous destinations, needed becaus heterogeneous
+	/* Number of heterogeneous destinations, needed because heterogeneous
 	 * are not supported when synchronization is enabled.
 	 */
 	unsigned int		mixed_address_family_dests;
-- 
cgit v1.2.3


From b9703ed44ffbfba85c103b9de01886a225e14b38 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 21 Apr 2023 00:34:31 +0200
Subject: netfilter: nf_tables: support for adding new devices to an existing
 netdev chain

This patch allows users to add devices to an existing netdev chain.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   6 ++
 net/netfilter/nf_tables_api.c     | 217 ++++++++++++++++++++++++--------------
 2 files changed, 142 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 58a4d217faaf..3ed21d2d5659 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1609,6 +1609,8 @@ struct nft_trans_chain {
 	struct nft_stats __percpu	*stats;
 	u8				policy;
 	u32				chain_id;
+	struct nft_base_chain		*basechain;
+	struct list_head		hook_list;
 };
 
 #define nft_trans_chain_update(trans)	\
@@ -1621,6 +1623,10 @@ struct nft_trans_chain {
 	(((struct nft_trans_chain *)trans->data)->policy)
 #define nft_trans_chain_id(trans)	\
 	(((struct nft_trans_chain *)trans->data)->chain_id)
+#define nft_trans_basechain(trans)	\
+	(((struct nft_trans_chain *)trans->data)->basechain)
+#define nft_trans_chain_hooks(trans)	\
+	(((struct nft_trans_chain *)trans->data)->hook_list)
 
 struct nft_trans_table {
 	bool				update;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 77975b4b0fdc..f8d8cace0c7d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1582,7 +1582,8 @@ nla_put_failure:
 }
 
 static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
-				   const struct nft_base_chain *basechain)
+				   const struct nft_base_chain *basechain,
+				   const struct list_head *hook_list)
 {
 	const struct nf_hook_ops *ops = &basechain->ops;
 	struct nft_hook *hook, *first = NULL;
@@ -1599,7 +1600,11 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
 
 	if (nft_base_chain_netdev(family, ops->hooknum)) {
 		nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
-		list_for_each_entry(hook, &basechain->hook_list, list) {
+
+		if (!hook_list)
+			hook_list = &basechain->hook_list;
+
+		list_for_each_entry(hook, hook_list, list) {
 			if (!first)
 				first = hook;
 
@@ -1624,7 +1629,8 @@ nla_put_failure:
 static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 				     u32 portid, u32 seq, int event, u32 flags,
 				     int family, const struct nft_table *table,
-				     const struct nft_chain *chain)
+				     const struct nft_chain *chain,
+				     const struct list_head *hook_list)
 {
 	struct nlmsghdr *nlh;
 
@@ -1649,7 +1655,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 		const struct nft_base_chain *basechain = nft_base_chain(chain);
 		struct nft_stats __percpu *stats;
 
-		if (nft_dump_basechain_hook(skb, family, basechain))
+		if (nft_dump_basechain_hook(skb, family, basechain, hook_list))
 			goto nla_put_failure;
 
 		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1684,7 +1690,8 @@ nla_put_failure:
 	return -1;
 }
 
-static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
+				   const struct list_head *hook_list)
 {
 	struct nftables_pernet *nft_net;
 	struct sk_buff *skb;
@@ -1704,7 +1711,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 
 	err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
 					event, flags, ctx->family, ctx->table,
-					ctx->chain);
+					ctx->chain, hook_list);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -1750,7 +1757,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 						      NFT_MSG_NEWCHAIN,
 						      NLM_F_MULTI,
 						      table->family, table,
-						      chain) < 0)
+						      chain, NULL) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1804,7 +1811,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
 
 	err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
 					info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
-					0, family, table, chain);
+					0, family, table, chain, NULL);
 	if (err < 0)
 		goto err_fill_chain_info;
 
@@ -2048,9 +2055,10 @@ static int nft_chain_parse_netdev(struct net *net,
 }
 
 static int nft_chain_parse_hook(struct net *net,
+				struct nft_base_chain *basechain,
 				const struct nlattr * const nla[],
 				struct nft_chain_hook *hook, u8 family,
-				struct netlink_ext_ack *extack, bool autoload)
+				struct netlink_ext_ack *extack)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
@@ -2066,31 +2074,46 @@ static int nft_chain_parse_hook(struct net *net,
 	if (err < 0)
 		return err;
 
-	if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
-	    ha[NFTA_HOOK_PRIORITY] == NULL)
-		return -EINVAL;
+	if (!basechain) {
+		if (!ha[NFTA_HOOK_HOOKNUM] ||
+		    !ha[NFTA_HOOK_PRIORITY])
+			return -EINVAL;
 
-	hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
-	hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+		hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 
-	type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
-	if (!type)
-		return -EOPNOTSUPP;
+		type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+		if (!type)
+			return -EOPNOTSUPP;
 
-	if (nla[NFTA_CHAIN_TYPE]) {
-		type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
-						   family, autoload);
-		if (IS_ERR(type)) {
-			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
-			return PTR_ERR(type);
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+							   family, true);
+			if (IS_ERR(type)) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+				return PTR_ERR(type);
+			}
 		}
-	}
-	if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
-		return -EOPNOTSUPP;
+		if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+			return -EOPNOTSUPP;
 
-	if (type->type == NFT_CHAIN_T_NAT &&
-	    hook->priority <= NF_IP_PRI_CONNTRACK)
-		return -EOPNOTSUPP;
+		if (type->type == NFT_CHAIN_T_NAT &&
+		    hook->priority <= NF_IP_PRI_CONNTRACK)
+			return -EOPNOTSUPP;
+	} else {
+		if (ha[NFTA_HOOK_HOOKNUM]) {
+			hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+			if (hook->num != basechain->ops.hooknum)
+				return -EOPNOTSUPP;
+		}
+		if (ha[NFTA_HOOK_PRIORITY]) {
+			hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+			if (hook->priority != basechain->ops.priority)
+				return -EOPNOTSUPP;
+		}
+
+		type = basechain->type;
+	}
 
 	if (!try_module_get(type->owner)) {
 		if (nla[NFTA_CHAIN_TYPE])
@@ -2184,12 +2207,8 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
 		list_splice_init(&hook->list, &basechain->hook_list);
 		list_for_each_entry(h, &basechain->hook_list, list)
 			nft_basechain_hook_init(&h->ops, family, hook, chain);
-
-		basechain->ops.hooknum	= hook->num;
-		basechain->ops.priority	= hook->priority;
-	} else {
-		nft_basechain_hook_init(&basechain->ops, family, hook, chain);
 	}
+	nft_basechain_hook_init(&basechain->ops, family, hook, chain);
 
 	chain->flags |= NFT_CHAIN_BASE | flags;
 	basechain->policy = NF_ACCEPT;
@@ -2239,13 +2258,13 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nft_stats __percpu *stats = NULL;
-		struct nft_chain_hook hook;
+		struct nft_chain_hook hook = {};
 
 		if (flags & NFT_CHAIN_BINDING)
 			return -EOPNOTSUPP;
 
-		err = nft_chain_parse_hook(net, nla, &hook, family, extack,
-					   true);
+		err = nft_chain_parse_hook(net, NULL, nla, &hook, family,
+					   extack);
 		if (err < 0)
 			return err;
 
@@ -2359,65 +2378,57 @@ err_destroy_chain:
 	return err;
 }
 
-static bool nft_hook_list_equal(struct list_head *hook_list1,
-				struct list_head *hook_list2)
-{
-	struct nft_hook *hook;
-	int n = 0, m = 0;
-
-	n = 0;
-	list_for_each_entry(hook, hook_list2, list) {
-		if (!nft_hook_list_find(hook_list1, hook))
-			return false;
-
-		n++;
-	}
-	list_for_each_entry(hook, hook_list1, list)
-		m++;
-
-	return n == m;
-}
-
 static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			      u32 flags, const struct nlattr *attr,
 			      struct netlink_ext_ack *extack)
 {
 	const struct nlattr * const *nla = ctx->nla;
+	struct nft_base_chain *basechain = NULL;
 	struct nft_table *table = ctx->table;
 	struct nft_chain *chain = ctx->chain;
-	struct nft_base_chain *basechain;
+	struct nft_chain_hook hook = {};
 	struct nft_stats *stats = NULL;
-	struct nft_chain_hook hook;
+	struct nft_hook *h, *next;
 	struct nf_hook_ops *ops;
 	struct nft_trans *trans;
+	bool unregister = false;
 	int err;
 
 	if (chain->flags ^ flags)
 		return -EOPNOTSUPP;
 
+	INIT_LIST_HEAD(&hook.list);
+
 	if (nla[NFTA_CHAIN_HOOK]) {
 		if (!nft_is_base_chain(chain)) {
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
-		err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
-					   extack, false);
+
+		basechain = nft_base_chain(chain);
+		err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
+					   ctx->family, extack);
 		if (err < 0)
 			return err;
 
-		basechain = nft_base_chain(chain);
 		if (basechain->type != hook.type) {
 			nft_chain_release_hook(&hook);
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
 
-		if (nft_base_chain_netdev(ctx->family, hook.num)) {
-			if (!nft_hook_list_equal(&basechain->hook_list,
-						 &hook.list)) {
-				nft_chain_release_hook(&hook);
-				NL_SET_BAD_ATTR(extack, attr);
-				return -EEXIST;
+		if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+			list_for_each_entry_safe(h, next, &hook.list, list) {
+				h->ops.pf	= basechain->ops.pf;
+				h->ops.hooknum	= basechain->ops.hooknum;
+				h->ops.priority	= basechain->ops.priority;
+				h->ops.priv	= basechain->ops.priv;
+				h->ops.hook	= basechain->ops.hook;
+
+				if (nft_hook_list_find(&basechain->hook_list, h)) {
+					list_del(&h->list);
+					kfree(h);
+				}
 			}
 		} else {
 			ops = &basechain->ops;
@@ -2428,7 +2439,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 				return -EEXIST;
 			}
 		}
-		nft_chain_release_hook(&hook);
 	}
 
 	if (nla[NFTA_CHAIN_HANDLE] &&
@@ -2439,24 +2449,43 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 					  nla[NFTA_CHAIN_NAME], genmask);
 		if (!IS_ERR(chain2)) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
-			return -EEXIST;
+			err = -EEXIST;
+			goto err_hooks;
 		}
 	}
 
 	if (nla[NFTA_CHAIN_COUNTERS]) {
-		if (!nft_is_base_chain(chain))
-			return -EOPNOTSUPP;
+		if (!nft_is_base_chain(chain)) {
+			err = -EOPNOTSUPP;
+			goto err_hooks;
+		}
 
 		stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
-		if (IS_ERR(stats))
-			return PTR_ERR(stats);
+		if (IS_ERR(stats)) {
+			err = PTR_ERR(stats);
+			goto err_hooks;
+		}
 	}
 
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    nft_is_base_chain(chain) &&
+	    !list_empty(&hook.list)) {
+		basechain = nft_base_chain(chain);
+		ops = &basechain->ops;
+
+		if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+			err = nft_netdev_register_hooks(ctx->net, &hook.list);
+			if (err < 0)
+				goto err_hooks;
+		}
+	}
+
+	unregister = true;
 	err = -ENOMEM;
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
 				sizeof(struct nft_trans_chain));
 	if (trans == NULL)
-		goto err;
+		goto err_trans;
 
 	nft_trans_chain_stats(trans) = stats;
 	nft_trans_chain_update(trans) = true;
@@ -2475,7 +2504,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		err = -ENOMEM;
 		name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
 		if (!name)
-			goto err;
+			goto err_trans;
 
 		err = -EEXIST;
 		list_for_each_entry(tmp, &nft_net->commit_list, list) {
@@ -2486,18 +2515,35 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			    strcmp(name, nft_trans_chain_name(tmp)) == 0) {
 				NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 				kfree(name);
-				goto err;
+				goto err_trans;
 			}
 		}
 
 		nft_trans_chain_name(trans) = name;
 	}
+
+	nft_trans_basechain(trans) = basechain;
+	INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+	list_splice(&hook.list, &nft_trans_chain_hooks(trans));
+
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
 	return 0;
-err:
+
+err_trans:
 	free_percpu(stats);
 	kfree(trans);
+err_hooks:
+	if (nla[NFTA_CHAIN_HOOK]) {
+		list_for_each_entry_safe(h, next, &hook.list, list) {
+			if (unregister)
+				nf_unregister_net_hook(ctx->net, &h->ops);
+			list_del(&h->list);
+			kfree_rcu(h, rcu);
+		}
+		module_put(hook.type->owner);
+	}
+
 	return err;
 }
 
@@ -9244,19 +9290,22 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
 				nft_chain_commit_update(trans);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN,
+						       &nft_trans_chain_hooks(trans));
+				list_splice(&nft_trans_chain_hooks(trans),
+					    &nft_trans_basechain(trans)->hook_list);
 				/* trans destroyed after rcu grace period */
 			} else {
 				nft_chain_commit_drop_policy(trans);
 				nft_clear(net, trans->ctx.chain);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN, NULL);
 				nft_trans_destroy(trans);
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
 		case NFT_MSG_DESTROYCHAIN:
 			nft_chain_del(trans->ctx.chain);
-			nf_tables_chain_notify(&trans->ctx, trans->msg_type);
+			nf_tables_chain_notify(&trans->ctx, trans->msg_type, NULL);
 			nf_tables_unregister_hook(trans->ctx.net,
 						  trans->ctx.table,
 						  trans->ctx.chain);
@@ -9423,7 +9472,10 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nf_tables_table_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWCHAIN:
-		nf_tables_chain_destroy(&trans->ctx);
+		if (nft_trans_chain_update(trans))
+			nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+		else
+			nf_tables_chain_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWRULE:
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
@@ -9486,6 +9538,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 			break;
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
+				nft_netdev_unregister_hooks(net,
+							    &nft_trans_chain_hooks(trans),
+							    true);
 				free_percpu(nft_trans_chain_stats(trans));
 				kfree(nft_trans_chain_name(trans));
 				nft_trans_destroy(trans);
-- 
cgit v1.2.3


From d4af7ca20173e61b77584e4f2025387b7b76ef75 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:02 +0200
Subject: ALSA: emu10k1: remove obsolete card type variable and defines

The use of the variable was removed in commit 2b637da5a1b ("clean up
card features"). That commit also broke user space (the ioctl
structure), at which point the defines became meaningless, so I don't
think purging them is a problem.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230421141006.1005452-3-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h      | 1 -
 include/uapi/sound/emu10k1.h | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 3407ca4a1210..ddc1e71a8de5 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1689,7 +1689,6 @@ struct snd_emu10k1 {
 	unsigned int revision;			/* chip revision */
 	unsigned int serial;			/* serial number */
 	unsigned short model;			/* subsystem id */
-	unsigned int card_type;			/* EMU10K1_CARD_* */
 	unsigned int ecard_ctrl;		/* ecard control bits */
 	unsigned int address_mode;		/* address mode */
 	unsigned long dma_mask;			/* PCI DMA mask */
diff --git a/include/uapi/sound/emu10k1.h b/include/uapi/sound/emu10k1.h
index 1c1f1dd44611..c2414bd5aecd 100644
--- a/include/uapi/sound/emu10k1.h
+++ b/include/uapi/sound/emu10k1.h
@@ -15,9 +15,6 @@
  * ---- FX8010 ----
  */
 
-#define EMU10K1_CARD_CREATIVE			0x00000000
-#define EMU10K1_CARD_EMUAPS			0x00000001
-
 #define EMU10K1_FX8010_PCM_COUNT		8
 
 /*
-- 
cgit v1.2.3


From 14a5c5a44b61953b3f84a01152fe1d40838f5d91 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:04 +0200
Subject: ALSA: emu10k1: remove unused snd_emu10k1_voice.emu field

It was written, but never read from. Its value is available via the epcm
field.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230421141006.1005452-5-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          | 1 -
 sound/pci/emu10k1/emu10k1_main.c | 4 +---
 sound/pci/emu10k1/p16v.c         | 2 --
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index ddc1e71a8de5..a1ea022bcdc8 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1485,7 +1485,6 @@ enum {
 struct snd_emu10k1;
 
 struct snd_emu10k1_voice {
-	struct snd_emu10k1 *emu;
 	int number;
 	unsigned int use: 1,
 	    pcm: 1,
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 65fd6b62bc9c..4af7c95fc665 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -1971,10 +1971,8 @@ int snd_emu10k1_create(struct snd_card *card,
 		pgtbl[idx] = cpu_to_le32(silent_page | idx);
 
 	/* set up voice indices */
-	for (idx = 0; idx < NUM_G; idx++) {
-		emu->voices[idx].emu = emu;
+	for (idx = 0; idx < NUM_G; idx++)
 		emu->voices[idx].number = idx;
-	}
 
 	err = snd_emu10k1_init(emu, enable_ir, 0);
 	if (err < 0)
diff --git a/sound/pci/emu10k1/p16v.c b/sound/pci/emu10k1/p16v.c
index 18a1b0740e6b..f5e0972187a7 100644
--- a/sound/pci/emu10k1/p16v.c
+++ b/sound/pci/emu10k1/p16v.c
@@ -181,7 +181,6 @@ static int snd_p16v_pcm_open_playback_channel(struct snd_pcm_substream *substrea
   
 	runtime->hw = snd_p16v_playback_hw;
 
-        channel->emu = emu;
         channel->number = channel_id;
 
         channel->use=1;
@@ -230,7 +229,6 @@ static int snd_p16v_pcm_open_capture_channel(struct snd_pcm_substream *substream
   
 	runtime->hw = snd_p16v_capture_hw;
 
-	channel->emu = emu;
 	channel->number = channel_id;
 
 	channel->use=1;
-- 
cgit v1.2.3


From 02a0d9c281401d4e1eb0f83ed337c8c6a70194d2 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:06 +0200
Subject: ALSA: emu10k1: clean up P16V part somewhat

Detach it better from the main PCM driver, which it really doesn't have
much in common with.

In particular, this moves the interrupt handler implementation into
p16v.c, and makes it access the substream runtime status more directly,
so it doesn't need to abuse structs snd_emu10k1_pcm and
snd_emu10k1_voice any more.

We don't need private pcm runtime data at all, as the only thing it was
used for (except the back-link to the substream) was the `running` flag.
So store that directly in runtime->private_data.

This somewhat radical strip-down shows that this driver contains some
complexity that was never actually utilized. I suppose the right way to
fully utilize the hardware in a simple way would be introducing more
substreams. This wouldn't require any of the removed code.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>

Link: https://lore.kernel.org/r/20230421141006.1005452-7-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h  |   3 +-
 sound/pci/emu10k1/irq.c  |  32 +++-----------
 sound/pci/emu10k1/p16v.c | 111 +++++++++++++++++++----------------------------
 3 files changed, 50 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index a1ea022bcdc8..ba2d48b38710 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1735,8 +1735,6 @@ struct snd_emu10k1 {
 	spinlock_t i2c_lock; /* serialises access to i2c port */
 
 	struct snd_emu10k1_voice voices[NUM_G];
-	struct snd_emu10k1_voice p16v_voices[4];
-	struct snd_emu10k1_voice p16v_capture_voice;
 	int p16v_device_offset;
 	u32 p16v_capture_source;
 	u32 p16v_capture_channel;
@@ -1756,6 +1754,7 @@ struct snd_emu10k1 {
 	void (*capture_efx_interrupt)(struct snd_emu10k1 *emu, unsigned int status);
 	void (*spdif_interrupt)(struct snd_emu10k1 *emu, unsigned int status);
 	void (*dsp_interrupt)(struct snd_emu10k1 *emu);
+	void (*p16v_interrupt)(struct snd_emu10k1 *emu);
 
 	struct snd_pcm_substream *pcm_capture_substream;
 	struct snd_pcm_substream *pcm_capture_mic_substream;
diff --git a/sound/pci/emu10k1/irq.c b/sound/pci/emu10k1/irq.c
index ebb2275efb6c..dfb44e5e69a7 100644
--- a/sound/pci/emu10k1/irq.c
+++ b/sound/pci/emu10k1/irq.c
@@ -18,7 +18,7 @@
 irqreturn_t snd_emu10k1_interrupt(int irq, void *dev_id)
 {
 	struct snd_emu10k1 *emu = dev_id;
-	unsigned int status, status2, orig_status, orig_status2;
+	unsigned int status, orig_status;
 	int handled = 0;
 	int timeout = 0;
 
@@ -139,32 +139,10 @@ irqreturn_t snd_emu10k1_interrupt(int irq, void *dev_id)
 			status &= ~IPR_FXDSP;
 		}
 		if (status & IPR_P16V) {
-			while ((status2 = inl(emu->port + IPR2)) != 0) {
-				u32 mask = INTE2_PLAYBACK_CH_0_LOOP;  /* Full Loop */
-				struct snd_emu10k1_voice *pvoice = &(emu->p16v_voices[0]);
-				struct snd_emu10k1_voice *cvoice = &(emu->p16v_capture_voice);
-
-				/* dev_dbg(emu->card->dev, "status2=0x%x\n", status2); */
-				orig_status2 = status2;
-				if(status2 & mask) {
-					if(pvoice->use) {
-						snd_pcm_period_elapsed(pvoice->epcm->substream);
-					} else { 
-						dev_err(emu->card->dev,
-							"p16v: status: 0x%08x, mask=0x%08x, pvoice=%p, use=%d\n",
-							status2, mask, pvoice,
-							pvoice->use);
-					}
-				}
-				if(status2 & 0x110000) {
-					/* dev_info(emu->card->dev, "capture int found\n"); */
-					if(cvoice->use) {
-						/* dev_info(emu->card->dev, "capture period_elapsed\n"); */
-						snd_pcm_period_elapsed(cvoice->epcm->substream);
-					}
-				}
-				outl(orig_status2, emu->port + IPR2); /* ack all */
-			}
+			if (emu->p16v_interrupt)
+				emu->p16v_interrupt(emu);
+			else
+				outl(0, emu->port + INTE2);
 			status &= ~IPR_P16V;
 		}
 
diff --git a/sound/pci/emu10k1/p16v.c b/sound/pci/emu10k1/p16v.c
index f5e0972187a7..ce4d3450959c 100644
--- a/sound/pci/emu10k1/p16v.c
+++ b/sound/pci/emu10k1/p16v.c
@@ -149,41 +149,19 @@ static const struct snd_pcm_hardware snd_p16v_capture_hw = {
 	.fifo_size =		0,
 };
 
-static void snd_p16v_pcm_free_substream(struct snd_pcm_runtime *runtime)
-{
-	struct snd_emu10k1_pcm *epcm = runtime->private_data;
-
-	kfree(epcm);
-}
-
 /* open_playback callback */
 static int snd_p16v_pcm_open_playback_channel(struct snd_pcm_substream *substream, int channel_id)
 {
-	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
-        struct snd_emu10k1_voice *channel = &(emu->p16v_voices[channel_id]);
-	struct snd_emu10k1_pcm *epcm;
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	int err;
 
-	epcm = kzalloc(sizeof(*epcm), GFP_KERNEL);
-	/* dev_dbg(emu->card->dev, "epcm kcalloc: %p\n", epcm); */
-
-	if (epcm == NULL)
-		return -ENOMEM;
-	epcm->emu = emu;
-	epcm->substream = substream;
 	/*
 	dev_dbg(emu->card->dev, "epcm device=%d, channel_id=%d\n",
 		   substream->pcm->device, channel_id);
 	*/
-	runtime->private_data = epcm;
-	runtime->private_free = snd_p16v_pcm_free_substream;
   
 	runtime->hw = snd_p16v_playback_hw;
 
-        channel->number = channel_id;
-
-        channel->use=1;
 #if 0 /* debug */
 	dev_dbg(emu->card->dev,
 		   "p16v: open channel_id=%d, channel=%p, use=0x%x\n",
@@ -192,7 +170,6 @@ static int snd_p16v_pcm_open_playback_channel(struct snd_pcm_substream *substrea
 	       channel_id, chip, channel);
 #endif /* debug */
 	/* channel->interrupt = snd_p16v_pcm_channel_interrupt; */
-	channel->epcm = epcm;
 	err = snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS);
 	if (err < 0)
                 return err;
@@ -204,43 +181,20 @@ static int snd_p16v_pcm_open_playback_channel(struct snd_pcm_substream *substrea
 
 	return 0;
 }
+
 /* open_capture callback */
 static int snd_p16v_pcm_open_capture_channel(struct snd_pcm_substream *substream, int channel_id)
 {
-	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
-	struct snd_emu10k1_voice *channel = &(emu->p16v_capture_voice);
-	struct snd_emu10k1_pcm *epcm;
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	int err;
 
-	epcm = kzalloc(sizeof(*epcm), GFP_KERNEL);
-	/* dev_dbg(emu->card->dev, "epcm kcalloc: %p\n", epcm); */
-
-	if (epcm == NULL)
-		return -ENOMEM;
-	epcm->emu = emu;
-	epcm->substream = substream;
 	/*
 	dev_dbg(emu->card->dev, "epcm device=%d, channel_id=%d\n",
 		   substream->pcm->device, channel_id);
 	*/
-	runtime->private_data = epcm;
-	runtime->private_free = snd_p16v_pcm_free_substream;
   
 	runtime->hw = snd_p16v_capture_hw;
 
-	channel->number = channel_id;
-
-	channel->use=1;
-#if 0 /* debug */
-	dev_dbg(emu->card->dev,
-		   "p16v: open channel_id=%d, channel=%p, use=0x%x\n",
-		   channel_id, channel, channel->use);
-	dev_dbg(emu->card->dev, "open:channel_id=%d, chip=%p, channel=%p\n",
-	       channel_id, chip, channel);
-#endif /* debug */
-	/* channel->interrupt = snd_p16v_pcm_channel_interrupt; */
-	channel->epcm = epcm;
 	err = snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS);
 	if (err < 0)
 		return err;
@@ -252,22 +206,12 @@ static int snd_p16v_pcm_open_capture_channel(struct snd_pcm_substream *substream
 /* close callback */
 static int snd_p16v_pcm_close_playback(struct snd_pcm_substream *substream)
 {
-	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
-	//struct snd_pcm_runtime *runtime = substream->runtime;
-	//struct snd_emu10k1_pcm *epcm = runtime->private_data;
-	emu->p16v_voices[substream->pcm->device - emu->p16v_device_offset].use = 0;
-	/* FIXME: maybe zero others */
 	return 0;
 }
 
 /* close callback */
 static int snd_p16v_pcm_close_capture(struct snd_pcm_substream *substream)
 {
-	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
-	//struct snd_pcm_runtime *runtime = substream->runtime;
-	//struct snd_emu10k1_pcm *epcm = runtime->private_data;
-	emu->p16v_capture_voice.use = 0;
-	/* FIXME: maybe zero others */
 	return 0;
 }
 
@@ -409,13 +353,48 @@ static void snd_p16v_intr_disable(struct snd_emu10k1 *emu, unsigned int intrenb)
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
 }
 
+static void snd_p16v_interrupt(struct snd_emu10k1 *emu)
+{
+	unsigned int status;
+
+	while ((status = inl(emu->port + IPR2)) != 0) {
+		u32 mask = INTE2_PLAYBACK_CH_0_LOOP;  /* Full Loop */
+
+		/* dev_dbg(emu->card->dev, "p16v status=0x%x\n", status); */
+		if (status & mask) {
+			struct snd_pcm_substream *substream =
+					emu->pcm_p16v->streams[SNDRV_PCM_STREAM_PLAYBACK].substream;
+			struct snd_pcm_runtime *runtime = substream->runtime;
+
+			if (runtime && runtime->private_data) {
+				snd_pcm_period_elapsed(substream);
+			} else {
+				dev_err(emu->card->dev,
+					"p16v: status: 0x%08x, mask=0x%08x\n",
+					status, mask);
+			}
+		}
+		if (status & 0x110000) {
+			struct snd_pcm_substream *substream =
+					emu->pcm_p16v->streams[SNDRV_PCM_STREAM_CAPTURE].substream;
+			struct snd_pcm_runtime *runtime = substream->runtime;
+
+			/* dev_info(emu->card->dev, "capture int found\n"); */
+			if (runtime && runtime->private_data) {
+				/* dev_info(emu->card->dev, "capture period_elapsed\n"); */
+				snd_pcm_period_elapsed(substream);
+			}
+		}
+		outl(status, emu->port + IPR2); /* ack all */
+	}
+}
+
 /* trigger_playback callback */
 static int snd_p16v_pcm_trigger_playback(struct snd_pcm_substream *substream,
 				    int cmd)
 {
 	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
 	struct snd_pcm_runtime *runtime;
-	struct snd_emu10k1_pcm *epcm;
 	int channel;
 	int result = 0;
         struct snd_pcm_substream *s;
@@ -437,10 +416,9 @@ static int snd_p16v_pcm_trigger_playback(struct snd_pcm_substream *substream,
 		    s->stream != SNDRV_PCM_STREAM_PLAYBACK)
 			continue;
 		runtime = s->runtime;
-		epcm = runtime->private_data;
 		channel = substream->pcm->device-emu->p16v_device_offset;
 		/* dev_dbg(emu->card->dev, "p16v channel=%d\n", channel); */
-		epcm->running = running;
+		runtime->private_data = (void *)(ptrdiff_t)running;
 		basic |= (0x1<<channel);
 		inte |= (INTE2_PLAYBACK_CH_0_LOOP<<channel);
                 snd_pcm_trigger_done(s, substream);
@@ -469,7 +447,6 @@ static int snd_p16v_pcm_trigger_capture(struct snd_pcm_substream *substream,
 {
 	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
 	struct snd_pcm_runtime *runtime = substream->runtime;
-	struct snd_emu10k1_pcm *epcm = runtime->private_data;
 	int channel = 0;
 	int result = 0;
 	u32 inte = INTE2_CAPTURE_CH_0_LOOP | INTE2_CAPTURE_CH_0_HALF_LOOP;
@@ -478,13 +455,13 @@ static int snd_p16v_pcm_trigger_capture(struct snd_pcm_substream *substream,
 	case SNDRV_PCM_TRIGGER_START:
 		snd_p16v_intr_enable(emu, inte);
 		snd_emu10k1_ptr20_write(emu, BASIC_INTERRUPT, 0, snd_emu10k1_ptr20_read(emu, BASIC_INTERRUPT, 0)|(0x100<<channel));
-		epcm->running = 1;
+		runtime->private_data = (void *)1;
 		break;
 	case SNDRV_PCM_TRIGGER_STOP:
 		snd_emu10k1_ptr20_write(emu, BASIC_INTERRUPT, 0, snd_emu10k1_ptr20_read(emu, BASIC_INTERRUPT, 0) & ~(0x100<<channel));
 		snd_p16v_intr_disable(emu, inte);
 		//snd_emu10k1_ptr20_write(emu, EXTENDED_INT_MASK, 0, snd_emu10k1_ptr20_read(emu, EXTENDED_INT_MASK, 0) & ~(0x110000<<channel));
-		epcm->running = 0;
+		runtime->private_data = NULL;
 		break;
 	default:
 		result = -EINVAL;
@@ -499,10 +476,10 @@ snd_p16v_pcm_pointer_playback(struct snd_pcm_substream *substream)
 {
 	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
 	struct snd_pcm_runtime *runtime = substream->runtime;
-	struct snd_emu10k1_pcm *epcm = runtime->private_data;
 	snd_pcm_uframes_t ptr, ptr1, ptr2,ptr3,ptr4 = 0;
 	int channel = substream->pcm->device - emu->p16v_device_offset;
-	if (!epcm->running)
+
+	if (!runtime->private_data)
 		return 0;
 
 	ptr3 = snd_emu10k1_ptr20_read(emu, PLAYBACK_LIST_PTR, channel);
@@ -524,11 +501,10 @@ snd_p16v_pcm_pointer_capture(struct snd_pcm_substream *substream)
 {
 	struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream);
 	struct snd_pcm_runtime *runtime = substream->runtime;
-	struct snd_emu10k1_pcm *epcm = runtime->private_data;
 	snd_pcm_uframes_t ptr, ptr1, ptr2 = 0;
 	int channel = 0;
 
-	if (!epcm->running)
+	if (!runtime->private_data)
 		return 0;
 
 	ptr1 = snd_emu10k1_ptr20_read(emu, CAPTURE_POINTER, channel);
@@ -589,6 +565,7 @@ int snd_p16v_pcm(struct snd_emu10k1 *emu, int device)
 	pcm->dev_subclass = SNDRV_PCM_SUBCLASS_GENERIC_MIX;
 	strcpy(pcm->name, "p16v");
 	emu->pcm_p16v = pcm;
+	emu->p16v_interrupt = snd_p16v_interrupt;
 
 	for(substream = pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream; 
 	    substream; 
-- 
cgit v1.2.3


From 10f212bd7a693e379154891cc16959bc4884668a Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:00 +0200
Subject: ALSA: emu10k1: properly assert E-MU FPGA access constaints

Assert the validity of the registers and values, as them being out of
range would indicate an error in the driver. Consequently, don't bother
returning error codes; they were ignored everywhere anyway.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230421141006.1005539-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h |  6 +++---
 sound/pci/emu10k1/io.c  | 36 +++++++++++++++++-------------------
 2 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index ba2d48b38710..00c2d8cd5a8d 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1819,9 +1819,9 @@ unsigned int snd_emu10k1_ptr20_read(struct snd_emu10k1 * emu, unsigned int reg,
 void snd_emu10k1_ptr20_write(struct snd_emu10k1 *emu, unsigned int reg, unsigned int chn, unsigned int data);
 int snd_emu10k1_spi_write(struct snd_emu10k1 * emu, unsigned int data);
 int snd_emu10k1_i2c_write(struct snd_emu10k1 *emu, u32 reg, u32 value);
-int snd_emu1010_fpga_write(struct snd_emu10k1 * emu, u32 reg, u32 value);
-int snd_emu1010_fpga_read(struct snd_emu10k1 * emu, u32 reg, u32 *value);
-int snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 * emu, u32 dst, u32 src);
+void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value);
+void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value);
+void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 src);
 unsigned int snd_emu10k1_efx_read(struct snd_emu10k1 *emu, unsigned int pc);
 void snd_emu10k1_intr_enable(struct snd_emu10k1 *emu, unsigned int intrenb);
 void snd_emu10k1_intr_disable(struct snd_emu10k1 *emu, unsigned int intrenb);
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index e15092ce9848..35bc73d99d04 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -233,15 +233,15 @@ int snd_emu10k1_i2c_write(struct snd_emu10k1 *emu,
 	return err;
 }
 
-int snd_emu1010_fpga_write(struct snd_emu10k1 * emu, u32 reg, u32 value)
+void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value)
 {
 	unsigned long flags;
 
-	if (reg > 0x3f)
-		return 1;
+	if (snd_BUG_ON(reg > 0x3f))
+		return;
 	reg += 0x40; /* 0x40 upwards are registers. */
-	if (value > 0x3f) /* 0 to 0x3f are values */
-		return 1;
+	if (snd_BUG_ON(value > 0x3f)) /* 0 to 0x3f are values */
+		return;
 	spin_lock_irqsave(&emu->emu_lock, flags);
 	outl(reg, emu->port + A_IOCFG);
 	udelay(10);
@@ -251,15 +251,13 @@ int snd_emu1010_fpga_write(struct snd_emu10k1 * emu, u32 reg, u32 value)
 	udelay(10);
 	outl(value | 0x80 , emu->port + A_IOCFG);  /* High bit clocks the value into the fpga. */
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
-
-	return 0;
 }
 
-int snd_emu1010_fpga_read(struct snd_emu10k1 * emu, u32 reg, u32 *value)
+void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
 {
 	unsigned long flags;
-	if (reg > 0x3f)
-		return 1;
+	if (snd_BUG_ON(reg > 0x3f))
+		return;
 	reg += 0x40; /* 0x40 upwards are registers. */
 	spin_lock_irqsave(&emu->emu_lock, flags);
 	outl(reg, emu->port + A_IOCFG);
@@ -268,21 +266,21 @@ int snd_emu1010_fpga_read(struct snd_emu10k1 * emu, u32 reg, u32 *value)
 	udelay(10);
 	*value = ((inl(emu->port + A_IOCFG) >> 8) & 0x7f);
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
-
-	return 0;
 }
 
 /* Each Destination has one and only one Source,
  * but one Source can feed any number of Destinations simultaneously.
  */
-int snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 * emu, u32 dst, u32 src)
+void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 src)
 {
-	snd_emu1010_fpga_write(emu, 0x00, ((dst >> 8) & 0x3f) );
-	snd_emu1010_fpga_write(emu, 0x01, (dst & 0x3f) );
-	snd_emu1010_fpga_write(emu, 0x02, ((src >> 8) & 0x3f) );
-	snd_emu1010_fpga_write(emu, 0x03, (src & 0x3f) );
-
-	return 0;
+	if (snd_BUG_ON(dst & ~0x71f))
+		return;
+	if (snd_BUG_ON(src & ~0x71f))
+		return;
+	snd_emu1010_fpga_write(emu, 0x00, dst >> 8);
+	snd_emu1010_fpga_write(emu, 0x01, dst & 0x1f);
+	snd_emu1010_fpga_write(emu, 0x02, src >> 8);
+	snd_emu1010_fpga_write(emu, 0x03, src & 0x1f);
 }
 
 void snd_emu10k1_intr_enable(struct snd_emu10k1 *emu, unsigned int intrenb)
-- 
cgit v1.2.3


From a1c87c0b27059b4155c7aba6b34810c889e8b6a9 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Fri, 21 Apr 2023 16:10:01 +0200
Subject: ALSA: emu10k1: fix access to Audigy GPIO port

As the register definition clearly states, this is a 16-bit register,
yet we did all accesses as 32-bit. The writes in particular would have
the potential to clear the TIMER register (depending on how the bus/card
actually handles the too long writes).

This commit also introduces a separate define A_GPIO which aliases
A_IOCFG, which better reflects the distinct usage on E-MU cards.
This is done in the same commit to keep the churn down, as we're
touching all involved lines anyway.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230421141006.1005539-2-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          |  7 ++++-
 sound/pci/emu10k1/emu10k1_main.c | 62 ++++++++++++++++++++--------------------
 sound/pci/emu10k1/emumixer.c     | 14 ++++-----
 sound/pci/emu10k1/io.c           | 14 ++++-----
 4 files changed, 51 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 00c2d8cd5a8d..89dbd2e93410 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -251,11 +251,16 @@
 #define MUSTAT_IRDYN		0x80		/* 0 = MIDI data or command ACK			*/
 #define MUSTAT_ORDYN		0x40		/* 0 = MUDATA can accept a command or data	*/
 
-#define A_IOCFG			0x18		/* GPIO on Audigy card (16bits)			*/
+#define A_GPIO			0x18		/* GPIO on Audigy card (16bits)			*/
 #define A_GPINPUT_MASK		0xff00
 #define A_GPOUTPUT_MASK		0x00ff
 
+// The GPIO port is used for I/O config on Sound Blasters;
+// card-specific info can be found in the emu_chip_details table.
+// On E-MU cards the port is used as the interface to the FPGA.
+
 // Audigy output/GPIO stuff taken from the kX drivers
+#define A_IOCFG			A_GPIO
 #define A_IOCFG_GPOUT0		0x0044		/* analog/digital				*/
 #define A_IOCFG_DISABLE_ANALOG	0x0040		/* = 'enable' for Audigy2 (chiprev=4)		*/
 #define A_IOCFG_ENABLE_DIGITAL	0x0004
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 4af7c95fc665..29b6da68ca6c 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -223,8 +223,8 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir, int resume)
 		 */
 		outl(0x7a0000, emu->port + 0x20);
 		outl(0xFF000000, emu->port + 0x24);
-		tmp = inl(emu->port + A_IOCFG) & ~0x8; /* Clear bit 3 */
-		outl(tmp, emu->port + A_IOCFG);
+		tmp = inw(emu->port + A_IOCFG) & ~0x8; /* Clear bit 3 */
+		outw(tmp, emu->port + A_IOCFG);
 	}
 	if (emu->card_capabilities->spi_dac) { /* Audigy 2 ZS Notebook with DAC Wolfson WM8768/WM8568 */
 		int size, n;
@@ -244,15 +244,15 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir, int resume)
 		 * GPIO6: Unknown
 		 * GPIO7: Unknown
 		 */
-		outl(0x76, emu->port + A_IOCFG); /* Windows uses 0x3f76 */
+		outw(0x76, emu->port + A_IOCFG); /* Windows uses 0x3f76 */
 	}
 	if (emu->card_capabilities->i2c_adc) { /* Audigy 2 ZS Notebook with ADC Wolfson WM8775 */
 		int size, n;
 
 		snd_emu10k1_ptr20_write(emu, P17V_I2S_SRC_SEL, 0, 0x2020205f);
-		tmp = inl(emu->port + A_IOCFG);
-		outl(tmp | 0x4, emu->port + A_IOCFG);  /* Set bit 2 for mic input */
-		tmp = inl(emu->port + A_IOCFG);
+		tmp = inw(emu->port + A_IOCFG);
+		outw(tmp | 0x4, emu->port + A_IOCFG);  /* Set bit 2 for mic input */
+		tmp = inw(emu->port + A_IOCFG);
 		size = ARRAY_SIZE(i2c_adc_init);
 		for (n = 0; n < size; n++)
 			snd_emu10k1_i2c_write(emu, i2c_adc_init[n][0], i2c_adc_init[n][1]);
@@ -308,12 +308,12 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir, int resume)
 		} else if (emu->card_capabilities->i2c_adc) {
 			;  /* Disable A_IOCFG for Audigy 2 ZS Notebook */
 		} else if (emu->audigy) {
-			unsigned int reg = inl(emu->port + A_IOCFG);
-			outl(reg | A_IOCFG_GPOUT2, emu->port + A_IOCFG);
+			u16 reg = inw(emu->port + A_IOCFG);
+			outw(reg | A_IOCFG_GPOUT2, emu->port + A_IOCFG);
 			udelay(500);
-			outl(reg | A_IOCFG_GPOUT1 | A_IOCFG_GPOUT2, emu->port + A_IOCFG);
+			outw(reg | A_IOCFG_GPOUT1 | A_IOCFG_GPOUT2, emu->port + A_IOCFG);
 			udelay(100);
-			outl(reg, emu->port + A_IOCFG);
+			outw(reg, emu->port + A_IOCFG);
 		} else {
 			unsigned int reg = inl(emu->port + HCFG);
 			outl(reg | HCFG_GPOUT2, emu->port + HCFG);
@@ -329,8 +329,8 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir, int resume)
 	} else if (emu->card_capabilities->i2c_adc) {
 		;  /* Disable A_IOCFG for Audigy 2 ZS Notebook */
 	} else if (emu->audigy) {	/* enable analog output */
-		unsigned int reg = inl(emu->port + A_IOCFG);
-		outl(reg | A_IOCFG_GPOUT0, emu->port + A_IOCFG);
+		u16 reg = inw(emu->port + A_IOCFG);
+		outw(reg | A_IOCFG_GPOUT0, emu->port + A_IOCFG);
 	}
 
 	if (emu->address_mode == 0) {
@@ -354,19 +354,19 @@ static void snd_emu10k1_audio_enable(struct snd_emu10k1 *emu)
 	} else if (emu->card_capabilities->i2c_adc) {
 		;  /* Disable A_IOCFG for Audigy 2 ZS Notebook */
 	} else if (emu->audigy) {
-		outl(inl(emu->port + A_IOCFG) & ~0x44, emu->port + A_IOCFG);
+		outw(inw(emu->port + A_IOCFG) & ~0x44, emu->port + A_IOCFG);
 
 		if (emu->card_capabilities->ca0151_chip) { /* audigy2 */
 			/* Unmute Analog now.  Set GPO6 to 1 for Apollo.
 			 * This has to be done after init ALice3 I2SOut beyond 48KHz.
 			 * So, sequence is important. */
-			outl(inl(emu->port + A_IOCFG) | 0x0040, emu->port + A_IOCFG);
+			outw(inw(emu->port + A_IOCFG) | 0x0040, emu->port + A_IOCFG);
 		} else if (emu->card_capabilities->ca0108_chip) { /* audigy2 value */
 			/* Unmute Analog now. */
-			outl(inl(emu->port + A_IOCFG) | 0x0060, emu->port + A_IOCFG);
+			outw(inw(emu->port + A_IOCFG) | 0x0060, emu->port + A_IOCFG);
 		} else {
 			/* Disable routing from AC97 line out to Front speakers */
-			outl(inl(emu->port + A_IOCFG) | 0x0080, emu->port + A_IOCFG);
+			outw(inw(emu->port + A_IOCFG) | 0x0080, emu->port + A_IOCFG);
 		}
 	}
 
@@ -651,9 +651,9 @@ static int snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
 				     const struct firmware *fw_entry)
 {
 	int n, i;
-	int reg;
-	int value;
-	__always_unused unsigned int write_post;
+	u16 reg;
+	u8 value;
+	__always_unused u16 write_post;
 	unsigned long flags;
 
 	if (!fw_entry)
@@ -666,11 +666,11 @@ static int snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
 	 * FPGA CONFIG OFF -> FPGA PGMN
 	 */
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	outl(0x00, emu->port + A_IOCFG); /* Set PGMN low for 1uS. */
-	write_post = inl(emu->port + A_IOCFG);
+	outw(0x00, emu->port + A_GPIO); /* Set PGMN low for 1uS. */
+	write_post = inw(emu->port + A_GPIO);
 	udelay(100);
-	outl(0x80, emu->port + A_IOCFG); /* Leave bit 7 set during netlist setup. */
-	write_post = inl(emu->port + A_IOCFG);
+	outw(0x80, emu->port + A_GPIO); /* Leave bit 7 set during netlist setup. */
+	write_post = inw(emu->port + A_GPIO);
 	udelay(100); /* Allow FPGA memory to clean */
 	for (n = 0; n < fw_entry->size; n++) {
 		value = fw_entry->data[n];
@@ -679,15 +679,15 @@ static int snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
 			if (value & 0x1)
 				reg = reg | 0x20;
 			value = value >> 1;
-			outl(reg, emu->port + A_IOCFG);
-			write_post = inl(emu->port + A_IOCFG);
-			outl(reg | 0x40, emu->port + A_IOCFG);
-			write_post = inl(emu->port + A_IOCFG);
+			outw(reg, emu->port + A_GPIO);
+			write_post = inw(emu->port + A_GPIO);
+			outw(reg | 0x40, emu->port + A_GPIO);
+			write_post = inw(emu->port + A_GPIO);
 		}
 	}
 	/* After programming, set GPIO bit 4 high again. */
-	outl(0x10, emu->port + A_IOCFG);
-	write_post = inl(emu->port + A_IOCFG);
+	outw(0x10, emu->port + A_GPIO);
+	write_post = inw(emu->port + A_GPIO);
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
 
 	return 0;
@@ -2053,7 +2053,7 @@ void snd_emu10k1_suspend_regs(struct snd_emu10k1 *emu)
 				*val = snd_emu10k1_ptr_read(emu, *reg, i);
 	}
 	if (emu->audigy)
-		emu->saved_a_iocfg = inl(emu->port + A_IOCFG);
+		emu->saved_a_iocfg = inw(emu->port + A_IOCFG);
 	emu->saved_hcfg = inl(emu->port + HCFG);
 }
 
@@ -2080,7 +2080,7 @@ void snd_emu10k1_resume_regs(struct snd_emu10k1 *emu)
 
 	/* resore for spdif */
 	if (emu->audigy)
-		outl(emu->saved_a_iocfg, emu->port + A_IOCFG);
+		outw(emu->saved_a_iocfg, emu->port + A_IOCFG);
 	outl(emu->saved_hcfg, emu->port + HCFG);
 
 	val = emu->saved_ptr;
diff --git a/sound/pci/emu10k1/emumixer.c b/sound/pci/emu10k1/emumixer.c
index 3c115f8ab96c..754d91050af2 100644
--- a/sound/pci/emu10k1/emumixer.c
+++ b/sound/pci/emu10k1/emumixer.c
@@ -924,7 +924,7 @@ static int snd_audigy_i2c_capture_source_put(struct snd_kcontrol *kcontrol,
 	struct snd_emu10k1 *emu = snd_kcontrol_chip(kcontrol);
 	unsigned int source_id;
 	unsigned int ngain, ogain;
-	u32 gpio;
+	u16 gpio;
 	int change = 0;
 	unsigned long flags;
 	u32 source;
@@ -941,11 +941,11 @@ static int snd_audigy_i2c_capture_source_put(struct snd_kcontrol *kcontrol,
 	if (change) {
 		snd_emu10k1_i2c_write(emu, ADC_MUX, 0); /* Mute input */
 		spin_lock_irqsave(&emu->emu_lock, flags);
-		gpio = inl(emu->port + A_IOCFG);
+		gpio = inw(emu->port + A_IOCFG);
 		if (source_id==0)
-			outl(gpio | 0x4, emu->port + A_IOCFG);
+			outw(gpio | 0x4, emu->port + A_IOCFG);
 		else
-			outl(gpio & ~0x4, emu->port + A_IOCFG);
+			outw(gpio & ~0x4, emu->port + A_IOCFG);
 		spin_unlock_irqrestore(&emu->emu_lock, flags);
 
 		ngain = emu->i2c_capture_volume[source_id][0]; /* Left */
@@ -1632,7 +1632,7 @@ static int snd_emu10k1_shared_spdif_get(struct snd_kcontrol *kcontrol,
 	struct snd_emu10k1 *emu = snd_kcontrol_chip(kcontrol);
 
 	if (emu->audigy)
-		ucontrol->value.integer.value[0] = inl(emu->port + A_IOCFG) & A_IOCFG_GPOUT0 ? 1 : 0;
+		ucontrol->value.integer.value[0] = inw(emu->port + A_IOCFG) & A_IOCFG_GPOUT0 ? 1 : 0;
 	else
 		ucontrol->value.integer.value[0] = inl(emu->port + HCFG) & HCFG_GPOUT0 ? 1 : 0;
 	if (emu->card_capabilities->invert_shared_spdif)
@@ -1657,13 +1657,13 @@ static int snd_emu10k1_shared_spdif_put(struct snd_kcontrol *kcontrol,
 	if ( emu->card_capabilities->i2c_adc) {
 		/* Do nothing for Audigy 2 ZS Notebook */
 	} else if (emu->audigy) {
-		reg = inl(emu->port + A_IOCFG);
+		reg = inw(emu->port + A_IOCFG);
 		val = sw ? A_IOCFG_GPOUT0 : 0;
 		change = (reg & A_IOCFG_GPOUT0) != val;
 		if (change) {
 			reg &= ~A_IOCFG_GPOUT0;
 			reg |= val;
-			outl(reg | val, emu->port + A_IOCFG);
+			outw(reg | val, emu->port + A_IOCFG);
 		}
 	}
 	reg = inl(emu->port + HCFG);
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index 35bc73d99d04..f0134689c320 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -243,13 +243,13 @@ void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value)
 	if (snd_BUG_ON(value > 0x3f)) /* 0 to 0x3f are values */
 		return;
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	outl(reg, emu->port + A_IOCFG);
+	outw(reg, emu->port + A_GPIO);
 	udelay(10);
-	outl(reg | 0x80, emu->port + A_IOCFG);  /* High bit clocks the value into the fpga. */
+	outw(reg | 0x80, emu->port + A_GPIO);  /* High bit clocks the value into the fpga. */
 	udelay(10);
-	outl(value, emu->port + A_IOCFG);
+	outw(value, emu->port + A_GPIO);
 	udelay(10);
-	outl(value | 0x80 , emu->port + A_IOCFG);  /* High bit clocks the value into the fpga. */
+	outw(value | 0x80 , emu->port + A_GPIO);  /* High bit clocks the value into the fpga. */
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
 }
 
@@ -260,11 +260,11 @@ void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
 		return;
 	reg += 0x40; /* 0x40 upwards are registers. */
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	outl(reg, emu->port + A_IOCFG);
+	outw(reg, emu->port + A_GPIO);
 	udelay(10);
-	outl(reg | 0x80, emu->port + A_IOCFG);  /* High bit clocks the value into the fpga. */
+	outw(reg | 0x80, emu->port + A_GPIO);  /* High bit clocks the value into the fpga. */
 	udelay(10);
-	*value = ((inl(emu->port + A_IOCFG) >> 8) & 0x7f);
+	*value = ((inw(emu->port + A_GPIO) >> 8) & 0x7f);
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
 }
 
-- 
cgit v1.2.3


From edb10ace4dcd05f804d59dfdf2fee9121c92faee Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 2 Mar 2023 15:40:14 +0300
Subject: dt-bindings: reset: Add binding for MediaTek MT6735 TOPRGU/WDT

Add a DT binding for the MT6735 top reset generation unit/watchdog timer.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20230302124015.75546-2-y.oudjana@protonmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 .../devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml  |  1 +
 include/dt-bindings/reset/mediatek,mt6735-wdt.h         | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 include/dt-bindings/reset/mediatek,mt6735-wdt.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml b/Documentation/devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml
index 66cacea8e47f..cc502838bc39 100644
--- a/Documentation/devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml
+++ b/Documentation/devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml
@@ -22,6 +22,7 @@ properties:
       - enum:
           - mediatek,mt2712-wdt
           - mediatek,mt6589-wdt
+          - mediatek,mt6735-wdt
           - mediatek,mt6795-wdt
           - mediatek,mt7986-wdt
           - mediatek,mt8183-wdt
diff --git a/include/dt-bindings/reset/mediatek,mt6735-wdt.h b/include/dt-bindings/reset/mediatek,mt6735-wdt.h
new file mode 100644
index 000000000000..c6056e676d46
--- /dev/null
+++ b/include/dt-bindings/reset/mediatek,mt6735-wdt.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef _DT_BINDINGS_RESET_MEDIATEK_MT6735_WDT_H_
+#define _DT_BINDINGS_RESET_MEDIATEK_MT6735_WDT_H_
+
+#define MT6735_TOPRGU_MM_RST		1
+#define MT6735_TOPRGU_MFG_RST		2
+#define MT6735_TOPRGU_VENC_RST		3
+#define MT6735_TOPRGU_VDEC_RST		4
+#define MT6735_TOPRGU_IMG_RST		5
+#define MT6735_TOPRGU_MD_RST		7
+#define MT6735_TOPRGU_CONN_RST		9
+#define MT6735_TOPRGU_C2K_SW_RST	14
+#define MT6735_TOPRGU_C2K_RST		15
+#define MT6735_TOPRGU_RST_NUM		9
+
+#endif
-- 
cgit v1.2.3


From e0416e7d33361d2ad0bf9f007428346579ac854a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 21 Apr 2023 23:03:46 +0100
Subject: rxrpc: Fix potential race in error handling in afs_make_call()

If the rxrpc call set up by afs_make_call() receives an error whilst it is
transmitting the request, there's the possibility that it may get to the
point the rxrpc call is ended (after the error_kill_call label) just as the
call is queued for async processing.

This could manifest itself as call->rxcall being seen as NULL in
afs_deliver_to_call() when it tries to lock the call.

Fix this by splitting rxrpc_kernel_end_call() into a function to shut down
an rxrpc call and a function to release the caller's reference and calling
the latter only when we get to afs_put_call().

Reported-by: Jeffrey Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: kafs-testing+fedora36_64checkkafs-build-306@auristor.com
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/rxrpc.rst | 17 ++++++++++++-----
 fs/afs/rxrpc.c                     |  9 ++++-----
 include/net/af_rxrpc.h             |  3 ++-
 net/rxrpc/af_rxrpc.c               | 37 +++++++++++++++++++++++++------------
 net/rxrpc/rxperf.c                 |  3 ++-
 5 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst
index ec1323d92c96..e807e18ba32a 100644
--- a/Documentation/networking/rxrpc.rst
+++ b/Documentation/networking/rxrpc.rst
@@ -848,14 +848,21 @@ The kernel interface functions are as follows:
      returned.  The caller now holds a reference on this and it must be
      properly ended.
 
- (#) End a client call::
+ (#) Shut down a client call::
 
-	void rxrpc_kernel_end_call(struct socket *sock,
+	void rxrpc_kernel_shutdown_call(struct socket *sock,
+					struct rxrpc_call *call);
+
+     This is used to shut down a previously begun call.  The user_call_ID is
+     expunged from AF_RXRPC's knowledge and will not be seen again in
+     association with the specified call.
+
+ (#) Release the ref on a client call::
+
+	void rxrpc_kernel_put_call(struct socket *sock,
 				   struct rxrpc_call *call);
 
-     This is used to end a previously begun call.  The user_call_ID is expunged
-     from AF_RXRPC's knowledge and will not be seen again in association with
-     the specified call.
+     This is used to release the caller's ref on an rxrpc call.
 
  (#) Send data through a call::
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 7817e2b860e5..e08b850c3e6d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -179,7 +179,8 @@ void afs_put_call(struct afs_call *call)
 		ASSERT(call->type->name != NULL);
 
 		if (call->rxcall) {
-			rxrpc_kernel_end_call(net->socket, call->rxcall);
+			rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
+			rxrpc_kernel_put_call(net->socket, call->rxcall);
 			call->rxcall = NULL;
 		}
 		if (call->type->destructor)
@@ -420,10 +421,8 @@ error_kill_call:
 	 * The call, however, might be queued on afs_async_calls and we need to
 	 * make sure we don't get any more notifications that might requeue it.
 	 */
-	if (call->rxcall) {
-		rxrpc_kernel_end_call(call->net->socket, call->rxcall);
-		call->rxcall = NULL;
-	}
+	if (call->rxcall)
+		rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall);
 	if (call->async) {
 		if (cancel_work_sync(&call->async_work))
 			afs_put_call(call);
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index ba717eac0229..01a35e113ab9 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -57,7 +57,8 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *,
 			   struct iov_iter *, size_t *, bool, u32 *, u16 *);
 bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
 			     u32, int, enum rxrpc_abort_reason);
-void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *);
+void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call);
+void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call);
 void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
 			   struct sockaddr_rxrpc *);
 bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 102f5cbff91a..c32b164206f9 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -342,31 +342,44 @@ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall,
 }
 
 /**
- * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
+ * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using
  * @sock: The socket the call is on
  * @call: The call to end
  *
- * Allow a kernel service to end a call it was using.  The call must be
+ * Allow a kernel service to shut down a call it was using.  The call must be
  * complete before this is called (the call should be aborted if necessary).
  */
-void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
+void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call)
 {
 	_enter("%d{%d}", call->debug_id, refcount_read(&call->ref));
 
 	mutex_lock(&call->user_mutex);
-	rxrpc_release_call(rxrpc_sk(sock->sk), call);
-
-	/* Make sure we're not going to call back into a kernel service */
-	if (call->notify_rx) {
-		spin_lock(&call->notify_lock);
-		call->notify_rx = rxrpc_dummy_notify_rx;
-		spin_unlock(&call->notify_lock);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+		rxrpc_release_call(rxrpc_sk(sock->sk), call);
+
+		/* Make sure we're not going to call back into a kernel service */
+		if (call->notify_rx) {
+			spin_lock(&call->notify_lock);
+			call->notify_rx = rxrpc_dummy_notify_rx;
+			spin_unlock(&call->notify_lock);
+		}
 	}
-
 	mutex_unlock(&call->user_mutex);
+}
+EXPORT_SYMBOL(rxrpc_kernel_shutdown_call);
+
+/**
+ * rxrpc_kernel_put_call - Release a reference to a call
+ * @sock: The socket the call is on
+ * @call: The call to put
+ *
+ * Drop the application's ref on an rxrpc call.
+ */
+void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call)
+{
 	rxrpc_put_call(call, rxrpc_call_put_kernel);
 }
-EXPORT_SYMBOL(rxrpc_kernel_end_call);
+EXPORT_SYMBOL(rxrpc_kernel_put_call);
 
 /**
  * rxrpc_kernel_check_life - Check to see whether a call is still alive
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 4a2e90015ca7..085e7892d310 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -342,7 +342,8 @@ static void rxperf_deliver_to_call(struct work_struct *work)
 call_complete:
 	rxperf_set_call_complete(call, ret, remote_abort);
 	/* The call may have been requeued */
-	rxrpc_kernel_end_call(rxperf_socket, call->rxcall);
+	rxrpc_kernel_shutdown_call(rxperf_socket, call->rxcall);
+	rxrpc_kernel_put_call(rxperf_socket, call->rxcall);
 	cancel_work(&call->work);
 	kfree(call);
 }
-- 
cgit v1.2.3


From 6d26d985eeda89faedabbcf6607c37454b9691b0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 22 Apr 2023 09:35:44 +0200
Subject: bpf: fix link failure with NETFILTER=y INET=n

Explicitly check if NETFILTER_BPF_LINK is enabled, else configs
that have NETFILTER=y but CONFIG_INET=n fail to link:

> kernel/bpf/syscall.o: undefined reference to `netfilter_prog_ops'
> kernel/bpf/verifier.o: undefined reference to `netfilter_verifier_ops'

Fixes: fd9c663b9ad6 ("bpf: minimal support for programs hooked into netfilter framework")
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202304220903.fRZTJtxe-lkp@intel.com/
Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230422073544.17634-1-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 39a999abb0ce..fc0d6f32c687 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -79,7 +79,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 #endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 	      void *, void *)
-#ifdef CONFIG_NETFILTER
+#ifdef CONFIG_NETFILTER_BPF_LINK
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
-- 
cgit v1.2.3


From 8b2dd46d9a0370cf092fdd798dd66634abaf03e0 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 15:24:30 +0200
Subject: ALSA: emu10k1: remove unused emu->pcm_playback_efx_substream field

Amends historic commit 27ae958cf6 ("emu10k1 driver - add multichannel
device hw:x,3 [2-8/8]").

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422132430.1057468-2-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h    | 1 -
 sound/pci/emu10k1/emupcm.c | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 89dbd2e93410..f21441cda6b9 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1764,7 +1764,6 @@ struct snd_emu10k1 {
 	struct snd_pcm_substream *pcm_capture_substream;
 	struct snd_pcm_substream *pcm_capture_mic_substream;
 	struct snd_pcm_substream *pcm_capture_efx_substream;
-	struct snd_pcm_substream *pcm_playback_efx_substream;
 
 	struct snd_timer *timer;
 
diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c
index b7830fd5c2b4..c2749ad59e10 100644
--- a/sound/pci/emu10k1/emupcm.c
+++ b/sound/pci/emu10k1/emupcm.c
@@ -1044,8 +1044,6 @@ static int snd_emu10k1_efx_playback_open(struct snd_pcm_substream *substream)
 	epcm->type = PLAYBACK_EFX;
 	epcm->substream = substream;
 	
-	emu->pcm_playback_efx_substream = substream;
-
 	runtime->private_data = epcm;
 	runtime->private_free = snd_emu10k1_pcm_free_substream;
 	runtime->hw = snd_emu10k1_efx_playback;
-- 
cgit v1.2.3


From 6fb861bb3caf2ca8e440f7a3fdcca35fcac917b4 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 15:24:30 +0200
Subject: ALSA: emu10k1: fix snd_emu1010_fpga_read() input masking for rev2
 cards

Unlike the Alice2 chips used on 1st generation E-MU cards, the
Tina/Tina2 chips used on the 2nd gen cards have only six GPIN pins,
which means that we need to use a smaller mask. Failure to do so would
falsify the read data if the FPGA tried to raise an IRQ right at that
moment. This wasn't a problem so far, as we didn't actually enable FPGA
IRQs, but that's going to change soon.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422132430.1057490-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h | 3 ++-
 sound/pci/emu10k1/io.c  | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index f21441cda6b9..790dedd42340 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -252,7 +252,8 @@
 #define MUSTAT_ORDYN		0x40		/* 0 = MUDATA can accept a command or data	*/
 
 #define A_GPIO			0x18		/* GPIO on Audigy card (16bits)			*/
-#define A_GPINPUT_MASK		0xff00
+#define A_GPINPUT_MASK		0xff00		/* Alice/2 has 8 input pins			*/
+#define A3_GPINPUT_MASK		0x3f00		/* ... while Tina/2 has only 6			*/
 #define A_GPOUTPUT_MASK		0x00ff
 
 // The GPIO port is used for I/O config on Sound Blasters;
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index f0134689c320..42b06f2e5552 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -255,6 +255,9 @@ void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value)
 
 void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
 {
+	// The higest input pin is used as the designated interrupt trigger,
+	// so it needs to be masked out.
+	u32 mask = emu->card_capabilities->ca0108_chip ? 0x1f : 0x7f;
 	unsigned long flags;
 	if (snd_BUG_ON(reg > 0x3f))
 		return;
@@ -264,7 +267,7 @@ void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
 	udelay(10);
 	outw(reg | 0x80, emu->port + A_GPIO);  /* High bit clocks the value into the fpga. */
 	udelay(10);
-	*value = ((inw(emu->port + A_GPIO) >> 8) & 0x7f);
+	*value = ((inw(emu->port + A_GPIO) >> 8) & mask);
 	spin_unlock_irqrestore(&emu->emu_lock, flags);
 }
 
-- 
cgit v1.2.3


From 3db166d6cf0ea73dd2c887036aad2e95e0884d9b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 18 Apr 2023 10:39:01 -0700
Subject: cxl/mbox: Deprecate poison commands

The CXL subsystem is adding formal mechanisms for managing device
poison. Minimize the maintenance burden going forward, and maximize
the investment in common tooling by deprecating direct user access
to poison commands outside of CXL_MEM_RAW_COMMANDS debug scenarios.

A new cxl_deprecated_commands[] list is created for querying which
command ids defined in previous kernels are now deprecated.

CXL Media and Poison Management commands, opcodes 0x43XX, defined in
CXL 3.0 Spec, Table 8-93 are deprecated with one exception: Get Scan
Media Capabilities. Keep Get Scan Media Capabilities as it simply
provides information and has no impact on the device state.

Effectively all of the commands defined in:

commit 87815ee9d006 ("cxl/pci: Add media provisioning required commands")

...were defined prematurely and should have waited until the kernel
implementation was decided. To my knowledge there are no shipping
devices with poison support and no known tools that would regress with
this change.

Co-developed-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Link: https://lore.kernel.org/r/652197e9bc8885e6448d989405b9e50ee9d6b0a6.1681838291.git.alison.schofield@intel.com
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/mbox.c      |  5 -----
 include/uapi/linux/cxl_mem.h | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index f2addb457172..938cff2c948e 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -61,12 +61,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
 	CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0),
 	CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0),
 	CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0),
-	CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0),
-	CXL_CMD(INJECT_POISON, 0x8, 0, 0),
-	CXL_CMD(CLEAR_POISON, 0x48, 0, 0),
 	CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0),
-	CXL_CMD(SCAN_MEDIA, 0x11, 0, 0),
-	CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0),
 };
 
 /*
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 86bbacf2a315..14bc6e742148 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -40,19 +40,22 @@
 	___C(SET_ALERT_CONFIG, "Set Alert Configuration"),                \
 	___C(GET_SHUTDOWN_STATE, "Get Shutdown State"),                   \
 	___C(SET_SHUTDOWN_STATE, "Set Shutdown State"),                   \
-	___C(GET_POISON, "Get Poison List"),                              \
-	___C(INJECT_POISON, "Inject Poison"),                             \
-	___C(CLEAR_POISON, "Clear Poison"),                               \
+	___DEPRECATED(GET_POISON, "Get Poison List"),                     \
+	___DEPRECATED(INJECT_POISON, "Inject Poison"),                    \
+	___DEPRECATED(CLEAR_POISON, "Clear Poison"),                      \
 	___C(GET_SCAN_MEDIA_CAPS, "Get Scan Media Capabilities"),         \
-	___C(SCAN_MEDIA, "Scan Media"),                                   \
-	___C(GET_SCAN_MEDIA, "Get Scan Media Results"),                   \
+	___DEPRECATED(SCAN_MEDIA, "Scan Media"),                          \
+	___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"),          \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
+#define ___DEPRECATED(a, b) CXL_MEM_DEPRECATED_ID_##a
 enum { CXL_CMDS };
 
 #undef ___C
+#undef ___DEPRECATED
 #define ___C(a, b) { b }
+#define ___DEPRECATED(a, b) { "Deprecated " b }
 static const struct {
 	const char *name;
 } cxl_command_names[] __attribute__((__unused__)) = { CXL_CMDS };
@@ -68,6 +71,28 @@ static const struct {
  */
 
 #undef ___C
+#undef ___DEPRECATED
+#define ___C(a, b) (0)
+#define ___DEPRECATED(a, b) (1)
+
+static const __u8 cxl_deprecated_commands[]
+	__attribute__((__unused__)) = { CXL_CMDS };
+
+/*
+ * Here's how this actually breaks out:
+ * cxl_deprecated_commands[] = {
+ *	[CXL_MEM_COMMAND_ID_INVALID] = 0,
+ *	[CXL_MEM_COMMAND_ID_IDENTIFY] = 0,
+ *	...
+ *	[CXL_MEM_DEPRECATED_ID_GET_POISON] = 1,
+ *	[CXL_MEM_DEPRECATED_ID_INJECT_POISON] = 1,
+ *	[CXL_MEM_DEPRECATED_ID_CLEAR_POISON] = 1,
+ *	...
+ * };
+ */
+
+#undef ___C
+#undef ___DEPRECATED
 
 /**
  * struct cxl_command_info - Command information returned from a query.
-- 
cgit v1.2.3


From a869057cd639ed41a1b16bc072fb20cb1ed1dc51 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:15 +0200
Subject: ALSA: emu10k1: comment updates

Move comments to better locations, de-duplicate, fix/remove incorrect/
outdated ones, add new ones, and unify spacing somewhat.

While at it, also add testing credits for Jonathan Dowland (SB Live!
Platinum) and myself (E-MU 0404b).

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-2-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          | 504 +++++++++++++++++++--------------------
 include/uapi/sound/emu10k1.h     |   3 +
 sound/pci/emu10k1/emu10k1.c      |  11 -
 sound/pci/emu10k1/emu10k1_main.c |  77 +++---
 sound/pci/emu10k1/emufx.c        |  12 +-
 sound/pci/emu10k1/emumixer.c     |   5 +-
 sound/pci/emu10k1/emupcm.c       |  26 +-
 sound/pci/emu10k1/io.c           |   8 -
 sound/pci/emu10k1/p16v.h         |   2 +-
 sound/pci/emu10k1/p17v.h         |   4 +-
 10 files changed, 312 insertions(+), 340 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 790dedd42340..7786d5807679 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -97,9 +97,9 @@
 #define IPR_CHANNELLOOP		0x00000040	/* Channel (half) loop interrupt(s) pending	*/
 #define IPR_CHANNELNUMBERMASK	0x0000003f	/* When IPR_CHANNELLOOP is set, indicates the	*/
 						/* highest set channel in CLIPL, CLIPH, HLIPL,  */
-						/* or HLIPH.  When IP is written with CL set,	*/
+						/* or HLIPH.  When IPR is written with CL set,	*/
 						/* the bit in H/CLIPL or H/CLIPH corresponding	*/
-						/* to the CIN value written will be cleared.	*/
+						/* to the CN value written will be cleared.	*/
 
 #define INTE			0x0c		/* Interrupt enable register			*/
 #define INTE_VIRTUALSB_MASK	0xc0000000	/* Virtual Soundblaster I/O port capture	*/
@@ -180,6 +180,7 @@
 #define HCFG_CODECFORMAT_MASK	0x00030000	/* CODEC format					*/
 
 /* Specific to Alice2, CA0102 */
+
 #define HCFG_CODECFORMAT_AC97_1	0x00000000	/* AC97 CODEC format -- Ver 1.03		*/
 #define HCFG_CODECFORMAT_AC97_2	0x00010000	/* AC97 CODEC format -- Ver 2.1			*/
 #define HCFG_AUTOMUTE_ASYNC	0x00008000	/* When set, the async sample rate convertors	*/
@@ -200,9 +201,8 @@
  						/* I2S format input				*/
 /* Rest of HCFG 0x0000000f same as below. LOCKSOUNDCACHE etc.  */
 
-
-
 /* Older chips */
+
 #define HCFG_CODECFORMAT_AC97	0x00000000	/* AC97 CODEC format -- Primary Output		*/
 #define HCFG_CODECFORMAT_I2S	0x00010000	/* I2S CODEC format -- Secondary (Rear) Output	*/
 #define HCFG_GPINPUT0		0x00004000	/* External pin112				*/
@@ -238,7 +238,7 @@
 						/* Should be set to 1 when the EMU10K1 is	*/
 						/* completely initialized.			*/
 
-//For Audigy, MPU port move to 0x70-0x74 ptr register
+// On Audigy, the MPU port moved to the 0x70-0x74 ptr registers
 
 #define MUDATA			0x18		/* MPU401 data register (8 bits)       		*/
 
@@ -277,12 +277,6 @@
 #define A_IOCFG_REAR_JACK       0x8000
 #define A_IOCFG_PHONES_JACK     0x0100          /* LiveDrive					*/
 
-/* outputs:
- *	for audigy2 platinum:	0xa00
- *	for a2 platinum ex:	0x1c00
- *	for a1 platinum:	0x0
- */
-
 #define TIMER			0x1a		/* Timer terminal count register		*/
 						/* NOTE: After the rate is changed, a maximum	*/
 						/* of 1024 sample periods should be allowed	*/
@@ -323,7 +317,7 @@
 						/* 0x00000000 2-channel output. */
 						/* 0x00000200 8-channel output. */
 						/* 0x00000004 pauses stream/irq fail. */
-						/* Rest of bits no nothing to sound output */
+						/* Rest of bits do nothing to sound output */
 						/* bit 0: Enable P16V audio.
 						 * bit 1: Lock P16V record memory cache.
 						 * bit 2: Lock P16V playback memory cache.
@@ -337,6 +331,7 @@
 						 */
 #define IPR3			0x38		/* Cdif interrupt pending register		*/
 #define INTE3			0x3c		/* Cdif interrupt enable register. 	*/
+
 /************************************************************************************************/
 /* PCI function 1 registers, address = <val> + PCIBASE1						*/
 /************************************************************************************************/
@@ -355,11 +350,38 @@
 #define JOYSTICK_BUTTONS	0x0f		/* Joystick button data				*/
 #define JOYSTICK_COMPARATOR	0xf0		/* Joystick comparator data			*/
 
-
 /********************************************************************************************************/
 /* Emu10k1 pointer-offset register set, accessed through the PTR and DATA registers			*/
 /********************************************************************************************************/
 
+// No official documentation was released for EMU10K1, but some info
+// about playback can be extrapolated from the EMU8K documents:
+// "AWE32/EMU8000 Programmer’s Guide" (emu8kpgm.pdf) - registers
+// "AWE32 Developer's Information Pack" (adip301.pdf) - high-level view
+
+// The short version:
+// - The engine has 64 playback channels, also called voices. The channels
+//   operate independently, except when paired for stereo (see below).
+// - PCM samples are fetched into the cache; see description of CD0 below.
+// - Samples are consumed at the rate CPF_CURRENTPITCH.
+// - 8-bit samples are transformed upon use: cooked = (raw ^ 0x80) << 8
+// - 8 samples are read at CCR_READADDRESS:CPF_FRACADDRESS and interpolated
+//   according to CCCA_INTERPROM_*. With CCCA_INTERPROM_0 selected and a zero
+//   CPF_FRACADDRESS, this results in CCR_READADDRESS[3] being used verbatim.
+// - The value is multiplied by CVCF_CURRENTVOL.
+// - The value goes through a filter with cutoff CVCF_CURRENTFILTER;
+//   delay stages Z1 and Z2.
+// - The value is added by so-called `sends` to 4 (EMU10K1) / 8 (EMU10K2)
+//   of the 16 (EMU10K1) / 64 (EMU10K2) FX bus accumulators via FXRT*,
+//   multiplied by a per-send amount (*_FXSENDAMOUNT_*).
+//   The scaling of the send amounts is exponential-ish.
+// - The DSP has a go at FXBUS* and outputs the values to EXTOUT* or EMU32OUT*.
+// - The pitch, volume, and filter cutoff can be modulated by two envelope
+//   engines and two low frequency oscillators.
+// - To avoid abrupt changes to the parameters (which may cause audible
+//   distortion), the modulation engine sets the target registers, towards
+//   which the current registers "swerve" gradually.
+
 #define CPF			0x00		/* Current pitch and fraction register			*/
 #define CPF_CURRENTPITCH_MASK	0xffff0000	/* Current pitch (linear, 0x4000 == unity pitch shift) 	*/
 #define CPF_CURRENTPITCH	0x10100000
@@ -399,7 +421,7 @@
 #define PSST_LOOPSTARTADDR_MASK	0x00ffffff	/* Loop start address of the specified channel		*/
 #define PSST_LOOPSTARTADDR	0x18000006
 
-#define DSL			0x07		/* Send D amount and loop start address register	*/
+#define DSL			0x07		/* Send D amount and loop end address register	*/
 #define DSL_FXSENDAMOUNT_D_MASK	0xff000000	/* Linear level of channel output sent to FX send bus D	*/
 
 #define DSL_FXSENDAMOUNT_D	0x08180007
@@ -424,25 +446,28 @@
 #define CCCA_INTERPROM_6	0x0c000000	/* Select interpolation ROM 6				*/
 #define CCCA_INTERPROM_7	0x0e000000	/* Select interpolation ROM 7				*/
 #define CCCA_8BITSELECT		0x01000000	/* 1 = Sound memory for this channel uses 8-bit samples	*/
+						/* 8-bit samples are unsigned, 16-bit ones signed	*/
 #define CCCA_CURRADDR_MASK	0x00ffffff	/* Current address of the selected channel		*/
 #define CCCA_CURRADDR		0x18000008
 
 #define CCR			0x09		/* Cache control register				*/
 #define CCR_CACHEINVALIDSIZE	0x07190009
-#define CCR_CACHEINVALIDSIZE_MASK	0xfe000000	/* Number of invalid samples cache for this channel    	*/
+#define CCR_CACHEINVALIDSIZE_MASK 0xfe000000	/* Number of invalid samples before the read address	*/
 #define CCR_CACHELOOPFLAG	0x01000000	/* 1 = Cache has a loop service pending			*/
 #define CCR_INTERLEAVEDSAMPLES	0x00800000	/* 1 = A cache service will fetch interleaved samples	*/
+						/* Auto-set from CPF_STEREO_MASK			*/
 #define CCR_WORDSIZEDSAMPLES	0x00400000	/* 1 = A cache service will fetch word sized samples	*/
+						/* Auto-set from CCCA_8BITSELECT			*/
 #define CCR_READADDRESS		0x06100009
-#define CCR_READADDRESS_MASK	0x003f0000	/* Location of cache just beyond current cache service	*/
+#define CCR_READADDRESS_MASK	0x003f0000	/* Next cached sample to play				*/
 #define CCR_LOOPINVALSIZE	0x0000fe00	/* Number of invalid samples in cache prior to loop	*/
 						/* NOTE: This is valid only if CACHELOOPFLAG is set	*/
 #define CCR_LOOPFLAG		0x00000100	/* Set for a single sample period when a loop occurs	*/
-#define CCR_CACHELOOPADDRHI	0x000000ff	/* DSL_LOOPSTARTADDR's hi byte if CACHELOOPFLAG is set	*/
+#define CCR_CACHELOOPADDRHI	0x000000ff	/* CLP_LOOPSTARTADDR's hi byte if CACHELOOPFLAG is set	*/
 
 #define CLP			0x0a		/* Cache loop register (valid if CCR_CACHELOOPFLAG = 1) */
 						/* NOTE: This register is normally not used		*/
-#define CLP_CACHELOOPADDR	0x0000ffff	/* Cache loop address (DSL_LOOPSTARTADDR [0..15])	*/
+#define CLP_CACHELOOPADDR	0x0000ffff	/* Cache loop address low word				*/
 
 #define FXRT			0x0b		/* Effects send routing register			*/
 						/* NOTE: It is illegal to assign the same routing to	*/
@@ -454,7 +479,6 @@
 
 #define A_HR			0x0b	/* High Resolution. 24bit playback from host to DSP. */
 #define MAPA			0x0c		/* Cache map A						*/
-
 #define MAPB			0x0d		/* Cache map B						*/
 
 #define MAP_PTE_MASK0		0xfffff000	/* The 20 MSBs of the PTE indexed by the PTI		*/
@@ -463,7 +487,7 @@
 #define MAP_PTE_MASK1		0xffffe000	/* The 19 MSBs of the PTE indexed by the PTI		*/
 #define MAP_PTI_MASK1		0x00001fff	/* The 13 bit index to one of the 8192 PTE dwords      	*/
 
-/* 0x0e, 0x0f: Not used */
+/* 0x0e, 0x0f: Internal state, at least on Audigy */
 
 #define ENVVOL			0x10		/* Volume envelope register				*/
 #define ENVVOL_MASK		0x0000ffff	/* Current value of volume envelope state variable	*/  
@@ -476,9 +500,9 @@
 						/* 0 = infinite, 1 = 10.9msec, ... 0x7f = 5.5msec	*/
 
 #define DCYSUSV 		0x12		/* Volume envelope sustain and decay register		*/
-#define DCYSUSV_PHASE1_MASK	0x00008000	/* 0 = Begin attack phase, 1 = begin release phase	*/
+#define DCYSUSV_PHASE1_MASK	0x00008000	/* 0 = Begin decay phase, 1 = begin release phase	*/
 #define DCYSUSV_SUSTAINLEVEL_MASK 0x00007f00	/* 127 = full, 0 = off, 0.75dB increments		*/
-#define DCYSUSV_CHANNELENABLE_MASK 0x00000080	/* 1 = Inhibit envelope engine from writing values in	*/
+#define DCYSUSV_CHANNELENABLE_MASK 0x00000080	/* 0 = Inhibit envelope engine from writing values in	*/
 						/* this channel and from writing to pitch, filter and	*/
 						/* volume targets.					*/
 #define DCYSUSV_DECAYTIME_MASK	0x0000007f	/* Volume envelope decay time, log encoded     		*/
@@ -499,7 +523,7 @@
 						/* 0 = infinite, 1 = 11msec, ... 0x7f = 5.5msec		*/
 
 #define DCYSUSM			0x16		/* Modulation envelope decay and sustain register	*/
-#define DCYSUSM_PHASE1_MASK	0x00008000	/* 0 = Begin attack phase, 1 = begin release phase	*/
+#define DCYSUSM_PHASE1_MASK	0x00008000	/* 0 = Begin decay phase, 1 = begin release phase	*/
 #define DCYSUSM_SUSTAINLEVEL_MASK 0x00007f00	/* 127 = full, 0 = off, 0.75dB increments		*/
 #define DCYSUSM_DECAYTIME_MASK	0x0000007f	/* Envelope decay time, log encoded			*/
 						/* 0 = 43.7msec, 1 = 21.8msec, 0x7f = 22msec		*/
@@ -521,7 +545,6 @@
 #define IFATN_ATTENUATION_MASK	0x000000ff	/* Initial attenuation in 0.375dB steps			*/
 #define IFATN_ATTENUATION	0x08000019
 
-
 #define PEFE			0x1a		/* Pitch envelope and filter envelope amount register	*/
 #define PEFE_PITCHAMOUNT_MASK	0x0000ff00	/* Pitch envlope amount					*/
 						/* Signed 2's complement, +/- one octave peak extremes	*/
@@ -529,19 +552,19 @@
 #define PEFE_FILTERAMOUNT_MASK	0x000000ff	/* Filter envlope amount				*/
 						/* Signed 2's complement, +/- six octaves peak extremes */
 #define PEFE_FILTERAMOUNT	0x0800001a
+
 #define FMMOD			0x1b		/* Vibrato/filter modulation from LFO register		*/
 #define FMMOD_MODVIBRATO	0x0000ff00	/* Vibrato LFO modulation depth				*/
 						/* Signed 2's complement, +/- one octave extremes	*/
 #define FMMOD_MOFILTER		0x000000ff	/* Filter LFO modulation depth				*/
 						/* Signed 2's complement, +/- three octave extremes	*/
 
-
 #define TREMFRQ 		0x1c		/* Tremolo amount and modulation LFO frequency register	*/
 #define TREMFRQ_DEPTH		0x0000ff00	/* Tremolo depth					*/
 						/* Signed 2's complement, with +/- 12dB extremes	*/
-
 #define TREMFRQ_FREQUENCY	0x000000ff	/* Tremolo LFO frequency				*/
 						/* ??Hz steps, maximum of ?? Hz.			*/
+
 #define FM2FRQ2 		0x1d		/* Vibrato amount and vibrato LFO frequency register	*/
 #define FM2FRQ2_DEPTH		0x0000ff00	/* Vibrato LFO vibrato depth				*/
 						/* Signed 2's complement, +/- one octave extremes	*/
@@ -645,7 +668,7 @@
 #define FXBA			0x47		/* FX Buffer Address */
 #define FXBA_MASK		0xfffff000	/* 20 bit base address					*/
 
-#define A_HWM			0x48	/* High PCI Water Mark - word access, defaults to 3f */
+#define A_HWM			0x48		/* High PCI Water Mark - word access, defaults to 3f */
 
 #define MICBS			0x49		/* Microphone buffer size register			*/
 
@@ -653,9 +676,7 @@
 
 #define FXBS			0x4b		/* FX buffer size register				*/
 
-/* register: 0x4c..4f: ffff-ffff current amounts, per-channel */
-
-/* The following mask values define the size of the ADC, MIX and FX buffers in bytes */
+/* The following mask values define the size of the ADC, MIC and FX buffers in bytes */
 #define ADCBS_BUFSIZE_NONE	0x00000000
 #define ADCBS_BUFSIZE_384	0x00000001
 #define ADCBS_BUFSIZE_448	0x00000002
@@ -689,29 +710,21 @@
 #define ADCBS_BUFSIZE_57344	0x0000001e
 #define ADCBS_BUFSIZE_65536	0x0000001f
 
-/* Current Send B, A Amounts */
-#define A_CSBA			0x4c
-
-/* Current Send D, C Amounts */
-#define A_CSDC			0x4d
-
-/* Current Send F, E Amounts */
-#define A_CSFE			0x4e
-
-/* Current Send H, G Amounts */
-#define A_CSHG			0x4f
-
+#define A_CSBA			0x4c		/* FX send B & A current amounts			*/
+#define A_CSDC			0x4d		/* FX send D & C current amounts			*/
+#define A_CSFE			0x4e		/* FX send F & E current amounts			*/
+#define A_CSHG			0x4f		/* FX send H & G current amounts			*/
 
-#define CDCS			0x50		/* CD-ROM digital channel status register	*/
+// NOTE: 0x50,51,52: 64-bit (split over voices 0 & 1)
+#define CDCS			0x50		/* CD-ROM digital channel status register		*/
 
-#define GPSCS			0x51		/* General Purpose SPDIF channel status register*/
+#define GPSCS			0x51		/* General Purpose SPDIF channel status register	*/
 
-#define DBG			0x52		/* DO NOT PROGRAM THIS REGISTER!!! MAY DESTROY CHIP */
+#define DBG			0x52
 
-/* S/PDIF Input C Channel Status */
-#define A_SPSC			0x52
+#define A_SPSC			0x52		/* S/PDIF Input C Channel Status			*/
 
-#define REG53			0x53		/* DO NOT PROGRAM THIS REGISTER!!! MAY DESTROY CHIP */
+#define REG53			0x53		/* DO NOT PROGRAM THIS REGISTER!!! MAY DESTROY CHIP	*/
 
 #define A_DBG			 0x53
 #define A_DBG_SINGLE_STEP	 0x00020000	/* Set to zero to start dsp */
@@ -720,7 +733,7 @@
 #define A_DBG_SATURATION_OCCURED 0x20000000
 #define A_DBG_SATURATION_ADDR	 0x0ffc0000
 
-// NOTE: 0x54,55,56: 64-bit
+// NOTE: 0x54,55,56: 64-bit (split over voices 0 & 1)
 #define SPCS0			0x54		/* SPDIF output Channel Status 0 register	*/
 
 #define SPCS1			0x55		/* SPDIF output Channel Status 1 register	*/
@@ -753,17 +766,14 @@
 
 /* 0x57: Not used */
 
-/* The 32-bit CLIx and SOLx registers all have one bit per channel control/status      		*/
+/* The 32-bit CLIx and SOLEx registers all have one bit per channel control/status      	*/
 #define CLIEL			0x58		/* Channel loop interrupt enable low register	*/
-
 #define CLIEH			0x59		/* Channel loop interrupt enable high register	*/
 
 #define CLIPL			0x5a		/* Channel loop interrupt pending low register	*/
-
 #define CLIPH			0x5b		/* Channel loop interrupt pending high register	*/
 
 #define SOLEL			0x5c		/* Stop on loop enable low register		*/
-
 #define SOLEH			0x5d		/* Stop on loop enable high register		*/
 
 #define SPBYPASS		0x5e		/* SPDIF BYPASS mode register			*/
@@ -773,13 +783,12 @@
 #define SPBYPASS_FORMAT		0x00000f00      /* If 1, SPDIF XX uses 24 bit, if 0 - 20 bit	*/
 
 #define AC97SLOT		0x5f            /* additional AC97 slots enable bits		*/
-#define AC97SLOT_REAR_RIGHT	0x01		/* Rear left */
-#define AC97SLOT_REAR_LEFT	0x02		/* Rear right */
-#define AC97SLOT_CNTR		0x10            /* Center enable */
-#define AC97SLOT_LFE		0x20            /* LFE enable */
+#define AC97SLOT_REAR_RIGHT	0x01		/* Rear left					*/
+#define AC97SLOT_REAR_LEFT	0x02		/* Rear right					*/
+#define AC97SLOT_CNTR		0x10            /* Center enable				*/
+#define AC97SLOT_LFE		0x20            /* LFE enable					*/
 
-/* PCB Revision */
-#define A_PCB			0x5f
+#define A_PCB			0x5f		/* PCB Revision					*/
 
 // NOTE: 0x60,61,62: 64-bit
 #define CDSRCS			0x60		/* CD-ROM Sample Rate Converter status register	*/
@@ -819,27 +828,21 @@
 #define FXIDX_MASK		0x0000ffff	/* 16-bit value					*/
 #define FXIDX_IDX		0x10000065
 
-/* The 32-bit HLIx and HLIPx registers all have one bit per channel control/status      		*/
+/* The 32-bit HLIEx and HLIPx registers all have one bit per channel control/status      		*/
 #define HLIEL			0x66		/* Channel half loop interrupt enable low register	*/
-
 #define HLIEH			0x67		/* Channel half loop interrupt enable high register	*/
 
 #define HLIPL			0x68		/* Channel half loop interrupt pending low register	*/
-
 #define HLIPH			0x69		/* Channel half loop interrupt pending high register	*/
 
-/* S/PDIF Host Record Index (bypasses SRC) */
-#define A_SPRI			0x6a
-/* S/PDIF Host Record Address */
-#define A_SPRA			0x6b
-/* S/PDIF Host Record Control */
-#define A_SPRC			0x6c
-/* Delayed Interrupt Counter & Enable */
-#define A_DICE			0x6d
-/* Tank Table Base */
-#define A_TTB			0x6e
-/* Tank Delay Offset */
-#define A_TDOF			0x6f
+#define A_SPRI			0x6a		/* S/PDIF Host Record Index (bypasses SRC)	*/
+#define A_SPRA			0x6b		/* S/PDIF Host Record Address			*/
+#define A_SPRC			0x6c		/* S/PDIF Host Record Control			*/
+
+#define A_DICE			0x6d		/* Delayed Interrupt Counter & Enable		*/
+
+#define A_TTB			0x6e		/* Tank Table Base				*/
+#define A_TDOF			0x6f		/* Tank Delay Offset				*/
 
 /* This is the MPU port on the card (via the game port)						*/
 #define A_MUDATA1		0x70
@@ -852,7 +855,7 @@
 #define A_MUSTAT2		A_MUCMD2	
 
 /* The next two are the Audigy equivalent of FXWC						*/
-/* the Audigy can record any output (16bit, 48kHz, up to 64 channel simultaneously) 		*/
+/* the Audigy can record any output (16bit, 48kHz, up to 64 channels simultaneously) 		*/
 /* Each bit selects a channel for recording */
 #define A_FXWC1			0x74            /* Selects 0x7f-0x60 for FX recording           */
 #define A_FXWC2			0x75		/* Selects 0x9f-0x80 for FX recording           */
@@ -880,20 +883,13 @@
 #define A_PCM_96000		0x00004000
 #define A_PCM_44100		0x00008000
 
-/* I2S0 Sample Rate Tracker Status */
-#define A_SRT3			0x77
-
-/* I2S1 Sample Rate Tracker Status */
-#define A_SRT4			0x78
-
-/* I2S2 Sample Rate Tracker Status */
-#define A_SRT5			0x79
+#define A_SRT3			0x77		/* I2S0 Sample Rate Tracker Status		*/
+#define A_SRT4			0x78		/* I2S1 Sample Rate Tracker Status		*/
+#define A_SRT5			0x79		/* I2S2 Sample Rate Tracker Status		*/
 /* - default to 0x01080000 on my audigy 2 ZS --rlrevell	*/
 
-/* Tank Table DMA Address */
-#define A_TTDA			0x7a
-/* Tank Table DMA Data */
-#define A_TTDD			0x7b
+#define A_TTDA			0x7a		/* Tank Table DMA Address			*/
+#define A_TTDD			0x7b		/* Tank Table DMA Data				*/
 
 #define A_FXRT2			0x7c
 #define A_FXRT_CHANNELE		0x0000003f	/* Effects send bus number for channel's effects send E	*/
@@ -906,6 +902,7 @@
 #define A_FXSENDAMOUNT_F_MASK	0x00FF0000
 #define A_FXSENDAMOUNT_G_MASK	0x0000FF00
 #define A_FXSENDAMOUNT_H_MASK	0x000000FF
+
 /* 0x7c, 0x7e "high bit is used for filtering" */
  
 /* The send amounts for this one are the same as used with the emu10k1 */
@@ -956,15 +953,47 @@
 #define A_HIWORD_RESULT_MASK	0x007ff000
 #define A_HIWORD_OPA_MASK	0x000007ff
 
+
+/************************************************************************************************/
+/* E-MU Digital Audio System overview								*/
+/************************************************************************************************/
+
+// - These cards use a regular PCI-attached Audigy chip (Alice2/Tina/Tina2);
+//   the PCIe variants simply put the Audigy chip behind a PCI bridge.
+// - All physical PCM I/O is routed through an additional FPGA; the regular
+//   EXTIN/EXTOUT ports are unconnected.
+// - The FPGA has a signal routing matrix, to connect each destination (output
+//   socket or capture channel) to a source (input socket or playback channel).
+// - The FPGA is controlled via Audigy's GPIO port, while sample data is
+//   transmitted via proprietary EMU32 serial links. On first-generation
+//   E-MU 1010 cards, Audigy's I2S inputs are also used for sample data.
+// - The Audio/Micro Dock is attached to Hana via EDI, a "network" link.
+// - The Audigy chip operates in slave mode; the clock is supplied by the FPGA.
+//   Gen1 E-MU 1010 cards have two crystals (for 44.1 kHz and 48 kHz multiples),
+//   while the later cards use a single crystal and a PLL chip.
+// - The whole card is switched to 2x/4x mode to achieve 88.2/96/176.4/192 kHz
+//   sample rates. Alice2/Tina keeps running at 44.1/48 kHz, but multiple channels
+//   are bundled.
+// - The number of available EMU32/EDI channels is hit in 2x/4x mode, so the total
+//   number of usable inputs/outputs is limited, esp. with ADAT in use.
+// - S/PDIF is unavailable in 4x mode (only over TOSLINK on newer 1010 cards) due
+//   to being unspecified at 176.4/192 kHz. Therefore, the Dock's S/PDIF channels
+//   can overlap with the Dock's ADC/DAC's high channels.
+// - The code names are mentioned below and in the emu_chip_details table.
+
 /************************************************************************************************/
-/* EMU1010m HANA FPGA registers									*/
+/* EMU1010 FPGA registers									*/
 /************************************************************************************************/
+
 #define EMU_HANA_DESTHI		0x00	/* 0000xxx  3 bits Link Destination */
 #define EMU_HANA_DESTLO		0x01	/* 00xxxxx  5 bits */
+
 #define EMU_HANA_SRCHI		0x02	/* 0000xxx  3 bits Link Source */
 #define EMU_HANA_SRCLO		0x03	/* 00xxxxx  5 bits */
+
 #define EMU_HANA_DOCK_PWR	0x04	/* 000000x  1 bits Audio Dock power */
 #define EMU_HANA_DOCK_PWR_ON		0x01 /* Audio Dock power on */
+
 #define EMU_HANA_WCLOCK		0x05	/* 0000xxx  3 bits Word Clock source select  */
 					/* Must be written after power on to reset DLL */
 					/* One is unable to detect the Audio dock without this */
@@ -1073,21 +1102,19 @@
 #define EMU_HANA_0202_DAC_PAD1	0x10	/* 14dB Attenuation on 0202 DAC 1. Left and Right */
 
 /* 0x14 - 0x1f Unused R/W registers */
-#define EMU_HANA_IRQ_STATUS	0x20	/* 000xxxx  4 bits IRQ Status  */
-#if 0  /* Already defined for reg 0x09 IRQ_ENABLE */
-#define EMU_HANA_IRQ_WCLK_CHANGED	0x01
-#define EMU_HANA_IRQ_ADAT		0x02
-#define EMU_HANA_IRQ_DOCK		0x04
-#define EMU_HANA_IRQ_DOCK_LOST		0x08
-#endif
+
+#define EMU_HANA_IRQ_STATUS	0x20	/* 00xxxxx  5 bits IRQ Status  */
+					/* Same bits as for EMU_HANA_IRQ_ENABLE */
+					/* Reading the register resets it. */
 
 #define EMU_HANA_OPTION_CARDS	0x21	/* 000xxxx  4 bits Presence of option cards */
-#define EMU_HANA_OPTION_HAMOA	0x01	/* HAMOA card present */
+#define EMU_HANA_OPTION_HAMOA	0x01	/* Hamoa (analog I/O) card present */
 #define EMU_HANA_OPTION_SYNC	0x02	/* Sync card present */
-#define EMU_HANA_OPTION_DOCK_ONLINE	0x04	/* Audio Dock online and FPGA configured */
-#define EMU_HANA_OPTION_DOCK_OFFLINE	0x08	/* Audio Dock online and FPGA not configured */
+#define EMU_HANA_OPTION_DOCK_ONLINE	0x04	/* Audio/Micro dock present and FPGA configured */
+#define EMU_HANA_OPTION_DOCK_OFFLINE	0x08	/* Audio/Micro dock present and FPGA not configured */
 
-#define EMU_HANA_ID		0x22	/* 1010101  7 bits ID byte & 0x7f = 0x55 */
+#define EMU_HANA_ID		0x22	/* 1010101  7 bits ID byte & 0x7f = 0x55 with Alice2 */
+					/* 0010101  5 bits ID byte & 0x1f = 0x15 with Tina/2 */
 
 #define EMU_HANA_MAJOR_REV	0x23	/* 0000xxx  3 bit  Hana FPGA Major rev */
 #define EMU_HANA_MINOR_REV	0x24	/* 0000xxx  3 bit  Hana FPGA Minor rev */
@@ -1110,31 +1137,31 @@
 
 #define EMU_HANA2_WC_SPDIF_HI	0x2e	/* 0xxxxxx  6 bit  HANA2 SPDIF IN Word clock, upper 6 bits */
 #define EMU_HANA2_WC_SPDIF_LO	0x2f	/* 0xxxxxx  6 bit  HANA2 SPDIF IN Word clock, lower 6 bits */
+
 /* 0x30 - 0x3f Unused Read only registers */
 
 /************************************************************************************************/
-/* EMU1010m HANA Destinations									*/
+/* EMU1010 Audio Destinations									*/
 /************************************************************************************************/
-/* Hana, original 1010,1212,1820 using Alice2
- * Destiniations for SRATEX = 1X rates: 44.1 kHz or 48 kHz
+/* Hana, original 1010,1212m,1820[m] using Alice2
  * 0x00, 0x00-0x0f: 16 EMU32 channels to Alice2
- * 0x01, 0x10-0x1f: 32 Elink channels to Audio Dock
- * 0x01, 0x00: Dock DAC 1 Left
- * 0x01, 0x04: Dock DAC 1 Right
- * 0x01, 0x08: Dock DAC 2 Left
- * 0x01, 0x0c: Dock DAC 2 Right
- * 0x01, 0x10: Dock DAC 3 Left
- * 0x01, 0x12: PHONES Left
- * 0x01, 0x14: Dock DAC 3 Right
- * 0x01, 0x16: PHONES Right
- * 0x01, 0x18: Dock DAC 4 Left
- * 0x01, 0x1a: S/PDIF Left
- * 0x01, 0x1c: Dock DAC 4 Right
- * 0x01, 0x1e: S/PDIF Right
+ * 0x01, 0x00-0x1f: 32 EDI channels to Audio Dock
+ *       0x00: Dock DAC 1 Left
+ *       0x04: Dock DAC 1 Right
+ *       0x08: Dock DAC 2 Left
+ *       0x0c: Dock DAC 2 Right
+ *       0x10: Dock DAC 3 Left
+ *       0x12: PHONES Left (n/a in 2x/4x mode; output mirrors DAC4 Left)
+ *       0x14: Dock DAC 3 Right
+ *       0x16: PHONES Right (n/a in 2x/4x mode; output mirrors DAC4 Right)
+ *       0x18: Dock DAC 4 Left
+ *       0x1a: S/PDIF Left
+ *       0x1c: Dock DAC 4 Right
+ *       0x1e: S/PDIF Right
  * 0x02, 0x00: Hana S/PDIF Left
  * 0x02, 0x01: Hana S/PDIF Right
- * 0x03, 0x00: Hanoa DAC Left
- * 0x03, 0x01: Hanoa DAC Right
+ * 0x03, 0x00: Hamoa DAC Left
+ * 0x03, 0x01: Hamoa DAC Right
  * 0x04, 0x00-0x07: Hana ADAT
  * 0x05, 0x00: I2S0 Left to Alice2
  * 0x05, 0x01: I2S0 Right to Alice2
@@ -1146,40 +1173,29 @@
  * Hana2 never released, but used Tina
  * Not needed.
  *
- * Hana3, rev2 1010,1212,1616 using Tina
- * Destinations for SRATEX = 1X rates: 44.1 kHz or 48 kHz
+ * Hana3, rev2 1010,1212m,1616[m] using Tina
  * 0x00, 0x00-0x0f: 16 EMU32A channels to Tina
- * 0x01, 0x10-0x1f: 32 EDI channels to Micro Dock
- * 0x01, 0x00: Dock DAC 1 Left
- * 0x01, 0x04: Dock DAC 1 Right
- * 0x01, 0x08: Dock DAC 2 Left
- * 0x01, 0x0c: Dock DAC 2 Right
- * 0x01, 0x10: Dock DAC 3 Left
- * 0x01, 0x12: Dock S/PDIF Left
- * 0x01, 0x14: Dock DAC 3 Right
- * 0x01, 0x16: Dock S/PDIF Right
- * 0x01, 0x18-0x1f: Dock ADAT 0-7
+ * 0x01, 0x00-0x1f: 32 EDI channels to Micro Dock
+ *       0x00: Dock DAC 1 Left
+ *       0x04: Dock DAC 1 Right
+ *       0x08: Dock DAC 2 Left
+ *       0x0c: Dock DAC 2 Right
+ *       0x10: Dock DAC 3 Left
+ *       0x12: Dock S/PDIF Left
+ *       0x14: Dock DAC 3 Right
+ *       0x16: Dock S/PDIF Right
+ *       0x18-0x1f: Dock ADAT 0-7
  * 0x02, 0x00: Hana3 S/PDIF Left
  * 0x02, 0x01: Hana3 S/PDIF Right
- * 0x03, 0x00: Hanoa DAC Left
- * 0x03, 0x01: Hanoa DAC Right
+ * 0x03, 0x00: Hamoa DAC Left
+ * 0x03, 0x01: Hamoa DAC Right
  * 0x04, 0x00-0x07: Hana3 ADAT 0-7
  * 0x05, 0x00-0x0f: 16 EMU32B channels to Tina
  * 0x06-0x07: Not used
  *
  * HanaLite, rev1 0404 using Alice2
- * Destiniations for SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00, 0x00-0x0f: 16 EMU32 channels to Alice2
- * 0x01: Not used
- * 0x02, 0x00: S/PDIF Left
- * 0x02, 0x01: S/PDIF Right
- * 0x03, 0x00: DAC Left
- * 0x03, 0x01: DAC Right
- * 0x04-0x07: Not used
- *
- * HanaLiteLite, rev2 0404 using Alice2
- * Destiniations for SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00, 0x00-0x0f: 16 EMU32 channels to Alice2
+ * HanaLiteLite, rev2 0404 using Tina
+ * 0x00, 0x00-0x0f: 16 EMU32 channels to Alice2/Tina
  * 0x01: Not used
  * 0x02, 0x00: S/PDIF Left
  * 0x02, 0x01: S/PDIF Right
@@ -1188,35 +1204,21 @@
  * 0x04-0x07: Not used
  *
  * Mana, Cardbus 1616 using Tina2
- * Destinations for SRATEX = 1X rates: 44.1 kHz or 48 kHz
  * 0x00, 0x00-0x0f: 16 EMU32A channels to Tina2
- * 0x01, 0x10-0x1f: 32 EDI channels to Micro Dock
- * 0x01, 0x00: Dock DAC 1 Left
- * 0x01, 0x04: Dock DAC 1 Right
- * 0x01, 0x08: Dock DAC 2 Left
- * 0x01, 0x0c: Dock DAC 2 Right
- * 0x01, 0x10: Dock DAC 3 Left
- * 0x01, 0x12: Dock S/PDIF Left
- * 0x01, 0x14: Dock DAC 3 Right
- * 0x01, 0x16: Dock S/PDIF Right
- * 0x01, 0x18-0x1f: Dock ADAT 0-7
+ * 0x01, 0x00-0x1f: 32 EDI channels to Micro Dock
+ *       (same as rev2 1010)
  * 0x02: Not used
  * 0x03, 0x00: Mana DAC Left
  * 0x03, 0x01: Mana DAC Right
  * 0x04, 0x00-0x0f: 16 EMU32B channels to Tina2
  * 0x05-0x07: Not used
- *
- *
  */
+
 /* 32-bit destinations of signal in the Hana FPGA. Destinations are either
- * physical outputs of Hana, or outputs going to Alice2 (audigy) for capture
- * - 16 x EMU_DST_ALICE2_EMU32_X.
- */
-/* EMU32 = 32-bit serial channel between Alice2 (audigy) and Hana (FPGA) */
-/* EMU_DST_ALICE2_EMU32_X - data channels from Hana to Alice2 used for capture.
- * Which data is fed into a EMU_DST_ALICE2_EMU32_X channel in Hana depends on
- * setup of mixer control for each destination - see emumixer.c -
- * snd_emu1010_output_enum_ctls[], snd_emu1010_input_enum_ctls[]
+ * physical outputs of Hana, or outputs going to Alice2/Tina for capture -
+ * 16 x EMU_DST_ALICE2_EMU32_X (2x on rev2 boards). Which data is fed into
+ * a channel depends on the mixer control setting for each destination - see
+ * emumixer.c - snd_emu1010_output_enum_ctls[], snd_emu1010_input_enum_ctls[]
  */
 #define EMU_DST_ALICE2_EMU32_0	0x000f	/* 16 EMU32 channels to Alice2 +0 to +0xf */
 #define EMU_DST_ALICE2_EMU32_1	0x0000	/* 16 EMU32 channels to Alice2 +0 to +0xf */
@@ -1286,6 +1288,7 @@
 #define EMU_DST_HAMOA_DAC_RIGHT2	0x0303	/* Hamoa DAC Right, 2nd or 96kHz */
 #define EMU_DST_HAMOA_DAC_RIGHT3	0x0305	/* Hamoa DAC Right, 3rd or 192kHz */
 #define EMU_DST_HAMOA_DAC_RIGHT4	0x0307	/* Hamoa DAC Right, 4th or 192kHz */
+// In S/MUX mode, the samples of one channel are adjacent.
 #define EMU_DST_HANA_ADAT	0x0400	/* Hana ADAT 8 channel out +0 to +7 */
 #define EMU_DST_ALICE_I2S0_LEFT		0x0500	/* Alice2 I2S0 Left */
 #define EMU_DST_ALICE_I2S0_RIGHT	0x0501	/* Alice2 I2S0 Right */
@@ -1295,39 +1298,32 @@
 #define EMU_DST_ALICE_I2S2_RIGHT	0x0701	/* Alice2 I2S2 Right */
 
 /* Additional destinations for 1616(M)/Microdock */
-/* Microdock S/PDIF OUT Left, 1st or 48kHz only */
-#define EMU_DST_MDOCK_SPDIF_LEFT1	0x0112
-/* Microdock S/PDIF OUT Left, 2nd or 96kHz */
-#define EMU_DST_MDOCK_SPDIF_LEFT2	0x0113
-/* Microdock S/PDIF OUT Right, 1st or 48kHz only */
-#define EMU_DST_MDOCK_SPDIF_RIGHT1	0x0116
-/* Microdock S/PDIF OUT Right, 2nd or 96kHz  */
-#define EMU_DST_MDOCK_SPDIF_RIGHT2	0x0117
-/* Microdock S/PDIF ADAT 8 channel out +8 to +f */
-#define EMU_DST_MDOCK_ADAT		0x0118
-
-/* Headphone jack on 1010 cardbus? 44.1/48kHz only? */
-#define EMU_DST_MANA_DAC_LEFT		0x0300
-/* Headphone jack on 1010 cardbus? 44.1/48kHz only? */
-#define EMU_DST_MANA_DAC_RIGHT		0x0301
+
+#define EMU_DST_MDOCK_SPDIF_LEFT1	0x0112	/* Microdock S/PDIF OUT Left, 1st or 48kHz only */
+#define EMU_DST_MDOCK_SPDIF_LEFT2	0x0113	/* Microdock S/PDIF OUT Left, 2nd or 96kHz */
+#define EMU_DST_MDOCK_SPDIF_RIGHT1	0x0116	/* Microdock S/PDIF OUT Right, 1st or 48kHz only */
+#define EMU_DST_MDOCK_SPDIF_RIGHT2	0x0117	/* Microdock S/PDIF OUT Right, 2nd or 96kHz  */
+#define EMU_DST_MDOCK_ADAT		0x0118	/* Microdock S/PDIF ADAT 8 channel out +8 to +f */
+
+#define EMU_DST_MANA_DAC_LEFT		0x0300	/* Headphone jack on 1010 cardbus? 44.1/48kHz only? */
+#define EMU_DST_MANA_DAC_RIGHT		0x0301	/* Headphone jack on 1010 cardbus? 44.1/48kHz only? */
 
 /************************************************************************************************/
-/* EMU1010m HANA Sources									*/
+/* EMU1010 Audio Sources									*/
 /************************************************************************************************/
-/* Hana, original 1010,1212,1820 using Alice2
- * Sources SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00,0x00-0x1f: Silence
- * 0x01, 0x10-0x1f: 32 Elink channels from Audio Dock
- * 0x01, 0x00: Dock Mic A
- * 0x01, 0x04: Dock Mic B
- * 0x01, 0x08: Dock ADC 1 Left
- * 0x01, 0x0c: Dock ADC 1 Right
- * 0x01, 0x10: Dock ADC 2 Left
- * 0x01, 0x14: Dock ADC 2 Right
- * 0x01, 0x18: Dock ADC 3 Left
- * 0x01, 0x1c: Dock ADC 3 Right
- * 0x02, 0x00: Hana ADC Left
- * 0x02, 0x01: Hana ADC Right
+/* Hana, original 1010,1212m,1820[m] using Alice2
+ * 0x00, 0x00-0x1f: Silence
+ * 0x01, 0x00-0x1f: 32 EDI channels from Audio Dock
+ *       0x00: Dock Mic A
+ *       0x04: Dock Mic B
+ *       0x08: Dock ADC 1 Left
+ *       0x0c: Dock ADC 1 Right
+ *       0x10: Dock ADC 2 Left
+ *       0x14: Dock ADC 2 Right
+ *       0x18: Dock ADC 3 Left
+ *       0x1c: Dock ADC 3 Right
+ * 0x02, 0x00: Hamoa ADC Left
+ * 0x02, 0x01: Hamoa ADC Right
  * 0x03, 0x00-0x0f: 16 inputs from Alice2 Emu32A output
  * 0x03, 0x10-0x1f: 16 inputs from Alice2 Emu32B output
  * 0x04, 0x00-0x07: Hana ADAT
@@ -1338,23 +1334,20 @@
  * Hana2 never released, but used Tina
  * Not needed.
  *
- * Hana3, rev2 1010,1212,1616 using Tina
- * Sources SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00,0x00-0x1f: Silence
- * 0x01, 0x10-0x1f: 32 Elink channels from Audio Dock
- * 0x01, 0x00: Dock Mic A
- * 0x01, 0x04: Dock Mic B
- * 0x01, 0x08: Dock ADC 1 Left
- * 0x01, 0x0c: Dock ADC 1 Right
- * 0x01, 0x10: Dock ADC 2 Left
- * 0x01, 0x12: Dock S/PDIF Left
- * 0x01, 0x14: Dock ADC 2 Right
- * 0x01, 0x16: Dock S/PDIF Right
- * 0x01, 0x18-0x1f: Dock ADAT 0-7
- * 0x01, 0x18: Dock ADC 3 Left
- * 0x01, 0x1c: Dock ADC 3 Right
- * 0x02, 0x00: Hanoa ADC Left
- * 0x02, 0x01: Hanoa ADC Right
+ * Hana3, rev2 1010,1212m,1616[m] using Tina
+ * 0x00, 0x00-0x1f: Silence
+ * 0x01, 0x00-0x1f: 32 EDI channels from Micro Dock
+ *       0x00: Dock Mic A
+ *       0x04: Dock Mic B
+ *       0x08: Dock ADC 1 Left
+ *       0x0c: Dock ADC 1 Right
+ *       0x10: Dock ADC 2 Left
+ *       0x12: Dock S/PDIF Left
+ *       0x14: Dock ADC 2 Right
+ *       0x16: Dock S/PDIF Right
+ *       0x18-0x1f: Dock ADAT 0-7
+ * 0x02, 0x00: Hamoa ADC Left
+ * 0x02, 0x01: Hamoa ADC Right
  * 0x03, 0x00-0x0f: 16 inputs from Tina Emu32A output
  * 0x03, 0x10-0x1f: 16 inputs from Tina Emu32B output
  * 0x04, 0x00-0x07: Hana3 ADAT
@@ -1363,58 +1356,32 @@
  * 0x06-0x07: Not used
  *
  * HanaLite, rev1 0404 using Alice2
- * Sources SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00,0x00-0x1f: Silence
+ * HanaLiteLite, rev2 0404 using Tina
+ * 0x00, 0x00-0x1f: Silence
  * 0x01: Not used
  * 0x02, 0x00: ADC Left
  * 0x02, 0x01: ADC Right
- * 0x03, 0x00-0x0f: 16 inputs from Alice2 Emu32A output
- * 0x03, 0x10-0x1f: 16 inputs from Alice2 Emu32B output
- * 0x04: Not used
- * 0x05, 0x00: S/PDIF Left
- * 0x05, 0x01: S/PDIF Right
- * 0x06-0x07: Not used
- *
- * HanaLiteLite, rev2 0404 using Alice2
- * Sources SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00,0x00-0x1f: Silence
- * 0x01: Not used
- * 0x02, 0x00: ADC Left
- * 0x02, 0x01: ADC Right
- * 0x03, 0x00-0x0f: 16 inputs from Alice2 Emu32A output
- * 0x03, 0x10-0x1f: 16 inputs from Alice2 Emu32B output
+ * 0x03, 0x00-0x0f: 16 inputs from Alice2/Tina Emu32A output
+ * 0x03, 0x10-0x1f: 16 inputs from Alice2/Tina Emu32B output
  * 0x04: Not used
  * 0x05, 0x00: S/PDIF Left
  * 0x05, 0x01: S/PDIF Right
  * 0x06-0x07: Not used
  *
  * Mana, Cardbus 1616 using Tina2
- * Sources SRATEX = 1X rates: 44.1 kHz or 48 kHz
- * 0x00,0x00-0x1f: Silence
- * 0x01, 0x10-0x1f: 32 Elink channels from Audio Dock
- * 0x01, 0x00: Dock Mic A
- * 0x01, 0x04: Dock Mic B
- * 0x01, 0x08: Dock ADC 1 Left
- * 0x01, 0x0c: Dock ADC 1 Right
- * 0x01, 0x10: Dock ADC 2 Left
- * 0x01, 0x12: Dock S/PDIF Left
- * 0x01, 0x14: Dock ADC 2 Right
- * 0x01, 0x16: Dock S/PDIF Right
- * 0x01, 0x18-0x1f: Dock ADAT 0-7
- * 0x01, 0x18: Dock ADC 3 Left
- * 0x01, 0x1c: Dock ADC 3 Right
+ * 0x00, 0x00-0x1f: Silence
+ * 0x01, 0x00-0x1f: 32 EDI channels from Micro Dock
+ *       (same as rev2 1010)
  * 0x02: Not used
- * 0x03, 0x00-0x0f: 16 inputs from Tina Emu32A output
- * 0x03, 0x10-0x1f: 16 inputs from Tina Emu32B output
+ * 0x03, 0x00-0x0f: 16 inputs from Tina2 Emu32A output
+ * 0x03, 0x10-0x1f: 16 inputs from Tina2 Emu32B output
  * 0x04-0x07: Not used
- *
  */
 
 /* 32-bit sources of signal in the Hana FPGA. The sources are routed to
- * destinations using mixer control for each destination - see emumixer.c
- * Sources are either physical inputs of FPGA,
- * or outputs from Alice (audigy) - 16 x EMU_SRC_ALICE_EMU32A +
- * 16 x EMU_SRC_ALICE_EMU32B
+ * destinations using a mixer control for each destination - see emumixer.c.
+ * Sources are either physical inputs of Hana, or inputs from Alice2/Tina -
+ * 16 x EMU_SRC_ALICE_EMU32A + 16 x EMU_SRC_ALICE_EMU32B.
  */
 #define EMU_SRC_SILENCE		0x0000	/* Silence */
 #define EMU_SRC_DOCK_MIC_A1	0x0100	/* Audio Dock Mic A, 1st or 48kHz only */
@@ -1459,6 +1426,7 @@
 #define EMU_SRC_HAMOA_ADC_RIGHT4	0x0207	/* Hamoa ADC Right, 4th or 192kHz */
 #define EMU_SRC_ALICE_EMU32A		0x0300	/* Alice2 EMU32a 16 outputs. +0 to +0xf */
 #define EMU_SRC_ALICE_EMU32B		0x0310	/* Alice2 EMU32b 16 outputs. +0 to +0xf */
+// In S/MUX mode, the samples of one channel are adjacent.
 #define EMU_SRC_HANA_ADAT	0x0400	/* Hana ADAT 8 channel in +0 to +7 */
 #define EMU_SRC_HANA_SPDIF_LEFT1	0x0500	/* Hana SPDIF Left, 1st or 48kHz only */
 #define EMU_SRC_HANA_SPDIF_LEFT2	0x0502	/* Hana SPDIF Left, 2nd or 96kHz */
@@ -1466,16 +1434,12 @@
 #define EMU_SRC_HANA_SPDIF_RIGHT2	0x0503	/* Hana SPDIF Right, 2nd or 96kHz */
 
 /* Additional inputs for 1616(M)/Microdock */
-/* Microdock S/PDIF Left, 1st or 48kHz only */
-#define EMU_SRC_MDOCK_SPDIF_LEFT1	0x0112
-/* Microdock S/PDIF Left, 2nd or 96kHz */
-#define EMU_SRC_MDOCK_SPDIF_LEFT2	0x0113
-/* Microdock S/PDIF Right, 1st or 48kHz only */
-#define EMU_SRC_MDOCK_SPDIF_RIGHT1	0x0116
-/* Microdock S/PDIF Right, 2nd or 96kHz */
-#define EMU_SRC_MDOCK_SPDIF_RIGHT2	0x0117
-/* Microdock ADAT 8 channel in +8 to +f */
-#define EMU_SRC_MDOCK_ADAT		0x0118
+
+#define EMU_SRC_MDOCK_SPDIF_LEFT1	0x0112	/* Microdock S/PDIF Left, 1st or 48kHz only */
+#define EMU_SRC_MDOCK_SPDIF_LEFT2	0x0113	/* Microdock S/PDIF Left, 2nd or 96kHz */
+#define EMU_SRC_MDOCK_SPDIF_RIGHT1	0x0116	/* Microdock S/PDIF Right, 1st or 48kHz only */
+#define EMU_SRC_MDOCK_SPDIF_RIGHT2	0x0117	/* Microdock S/PDIF Right, 2nd or 96kHz */
+#define EMU_SRC_MDOCK_ADAT		0x0118	/* Microdock ADAT 8 channel in +8 to +f */
 
 /* 0x600 and 0x700 no used */
 
@@ -1642,14 +1606,26 @@ enum {
 	EMU_MODEL_EMU0404,
 };
 
+// Chip-o-logy:
+// - All SB Live! cards use EMU10K1 chips
+// - All SB Audigy cards use CA* chips, termed "emu10k2" by the driver
+// - Original Audigy uses CA0100 "Alice"
+// - Audigy 2 uses CA0102/CA10200 "Alice2"
+//   - Has an interface for CA0151 (P16V) "Alice3"
+// - Audigy 2 Value uses CA0108/CA10300 "Tina"
+//   - Approximately a CA0102 with an on-chip CA0151 (P17V)
+// - Audigy 2 ZS NB uses CA0109 "Tina2"
+//   - Cardbus version of CA0108
 struct snd_emu_chip_details {
 	u32 vendor;
 	u32 device;
 	u32 subsystem;
 	unsigned char revision;
 	unsigned char emu10k1_chip; /* Original SB Live. Not SB Live 24bit. */
+				    /* Redundant with emu10k2_chip being unset. */
 	unsigned char emu10k2_chip; /* Audigy 1 or Audigy 2. */
 	unsigned char ca0102_chip;  /* Audigy 1 or Audigy 2. Not SB Audigy 2 Value. */
+				    /* Redundant with ca0108_chip being unset. */
 	unsigned char ca0108_chip;  /* Audigy 2 Value */
 	unsigned char ca_cardbus_chip; /* Audigy 2 ZS Notebook */
 	unsigned char ca0151_chip;  /* P16V */
@@ -1659,8 +1635,8 @@ struct snd_emu_chip_details {
 	unsigned char ac97_chip;    /* Has an AC97 chip: 1 = mandatory, 2 = optional */
 	unsigned char ecard;        /* APS EEPROM */
 	unsigned char emu_model;     /* EMU model type */
-	unsigned char spi_dac;      /* SPI interface for DAC */
-	unsigned char i2c_adc;      /* I2C interface for ADC */
+	unsigned char spi_dac;      /* SPI interface for DAC; requires ca0108_chip */
+	unsigned char i2c_adc;      /* I2C interface for ADC; requires ca0108_chip */
 	unsigned char adc_1361t;    /* Use Philips 1361T ADC */
 	unsigned char invert_shared_spdif; /* analog/digital switch inverted */
 	const char *driver;
@@ -1734,9 +1710,9 @@ struct snd_emu10k1 {
 	void *synth;
 	int (*get_synth_voice)(struct snd_emu10k1 *emu);
 
-	spinlock_t reg_lock;
-	spinlock_t emu_lock;
-	spinlock_t voice_lock;
+	spinlock_t reg_lock;  // high-level driver lock
+	spinlock_t emu_lock;  // low-level i/o lock
+	spinlock_t voice_lock;  // voice allocator lock
 	spinlock_t spi_lock; /* serialises access to spi port */
 	spinlock_t i2c_lock; /* serialises access to i2c port */
 
diff --git a/include/uapi/sound/emu10k1.h b/include/uapi/sound/emu10k1.h
index c2414bd5aecd..33e5228f5d8c 100644
--- a/include/uapi/sound/emu10k1.h
+++ b/include/uapi/sound/emu10k1.h
@@ -111,6 +111,9 @@
 #define CC_REG_NONZERO	C_00000100
 
 /* FX buses */
+// These are arbitrary mappings; our DSP code simply expects
+// the config files to route the channels this way.
+// The numbers are documented in {audigy,sb-live}-mixer.rst.
 #define FXBUS_PCM_LEFT		0x00
 #define FXBUS_PCM_RIGHT		0x01
 #define FXBUS_PCM_LEFT_REAR	0x02
diff --git a/sound/pci/emu10k1/emu10k1.c b/sound/pci/emu10k1/emu10k1.c
index 672af4b9597b..b8163f26004a 100644
--- a/sound/pci/emu10k1/emu10k1.c
+++ b/sound/pci/emu10k1/emu10k1.c
@@ -68,17 +68,6 @@ static const struct pci_device_id snd_emu10k1_ids[] = {
 	{ 0, }
 };
 
-/*
- * Audigy 2 Value notes:
- * A_IOCFG Input (GPIO)
- * 0x400  = Front analog jack plugged in. (Green socket)
- * 0x1000 = Read analog jack plugged in. (Black socket)
- * 0x2000 = Center/LFE analog jack plugged in. (Orange socket)
- * A_IOCFG Output (GPIO)
- * 0x60 = Sound out of front Left.
- * Win sets it to 0xXX61
- */
-
 MODULE_DEVICE_TABLE(pci, snd_emu10k1_ids);
 
 static int snd_card_emu10k1_probe(struct pci_dev *pci,
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index c37df604d470..39b63c4ca1df 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -162,6 +162,8 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir)
 	outl(0, emu->port + INTE);
 	snd_emu10k1_ptr_write(emu, CLIEL, 0, 0);
 	snd_emu10k1_ptr_write(emu, CLIEH, 0, 0);
+
+	/* disable stop on loop end */
 	snd_emu10k1_ptr_write(emu, SOLEL, 0, 0);
 	snd_emu10k1_ptr_write(emu, SOLEH, 0, 0);
 
@@ -660,13 +662,14 @@ static int snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
 		return -EIO;
 
 	/* The FPGA is a Xilinx Spartan IIE XC2S50E */
+	/* On E-MU 0404b it is a Xilinx Spartan III XC3S50 */
 	/* GPIO7 -> FPGA PGMN
 	 * GPIO6 -> FPGA CCLK
 	 * GPIO5 -> FPGA DIN
 	 * FPGA CONFIG OFF -> FPGA PGMN
 	 */
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	outw(0x00, emu->port + A_GPIO); /* Set PGMN low for 1uS. */
+	outw(0x00, emu->port + A_GPIO); /* Set PGMN low for 100uS. */
 	write_post = inw(emu->port + A_GPIO);
 	udelay(100);
 	outw(0x80, emu->port + A_GPIO); /* Leave bit 7 set during netlist setup. */
@@ -782,7 +785,7 @@ static void emu1010_firmware_work(struct work_struct *work)
 	} else if (!reg && emu->emu1010.last_reg) {
 		/* Audio Dock removed */
 		dev_info(emu->card->dev, "emu1010: Audio Dock detached\n");
-		/* Unmute all */
+		/* The hardware auto-mutes all, so we unmute again */
 		snd_emu1010_fpga_write(emu, EMU_HANA_UNMUTE, EMU_UNMUTE);
 	}
 
@@ -794,29 +797,6 @@ static void emu1010_firmware_work(struct work_struct *work)
 }
 
 /*
- * EMU-1010 - details found out from this driver, official MS Win drivers,
- * testing the card:
- *
- * Audigy2 (aka Alice2):
- * ---------------------
- * 	* communication over PCI
- * 	* conversion of 32-bit data coming over EMU32 links from HANA FPGA
- *	  to 2 x 16-bit, using internal DSP instructions
- * 	* slave mode, clock supplied by HANA
- * 	* linked to HANA using:
- * 		32 x 32-bit serial EMU32 output channels
- * 		16 x EMU32 input channels
- * 		(?) x I2S I/O channels (?)
- *
- * FPGA (aka HANA):
- * ---------------
- * 	* provides all (?) physical inputs and outputs of the card
- * 		(ADC, DAC, SPDIF I/O, ADAT I/O, etc.)
- * 	* provides clock signal for the card and Alice2
- * 	* two crystals - for 44.1kHz and 48kHz multiples
- * 	* provides internal routing of signal sources to signal destinations
- * 	* inputs/outputs to Alice2 - see above
- *
  * Current status of the driver:
  * ----------------------------
  * 	* only 44.1/48kHz supported (the MS Win driver supports up to 192 kHz)
@@ -884,11 +864,8 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 	snd_emu1010_fpga_read(emu, EMU_HANA_OPTION_CARDS, &reg);
 	dev_info(emu->card->dev, "emu1010: Card options = 0x%x\n", reg);
 	/* Optical -> ADAT I/O  */
-	/* 0 : SPDIF
-	 * 1 : ADAT
-	 */
 	emu->emu1010.optical_in = 1; /* IN_ADAT */
-	emu->emu1010.optical_out = 1; /* IN_ADAT */
+	emu->emu1010.optical_out = 1; /* OUT_ADAT */
 	tmp = (emu->emu1010.optical_in ? EMU_HANA_OPTICAL_IN_ADAT : 0) |
 		(emu->emu1010.optical_out ? EMU_HANA_OPTICAL_OUT_ADAT : 0);
 	snd_emu1010_fpga_write(emu, EMU_HANA_OPTICAL_TYPE, tmp);
@@ -1279,6 +1256,15 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 * AC97: STAC9750
 	 * CA0151: None
 	 */
+	/*
+	 * A_IOCFG Input (GPIO)
+	 * 0x400  = Front analog jack plugged in. (Green socket)
+	 * 0x1000 = Rear analog jack plugged in. (Black socket)
+	 * 0x2000 = Center/LFE analog jack plugged in. (Orange socket)
+	 * A_IOCFG Output (GPIO)
+	 * 0x60 = Sound out of front Left.
+	 * Win sets it to 0xXX61
+	 */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x10011102,
 	 .driver = "Audigy2", .name = "SB Audigy 2 Value [SB0400]",
 	 .id = "Audigy2",
@@ -1327,6 +1313,9 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spi_dac = 1,
 	 .i2c_adc = 1,
 	 .spk71 = 1} ,
+	/* This is MAEM8950 "Mana" */
+	/* Attach MicroDock[M] to make it an E-MU 1616[m]. */
+	/* Does NOT support sync daughter card (obviously). */
 	/* Tested by James@superbug.co.uk 4th Nov 2007. */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x42011102,
 	 .driver = "Audigy2", .name = "E-mu 1010 Notebook [MAEM8950]",
@@ -1337,7 +1326,10 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spk71 = 1 ,
 	 .emu_model = EMU_MODEL_EMU1616},
 	/* Tested by James@superbug.co.uk 4th Nov 2007. */
-	/* This is MAEM8960, 0202 is MAEM 8980 */
+	/* This is MAEM8960 "Hana3", 0202 is MAEM8980 */
+	/* Attach 0202 daughter card to make it an E-MU 1212m, OR a
+	 * MicroDock[M] to make it an E-MU 1616[m]. */
+	/* Does NOT support sync daughter card. */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x40041102,
 	 .driver = "Audigy2", .name = "E-mu 1010b PCI [MAEM8960]",
 	 .id = "EMU1010",
@@ -1347,6 +1339,11 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .emu_model = EMU_MODEL_EMU1010B}, /* EMU 1010 new revision */
 	/* Tested by Maxim Kachur <mcdebugger@duganet.ru> 17th Oct 2012. */
 	/* This is MAEM8986, 0202 is MAEM8980 */
+	/* Attach 0202 daughter card to make it an E-MU 1212m, OR a
+	 * MicroDockM to make it an E-MU 1616m. The non-m
+	 * version was never sold with this card, but should
+	 * still work. */
+	/* Does NOT support sync daughter card. */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x40071102,
 	 .driver = "Audigy2", .name = "E-mu 1010 PCIe [MAEM8986]",
 	 .id = "EMU1010",
@@ -1355,7 +1352,10 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spk71 = 1,
 	 .emu_model = EMU_MODEL_EMU1010B}, /* EMU 1010 PCIe */
 	/* Tested by James@superbug.co.uk 8th July 2005. */
-	/* This is MAEM8810, 0202 is MAEM8820 */
+	/* This is MAEM8810 "Hana", 0202 is MAEM8820 "Hamoa" */
+	/* Attach 0202 daughter card to make it an E-MU 1212m, OR an
+	 * AudioDock[M] to make it an E-MU 1820[m]. */
+	/* Supports sync daughter card. */
 	{.vendor = 0x1102, .device = 0x0004, .subsystem = 0x40011102,
 	 .driver = "Audigy2", .name = "E-mu 1010 [MAEM8810]",
 	 .id = "EMU1010",
@@ -1363,7 +1363,9 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .ca0102_chip = 1,
 	 .spk71 = 1,
 	 .emu_model = EMU_MODEL_EMU1010}, /* EMU 1010 old revision */
-	/* EMU0404b */
+	/* This is MAEM8852 "HanaLiteLite" */
+	/* Supports sync daughter card. */
+	/* Tested by oswald.buddenhagen@gmx.de Mar 2023. */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x40021102,
 	 .driver = "Audigy2", .name = "E-mu 0404b PCI [MAEM8852]",
 	 .id = "EMU0404",
@@ -1371,6 +1373,8 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .ca0108_chip = 1,
 	 .spk71 = 1,
 	 .emu_model = EMU_MODEL_EMU0404}, /* EMU 0404 new revision */
+	/* This is MAEM8850 "HanaLite" */
+	/* Supports sync daughter card. */
 	/* Tested by James@superbug.co.uk 20-3-2007. */
 	{.vendor = 0x1102, .device = 0x0004, .subsystem = 0x40021102,
 	 .driver = "Audigy2", .name = "E-mu 0404 [MAEM8850]",
@@ -1380,6 +1384,7 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spk71 = 1,
 	 .emu_model = EMU_MODEL_EMU0404}, /* EMU 0404 */
 	/* EMU0404 PCIe */
+	/* Does NOT support sync daughter card. */
 	{.vendor = 0x1102, .device = 0x0008, .subsystem = 0x40051102,
 	 .driver = "Audigy2", .name = "E-mu 0404 PCIe [MAEM8984]",
 	 .id = "EMU0404",
@@ -1387,7 +1392,6 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .ca0108_chip = 1,
 	 .spk71 = 1,
 	 .emu_model = EMU_MODEL_EMU0404}, /* EMU 0404 PCIe ver_03 */
-	/* Note that all E-mu cards require kernel 2.6 or newer. */
 	{.vendor = 0x1102, .device = 0x0008,
 	 .driver = "Audigy2", .name = "SB Audigy 2 Value [Unknown]",
 	 .id = "Audigy2",
@@ -1468,6 +1472,8 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spdif_bug = 1,
 	 .adc_1361t = 1,  /* 24 bit capture instead of 16bit */
 	 .ac97_chip = 1} ,
+	/* Audigy 2 Platinum EX */
+	/* Win driver sets A_IOCFG output to 0x1c00 */
 	{.vendor = 0x1102, .device = 0x0004, .subsystem = 0x10051102,
 	 .driver = "Audigy2", .name = "Audigy 2 Platinum EX [SB0280]",
 	 .id = "Audigy2",
@@ -1488,6 +1494,8 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .spdif_bug = 1,
 	 .invert_shared_spdif = 1,	/* digital/analog switch swapped */
 	 .ac97_chip = 1} ,
+	/* Audigy 2 Platinum */
+	/* Win driver sets A_IOCFG output to 0xa00 */
 	{.vendor = 0x1102, .device = 0x0004, .subsystem = 0x10021102,
 	 .driver = "Audigy2", .name = "SB Audigy 2 Platinum [SB0240P]",
 	 .id = "Audigy2",
@@ -1593,6 +1601,9 @@ static const struct snd_emu_chip_details emu_chip_details[] = {
 	 .emu10k1_chip = 1,
 	 .ac97_chip = 1,
 	 .sblive51 = 1} ,
+	/* SB Live! Platinum */
+	/* Win driver sets A_IOCFG output to 0 */
+	/* Tested by Jonathan Dowland <jon@dow.land> Apr 2023. */
 	{.vendor = 0x1102, .device = 0x0002, .subsystem = 0x80401102,
 	 .driver = "EMU10K1", .name = "SB Live! Platinum [CT4760P]",
 	 .id = "Live",
diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c
index 6a51aed59238..510b776f856d 100644
--- a/sound/pci/emu10k1/emufx.c
+++ b/sound/pci/emu10k1/emufx.c
@@ -1187,8 +1187,8 @@ snd_emu10k1_init_stereo_onoff_control(struct snd_emu10k1_fx8010_control_gpr *ctl
 }
 
 /*
- * Used for emu1010 - conversion from 32-bit capture inputs from HANA
- * to 2 x 16-bit registers in audigy - their values are read via DMA.
+ * Used for emu1010 - conversion from 32-bit capture inputs from the FPGA
+ * to 2 x 16-bit registers in Audigy - their values are read via DMA.
  * Conversion is performed by Audigy DSP instructions of FX8010.
  */
 static int snd_emu10k1_audigy_dsp_convert_32_to_2x16(
@@ -1330,8 +1330,9 @@ static int _snd_emu10k1_audigy_init_efx(struct snd_emu10k1 *emu)
 #define A_ADD_VOLUME_IN(var,vol,input) \
 A_OP(icode, &ptr, iMAC0, A_GPR(var), A_GPR(var), A_GPR(vol), A_EXTIN(input))
 
-	/* emu1212 DSP 0 and DSP 1 Capture */
 	if (emu->card_capabilities->emu_model) {
+		/* EMU1010 DSP 0 and DSP 1 Capture */
+		// The 24 MSB hold the actual value. We implicitly discard the 16 LSB.
 		if (emu->card_capabilities->ca0108_chip) {
 			/* Note:JCD:No longer bit shift lower 16bits to upper 16bits of 32bit value. */
 			A_OP(icode, &ptr, iMACINT0, A_GPR(tmp), A_C_00000000, A3_EMU32IN(0x0), A_C_00000001);
@@ -1636,8 +1637,11 @@ A_OP(icode, &ptr, iMAC0, A_GPR(var), A_GPR(var), A_GPR(vol), A_EXTIN(input))
 #endif
 
 	if (emu->card_capabilities->emu_model) {
+		/* Capture 16 channels of S32_LE sound. */
 		if (emu->card_capabilities->ca0108_chip) {
 			dev_info(emu->card->dev, "EMU2 inputs on\n");
+			/* Note that the Tina[2] DSPs have 16 more EMU32 inputs which we don't use. */
+
 			for (z = 0; z < 0x10; z++) {
 				snd_emu10k1_audigy_dsp_convert_32_to_2x16( icode, &ptr, tmp, 
 									bit_shifter16,
@@ -1646,7 +1650,7 @@ A_OP(icode, &ptr, iMAC0, A_GPR(var), A_GPR(var), A_GPR(vol), A_EXTIN(input))
 			}
 		} else {
 			dev_info(emu->card->dev, "EMU inputs on\n");
-			/* Capture 16 (originally 8) channels of S32_LE sound */
+			/* Note that the Alice2 DSPs have 6 I2S inputs which we don't use. */
 
 			/*
 			dev_dbg(emu->card->dev, "emufx.c: gpr=0x%x, tmp=0x%x\n",
diff --git a/sound/pci/emu10k1/emumixer.c b/sound/pci/emu10k1/emumixer.c
index 754d91050af2..3d2ed5e0a59d 100644
--- a/sound/pci/emu10k1/emumixer.c
+++ b/sound/pci/emu10k1/emumixer.c
@@ -346,7 +346,7 @@ static const unsigned int emu1616_output_dst[] = {
 };
 
 /*
- * Data destinations - HANA outputs going to Alice2 (audigy) for
+ * Data destinations - FPGA outputs going to Alice2 (Audigy) for
  *   capture (EMU32 + I2S links)
  * Each destination has an enum mixer control to choose a data source
  */
@@ -367,6 +367,7 @@ static const unsigned int emu1010_input_dst[] = {
 	EMU_DST_ALICE2_EMU32_D,
 	EMU_DST_ALICE2_EMU32_E,
 	EMU_DST_ALICE2_EMU32_F,
+	/* These exist only on rev1 EMU1010 cards. */
 	EMU_DST_ALICE_I2S0_LEFT,
 	EMU_DST_ALICE_I2S0_RIGHT,
 	EMU_DST_ALICE_I2S1_LEFT,
@@ -708,7 +709,7 @@ static int snd_emu1010_internal_clock_put(struct snd_kcontrol *kcontrol,
 			/* 44100 */
 			/* Mute all */
 			snd_emu1010_fpga_write(emu, EMU_HANA_UNMUTE, EMU_MUTE );
-			/* Default fallback clock 48kHz */
+			/* Default fallback clock 44.1kHz */
 			snd_emu1010_fpga_write(emu, EMU_HANA_DEFCLOCK, EMU_HANA_DEFCLOCK_44_1K );
 			/* Word Clock source, Internal 44.1kHz x1 */
 			snd_emu1010_fpga_write(emu, EMU_HANA_WCLOCK,
diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c
index c2749ad59e10..a91ea0f863d3 100644
--- a/sound/pci/emu10k1/emupcm.c
+++ b/sound/pci/emu10k1/emupcm.c
@@ -326,7 +326,6 @@ static void snd_emu10k1_pcm_init_voice(struct snd_emu10k1 *emu,
 	} else
 		snd_emu10k1_ptr_write(emu, FXRT, voice,
 				      snd_emu10k1_compose_send_routing(send_routing));
-	/* Stop CA */
 	/* Assumption that PT is already 0 so no harm overwriting */
 	snd_emu10k1_ptr_write(emu, PTRX, voice, (send_amount[0] << 8) | send_amount[1]);
 	snd_emu10k1_ptr_write(emu, DSL, voice, end_addr | (send_amount[3] << 24));
@@ -480,9 +479,6 @@ static int snd_emu10k1_efx_playback_prepare(struct snd_pcm_substream *substream)
 	start_addr = epcm->start_addr;
 	end_addr = epcm->start_addr + snd_pcm_lib_buffer_bytes(substream);
 
-	/*
-	 * the kX driver leaves some space between voices
-	 */
 	channel_size = ( end_addr - start_addr ) / NUM_EFX_PLAYBACK;
 
 	snd_emu10k1_pcm_init_voice(emu, 1, 1, epcm->extra,
@@ -1218,9 +1214,7 @@ static int snd_emu10k1_capture_efx_open(struct snd_pcm_substream *substream)
 	runtime->hw.rate_min = runtime->hw.rate_max = 48000;
 	spin_lock_irq(&emu->reg_lock);
 	if (emu->card_capabilities->emu_model) {
-		/*  Nb. of channels has been increased to 16 */
 		/* TODO
-		 * SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE
 		 * SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000 |
 		 * SNDRV_PCM_RATE_88200 | SNDRV_PCM_RATE_96000 |
 		 * SNDRV_PCM_RATE_176400 | SNDRV_PCM_RATE_192000
@@ -1231,13 +1225,14 @@ static int snd_emu10k1_capture_efx_open(struct snd_pcm_substream *substream)
 		 * Need to add mixer control to fix sample rate
 		 *                 
 		 * There are 32 mono channels of 16bits each.
-		 * 24bit Audio uses 2x channels over 16bit
-		 * 96kHz uses 2x channels over 48kHz
-		 * 192kHz uses 4x channels over 48kHz
-		 * So, for 48kHz 24bit, one has 16 channels
-		 * for 96kHz 24bit, one has 8 channels
-		 * for 192kHz 24bit, one has 4 channels
-		 *
+		 * 24bit Audio uses 2x channels over 16bit,
+		 * 96kHz uses 2x channels over 48kHz,
+		 * 192kHz uses 4x channels over 48kHz.
+		 * So, for 48kHz 24bit, one has 16 channels,
+		 * for 96kHz 24bit, one has 8 channels,
+		 * for 192kHz 24bit, one has 4 channels.
+		 * 1010rev2 and 1616(m) cards have double that,
+		 * but we don't exceed 16 channels anyway.
 		 */
 #if 1
 		switch (emu->emu1010.internal_clock) {
@@ -1459,11 +1454,12 @@ static int snd_emu10k1_pcm_efx_voices_mask_put(struct snd_kcontrol *kcontrol, st
 			nval[idx / 32] |= 1 << (idx % 32);
 			bits++;
 		}
-		
+
+	// Check that the number of requested channels is a power of two
+	// not bigger than the number of available channels.
 	for (idx = 0; idx < nefxb; idx++)
 		if (1 << idx == bits)
 			break;
-	
 	if (idx >= nefxb)
 		return -EINVAL;
 
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index 42b06f2e5552..c60573f14ea8 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -314,7 +314,6 @@ void snd_emu10k1_voice_intr_enable(struct snd_emu10k1 *emu, unsigned int voicenu
 	unsigned int val;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(CLIEH << 16, emu->port + PTR);
 		val = inl(emu->port + DATA);
@@ -334,7 +333,6 @@ void snd_emu10k1_voice_intr_disable(struct snd_emu10k1 *emu, unsigned int voicen
 	unsigned int val;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(CLIEH << 16, emu->port + PTR);
 		val = inl(emu->port + DATA);
@@ -353,7 +351,6 @@ void snd_emu10k1_voice_intr_ack(struct snd_emu10k1 *emu, unsigned int voicenum)
 	unsigned long flags;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(CLIPH << 16, emu->port + PTR);
 		voicenum = 1 << (voicenum - 32);
@@ -371,7 +368,6 @@ void snd_emu10k1_voice_half_loop_intr_enable(struct snd_emu10k1 *emu, unsigned i
 	unsigned int val;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(HLIEH << 16, emu->port + PTR);
 		val = inl(emu->port + DATA);
@@ -391,7 +387,6 @@ void snd_emu10k1_voice_half_loop_intr_disable(struct snd_emu10k1 *emu, unsigned
 	unsigned int val;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(HLIEH << 16, emu->port + PTR);
 		val = inl(emu->port + DATA);
@@ -410,7 +405,6 @@ void snd_emu10k1_voice_half_loop_intr_ack(struct snd_emu10k1 *emu, unsigned int
 	unsigned long flags;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(HLIPH << 16, emu->port + PTR);
 		voicenum = 1 << (voicenum - 32);
@@ -428,7 +422,6 @@ void snd_emu10k1_voice_set_loop_stop(struct snd_emu10k1 *emu, unsigned int voice
 	unsigned int sol;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(SOLEH << 16, emu->port + PTR);
 		sol = inl(emu->port + DATA);
@@ -448,7 +441,6 @@ void snd_emu10k1_voice_clear_loop_stop(struct snd_emu10k1 *emu, unsigned int voi
 	unsigned int sol;
 
 	spin_lock_irqsave(&emu->emu_lock, flags);
-	/* voice interrupt */
 	if (voicenum >= 32) {
 		outl(SOLEH << 16, emu->port + PTR);
 		sol = inl(emu->port + DATA);
diff --git a/sound/pci/emu10k1/p16v.h b/sound/pci/emu10k1/p16v.h
index 3cdafa311617..9d429ad1feff 100644
--- a/sound/pci/emu10k1/p16v.h
+++ b/sound/pci/emu10k1/p16v.h
@@ -64,7 +64,7 @@
  */
 
 /********************************************************************************************************/
-/* Audigy2 P16V pointer-offset register set, accessed through the PTR2 and DATA2 registers                     */
+/* Audigy2 P16V pointer-offset register set, accessed through the PTR2 and DATA2 registers              */
 /********************************************************************************************************/
                                                                                                                            
 /* The sample rate of the SPDIF outputs is set by modifying a register in the EMU10K2 PTR register A_SPDIF_SAMPLERATE.
diff --git a/sound/pci/emu10k1/p17v.h b/sound/pci/emu10k1/p17v.h
index 3a6568346fad..d4ada1c430c8 100644
--- a/sound/pci/emu10k1/p17v.h
+++ b/sound/pci/emu10k1/p17v.h
@@ -6,8 +6,8 @@
  */
 
 /******************************************************************************/
-/* Audigy2Value Tina (P17V) pointer-offset register set,
- * accessed through the PTR20 and DATA24 registers  */
+/* Audigy2Value Tina (P17V) pointer-offset register set,                      */
+/* accessed through the PTR2 and DATA2 registers                              */
 /******************************************************************************/
 
 /* 00 - 07: Not used */
-- 
cgit v1.2.3


From 6815f5359aa5d02d1a936221812ab46624c39283 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:16 +0200
Subject: ALSA: emu10k1: fix lineup of EMU_HANA_* defines

The bit values are supposed to be internally indented by one step
relative to the register addresses.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-3-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 7786d5807679..35a60bdcf641 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1072,34 +1072,34 @@
 #define EMU_HANA_DOCK_LEDS_3_MANUAL_SIGNAL	0x20	/* Manual Signal detection */
 
 #define EMU_HANA_ADC_PADS	0x10	/* 0000xxx  3 bit  Audio Dock ADC 14dB pads */
-#define EMU_HANA_DOCK_ADC_PAD1	0x01	/* 14dB Attenuation on Audio Dock ADC 1 */
-#define EMU_HANA_DOCK_ADC_PAD2	0x02	/* 14dB Attenuation on Audio Dock ADC 2 */
-#define EMU_HANA_DOCK_ADC_PAD3	0x04	/* 14dB Attenuation on Audio Dock ADC 3 */
-#define EMU_HANA_0202_ADC_PAD1	0x08	/* 14dB Attenuation on 0202 ADC 1 */
+#define EMU_HANA_DOCK_ADC_PAD1		0x01	/* 14dB Attenuation on Audio Dock ADC 1 */
+#define EMU_HANA_DOCK_ADC_PAD2		0x02	/* 14dB Attenuation on Audio Dock ADC 2 */
+#define EMU_HANA_DOCK_ADC_PAD3		0x04	/* 14dB Attenuation on Audio Dock ADC 3 */
+#define EMU_HANA_0202_ADC_PAD1		0x08	/* 14dB Attenuation on 0202 ADC 1 */
 
 #define EMU_HANA_DOCK_MISC	0x11	/* 0xxxxxx  6 bit  Audio Dock misc bits */
-#define EMU_HANA_DOCK_DAC1_MUTE	0x01	/* DAC 1 Mute */
-#define EMU_HANA_DOCK_DAC2_MUTE	0x02	/* DAC 2 Mute */
-#define EMU_HANA_DOCK_DAC3_MUTE	0x04	/* DAC 3 Mute */
-#define EMU_HANA_DOCK_DAC4_MUTE	0x08	/* DAC 4 Mute */
+#define EMU_HANA_DOCK_DAC1_MUTE		0x01	/* DAC 1 Mute */
+#define EMU_HANA_DOCK_DAC2_MUTE		0x02	/* DAC 2 Mute */
+#define EMU_HANA_DOCK_DAC3_MUTE		0x04	/* DAC 3 Mute */
+#define EMU_HANA_DOCK_DAC4_MUTE		0x08	/* DAC 4 Mute */
 #define EMU_HANA_DOCK_PHONES_192_DAC1	0x00	/* DAC 1 Headphones source at 192kHz */
 #define EMU_HANA_DOCK_PHONES_192_DAC2	0x10	/* DAC 2 Headphones source at 192kHz */
 #define EMU_HANA_DOCK_PHONES_192_DAC3	0x20	/* DAC 3 Headphones source at 192kHz */
 #define EMU_HANA_DOCK_PHONES_192_DAC4	0x30	/* DAC 4 Headphones source at 192kHz */
 
 #define EMU_HANA_MIDI_OUT	0x12	/* 00xxxxx  5 bit  Source for each MIDI out port */
-#define EMU_HANA_MIDI_OUT_0202	0x01 /* 0202 MIDI from Alice 2. 0 = A, 1 = B */
-#define EMU_HANA_MIDI_OUT_DOCK1	0x02 /* Audio Dock MIDI1 front, from Alice 2. 0 = A, 1 = B */
-#define EMU_HANA_MIDI_OUT_DOCK2	0x04 /* Audio Dock MIDI2 rear, from Alice 2. 0 = A, 1 = B */
-#define EMU_HANA_MIDI_OUT_SYNC2	0x08 /* Sync card. Not the actual MIDI out jack. 0 = A, 1 = B */
-#define EMU_HANA_MIDI_OUT_LOOP	0x10 /* 0 = bits (3:0) normal. 1 = MIDI loopback enabled. */
+#define EMU_HANA_MIDI_OUT_0202		0x01 /* 0202 MIDI from Alice 2. 0 = A, 1 = B */
+#define EMU_HANA_MIDI_OUT_DOCK1		0x02 /* Audio Dock MIDI1 front, from Alice 2. 0 = A, 1 = B */
+#define EMU_HANA_MIDI_OUT_DOCK2		0x04 /* Audio Dock MIDI2 rear, from Alice 2. 0 = A, 1 = B */
+#define EMU_HANA_MIDI_OUT_SYNC2		0x08 /* Sync card. Not the actual MIDI out jack. 0 = A, 1 = B */
+#define EMU_HANA_MIDI_OUT_LOOP		0x10 /* 0 = bits (3:0) normal. 1 = MIDI loopback enabled. */
 
 #define EMU_HANA_DAC_PADS	0x13	/* 00xxxxx  5 bit  DAC 14dB attenuation pads */
-#define EMU_HANA_DOCK_DAC_PAD1	0x01	/* 14dB Attenuation on AudioDock DAC 1. Left and Right */
-#define EMU_HANA_DOCK_DAC_PAD2	0x02	/* 14dB Attenuation on AudioDock DAC 2. Left and Right */
-#define EMU_HANA_DOCK_DAC_PAD3	0x04	/* 14dB Attenuation on AudioDock DAC 3. Left and Right */
-#define EMU_HANA_DOCK_DAC_PAD4	0x08	/* 14dB Attenuation on AudioDock DAC 4. Left and Right */
-#define EMU_HANA_0202_DAC_PAD1	0x10	/* 14dB Attenuation on 0202 DAC 1. Left and Right */
+#define EMU_HANA_DOCK_DAC_PAD1		0x01	/* 14dB Attenuation on AudioDock DAC 1. Left and Right */
+#define EMU_HANA_DOCK_DAC_PAD2		0x02	/* 14dB Attenuation on AudioDock DAC 2. Left and Right */
+#define EMU_HANA_DOCK_DAC_PAD3		0x04	/* 14dB Attenuation on AudioDock DAC 3. Left and Right */
+#define EMU_HANA_DOCK_DAC_PAD4		0x08	/* 14dB Attenuation on AudioDock DAC 4. Left and Right */
+#define EMU_HANA_0202_DAC_PAD1		0x10	/* 14dB Attenuation on 0202 DAC 1. Left and Right */
 
 /* 0x14 - 0x1f Unused R/W registers */
 
@@ -1108,8 +1108,8 @@
 					/* Reading the register resets it. */
 
 #define EMU_HANA_OPTION_CARDS	0x21	/* 000xxxx  4 bits Presence of option cards */
-#define EMU_HANA_OPTION_HAMOA	0x01	/* Hamoa (analog I/O) card present */
-#define EMU_HANA_OPTION_SYNC	0x02	/* Sync card present */
+#define EMU_HANA_OPTION_HAMOA		0x01	/* Hamoa (analog I/O) card present */
+#define EMU_HANA_OPTION_SYNC		0x02	/* Sync card present */
 #define EMU_HANA_OPTION_DOCK_ONLINE	0x04	/* Audio/Micro dock present and FPGA configured */
 #define EMU_HANA_OPTION_DOCK_OFFLINE	0x08	/* Audio/Micro dock present and FPGA not configured */
 
@@ -1123,8 +1123,8 @@
 #define EMU_DOCK_MINOR_REV	0x26	/* 0000xxx  3 bit  Audio Dock FPGA Minor rev */
 
 #define EMU_DOCK_BOARD_ID	0x27	/* 00000xx  2 bits Audio Dock ID pins */
-#define EMU_DOCK_BOARD_ID0	0x00	/* ID bit 0 */
-#define EMU_DOCK_BOARD_ID1	0x03	/* ID bit 1 */
+#define EMU_DOCK_BOARD_ID0		0x00	/* ID bit 0 */
+#define EMU_DOCK_BOARD_ID1		0x03	/* ID bit 1 */
 
 #define EMU_HANA_WC_SPDIF_HI	0x28	/* 0xxxxxx  6 bit  SPDIF IN Word clock, upper 6 bits */
 #define EMU_HANA_WC_SPDIF_LO	0x29	/* 0xxxxxx  6 bit  SPDIF IN Word clock, lower 6 bits */
-- 
cgit v1.2.3


From a062b1032adae9ad89afb8dca299ef3ce9aca566 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:17 +0200
Subject: ALSA: emu10k1: eliminate some unused defines

One might be mislead to think that these mean anything.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-4-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 35a60bdcf641..931f3016d60d 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -25,13 +25,9 @@
 /* ------------------- DEFINES -------------------- */
 
 #define EMUPAGESIZE     4096
-#define MAXREQVOICES    8
 #define MAXPAGES0       4096	/* 32 bit mode */
 #define MAXPAGES1       8192	/* 31 bit mode */
-#define RESERVED        0
-#define NUM_MIDI        16
 #define NUM_G           64              /* use all channels */
-#define NUM_FXSENDS     4
 #define NUM_EFX_PLAYBACK    16
 
 /* FIXME? - according to the OSS driver the EMU10K1 needs a 29 bit DMA mask */
@@ -39,7 +35,6 @@
 #define AUDIGY_DMA_MASK		0xffffffffUL	/* 32bit mode */
 
 #define TMEMSIZE        256*1024
-#define TMEMSIZEREG     4
 
 #define IP_TO_CP(ip) ((ip == 0) ? 0 : (((0x00001000uL | (ip & 0x00000FFFL)) << (((ip >> 12) & 0x000FL) + 4)) & 0xFFFF0000uL))
 
@@ -283,7 +278,6 @@
 						/* before the new rate is guaranteed accurate.	*/
 #define TIMER_RATE_MASK		0x000003ff	/* Timer interrupt rate in sample periods	*/
 						/* 0 == 1024 periods, [1..4] are not useful	*/
-#define TIMER_RATE		0x0a00001a
 
 #define AC97DATA		0x1c		/* AC97 register set data register (16 bit)	*/
 
-- 
cgit v1.2.3


From ac9219d93a983b0e146878695beb64ea624747d4 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:18 +0200
Subject: ALSA: emu10k1: remove some bogus defines

Firstly, remove the FXWC_* defines - the comment on FXWC implies that
the relevant defines are the (A_)EXTOUT_* ones. It's unclear where this
came from - it was in the initial ALSA import, but neither the driver
from Creative nor kX-project have these defines.

Secondly, remove A_HR, which made plain no sense (was unused, and
clashed with FXRT). Amends commit cbb7d8f9b7b ("emu10k1: Update
registers defines for the Audigy 2/emu10k2.5").

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-5-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h    | 15 ---------------
 sound/pci/emu10k1/emupcm.c |  1 -
 2 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 931f3016d60d..6927fac1aa5f 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -471,7 +471,6 @@
 #define FXRT_CHANNELC		0x0f000000	/* Effects send bus number for channel's effects send C	*/
 #define FXRT_CHANNELD		0xf0000000	/* Effects send bus number for channel's effects send D	*/
 
-#define A_HR			0x0b	/* High Resolution. 24bit playback from host to DSP. */
 #define MAPA			0x0c		/* Cache map A						*/
 #define MAPB			0x0d		/* Cache map B						*/
 
@@ -626,20 +625,6 @@
 						/* is 16bit, 48KHz only. All 32 channels can be enabled */
 						/* simultaneously.					*/
 
-#define FXWC_DEFAULTROUTE_C     (1<<0)		/* left emu out? */
-#define FXWC_DEFAULTROUTE_B     (1<<1)		/* right emu out? */
-#define FXWC_DEFAULTROUTE_A     (1<<12)
-#define FXWC_DEFAULTROUTE_D     (1<<13)
-#define FXWC_ADCLEFT            (1<<18)
-#define FXWC_CDROMSPDIFLEFT     (1<<18)
-#define FXWC_ADCRIGHT           (1<<19)
-#define FXWC_CDROMSPDIFRIGHT    (1<<19)
-#define FXWC_MIC                (1<<20)
-#define FXWC_ZOOMLEFT           (1<<20)
-#define FXWC_ZOOMRIGHT          (1<<21)
-#define FXWC_SPDIFLEFT          (1<<22)		/* 0x00400000 */
-#define FXWC_SPDIFRIGHT         (1<<23)		/* 0x00800000 */
-
 #define A_TBLSZ			0x43	/* Effects Tank Internal Table Size. Only low byte or register used */
 
 #define TCBS			0x44		/* Tank cache buffer size register			*/
diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c
index a91ea0f863d3..cc07bf4c6eb4 100644
--- a/sound/pci/emu10k1/emupcm.c
+++ b/sound/pci/emu10k1/emupcm.c
@@ -1745,7 +1745,6 @@ int snd_emu10k1_pcm_efx(struct snd_emu10k1 *emu, int device)
 	 * to these
 	 */	
 	
-	/* emu->efx_voices_mask[0] = FXWC_DEFAULTROUTE_C | FXWC_DEFAULTROUTE_A; */
 	if (emu->audigy) {
 		emu->efx_voices_mask[0] = 0;
 		if (emu->card_capabilities->emu_model)
-- 
cgit v1.2.3


From 145ec1fd00a875d5afc54f52295472e2a898e35c Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:19 +0200
Subject: ALSA: emu10k1: pull in some register definitions from kX-project

For documentation purposes and later use.

Some pre-existing but (mostly) unused definitions were renamed for
consistency.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-6-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          | 78 ++++++++++++++++++++++++++++------------
 sound/pci/emu10k1/emu10k1_main.c |  2 +-
 2 files changed, 56 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 6927fac1aa5f..0d288cc618c1 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -61,8 +61,8 @@
 						/* the relevant bits and zero to the other bits	*/
 #define IPR_P16V		0x80000000	/* Bit set when the CA0151 P16V chip wishes
 						   to interrupt */
-#define IPR_GPIOMSG		0x20000000	/* GPIO message interrupt (RE'd, still not sure 
-						   which INTE bits enable it)			*/
+#define IPR_WATERMARK_REACHED	0x40000000
+#define IPR_A_GPIO		0x20000000	/* GPIO input pin change			*/
 
 /* The next two interrupts are for the midi port on the Audigy Drive (A_MPU1)			*/
 #define IPR_A_MIDITRANSBUFEMPTY2 0x10000000	/* MIDI UART transmit buffer empty		*/
@@ -122,10 +122,14 @@
 						/* behavior and possibly random segfaults and	*/
 						/* lockups if enabled.				*/
 
+#define INTE_A_GPIOENABLE 	0x00040000	/* Enable GPIO input change interrupts		*/
+
 /* The next two interrupts are for the midi port on the Audigy Drive (A_MPU1)			*/
 #define INTE_A_MIDITXENABLE2	0x00020000	/* Enable MIDI transmit-buffer-empty interrupts	*/
 #define INTE_A_MIDIRXENABLE2	0x00010000	/* Enable MIDI receive-buffer-empty interrupts	*/
 
+#define INTE_A_SPDIF_BUFFULL_ENABLE 	0x00008000
+#define INTE_A_SPDIF_HALFBUFFULL_ENABLE	0x00004000
 
 #define INTE_SAMPLERATETRACKER	0x00002000	/* Enable sample rate tracker interrupts	*/
 						/* NOTE: This bit must always be enabled       	*/
@@ -146,9 +150,10 @@
 #define WC			0x10		/* Wall Clock register				*/
 #define WC_SAMPLECOUNTER_MASK	0x03FFFFC0	/* Sample periods elapsed since reset		*/
 #define WC_SAMPLECOUNTER	0x14060010
-#define WC_CURRENTCHANNEL	0x0000003F	/* Channel [0..63] currently being serviced	*/
+#define WC_CURRENTCHANNEL_MASK	0x0000003F	/* Channel [0..63] currently being serviced	*/
 						/* NOTE: Each channel takes 1/64th of a sample	*/
 						/* period to be serviced.			*/
+#define WC_CURRENTCHANNEL	0x06000010
 
 #define HCFG			0x14		/* Hardware config register			*/
 						/* NOTE: There is no reason to use the legacy	*/
@@ -276,7 +281,7 @@
 						/* NOTE: After the rate is changed, a maximum	*/
 						/* of 1024 sample periods should be allowed	*/
 						/* before the new rate is guaranteed accurate.	*/
-#define TIMER_RATE_MASK		0x000003ff	/* Timer interrupt rate in sample periods	*/
+#define TIMER_RATE_MASK		0x03ff		/* Timer interrupt rate in sample periods	*/
 						/* 0 == 1024 periods, [1..4] are not useful	*/
 
 #define AC97DATA		0x1c		/* AC97 register set data register (16 bit)	*/
@@ -425,7 +430,7 @@
 
 #define CCCA			0x08		/* Filter Q, interp. ROM, byte size, cur. addr register */
 #define CCCA_RESONANCE		0xf0000000	/* Lowpass filter resonance (Q) height			*/
-#define CCCA_INTERPROMMASK	0x0e000000	/* Selects passband of interpolation ROM		*/
+#define CCCA_INTERPROM_MASK	0x0e000000	/* Selects passband of interpolation ROM		*/
 						/* 1 == full band, 7 == lowpass				*/
 						/* ROM 0 is used when pitch shifting downward or less	*/
 						/* then 3 semitones upward.  Increasingly higher ROM	*/
@@ -487,7 +492,7 @@
 						/* 0x8000-n == 666*n usec delay	       			*/
 
 #define ATKHLDV 		0x11		/* Volume envelope hold and attack register		*/
-#define ATKHLDV_PHASE0		0x00008000	/* 0 = Begin attack phase				*/
+#define ATKHLDV_PHASE0_MASK	0x00008000	/* 0 = Begin attack phase				*/
 #define ATKHLDV_HOLDTIME_MASK	0x00007f00	/* Envelope hold time (127-n == n*88.2msec)		*/
 #define ATKHLDV_ATTACKTIME_MASK	0x0000007f	/* Envelope attack time, log encoded			*/
 						/* 0 = infinite, 1 = 10.9msec, ... 0x7f = 5.5msec	*/
@@ -510,7 +515,7 @@
 						/* 0x8000-n == 666*n usec delay				*/
 
 #define ATKHLDM			0x15		/* Modulation envelope hold and attack register		*/
-#define ATKHLDM_PHASE0		0x00008000	/* 0 = Begin attack phase				*/
+#define ATKHLDM_PHASE0_MASK	0x00008000	/* 0 = Begin attack phase				*/
 #define ATKHLDM_HOLDTIME	0x00007f00	/* Envelope hold time (127-n == n*42msec)		*/
 #define ATKHLDM_ATTACKTIME	0x0000007f	/* Envelope attack time, log encoded			*/
 						/* 0 = infinite, 1 = 11msec, ... 0x7f = 5.5msec		*/
@@ -839,16 +844,15 @@
 #define A_FXWC1			0x74            /* Selects 0x7f-0x60 for FX recording           */
 #define A_FXWC2			0x75		/* Selects 0x9f-0x80 for FX recording           */
 
-/* Extended Hardware Control */
-#define A_SPDIF_SAMPLERATE	0x76		/* Set the sample rate of SPDIF output		*/
-#define A_SAMPLE_RATE		0x76		/* Various sample rate settings. */
-#define A_SAMPLE_RATE_NOT_USED  0x0ffc111e	/* Bits that are not used and cannot be set. 	*/
-#define A_SAMPLE_RATE_UNKNOWN	0xf0030001	/* Bits that can be set, but have unknown use. 	*/
+#define A_EHC			0x76		/* Extended Hardware Control */
+
+#define A_SPDIF_SAMPLERATE	A_EHC		/* Set the sample rate of SPDIF output		*/
 #define A_SPDIF_RATE_MASK	0x000000e0	/* Any other values for rates, just use 48000	*/
-#define A_SPDIF_48000		0x00000000
+#define A_SPDIF_48000		0x00000000	/* kX calls this BYPASS				*/
 #define A_SPDIF_192000		0x00000020
 #define A_SPDIF_96000		0x00000040
 #define A_SPDIF_44100		0x00000080
+#define A_SPDIF_MUTED		0x000000c0
 
 #define A_I2S_CAPTURE_RATE_MASK	0x00000e00	/* This sets the capture PCM rate, but it is    */
 #define A_I2S_CAPTURE_48000	0x00000000	/* unclear if this sets the ADC rate as well.	*/
@@ -856,17 +860,29 @@
 #define A_I2S_CAPTURE_96000	0x00000400
 #define A_I2S_CAPTURE_44100	0x00000800
 
-#define A_PCM_RATE_MASK		0x0000e000	/* This sets the playback PCM rate on the P16V	*/
-#define A_PCM_48000		0x00000000
-#define A_PCM_192000		0x00002000
-#define A_PCM_96000		0x00004000
-#define A_PCM_44100		0x00008000
+#define A_EHC_SRC48_MASK	0x0000e000	/* This sets the playback PCM rate on the P16V	*/
+#define A_EHC_SRC48_BYPASS	0x00000000
+#define A_EHC_SRC48_192		0x00002000
+#define A_EHC_SRC48_96		0x00004000
+#define A_EHC_SRC48_44		0x00008000
+#define A_EHC_SRC48_MUTED	0x0000c000
+
+#define A_EHC_P17V_TVM		0x00000001	/* Tank virtual memory mode			*/
+#define A_EHC_P17V_SEL0_MASK	0x00030000	/* Aka A_EHC_P16V_PB_RATE; 00: 48, 01: 44.1, 10: 96, 11: 192 */
+#define A_EHC_P17V_SEL1_MASK	0x000c0000
+#define A_EHC_P17V_SEL2_MASK	0x00300000
+#define A_EHC_P17V_SEL3_MASK	0x00c00000
+
+#define A_EHC_ASYNC_BYPASS	0x80000000
 
 #define A_SRT3			0x77		/* I2S0 Sample Rate Tracker Status		*/
 #define A_SRT4			0x78		/* I2S1 Sample Rate Tracker Status		*/
 #define A_SRT5			0x79		/* I2S2 Sample Rate Tracker Status		*/
 /* - default to 0x01080000 on my audigy 2 ZS --rlrevell	*/
 
+#define A_SRT_ESTSAMPLERATE	0x001fffff
+#define A_SRT_RATELOCKED	0x01000000
+
 #define A_TTDA			0x7a		/* Tank Table DMA Address			*/
 #define A_TTDD			0x7b		/* Tank Table DMA Data				*/
 
@@ -981,7 +997,7 @@
 #define EMU_HANA_WCLOCK_INT_44_1K	0x01
 #define EMU_HANA_WCLOCK_HANA_SPDIF_IN	0x02
 #define EMU_HANA_WCLOCK_HANA_ADAT_IN	0x03
-#define EMU_HANA_WCLOCK_SYNC_BNCN	0x04
+#define EMU_HANA_WCLOCK_SYNC_BNC	0x04
 #define EMU_HANA_WCLOCK_2ND_HANA	0x05
 #define EMU_HANA_WCLOCK_SRC_RESERVED	0x06
 #define EMU_HANA_WCLOCK_OFF		0x07 /* For testing, forces fallback to DEFCLOCK */
@@ -1010,10 +1026,10 @@
 #define EMU_HANA_IRQ_DOCK_LOST		0x08
 
 #define EMU_HANA_SPDIF_MODE	0x0a	/* 00xxxxx  5 bits SPDIF MODE  */
-#define EMU_HANA_SPDIF_MODE_TX_COMSUMER	0x00
+#define EMU_HANA_SPDIF_MODE_TX_CONSUMER	0x00
 #define EMU_HANA_SPDIF_MODE_TX_PRO	0x01
 #define EMU_HANA_SPDIF_MODE_TX_NOCOPY	0x02
-#define EMU_HANA_SPDIF_MODE_RX_COMSUMER	0x00
+#define EMU_HANA_SPDIF_MODE_RX_CONSUMER	0x00
 #define EMU_HANA_SPDIF_MODE_RX_PRO	0x04
 #define EMU_HANA_SPDIF_MODE_RX_NOCOPY	0x08
 #define EMU_HANA_SPDIF_MODE_RX_INVALID	0x10
@@ -1025,8 +1041,12 @@
 #define EMU_HANA_OPTICAL_OUT_ADAT	0x02
 
 #define EMU_HANA_MIDI_IN		0x0c	/* 000000x  1 bit  Control MIDI  */
-#define EMU_HANA_MIDI_IN_FROM_HAMOA	0x00 /* HAMOA MIDI in to Alice 2 MIDI B */
-#define EMU_HANA_MIDI_IN_FROM_DOCK	0x01 /* Audio Dock MIDI in to Alice 2 MIDI B */
+#define EMU_HANA_MIDI_INA_FROM_HAMOA	0x01 /* HAMOA MIDI in to Alice 2 MIDI A */
+#define EMU_HANA_MIDI_INA_FROM_DOCK1	0x02 /* Audio Dock-1 MIDI in to Alice 2 MIDI A */
+#define EMU_HANA_MIDI_INA_FROM_DOCK2	0x03 /* Audio Dock-2 MIDI in to Alice 2 MIDI A */
+#define EMU_HANA_MIDI_INB_FROM_HAMOA	0x08 /* HAMOA MIDI in to Alice 2 MIDI B */
+#define EMU_HANA_MIDI_INB_FROM_DOCK1	0x10 /* Audio Dock-1 MIDI in to Alice 2 MIDI B */
+#define EMU_HANA_MIDI_INB_FROM_DOCK2	0x18 /* Audio Dock-2 MIDI in to Alice 2 MIDI B */
 
 #define EMU_HANA_DOCK_LEDS_1	0x0d	/* 000xxxx  4 bit  Audio Dock LEDs  */
 #define EMU_HANA_DOCK_LEDS_1_MIDI1	0x01	/* MIDI 1 LED on */
@@ -1119,6 +1139,10 @@
 
 /* 0x30 - 0x3f Unused Read only registers */
 
+// The meaning of this is not clear; kX-project just calls it "lock" in some info-only code.
+#define EMU_HANA_LOCK_STS_LO	0x38	/* 0xxxxxx  lower 6 bits */
+#define EMU_HANA_LOCK_STS_HI	0x39	/* 0xxxxxx  upper 6 bits */
+
 /************************************************************************************************/
 /* EMU1010 Audio Destinations									*/
 /************************************************************************************************/
@@ -1257,8 +1281,12 @@
 #define EMU_DST_DOCK_SPDIF_RIGHT2	0x011f	/* Audio Dock SPDIF Right, 2nd or 96kHz */
 #define EMU_DST_HANA_SPDIF_LEFT1	0x0200	/* Hana SPDIF Left, 1st or 48kHz only */
 #define EMU_DST_HANA_SPDIF_LEFT2	0x0202	/* Hana SPDIF Left, 2nd or 96kHz */
+#define EMU_DST_HANA_SPDIF_LEFT3	0x0204	/* Hana SPDIF Left, 3rd or 192kHz */
+#define EMU_DST_HANA_SPDIF_LEFT4	0x0206	/* Hana SPDIF Left, 4th or 192kHz */
 #define EMU_DST_HANA_SPDIF_RIGHT1	0x0201	/* Hana SPDIF Right, 1st or 48kHz only */
 #define EMU_DST_HANA_SPDIF_RIGHT2	0x0203	/* Hana SPDIF Right, 2nd or 96kHz */
+#define EMU_DST_HANA_SPDIF_RIGHT3	0x0205	/* Hana SPDIF Right, 3rd or 192kHz */
+#define EMU_DST_HANA_SPDIF_RIGHT4	0x0207	/* Hana SPDIF Right, 4th or 192kHz */
 #define EMU_DST_HAMOA_DAC_LEFT1	0x0300	/* Hamoa DAC Left, 1st or 48kHz only */
 #define EMU_DST_HAMOA_DAC_LEFT2	0x0302	/* Hamoa DAC Left, 2nd or 96kHz */
 #define EMU_DST_HAMOA_DAC_LEFT3	0x0304	/* Hamoa DAC Left, 3rd or 192kHz */
@@ -1409,8 +1437,12 @@
 #define EMU_SRC_HANA_ADAT	0x0400	/* Hana ADAT 8 channel in +0 to +7 */
 #define EMU_SRC_HANA_SPDIF_LEFT1	0x0500	/* Hana SPDIF Left, 1st or 48kHz only */
 #define EMU_SRC_HANA_SPDIF_LEFT2	0x0502	/* Hana SPDIF Left, 2nd or 96kHz */
+#define EMU_SRC_HANA_SPDIF_LEFT3	0x0504	/* Hana SPDIF Left, 3rd or 192kHz */
+#define EMU_SRC_HANA_SPDIF_LEFT4	0x0506	/* Hana SPDIF Left, 4th or 192kHz */
 #define EMU_SRC_HANA_SPDIF_RIGHT1	0x0501	/* Hana SPDIF Right, 1st or 48kHz only */
 #define EMU_SRC_HANA_SPDIF_RIGHT2	0x0503	/* Hana SPDIF Right, 2nd or 96kHz */
+#define EMU_SRC_HANA_SPDIF_RIGHT3	0x0505	/* Hana SPDIF Right, 3rd or 192kHz */
+#define EMU_SRC_HANA_SPDIF_RIGHT4	0x0507	/* Hana SPDIF Right, 4th or 192kHz */
 
 /* Additional inputs for 1616(M)/Microdock */
 
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 39b63c4ca1df..007c445d72bf 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -1953,7 +1953,7 @@ static const unsigned char saved_regs[] = {
 	0xff /* end */
 };
 static const unsigned char saved_regs_audigy[] = {
-	A_ADCIDX, A_MICIDX, A_FXWC1, A_FXWC2, A_SAMPLE_RATE,
+	A_ADCIDX, A_MICIDX, A_FXWC1, A_FXWC2, A_EHC,
 	A_FXRT2, A_SENDAMOUNTS, A_FXRT1,
 	0xff /* end */
 };
-- 
cgit v1.2.3


From 2696d5a3b0ec9f242b0220999933b6c78502b1b2 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:20 +0200
Subject: ALSA: emu10k1: fixup DSP defines

Firstly, fix the distribution between public and private headers.
Otherwise, some of the already public macros wouldn't actually work, and
the SNDRV_EMU10K1_IOCTL_DBG_READ result for Audigy would be useless.

Secondly, add condition code registers for Audigy. These are just
aliases for selected constant registers, and thus are generation-
specific. At least A_CC_REG_ZERO is actually correct ...

Finally, shuffle around some defines to more logical places while at it,
and fix up some more comments.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230422161021.1143903-7-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h      |  46 ++------------
 include/uapi/sound/emu10k1.h | 144 ++++++++++++++++++++++++++++---------------
 2 files changed, 99 insertions(+), 91 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 0d288cc618c1..5958cae819fd 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -704,18 +704,15 @@
 
 #define GPSCS			0x51		/* General Purpose SPDIF channel status register	*/
 
+// Corresponding EMU10K1_DBG_* constants are in the public header
 #define DBG			0x52
 
 #define A_SPSC			0x52		/* S/PDIF Input C Channel Status			*/
 
 #define REG53			0x53		/* DO NOT PROGRAM THIS REGISTER!!! MAY DESTROY CHIP	*/
 
-#define A_DBG			 0x53
-#define A_DBG_SINGLE_STEP	 0x00020000	/* Set to zero to start dsp */
-#define A_DBG_ZC		 0x40000000	/* zero tram counter */
-#define A_DBG_STEP_ADDR		 0x000003ff
-#define A_DBG_SATURATION_OCCURED 0x20000000
-#define A_DBG_SATURATION_ADDR	 0x0ffc0000
+// Corresponding A_DBG_* constants are in the public header
+#define A_DBG			0x53
 
 // NOTE: 0x54,55,56: 64-bit (split over voices 0 & 1)
 #define SPCS0			0x54		/* SPDIF output Channel Status 0 register	*/
@@ -908,45 +905,14 @@
 #define A_FXRT_CHANNELD		0x3f000000
 
 /* 0x7f: Not used */
-/* Each FX general purpose register is 32 bits in length, all bits are used			*/
-#define FXGPREGBASE		0x100		/* FX general purpose registers base       	*/
-#define A_FXGPREGBASE		0x400		/* Audigy GPRs, 0x400 to 0x5ff			*/
-
-#define A_TANKMEMCTLREGBASE	0x100		/* Tank memory control registers base - only for Audigy */
-#define A_TANKMEMCTLREG_MASK	0x1f		/* only 5 bits used - only for Audigy */
-
-/* Tank audio data is logarithmically compressed down to 16 bits before writing to TRAM and is	*/
-/* decompressed back to 20 bits on a read.  There are a total of 160 locations, the last 32	*/
-/* locations are for external TRAM. 								*/
-#define TANKMEMDATAREGBASE	0x200		/* Tank memory data registers base     		*/
-#define TANKMEMDATAREG_MASK	0x000fffff	/* 20 bit tank audio data field			*/
-
-/* Combined address field and memory opcode or flag field.  160 locations, last 32 are external	*/
-#define TANKMEMADDRREGBASE	0x300		/* Tank memory address registers base		*/
-#define TANKMEMADDRREG_ADDR_MASK 0x000fffff	/* 20 bit tank address field			*/
-#define TANKMEMADDRREG_CLEAR	0x00800000	/* Clear tank memory				*/
-#define TANKMEMADDRREG_ALIGN	0x00400000	/* Align read or write relative to tank access	*/
-#define TANKMEMADDRREG_WRITE	0x00200000	/* Write to tank memory				*/
-#define TANKMEMADDRREG_READ	0x00100000	/* Read from tank memory			*/
 
-#define MICROCODEBASE		0x400		/* Microcode data base address			*/
+/* The public header defines the GPR and TRAM base addresses that
+ * are valid for _both_ CPU and DSP addressing. */
 
 /* Each DSP microcode instruction is mapped into 2 doublewords 					*/
 /* NOTE: When writing, always write the LO doubleword first.  Reads can be in either order.	*/
-#define LOWORD_OPX_MASK		0x000ffc00	/* Instruction operand X			*/
-#define LOWORD_OPY_MASK		0x000003ff	/* Instruction operand Y			*/
-#define HIWORD_OPCODE_MASK	0x00f00000	/* Instruction opcode				*/
-#define HIWORD_RESULT_MASK	0x000ffc00	/* Instruction result				*/
-#define HIWORD_OPA_MASK		0x000003ff	/* Instruction operand A			*/
-
-
-/* Audigy Soundcard have a different instruction format */
+#define MICROCODEBASE		0x400		/* Microcode data base address			*/
 #define A_MICROCODEBASE		0x600
-#define A_LOWORD_OPY_MASK	0x000007ff
-#define A_LOWORD_OPX_MASK	0x007ff000
-#define A_HIWORD_OPCODE_MASK	0x0f000000
-#define A_HIWORD_RESULT_MASK	0x007ff000
-#define A_HIWORD_OPA_MASK	0x000007ff
 
 
 /************************************************************************************************/
diff --git a/include/uapi/sound/emu10k1.h b/include/uapi/sound/emu10k1.h
index 33e5228f5d8c..c8e131d6da00 100644
--- a/include/uapi/sound/emu10k1.h
+++ b/include/uapi/sound/emu10k1.h
@@ -43,6 +43,19 @@
 #define iINTERP  0x0e	/* R = A + (X * (Y - A) >> 31)  ; saturation */
 #define iSKIP    0x0f	/* R = A (cc_reg), X (count), Y (cc_test) */
 
+#define LOWORD_OPX_MASK		0x000ffc00	/* Instruction operand X			*/
+#define LOWORD_OPY_MASK		0x000003ff	/* Instruction operand Y			*/
+#define HIWORD_OPCODE_MASK	0x00f00000	/* Instruction opcode				*/
+#define HIWORD_RESULT_MASK	0x000ffc00	/* Instruction result				*/
+#define HIWORD_OPA_MASK		0x000003ff	/* Instruction operand A			*/
+
+/* Audigy Soundcards have a different instruction format */
+#define A_LOWORD_OPX_MASK	0x007ff000
+#define A_LOWORD_OPY_MASK	0x000007ff
+#define A_HIWORD_OPCODE_MASK	0x0f000000
+#define A_HIWORD_RESULT_MASK	0x007ff000
+#define A_HIWORD_OPA_MASK	0x000007ff
+
 /* GPRs */
 #define FXBUS(x)	(0x00 + (x))	/* x = 0x00 - 0x0f */
 #define EXTIN(x)	(0x10 + (x))	/* x = 0x00 - 0x0f */
@@ -50,6 +63,16 @@
 #define FXBUS2(x)	(0x30 + (x))	/* x = 0x00 - 0x0f copies of fx buses for capture -> FXWC high 16 bits */
 					/* NB: 0x31 and 0x32 are shared with Center/LFE on SB live 5.1 */
 
+#define A_FXBUS(x)	(0x00 + (x))	/* x = 0x00 - 0x3f FX buses */
+#define A_EXTIN(x)	(0x40 + (x))	/* x = 0x00 - 0x0f physical ins */
+#define A_P16VIN(x)	(0x50 + (x))	/* x = 0x00 - 0x0f p16v ins (A2 only) "EMU32 inputs" */
+#define A_EXTOUT(x)	(0x60 + (x))	/* x = 0x00 - 0x1f physical outs -> A_FXWC1 0x79-7f unknown   */
+#define A_FXBUS2(x)	(0x80 + (x))	/* x = 0x00 - 0x1f extra outs used for EFX capture -> A_FXWC2 */
+#define A_EMU32OUTH(x)	(0xa0 + (x))	/* x = 0x00 - 0x0f "EMU32_OUT_10 - _1F" */
+#define A_EMU32OUTL(x)	(0xb0 + (x))	/* x = 0x00 - 0x0f "EMU32_OUT_01 - _0F" */
+#define A3_EMU32IN(x)	(0x160 + (x))	/* x = 0x00 - 0x1f "EMU32_IN_00 - _1F" - Only when .device = 0x0008 */
+#define A3_EMU32OUT(x)	(0x1E0 + (x))	/* x = 0x00 - 0x1f "EMU32_OUT_00 - _1F" - Only when .device = 0x0008 */
+
 #define C_00000000	0x40
 #define C_00000001	0x41
 #define C_00000002	0x42
@@ -78,12 +101,66 @@
 #define GPR_NOISE1	0x59		/* noise source */
 #define GPR_IRQ		0x5a		/* IRQ register */
 #define GPR_DBAC	0x5b		/* TRAM Delay Base Address Counter */
+
+/* Audigy constants */
+#define A_C_00000000	0xc0
+#define A_C_00000001	0xc1
+#define A_C_00000002	0xc2
+#define A_C_00000003	0xc3
+#define A_C_00000004	0xc4
+#define A_C_00000008	0xc5
+#define A_C_00000010	0xc6
+#define A_C_00000020	0xc7
+#define A_C_00000100	0xc8
+#define A_C_00010000	0xc9
+#define A_C_00000800	0xca
+#define A_C_10000000	0xcb
+#define A_C_20000000	0xcc
+#define A_C_40000000	0xcd
+#define A_C_80000000	0xce
+#define A_C_7fffffff	0xcf
+#define A_C_ffffffff	0xd0
+#define A_C_fffffffe	0xd1
+#define A_C_c0000000	0xd2
+#define A_C_4f1bbcdc	0xd3
+#define A_C_5a7ef9db	0xd4
+#define A_C_00100000	0xd5
+#define A_GPR_ACCU	0xd6		/* ACCUM, accumulator */
+#define A_GPR_COND	0xd7		/* CCR, condition register */
+#define A_GPR_NOISE0	0xd8		/* noise source */
+#define A_GPR_NOISE1	0xd9		/* noise source */
+#define A_GPR_IRQ	0xda		/* IRQ register */
+#define A_GPR_DBAC	0xdb		/* TRAM Delay Base Address Counter - internal */
+#define A_GPR_DBACE	0xde		/* TRAM Delay Base Address Counter - external */
+
+/* Each FX general purpose register is 32 bits in length, all bits are used			*/
+#define FXGPREGBASE		0x100		/* FX general purpose registers base       	*/
+#define A_FXGPREGBASE		0x400		/* Audigy GPRs, 0x400 to 0x5ff			*/
+
+#define A_TANKMEMCTLREGBASE	0x100		/* Tank memory control registers base - only for Audigy */
+#define A_TANKMEMCTLREG_MASK	0x1f		/* only 5 bits used - only for Audigy */
+
+/* Tank audio data is logarithmically compressed down to 16 bits before writing to TRAM and is	*/
+/* decompressed back to 20 bits on a read.  There are a total of 160 locations, the last 32	*/
+/* locations are for external TRAM. 								*/
+#define TANKMEMDATAREGBASE	0x200		/* Tank memory data registers base     		*/
+#define TANKMEMDATAREG_MASK	0x000fffff	/* 20 bit tank audio data field			*/
+
+/* Combined address field and memory opcode or flag field.  160 locations, last 32 are external	*/
+#define TANKMEMADDRREGBASE	0x300		/* Tank memory address registers base		*/
+#define TANKMEMADDRREG_ADDR_MASK 0x000fffff	/* 20 bit tank address field			*/
+#define TANKMEMADDRREG_CLEAR	0x00800000	/* Clear tank memory				*/
+#define TANKMEMADDRREG_ALIGN	0x00400000	/* Align read or write relative to tank access	*/
+#define TANKMEMADDRREG_WRITE	0x00200000	/* Write to tank memory				*/
+#define TANKMEMADDRREG_READ	0x00100000	/* Read from tank memory			*/
+
 #define GPR(x)		(FXGPREGBASE + (x)) /* free GPRs: x = 0x00 - 0xff */
 #define ITRAM_DATA(x)	(TANKMEMDATAREGBASE + 0x00 + (x)) /* x = 0x00 - 0x7f */
 #define ETRAM_DATA(x)	(TANKMEMDATAREGBASE + 0x80 + (x)) /* x = 0x00 - 0x1f */
 #define ITRAM_ADDR(x)	(TANKMEMADDRREGBASE + 0x00 + (x)) /* x = 0x00 - 0x7f */
 #define ETRAM_ADDR(x)	(TANKMEMADDRREGBASE + 0x80 + (x)) /* x = 0x00 - 0x1f */
 
+#define A_GPR(x)	(A_FXGPREGBASE + (x))
 #define A_ITRAM_DATA(x)	(TANKMEMDATAREGBASE + 0x00 + (x)) /* x = 0x00 - 0xbf */
 #define A_ETRAM_DATA(x)	(TANKMEMDATAREGBASE + 0xc0 + (x)) /* x = 0x00 - 0x3f */
 #define A_ITRAM_ADDR(x)	(TANKMEMADDRREGBASE + 0x00 + (x)) /* x = 0x00 - 0xbf */
@@ -91,17 +168,6 @@
 #define A_ITRAM_CTL(x)	(A_TANKMEMCTLREGBASE + 0x00 + (x)) /* x = 0x00 - 0xbf */
 #define A_ETRAM_CTL(x)	(A_TANKMEMCTLREGBASE + 0xc0 + (x)) /* x = 0x00 - 0x3f */
 
-#define A_FXBUS(x)	(0x00 + (x))	/* x = 0x00 - 0x3f FX buses */
-#define A_EXTIN(x)	(0x40 + (x))	/* x = 0x00 - 0x0f physical ins */
-#define A_P16VIN(x)	(0x50 + (x))	/* x = 0x00 - 0x0f p16v ins (A2 only) "EMU32 inputs" */
-#define A_EXTOUT(x)	(0x60 + (x))	/* x = 0x00 - 0x1f physical outs -> A_FXWC1 0x79-7f unknown   */
-#define A_FXBUS2(x)	(0x80 + (x))	/* x = 0x00 - 0x1f extra outs used for EFX capture -> A_FXWC2 */
-#define A_EMU32OUTH(x)	(0xa0 + (x))	/* x = 0x00 - 0x0f "EMU32_OUT_10 - _1F" - ??? */
-#define A_EMU32OUTL(x)	(0xb0 + (x))	/* x = 0x00 - 0x0f "EMU32_OUT_1 - _F" - ??? */
-#define A3_EMU32IN(x)	(0x160 + (x))	/* x = 0x00 - 0x3f "EMU32_IN_00 - _3F" - Only when .device = 0x0008 */
-#define A3_EMU32OUT(x)	(0x1E0 + (x))	/* x = 0x00 - 0x0f "EMU32_OUT_00 - _3F" - Only when .device = 0x0008 */
-#define A_GPR(x)	(A_FXGPREGBASE + (x))
-
 /* cc_reg constants */
 #define CC_REG_NORMALIZED C_00000001
 #define CC_REG_BORROW	C_00000002
@@ -110,6 +176,13 @@
 #define CC_REG_SATURATE	C_00000010
 #define CC_REG_NONZERO	C_00000100
 
+#define A_CC_REG_NORMALIZED	A_C_00000001
+#define A_CC_REG_BORROW		A_C_00000002
+#define A_CC_REG_MINUS		A_C_00000004
+#define A_CC_REG_ZERO		A_C_00000008
+#define A_CC_REG_SATURATE	A_C_00000010
+#define A_CC_REG_NONZERO	A_C_00000100
+
 /* FX buses */
 // These are arbitrary mappings; our DSP code simply expects
 // the config files to route the channels this way.
@@ -203,38 +276,7 @@
 #define A_EXTOUT_ADC_CAP_R	0x17	/*                    right */
 #define A_EXTOUT_MIC_CAP	0x18	/* Mic capture buffer */
 
-/* Audigy constants */
-#define A_C_00000000	0xc0
-#define A_C_00000001	0xc1
-#define A_C_00000002	0xc2
-#define A_C_00000003	0xc3
-#define A_C_00000004	0xc4
-#define A_C_00000008	0xc5
-#define A_C_00000010	0xc6
-#define A_C_00000020	0xc7
-#define A_C_00000100	0xc8
-#define A_C_00010000	0xc9
-#define A_C_00000800	0xca
-#define A_C_10000000	0xcb
-#define A_C_20000000	0xcc
-#define A_C_40000000	0xcd
-#define A_C_80000000	0xce
-#define A_C_7fffffff	0xcf
-#define A_C_ffffffff	0xd0
-#define A_C_fffffffe	0xd1
-#define A_C_c0000000	0xd2
-#define A_C_4f1bbcdc	0xd3
-#define A_C_5a7ef9db	0xd4
-#define A_C_00100000	0xd5
-#define A_GPR_ACCU	0xd6		/* ACCUM, accumulator */
-#define A_GPR_COND	0xd7		/* CCR, condition register */
-#define A_GPR_NOISE0	0xd8		/* noise source */
-#define A_GPR_NOISE1	0xd9		/* noise source */
-#define A_GPR_IRQ	0xda		/* IRQ register */
-#define A_GPR_DBAC	0xdb		/* TRAM Delay Base Address Counter - internal */
-#define A_GPR_DBACE	0xde		/* TRAM Delay Base Address Counter - external */
-
-/* definitions for debug register */
+/* Definitions for debug register. Note that these are for emu10k1 ONLY. */
 #define EMU10K1_DBG_ZC			0x80000000	/* zero tram counter */
 #define EMU10K1_DBG_SATURATION_OCCURED	0x02000000	/* saturation control */
 #define EMU10K1_DBG_SATURATION_ADDR	0x01ff0000	/* saturation address */
@@ -243,14 +285,14 @@
 #define EMU10K1_DBG_CONDITION_CODE	0x00003e00	/* condition code */
 #define EMU10K1_DBG_SINGLE_STEP_ADDR	0x000001ff	/* single step address */
 
-/* tank memory address line */
-#ifndef __KERNEL__
-#define TANKMEMADDRREG_ADDR_MASK 0x000fffff	/* 20 bit tank address field			*/
-#define TANKMEMADDRREG_CLEAR	 0x00800000	/* Clear tank memory				*/
-#define TANKMEMADDRREG_ALIGN	 0x00400000	/* Align read or write relative to tank access	*/
-#define TANKMEMADDRREG_WRITE	 0x00200000	/* Write to tank memory				*/
-#define TANKMEMADDRREG_READ	 0x00100000	/* Read from tank memory			*/
-#endif
+/* Definitions for emu10k2 debug register. */
+#define A_DBG_ZC			0x40000000	/* zero tram counter */
+#define A_DBG_SATURATION_OCCURED	0x20000000
+#define A_DBG_SATURATION_ADDR		0x0ffc0000
+#define A_DBG_SINGLE_STEP		0x00020000	/* Set to zero to start dsp */
+#define A_DBG_STEP			0x00010000
+#define A_DBG_CONDITION_CODE		0x0000f800
+#define A_DBG_STEP_ADDR			0x000003ff
 
 struct snd_emu10k1_fx8010_info {
 	unsigned int internal_tram_size;	/* in samples */
-- 
cgit v1.2.3


From 8d60d5cabea12cd533e4f216ab3b773390165aac Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 22 Apr 2023 18:10:20 +0200
Subject: ALSA: emu10k1: use high-level I/O functions also during init

... and also use more pre-defined constants on the way (some of which
required adjustment).

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>

Link: https://lore.kernel.org/r/20230422161021.1143967-1-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h          |  9 +++++----
 sound/pci/emu10k1/emu10k1_main.c | 20 ++++++--------------
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 5958cae819fd..05a09826eef0 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -852,10 +852,11 @@
 #define A_SPDIF_MUTED		0x000000c0
 
 #define A_I2S_CAPTURE_RATE_MASK	0x00000e00	/* This sets the capture PCM rate, but it is    */
-#define A_I2S_CAPTURE_48000	0x00000000	/* unclear if this sets the ADC rate as well.	*/
-#define A_I2S_CAPTURE_192000	0x00000200
-#define A_I2S_CAPTURE_96000	0x00000400
-#define A_I2S_CAPTURE_44100	0x00000800
+#define A_I2S_CAPTURE_RATE	0x03090076	/* unclear if this sets the ADC rate as well.	*/
+#define A_I2S_CAPTURE_48000	0x0
+#define A_I2S_CAPTURE_192000	0x1
+#define A_I2S_CAPTURE_96000	0x2
+#define A_I2S_CAPTURE_44100	0x4
 
 #define A_EHC_SRC48_MASK	0x0000e000	/* This sets the playback PCM rate on the P16V	*/
 #define A_EHC_SRC48_BYPASS	0x00000000
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index d9199d92ccd3..3abdaf1b9624 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -187,10 +187,7 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir)
 	} else if (emu->card_capabilities->ca0151_chip) { /* audigy2 */
 		/* Hacks for Alice3 to work independent of haP16V driver */
 		/* Setup SRCMulti_I2S SamplingRate */
-		tmp = snd_emu10k1_ptr_read(emu, A_SPDIF_SAMPLERATE, 0);
-		tmp &= 0xfffff1ff;
-		tmp |= (0x2<<9);
-		snd_emu10k1_ptr_write(emu, A_SPDIF_SAMPLERATE, 0, tmp);
+		snd_emu10k1_ptr_write(emu, A_I2S_CAPTURE_RATE, 0, A_I2S_CAPTURE_96000);
 
 		/* Setup SRCSel (Enable Spdif,I2S SRCMulti) */
 		snd_emu10k1_ptr20_write(emu, SRCSel, 0, 0x14);
@@ -206,25 +203,20 @@ static int snd_emu10k1_init(struct snd_emu10k1 *emu, int enable_ir)
 		/* Hacks for Alice3 to work independent of haP16V driver */
 		dev_info(emu->card->dev, "Audigy2 value: Special config.\n");
 		/* Setup SRCMulti_I2S SamplingRate */
-		tmp = snd_emu10k1_ptr_read(emu, A_SPDIF_SAMPLERATE, 0);
-		tmp &= 0xfffff1ff;
-		tmp |= (0x2<<9);
-		snd_emu10k1_ptr_write(emu, A_SPDIF_SAMPLERATE, 0, tmp);
+		snd_emu10k1_ptr_write(emu, A_I2S_CAPTURE_RATE, 0, A_I2S_CAPTURE_96000);
 
 		/* Setup SRCSel (Enable Spdif,I2S SRCMulti) */
-		outl(0x600000, emu->port + 0x20);
-		outl(0x14, emu->port + 0x24);
+		snd_emu10k1_ptr20_write(emu, P17V_SRCSel, 0, 0x14);
 
 		/* Setup SRCMulti Input Audio Enable */
-		outl(0x7b0000, emu->port + 0x20);
-		outl(0xFF000000, emu->port + 0x24);
+		snd_emu10k1_ptr20_write(emu, P17V_MIXER_I2S_ENABLE, 0, 0xFF000000);
 
 		/* Setup SPDIF Out Audio Enable */
 		/* The Audigy 2 Value has a separate SPDIF out,
 		 * so no need for a mixer switch
 		 */
-		outl(0x7a0000, emu->port + 0x20);
-		outl(0xFF000000, emu->port + 0x24);
+		snd_emu10k1_ptr20_write(emu, P17V_MIXER_SPDIF_ENABLE, 0, 0xFF000000);
+
 		tmp = inw(emu->port + A_IOCFG) & ~0x8; /* Clear bit 3 */
 		outw(tmp, emu->port + A_IOCFG);
 	}
-- 
cgit v1.2.3


From 87eff2ec57b6d68d294013d8dd21e839a1175e3a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Apr 2023 09:43:57 +0000
Subject: net: optimize napi_threaded_poll() vs RPS/RFS

We use napi_threaded_poll() in order to reduce our softirq dependency.

We can add a followup of 821eba962d95 ("net: optimize napi_schedule_rps()")
to further remove the need of firing NET_RX_SOFTIRQ whenever
RPS/RFS are used.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/core/dev.c            | 12 ++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a6a3e9457d6c..08fbd4622ccf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3194,7 +3194,10 @@ struct softnet_data {
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
+
 	bool			in_net_rx_action;
+	bool			in_napi_threaded_poll;
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 	struct sd_flow_limit __rcu *flow_limit;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 7d9ec23f97c6..735096d42c1d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4603,10 +4603,10 @@ static void napi_schedule_rps(struct softnet_data *sd)
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
-		/* If not called from net_rx_action()
+		/* If not called from net_rx_action() or napi_threaded_poll()
 		 * we have to raise NET_RX_SOFTIRQ.
 		 */
-		if (!mysd->in_net_rx_action)
+		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
 			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 		return;
 	}
@@ -6631,11 +6631,19 @@ static int napi_threaded_poll(void *data)
 
 			local_bh_disable();
 			sd = this_cpu_ptr(&softnet_data);
+			sd->in_napi_threaded_poll = true;
 
 			have = netpoll_poll_lock(napi);
 			__napi_poll(napi, &repoll);
 			netpoll_poll_unlock(have);
 
+			sd->in_napi_threaded_poll = false;
+			barrier();
+
+			if (sd_has_rps_ipi_waiting(sd)) {
+				local_irq_disable();
+				net_rps_action_and_irq_enable(sd);
+			}
 			skb_defer_free_flush(sd);
 			local_bh_enable();
 
-- 
cgit v1.2.3


From f90615ada0b1e21a9d93ff89b04549fd7a92c92b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:55:53 +0300
Subject: net: vlan: don't adjust MAC header in __vlan_insert_inner_tag()
 unless set

This is a preparatory change for the deletion of skb_reset_mac_header(skb)
from __dev_queue_xmit(). After that deletion, skb_mac_header(skb) will
no longer be set in TX paths, from which __vlan_insert_inner_tag() can
still be called (perhaps indirectly).

If we don't make this change, then an unset MAC header (equal to ~0U)
will become set after the adjustment with VLAN_HLEN.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 6864b89ef868..90b76d63c11c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -351,7 +351,8 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
 	/* Move the mac header sans proto to the beginning of the new header. */
 	if (likely(mac_len > ETH_TLEN))
 		memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
-	skb->mac_header -= VLAN_HLEN;
+	if (skb_mac_header_was_set(skb))
+		skb->mac_header -= VLAN_HLEN;
 
 	veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
 
-- 
cgit v1.2.3


From 1f5020acb33f926030f62563c86dffca35c7b701 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:55:54 +0300
Subject: net: vlan: introduce skb_vlan_eth_hdr()

Similar to skb_eth_hdr() introduced in commit 96cc4b69581d ("macvlan: do
not assume mac_header is set in macvlan_broadcast()"), let's introduce a
skb_vlan_eth_hdr() helper which can be used in TX-only code paths to get
to the VLAN header based on skb->data rather than based on the
skb_mac_header(skb).

We also consolidate the drivers that dereference skb->data to go through
this helper.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c      |  3 +--
 drivers/net/ethernet/emulex/benet/be_main.c          |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c      |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c          |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c        |  2 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c       |  4 ++--
 drivers/net/ethernet/sfc/tx_tso.c                    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c    |  7 ++-----
 drivers/staging/gdm724x/gdm_lte.c                    |  4 ++--
 include/linux/if_vlan.h                              | 12 ++++++++++--
 net/batman-adv/soft-interface.c                      |  2 +-
 12 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 12083b9679b5..6ea5521074d3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1935,8 +1935,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
 
 		/* Skip VLAN tag if present */
 		if (ether_type == ETH_P_8021Q) {
-			struct vlan_ethhdr *vhdr =
-				(struct vlan_ethhdr *)skb->data;
+			struct vlan_ethhdr *vhdr = skb_vlan_eth_hdr(skb);
 
 			ether_type = ntohs(vhdr->h_vlan_encapsulated_proto);
 		}
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index aed1b622f51f..7e408bcc88de 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1124,7 +1124,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
 						  struct be_wrb_params
 						  *wrb_params)
 {
-	struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+	struct vlan_ethhdr *veh = skb_vlan_eth_hdr(skb);
 	unsigned int eth_hdr_len;
 	struct iphdr *ip;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 5caea154362f..7356ad965487 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1532,7 +1532,7 @@ static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring,
 	if (unlikely(rc < 0))
 		return rc;
 
-	vhdr = (struct vlan_ethhdr *)skb->data;
+	vhdr = skb_vlan_eth_hdr(skb);
 	vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT)
 					 & VLAN_PRIO_MASK);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c8c2cbaa0ede..8b8bf4880faa 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3063,7 +3063,7 @@ static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
 			rc = skb_cow_head(skb, 0);
 			if (rc < 0)
 				return rc;
-			vhdr = (struct vlan_ethhdr *)skb->data;
+			vhdr = skb_vlan_eth_hdr(skb);
 			vhdr->h_vlan_TCI = htons(tx_flags >>
 						 I40E_TX_FLAGS_VLAN_SHIFT);
 		} else {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f2604fc05991..e961ef4bbf4d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8798,7 +8798,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 
 			if (skb_cow_head(skb, 0))
 				goto out_drop;
-			vhdr = (struct vlan_ethhdr *)skb->data;
+			vhdr = skb_vlan_eth_hdr(skb);
 			vhdr->h_vlan_TCI = htons(tx_flags >>
 						 IXGBE_TX_FLAGS_VLAN_SHIFT);
 		} else {
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 59d0dd862fd1..1d1e183d3a8b 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -1854,7 +1854,7 @@ netxen_tso_check(struct net_device *netdev,
 
 	if (protocol == cpu_to_be16(ETH_P_8021Q)) {
 
-		vh = (struct vlan_ethhdr *)skb->data;
+		vh = skb_vlan_eth_hdr(skb);
 		protocol = vh->h_vlan_encapsulated_proto;
 		flags = FLAGS_VLAN_TAGGED;
 
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
index 92930a055cbc..41894d154013 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
@@ -318,7 +318,7 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter,
 
 	if (adapter->flags & QLCNIC_VLAN_FILTERING) {
 		if (protocol == ETH_P_8021Q) {
-			vh = (struct vlan_ethhdr *)skb->data;
+			vh = skb_vlan_eth_hdr(skb);
 			vlan_id = ntohs(vh->h_vlan_TCI);
 		} else if (skb_vlan_tag_present(skb)) {
 			vlan_id = skb_vlan_tag_get(skb);
@@ -468,7 +468,7 @@ static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter,
 	u32 producer = tx_ring->producer;
 
 	if (protocol == ETH_P_8021Q) {
-		vh = (struct vlan_ethhdr *)skb->data;
+		vh = skb_vlan_eth_hdr(skb);
 		flags = QLCNIC_FLAGS_VLAN_TAGGED;
 		vlan_tci = ntohs(vh->h_vlan_TCI);
 		protocol = ntohs(vh->h_vlan_encapsulated_proto);
diff --git a/drivers/net/ethernet/sfc/tx_tso.c b/drivers/net/ethernet/sfc/tx_tso.c
index 898e5c61d908..d381d8164f07 100644
--- a/drivers/net/ethernet/sfc/tx_tso.c
+++ b/drivers/net/ethernet/sfc/tx_tso.c
@@ -147,7 +147,7 @@ static __be16 efx_tso_check_protocol(struct sk_buff *skb)
 	EFX_WARN_ON_ONCE_PARANOID(((struct ethhdr *)skb->data)->h_proto !=
 				  protocol);
 	if (protocol == htons(ETH_P_8021Q)) {
-		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+		struct vlan_ethhdr *veh = skb_vlan_eth_hdr(skb);
 
 		protocol = veh->h_vlan_encapsulated_proto;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f116e4ae293b..0fca81507a77 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4569,13 +4569,10 @@ dma_map_err:
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct vlan_ethhdr *veth;
-	__be16 vlan_proto;
+	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
+	__be16 vlan_proto = veth->h_vlan_proto;
 	u16 vlanid;
 
-	veth = (struct vlan_ethhdr *)skb->data;
-	vlan_proto = veth->h_vlan_proto;
-
 	if ((vlan_proto == htons(ETH_P_8021Q) &&
 	     dev->features & NETIF_F_HW_VLAN_CTAG_RX) ||
 	    (vlan_proto == htons(ETH_P_8021AD) &&
diff --git a/drivers/staging/gdm724x/gdm_lte.c b/drivers/staging/gdm724x/gdm_lte.c
index 671ee8843c88..5703a9ddb6d0 100644
--- a/drivers/staging/gdm724x/gdm_lte.c
+++ b/drivers/staging/gdm724x/gdm_lte.c
@@ -349,7 +349,7 @@ static s32 gdm_lte_tx_nic_type(struct net_device *dev, struct sk_buff *skb)
 	/* Get ethernet protocol */
 	eth = (struct ethhdr *)skb->data;
 	if (ntohs(eth->h_proto) == ETH_P_8021Q) {
-		vlan_eth = (struct vlan_ethhdr *)skb->data;
+		vlan_eth = skb_vlan_eth_hdr(skb);
 		mac_proto = ntohs(vlan_eth->h_vlan_encapsulated_proto);
 		network_data = skb->data + VLAN_ETH_HLEN;
 		nic_type |= NIC_TYPE_F_VLAN;
@@ -435,7 +435,7 @@ static netdev_tx_t gdm_lte_tx(struct sk_buff *skb, struct net_device *dev)
 	 * driver based on the NIC mac
 	 */
 	if (nic_type & NIC_TYPE_F_VLAN) {
-		struct vlan_ethhdr *vlan_eth = (struct vlan_ethhdr *)skb->data;
+		struct vlan_ethhdr *vlan_eth = skb_vlan_eth_hdr(skb);
 
 		nic->vlan_id = ntohs(vlan_eth->h_vlan_TCI) & VLAN_VID_MASK;
 		data_buf = skb->data + (VLAN_ETH_HLEN - ETH_HLEN);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 90b76d63c11c..3698f2b391cd 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -62,6 +62,14 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
 	return (struct vlan_ethhdr *)skb_mac_header(skb);
 }
 
+/* Prefer this version in TX path, instead of
+ * skb_reset_mac_header() + vlan_eth_hdr()
+ */
+static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
+{
+	return (struct vlan_ethhdr *)skb->data;
+}
+
 #define VLAN_PRIO_MASK		0xe000 /* Priority Code Point */
 #define VLAN_PRIO_SHIFT		13
 #define VLAN_CFI_MASK		0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
@@ -529,7 +537,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
  */
 static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 {
-	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data;
+	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
 
 	if (!eth_type_vlan(veth->h_vlan_proto))
 		return -EINVAL;
@@ -713,7 +721,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
 		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
 			return false;
 
-		veh = (struct vlan_ethhdr *)skb->data;
+		veh = skb_vlan_eth_hdr(skb);
 		protocol = veh->h_vlan_encapsulated_proto;
 	}
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 125f4628687c..d3fdf82282af 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -439,7 +439,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
 		if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
 			goto dropped;
 
-		vhdr = (struct vlan_ethhdr *)skb->data;
+		vhdr = skb_vlan_eth_hdr(skb);
 
 		/* drop batman-in-batman packets to prevent loops */
 		if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
-- 
cgit v1.2.3


From 0bcf2e4aca6c29a07555b713f2fb461dc38d5977 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:56:01 +0300
Subject: net: dsa: tag_ocelot: call only the relevant portion of
 __skb_vlan_pop() on TX

ocelot_xmit_get_vlan_info() calls __skb_vlan_pop() as the most
appropriate helper I could find which strips away a VLAN header.
That's all I need it to do, but __skb_vlan_pop() has more logic, which
will become incompatible with the future revert of commit 6d1ccff62780
("net: reset mac header in dev_start_xmit()").

Namely, it performs a sanity check on skb_mac_header(), which will stop
being set after the above revert, so it will return an error instead of
removing the VLAN tag.

ocelot_xmit_get_vlan_info() gets called in 2 circumstances:

(1) the port is under a VLAN-aware bridge and the bridge sends
    VLAN-tagged packets

(2) the port is under a VLAN-aware bridge and somebody else (an 8021q
    upper) sends VLAN-tagged packets (using a VID that isn't in the
    bridge vlan tables)

In case (1), there is actually no bug to defend against, because
br_dev_xmit() calls skb_reset_mac_header() and things continue to work.

However, in case (2), illustrated using the commands below, it can be
seen that our intervention is needed, since __skb_vlan_pop() complains:

$ ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up
$ ip link set $eth master br0 && ip link set $eth up
$ ip link add link $eth name $eth.100 type vlan id 100 && ip link set $eth.100 up
$ ip addr add 192.168.100.1/24 dev $eth.100

I could fend off the checks in __skb_vlan_pop() with some
skb_mac_header_was_set() calls, but seeing how few callers of
__skb_vlan_pop() there are from TX paths, that seems rather
unproductive.

As an alternative solution, extract the bare minimum logic to strip a
VLAN header, and move it to a new helper named vlan_remove_tag(), close
to the definition of vlan_insert_tag(). Document it appropriately and
make ocelot_xmit_get_vlan_info() call this smaller helper instead.

Seeing that it doesn't appear illegal to test skb->protocol in the TX
path, I guess it would be a good for vlan_remove_tag() to also absorb
the vlan_set_encap_proto() function call.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 21 +++++++++++++++++++++
 net/core/skbuff.c       |  8 +-------
 net/dsa/tag_ocelot.c    |  2 +-
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3698f2b391cd..0f40f379d75c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -685,6 +685,27 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
 		skb->protocol = htons(ETH_P_802_2);
 }
 
+/**
+ * vlan_remove_tag - remove outer VLAN tag from payload
+ * @skb: skbuff to remove tag from
+ * @vlan_tci: buffer to store value
+ *
+ * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
+ * pointing at the MAC header.
+ *
+ * Returns a new pointer to skb->data, or NULL on failure to pull.
+ */
+static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
+{
+	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+
+	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+	vlan_set_encap_proto(skb, vhdr);
+	return __skb_pull(skb, VLAN_HLEN);
+}
+
 /**
  * skb_vlan_tagged - check if skb is vlan tagged.
  * @skb: skbuff to query
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 304a966164d8..c7c141f6fc14 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5996,7 +5996,6 @@ EXPORT_SYMBOL(skb_ensure_writable);
  */
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 {
-	struct vlan_hdr *vhdr;
 	int offset = skb->data - skb_mac_header(skb);
 	int err;
 
@@ -6012,13 +6011,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 
 	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
 
-	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
-	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
-
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
-	__skb_pull(skb, VLAN_HLEN);
+	vlan_remove_tag(skb, vlan_tci);
 
-	vlan_set_encap_proto(skb, vhdr);
 	skb->mac_header += VLAN_HLEN;
 
 	if (skb_network_offset(skb) < ETH_HLEN)
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 73ee09de1a3a..20bf7074d5a6 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -30,7 +30,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
 	br_vlan_get_proto(br, &proto);
 
 	if (ntohs(hdr->h_vlan_proto) == proto) {
-		__skb_vlan_pop(skb, &tci);
+		vlan_remove_tag(skb, &tci);
 		*vlan_tci = tci;
 	} else {
 		rcu_read_lock();
-- 
cgit v1.2.3


From a80d2c545ded86d0350b9a870735565d8b749786 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 13 Feb 2023 14:28:55 -0800
Subject: Bluetooth: MGMT: Use BIT macro when defining bitfields

This makes use of BIT macro when defining bitfields which makes it
clearer what bit it is toggling.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/mgmt.h | 80 ++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index e18a927669c0..a5801649f619 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -91,26 +91,26 @@ struct mgmt_rp_read_index_list {
 #define MGMT_MAX_NAME_LENGTH		(HCI_MAX_NAME_LENGTH + 1)
 #define MGMT_MAX_SHORT_NAME_LENGTH	(HCI_MAX_SHORT_NAME_LENGTH + 1)
 
-#define MGMT_SETTING_POWERED		0x00000001
-#define MGMT_SETTING_CONNECTABLE	0x00000002
-#define MGMT_SETTING_FAST_CONNECTABLE	0x00000004
-#define MGMT_SETTING_DISCOVERABLE	0x00000008
-#define MGMT_SETTING_BONDABLE		0x00000010
-#define MGMT_SETTING_LINK_SECURITY	0x00000020
-#define MGMT_SETTING_SSP		0x00000040
-#define MGMT_SETTING_BREDR		0x00000080
-#define MGMT_SETTING_HS			0x00000100
-#define MGMT_SETTING_LE			0x00000200
-#define MGMT_SETTING_ADVERTISING	0x00000400
-#define MGMT_SETTING_SECURE_CONN	0x00000800
-#define MGMT_SETTING_DEBUG_KEYS		0x00001000
-#define MGMT_SETTING_PRIVACY		0x00002000
-#define MGMT_SETTING_CONFIGURATION	0x00004000
-#define MGMT_SETTING_STATIC_ADDRESS	0x00008000
-#define MGMT_SETTING_PHY_CONFIGURATION	0x00010000
-#define MGMT_SETTING_WIDEBAND_SPEECH	0x00020000
-#define MGMT_SETTING_CIS_CENTRAL	0x00040000
-#define MGMT_SETTING_CIS_PERIPHERAL	0x00080000
+#define MGMT_SETTING_POWERED		BIT(0)
+#define MGMT_SETTING_CONNECTABLE	BIT(1)
+#define MGMT_SETTING_FAST_CONNECTABLE	BIT(2)
+#define MGMT_SETTING_DISCOVERABLE	BIT(3)
+#define MGMT_SETTING_BONDABLE		BIT(4)
+#define MGMT_SETTING_LINK_SECURITY	BIT(5)
+#define MGMT_SETTING_SSP		BIT(6)
+#define MGMT_SETTING_BREDR		BIT(7)
+#define MGMT_SETTING_HS			BIT(8)
+#define MGMT_SETTING_LE			BIT(9)
+#define MGMT_SETTING_ADVERTISING	BIT(10)
+#define MGMT_SETTING_SECURE_CONN	BIT(11)
+#define MGMT_SETTING_DEBUG_KEYS		BIT(12)
+#define MGMT_SETTING_PRIVACY		BIT(13)
+#define MGMT_SETTING_CONFIGURATION	BIT(14)
+#define MGMT_SETTING_STATIC_ADDRESS	BIT(15)
+#define MGMT_SETTING_PHY_CONFIGURATION	BIT(16)
+#define MGMT_SETTING_WIDEBAND_SPEECH	BIT(17)
+#define MGMT_SETTING_CIS_CENTRAL	BIT(18)
+#define MGMT_SETTING_CIS_PERIPHERAL	BIT(19)
 
 #define MGMT_OP_READ_INFO		0x0004
 #define MGMT_READ_INFO_SIZE		0
@@ -635,21 +635,21 @@ struct mgmt_rp_get_phy_configuration {
 } __packed;
 #define MGMT_GET_PHY_CONFIGURATION_SIZE	0
 
-#define MGMT_PHY_BR_1M_1SLOT	0x00000001
-#define MGMT_PHY_BR_1M_3SLOT	0x00000002
-#define MGMT_PHY_BR_1M_5SLOT	0x00000004
-#define MGMT_PHY_EDR_2M_1SLOT	0x00000008
-#define MGMT_PHY_EDR_2M_3SLOT	0x00000010
-#define MGMT_PHY_EDR_2M_5SLOT	0x00000020
-#define MGMT_PHY_EDR_3M_1SLOT	0x00000040
-#define MGMT_PHY_EDR_3M_3SLOT	0x00000080
-#define MGMT_PHY_EDR_3M_5SLOT	0x00000100
-#define MGMT_PHY_LE_1M_TX		0x00000200
-#define MGMT_PHY_LE_1M_RX		0x00000400
-#define MGMT_PHY_LE_2M_TX		0x00000800
-#define MGMT_PHY_LE_2M_RX		0x00001000
-#define MGMT_PHY_LE_CODED_TX	0x00002000
-#define MGMT_PHY_LE_CODED_RX	0x00004000
+#define MGMT_PHY_BR_1M_1SLOT		BIT(0)
+#define MGMT_PHY_BR_1M_3SLOT		BIT(1)
+#define MGMT_PHY_BR_1M_5SLOT		BIT(2)
+#define MGMT_PHY_EDR_2M_1SLOT		BIT(3)
+#define MGMT_PHY_EDR_2M_3SLOT		BIT(4)
+#define MGMT_PHY_EDR_2M_5SLOT		BIT(5)
+#define MGMT_PHY_EDR_3M_1SLOT		BIT(6)
+#define MGMT_PHY_EDR_3M_3SLOT		BIT(7)
+#define MGMT_PHY_EDR_3M_5SLOT		BIT(8)
+#define MGMT_PHY_LE_1M_TX		BIT(9)
+#define MGMT_PHY_LE_1M_RX		BIT(10)
+#define MGMT_PHY_LE_2M_TX		BIT(11)
+#define MGMT_PHY_LE_2M_RX		BIT(12)
+#define MGMT_PHY_LE_CODED_TX		BIT(13)
+#define MGMT_PHY_LE_CODED_RX		BIT(14)
 
 #define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \
 			     MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \
@@ -974,11 +974,11 @@ struct mgmt_ev_auth_failed {
 	__u8	status;
 } __packed;
 
-#define MGMT_DEV_FOUND_CONFIRM_NAME		0x01
-#define MGMT_DEV_FOUND_LEGACY_PAIRING		0x02
-#define MGMT_DEV_FOUND_NOT_CONNECTABLE		0x04
-#define MGMT_DEV_FOUND_INITIATED_CONN		0x08
-#define MGMT_DEV_FOUND_NAME_REQUEST_FAILED	0x10
+#define MGMT_DEV_FOUND_CONFIRM_NAME		BIT(0)
+#define MGMT_DEV_FOUND_LEGACY_PAIRING		BIT(1)
+#define MGMT_DEV_FOUND_NOT_CONNECTABLE		BIT(2)
+#define MGMT_DEV_FOUND_INITIATED_CONN		BIT(3)
+#define MGMT_DEV_FOUND_NAME_REQUEST_FAILED	BIT(4)
 
 #define MGMT_EV_DEVICE_FOUND		0x0012
 struct mgmt_ev_device_found {
-- 
cgit v1.2.3


From d4b20f0b8491bfb3238ebf7df4b13243189620d2 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 24 Feb 2023 15:27:33 -0800
Subject: Bluetooth: hci_core: Make hci_conn_hash_add append to the list

This makes hci_conn_hash_add append to the tail of the conn_hash so it
matches the order they are created, this is required if the controller
attempts to match the order of ACL with CIS which uses append logic
when programming the CIS ids on the CIG.

The result of this change affects Create CIS:

Before:

< HCI Command: LE Create Connected Isochronous Stream (0x08|0x0064) plen 9
        Number of CIS: 2
        CIS Handle: 2560
        ACL Handle: 3586
        CIS Handle: 2561
        ACL Handle: 3585

After:

< HCI Command: LE Create Connected Isochronous Stream (0x08|0x0064) plen 9
        Number of CIS: 2
        CIS Handle: 2560
        ACL Handle: 3585
        CIS Handle: 2561
        ACL Handle: 3586

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d5311ceb21c6..e22e45fbe8db 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -979,7 +979,7 @@ static inline bool hci_conn_sc_enabled(struct hci_conn *conn)
 static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
-	list_add_rcu(&c->list, &h->list);
+	list_add_tail_rcu(&c->list, &h->list);
 	switch (c->type) {
 	case ACL_LINK:
 		h->acl_num++;
-- 
cgit v1.2.3


From b8b23001b8025a61f0979578884a74faa825023e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 8 Mar 2023 16:16:31 -0800
Subject: Bluetooth: L2CAP: Delay identity address updates

This delays the identity address updates to give time for userspace to
process the new address otherwise there is a risk that userspace
creates a duplicated device if the MGMT event is delayed for some
reason.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h | 2 +-
 net/bluetooth/l2cap_core.c    | 7 +++----
 net/bluetooth/smp.c           | 9 ++++++++-
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 2f766e3437ce..cf393e72d6ed 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -694,7 +694,7 @@ struct l2cap_conn {
 	struct sk_buff_head	pending_rx;
 	struct work_struct	pending_rx_work;
 
-	struct work_struct	id_addr_update_work;
+	struct delayed_work	id_addr_timer;
 
 	__u8			disc_reason;
 
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 55a7226233f9..5cc95fd17f7d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -745,7 +745,7 @@ EXPORT_SYMBOL_GPL(l2cap_chan_list);
 static void l2cap_conn_update_id_addr(struct work_struct *work)
 {
 	struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
-					       id_addr_update_work);
+					       id_addr_timer.work);
 	struct hci_conn *hcon = conn->hcon;
 	struct l2cap_chan *chan;
 
@@ -1907,8 +1907,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
 	if (work_pending(&conn->pending_rx_work))
 		cancel_work_sync(&conn->pending_rx_work);
 
-	if (work_pending(&conn->id_addr_update_work))
-		cancel_work_sync(&conn->id_addr_update_work);
+	cancel_delayed_work_sync(&conn->id_addr_timer);
 
 	l2cap_unregister_all_users(conn);
 
@@ -7874,7 +7873,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
 
 	skb_queue_head_init(&conn->pending_rx);
 	INIT_WORK(&conn->pending_rx_work, process_pending_rx);
-	INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr);
+	INIT_DELAYED_WORK(&conn->id_addr_timer, l2cap_conn_update_id_addr);
 
 	conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 70663229b3cc..f1a9fc0012f0 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -58,6 +58,8 @@
 
 #define SMP_TIMEOUT	msecs_to_jiffies(30000)
 
+#define ID_ADDR_TIMEOUT	msecs_to_jiffies(200)
+
 #define AUTH_REQ_MASK(dev)	(hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
 				 0x3f : 0x07)
 #define KEY_DIST_MASK		0x07
@@ -1067,7 +1069,12 @@ static void smp_notify_keys(struct l2cap_conn *conn)
 		if (hcon->type == LE_LINK) {
 			bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
 			hcon->dst_type = smp->remote_irk->addr_type;
-			queue_work(hdev->workqueue, &conn->id_addr_update_work);
+			/* Use a short delay to make sure the new address is
+			 * propagated _before_ the channels.
+			 */
+			queue_delayed_work(hdev->workqueue,
+					   &conn->id_addr_timer,
+					   ID_ADDR_TIMEOUT);
 		}
 	}
 
-- 
cgit v1.2.3


From 8194f1ef5a815aea815a91daf2c721eab2674f1f Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick <anarsoul@gmail.com>
Date: Tue, 7 Mar 2023 23:17:30 +0100
Subject: Bluetooth: Add new quirk for broken local ext features page 2

Some adapters (e.g. RTL8723CS) advertise that they have more than
2 pages for local ext features, but they don't support any features
declared in these pages. RTL8723CS reports max_page = 2 and declares
support for sync train and secure connection, but it responds with
either garbage or with error in status on corresponding commands.

Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Signed-off-by: Bastian Germann <bage@debian.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 7 +++++++
 net/bluetooth/hci_event.c   | 9 +++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 400f8a7d0c3f..997107bfc0b1 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -294,6 +294,13 @@ enum {
 	 * during the hdev->setup vendor callback.
 	 */
 	HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG,
+
+	/* When this quirk is set, max_page for local extended features
+	 * is set to 1, even if controller reports higher number. Some
+	 * controllers (e.g. RTL8723CS) report more pages, but they
+	 * don't actually support features declared there.
+	 */
+	HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
 };
 
 /* HCI device flags */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e87c928c9e17..51f13518dba9 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -886,8 +886,13 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
 	if (rp->status)
 		return rp->status;
 
-	if (hdev->max_page < rp->max_page)
-		hdev->max_page = rp->max_page;
+	if (hdev->max_page < rp->max_page) {
+		if (test_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
+			     &hdev->quirks))
+			bt_dev_warn(hdev, "broken local ext features page 2");
+		else
+			hdev->max_page = rp->max_page;
+	}
 
 	if (rp->page < HCI_MAX_PAGES)
 		memcpy(hdev->features[rp->page], rp->features, 8);
-- 
cgit v1.2.3


From 29f93a687f3c435e94d026ee1fc8ad18ce56b7fb Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Thu, 16 Mar 2023 22:52:11 +0530
Subject: serdev: Replace all instances of ENOTSUPP with EOPNOTSUPP

This replaces all instances of ENOTSUPP with EOPNOTSUPP since ENOTSUPP
is not a standard error code. This will help maintain consistency in
error codes when new serdev API's are added.

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/tty/serdev/core.c           | 6 +++---
 drivers/tty/serdev/serdev-ttyport.c | 4 ++--
 include/linux/serdev.h              | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 678014253b7b..078fe64625cb 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -366,7 +366,7 @@ int serdev_device_set_parity(struct serdev_device *serdev,
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->set_parity)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->set_parity(ctrl, parity);
 }
@@ -388,7 +388,7 @@ int serdev_device_get_tiocm(struct serdev_device *serdev)
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->get_tiocm)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->get_tiocm(ctrl);
 }
@@ -399,7 +399,7 @@ int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->set_tiocm)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->set_tiocm(ctrl, set, clear);
 }
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index d367803e2044..f26ff82723f1 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -231,7 +231,7 @@ static int ttyport_get_tiocm(struct serdev_controller *ctrl)
 	struct tty_struct *tty = serport->tty;
 
 	if (!tty->ops->tiocmget)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return tty->ops->tiocmget(tty);
 }
@@ -242,7 +242,7 @@ static int ttyport_set_tiocm(struct serdev_controller *ctrl, unsigned int set, u
 	struct tty_struct *tty = serport->tty;
 
 	if (!tty->ops->tiocmset)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return tty->ops->tiocmset(tty, set, clear);
 }
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 5f6bfe4f6d95..babedb13785b 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -250,11 +250,11 @@ static inline int serdev_device_write_buf(struct serdev_device *serdev,
 static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {}
 static inline int serdev_device_get_tiocm(struct serdev_device *serdev)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
 				      size_t count, unsigned long timeout)
-- 
cgit v1.2.3


From 8eaf839e4ac4feadf06e03eeff34059795450712 Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Thu, 16 Mar 2023 22:52:12 +0530
Subject: serdev: Add method to assert break signal over tty UART port

Adds serdev_device_break_ctl() and an implementation for ttyport.
This function simply calls the break_ctl in tty layer, which can
assert a break signal over UART-TX line, if the tty and the
underlying platform and UART peripheral supports this operation.

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/tty/serdev/core.c           | 11 +++++++++++
 drivers/tty/serdev/serdev-ttyport.c | 12 ++++++++++++
 include/linux/serdev.h              |  6 ++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 078fe64625cb..e7d663901c07 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -405,6 +405,17 @@ int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 }
 EXPORT_SYMBOL_GPL(serdev_device_set_tiocm);
 
+int serdev_device_break_ctl(struct serdev_device *serdev, int break_state)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->break_ctl)
+		return -EOPNOTSUPP;
+
+	return ctrl->ops->break_ctl(ctrl, break_state);
+}
+EXPORT_SYMBOL_GPL(serdev_device_break_ctl);
+
 static int serdev_drv_probe(struct device *dev)
 {
 	const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver);
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index f26ff82723f1..8033ef19669c 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -247,6 +247,17 @@ static int ttyport_set_tiocm(struct serdev_controller *ctrl, unsigned int set, u
 	return tty->ops->tiocmset(tty, set, clear);
 }
 
+static int ttyport_break_ctl(struct serdev_controller *ctrl, unsigned int break_state)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	if (!tty->ops->break_ctl)
+		return -EOPNOTSUPP;
+
+	return tty->ops->break_ctl(tty, break_state);
+}
+
 static const struct serdev_controller_ops ctrl_ops = {
 	.write_buf = ttyport_write_buf,
 	.write_flush = ttyport_write_flush,
@@ -259,6 +270,7 @@ static const struct serdev_controller_ops ctrl_ops = {
 	.wait_until_sent = ttyport_wait_until_sent,
 	.get_tiocm = ttyport_get_tiocm,
 	.set_tiocm = ttyport_set_tiocm,
+	.break_ctl = ttyport_break_ctl,
 };
 
 struct device *serdev_tty_port_register(struct tty_port *port,
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index babedb13785b..f5f97fa25e8a 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -93,6 +93,7 @@ struct serdev_controller_ops {
 	void (*wait_until_sent)(struct serdev_controller *, long);
 	int (*get_tiocm)(struct serdev_controller *);
 	int (*set_tiocm)(struct serdev_controller *, unsigned int, unsigned int);
+	int (*break_ctl)(struct serdev_controller *ctrl, unsigned int break_state);
 };
 
 /**
@@ -203,6 +204,7 @@ int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_
 void serdev_device_wait_until_sent(struct serdev_device *, long);
 int serdev_device_get_tiocm(struct serdev_device *);
 int serdev_device_set_tiocm(struct serdev_device *, int, int);
+int serdev_device_break_ctl(struct serdev_device *serdev, int break_state);
 void serdev_device_write_wakeup(struct serdev_device *);
 int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long);
 void serdev_device_write_flush(struct serdev_device *);
@@ -256,6 +258,10 @@ static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set,
 {
 	return -EOPNOTSUPP;
 }
+static inline int serdev_device_break_ctl(struct serdev_device *serdev, int break_state)
+{
+	return -EOPNOTSUPP;
+}
 static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
 				      size_t count, unsigned long timeout)
 {
-- 
cgit v1.2.3


From 9695ef876fd122cb7bbc04a4a93b8727d2e36bda Mon Sep 17 00:00:00 2001
From: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Date: Thu, 30 Mar 2023 09:58:23 -0700
Subject: Bluetooth: Add support for hci devcoredump

Add devcoredump APIs to hci core so that drivers only have to provide
the dump skbs instead of managing the synchronization and timeouts.

The devcoredump APIs should be used in the following manner:
 - hci_devcoredump_init is called to allocate the dump.
 - hci_devcoredump_append is called to append any skbs with dump data
   OR hci_devcoredump_append_pattern is called to insert a pattern.
 - hci_devcoredump_complete is called when all dump packets have been
   sent OR hci_devcoredump_abort is called to indicate an error and
   cancel an ongoing dump collection.

The high level APIs just prepare some skbs with the appropriate data and
queue it for the dump to process. Packets part of the crashdump can be
intercepted in the driver in interrupt context and forwarded directly to
the devcoredump APIs.

Internally, there are 5 states for the dump: idle, active, complete,
abort and timeout. A devcoredump will only be in active state after it
has been initialized. Once active, it accepts data to be appended,
patterns to be inserted (i.e. memset) and a completion event or an abort
event to generate a devcoredump. The timeout is initialized at the same
time the dump is initialized (defaulting to 10s) and will be cleared
either when the timeout occurs or the dump is complete or aborted.

Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/coredump.h | 116 +++++++++
 include/net/bluetooth/hci_core.h |  14 +
 net/bluetooth/Makefile           |   2 +
 net/bluetooth/coredump.c         | 535 +++++++++++++++++++++++++++++++++++++++
 net/bluetooth/hci_core.c         |   1 +
 net/bluetooth/hci_sync.c         |   2 +
 6 files changed, 670 insertions(+)
 create mode 100644 include/net/bluetooth/coredump.h
 create mode 100644 net/bluetooth/coredump.c

(limited to 'include')

diff --git a/include/net/bluetooth/coredump.h b/include/net/bluetooth/coredump.h
new file mode 100644
index 000000000000..72f51b587a04
--- /dev/null
+++ b/include/net/bluetooth/coredump.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Google Corporation
+ */
+
+#ifndef __COREDUMP_H
+#define __COREDUMP_H
+
+#define DEVCOREDUMP_TIMEOUT	msecs_to_jiffies(10000)	/* 10 sec */
+
+typedef void (*coredump_t)(struct hci_dev *hdev);
+typedef void (*dmp_hdr_t)(struct hci_dev *hdev, struct sk_buff *skb);
+typedef void (*notify_change_t)(struct hci_dev *hdev, int state);
+
+/* struct hci_devcoredump - Devcoredump state
+ *
+ * @supported: Indicates if FW dump collection is supported by driver
+ * @state: Current state of dump collection
+ * @timeout: Indicates a timeout for collecting the devcoredump
+ *
+ * @alloc_size: Total size of the dump
+ * @head: Start of the dump
+ * @tail: Pointer to current end of dump
+ * @end: head + alloc_size for easy comparisons
+ *
+ * @dump_q: Dump queue for state machine to process
+ * @dump_rx: Devcoredump state machine work
+ * @dump_timeout: Devcoredump timeout work
+ *
+ * @coredump: Called from the driver's .coredump() function.
+ * @dmp_hdr: Create a dump header to identify controller/fw/driver info
+ * @notify_change: Notify driver when devcoredump state has changed
+ */
+struct hci_devcoredump {
+	bool		supported;
+
+	enum devcoredump_state {
+		HCI_DEVCOREDUMP_IDLE,
+		HCI_DEVCOREDUMP_ACTIVE,
+		HCI_DEVCOREDUMP_DONE,
+		HCI_DEVCOREDUMP_ABORT,
+		HCI_DEVCOREDUMP_TIMEOUT,
+	} state;
+
+	unsigned long	timeout;
+
+	size_t		alloc_size;
+	char		*head;
+	char		*tail;
+	char		*end;
+
+	struct sk_buff_head	dump_q;
+	struct work_struct	dump_rx;
+	struct delayed_work	dump_timeout;
+
+	coredump_t		coredump;
+	dmp_hdr_t		dmp_hdr;
+	notify_change_t		notify_change;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+
+void hci_devcd_reset(struct hci_dev *hdev);
+void hci_devcd_rx(struct work_struct *work);
+void hci_devcd_timeout(struct work_struct *work);
+
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+		       dmp_hdr_t dmp_hdr, notify_change_t notify_change);
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size);
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb);
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len);
+int hci_devcd_complete(struct hci_dev *hdev);
+int hci_devcd_abort(struct hci_dev *hdev);
+
+#else
+
+static inline void hci_devcd_reset(struct hci_dev *hdev) {}
+static inline void hci_devcd_rx(struct work_struct *work) {}
+static inline void hci_devcd_timeout(struct work_struct *work) {}
+
+static inline int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+				     dmp_hdr_t dmp_hdr,
+				     notify_change_t notify_change)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_append_pattern(struct hci_dev *hdev,
+					   u8 pattern, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_complete(struct hci_dev *hdev)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_abort(struct hci_dev *hdev)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __COREDUMP_H */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index e22e45fbe8db..a461335beacf 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -32,6 +32,7 @@
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_sync.h>
 #include <net/bluetooth/hci_sock.h>
+#include <net/bluetooth/coredump.h>
 
 /* HCI priority */
 #define HCI_PRIO_MAX	7
@@ -590,6 +591,10 @@ struct hci_dev {
 	const char		*fw_info;
 	struct dentry		*debugfs;
 
+#ifdef CONFIG_DEV_COREDUMP
+	struct hci_devcoredump	dump;
+#endif
+
 	struct device		dev;
 
 	struct rfkill		*rfkill;
@@ -1497,6 +1502,15 @@ static inline void hci_set_aosp_capable(struct hci_dev *hdev)
 #endif
 }
 
+static inline void hci_devcd_setup(struct hci_dev *hdev)
+{
+#ifdef CONFIG_DEV_COREDUMP
+	INIT_WORK(&hdev->dump.dump_rx, hci_devcd_rx);
+	INIT_DELAYED_WORK(&hdev->dump.dump_timeout, hci_devcd_timeout);
+	skb_queue_head_init(&hdev->dump.dump_q);
+#endif
+}
+
 int hci_dev_open(__u16 dev);
 int hci_dev_close(__u16 dev);
 int hci_dev_do_close(struct hci_dev *hdev);
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 0e7b7db42750..141ac1fda0bf 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -17,6 +17,8 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
 	ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o hci_codec.o \
 	eir.o hci_sync.o
 
+bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
+
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
 bluetooth-$(CONFIG_BT_LE) += iso.o
 bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c
new file mode 100644
index 000000000000..08fa98505454
--- /dev/null
+++ b/net/bluetooth/coredump.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Google Corporation
+ */
+
+#include <linux/devcoredump.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+enum hci_devcoredump_pkt_type {
+	HCI_DEVCOREDUMP_PKT_INIT,
+	HCI_DEVCOREDUMP_PKT_SKB,
+	HCI_DEVCOREDUMP_PKT_PATTERN,
+	HCI_DEVCOREDUMP_PKT_COMPLETE,
+	HCI_DEVCOREDUMP_PKT_ABORT,
+};
+
+struct hci_devcoredump_skb_cb {
+	u16 pkt_type;
+};
+
+struct hci_devcoredump_skb_pattern {
+	u8 pattern;
+	u32 len;
+} __packed;
+
+#define hci_dmp_cb(skb)	((struct hci_devcoredump_skb_cb *)((skb)->cb))
+
+#define DBG_UNEXPECTED_STATE() \
+	bt_dev_dbg(hdev, \
+		   "Unexpected packet (%d) for state (%d). ", \
+		   hci_dmp_cb(skb)->pkt_type, hdev->dump.state)
+
+#define MAX_DEVCOREDUMP_HDR_SIZE	512	/* bytes */
+
+static int hci_devcd_update_hdr_state(char *buf, size_t size, int state)
+{
+	int len = 0;
+
+	if (!buf)
+		return 0;
+
+	len = scnprintf(buf, size, "Bluetooth devcoredump\nState: %d\n", state);
+
+	return len + 1; /* scnprintf adds \0 at the end upon state rewrite */
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_update_state(struct hci_dev *hdev, int state)
+{
+	bt_dev_dbg(hdev, "Updating devcoredump state from %d to %d.",
+		   hdev->dump.state, state);
+
+	hdev->dump.state = state;
+
+	return hci_devcd_update_hdr_state(hdev->dump.head,
+					  hdev->dump.alloc_size, state);
+}
+
+static int hci_devcd_mkheader(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	char dump_start[] = "--- Start dump ---\n";
+	char hdr[80];
+	int hdr_len;
+
+	hdr_len = hci_devcd_update_hdr_state(hdr, sizeof(hdr),
+					     HCI_DEVCOREDUMP_IDLE);
+	skb_put_data(skb, hdr, hdr_len);
+
+	if (hdev->dump.dmp_hdr)
+		hdev->dump.dmp_hdr(hdev, skb);
+
+	skb_put_data(skb, dump_start, strlen(dump_start));
+
+	return skb->len;
+}
+
+/* Do not call with hci_dev_lock since this calls driver code. */
+static void hci_devcd_notify(struct hci_dev *hdev, int state)
+{
+	if (hdev->dump.notify_change)
+		hdev->dump.notify_change(hdev, state);
+}
+
+/* Call with hci_dev_lock only. */
+void hci_devcd_reset(struct hci_dev *hdev)
+{
+	hdev->dump.head = NULL;
+	hdev->dump.tail = NULL;
+	hdev->dump.alloc_size = 0;
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+	cancel_delayed_work(&hdev->dump.dump_timeout);
+	skb_queue_purge(&hdev->dump.dump_q);
+}
+
+/* Call with hci_dev_lock only. */
+static void hci_devcd_free(struct hci_dev *hdev)
+{
+	if (hdev->dump.head)
+		vfree(hdev->dump.head);
+
+	hci_devcd_reset(hdev);
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_alloc(struct hci_dev *hdev, u32 size)
+{
+	hdev->dump.head = vmalloc(size);
+	if (!hdev->dump.head)
+		return -ENOMEM;
+
+	hdev->dump.alloc_size = size;
+	hdev->dump.tail = hdev->dump.head;
+	hdev->dump.end = hdev->dump.head + size;
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+	return 0;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_copy(struct hci_dev *hdev, char *buf, u32 size)
+{
+	if (hdev->dump.tail + size > hdev->dump.end)
+		return false;
+
+	memcpy(hdev->dump.tail, buf, size);
+	hdev->dump.tail += size;
+
+	return true;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_memset(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	if (hdev->dump.tail + len > hdev->dump.end)
+		return false;
+
+	memset(hdev->dump.tail, pattern, len);
+	hdev->dump.tail += len;
+
+	return true;
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_prepare(struct hci_dev *hdev, u32 dump_size)
+{
+	struct sk_buff *skb;
+	int dump_hdr_size;
+	int err = 0;
+
+	skb = alloc_skb(MAX_DEVCOREDUMP_HDR_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	dump_hdr_size = hci_devcd_mkheader(hdev, skb);
+
+	if (hci_devcd_alloc(hdev, dump_hdr_size + dump_size)) {
+		err = -ENOMEM;
+		goto hdr_free;
+	}
+
+	/* Insert the device header */
+	if (!hci_devcd_copy(hdev, skb->data, skb->len)) {
+		bt_dev_err(hdev, "Failed to insert header");
+		hci_devcd_free(hdev);
+
+		err = -ENOMEM;
+		goto hdr_free;
+	}
+
+hdr_free:
+	kfree_skb(skb);
+
+	return err;
+}
+
+static void hci_devcd_handle_pkt_init(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	u32 *dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_IDLE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (skb->len != sizeof(*dump_size)) {
+		bt_dev_dbg(hdev, "Invalid dump init pkt");
+		return;
+	}
+
+	dump_size = skb_pull_data(skb, sizeof(*dump_size));
+	if (!*dump_size) {
+		bt_dev_err(hdev, "Zero size dump init pkt");
+		return;
+	}
+
+	if (hci_devcd_prepare(hdev, *dump_size)) {
+		bt_dev_err(hdev, "Failed to prepare for dump");
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ACTIVE);
+	queue_delayed_work(hdev->workqueue, &hdev->dump.dump_timeout,
+			   hdev->dump.timeout);
+}
+
+static void hci_devcd_handle_pkt_skb(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (!hci_devcd_copy(hdev, skb->data, skb->len))
+		bt_dev_dbg(hdev, "Failed to insert skb");
+}
+
+static void hci_devcd_handle_pkt_pattern(struct hci_dev *hdev,
+					 struct sk_buff *skb)
+{
+	struct hci_devcoredump_skb_pattern *pattern;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (skb->len != sizeof(*pattern)) {
+		bt_dev_dbg(hdev, "Invalid pattern skb");
+		return;
+	}
+
+	pattern = skb_pull_data(skb, sizeof(*pattern));
+
+	if (!hci_devcd_memset(hdev, pattern->pattern, pattern->len))
+		bt_dev_dbg(hdev, "Failed to set pattern");
+}
+
+static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev,
+					  struct sk_buff *skb)
+{
+	u32 dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_DONE);
+	dump_size = hdev->dump.tail - hdev->dump.head;
+
+	bt_dev_dbg(hdev, "complete with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+}
+
+static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev,
+				       struct sk_buff *skb)
+{
+	u32 dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ABORT);
+	dump_size = hdev->dump.tail - hdev->dump.head;
+
+	bt_dev_dbg(hdev, "aborted with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	/* Emit a devcoredump with the available data */
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+}
+
+/* Bluetooth devcoredump state machine.
+ *
+ * Devcoredump states:
+ *
+ *      HCI_DEVCOREDUMP_IDLE: The default state.
+ *
+ *      HCI_DEVCOREDUMP_ACTIVE: A devcoredump will be in this state once it has
+ *              been initialized using hci_devcd_init(). Once active, the driver
+ *              can append data using hci_devcd_append() or insert a pattern
+ *              using hci_devcd_append_pattern().
+ *
+ *      HCI_DEVCOREDUMP_DONE: Once the dump collection is complete, the drive
+ *              can signal the completion using hci_devcd_complete(). A
+ *              devcoredump is generated indicating the completion event and
+ *              then the state machine is reset to the default state.
+ *
+ *      HCI_DEVCOREDUMP_ABORT: The driver can cancel ongoing dump collection in
+ *              case of any error using hci_devcd_abort(). A devcoredump is
+ *              still generated with the available data indicating the abort
+ *              event and then the state machine is reset to the default state.
+ *
+ *      HCI_DEVCOREDUMP_TIMEOUT: A timeout timer for HCI_DEVCOREDUMP_TIMEOUT sec
+ *              is started during devcoredump initialization. Once the timeout
+ *              occurs, the driver is notified, a devcoredump is generated with
+ *              the available data indicating the timeout event and then the
+ *              state machine is reset to the default state.
+ *
+ * The driver must register using hci_devcd_register() before using the hci
+ * devcoredump APIs.
+ */
+void hci_devcd_rx(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, dump.dump_rx);
+	struct sk_buff *skb;
+	int start_state;
+
+	while ((skb = skb_dequeue(&hdev->dump.dump_q))) {
+		/* Return if timeout occurs. The timeout handler function
+		 * hci_devcd_timeout() will report the available dump data.
+		 */
+		if (hdev->dump.state == HCI_DEVCOREDUMP_TIMEOUT) {
+			kfree_skb(skb);
+			return;
+		}
+
+		hci_dev_lock(hdev);
+		start_state = hdev->dump.state;
+
+		switch (hci_dmp_cb(skb)->pkt_type) {
+		case HCI_DEVCOREDUMP_PKT_INIT:
+			hci_devcd_handle_pkt_init(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_SKB:
+			hci_devcd_handle_pkt_skb(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_PATTERN:
+			hci_devcd_handle_pkt_pattern(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_COMPLETE:
+			hci_devcd_handle_pkt_complete(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_ABORT:
+			hci_devcd_handle_pkt_abort(hdev, skb);
+			break;
+
+		default:
+			bt_dev_dbg(hdev, "Unknown packet (%d) for state (%d). ",
+				   hci_dmp_cb(skb)->pkt_type, hdev->dump.state);
+			break;
+		}
+
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+
+		/* Notify the driver about any state changes before resetting
+		 * the state machine
+		 */
+		if (start_state != hdev->dump.state)
+			hci_devcd_notify(hdev, hdev->dump.state);
+
+		/* Reset the state machine if the devcoredump is complete */
+		hci_dev_lock(hdev);
+		if (hdev->dump.state == HCI_DEVCOREDUMP_DONE ||
+		    hdev->dump.state == HCI_DEVCOREDUMP_ABORT)
+			hci_devcd_reset(hdev);
+		hci_dev_unlock(hdev);
+	}
+}
+EXPORT_SYMBOL(hci_devcd_rx);
+
+void hci_devcd_timeout(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    dump.dump_timeout.work);
+	u32 dump_size;
+
+	hci_devcd_notify(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+	hci_dev_lock(hdev);
+
+	cancel_work(&hdev->dump.dump_rx);
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+	dump_size = hdev->dump.tail - hdev->dump.head;
+	bt_dev_dbg(hdev, "timeout with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	/* Emit a devcoredump with the available data */
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+
+	hci_devcd_reset(hdev);
+
+	hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(hci_devcd_timeout);
+
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+		       dmp_hdr_t dmp_hdr, notify_change_t notify_change)
+{
+	/* Driver must implement coredump() and dmp_hdr() functions for
+	 * bluetooth devcoredump. The coredump() should trigger a coredump
+	 * event on the controller when the device's coredump sysfs entry is
+	 * written to. The dmp_hdr() should create a dump header to identify
+	 * the controller/fw/driver info.
+	 */
+	if (!coredump || !dmp_hdr)
+		return -EINVAL;
+
+	hci_dev_lock(hdev);
+	hdev->dump.coredump = coredump;
+	hdev->dump.dmp_hdr = dmp_hdr;
+	hdev->dump.notify_change = notify_change;
+	hdev->dump.supported = true;
+	hdev->dump.timeout = DEVCOREDUMP_TIMEOUT;
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_register);
+
+static inline bool hci_devcd_enabled(struct hci_dev *hdev)
+{
+	return hdev->dump.supported;
+}
+
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(sizeof(dump_size), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_INIT;
+	skb_put_data(skb, &dump_size, sizeof(dump_size));
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_init);
+
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	if (!skb)
+		return -ENOMEM;
+
+	if (!hci_devcd_enabled(hdev)) {
+		kfree_skb(skb);
+		return -EOPNOTSUPP;
+	}
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_SKB;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append);
+
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	struct hci_devcoredump_skb_pattern p;
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(sizeof(p), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	p.pattern = pattern;
+	p.len = len;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_PATTERN;
+	skb_put_data(skb, &p, sizeof(p));
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append_pattern);
+
+int hci_devcd_complete(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_COMPLETE;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_complete);
+
+int hci_devcd_abort(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_ABORT;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_abort);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 334e308451f5..393b317ae68f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2544,6 +2544,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
 	INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout);
 
+	hci_devcd_setup(hdev);
 	hci_request_setup(hdev);
 
 	hci_init_sysfs(hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 31231f0e4a28..f21497ebc814 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4727,6 +4727,8 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 		goto done;
 	}
 
+	hci_devcd_reset(hdev);
+
 	set_bit(HCI_RUNNING, &hdev->flags);
 	hci_sock_dev_event(hdev, HCI_DEV_OPEN);
 
-- 
cgit v1.2.3


From 0fe8c8d071343fa9278980ce4b6f8e6ea24a2ed1 Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Fri, 31 Mar 2023 18:38:01 +0300
Subject: Bluetooth: Split bt_iso_qos into dedicated structures

Split bt_iso_qos into dedicated unicast and broadcast
structures and add additional broadcast parameters.

Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections")
Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  43 +++++++---
 include/net/bluetooth/hci_core.h  |   9 ++-
 net/bluetooth/hci_conn.c          | 162 ++++++++++++++++++++------------------
 net/bluetooth/hci_event.c         |  33 ++++----
 net/bluetooth/iso.c               | 125 +++++++++++++++++++++++------
 5 files changed, 237 insertions(+), 135 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index bcc5a4cd2c17..1b4230cd42a3 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -171,23 +172,39 @@ struct bt_iso_io_qos {
 	__u8  rtn;
 };
 
-struct bt_iso_qos {
-	union {
-		__u8  cig;
-		__u8  big;
-	};
-	union {
-		__u8  cis;
-		__u8  bis;
-	};
-	union {
-		__u8  sca;
-		__u8  sync_interval;
-	};
+struct bt_iso_ucast_qos {
+	__u8  cig;
+	__u8  cis;
+	__u8  sca;
+	__u8  packing;
+	__u8  framing;
+	struct bt_iso_io_qos in;
+	struct bt_iso_io_qos out;
+};
+
+struct bt_iso_bcast_qos {
+	__u8  big;
+	__u8  bis;
+	__u8  sync_interval;
 	__u8  packing;
 	__u8  framing;
 	struct bt_iso_io_qos in;
 	struct bt_iso_io_qos out;
+	__u8  encryption;
+	__u8  bcode[16];
+	__u8  options;
+	__u16 skip;
+	__u16 sync_timeout;
+	__u8  sync_cte_type;
+	__u8  mse;
+	__u16 timeout;
+};
+
+struct bt_iso_qos {
+	union {
+		struct bt_iso_ucast_qos ucast;
+		struct bt_iso_bcast_qos bcast;
+	};
 };
 
 #define BT_ISO_PHY_1M		0x01
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a461335beacf..f11689284112 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -1096,7 +1097,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
 		if (bacmp(&c->dst, ba) || c->type != ISO_LINK)
 			continue;
 
-		if (c->iso_qos.big == big && c->iso_qos.bis == bis) {
+		if (c->iso_qos.bcast.big == big && c->iso_qos.bcast.bis == bis) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1205,7 +1206,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
 		if (c->type != ISO_LINK)
 			continue;
 
-		if (handle == c->iso_qos.cig) {
+		if (handle == c->iso_qos.ucast.cig) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1228,7 +1229,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
 		if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK)
 			continue;
 
-		if (handle == c->iso_qos.big) {
+		if (handle == c->iso_qos.bcast.big) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1337,7 +1338,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos,
 				 __u8 data_len, __u8 *data);
 int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid);
+		       __u8 sid, struct bt_iso_qos *qos);
 int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
 			   __u16 sync_handle, __u8 num_bis, __u8 bis[]);
 int hci_conn_check_link_mode(struct hci_conn *conn);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 8455ba141ee6..5672b4924572 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -795,8 +796,8 @@ static void bis_list(struct hci_conn *conn, void *data)
 	if (bacmp(&conn->dst, BDADDR_ANY))
 		return;
 
-	if (d->big != conn->iso_qos.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
-	    d->bis != conn->iso_qos.bis)
+	if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
+	    d->bis != conn->iso_qos.bcast.bis)
 		return;
 
 	d->count++;
@@ -916,10 +917,10 @@ static void bis_cleanup(struct hci_conn *conn)
 		if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags))
 			return;
 
-		hci_le_terminate_big(hdev, conn->iso_qos.big,
-				     conn->iso_qos.bis);
+		hci_le_terminate_big(hdev, conn->iso_qos.bcast.big,
+				     conn->iso_qos.bcast.bis);
 	} else {
-		hci_le_big_terminate(hdev, conn->iso_qos.big,
+		hci_le_big_terminate(hdev, conn->iso_qos.bcast.big,
 				     conn->sync_handle);
 	}
 }
@@ -959,7 +960,7 @@ static void cis_cleanup(struct hci_conn *conn)
 	struct iso_list_data d;
 
 	memset(&d, 0, sizeof(d));
-	d.cig = conn->iso_qos.cig;
+	d.cig = conn->iso_qos.ucast.cig;
 
 	/* Check if ISO connection is a CIS and remove CIG if there are
 	 * no other connections using it.
@@ -968,7 +969,7 @@ static void cis_cleanup(struct hci_conn *conn)
 	if (d.count)
 		return;
 
-	hci_le_remove_cig(hdev, conn->iso_qos.cig);
+	hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig);
 }
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
@@ -1411,7 +1412,7 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
 	struct iso_list_data data;
 
 	/* Allocate a BIG if not set */
-	if (qos->big == BT_ISO_QOS_BIG_UNSET) {
+	if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) {
 		for (data.big = 0x00; data.big < 0xef; data.big++) {
 			data.count = 0;
 			data.bis = 0xff;
@@ -1426,7 +1427,7 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
 			return -EADDRNOTAVAIL;
 
 		/* Update BIG */
-		qos->big = data.big;
+		qos->bcast.big = data.big;
 	}
 
 	return 0;
@@ -1437,7 +1438,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
 	struct iso_list_data data;
 
 	/* Allocate BIS if not set */
-	if (qos->bis == BT_ISO_QOS_BIS_UNSET) {
+	if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) {
 		/* Find an unused adv set to advertise BIS, skip instance 0x00
 		 * since it is reserved as general purpose set.
 		 */
@@ -1455,7 +1456,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
 			return -EADDRNOTAVAIL;
 
 		/* Update BIS */
-		qos->bis = data.bis;
+		qos->bcast.bis = data.bis;
 	}
 
 	return 0;
@@ -1484,8 +1485,8 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (err)
 		return ERR_PTR(err);
 
-	data.big = qos->big;
-	data.bis = qos->bis;
+	data.big = qos->bcast.big;
+	data.bis = qos->bcast.bis;
 	data.count = 0;
 
 	/* Check if there is already a matching BIG/BIS */
@@ -1493,7 +1494,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (data.count)
 		return ERR_PTR(-EADDRINUSE);
 
-	conn = hci_conn_hash_lookup_bis(hdev, dst, qos->big, qos->bis);
+	conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, qos->bcast.bis);
 	if (conn)
 		return ERR_PTR(-EADDRINUSE);
 
@@ -1648,13 +1649,13 @@ static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos)
 {
 	struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis];
 
-	cis->cis_id = qos->cis;
-	cis->c_sdu  = cpu_to_le16(qos->out.sdu);
-	cis->p_sdu  = cpu_to_le16(qos->in.sdu);
-	cis->c_phy  = qos->out.phy ? qos->out.phy : qos->in.phy;
-	cis->p_phy  = qos->in.phy ? qos->in.phy : qos->out.phy;
-	cis->c_rtn  = qos->out.rtn;
-	cis->p_rtn  = qos->in.rtn;
+	cis->cis_id = qos->ucast.cis;
+	cis->c_sdu  = cpu_to_le16(qos->ucast.out.sdu);
+	cis->p_sdu  = cpu_to_le16(qos->ucast.in.sdu);
+	cis->c_phy  = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy;
+	cis->p_phy  = qos->ucast.in.phy ? qos->ucast.in.phy : qos->ucast.out.phy;
+	cis->c_rtn  = qos->ucast.out.rtn;
+	cis->p_rtn  = qos->ucast.in.rtn;
 
 	d->pdu.cp.num_cis++;
 }
@@ -1667,8 +1668,8 @@ static void cis_list(struct hci_conn *conn, void *data)
 	if (!bacmp(&conn->dst, BDADDR_ANY))
 		return;
 
-	if (d->cig != conn->iso_qos.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
-	    d->cis != conn->iso_qos.cis)
+	if (d->cig != conn->iso_qos.ucast.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
+	    d->cis != conn->iso_qos.ucast.cis)
 		return;
 
 	d->count++;
@@ -1687,17 +1688,18 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
 
 	memset(&cp, 0, sizeof(cp));
 
-	cp.handle = qos->big;
-	cp.adv_handle = qos->bis;
+	cp.handle = qos->bcast.big;
+	cp.adv_handle = qos->bcast.bis;
 	cp.num_bis  = 0x01;
-	hci_cpu_to_le24(qos->out.interval, cp.bis.sdu_interval);
-	cp.bis.sdu = cpu_to_le16(qos->out.sdu);
-	cp.bis.latency =  cpu_to_le16(qos->out.latency);
-	cp.bis.rtn  = qos->out.rtn;
-	cp.bis.phy  = qos->out.phy;
-	cp.bis.packing = qos->packing;
-	cp.bis.framing = qos->framing;
-	cp.bis.encryption = 0x00;
+	hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval);
+	cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu);
+	cp.bis.latency =  cpu_to_le16(qos->bcast.out.latency);
+	cp.bis.rtn  = qos->bcast.out.rtn;
+	cp.bis.phy  = qos->bcast.out.phy;
+	cp.bis.packing = qos->bcast.packing;
+	cp.bis.framing = qos->bcast.framing;
+	cp.bis.encryption = qos->bcast.encryption;
+	memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode));
 	memset(&cp.bis.bcode, 0, sizeof(cp.bis.bcode));
 
 	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp);
@@ -1711,7 +1713,7 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 	memset(&data, 0, sizeof(data));
 
 	/* Allocate a CIG if not set */
-	if (qos->cig == BT_ISO_QOS_CIG_UNSET) {
+	if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) {
 		for (data.cig = 0x00; data.cig < 0xff; data.cig++) {
 			data.count = 0;
 			data.cis = 0xff;
@@ -1731,22 +1733,22 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 			return false;
 
 		/* Update CIG */
-		qos->cig = data.cig;
+		qos->ucast.cig = data.cig;
 	}
 
-	data.pdu.cp.cig_id = qos->cig;
-	hci_cpu_to_le24(qos->out.interval, data.pdu.cp.c_interval);
-	hci_cpu_to_le24(qos->in.interval, data.pdu.cp.p_interval);
-	data.pdu.cp.sca = qos->sca;
-	data.pdu.cp.packing = qos->packing;
-	data.pdu.cp.framing = qos->framing;
-	data.pdu.cp.c_latency = cpu_to_le16(qos->out.latency);
-	data.pdu.cp.p_latency = cpu_to_le16(qos->in.latency);
+	data.pdu.cp.cig_id = qos->ucast.cig;
+	hci_cpu_to_le24(qos->ucast.out.interval, data.pdu.cp.c_interval);
+	hci_cpu_to_le24(qos->ucast.in.interval, data.pdu.cp.p_interval);
+	data.pdu.cp.sca = qos->ucast.sca;
+	data.pdu.cp.packing = qos->ucast.packing;
+	data.pdu.cp.framing = qos->ucast.framing;
+	data.pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency);
+	data.pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency);
 
-	if (qos->cis != BT_ISO_QOS_CIS_UNSET) {
+	if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) {
 		data.count = 0;
-		data.cig = qos->cig;
-		data.cis = qos->cis;
+		data.cig = qos->ucast.cig;
+		data.cis = qos->ucast.cis;
 
 		hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND,
 					 &data);
@@ -1757,7 +1759,7 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 	}
 
 	/* Reprogram all CIS(s) with the same CIG */
-	for (data.cig = qos->cig, data.cis = 0x00; data.cis < 0x11;
+	for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0x11;
 	     data.cis++) {
 		data.count = 0;
 
@@ -1767,14 +1769,14 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 			continue;
 
 		/* Allocate a CIS if not set */
-		if (qos->cis == BT_ISO_QOS_CIS_UNSET) {
+		if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) {
 			/* Update CIS */
-			qos->cis = data.cis;
+			qos->ucast.cis = data.cis;
 			cis_add(&data, qos);
 		}
 	}
 
-	if (qos->cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis)
+	if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis)
 		return false;
 
 	if (hci_send_cmd(hdev, HCI_OP_LE_SET_CIG_PARAMS,
@@ -1809,32 +1811,32 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 		return cis;
 
 	/* Update LINK PHYs according to QoS preference */
-	cis->le_tx_phy = qos->out.phy;
-	cis->le_rx_phy = qos->in.phy;
+	cis->le_tx_phy = qos->ucast.out.phy;
+	cis->le_rx_phy = qos->ucast.in.phy;
 
 	/* If output interval is not set use the input interval as it cannot be
 	 * 0x000000.
 	 */
-	if (!qos->out.interval)
-		qos->out.interval = qos->in.interval;
+	if (!qos->ucast.out.interval)
+		qos->ucast.out.interval = qos->ucast.in.interval;
 
 	/* If input interval is not set use the output interval as it cannot be
 	 * 0x000000.
 	 */
-	if (!qos->in.interval)
-		qos->in.interval = qos->out.interval;
+	if (!qos->ucast.in.interval)
+		qos->ucast.in.interval = qos->ucast.out.interval;
 
 	/* If output latency is not set use the input latency as it cannot be
 	 * 0x0000.
 	 */
-	if (!qos->out.latency)
-		qos->out.latency = qos->in.latency;
+	if (!qos->ucast.out.latency)
+		qos->ucast.out.latency = qos->ucast.in.latency;
 
 	/* If input latency is not set use the output latency as it cannot be
 	 * 0x0000.
 	 */
-	if (!qos->in.latency)
-		qos->in.latency = qos->out.latency;
+	if (!qos->ucast.in.latency)
+		qos->ucast.in.latency = qos->ucast.out.latency;
 
 	if (!hci_le_set_cig_params(cis, qos)) {
 		hci_conn_drop(cis);
@@ -1854,7 +1856,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 
 	memset(&cmd, 0, sizeof(cmd));
 
-	if (conn->iso_qos.out.sdu) {
+	if (conn->iso_qos.ucast.out.sdu) {
 		cmd.handle = cpu_to_le16(conn->handle);
 		cmd.direction = 0x00; /* Input (Host to Controller) */
 		cmd.path = 0x00; /* HCI path if enabled */
@@ -1865,7 +1867,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 			return false;
 	}
 
-	if (conn->iso_qos.in.sdu) {
+	if (conn->iso_qos.ucast.in.sdu) {
 		cmd.handle = cpu_to_le16(conn->handle);
 		cmd.direction = 0x01; /* Output (Controller to Host) */
 		cmd.path = 0x00; /* HCI path if enabled */
@@ -1892,7 +1894,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 	cmd.cis[0].acl_handle = cpu_to_le16(conn->link->handle);
 	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
 	cmd.cp.num_cis++;
-	cig = conn->iso_qos.cig;
+	cig = conn->iso_qos.ucast.cig;
 
 	hci_dev_lock(hdev);
 
@@ -1902,7 +1904,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
 
 		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED || conn->iso_qos.cig != cig)
+		    conn->state == BT_CONNECTED || conn->iso_qos.ucast.cig != cig)
 			continue;
 
 		/* Check if all CIS(s) belonging to a CIG are ready */
@@ -2002,8 +2004,8 @@ static void hci_bind_bis(struct hci_conn *conn,
 			 struct bt_iso_qos *qos)
 {
 	/* Update LINK PHYs according to QoS preference */
-	conn->le_tx_phy = qos->out.phy;
-	conn->le_tx_phy = qos->out.phy;
+	conn->le_tx_phy = qos->bcast.out.phy;
+	conn->le_tx_phy = qos->bcast.out.phy;
 	conn->iso_qos = *qos;
 	conn->state = BT_BOUND;
 }
@@ -2016,16 +2018,16 @@ static int create_big_sync(struct hci_dev *hdev, void *data)
 	u32 flags = 0;
 	int err;
 
-	if (qos->out.phy == 0x02)
+	if (qos->bcast.out.phy == 0x02)
 		flags |= MGMT_ADV_FLAG_SEC_2M;
 
 	/* Align intervals */
-	interval = qos->out.interval / 1250;
+	interval = qos->bcast.out.interval / 1250;
 
-	if (qos->bis)
-		sync_interval = qos->sync_interval * 1600;
+	if (qos->bcast.bis)
+		sync_interval = qos->bcast.sync_interval * 1600;
 
-	err = hci_start_per_adv_sync(hdev, qos->bis, conn->le_per_adv_data_len,
+	err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->le_per_adv_data_len,
 				     conn->le_per_adv_data, flags, interval,
 				     interval, sync_interval);
 	if (err)
@@ -2062,7 +2064,7 @@ static int create_pa_sync(struct hci_dev *hdev, void *data)
 }
 
 int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid)
+		       __u8 sid, struct bt_iso_qos *qos)
 {
 	struct hci_cp_le_pa_create_sync *cp;
 
@@ -2075,9 +2077,13 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
 		return -ENOMEM;
 	}
 
+	cp->options = qos->bcast.options;
 	cp->sid = sid;
 	cp->addr_type = dst_type;
 	bacpy(&cp->addr, dst);
+	cp->skip = cpu_to_le16(qos->bcast.skip);
+	cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+	cp->sync_cte_type = qos->bcast.sync_cte_type;
 
 	/* Queue start pa_create_sync and scan */
 	return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete);
@@ -2100,8 +2106,12 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
 		return err;
 
 	memset(&pdu, 0, sizeof(pdu));
-	pdu.cp.handle = qos->big;
+	pdu.cp.handle = qos->bcast.big;
 	pdu.cp.sync_handle = cpu_to_le16(sync_handle);
+	pdu.cp.encryption = qos->bcast.encryption;
+	memcpy(pdu.cp.bcode, qos->bcast.bcode, sizeof(pdu.cp.bcode));
+	pdu.cp.mse = qos->bcast.mse;
+	pdu.cp.timeout = cpu_to_le16(qos->bcast.timeout);
 	pdu.cp.num_bis = num_bis;
 	memcpy(pdu.bis, bis, num_bis);
 
@@ -2151,7 +2161,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 		return ERR_PTR(err);
 	}
 
-	hci_iso_qos_setup(hdev, conn, &qos->out,
+	hci_iso_qos_setup(hdev, conn, &qos->bcast.out,
 			  conn->le_tx_phy ? conn->le_tx_phy :
 			  hdev->le_tx_def_phys);
 
@@ -2177,9 +2187,9 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (IS_ERR(le))
 		return le;
 
-	hci_iso_qos_setup(hdev, le, &qos->out,
+	hci_iso_qos_setup(hdev, le, &qos->ucast.out,
 			  le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
-	hci_iso_qos_setup(hdev, le, &qos->in,
+	hci_iso_qos_setup(hdev, le, &qos->ucast.in,
 			  le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
 
 	cis = hci_bind_cis(hdev, dst, dst_type, qos);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 51f13518dba9..0e0a93cc1218 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -3833,7 +3834,7 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		if (conn->type != ISO_LINK || conn->iso_qos.cig != rp->cig_id ||
+		if (conn->type != ISO_LINK || conn->iso_qos.ucast.cig != rp->cig_id ||
 		    conn->state == BT_CONNECTED)
 			continue;
 
@@ -3890,7 +3891,7 @@ static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data,
 	/* Input (Host to Controller) */
 	case 0x00:
 		/* Only confirm connection if output only */
-		if (conn->iso_qos.out.sdu && !conn->iso_qos.in.sdu)
+		if (conn->iso_qos.ucast.out.sdu && !conn->iso_qos.ucast.in.sdu)
 			hci_connect_cfm(conn, rp->status);
 		break;
 	/* Output (Controller to Host) */
@@ -6818,15 +6819,15 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
 		memset(&interval, 0, sizeof(interval));
 
 		memcpy(&interval, ev->c_latency, sizeof(ev->c_latency));
-		conn->iso_qos.in.interval = le32_to_cpu(interval);
+		conn->iso_qos.ucast.in.interval = le32_to_cpu(interval);
 		memcpy(&interval, ev->p_latency, sizeof(ev->p_latency));
-		conn->iso_qos.out.interval = le32_to_cpu(interval);
-		conn->iso_qos.in.latency = le16_to_cpu(ev->interval);
-		conn->iso_qos.out.latency = le16_to_cpu(ev->interval);
-		conn->iso_qos.in.sdu = le16_to_cpu(ev->c_mtu);
-		conn->iso_qos.out.sdu = le16_to_cpu(ev->p_mtu);
-		conn->iso_qos.in.phy = ev->c_phy;
-		conn->iso_qos.out.phy = ev->p_phy;
+		conn->iso_qos.ucast.out.interval = le32_to_cpu(interval);
+		conn->iso_qos.ucast.in.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.ucast.out.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.ucast.in.sdu = le16_to_cpu(ev->c_mtu);
+		conn->iso_qos.ucast.out.sdu = le16_to_cpu(ev->p_mtu);
+		conn->iso_qos.ucast.in.phy = ev->c_phy;
+		conn->iso_qos.ucast.out.phy = ev->p_phy;
 	}
 
 	if (!ev->status) {
@@ -6900,8 +6901,8 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
 		cis->handle = cis_handle;
 	}
 
-	cis->iso_qos.cig = ev->cig_id;
-	cis->iso_qos.cis = ev->cis_id;
+	cis->iso_qos.ucast.cig = ev->cig_id;
+	cis->iso_qos.ucast.cis = ev->cis_id;
 
 	if (!(flags & HCI_PROTO_DEFER)) {
 		hci_le_accept_cis(hdev, ev->cis_handle);
@@ -6988,13 +6989,13 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 			bis->handle = handle;
 		}
 
-		bis->iso_qos.big = ev->handle;
+		bis->iso_qos.bcast.big = ev->handle;
 		memset(&interval, 0, sizeof(interval));
 		memcpy(&interval, ev->latency, sizeof(ev->latency));
-		bis->iso_qos.in.interval = le32_to_cpu(interval);
+		bis->iso_qos.bcast.in.interval = le32_to_cpu(interval);
 		/* Convert ISO Interval (1.25 ms slots) to latency (ms) */
-		bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
-		bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu);
+		bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
+		bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu);
 
 		hci_iso_setup_path(bis);
 	}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 8d136a730163..74117df03a3f 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -3,6 +3,7 @@
  * BlueZ - Bluetooth protocol stack for Linux
  *
  * Copyright (C) 2022 Intel Corporation
+ * Copyright 2023 NXP
  */
 
 #include <linux/module.h>
@@ -59,11 +60,17 @@ struct iso_pinfo {
 	__u16			sync_handle;
 	__u32			flags;
 	struct bt_iso_qos	qos;
+	bool			qos_user_set;
 	__u8			base_len;
 	__u8			base[BASE_MAX_LENGTH];
 	struct iso_conn		*conn;
 };
 
+static struct bt_iso_qos default_qos;
+
+static bool check_ucast_qos(struct bt_iso_qos *qos);
+static bool check_bcast_qos(struct bt_iso_qos *qos);
+
 /* ---- ISO timers ---- */
 #define ISO_CONN_TIMEOUT	(HZ * 40)
 #define ISO_DISCONN_TIMEOUT	(HZ * 2)
@@ -264,8 +271,15 @@ static int iso_connect_bis(struct sock *sk)
 		goto unlock;
 	}
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		err = -EINVAL;
+		goto unlock;
+	}
+
 	/* Fail if out PHYs are marked as disabled */
-	if (!iso_pi(sk)->qos.out.phy) {
+	if (!iso_pi(sk)->qos.bcast.out.phy) {
 		err = -EINVAL;
 		goto unlock;
 	}
@@ -336,8 +350,15 @@ static int iso_connect_cis(struct sock *sk)
 		goto unlock;
 	}
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		err = -EINVAL;
+		goto unlock;
+	}
+
 	/* Fail if either PHYs are marked as disabled */
-	if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) {
+	if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) {
 		err = -EINVAL;
 		goto unlock;
 	}
@@ -417,7 +438,7 @@ static int iso_send_frame(struct sock *sk, struct sk_buff *skb)
 
 	BT_DBG("sk %p len %d", sk, skb->len);
 
-	if (skb->len > qos->out.sdu)
+	if (skb->len > qos->ucast.out.sdu)
 		return -EMSGSIZE;
 
 	len = skb->len;
@@ -680,13 +701,23 @@ static struct proto iso_proto = {
 }
 
 static struct bt_iso_qos default_qos = {
-	.cig		= BT_ISO_QOS_CIG_UNSET,
-	.cis		= BT_ISO_QOS_CIS_UNSET,
-	.sca		= 0x00,
-	.packing	= 0x00,
-	.framing	= 0x00,
-	.in		= DEFAULT_IO_QOS,
-	.out		= DEFAULT_IO_QOS,
+	.bcast = {
+		.big			= BT_ISO_QOS_BIG_UNSET,
+		.bis			= BT_ISO_QOS_BIS_UNSET,
+		.sync_interval		= 0x00,
+		.packing		= 0x00,
+		.framing		= 0x00,
+		.in			= DEFAULT_IO_QOS,
+		.out			= DEFAULT_IO_QOS,
+		.encryption		= 0x00,
+		.bcode			= {0x00},
+		.options		= 0x00,
+		.skip			= 0x0000,
+		.sync_timeout		= 0x4000,
+		.sync_cte_type		= 0x00,
+		.mse			= 0x00,
+		.timeout		= 0x4000,
+	},
 };
 
 static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
@@ -893,9 +924,15 @@ static int iso_listen_bis(struct sock *sk)
 	if (!hdev)
 		return -EHOSTUNREACH;
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		return -EINVAL;
+	}
+
 	err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
 				 le_addr_type(iso_pi(sk)->dst_type),
-				 iso_pi(sk)->bc_sid);
+				 iso_pi(sk)->bc_sid, &iso_pi(sk)->qos);
 
 	hci_dev_put(hdev);
 
@@ -1154,21 +1191,62 @@ static bool check_io_qos(struct bt_iso_io_qos *qos)
 	return true;
 }
 
-static bool check_qos(struct bt_iso_qos *qos)
+static bool check_ucast_qos(struct bt_iso_qos *qos)
 {
-	if (qos->sca > 0x07)
+	if (qos->ucast.sca > 0x07)
 		return false;
 
-	if (qos->packing > 0x01)
+	if (qos->ucast.packing > 0x01)
 		return false;
 
-	if (qos->framing > 0x01)
+	if (qos->ucast.framing > 0x01)
 		return false;
 
-	if (!check_io_qos(&qos->in))
+	if (!check_io_qos(&qos->ucast.in))
 		return false;
 
-	if (!check_io_qos(&qos->out))
+	if (!check_io_qos(&qos->ucast.out))
+		return false;
+
+	return true;
+}
+
+static bool check_bcast_qos(struct bt_iso_qos *qos)
+{
+	if (qos->bcast.sync_interval > 0x07)
+		return false;
+
+	if (qos->bcast.packing > 0x01)
+		return false;
+
+	if (qos->bcast.framing > 0x01)
+		return false;
+
+	if (!check_io_qos(&qos->bcast.in))
+		return false;
+
+	if (!check_io_qos(&qos->bcast.out))
+		return false;
+
+	if (qos->bcast.encryption > 0x01)
+		return false;
+
+	if (qos->bcast.options > 0x07)
+		return false;
+
+	if (qos->bcast.skip > 0x01f3)
+		return false;
+
+	if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000)
+		return false;
+
+	if (qos->bcast.sync_cte_type > 0x1f)
+		return false;
+
+	if (qos->bcast.mse > 0x1f)
+		return false;
+
+	if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000)
 		return false;
 
 	return true;
@@ -1179,7 +1257,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	int len, err = 0;
-	struct bt_iso_qos qos;
+	struct bt_iso_qos qos = default_qos;
 	u32 opt;
 
 	BT_DBG("sk %p", sk);
@@ -1212,24 +1290,19 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 		}
 
 		len = min_t(unsigned int, sizeof(qos), optlen);
-		if (len != sizeof(qos)) {
-			err = -EINVAL;
-			break;
-		}
-
-		memset(&qos, 0, sizeof(qos));
 
 		if (copy_from_sockptr(&qos, optval, len)) {
 			err = -EFAULT;
 			break;
 		}
 
-		if (!check_qos(&qos)) {
+		if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) {
 			err = -EINVAL;
 			break;
 		}
 
 		iso_pi(sk)->qos = qos;
+		iso_pi(sk)->qos_user_set = true;
 
 		break;
 
@@ -1419,7 +1492,7 @@ static bool iso_match_big(struct sock *sk, void *data)
 {
 	struct hci_evt_le_big_sync_estabilished *ev = data;
 
-	return ev->handle == iso_pi(sk)->qos.big;
+	return ev->handle == iso_pi(sk)->qos.bcast.big;
 }
 
 static void iso_conn_ready(struct iso_conn *conn)
-- 
cgit v1.2.3


From 288c90224eec55d13e786844b7954ef060752089 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 19 Dec 2022 13:37:02 -0800
Subject: Bluetooth: Enable all supported LE PHY by default

This enables 2M and Coded PHY by default if they are marked as supported
in the LE features bits.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  4 ++++
 net/bluetooth/hci_sync.c         | 28 ++++++++++++++++++++++++----
 net/bluetooth/mgmt.c             |  4 ++--
 3 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f11689284112..827e67159523 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1683,9 +1683,13 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))
 
+#define le_2m_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_2M))
+
 #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M))
 
+#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED))
+
 #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \
 			 ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index f21497ebc814..00017f75cd41 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4414,18 +4414,38 @@ static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev)
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
 
-/* Set Default PHY parameters if command is supported */
+/* Set Default PHY parameters if command is supported, enables all supported
+ * PHYs according to the LE Features bits.
+ */
 static int hci_le_set_default_phy_sync(struct hci_dev *hdev)
 {
 	struct hci_cp_le_set_default_phy cp;
 
-	if (!(hdev->commands[35] & 0x20))
+	if (!(hdev->commands[35] & 0x20)) {
+		/* If the command is not supported it means only 1M PHY is
+		 * supported.
+		 */
+		hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+		hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
 		return 0;
+	}
 
 	memset(&cp, 0, sizeof(cp));
 	cp.all_phys = 0x00;
-	cp.tx_phys = hdev->le_tx_def_phys;
-	cp.rx_phys = hdev->le_rx_def_phys;
+	cp.tx_phys = HCI_LE_SET_PHY_1M;
+	cp.rx_phys = HCI_LE_SET_PHY_1M;
+
+	/* Enables 2M PHY if supported */
+	if (le_2m_capable(hdev)) {
+		cp.tx_phys |= HCI_LE_SET_PHY_2M;
+		cp.rx_phys |= HCI_LE_SET_PHY_2M;
+	}
+
+	/* Enables Coded PHY if supported */
+	if (le_coded_capable(hdev)) {
+		cp.tx_phys |= HCI_LE_SET_PHY_CODED;
+		cp.rx_phys |= HCI_LE_SET_PHY_CODED;
+	}
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 249dc6777fb4..5f8c144c84b8 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -8393,10 +8393,10 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
 		flags |= MGMT_ADV_FLAG_HW_OFFLOAD;
 		flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER;
 
-		if (hdev->le_features[1] & HCI_LE_PHY_2M)
+		if (le_2m_capable(hdev))
 			flags |= MGMT_ADV_FLAG_SEC_2M;
 
-		if (hdev->le_features[1] & HCI_LE_PHY_CODED)
+		if (le_coded_capable(hdev))
 			flags |= MGMT_ADV_FLAG_SEC_CODED;
 	}
 
-- 
cgit v1.2.3


From 06149746e7203d5ffe2d6faf9799ee36203aa8b8 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 11 Apr 2023 16:02:22 -0700
Subject: Bluetooth: hci_conn: Add support for linking multiple hcon

Since it is required for some configurations to have multiple CIS with
the same peer which is now covered by iso-tester in the following test
cases:

    ISO AC 6(i) - Success
    ISO AC 7(i) - Success
    ISO AC 8(i) - Success
    ISO AC 9(i) - Success
    ISO AC 11(i) - Success

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  14 +++-
 net/bluetooth/hci_conn.c         | 155 ++++++++++++++++++++++++++++-----------
 net/bluetooth/hci_event.c        |  92 +++++++++++------------
 net/bluetooth/iso.c              |   8 +-
 4 files changed, 172 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 827e67159523..4fe1e71cb9d8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -770,7 +770,10 @@ struct hci_conn {
 	void		*iso_data;
 	struct amp_mgr	*amp_mgr;
 
-	struct hci_conn	*link;
+	struct list_head link_list;
+	struct hci_conn	*parent;
+	struct hci_link *link;
+
 	struct bt_codec codec;
 
 	void (*connect_cfm_cb)	(struct hci_conn *conn, u8 status);
@@ -780,6 +783,11 @@ struct hci_conn {
 	void (*cleanup)(struct hci_conn *conn);
 };
 
+struct hci_link {
+	struct list_head list;
+	struct hci_conn *conn;
+};
+
 struct hci_chan {
 	struct list_head list;
 	__u16 handle;
@@ -1383,12 +1391,14 @@ static inline void hci_conn_put(struct hci_conn *conn)
 	put_device(&conn->dev);
 }
 
-static inline void hci_conn_hold(struct hci_conn *conn)
+static inline struct hci_conn *hci_conn_hold(struct hci_conn *conn)
 {
 	BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt));
 
 	atomic_inc(&conn->refcnt);
 	cancel_delayed_work(&conn->disc_work);
+
+	return conn;
 }
 
 static inline void hci_conn_drop(struct hci_conn *conn)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 01e0b7255174..d8466abbb36a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -330,8 +330,11 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle)
 static bool find_next_esco_param(struct hci_conn *conn,
 				 const struct sco_param *esco_param, int size)
 {
+	if (!conn->parent)
+		return false;
+
 	for (; conn->attempt <= size; conn->attempt++) {
-		if (lmp_esco_2m_capable(conn->link) ||
+		if (lmp_esco_2m_capable(conn->parent) ||
 		    (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3))
 			break;
 		BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported",
@@ -461,7 +464,7 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
 		break;
 
 	case BT_CODEC_CVSD:
-		if (lmp_esco_capable(conn->link)) {
+		if (conn->parent && lmp_esco_capable(conn->parent)) {
 			if (!find_next_esco_param(conn, esco_param_cvsd,
 						  ARRAY_SIZE(esco_param_cvsd)))
 				return -EINVAL;
@@ -531,7 +534,7 @@ static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 		param = &esco_param_msbc[conn->attempt - 1];
 		break;
 	case SCO_AIRMODE_CVSD:
-		if (lmp_esco_capable(conn->link)) {
+		if (conn->parent && lmp_esco_capable(conn->parent)) {
 			if (!find_next_esco_param(conn, esco_param_cvsd,
 						  ARRAY_SIZE(esco_param_cvsd)))
 				return false;
@@ -637,21 +640,22 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
 /* Device _must_ be locked */
 void hci_sco_setup(struct hci_conn *conn, __u8 status)
 {
-	struct hci_conn *sco = conn->link;
+	struct hci_link *link;
 
-	if (!sco)
+	link = list_first_entry_or_null(&conn->link_list, struct hci_link, list);
+	if (!link || !link->conn)
 		return;
 
 	BT_DBG("hcon %p", conn);
 
 	if (!status) {
 		if (lmp_esco_capable(conn->hdev))
-			hci_setup_sync(sco, conn->handle);
+			hci_setup_sync(link->conn, conn->handle);
 		else
-			hci_add_sco(sco, conn->handle);
+			hci_add_sco(link->conn, conn->handle);
 	} else {
-		hci_connect_cfm(sco, status);
-		hci_conn_del(sco);
+		hci_connect_cfm(link->conn, status);
+		hci_conn_del(link->conn);
 	}
 }
 
@@ -1042,6 +1046,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	skb_queue_head_init(&conn->data_q);
 
 	INIT_LIST_HEAD(&conn->chan_list);
+	INIT_LIST_HEAD(&conn->link_list);
 
 	INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout);
 	INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
@@ -1069,15 +1074,39 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	return conn;
 }
 
-static bool hci_conn_unlink(struct hci_conn *conn)
+static void hci_conn_unlink(struct hci_conn *conn)
 {
+	struct hci_dev *hdev = conn->hdev;
+
+	bt_dev_dbg(hdev, "hcon %p", conn);
+
+	if (!conn->parent) {
+		struct hci_link *link, *t;
+
+		list_for_each_entry_safe(link, t, &conn->link_list, list)
+			hci_conn_unlink(link->conn);
+
+		return;
+	}
+
 	if (!conn->link)
-		return false;
+		return;
+
+	hci_conn_put(conn->parent);
+	conn->parent = NULL;
 
-	conn->link->link = NULL;
+	list_del_rcu(&conn->link->list);
+	synchronize_rcu();
+
+	kfree(conn->link);
 	conn->link = NULL;
 
-	return true;
+	/* Due to race, SCO connection might be not established
+	 * yet at this point. Delete it now, otherwise it is
+	 * possible for it to be stuck and can't be deleted.
+	 */
+	if (conn->handle == HCI_CONN_HANDLE_UNSET)
+		hci_conn_del(conn);
 }
 
 int hci_conn_del(struct hci_conn *conn)
@@ -1091,18 +1120,7 @@ int hci_conn_del(struct hci_conn *conn)
 	cancel_delayed_work_sync(&conn->idle_work);
 
 	if (conn->type == ACL_LINK) {
-		struct hci_conn *link = conn->link;
-
-		if (link) {
-			hci_conn_unlink(conn);
-			/* Due to race, SCO connection might be not established
-			 * yet at this point. Delete it now, otherwise it is
-			 * possible for it to be stuck and can't be deleted.
-			 */
-			if (link->handle == HCI_CONN_HANDLE_UNSET)
-				hci_conn_del(link);
-		}
-
+		hci_conn_unlink(conn);
 		/* Unacked frames */
 		hdev->acl_cnt += conn->sent;
 	} else if (conn->type == LE_LINK) {
@@ -1113,7 +1131,7 @@ int hci_conn_del(struct hci_conn *conn)
 		else
 			hdev->acl_cnt += conn->sent;
 	} else {
-		struct hci_conn *acl = conn->link;
+		struct hci_conn *acl = conn->parent;
 
 		if (acl) {
 			hci_conn_unlink(conn);
@@ -1600,11 +1618,40 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 	return acl;
 }
 
+static struct hci_link *hci_conn_link(struct hci_conn *parent,
+				      struct hci_conn *conn)
+{
+	struct hci_dev *hdev = parent->hdev;
+	struct hci_link *link;
+
+	bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn);
+
+	if (conn->link)
+		return conn->link;
+
+	if (conn->parent)
+		return NULL;
+
+	link = kzalloc(sizeof(*link), GFP_KERNEL);
+	if (!link)
+		return NULL;
+
+	link->conn = hci_conn_hold(conn);
+	conn->link = link;
+	conn->parent = hci_conn_get(parent);
+
+	/* Use list_add_tail_rcu append to the list */
+	list_add_tail_rcu(&link->list, &parent->link_list);
+
+	return link;
+}
+
 struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 				 __u16 setting, struct bt_codec *codec)
 {
 	struct hci_conn *acl;
 	struct hci_conn *sco;
+	struct hci_link *link;
 
 	acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING,
 			      CONN_REASON_SCO_CONNECT);
@@ -1620,10 +1667,12 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		}
 	}
 
-	acl->link = sco;
-	sco->link = acl;
-
-	hci_conn_hold(sco);
+	link = hci_conn_link(acl, sco);
+	if (!link) {
+		hci_conn_drop(acl);
+		hci_conn_drop(sco);
+		return NULL;
+	}
 
 	sco->setting = setting;
 	sco->codec = *codec;
@@ -1890,7 +1939,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 	u8 cig;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.cis[0].acl_handle = cpu_to_le16(conn->link->handle);
+	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
 	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
 	cmd.cp.num_cis++;
 	cig = conn->iso_qos.ucast.cig;
@@ -1903,11 +1952,12 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
 
 		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED || conn->iso_qos.ucast.cig != cig)
+		    conn->state == BT_CONNECTED ||
+		    conn->iso_qos.ucast.cig != cig)
 			continue;
 
 		/* Check if all CIS(s) belonging to a CIG are ready */
-		if (!conn->link || conn->link->state != BT_CONNECTED ||
+		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
 		    conn->state != BT_CONNECT) {
 			cmd.cp.num_cis = 0;
 			break;
@@ -1924,7 +1974,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		 * command have been generated, the Controller shall return the
 		 * error code Command Disallowed (0x0C).
 		 */
-		cis->acl_handle = cpu_to_le16(conn->link->handle);
+		cis->acl_handle = cpu_to_le16(conn->parent->handle);
 		cis->cis_handle = cpu_to_le16(conn->handle);
 		cmd.cp.num_cis++;
 	}
@@ -1943,15 +1993,33 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 int hci_le_create_cis(struct hci_conn *conn)
 {
 	struct hci_conn *cis;
+	struct hci_link *link, *t;
 	struct hci_dev *hdev = conn->hdev;
 	int err;
 
+	bt_dev_dbg(hdev, "hcon %p", conn);
+
 	switch (conn->type) {
 	case LE_LINK:
-		if (!conn->link || conn->state != BT_CONNECTED)
+		if (conn->state != BT_CONNECTED || list_empty(&conn->link_list))
 			return -EINVAL;
-		cis = conn->link;
-		break;
+
+		cis = NULL;
+
+		/* hci_conn_link uses list_add_tail_rcu so the list is in
+		 * the same order as the connections are requested.
+		 */
+		list_for_each_entry_safe(link, t, &conn->link_list, list) {
+			if (link->conn->state == BT_BOUND) {
+				err = hci_le_create_cis(link->conn);
+				if (err)
+					return err;
+
+				cis = link->conn;
+			}
+		}
+
+		return cis ? 0 : -EINVAL;
 	case ISO_LINK:
 		cis = conn;
 		break;
@@ -2172,6 +2240,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 {
 	struct hci_conn *le;
 	struct hci_conn *cis;
+	struct hci_link *link;
 
 	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
 		le = hci_connect_le(hdev, dst, dst_type, false,
@@ -2197,16 +2266,18 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 		return cis;
 	}
 
-	le->link = cis;
-	cis->link = le;
-
-	hci_conn_hold(cis);
+	link = hci_conn_link(le, cis);
+	if (!link) {
+		hci_conn_drop(le);
+		hci_conn_drop(cis);
+		return NULL;
+	}
 
 	/* If LE is already connected and CIS handle is already set proceed to
 	 * Create CIS immediately.
 	 */
 	if (le->state == BT_CONNECTED && cis->handle != HCI_CONN_HANDLE_UNSET)
-		hci_le_create_cis(le);
+		hci_le_create_cis(cis);
 
 	return cis;
 }
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 0e0a93cc1218..d00ef6e3fc45 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2345,7 +2345,8 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
 {
 	struct hci_cp_add_sco *cp;
-	struct hci_conn *acl, *sco;
+	struct hci_conn *acl;
+	struct hci_link *link;
 	__u16 handle;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
@@ -2365,12 +2366,13 @@ static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
 
 	acl = hci_conn_hash_lookup_handle(hdev, handle);
 	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+		link = list_first_entry_or_null(&acl->link_list,
+						struct hci_link, list);
+		if (link && link->conn) {
+			link->conn->state = BT_CLOSED;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
+			hci_connect_cfm(link->conn, status);
+			hci_conn_del(link->conn);
 		}
 	}
 
@@ -2637,74 +2639,61 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
 	hci_dev_unlock(hdev);
 }
 
-static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+static void hci_setup_sync_conn_status(struct hci_dev *hdev, __u16 handle,
+				       __u8 status)
 {
-	struct hci_cp_setup_sync_conn *cp;
-	struct hci_conn *acl, *sco;
-	__u16 handle;
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", status);
-
-	if (!status)
-		return;
-
-	cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
-	if (!cp)
-		return;
+	struct hci_conn *acl;
+	struct hci_link *link;
 
-	handle = __le16_to_cpu(cp->handle);
-
-	bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
+	bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", handle, status);
 
 	hci_dev_lock(hdev);
 
 	acl = hci_conn_hash_lookup_handle(hdev, handle);
 	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+		link = list_first_entry_or_null(&acl->link_list,
+						struct hci_link, list);
+		if (link && link->conn) {
+			link->conn->state = BT_CLOSED;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
+			hci_connect_cfm(link->conn, status);
+			hci_conn_del(link->conn);
 		}
 	}
 
 	hci_dev_unlock(hdev);
 }
 
-static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
 {
-	struct hci_cp_enhanced_setup_sync_conn *cp;
-	struct hci_conn *acl, *sco;
-	__u16 handle;
+	struct hci_cp_setup_sync_conn *cp;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	if (!status)
 		return;
 
-	cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+	cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
 	if (!cp)
 		return;
 
-	handle = __le16_to_cpu(cp->handle);
+	hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
+}
 
-	bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
+static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+{
+	struct hci_cp_enhanced_setup_sync_conn *cp;
 
-	hci_dev_lock(hdev);
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
-	acl = hci_conn_hash_lookup_handle(hdev, handle);
-	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+	if (!status)
+		return;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
-		}
-	}
+	cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+	if (!cp)
+		return;
 
-	hci_dev_unlock(hdev);
+	hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
 }
 
 static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
@@ -3834,19 +3823,20 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		if (conn->type != ISO_LINK || conn->iso_qos.ucast.cig != rp->cig_id ||
+		if (conn->type != ISO_LINK ||
+		    conn->iso_qos.ucast.cig != rp->cig_id ||
 		    conn->state == BT_CONNECTED)
 			continue;
 
 		conn->handle = __le16_to_cpu(rp->handle[i++]);
 
-		bt_dev_dbg(hdev, "%p handle 0x%4.4x link %p", conn,
-			   conn->handle, conn->link);
+		bt_dev_dbg(hdev, "%p handle 0x%4.4x parent %p", conn,
+			   conn->handle, conn->parent);
 
 		/* Create CIS if LE is already connected */
-		if (conn->link && conn->link->state == BT_CONNECTED) {
+		if (conn->parent && conn->parent->state == BT_CONNECTED) {
 			rcu_read_unlock();
-			hci_le_create_cis(conn->link);
+			hci_le_create_cis(conn);
 			rcu_read_lock();
 		}
 
@@ -5031,7 +5021,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 		if (conn->out) {
 			conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
 					(hdev->esco_type & EDR_ESCO_MASK);
-			if (hci_setup_sync(conn, conn->link->handle))
+			if (hci_setup_sync(conn, conn->parent->handle))
 				goto unlock;
 		}
 		fallthrough;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 74117df03a3f..34d55a85d8f6 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1657,8 +1657,12 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
 
 		/* Check if LE link has failed */
 		if (status) {
-			if (hcon->link)
-				iso_conn_del(hcon->link, bt_to_errno(status));
+			struct hci_link *link, *t;
+
+			list_for_each_entry_safe(link, t, &hcon->link_list,
+						 list)
+				iso_conn_del(link->conn, bt_to_errno(status));
+
 			return;
 		}
 
-- 
cgit v1.2.3


From c14516faede33c2c31da45cf950d55dbff42962e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 11 Apr 2023 16:14:25 -0700
Subject: Bluetooth: hci_conn: Fix not matching by CIS ID

This fixes only matching CIS by address which prevents creating new hcon
if upper layer is requesting a specific CIS ID.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 12 +++++++++++-
 net/bluetooth/hci_conn.c         |  3 ++-
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 4fe1e71cb9d8..a6c8aee2f256 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1180,7 +1180,9 @@ static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
 
 static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
 							bdaddr_t *ba,
-							__u8 ba_type)
+							__u8 ba_type,
+							__u8 cig,
+							__u8 id)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn  *c;
@@ -1191,6 +1193,14 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
 		if (c->type != ISO_LINK)
 			continue;
 
+		/* Match CIG ID if set */
+		if (cig != BT_ISO_QOS_CIG_UNSET && cig != c->iso_qos.ucast.cig)
+			continue;
+
+		/* Match CIS ID if set */
+		if (id != BT_ISO_QOS_CIS_UNSET && id != c->iso_qos.ucast.cis)
+			continue;
+
 		if (ba_type == c->dst_type && !bacmp(&c->dst, ba)) {
 			rcu_read_unlock();
 			return c;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index d8466abbb36a..c215e983e287 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1841,7 +1841,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 {
 	struct hci_conn *cis;
 
-	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type);
+	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
+				       qos->ucast.cis);
 	if (!cis) {
 		cis = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
 		if (!cis)
-- 
cgit v1.2.3


From c09b80be6ffc338634b2f5f8cfa12b6843410834 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 12 Apr 2023 17:45:51 -0700
Subject: Bluetooth: hci_conn: Fix not waiting for HCI_EVT_LE_CIS_ESTABLISHED

When submitting HCI_OP_LE_CREATE_CIS the code shall wait for
HCI_EVT_LE_CIS_ESTABLISHED thus enforcing the serialization of
HCI_OP_LE_CREATE_CIS as the Core spec does not allow to send them in
parallel:

  BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2566:

  If the Host issues this command before all the HCI_LE_CIS_Established
  events from the previous use of the command have been generated, the
  Controller shall return the error code Command Disallowed (0x0C).

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_conn.c         | 58 +----------------------------------
 net/bluetooth/hci_sync.c         | 65 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 17f5a4c32f36..f61b249787fc 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -122,6 +122,8 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
 
 int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
 
+int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn);
+
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle);
 
 int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c215e983e287..640b951bf40a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1932,63 +1932,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 
 static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 {
-	struct {
-		struct hci_cp_le_create_cis cp;
-		struct hci_cis cis[0x1f];
-	} cmd;
-	struct hci_conn *conn = data;
-	u8 cig;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
-	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
-	cmd.cp.num_cis++;
-	cig = conn->iso_qos.ucast.cig;
-
-	hci_dev_lock(hdev);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
-
-		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED ||
-		    conn->iso_qos.ucast.cig != cig)
-			continue;
-
-		/* Check if all CIS(s) belonging to a CIG are ready */
-		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
-		    conn->state != BT_CONNECT) {
-			cmd.cp.num_cis = 0;
-			break;
-		}
-
-		/* Group all CIS with state BT_CONNECT since the spec don't
-		 * allow to send them individually:
-		 *
-		 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
-		 * page 2566:
-		 *
-		 * If the Host issues this command before all the
-		 * HCI_LE_CIS_Established events from the previous use of the
-		 * command have been generated, the Controller shall return the
-		 * error code Command Disallowed (0x0C).
-		 */
-		cis->acl_handle = cpu_to_le16(conn->parent->handle);
-		cis->cis_handle = cpu_to_le16(conn->handle);
-		cmd.cp.num_cis++;
-	}
-
-	rcu_read_unlock();
-
-	hci_dev_unlock(hdev);
-
-	if (!cmd.cp.num_cis)
-		return 0;
-
-	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_CIS, sizeof(cmd.cp) +
-			    sizeof(cmd.cis[0]) * cmd.cp.num_cis, &cmd);
+	return hci_le_create_cis_sync(hdev, data);
 }
 
 int hci_le_create_cis(struct hci_conn *conn)
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 00017f75cd41..6f060d00a70a 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6137,6 +6137,71 @@ done:
 	return err;
 }
 
+int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	struct {
+		struct hci_cp_le_create_cis cp;
+		struct hci_cis cis[0x1f];
+	} cmd;
+	u8 cig;
+	struct hci_conn *hcon = conn;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
+	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
+	cmd.cp.num_cis++;
+	cig = conn->iso_qos.ucast.cig;
+
+	hci_dev_lock(hdev);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
+
+		if (conn == hcon || conn->type != ISO_LINK ||
+		    conn->state == BT_CONNECTED ||
+		    conn->iso_qos.ucast.cig != cig)
+			continue;
+
+		/* Check if all CIS(s) belonging to a CIG are ready */
+		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
+		    conn->state != BT_CONNECT) {
+			cmd.cp.num_cis = 0;
+			break;
+		}
+
+		/* Group all CIS with state BT_CONNECT since the spec don't
+		 * allow to send them individually:
+		 *
+		 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+		 * page 2566:
+		 *
+		 * If the Host issues this command before all the
+		 * HCI_LE_CIS_Established events from the previous use of the
+		 * command have been generated, the Controller shall return the
+		 * error code Command Disallowed (0x0C).
+		 */
+		cis->acl_handle = cpu_to_le16(conn->parent->handle);
+		cis->cis_handle = cpu_to_le16(conn->handle);
+		cmd.cp.num_cis++;
+	}
+
+	rcu_read_unlock();
+
+	hci_dev_unlock(hdev);
+
+	if (!cmd.cp.num_cis)
+		return 0;
+
+	/* Wait for HCI_LE_CIS_Established */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
+					sizeof(cmd.cp) + sizeof(cmd.cis[0]) *
+					cmd.cp.num_cis, &cmd,
+					HCI_EVT_LE_CIS_ESTABLISHED,
+					conn->conn_timeout, NULL);
+}
+
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
 {
 	struct hci_cp_le_remove_cig cp;
-- 
cgit v1.2.3


From 91b6d02ddcd113352bdd895990b252065c596de7 Mon Sep 17 00:00:00 2001
From: Raul Cheleguini <raul.cheleguini@gmail.com>
Date: Thu, 23 Mar 2023 10:45:39 -0300
Subject: Bluetooth: Add new quirk for broken set random RPA timeout for
 ATS2851

The ATS2851 based controller advertises support for command "LE Set Random
Private Address Timeout" but does not actually implement it, impeding the
controller initialization.

Add the quirk HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT to unblock the controller
initialization.

< HCI Command: LE Set Resolvable Private... (0x08|0x002e) plen 2
        Timeout: 900 seconds
> HCI Event: Command Status (0x0f) plen 4
      LE Set Resolvable Private Address Timeout (0x08|0x002e) ncmd 1
        Status: Unknown HCI Command (0x01)

Co-developed-by: imoc <wzj9912@gmail.com>
Signed-off-by: imoc <wzj9912@gmail.com>
Signed-off-by: Raul Cheleguini <raul.cheleguini@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c   | 1 +
 include/net/bluetooth/hci.h | 8 ++++++++
 net/bluetooth/hci_sync.c    | 6 +++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 2303b0a66323..3aa189b1986d 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4149,6 +4149,7 @@ static int btusb_probe(struct usb_interface *intf,
 		/* Support is advertised, but not implemented */
 		set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks);
+		set_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
 	}
 
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 997107bfc0b1..07df96c47ef4 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -301,6 +301,14 @@ enum {
 	 * don't actually support features declared there.
 	 */
 	HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
+
+	/*
+	 * When this quirk is set, the HCI_OP_LE_SET_RPA_TIMEOUT command is
+	 * skipped during initialization. This is required for the Actions
+	 * Semiconductor ATS2851 based controllers, which erroneously claims
+	 * to support it.
+	 */
+	HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT,
 };
 
 /* HCI device flags */
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 6f060d00a70a..771aaa808967 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4093,7 +4093,8 @@ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev)
 {
 	__le16 timeout = cpu_to_le16(hdev->rpa_timeout);
 
-	if (!(hdev->commands[35] & 0x04))
+	if (!(hdev->commands[35] & 0x04) ||
+	    test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT,
@@ -4553,6 +4554,9 @@ static const struct {
 			 "HCI Set Event Filter command not supported."),
 	HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN,
 			 "HCI Enhanced Setup Synchronous Connection command is "
+			 "advertised, but not supported."),
+	HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT,
+			 "HCI LE Set Random Private Address Timeout command is "
 			 "advertised, but not supported.")
 };
 
-- 
cgit v1.2.3


From d883a4669a1def6d121ccf5e64ad28260d1c9531 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 21 Apr 2023 11:37:55 -0700
Subject: Bluetooth: hci_sync: Only allow hci_cmd_sync_queue if running

This makes sure hci_cmd_sync_queue only queue new work if HCI_RUNNING
has been set otherwise there is a risk of commands being sent while
turning off.

Because hci_cmd_sync_queue can no longer queue work while HCI_RUNNING is
not set it cannot be used to power on adapters so instead
hci_cmd_sync_submit is introduced which bypass the HCI_RUNNING check, so
it behaves like the old implementation.

Link: https://lore.kernel.org/all/CAB4PzUpDMvdc8j2MdeSAy1KkAE-D3woprCwAdYWeOc-3v3c9Sw@mail.gmail.com/
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_sync.c         | 25 +++++++++++++++++++++++--
 net/bluetooth/mgmt.c             | 12 ++++++++----
 3 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index f61b249787fc..2495be4d8b82 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -41,6 +41,8 @@ void hci_cmd_sync_clear(struct hci_dev *hdev);
 void hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
 void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
 
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			void *data, hci_cmd_sync_work_destroy_t destroy);
 int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 		       void *data, hci_cmd_sync_work_destroy_t destroy);
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 771aaa808967..647a8ce54062 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -684,8 +684,12 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 }
 EXPORT_SYMBOL(hci_cmd_sync_cancel);
 
-int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
-		       void *data, hci_cmd_sync_work_destroy_t destroy)
+/* Submit HCI command to be run in as cmd_sync_work:
+ *
+ * - hdev must _not_ be unregistered
+ */
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			void *data, hci_cmd_sync_work_destroy_t destroy)
 {
 	struct hci_cmd_sync_work_entry *entry;
 
@@ -708,6 +712,23 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 
 	return 0;
 }
+EXPORT_SYMBOL(hci_cmd_sync_submit);
+
+/* Queue HCI command:
+ *
+ * - hdev must be running
+ */
+int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+		       void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	/* Only queue command if hdev is running which means it had been opened
+	 * and is either on init phase or is already up.
+	 */
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return -ENETDOWN;
+
+	return hci_cmd_sync_submit(hdev, func, data, destroy);
+}
 EXPORT_SYMBOL(hci_cmd_sync_queue);
 
 int hci_update_eir_sync(struct hci_dev *hdev)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 13c745876b39..f7b2d0971f24 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1400,11 +1400,15 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 	}
 
 	/* Cancel potentially blocking sync operation before power off */
-	if (cp->val == 0x00)
+	if (cp->val == 0x00) {
 		__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
-
-	err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
-				 mgmt_set_powered_complete);
+		err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
+					 mgmt_set_powered_complete);
+	} else {
+		/* Use hci_cmd_sync_submit since hdev might not be running */
+		err = hci_cmd_sync_submit(hdev, set_powered_sync, cmd,
+					  mgmt_set_powered_complete);
+	}
 
 	if (err < 0)
 		mgmt_pending_remove(cmd);
-- 
cgit v1.2.3


From 74d7970febf7e9005375aeda0df821d2edffc9f7 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@kernel.org>
Date: Fri, 21 Apr 2023 16:09:01 +0900
Subject: ksmbd: fix racy issue from using ->d_parent and ->d_name

Al pointed out that ksmbd has racy issue from using ->d_parent and ->d_name
in ksmbd_vfs_unlink and smb2_vfs_rename(). and use new lock_rename_child()
to lock stable parent while underlying rename racy.
Introduce vfs_path_parent_lookup helper to avoid out of share access and
export vfs functions like the following ones to use
vfs_path_parent_lookup().
 - rename __lookup_hash() to lookup_one_qstr_excl().
 - export lookup_one_qstr_excl().
 - export getname_kernel() and putname().

vfs_path_parent_lookup() is used for parent lookup of destination file
using absolute pathname given from FILE_RENAME_INFORMATION request.

Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/ksmbd/smb2pdu.c    | 147 ++++-------------
 fs/ksmbd/vfs.c        | 435 ++++++++++++++++++++++----------------------------
 fs/ksmbd/vfs.h        |  19 +--
 fs/ksmbd/vfs_cache.c  |   5 +-
 fs/namei.c            |  57 +++++--
 include/linux/namei.h |   6 +
 6 files changed, 283 insertions(+), 386 deletions(-)

(limited to 'include')

diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 215fea2883f6..bbc9e92935fb 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -2408,7 +2408,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
 			return rc;
 	}
 
-	rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
+	rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0);
 	if (rc) {
 		pr_err("cannot get linux path (%s), err = %d\n",
 		       name, rc);
@@ -2699,8 +2699,10 @@ int smb2_open(struct ksmbd_work *work)
 		goto err_out1;
 	}
 
-	rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
+	rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
 	if (!rc) {
+		file_present = true;
+
 		if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
 			/*
 			 * If file exists with under flags, return access
@@ -2709,7 +2711,6 @@ int smb2_open(struct ksmbd_work *work)
 			if (req->CreateDisposition == FILE_OVERWRITE_IF_LE ||
 			    req->CreateDisposition == FILE_OPEN_IF_LE) {
 				rc = -EACCES;
-				path_put(&path);
 				goto err_out;
 			}
 
@@ -2717,26 +2718,23 @@ int smb2_open(struct ksmbd_work *work)
 				ksmbd_debug(SMB,
 					    "User does not have write permission\n");
 				rc = -EACCES;
-				path_put(&path);
 				goto err_out;
 			}
 		} else if (d_is_symlink(path.dentry)) {
 			rc = -EACCES;
-			path_put(&path);
 			goto err_out;
 		}
-	}
 
-	if (rc) {
+		file_present = true;
+		idmap = mnt_idmap(path.mnt);
+	} else {
 		if (rc != -ENOENT)
 			goto err_out;
 		ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n",
 			    name, rc);
 		rc = 0;
-	} else {
-		file_present = true;
-		idmap = mnt_idmap(path.mnt);
 	}
+
 	if (stream_name) {
 		if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
 			if (s_type == DATA_STREAM) {
@@ -2864,8 +2862,9 @@ int smb2_open(struct ksmbd_work *work)
 
 			if ((daccess & FILE_DELETE_LE) ||
 			    (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
-				rc = ksmbd_vfs_may_delete(idmap,
-							  path.dentry);
+				rc = inode_permission(idmap,
+						      d_inode(path.dentry->d_parent),
+						      MAY_EXEC | MAY_WRITE);
 				if (rc)
 					goto err_out;
 			}
@@ -3236,10 +3235,13 @@ int smb2_open(struct ksmbd_work *work)
 	}
 
 err_out:
-	if (file_present || created)
-		path_put(&path);
+	if (file_present || created) {
+		inode_unlock(d_inode(path.dentry->d_parent));
+		dput(path.dentry);
+	}
 	ksmbd_revert_fsids(work);
 err_out1:
+
 	if (rc) {
 		if (rc == -EINVAL)
 			rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -5390,44 +5392,19 @@ int smb2_echo(struct ksmbd_work *work)
 
 static int smb2_rename(struct ksmbd_work *work,
 		       struct ksmbd_file *fp,
-		       struct mnt_idmap *idmap,
 		       struct smb2_file_rename_info *file_info,
 		       struct nls_table *local_nls)
 {
 	struct ksmbd_share_config *share = fp->tcon->share_conf;
-	char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL;
-	char *pathname = NULL;
-	struct path path;
-	bool file_present = true;
-	int rc;
+	char *new_name = NULL;
+	int rc, flags = 0;
 
 	ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n");
-	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
-	if (!pathname)
-		return -ENOMEM;
-
-	abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
-	if (IS_ERR(abs_oldname)) {
-		rc = -EINVAL;
-		goto out;
-	}
-	old_name = strrchr(abs_oldname, '/');
-	if (old_name && old_name[1] != '\0') {
-		old_name++;
-	} else {
-		ksmbd_debug(SMB, "can't get last component in path %s\n",
-			    abs_oldname);
-		rc = -ENOENT;
-		goto out;
-	}
-
 	new_name = smb2_get_name(file_info->FileName,
 				 le32_to_cpu(file_info->FileNameLength),
 				 local_nls);
-	if (IS_ERR(new_name)) {
-		rc = PTR_ERR(new_name);
-		goto out;
-	}
+	if (IS_ERR(new_name))
+		return PTR_ERR(new_name);
 
 	if (strchr(new_name, ':')) {
 		int s_type;
@@ -5453,7 +5430,7 @@ static int smb2_rename(struct ksmbd_work *work,
 		if (rc)
 			goto out;
 
-		rc = ksmbd_vfs_setxattr(idmap,
+		rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
 					fp->filp->f_path.dentry,
 					xattr_stream_name,
 					NULL, 0, 0);
@@ -5468,47 +5445,18 @@ static int smb2_rename(struct ksmbd_work *work,
 	}
 
 	ksmbd_debug(SMB, "new name %s\n", new_name);
-	rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1);
-	if (rc) {
-		if (rc != -ENOENT)
-			goto out;
-		file_present = false;
-	} else {
-		path_put(&path);
-	}
-
 	if (ksmbd_share_veto_filename(share, new_name)) {
 		rc = -ENOENT;
 		ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name);
 		goto out;
 	}
 
-	if (file_info->ReplaceIfExists) {
-		if (file_present) {
-			rc = ksmbd_vfs_remove_file(work, new_name);
-			if (rc) {
-				if (rc != -ENOTEMPTY)
-					rc = -EINVAL;
-				ksmbd_debug(SMB, "cannot delete %s, rc %d\n",
-					    new_name, rc);
-				goto out;
-			}
-		}
-	} else {
-		if (file_present &&
-		    strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) {
-			rc = -EEXIST;
-			ksmbd_debug(SMB,
-				    "cannot rename already existing file\n");
-			goto out;
-		}
-	}
+	if (!file_info->ReplaceIfExists)
+		flags = RENAME_NOREPLACE;
 
-	rc = ksmbd_vfs_fp_rename(work, fp, new_name);
+	rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
 out:
-	kfree(pathname);
-	if (!IS_ERR(new_name))
-		kfree(new_name);
+	kfree(new_name);
 	return rc;
 }
 
@@ -5548,18 +5496,17 @@ static int smb2_create_link(struct ksmbd_work *work,
 	}
 
 	ksmbd_debug(SMB, "target name is %s\n", target_name);
-	rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0);
+	rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
+					&path, 0);
 	if (rc) {
 		if (rc != -ENOENT)
 			goto out;
 		file_present = false;
-	} else {
-		path_put(&path);
 	}
 
 	if (file_info->ReplaceIfExists) {
 		if (file_present) {
-			rc = ksmbd_vfs_remove_file(work, link_name);
+			rc = ksmbd_vfs_remove_file(work, &path);
 			if (rc) {
 				rc = -EINVAL;
 				ksmbd_debug(SMB, "cannot delete %s\n",
@@ -5579,6 +5526,10 @@ static int smb2_create_link(struct ksmbd_work *work,
 	if (rc)
 		rc = -EINVAL;
 out:
+	if (file_present) {
+		inode_unlock(d_inode(path.dentry->d_parent));
+		path_put(&path);
+	}
 	if (!IS_ERR(link_name))
 		kfree(link_name);
 	kfree(pathname);
@@ -5756,12 +5707,6 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 			   struct smb2_file_rename_info *rename_info,
 			   unsigned int buf_len)
 {
-	struct mnt_idmap *idmap;
-	struct ksmbd_file *parent_fp;
-	struct dentry *parent;
-	struct dentry *dentry = fp->filp->f_path.dentry;
-	int ret;
-
 	if (!(fp->daccess & FILE_DELETE_LE)) {
 		pr_err("no right to delete : 0x%x\n", fp->daccess);
 		return -EACCES;
@@ -5771,32 +5716,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 			le32_to_cpu(rename_info->FileNameLength))
 		return -EINVAL;
 
-	idmap = file_mnt_idmap(fp->filp);
-	if (ksmbd_stream_fd(fp))
-		goto next;
-
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
-	}
-
-	parent_fp = ksmbd_lookup_fd_inode(d_inode(parent));
-	inode_unlock(d_inode(parent));
-	dput(parent);
+	if (!le32_to_cpu(rename_info->FileNameLength))
+		return -EINVAL;
 
-	if (parent_fp) {
-		if (parent_fp->daccess & FILE_DELETE_LE) {
-			pr_err("parent dir is opened with delete access\n");
-			ksmbd_fd_put(work, parent_fp);
-			return -ESHARE;
-		}
-		ksmbd_fd_put(work, parent_fp);
-	}
-next:
-	return smb2_rename(work, fp, idmap, rename_info,
-			   work->conn->local_nls);
+	return smb2_rename(work, fp, rename_info, work->conn->local_nls);
 }
 
 static int set_file_disposition_info(struct ksmbd_file *fp,
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index cef07d7fb7dc..778c152708e4 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -18,6 +18,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/xacct.h>
 #include <linux/crc32c.h>
+#include <linux/namei.h>
 
 #include "glob.h"
 #include "oplock.h"
@@ -35,19 +36,6 @@
 #include "mgmt/user_session.h"
 #include "mgmt/user_config.h"
 
-static char *extract_last_component(char *path)
-{
-	char *p = strrchr(path, '/');
-
-	if (p && p[1] != '\0') {
-		*p = '\0';
-		p++;
-	} else {
-		p = NULL;
-	}
-	return p;
-}
-
 static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
 				    struct inode *parent_inode,
 				    struct inode *inode)
@@ -61,65 +49,77 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
 
 /**
  * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
- *
- * the parent dentry got by dget_parent or @parent could be
- * unstable, we try to lock a parent inode and lookup the
- * child dentry again.
- *
- * the reference count of @parent isn't incremented.
  */
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
-			  struct dentry *child)
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
 {
-	struct dentry *dentry;
-	int ret = 0;
-
 	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
-	dentry = lookup_one(idmap, child->d_name.name, parent,
-			    child->d_name.len);
-	if (IS_ERR(dentry)) {
-		ret = PTR_ERR(dentry);
-		goto out_err;
-	}
-
-	if (dentry != child) {
-		ret = -ESTALE;
-		dput(dentry);
-		goto out_err;
+	if (child->d_parent != parent) {
+		inode_unlock(d_inode(parent));
+		return -ENOENT;
 	}
 
-	dput(dentry);
 	return 0;
-out_err:
-	inode_unlock(d_inode(parent));
-	return ret;
 }
 
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap,
-			 struct dentry *dentry)
+static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
+					char *pathname, unsigned int flags,
+					struct path *path)
 {
-	struct dentry *parent;
-	int ret;
+	struct qstr last;
+	struct filename *filename;
+	struct path *root_share_path = &share_conf->vfs_path;
+	int err, type;
+	struct path parent_path;
+	struct dentry *d;
+
+	if (pathname[0] == '\0') {
+		pathname = share_conf->path;
+		root_share_path = NULL;
+	} else {
+		flags |= LOOKUP_BENEATH;
+	}
 
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
+	filename = getname_kernel(pathname);
+	if (IS_ERR(filename))
+		return PTR_ERR(filename);
+
+	err = vfs_path_parent_lookup(filename, flags,
+				     &parent_path, &last, &type,
+				     root_share_path);
+	putname(filename);
+	if (err)
+		return err;
+
+	if (unlikely(type != LAST_NORM)) {
+		path_put(&parent_path);
+		return -ENOENT;
 	}
 
-	ret = inode_permission(idmap, d_inode(parent),
-			       MAY_EXEC | MAY_WRITE);
+	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+	d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+	if (IS_ERR(d))
+		goto err_out;
 
-	inode_unlock(d_inode(parent));
-	dput(parent);
-	return ret;
+	if (d_is_negative(d)) {
+		dput(d);
+		goto err_out;
+	}
+
+	path->dentry = d;
+	path->mnt = share_conf->vfs_path.mnt;
+	path_put(&parent_path);
+
+	return 0;
+
+err_out:
+	inode_unlock(parent_path.dentry->d_inode);
+	path_put(&parent_path);
+	return -ENOENT;
 }
 
 int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 				   struct dentry *dentry, __le32 *daccess)
 {
-	struct dentry *parent;
 	int ret = 0;
 
 	*daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL);
@@ -136,18 +136,9 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 	if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC))
 		*daccess |= FILE_EXECUTE_LE;
 
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
-	}
-
-	if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE))
+	if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE))
 		*daccess |= FILE_DELETE_LE;
 
-	inode_unlock(d_inode(parent));
-	dput(parent);
 	return ret;
 }
 
@@ -580,54 +571,32 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
  *
  * Return:	0 on success, otherwise error
  */
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
 {
 	struct mnt_idmap *idmap;
-	struct path path;
-	struct dentry *parent;
+	struct dentry *parent = path->dentry->d_parent;
 	int err;
 
 	if (ksmbd_override_fsids(work))
 		return -ENOMEM;
 
-	err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false);
-	if (err) {
-		ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
-		ksmbd_revert_fsids(work);
-		return err;
-	}
-
-	idmap = mnt_idmap(path.mnt);
-	parent = dget_parent(path.dentry);
-	err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry);
-	if (err) {
-		dput(parent);
-		path_put(&path);
-		ksmbd_revert_fsids(work);
-		return err;
-	}
-
-	if (!d_inode(path.dentry)->i_nlink) {
+	if (!d_inode(path->dentry)->i_nlink) {
 		err = -ENOENT;
 		goto out_err;
 	}
 
-	if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
-		err = vfs_rmdir(idmap, d_inode(parent), path.dentry);
+	idmap = mnt_idmap(path->mnt);
+	if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
+		err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
 		if (err && err != -ENOTEMPTY)
-			ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
-				    err);
+			ksmbd_debug(VFS, "rmdir failed, err %d\n", err);
 	} else {
-		err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL);
+		err = vfs_unlink(idmap, d_inode(parent), path->dentry, NULL);
 		if (err)
-			ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
-				    err);
+			ksmbd_debug(VFS, "unlink failed, err %d\n", err);
 	}
 
 out_err:
-	inode_unlock(d_inode(parent));
-	dput(parent);
-	path_put(&path);
 	ksmbd_revert_fsids(work);
 	return err;
 }
@@ -686,149 +655,114 @@ out1:
 	return err;
 }
 
-static int ksmbd_validate_entry_in_use(struct dentry *src_dent)
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+		     char *newname, int flags)
 {
-	struct dentry *dst_dent;
-
-	spin_lock(&src_dent->d_lock);
-	list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) {
-		struct ksmbd_file *child_fp;
+	struct dentry *old_parent, *new_dentry, *trap;
+	struct dentry *old_child = old_path->dentry;
+	struct path new_path;
+	struct qstr new_last;
+	struct renamedata rd;
+	struct filename *to;
+	struct ksmbd_share_config *share_conf = work->tcon->share_conf;
+	struct ksmbd_file *parent_fp;
+	int new_type;
+	int err, lookup_flags = LOOKUP_NO_SYMLINKS;
 
-		if (d_really_is_negative(dst_dent))
-			continue;
+	if (ksmbd_override_fsids(work))
+		return -ENOMEM;
 
-		child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent));
-		if (child_fp) {
-			spin_unlock(&src_dent->d_lock);
-			ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n");
-			return -EACCES;
-		}
+	to = getname_kernel(newname);
+	if (IS_ERR(to)) {
+		err = PTR_ERR(to);
+		goto revert_fsids;
 	}
-	spin_unlock(&src_dent->d_lock);
 
-	return 0;
-}
+retry:
+	err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
+				     &new_path, &new_last, &new_type,
+				     &share_conf->vfs_path);
+	if (err)
+		goto out1;
 
-static int __ksmbd_vfs_rename(struct ksmbd_work *work,
-			      struct mnt_idmap *src_idmap,
-			      struct dentry *src_dent_parent,
-			      struct dentry *src_dent,
-			      struct mnt_idmap *dst_idmap,
-			      struct dentry *dst_dent_parent,
-			      struct dentry *trap_dent,
-			      char *dst_name)
-{
-	struct dentry *dst_dent;
-	int err;
+	if (old_path->mnt != new_path.mnt) {
+		err = -EXDEV;
+		goto out2;
+	}
 
-	if (!work->tcon->posix_extensions) {
-		err = ksmbd_validate_entry_in_use(src_dent);
-		if (err)
-			return err;
+	trap = lock_rename_child(old_child, new_path.dentry);
+
+	old_parent = dget(old_child->d_parent);
+	if (d_unhashed(old_child)) {
+		err = -EINVAL;
+		goto out3;
 	}
 
-	if (d_really_is_negative(src_dent_parent))
-		return -ENOENT;
-	if (d_really_is_negative(dst_dent_parent))
-		return -ENOENT;
-	if (d_really_is_negative(src_dent))
-		return -ENOENT;
-	if (src_dent == trap_dent)
-		return -EINVAL;
+	parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent));
+	if (parent_fp) {
+		if (parent_fp->daccess & FILE_DELETE_LE) {
+			pr_err("parent dir is opened with delete access\n");
+			err = -ESHARE;
+			ksmbd_fd_put(work, parent_fp);
+			goto out3;
+		}
+		ksmbd_fd_put(work, parent_fp);
+	}
 
-	if (ksmbd_override_fsids(work))
-		return -ENOMEM;
+	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+					  lookup_flags | LOOKUP_RENAME_TARGET);
+	if (IS_ERR(new_dentry)) {
+		err = PTR_ERR(new_dentry);
+		goto out3;
+	}
 
-	dst_dent = lookup_one(dst_idmap, dst_name,
-			      dst_dent_parent, strlen(dst_name));
-	err = PTR_ERR(dst_dent);
-	if (IS_ERR(dst_dent)) {
-		pr_err("lookup failed %s [%d]\n", dst_name, err);
-		goto out;
+	if (d_is_symlink(new_dentry)) {
+		err = -EACCES;
+		goto out4;
 	}
 
-	err = -ENOTEMPTY;
-	if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) {
-		struct renamedata rd = {
-			.old_mnt_idmap	= src_idmap,
-			.old_dir	= d_inode(src_dent_parent),
-			.old_dentry	= src_dent,
-			.new_mnt_idmap	= dst_idmap,
-			.new_dir	= d_inode(dst_dent_parent),
-			.new_dentry	= dst_dent,
-		};
-		err = vfs_rename(&rd);
+	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
+		err = -EEXIST;
+		goto out4;
 	}
-	if (err)
-		pr_err("vfs_rename failed err %d\n", err);
-	if (dst_dent)
-		dput(dst_dent);
-out:
-	ksmbd_revert_fsids(work);
-	return err;
-}
 
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
-			char *newname)
-{
-	struct mnt_idmap *idmap;
-	struct path dst_path;
-	struct dentry *src_dent_parent, *dst_dent_parent;
-	struct dentry *src_dent, *trap_dent, *src_child;
-	char *dst_name;
-	int err;
+	if (old_child == trap) {
+		err = -EINVAL;
+		goto out4;
+	}
 
-	dst_name = extract_last_component(newname);
-	if (!dst_name) {
-		dst_name = newname;
-		newname = "";
+	if (new_dentry == trap) {
+		err = -ENOTEMPTY;
+		goto out4;
 	}
 
-	src_dent_parent = dget_parent(fp->filp->f_path.dentry);
-	src_dent = fp->filp->f_path.dentry;
+	rd.old_mnt_idmap	= mnt_idmap(old_path->mnt),
+	rd.old_dir		= d_inode(old_parent),
+	rd.old_dentry		= old_child,
+	rd.new_mnt_idmap	= mnt_idmap(new_path.mnt),
+	rd.new_dir		= new_path.dentry->d_inode,
+	rd.new_dentry		= new_dentry,
+	rd.flags		= flags,
+	err = vfs_rename(&rd);
+	if (err)
+		ksmbd_debug(VFS, "vfs_rename failed err %d\n", err);
 
-	err = ksmbd_vfs_kern_path(work, newname,
-				  LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
-				  &dst_path, false);
-	if (err) {
-		ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err);
-		goto out;
+out4:
+	dput(new_dentry);
+out3:
+	dput(old_parent);
+	unlock_rename(old_parent, new_path.dentry);
+out2:
+	path_put(&new_path);
+
+	if (retry_estale(err, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
 	}
-	dst_dent_parent = dst_path.dentry;
-
-	trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
-	dget(src_dent);
-	dget(dst_dent_parent);
-	idmap = file_mnt_idmap(fp->filp);
-	src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent,
-			       src_dent->d_name.len);
-	if (IS_ERR(src_child)) {
-		err = PTR_ERR(src_child);
-		goto out_lock;
-	}
-
-	if (src_child != src_dent) {
-		err = -ESTALE;
-		dput(src_child);
-		goto out_lock;
-	}
-	dput(src_child);
-
-	err = __ksmbd_vfs_rename(work,
-				 idmap,
-				 src_dent_parent,
-				 src_dent,
-				 mnt_idmap(dst_path.mnt),
-				 dst_dent_parent,
-				 trap_dent,
-				 dst_name);
-out_lock:
-	dput(src_dent);
-	dput(dst_dent_parent);
-	unlock_rename(src_dent_parent, dst_dent_parent);
-	path_put(&dst_path);
-out:
-	dput(src_dent_parent);
+out1:
+	putname(to);
+revert_fsids:
+	ksmbd_revert_fsids(work);
 	return err;
 }
 
@@ -1079,14 +1013,16 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
 	return vfs_removexattr(idmap, dentry, attr_name);
 }
 
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
-		     struct dentry *dir, struct dentry *dentry)
+int ksmbd_vfs_unlink(struct file *filp)
 {
 	int err = 0;
+	struct dentry *dir, *dentry = filp->f_path.dentry;
+	struct mnt_idmap *idmap = file_mnt_idmap(filp);
 
-	err = ksmbd_vfs_lock_parent(idmap, dir, dentry);
+	dir = dget_parent(dentry);
+	err = ksmbd_vfs_lock_parent(dir, dentry);
 	if (err)
-		return err;
+		goto out;
 	dget(dentry);
 
 	if (S_ISDIR(d_inode(dentry)->i_mode))
@@ -1098,6 +1034,8 @@ int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
 	inode_unlock(d_inode(dir));
 	if (err)
 		ksmbd_debug(VFS, "failed to delete, err %d\n", err);
+out:
+	dput(dir);
 
 	return err;
 }
@@ -1200,7 +1138,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
 }
 
 /**
- * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
  * @name:	file path that is relative to share
  * @flags:	lookup flags
  * @path:	if lookup succeed, return path info
@@ -1208,24 +1146,20 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
  *
  * Return:	0 on success, otherwise error
  */
-int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
-			unsigned int flags, struct path *path, bool caseless)
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+			       unsigned int flags, struct path *path,
+			       bool caseless)
 {
 	struct ksmbd_share_config *share_conf = work->tcon->share_conf;
 	int err;
+	struct path parent_path;
 
-	flags |= LOOKUP_BENEATH;
-	err = vfs_path_lookup(share_conf->vfs_path.dentry,
-			      share_conf->vfs_path.mnt,
-			      name,
-			      flags,
-			      path);
+	err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path);
 	if (!err)
-		return 0;
+		return err;
 
 	if (caseless) {
 		char *filepath;
-		struct path parent;
 		size_t path_len, remain_len;
 
 		filepath = kstrdup(name, GFP_KERNEL);
@@ -1235,10 +1169,10 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 		path_len = strlen(filepath);
 		remain_len = path_len;
 
-		parent = share_conf->vfs_path;
-		path_get(&parent);
+		parent_path = share_conf->vfs_path;
+		path_get(&parent_path);
 
-		while (d_can_lookup(parent.dentry)) {
+		while (d_can_lookup(parent_path.dentry)) {
 			char *filename = filepath + path_len - remain_len;
 			char *next = strchrnul(filename, '/');
 			size_t filename_len = next - filename;
@@ -1247,12 +1181,11 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 			if (filename_len == 0)
 				break;
 
-			err = ksmbd_vfs_lookup_in_dir(&parent, filename,
+			err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
 						      filename_len,
 						      work->conn->um);
-			path_put(&parent);
 			if (err)
-				goto out;
+				goto out2;
 
 			next[0] = '\0';
 
@@ -1260,23 +1193,31 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 					      share_conf->vfs_path.mnt,
 					      filepath,
 					      flags,
-					      &parent);
+					      path);
 			if (err)
-				goto out;
-			else if (is_last) {
-				*path = parent;
-				goto out;
-			}
+				goto out2;
+			else if (is_last)
+				goto out1;
+			path_put(&parent_path);
+			parent_path = *path;
 
 			next[0] = '/';
 			remain_len -= filename_len + 1;
 		}
 
-		path_put(&parent);
 		err = -EINVAL;
-out:
+out2:
+		path_put(&parent_path);
+out1:
 		kfree(filepath);
 	}
+
+	if (!err) {
+		err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry);
+		if (err)
+			dput(path->dentry);
+		path_put(&parent_path);
+	}
 	return err;
 }
 
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 9d676ab0cd25..a4ae89f3230d 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -71,9 +71,7 @@ struct ksmbd_kstat {
 	__le32			file_attributes;
 };
 
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
-			  struct dentry *child);
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry);
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
 int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 				   struct dentry *dentry, __le32 *daccess);
 int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode);
@@ -84,12 +82,12 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
 		    char *buf, size_t count, loff_t *pos, bool sync,
 		    ssize_t *written);
 int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path);
 int ksmbd_vfs_link(struct ksmbd_work *work,
 		   const char *oldname, const char *newname);
 int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
-			char *newname);
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+		     char *newname, int flags);
 int ksmbd_vfs_truncate(struct ksmbd_work *work,
 		       struct ksmbd_file *fp, loff_t size);
 struct srv_copychunk;
@@ -116,9 +114,9 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
 				size_t *xattr_stream_name_size, int s_type);
 int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
 			   struct dentry *dentry, char *attr_name);
-int ksmbd_vfs_kern_path(struct ksmbd_work *work,
-			char *name, unsigned int flags, struct path *path,
-			bool caseless);
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+			       unsigned int flags, struct path *path,
+			       bool caseless);
 struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
 					  const char *name,
 					  unsigned int flags,
@@ -131,8 +129,7 @@ struct file_allocated_range_buffer;
 int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
 			 struct file_allocated_range_buffer *ranges,
 			 unsigned int in_count, unsigned int *out_count);
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir,
-		     struct dentry *dentry);
+int ksmbd_vfs_unlink(struct file *filp);
 void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
 int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
 				struct mnt_idmap *idmap,
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 054a7d2e0f48..2d0138e72d78 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -244,7 +244,6 @@ void ksmbd_release_inode_hash(void)
 
 static void __ksmbd_inode_close(struct ksmbd_file *fp)
 {
-	struct dentry *dir, *dentry;
 	struct ksmbd_inode *ci = fp->f_ci;
 	int err;
 	struct file *filp;
@@ -263,11 +262,9 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
 	if (atomic_dec_and_test(&ci->m_count)) {
 		write_lock(&ci->m_lock);
 		if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
-			dentry = filp->f_path.dentry;
-			dir = dentry->d_parent;
 			ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
 			write_unlock(&ci->m_lock);
-			ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry);
+			ksmbd_vfs_unlink(filp);
 			write_lock(&ci->m_lock);
 		}
 		write_unlock(&ci->m_lock);
diff --git a/fs/namei.c b/fs/namei.c
index 6bc1964e2214..c1d59ae5af83 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -254,6 +254,7 @@ getname_kernel(const char * filename)
 
 	return result;
 }
+EXPORT_SYMBOL(getname_kernel);
 
 void putname(struct filename *name)
 {
@@ -271,6 +272,7 @@ void putname(struct filename *name)
 	} else
 		__putname(name);
 }
+EXPORT_SYMBOL(putname);
 
 /**
  * check_acl - perform ACL permission checking
@@ -1581,8 +1583,9 @@ static struct dentry *lookup_dcache(const struct qstr *name,
  * when directory is guaranteed to have no in-lookup children
  * at all.
  */
-static struct dentry *__lookup_hash(const struct qstr *name,
-		struct dentry *base, unsigned int flags)
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+				    struct dentry *base,
+				    unsigned int flags)
 {
 	struct dentry *dentry = lookup_dcache(name, base, flags);
 	struct dentry *old;
@@ -1606,6 +1609,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	}
 	return dentry;
 }
+EXPORT_SYMBOL(lookup_one_qstr_excl);
 
 static struct dentry *lookup_fast(struct nameidata *nd)
 {
@@ -2532,16 +2536,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
 }
 
 /* Note: this does not consume "name" */
-static int filename_parentat(int dfd, struct filename *name,
-			     unsigned int flags, struct path *parent,
-			     struct qstr *last, int *type)
+static int __filename_parentat(int dfd, struct filename *name,
+			       unsigned int flags, struct path *parent,
+			       struct qstr *last, int *type,
+			       const struct path *root)
 {
 	int retval;
 	struct nameidata nd;
 
 	if (IS_ERR(name))
 		return PTR_ERR(name);
-	set_nameidata(&nd, dfd, name, NULL);
+	set_nameidata(&nd, dfd, name, root);
 	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
 	if (unlikely(retval == -ECHILD))
 		retval = path_parentat(&nd, flags, parent);
@@ -2556,6 +2561,13 @@ static int filename_parentat(int dfd, struct filename *name,
 	return retval;
 }
 
+static int filename_parentat(int dfd, struct filename *name,
+			     unsigned int flags, struct path *parent,
+			     struct qstr *last, int *type)
+{
+	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
+}
+
 /* does lookup, returns the object with parent locked */
 static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
 {
@@ -2571,7 +2583,7 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
 		return ERR_PTR(-EINVAL);
 	}
 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	d = __lookup_hash(&last, path->dentry, 0);
+	d = lookup_one_qstr_excl(&last, path->dentry, 0);
 	if (IS_ERR(d)) {
 		inode_unlock(path->dentry->d_inode);
 		path_put(path);
@@ -2599,6 +2611,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
 }
 EXPORT_SYMBOL(kern_path);
 
+/**
+ * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
+ * @filename: filename structure
+ * @flags: lookup flags
+ * @parent: pointer to struct path to fill
+ * @last: last component
+ * @type: type of the last component
+ * @root: pointer to struct path of the base directory
+ */
+int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
+			   struct path *parent, struct qstr *last, int *type,
+			   const struct path *root)
+{
+	return  __filename_parentat(AT_FDCWD, filename, flags, parent, last,
+				    type, root);
+}
+EXPORT_SYMBOL(vfs_path_parent_lookup);
+
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
  * @dentry:  pointer to dentry of the base directory
@@ -3852,7 +3882,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	if (last.name[last.len] && !want_dir)
 		create_flags = 0;
 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
+	dentry = lookup_one_qstr_excl(&last, path->dentry,
+				      reval_flag | create_flags);
 	if (IS_ERR(dentry))
 		goto unlock;
 
@@ -4212,7 +4243,7 @@ retry:
 		goto exit2;
 
 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit3;
@@ -4345,7 +4376,7 @@ retry:
 		goto exit2;
 retry_deleg:
 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
 
@@ -4909,7 +4940,8 @@ retry:
 retry_deleg:
 	trap = lock_rename(new_path.dentry, old_path.dentry);
 
-	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
+	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
+					  lookup_flags);
 	error = PTR_ERR(old_dentry);
 	if (IS_ERR(old_dentry))
 		goto exit3;
@@ -4917,7 +4949,8 @@ retry_deleg:
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
-	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
+	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+					  lookup_flags | target_flags);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto exit4;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 5864e4d82e56..1463cbda4888 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -57,12 +57,18 @@ static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
 	return user_path_at_empty(dfd, name, flags, path, NULL);
 }
 
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+				    struct dentry *base,
+				    unsigned int flags);
 extern int kern_path(const char *, unsigned, struct path *);
 
 extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
+int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
+			   struct path *parent, struct qstr *last, int *type,
+			   const struct path *root);
 int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
 		    unsigned int, struct path *);
 
-- 
cgit v1.2.3


From 32d2a15ec54a3d8b708b343e50072ccb0e5da83f Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 6 Mar 2023 14:59:04 -0600
Subject: platform/chrome: Replace fake flexible arrays with flexible-array
 member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero-length arrays as fake flexible arrays are deprecated and we are
moving towards adopting C99 flexible-array members instead.

Use the DECLARE_FLEX_ARRAY() helper macro to transform zero-length
arrays in unions with flexible-array members.

Address the following warning found with GCC-13 and
-fstrict-flex-arrays=3 enabled:
drivers/iio/accel/cros_ec_accel_legacy.c:66:46: warning: array subscript <unknown> is outside array bounds of ‘struct ec_response_motion_sensor_data[0]’ [-Warray-bounds=]

This helps with the ongoing efforts to tighten the FORTIFY_SOURCE
routines on memcpy() and help us make progress towards globally
enabling -fstrict-flex-arrays=3 [1].

Link: https://github.com/KSPP/linux/issues/21
Link: https://github.com/KSPP/linux/issues/193
Link: https://github.com/KSPP/linux/issues/262
Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1]
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/ZAZUGBmSLc5wg7AK@work
---
 include/linux/platform_data/cros_ec_commands.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 7e9c76aedd2d..ab721cf13a98 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -2701,7 +2701,7 @@ struct ec_response_motion_sense {
 			 * Sensor data is truncated if response_max is too small
 			 * for holding all the data.
 			 */
-			struct ec_response_motion_sensor_data sensor[0];
+			DECLARE_FLEX_ARRAY(struct ec_response_motion_sensor_data, sensor);
 		} dump;
 
 		/* Used for MOTIONSENSE_CMD_INFO. */
-- 
cgit v1.2.3


From 7002cbd625467084f1ef01b6e365e10b51fc4b9f Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sun, 23 Apr 2023 20:10:02 +0200
Subject: ALSA: emu10k1: use high-level I/O in set_filterQ()

This makes the code shorter and more legible.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Link: https://lore.kernel.org/r/20230423181002.1246793-2-oswald.buddenhagen@gmx.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h              | 3 ++-
 sound/pci/emu10k1/emu10k1_callback.c | 5 +----
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 05a09826eef0..8fe80dcee71b 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -429,7 +429,8 @@
 #define DSL_LOOPENDADDR		0x18000007
 
 #define CCCA			0x08		/* Filter Q, interp. ROM, byte size, cur. addr register */
-#define CCCA_RESONANCE		0xf0000000	/* Lowpass filter resonance (Q) height			*/
+#define CCCA_RESONANCE_MASK	0xf0000000	/* Lowpass filter resonance (Q) height			*/
+#define CCCA_RESONANCE		0x041c0008
 #define CCCA_INTERPROM_MASK	0x0e000000	/* Selects passband of interpolation ROM		*/
 						/* 1 == full band, 7 == lowpass				*/
 						/* ROM 0 is used when pitch shifting downward or less	*/
diff --git a/sound/pci/emu10k1/emu10k1_callback.c b/sound/pci/emu10k1/emu10k1_callback.c
index dba1e9fc2eec..c6d152575181 100644
--- a/sound/pci/emu10k1/emu10k1_callback.c
+++ b/sound/pci/emu10k1/emu10k1_callback.c
@@ -531,8 +531,5 @@ set_fm2frq2(struct snd_emu10k1 *hw, struct snd_emux_voice *vp)
 static void
 set_filterQ(struct snd_emu10k1 *hw, struct snd_emux_voice *vp)
 {
-	unsigned int val;
-	val = snd_emu10k1_ptr_read(hw, CCCA, vp->ch) & ~CCCA_RESONANCE;
-	val |= (vp->reg.parm.filterQ << 28);
-	snd_emu10k1_ptr_write(hw, CCCA, vp->ch, val);
+	snd_emu10k1_ptr_write(hw, CCCA_RESONANCE, vp->ch, vp->reg.parm.filterQ);
 }
-- 
cgit v1.2.3


From bfff83c4824308f7acce47baf88858b52bc1314f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 28 Mar 2023 19:15:29 -0700
Subject: linux/vt_buffer.h: allow either builtin or modular for macros

Fix build errors on ARCH=alpha when CONFIG_MDA_CONSOLE=m.
This allows the ARCH macros to be the only ones defined.

In file included from ../drivers/video/console/mdacon.c:37:
../arch/alpha/include/asm/vga.h:17:40: error: expected identifier or '(' before 'volatile'
   17 | static inline void scr_writew(u16 val, volatile u16 *addr)
      |                                        ^~~~~~~~
../include/linux/vt_buffer.h:24:34: note: in definition of macro 'scr_writew'
   24 | #define scr_writew(val, addr) (*(addr) = (val))
      |                                  ^~~~
../include/linux/vt_buffer.h:24:40: error: expected ')' before '=' token
   24 | #define scr_writew(val, addr) (*(addr) = (val))
      |                                        ^
../arch/alpha/include/asm/vga.h:17:20: note: in expansion of macro 'scr_writew'
   17 | static inline void scr_writew(u16 val, volatile u16 *addr)
      |                    ^~~~~~~~~~
../arch/alpha/include/asm/vga.h:25:29: error: expected identifier or '(' before 'volatile'
   25 | static inline u16 scr_readw(volatile const u16 *addr)
      |                             ^~~~~~~~

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-fbdev@vger.kernel.org
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/vt_buffer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index 848db1b1569f..919d999a8c1d 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -16,7 +16,7 @@
 
 #include <linux/string.h>
 
-#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE)
+#if IS_ENABLED(CONFIG_VGA_CONSOLE) || IS_ENABLED(CONFIG_MDA_CONSOLE)
 #include <asm/vga.h>
 #endif
 
-- 
cgit v1.2.3


From 7f8da9915fcc6386edf86471bf31e162845930a4 Mon Sep 17 00:00:00 2001
From: Eric Snowberg <eric.snowberg@oracle.com>
Date: Thu, 2 Mar 2023 11:46:47 -0500
Subject: KEYS: Create static version of public_key_verify_signature

The kernel test robot reports undefined reference to
public_key_verify_signature when CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE is
not defined. Create a static version in this case and return -EINVAL.

Fixes: db6c43bd2132 ("crypto: KEYS: convert public key and digsig asym to the akcipher api")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/crypto/public_key.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 68f7aa2a7e55..6d61695e1cde 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -80,7 +80,16 @@ extern int create_signature(struct kernel_pkey_params *, const void *, void *);
 extern int verify_signature(const struct key *,
 			    const struct public_key_signature *);
 
+#if IS_REACHABLE(CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE)
 int public_key_verify_signature(const struct public_key *pkey,
 				const struct public_key_signature *sig);
+#else
+static inline
+int public_key_verify_signature(const struct public_key *pkey,
+				const struct public_key_signature *sig)
+{
+	return -EINVAL;
+}
+#endif
 
 #endif /* _LINUX_PUBLIC_KEY_H */
-- 
cgit v1.2.3


From 30eae2b037af54b24109dcaea21db46f6285c69b Mon Sep 17 00:00:00 2001
From: Eric Snowberg <eric.snowberg@oracle.com>
Date: Thu, 2 Mar 2023 11:46:49 -0500
Subject: KEYS: X.509: Parse Basic Constraints for CA

Parse the X.509 Basic Constraints.  The basic constraints extension
identifies whether the subject of the certificate is a CA.

BasicConstraints ::= SEQUENCE {
        cA                      BOOLEAN DEFAULT FALSE,
        pathLenConstraint       INTEGER (0..MAX) OPTIONAL }

If the CA is true, store it in the public_key.  This will be used
in a follow on patch that requires knowing if the public key is a CA.

Link: https://www.rfc-editor.org/rfc/rfc5280#section-4.2.1.9
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 22 ++++++++++++++++++++++
 include/crypto/public_key.h               |  2 ++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 7a9b084e2043..77547d4bd94d 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -586,6 +586,28 @@ int x509_process_extension(void *context, size_t hdrlen,
 		return 0;
 	}
 
+	if (ctx->last_oid == OID_basicConstraints) {
+		/*
+		 * Get hold of the basicConstraints
+		 * v[1] is the encoding size
+		 *	(Expect 0x2 or greater, making it 1 or more bytes)
+		 * v[2] is the encoding type
+		 *	(Expect an ASN1_BOOL for the CA)
+		 * v[3] is the contents of the ASN1_BOOL
+		 *      (Expect 1 if the CA is TRUE)
+		 * vlen should match the entire extension size
+		 */
+		if (v[0] != (ASN1_CONS_BIT | ASN1_SEQ))
+			return -EBADMSG;
+		if (vlen < 2)
+			return -EBADMSG;
+		if (v[1] != vlen - 2)
+			return -EBADMSG;
+		if (vlen >= 4 && v[1] != 0 && v[2] == ASN1_BOOL && v[3] == 1)
+			ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_CA;
+		return 0;
+	}
+
 	return 0;
 }
 
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 6d61695e1cde..c401762850f2 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -28,6 +28,8 @@ struct public_key {
 	bool key_is_private;
 	const char *id_type;
 	const char *pkey_algo;
+	unsigned long key_eflags;	/* key extension flags */
+#define KEY_EFLAG_CA		0	/* set if the CA basic constraints is set */
 };
 
 extern void public_key_free(struct public_key *key);
-- 
cgit v1.2.3


From 567671281a751b80918a4531c4ba84b90a2a42c0 Mon Sep 17 00:00:00 2001
From: Eric Snowberg <eric.snowberg@oracle.com>
Date: Thu, 2 Mar 2023 11:46:50 -0500
Subject: KEYS: X.509: Parse Key Usage

Parse the X.509 Key Usage.  The key usage extension defines the purpose of
the key contained in the certificate.

   id-ce-keyUsage OBJECT IDENTIFIER ::=  { id-ce 15 }

      KeyUsage ::= BIT STRING {
           digitalSignature        (0),
           contentCommitment       (1),
           keyEncipherment         (2),
           dataEncipherment        (3),
           keyAgreement            (4),
           keyCertSign             (5),
           cRLSign                 (6),
           encipherOnly            (7),
           decipherOnly            (8) }

If the keyCertSign or digitalSignature is set, store it in the
public_key structure. Having the purpose of the key being stored
during parsing, allows enforcement on the usage field in the future.
This will be used in a follow on patch that requires knowing the
certificate key usage type.

Link: https://www.rfc-editor.org/rfc/rfc5280#section-4.2.1.3
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 28 ++++++++++++++++++++++++++++
 include/crypto/public_key.h               |  2 ++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 77547d4bd94d..0a7049b470c1 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -579,6 +579,34 @@ int x509_process_extension(void *context, size_t hdrlen,
 		return 0;
 	}
 
+	if (ctx->last_oid == OID_keyUsage) {
+		/*
+		 * Get hold of the keyUsage bit string
+		 * v[1] is the encoding size
+		 *       (Expect either 0x02 or 0x03, making it 1 or 2 bytes)
+		 * v[2] is the number of unused bits in the bit string
+		 *       (If >= 3 keyCertSign is missing when v[1] = 0x02)
+		 * v[3] and possibly v[4] contain the bit string
+		 *
+		 * From RFC 5280 4.2.1.3:
+		 *   0x04 is where keyCertSign lands in this bit string
+		 *   0x80 is where digitalSignature lands in this bit string
+		 */
+		if (v[0] != ASN1_BTS)
+			return -EBADMSG;
+		if (vlen < 4)
+			return -EBADMSG;
+		if (v[2] >= 8)
+			return -EBADMSG;
+		if (v[3] & 0x80)
+			ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_DIGITALSIG;
+		if (v[1] == 0x02 && v[2] <= 2 && (v[3] & 0x04))
+			ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN;
+		else if (vlen > 4 && v[1] == 0x03 && (v[3] & 0x04))
+			ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN;
+		return 0;
+	}
+
 	if (ctx->last_oid == OID_authorityKeyIdentifier) {
 		/* Get hold of the CA key fingerprint */
 		ctx->raw_akid = v;
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index c401762850f2..03c3fb990d59 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -30,6 +30,8 @@ struct public_key {
 	const char *pkey_algo;
 	unsigned long key_eflags;	/* key extension flags */
 #define KEY_EFLAG_CA		0	/* set if the CA basic constraints is set */
+#define KEY_EFLAG_DIGITALSIG	1	/* set if the digitalSignature usage is set */
+#define KEY_EFLAG_KEYCERTSIGN	2	/* set if the keyCertSign usage is set */
 };
 
 extern void public_key_free(struct public_key *key);
-- 
cgit v1.2.3


From 76adb2fbc69a13c80b39042aab4d34e99309c8d4 Mon Sep 17 00:00:00 2001
From: Eric Snowberg <eric.snowberg@oracle.com>
Date: Thu, 2 Mar 2023 11:46:51 -0500
Subject: KEYS: CA link restriction

Add a new link restriction.  Restrict the addition of keys in a keyring
based on the key to be added being a CA.

Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 crypto/asymmetric_keys/restrict.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/crypto/public_key.h       | 15 +++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/crypto/asymmetric_keys/restrict.c b/crypto/asymmetric_keys/restrict.c
index 6b1ac5f5896a..48457c6f33f9 100644
--- a/crypto/asymmetric_keys/restrict.c
+++ b/crypto/asymmetric_keys/restrict.c
@@ -108,6 +108,44 @@ int restrict_link_by_signature(struct key *dest_keyring,
 	return ret;
 }
 
+/**
+ * restrict_link_by_ca - Restrict additions to a ring of CA keys
+ * @dest_keyring: Keyring being linked to.
+ * @type: The type of key being added.
+ * @payload: The payload of the new key.
+ * @trust_keyring: Unused.
+ *
+ * Check if the new certificate is a CA. If it is a CA, then mark the new
+ * certificate as being ok to link.
+ *
+ * Returns 0 if the new certificate was accepted, -ENOKEY if the
+ * certificate is not a CA. -ENOPKG if the signature uses unsupported
+ * crypto, or some other error if there is a matching certificate but
+ * the signature check cannot be performed.
+ */
+int restrict_link_by_ca(struct key *dest_keyring,
+			const struct key_type *type,
+			const union key_payload *payload,
+			struct key *trust_keyring)
+{
+	const struct public_key *pkey;
+
+	if (type != &key_type_asymmetric)
+		return -EOPNOTSUPP;
+
+	pkey = payload->data[asym_crypto];
+	if (!pkey)
+		return -ENOPKG;
+	if (!test_bit(KEY_EFLAG_CA, &pkey->key_eflags))
+		return -ENOKEY;
+	if (!test_bit(KEY_EFLAG_KEYCERTSIGN, &pkey->key_eflags))
+		return -ENOKEY;
+	if (test_bit(KEY_EFLAG_DIGITALSIG, &pkey->key_eflags))
+		return -ENOKEY;
+
+	return 0;
+}
+
 static bool match_either_id(const struct asymmetric_key_id **pair,
 			    const struct asymmetric_key_id *single)
 {
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 03c3fb990d59..653992a6e941 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -75,6 +75,21 @@ extern int restrict_link_by_key_or_keyring_chain(struct key *trust_keyring,
 						 const union key_payload *payload,
 						 struct key *trusted);
 
+#if IS_REACHABLE(CONFIG_ASYMMETRIC_KEY_TYPE)
+extern int restrict_link_by_ca(struct key *dest_keyring,
+			       const struct key_type *type,
+			       const union key_payload *payload,
+			       struct key *trust_keyring);
+#else
+static inline int restrict_link_by_ca(struct key *dest_keyring,
+				      const struct key_type *type,
+				      const union key_payload *payload,
+				      struct key *trust_keyring)
+{
+	return 0;
+}
+#endif
+
 extern int query_asymmetric_key(const struct kernel_pkey_params *,
 				struct kernel_pkey_query *);
 
-- 
cgit v1.2.3


From 32c2f2a46db1322caaf78d5ea747ed5c56d2e353 Mon Sep 17 00:00:00 2001
From: Andrew Halaney <ahalaney@redhat.com>
Date: Thu, 13 Apr 2023 14:15:39 -0500
Subject: clk: qcom: gcc-sc8280xp: Add EMAC GDSCs

Add the EMAC GDSCs to allow the EMAC hardware to be enabled.

Acked-by: Stephen Boyd <sboyd@kernel.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Tested-by: Brian Masney <bmasney@redhat.com>
Signed-off-by: Andrew Halaney <ahalaney@redhat.com>
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230413191541.1073027-2-ahalaney@redhat.com
---
 drivers/clk/qcom/gcc-sc8280xp.c               | 18 ++++++++++++++++++
 include/dt-bindings/clock/qcom,gcc-sc8280xp.h |  2 ++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/qcom/gcc-sc8280xp.c b/drivers/clk/qcom/gcc-sc8280xp.c
index b3198784e1c3..04a99dbaa57e 100644
--- a/drivers/clk/qcom/gcc-sc8280xp.c
+++ b/drivers/clk/qcom/gcc-sc8280xp.c
@@ -6873,6 +6873,22 @@ static struct gdsc usb30_sec_gdsc = {
 	.pwrsts = PWRSTS_RET_ON,
 };
 
+static struct gdsc emac_0_gdsc = {
+	.gdscr = 0xaa004,
+	.pd = {
+		.name = "emac_0_gdsc",
+	},
+	.pwrsts = PWRSTS_OFF_ON,
+};
+
+static struct gdsc emac_1_gdsc = {
+	.gdscr = 0xba004,
+	.pd = {
+		.name = "emac_1_gdsc",
+	},
+	.pwrsts = PWRSTS_OFF_ON,
+};
+
 static struct clk_regmap *gcc_sc8280xp_clocks[] = {
 	[GCC_AGGRE_NOC_PCIE0_TUNNEL_AXI_CLK] = &gcc_aggre_noc_pcie0_tunnel_axi_clk.clkr,
 	[GCC_AGGRE_NOC_PCIE1_TUNNEL_AXI_CLK] = &gcc_aggre_noc_pcie1_tunnel_axi_clk.clkr,
@@ -7351,6 +7367,8 @@ static struct gdsc *gcc_sc8280xp_gdscs[] = {
 	[USB30_MP_GDSC] = &usb30_mp_gdsc,
 	[USB30_PRIM_GDSC] = &usb30_prim_gdsc,
 	[USB30_SEC_GDSC] = &usb30_sec_gdsc,
+	[EMAC_0_GDSC] = &emac_0_gdsc,
+	[EMAC_1_GDSC] = &emac_1_gdsc,
 };
 
 static const struct clk_rcg_dfs_data gcc_dfs_clocks[] = {
diff --git a/include/dt-bindings/clock/qcom,gcc-sc8280xp.h b/include/dt-bindings/clock/qcom,gcc-sc8280xp.h
index cb2fb638825c..721105ea4fad 100644
--- a/include/dt-bindings/clock/qcom,gcc-sc8280xp.h
+++ b/include/dt-bindings/clock/qcom,gcc-sc8280xp.h
@@ -492,5 +492,7 @@
 #define USB30_MP_GDSC					9
 #define USB30_PRIM_GDSC					10
 #define USB30_SEC_GDSC					11
+#define EMAC_0_GDSC					12
+#define EMAC_1_GDSC					13
 
 #endif
-- 
cgit v1.2.3


From f68a40ee4732731f149961abab27a45b6c11f413 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Tue, 7 Mar 2023 09:43:13 -0300
Subject: clocksource/drivers/timer-imx-gpt: Remove non-DT function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mxc_timer_init() was originally only used by non-DT i.MX platforms.

i.MX has already been converted to be a DT-only platform.

Remove the unused mxc_timer_init() function.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20230307124313.708255-1-festevam@denx.de
---
 drivers/clocksource/timer-imx-gpt.c | 19 -------------------
 include/soc/imx/timer.h             |  7 -------
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/drivers/clocksource/timer-imx-gpt.c b/drivers/clocksource/timer-imx-gpt.c
index 7b2c70f2f353..ca3e4cbc80c6 100644
--- a/drivers/clocksource/timer-imx-gpt.c
+++ b/drivers/clocksource/timer-imx-gpt.c
@@ -420,25 +420,6 @@ static int __init _mxc_timer_init(struct imx_timer *imxtm)
 	return mxc_clockevent_init(imxtm);
 }
 
-void __init mxc_timer_init(unsigned long pbase, int irq, enum imx_gpt_type type)
-{
-	struct imx_timer *imxtm;
-
-	imxtm = kzalloc(sizeof(*imxtm), GFP_KERNEL);
-	BUG_ON(!imxtm);
-
-	imxtm->clk_per = clk_get_sys("imx-gpt.0", "per");
-	imxtm->clk_ipg = clk_get_sys("imx-gpt.0", "ipg");
-
-	imxtm->base = ioremap(pbase, SZ_4K);
-	BUG_ON(!imxtm->base);
-
-	imxtm->type = type;
-	imxtm->irq = irq;
-
-	_mxc_timer_init(imxtm);
-}
-
 static int __init mxc_timer_init_dt(struct device_node *np,  enum imx_gpt_type type)
 {
 	struct imx_timer *imxtm;
diff --git a/include/soc/imx/timer.h b/include/soc/imx/timer.h
index b888d5076b4d..25f29c6bbd0b 100644
--- a/include/soc/imx/timer.h
+++ b/include/soc/imx/timer.h
@@ -13,11 +13,4 @@ enum imx_gpt_type {
 	GPT_TYPE_IMX6DL,	/* i.MX6DL/SX/SL */
 };
 
-/*
- * This is a stop-gap solution for clock drivers like imx1/imx21 which call
- * mxc_timer_init() to initialize timer for non-DT boot.  It can be removed
- * when these legacy non-DT support is converted or dropped.
- */
-void mxc_timer_init(unsigned long pbase, int irq, enum imx_gpt_type type);
-
 #endif  /* __SOC_IMX_TIMER_H__ */
-- 
cgit v1.2.3


From 96928d9032a7c34f12a88df879665562bcebf59a Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <senozhatsky@chromium.org>
Date: Sat, 15 Apr 2023 19:01:10 +0900
Subject: seq_buf: Add seq_buf_do_printk() helper

Sometimes we use seq_buf to format a string buffer, which
we then pass to printk(). However, in certain situations
the seq_buf string buffer can get too big, exceeding the
PRINTKRB_RECORD_MAX bytes limit, and causing printk() to
truncate the string.

Add a new seq_buf helper. This helper prints the seq_buf
string buffer line by line, using \n as a delimiter,
rather than passing the whole string buffer to printk()
at once.

Link: https://lkml.kernel.org/r/20230415100110.1419872-1-senozhatsky@chromium.org

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h |  2 ++
 lib/seq_buf.c           | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 5b31c5147969..515d7fcb9634 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -159,4 +159,6 @@ extern int
 seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
 #endif
 
+void seq_buf_do_printk(struct seq_buf *s, const char *lvl);
+
 #endif /* _LINUX_SEQ_BUF_H */
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 0a68f7aa85d6..45c450f423fa 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -93,6 +93,38 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(seq_buf_printf);
 
+/**
+ * seq_buf_do_printk - printk seq_buf line by line
+ * @s: seq_buf descriptor
+ * @lvl: printk level
+ *
+ * printk()-s a multi-line sequential buffer line by line. The function
+ * makes sure that the buffer in @s is nul terminated and safe to read
+ * as a string.
+ */
+void seq_buf_do_printk(struct seq_buf *s, const char *lvl)
+{
+	const char *start, *lf;
+
+	if (s->size == 0 || s->len == 0)
+		return;
+
+	seq_buf_terminate(s);
+
+	start = s->buffer;
+	while ((lf = strchr(start, '\n'))) {
+		int len = lf - start + 1;
+
+		printk("%s%.*s", lvl, len, start);
+		start = ++lf;
+	}
+
+	/* No trailing LF */
+	if (start < s->buffer + s->len)
+		printk("%s%s\n", lvl, start);
+}
+EXPORT_SYMBOL_GPL(seq_buf_do_printk);
+
 #ifdef CONFIG_BINARY_PRINTF
 /**
  * seq_buf_bprintf - Write the printf string from binary arguments
-- 
cgit v1.2.3


From f736d2c0caa8348b0bcd897972e470f7a596caad Mon Sep 17 00:00:00 2001
From: David Virag <virag.david003@gmail.com>
Date: Tue, 31 Jan 2023 19:30:06 +0100
Subject: mfd: sec: Remove PMICs without compatibles

The S5M8751 and S5M8763 PMIC chips have no corresponding compatible
values, so since board file support was removed for this driver, there
is no way to specify these PMICs as present in boards anymore.
Remove leftovers of these chips since it's dead code.

Signed-off-by: David Virag <virag.david003@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230131183008.4451-2-virag.david003@gmail.com
---
 drivers/mfd/sec-core.c              | 46 -------------------
 drivers/mfd/sec-irq.c               | 89 ------------------------------------
 include/linux/mfd/samsung/core.h    |  1 -
 include/linux/mfd/samsung/s5m8763.h | 90 -------------------------------------
 4 files changed, 226 deletions(-)
 delete mode 100644 include/linux/mfd/samsung/s5m8763.h

(limited to 'include')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index b03edda56009..c2d0ed496959 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -24,22 +24,9 @@
 #include <linux/mfd/samsung/s2mps14.h>
 #include <linux/mfd/samsung/s2mps15.h>
 #include <linux/mfd/samsung/s2mpu02.h>
-#include <linux/mfd/samsung/s5m8763.h>
 #include <linux/mfd/samsung/s5m8767.h>
 #include <linux/regmap.h>
 
-static const struct mfd_cell s5m8751_devs[] = {
-	{ .name = "s5m8751-pmic", },
-	{ .name = "s5m-charger", },
-	{ .name = "s5m8751-codec", },
-};
-
-static const struct mfd_cell s5m8763_devs[] = {
-	{ .name = "s5m8763-pmic", },
-	{ .name = "s5m-rtc", },
-	{ .name = "s5m-charger", },
-};
-
 static const struct mfd_cell s5m8767_devs[] = {
 	{ .name = "s5m8767-pmic", },
 	{ .name = "s5m-rtc", },
@@ -158,19 +145,6 @@ static bool s2mpu02_volatile(struct device *dev, unsigned int reg)
 	}
 }
 
-static bool s5m8763_volatile(struct device *dev, unsigned int reg)
-{
-	switch (reg) {
-	case S5M8763_REG_IRQM1:
-	case S5M8763_REG_IRQM2:
-	case S5M8763_REG_IRQM3:
-	case S5M8763_REG_IRQM4:
-		return false;
-	default:
-		return true;
-	}
-}
-
 static const struct regmap_config sec_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -230,15 +204,6 @@ static const struct regmap_config s2mpu02_regmap_config = {
 	.cache_type = REGCACHE_FLAT,
 };
 
-static const struct regmap_config s5m8763_regmap_config = {
-	.reg_bits = 8,
-	.val_bits = 8,
-
-	.max_register = S5M8763_REG_LBCNFG2,
-	.volatile_reg = s5m8763_volatile,
-	.cache_type = REGCACHE_FLAT,
-};
-
 static const struct regmap_config s5m8767_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -348,9 +313,6 @@ static int sec_pmic_probe(struct i2c_client *i2c)
 	case S2MPS15X:
 		regmap = &s2mps15_regmap_config;
 		break;
-	case S5M8763X:
-		regmap = &s5m8763_regmap_config;
-		break;
 	case S5M8767X:
 		regmap = &s5m8767_regmap_config;
 		break;
@@ -375,14 +337,6 @@ static int sec_pmic_probe(struct i2c_client *i2c)
 	pm_runtime_set_active(sec_pmic->dev);
 
 	switch (sec_pmic->device_type) {
-	case S5M8751X:
-		sec_devs = s5m8751_devs;
-		num_sec_devs = ARRAY_SIZE(s5m8751_devs);
-		break;
-	case S5M8763X:
-		sec_devs = s5m8763_devs;
-		num_sec_devs = ARRAY_SIZE(s5m8763_devs);
-		break;
 	case S5M8767X:
 		sec_devs = s5m8767_devs;
 		num_sec_devs = ARRAY_SIZE(s5m8767_devs);
diff --git a/drivers/mfd/sec-irq.c b/drivers/mfd/sec-irq.c
index f5f59fdc72fe..e191aeb0c07c 100644
--- a/drivers/mfd/sec-irq.c
+++ b/drivers/mfd/sec-irq.c
@@ -14,7 +14,6 @@
 #include <linux/mfd/samsung/s2mps11.h>
 #include <linux/mfd/samsung/s2mps14.h>
 #include <linux/mfd/samsung/s2mpu02.h>
-#include <linux/mfd/samsung/s5m8763.h>
 #include <linux/mfd/samsung/s5m8767.h>
 
 static const struct regmap_irq s2mps11_irqs[] = {
@@ -297,81 +296,6 @@ static const struct regmap_irq s5m8767_irqs[] = {
 	},
 };
 
-static const struct regmap_irq s5m8763_irqs[] = {
-	[S5M8763_IRQ_DCINF] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_DCINF_MASK,
-	},
-	[S5M8763_IRQ_DCINR] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_DCINR_MASK,
-	},
-	[S5M8763_IRQ_JIGF] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_JIGF_MASK,
-	},
-	[S5M8763_IRQ_JIGR] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_JIGR_MASK,
-	},
-	[S5M8763_IRQ_PWRONF] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_PWRONF_MASK,
-	},
-	[S5M8763_IRQ_PWRONR] = {
-		.reg_offset = 0,
-		.mask = S5M8763_IRQ_PWRONR_MASK,
-	},
-	[S5M8763_IRQ_WTSREVNT] = {
-		.reg_offset = 1,
-		.mask = S5M8763_IRQ_WTSREVNT_MASK,
-	},
-	[S5M8763_IRQ_SMPLEVNT] = {
-		.reg_offset = 1,
-		.mask = S5M8763_IRQ_SMPLEVNT_MASK,
-	},
-	[S5M8763_IRQ_ALARM1] = {
-		.reg_offset = 1,
-		.mask = S5M8763_IRQ_ALARM1_MASK,
-	},
-	[S5M8763_IRQ_ALARM0] = {
-		.reg_offset = 1,
-		.mask = S5M8763_IRQ_ALARM0_MASK,
-	},
-	[S5M8763_IRQ_ONKEY1S] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_ONKEY1S_MASK,
-	},
-	[S5M8763_IRQ_TOPOFFR] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_TOPOFFR_MASK,
-	},
-	[S5M8763_IRQ_DCINOVPR] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_DCINOVPR_MASK,
-	},
-	[S5M8763_IRQ_CHGRSTF] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_CHGRSTF_MASK,
-	},
-	[S5M8763_IRQ_DONER] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_DONER_MASK,
-	},
-	[S5M8763_IRQ_CHGFAULT] = {
-		.reg_offset = 2,
-		.mask = S5M8763_IRQ_CHGFAULT_MASK,
-	},
-	[S5M8763_IRQ_LOBAT1] = {
-		.reg_offset = 3,
-		.mask = S5M8763_IRQ_LOBAT1_MASK,
-	},
-	[S5M8763_IRQ_LOBAT2] = {
-		.reg_offset = 3,
-		.mask = S5M8763_IRQ_LOBAT2_MASK,
-	},
-};
-
 static const struct regmap_irq_chip s2mps11_irq_chip = {
 	.name = "s2mps11",
 	.irqs = s2mps11_irqs,
@@ -425,16 +349,6 @@ static const struct regmap_irq_chip s5m8767_irq_chip = {
 	.ack_base = S5M8767_REG_INT1,
 };
 
-static const struct regmap_irq_chip s5m8763_irq_chip = {
-	.name = "s5m8763",
-	.irqs = s5m8763_irqs,
-	.num_irqs = ARRAY_SIZE(s5m8763_irqs),
-	.num_regs = 4,
-	.status_base = S5M8763_REG_IRQ1,
-	.mask_base = S5M8763_REG_IRQM1,
-	.ack_base = S5M8763_REG_IRQ1,
-};
-
 int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 {
 	int ret = 0;
@@ -448,9 +362,6 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 	}
 
 	switch (type) {
-	case S5M8763X:
-		sec_irq_chip = &s5m8763_irq_chip;
-		break;
 	case S5M8767X:
 		sec_irq_chip = &s5m8767_irq_chip;
 		break;
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index f92fe090473d..07aae649a86f 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -36,7 +36,6 @@
 struct gpio_desc;
 
 enum sec_device_type {
-	S5M8751X,
 	S5M8763X,
 	S5M8767X,
 	S2MPA01,
diff --git a/include/linux/mfd/samsung/s5m8763.h b/include/linux/mfd/samsung/s5m8763.h
deleted file mode 100644
index c534f086ca16..000000000000
--- a/include/linux/mfd/samsung/s5m8763.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright (c) 2011 Samsung Electronics Co., Ltd
- *              http://www.samsung.com
- */
-
-#ifndef __LINUX_MFD_S5M8763_H
-#define __LINUX_MFD_S5M8763_H
-
-/* S5M8763 registers */
-enum s5m8763_reg {
-	S5M8763_REG_IRQ1,
-	S5M8763_REG_IRQ2,
-	S5M8763_REG_IRQ3,
-	S5M8763_REG_IRQ4,
-	S5M8763_REG_IRQM1,
-	S5M8763_REG_IRQM2,
-	S5M8763_REG_IRQM3,
-	S5M8763_REG_IRQM4,
-	S5M8763_REG_STATUS1,
-	S5M8763_REG_STATUS2,
-	S5M8763_REG_STATUSM1,
-	S5M8763_REG_STATUSM2,
-	S5M8763_REG_CHGR1,
-	S5M8763_REG_CHGR2,
-	S5M8763_REG_LDO_ACTIVE_DISCHARGE1,
-	S5M8763_REG_LDO_ACTIVE_DISCHARGE2,
-	S5M8763_REG_BUCK_ACTIVE_DISCHARGE3,
-	S5M8763_REG_ONOFF1,
-	S5M8763_REG_ONOFF2,
-	S5M8763_REG_ONOFF3,
-	S5M8763_REG_ONOFF4,
-	S5M8763_REG_BUCK1_VOLTAGE1,
-	S5M8763_REG_BUCK1_VOLTAGE2,
-	S5M8763_REG_BUCK1_VOLTAGE3,
-	S5M8763_REG_BUCK1_VOLTAGE4,
-	S5M8763_REG_BUCK2_VOLTAGE1,
-	S5M8763_REG_BUCK2_VOLTAGE2,
-	S5M8763_REG_BUCK3,
-	S5M8763_REG_BUCK4,
-	S5M8763_REG_LDO1_LDO2,
-	S5M8763_REG_LDO3,
-	S5M8763_REG_LDO4,
-	S5M8763_REG_LDO5,
-	S5M8763_REG_LDO6,
-	S5M8763_REG_LDO7,
-	S5M8763_REG_LDO7_LDO8,
-	S5M8763_REG_LDO9_LDO10,
-	S5M8763_REG_LDO11,
-	S5M8763_REG_LDO12,
-	S5M8763_REG_LDO13,
-	S5M8763_REG_LDO14,
-	S5M8763_REG_LDO15,
-	S5M8763_REG_LDO16,
-	S5M8763_REG_BKCHR,
-	S5M8763_REG_LBCNFG1,
-	S5M8763_REG_LBCNFG2,
-};
-
-/* S5M8763 regulator ids */
-enum s5m8763_regulators {
-	S5M8763_LDO1,
-	S5M8763_LDO2,
-	S5M8763_LDO3,
-	S5M8763_LDO4,
-	S5M8763_LDO5,
-	S5M8763_LDO6,
-	S5M8763_LDO7,
-	S5M8763_LDO8,
-	S5M8763_LDO9,
-	S5M8763_LDO10,
-	S5M8763_LDO11,
-	S5M8763_LDO12,
-	S5M8763_LDO13,
-	S5M8763_LDO14,
-	S5M8763_LDO15,
-	S5M8763_LDO16,
-	S5M8763_BUCK1,
-	S5M8763_BUCK2,
-	S5M8763_BUCK3,
-	S5M8763_BUCK4,
-	S5M8763_AP_EN32KHZ,
-	S5M8763_CP_EN32KHZ,
-	S5M8763_ENCHGVI,
-	S5M8763_ESAFEUSB1,
-	S5M8763_ESAFEUSB2,
-};
-
-#define S5M8763_ENRAMP                  (1 << 4)
-#endif /* __LINUX_MFD_S5M8763_H */
-- 
cgit v1.2.3


From a3165abaa9bda3e729b434a3e684fcfa205f252a Mon Sep 17 00:00:00 2001
From: David Virag <virag.david003@gmail.com>
Date: Tue, 31 Jan 2023 19:30:07 +0100
Subject: rtc: s5m: Drop S5M8763 support

The S5M8763 MFD has no device tree compatible, and since board file
support for it was removed, there's no way to use this MFD. After
removing the remaining code for it from the MFD driver, also remove
support for it in the s5m RTC driver, and all remaining references to
it.

Signed-off-by: David Virag <virag.david003@gmail.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230131183008.4451-3-virag.david003@gmail.com
---
 drivers/rtc/rtc-s5m.c            | 82 ++--------------------------------------
 include/linux/mfd/samsung/core.h |  1 -
 include/linux/mfd/samsung/irq.h  | 50 ------------------------
 3 files changed, 3 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c
index 4243fe6d3842..dad294a0ce2a 100644
--- a/drivers/rtc/rtc-s5m.c
+++ b/drivers/rtc/rtc-s5m.c
@@ -85,7 +85,7 @@ struct s5m_rtc_reg_config {
 	unsigned int write_alarm_udr_mask;
 };
 
-/* Register map for S5M8763 and S5M8767 */
+/* Register map for S5M8767 */
 static const struct s5m_rtc_reg_config s5m_rtc_regs = {
 	.regs_count		= 8,
 	.time			= S5M_RTC_SEC,
@@ -236,7 +236,6 @@ static int s5m_check_peding_alarm_interrupt(struct s5m_rtc_info *info,
 
 	switch (info->device_type) {
 	case S5M8767X:
-	case S5M8763X:
 		ret = regmap_read(info->regmap, S5M_RTC_STATUS, &val);
 		val &= S5M_ALARM0_STATUS;
 		break;
@@ -299,7 +298,6 @@ static int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info)
 
 	data |= info->regs->write_alarm_udr_mask;
 	switch (info->device_type) {
-	case S5M8763X:
 	case S5M8767X:
 		data &= ~S5M_RTC_TIME_EN_MASK;
 		break;
@@ -329,38 +327,6 @@ static int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info)
 	return ret;
 }
 
-static void s5m8763_data_to_tm(u8 *data, struct rtc_time *tm)
-{
-	tm->tm_sec = bcd2bin(data[RTC_SEC]);
-	tm->tm_min = bcd2bin(data[RTC_MIN]);
-
-	if (data[RTC_HOUR] & HOUR_12) {
-		tm->tm_hour = bcd2bin(data[RTC_HOUR] & 0x1f);
-		if (data[RTC_HOUR] & HOUR_PM)
-			tm->tm_hour += 12;
-	} else {
-		tm->tm_hour = bcd2bin(data[RTC_HOUR] & 0x3f);
-	}
-
-	tm->tm_wday = data[RTC_WEEKDAY] & 0x07;
-	tm->tm_mday = bcd2bin(data[RTC_DATE]);
-	tm->tm_mon = bcd2bin(data[RTC_MONTH]);
-	tm->tm_year = bcd2bin(data[RTC_YEAR1]) + bcd2bin(data[RTC_YEAR2]) * 100;
-	tm->tm_year -= 1900;
-}
-
-static void s5m8763_tm_to_data(struct rtc_time *tm, u8 *data)
-{
-	data[RTC_SEC] = bin2bcd(tm->tm_sec);
-	data[RTC_MIN] = bin2bcd(tm->tm_min);
-	data[RTC_HOUR] = bin2bcd(tm->tm_hour);
-	data[RTC_WEEKDAY] = tm->tm_wday;
-	data[RTC_DATE] = bin2bcd(tm->tm_mday);
-	data[RTC_MONTH] = bin2bcd(tm->tm_mon);
-	data[RTC_YEAR1] = bin2bcd(tm->tm_year % 100);
-	data[RTC_YEAR2] = bin2bcd((tm->tm_year + 1900) / 100);
-}
-
 static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct s5m_rtc_info *info = dev_get_drvdata(dev);
@@ -385,10 +351,6 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm)
 		return ret;
 
 	switch (info->device_type) {
-	case S5M8763X:
-		s5m8763_data_to_tm(data, tm);
-		break;
-
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -412,9 +374,6 @@ static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	int ret = 0;
 
 	switch (info->device_type) {
-	case S5M8763X:
-		s5m8763_tm_to_data(tm, data);
-		break;
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -444,7 +403,6 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct s5m_rtc_info *info = dev_get_drvdata(dev);
 	u8 data[RTC_MAX_NUM_TIME_REGS];
-	unsigned int val;
 	int ret, i;
 
 	ret = regmap_bulk_read(info->regmap, info->regs->alarm0, data,
@@ -453,15 +411,6 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 		return ret;
 
 	switch (info->device_type) {
-	case S5M8763X:
-		s5m8763_data_to_tm(data, &alrm->time);
-		ret = regmap_read(info->regmap, S5M_ALARM0_CONF, &val);
-		if (ret < 0)
-			return ret;
-
-		alrm->enabled = !!val;
-		break;
-
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -500,10 +449,6 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info)
 	dev_dbg(info->dev, "%s: %ptR(%d)\n", __func__, &tm, tm.tm_wday);
 
 	switch (info->device_type) {
-	case S5M8763X:
-		ret = regmap_write(info->regmap, S5M_ALARM0_CONF, 0);
-		break;
-
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -531,7 +476,6 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info)
 {
 	int ret;
 	u8 data[RTC_MAX_NUM_TIME_REGS];
-	u8 alarm0_conf;
 	struct rtc_time tm;
 
 	ret = regmap_bulk_read(info->regmap, info->regs->alarm0, data,
@@ -543,11 +487,6 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info)
 	dev_dbg(info->dev, "%s: %ptR(%d)\n", __func__, &tm, tm.tm_wday);
 
 	switch (info->device_type) {
-	case S5M8763X:
-		alarm0_conf = 0x77;
-		ret = regmap_write(info->regmap, S5M_ALARM0_CONF, alarm0_conf);
-		break;
-
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -585,10 +524,6 @@ static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 	int ret;
 
 	switch (info->device_type) {
-	case S5M8763X:
-		s5m8763_tm_to_data(&alrm->time, data);
-		break;
-
 	case S5M8767X:
 	case S2MPS15X:
 	case S2MPS14X:
@@ -655,7 +590,6 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info)
 	int ret;
 
 	switch (info->device_type) {
-	case S5M8763X:
 	case S5M8767X:
 		/* UDR update time. Default of 7.32 ms is too long. */
 		ret = regmap_update_bits(info->regmap, S5M_RTC_UDR_CON,
@@ -729,11 +663,6 @@ static int s5m_rtc_probe(struct platform_device *pdev)
 		info->regs = &s2mps13_rtc_regs;
 		alarm_irq = S2MPS14_IRQ_RTCA0;
 		break;
-	case S5M8763X:
-		regmap_cfg = &s5m_rtc_regmap_config;
-		info->regs = &s5m_rtc_regs;
-		alarm_irq = S5M8763_IRQ_ALARM0;
-		break;
 	case S5M8767X:
 		regmap_cfg = &s5m_rtc_regmap_config;
 		info->regs = &s5m_rtc_regs;
@@ -786,13 +715,8 @@ static int s5m_rtc_probe(struct platform_device *pdev)
 
 	info->rtc_dev->ops = &s5m_rtc_ops;
 
-	if (info->device_type == S5M8763X) {
-		info->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_0000;
-		info->rtc_dev->range_max = RTC_TIMESTAMP_END_9999;
-	} else {
-		info->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000;
-		info->rtc_dev->range_max = RTC_TIMESTAMP_END_2099;
-	}
+	info->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000;
+	info->rtc_dev->range_max = RTC_TIMESTAMP_END_2099;
 
 	if (!info->irq) {
 		clear_bit(RTC_FEATURE_ALARM, info->rtc_dev->features);
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index 07aae649a86f..a212b9f72bc9 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -36,7 +36,6 @@
 struct gpio_desc;
 
 enum sec_device_type {
-	S5M8763X,
 	S5M8767X,
 	S2MPA01,
 	S2MPS11X,
diff --git a/include/linux/mfd/samsung/irq.h b/include/linux/mfd/samsung/irq.h
index 6cfe4201a106..3fd2775eb9bb 100644
--- a/include/linux/mfd/samsung/irq.h
+++ b/include/linux/mfd/samsung/irq.h
@@ -194,54 +194,4 @@ enum s5m8767_irq {
 #define S5M8767_IRQ_RTC1S_MASK		(1 << 4)
 #define S5M8767_IRQ_WTSR_MASK		(1 << 5)
 
-enum s5m8763_irq {
-	S5M8763_IRQ_DCINF,
-	S5M8763_IRQ_DCINR,
-	S5M8763_IRQ_JIGF,
-	S5M8763_IRQ_JIGR,
-	S5M8763_IRQ_PWRONF,
-	S5M8763_IRQ_PWRONR,
-
-	S5M8763_IRQ_WTSREVNT,
-	S5M8763_IRQ_SMPLEVNT,
-	S5M8763_IRQ_ALARM1,
-	S5M8763_IRQ_ALARM0,
-
-	S5M8763_IRQ_ONKEY1S,
-	S5M8763_IRQ_TOPOFFR,
-	S5M8763_IRQ_DCINOVPR,
-	S5M8763_IRQ_CHGRSTF,
-	S5M8763_IRQ_DONER,
-	S5M8763_IRQ_CHGFAULT,
-
-	S5M8763_IRQ_LOBAT1,
-	S5M8763_IRQ_LOBAT2,
-
-	S5M8763_IRQ_NR,
-};
-
-#define S5M8763_IRQ_DCINF_MASK		(1 << 2)
-#define S5M8763_IRQ_DCINR_MASK		(1 << 3)
-#define S5M8763_IRQ_JIGF_MASK		(1 << 4)
-#define S5M8763_IRQ_JIGR_MASK		(1 << 5)
-#define S5M8763_IRQ_PWRONF_MASK		(1 << 6)
-#define S5M8763_IRQ_PWRONR_MASK		(1 << 7)
-
-#define S5M8763_IRQ_WTSREVNT_MASK	(1 << 0)
-#define S5M8763_IRQ_SMPLEVNT_MASK	(1 << 1)
-#define S5M8763_IRQ_ALARM1_MASK		(1 << 2)
-#define S5M8763_IRQ_ALARM0_MASK		(1 << 3)
-
-#define S5M8763_IRQ_ONKEY1S_MASK	(1 << 0)
-#define S5M8763_IRQ_TOPOFFR_MASK	(1 << 2)
-#define S5M8763_IRQ_DCINOVPR_MASK	(1 << 3)
-#define S5M8763_IRQ_CHGRSTF_MASK	(1 << 4)
-#define S5M8763_IRQ_DONER_MASK		(1 << 5)
-#define S5M8763_IRQ_CHGFAULT_MASK	(1 << 7)
-
-#define S5M8763_IRQ_LOBAT1_MASK		(1 << 0)
-#define S5M8763_IRQ_LOBAT2_MASK		(1 << 1)
-
-#define S5M8763_ENRAMP                  (1 << 4)
-
 #endif /*  __LINUX_MFD_SEC_IRQ_H */
-- 
cgit v1.2.3


From 86c6bb0edffa9fc02b4e3801b48c8e82114f1352 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Tue, 14 Feb 2023 09:58:59 +0100
Subject: mfd: core: Reorder fields in 'struct mfd_cell' to save some memory

Group some variables based on their sizes to reduce hole and avoid padding.
On x86_64, this shrinks the size from 144 to 128 bytes.

As an example:

$ size drivers/mfd/as3722.o (Before)
   text	   data	    bss	    dec	    hex	filename
   9441	    680	     16	  10137	   2799	drivers/mfd/as3722.o

$ size drivers/mfd/as3722.o (After)
   text	   data	    bss	    dec	    hex	filename
   9345	    680	     16	  10041	   2739	drivers/mfd/as3722.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/bb631974888dfe1af593b6280cf30fb913d2d1a4.1676365116.git.christophe.jaillet@wanadoo.fr
---
 include/linux/mfd/core.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 14ca7b471576..fc4a0e9fb3bb 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -78,6 +78,9 @@ struct mfd_cell {
 	void			*platform_data;
 	size_t			pdata_size;
 
+	/* Matches ACPI */
+	const struct mfd_cell_acpi_match	*acpi_match;
+
 	/* Software node for the device. */
 	const struct software_node *swnode;
 
@@ -97,9 +100,6 @@ struct mfd_cell {
 	/* Set to 'true' to use 'of_reg' (above) - allows for of_reg=0 */
 	bool use_of_reg;
 
-	/* Matches ACPI */
-	const struct mfd_cell_acpi_match	*acpi_match;
-
 	/*
 	 * These resources can be specified relative to the parent device.
 	 * For accessing hardware you should use resources from the platform dev
@@ -119,8 +119,8 @@ struct mfd_cell {
 	/* A list of regulator supplies that should be mapped to the MFD
 	 * device rather than the child device when requested
 	 */
-	const char * const	*parent_supplies;
 	int			num_parent_supplies;
+	const char * const	*parent_supplies;
 };
 
 /*
-- 
cgit v1.2.3


From b8fd17d9505e18001d10ceaa350c118a82b467bb Mon Sep 17 00:00:00 2001
From: Jakob Hauser <jahau@rocketmail.com>
Date: Tue, 28 Feb 2023 23:32:20 +0100
Subject: mfd: rt5033: Fix comments and style in includes

Fix comments and remove some empty lines in rt5033-private.h. Align struct
rt5033_charger in rt5033.h.

Signed-off-by: Jakob Hauser <jahau@rocketmail.com>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/606950da6f4b36f5a124ff13756c78644fc89804.1677620677.git.jahau@rocketmail.com
---
 include/linux/mfd/rt5033-private.h | 17 +++++++----------
 include/linux/mfd/rt5033.h         |  7 +++----
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h
index 2d1895c3efbf..6bb432f6a96c 100644
--- a/include/linux/mfd/rt5033-private.h
+++ b/include/linux/mfd/rt5033-private.h
@@ -107,14 +107,13 @@ enum rt5033_reg {
 #define RT5033_LDO_CTRL_MASK			0x1f
 
 /* RT5033 charger property - model, manufacturer */
-
 #define RT5033_CHARGER_MODEL	"RT5033WSC Charger"
 #define RT5033_MANUFACTURER	"Richtek Technology Corporation"
 
 /*
- * RT5033 charger fast-charge current lmits (as in CHGCTRL1 register),
- * AICR mode limits the input current for example,
- * the AIRC 100 mode limits the input current to 100 mA.
+ * While RT5033 charger can limit the fast-charge current (as in CHGCTRL1
+ * register), AICR mode limits the input current. For example, the AIRC 100
+ * mode limits the input current to 100 mA.
  */
 #define RT5033_AICR_100_MODE			0x20
 #define RT5033_AICR_500_MODE			0x40
@@ -139,10 +138,9 @@ enum rt5033_reg {
 #define RT5033_TE_ENABLE_MASK			0x08
 
 /*
- * RT5033 charger opa mode. RT50300 have two opa mode charger mode
- * and boost mode for OTG
+ * RT5033 charger opa mode. RT5033 has two opa modes for OTG: charger mode
+ * and boost mode.
  */
-
 #define RT5033_CHARGER_MODE			0x00
 #define RT5033_BOOST_MODE			0x01
 
@@ -181,18 +179,17 @@ enum rt5033_reg {
  * RT5033 charger pre-charge threshold volt limits
  * (as in CHGCTRL5 register), uV
  */
-
 #define RT5033_CHARGER_PRE_THRESHOLD_LIMIT_MIN	2300000U
 #define RT5033_CHARGER_PRE_THRESHOLD_STEP_NUM	100000U
 #define RT5033_CHARGER_PRE_THRESHOLD_LIMIT_MAX	3800000U
 
 /*
- * RT5033 charger enable UUG, If UUG enable MOS auto control by H/W charger
+ * RT5033 charger UUG. It enables MOS auto control by H/W charger
  * circuit.
  */
 #define RT5033_CHARGER_UUG_ENABLE		0x02
 
-/* RT5033 charger High impedance mode */
+/* RT5033 charger high impedance mode */
 #define RT5033_CHARGER_HZ_DISABLE		0x00
 #define RT5033_CHARGER_HZ_ENABLE		0x01
 
diff --git a/include/linux/mfd/rt5033.h b/include/linux/mfd/rt5033.h
index 3c23b6220c04..8f306ac15a27 100644
--- a/include/linux/mfd/rt5033.h
+++ b/include/linux/mfd/rt5033.h
@@ -49,10 +49,9 @@ struct rt5033_charger_data {
 };
 
 struct rt5033_charger {
-	struct device		*dev;
-	struct rt5033_dev	*rt5033;
-	struct power_supply	psy;
-
+	struct device			*dev;
+	struct rt5033_dev		*rt5033;
+	struct power_supply		psy;
 	struct rt5033_charger_data	*chg;
 };
 
-- 
cgit v1.2.3


From 0742c2a6335281608ac9e9aee67493e9e30f6195 Mon Sep 17 00:00:00 2001
From: Patrick Rudolph <patrick.rudolph@9elements.com>
Date: Tue, 7 Mar 2023 13:12:45 +0100
Subject: mfd: max597x: Add support for MAX5970 and MAX5978

Implement a regulator driver with IRQ support for fault management.
Written against documentation [1] and [2] and tested on real hardware.

Every channel has it's own regulator supply nammed 'vss1-supply' and
'vss2-supply'. The regulator supply is used to determine the output
voltage, as the smart switch provides no output regulation.
The driver requires the 'shunt-resistor-micro-ohms' to be present in
the devicetree to properly calculate current related values.

You must specify compatible devictree layout:

regulator@3a {
        reg = <0x3a>;
        vss1-supply = <&p3v3>;
        compatible = "maxim,max5978";

        ...

        regulators {
                sw0_ref: SW0 {
                        regulator-compatible = "SW0";
                        shunt-resistor-micro-ohms = <12000>;
                        ...
                }
        }
}

1: https://datasheets.maximintegrated.com/en/ds/MAX5970.pdf
2: https://datasheets.maximintegrated.com/en/ds/MAX5978.pdf

...
Changes in V12:
- Use simple_mfd_i2c driver and remove previous implementation.
- Remove newline
- Use _MFD_MAX597X_H in header file
- Successfull build need following patch from regulator:
https://lore.kernel.org/r/20230216075302.68935-1-Naresh.Solanki@9elements.com
https://lore.kernel.org/r/20230210163225.1208035-1-Naresh.Solanki@9elements.com

Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Signed-off-by: Marcello Sylvester Bauer <sylv@sylv.io>
Signed-off-by: Naresh Solanki <Naresh.Solanki@9elements.com>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230307121246.127425-2-Naresh.Solanki@9elements.com
---
 drivers/mfd/Kconfig          | 10 +++++
 drivers/mfd/simple-mfd-i2c.c | 13 ++++++
 include/linux/mfd/max597x.h  | 96 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 include/linux/mfd/max597x.h

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index fcc141e067b9..d381d0e63455 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -266,6 +266,16 @@ config MFD_MADERA_SPI
 	  Support for the Cirrus Logic Madera platform audio SoC
 	  core functionality controlled via SPI.
 
+config MFD_MAX597X
+	tristate "Maxim 597x power switch and monitor"
+	depends on (I2C && OF)
+	select MFD_SIMPLE_MFD_I2C
+	help
+	  This driver controls a Maxim 5970/5978 switch via I2C bus.
+	  The MAX5970/5978 is a smart switch with no output regulation, but
+	  fault protection and voltage and current monitoring capabilities.
+	  Also it supports upto 4 indication leds.
+
 config MFD_CS47L15
 	bool "Cirrus Logic CS47L15"
 	select PINCTRL_CS47L15
diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c
index e31f13fd6a79..20782b4dd172 100644
--- a/drivers/mfd/simple-mfd-i2c.c
+++ b/drivers/mfd/simple-mfd-i2c.c
@@ -72,9 +72,22 @@ static const struct simple_mfd_data silergy_sy7636a = {
 	.mfd_cell_size = ARRAY_SIZE(sy7636a_cells),
 };
 
+static const struct mfd_cell max597x_cells[] = {
+	{ .name = "max597x-regulator", },
+	{ .name = "max597x-iio", },
+	{ .name = "max597x-led", },
+};
+
+static const struct simple_mfd_data maxim_max597x = {
+	.mfd_cell = max597x_cells,
+	.mfd_cell_size = ARRAY_SIZE(max597x_cells),
+};
+
 static const struct of_device_id simple_mfd_i2c_of_match[] = {
 	{ .compatible = "kontron,sl28cpld" },
 	{ .compatible = "silergy,sy7636a", .data = &silergy_sy7636a},
+	{ .compatible = "maxim,max5970", .data = &maxim_max597x},
+	{ .compatible = "maxim,max5978", .data = &maxim_max597x},
 	{}
 };
 MODULE_DEVICE_TABLE(of, simple_mfd_i2c_of_match);
diff --git a/include/linux/mfd/max597x.h b/include/linux/mfd/max597x.h
new file mode 100644
index 000000000000..a850b2e02e6a
--- /dev/null
+++ b/include/linux/mfd/max597x.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Device driver for regulators in MAX5970 and MAX5978 IC
+ *
+ * Copyright (c) 2022 9elements GmbH
+ *
+ * Author: Patrick Rudolph <patrick.rudolph@9elements.com>
+ */
+
+#ifndef _MFD_MAX597X_H
+#define _MFD_MAX597X_H
+
+#include <linux/regmap.h>
+
+#define MAX5970_NUM_SWITCHES 2
+#define MAX5978_NUM_SWITCHES 1
+#define MAX597X_NUM_LEDS     4
+
+struct max597x_data {
+	int num_switches;
+	u32 irng[MAX5970_NUM_SWITCHES];
+	u32 mon_rng[MAX5970_NUM_SWITCHES];
+	u32 shunt_micro_ohms[MAX5970_NUM_SWITCHES];
+};
+
+enum max597x_chip_type {
+	MAX597x_TYPE_MAX5978 = 1,
+	MAX597x_TYPE_MAX5970,
+};
+
+#define MAX5970_REG_CURRENT_L(ch)		(0x01 + (ch) * 4)
+#define MAX5970_REG_CURRENT_H(ch)		(0x00 + (ch) * 4)
+#define MAX5970_REG_VOLTAGE_L(ch)		(0x03 + (ch) * 4)
+#define MAX5970_REG_VOLTAGE_H(ch)		(0x02 + (ch) * 4)
+#define MAX5970_REG_MON_RANGE			0x18
+#define  MAX5970_MON_MASK			0x3
+#define  MAX5970_MON(reg, ch)			(((reg) >> ((ch) * 2)) & MAX5970_MON_MASK)
+#define  MAX5970_MON_MAX_RANGE_UV		16000000
+
+#define MAX5970_REG_CH_UV_WARN_H(ch)		(0x1A + (ch) * 10)
+#define MAX5970_REG_CH_UV_WARN_L(ch)		(0x1B + (ch) * 10)
+#define MAX5970_REG_CH_UV_CRIT_H(ch)		(0x1C + (ch) * 10)
+#define MAX5970_REG_CH_UV_CRIT_L(ch)		(0x1D + (ch) * 10)
+#define MAX5970_REG_CH_OV_WARN_H(ch)		(0x1E + (ch) * 10)
+#define MAX5970_REG_CH_OV_WARN_L(ch)		(0x1F + (ch) * 10)
+#define MAX5970_REG_CH_OV_CRIT_H(ch)		(0x20 + (ch) * 10)
+#define MAX5970_REG_CH_OV_CRIT_L(ch)		(0x21 + (ch) * 10)
+
+#define  MAX5970_VAL2REG_H(x)		(((x) >> 2) & 0xFF)
+#define  MAX5970_VAL2REG_L(x)		((x) & 0x3)
+
+#define MAX5970_REG_DAC_FAST(ch)	(0x2E + (ch))
+
+#define MAX5970_FAST2SLOW_RATIO		200
+
+#define MAX5970_REG_STATUS0		0x31
+#define  MAX5970_CB_IFAULTF(ch)		(1 << (ch))
+#define  MAX5970_CB_IFAULTS(ch)		(1 << ((ch) + 4))
+
+#define MAX5970_REG_STATUS1		0x32
+#define  STATUS1_PROT_MASK		0x3
+#define  STATUS1_PROT(reg) \
+	(((reg) >> 6) & STATUS1_PROT_MASK)
+#define  STATUS1_PROT_SHUTDOWN		0
+#define  STATUS1_PROT_CLEAR_PG		1
+#define  STATUS1_PROT_ALERT_ONLY	2
+
+#define MAX5970_REG_STATUS2		0x33
+#define  MAX5970_IRNG_MASK		0x3
+#define  MAX5970_IRNG(reg, ch) \
+	(((reg) >> ((ch) * 2)) & MAX5970_IRNG_MASK)
+
+#define MAX5970_REG_STATUS3		0x34
+#define  MAX5970_STATUS3_ALERT		BIT(4)
+#define  MAX5970_STATUS3_PG(ch)		BIT(ch)
+
+#define MAX5970_REG_FAULT0		0x35
+#define  UV_STATUS_WARN(ch)		(1 << (ch))
+#define  UV_STATUS_CRIT(ch)		(1 << ((ch) + 4))
+
+#define MAX5970_REG_FAULT1		0x36
+#define  OV_STATUS_WARN(ch)		(1 << (ch))
+#define  OV_STATUS_CRIT(ch)		(1 << ((ch) + 4))
+
+#define MAX5970_REG_FAULT2		0x37
+#define  OC_STATUS_WARN(ch)		(1 << (ch))
+
+#define MAX5970_REG_CHXEN		0x3b
+#define  CHXEN(ch)			(3 << ((ch) * 2))
+
+#define MAX5970_REG_LED_FLASH		0x43
+
+#define MAX_REGISTERS			0x49
+#define ADC_MASK			0x3FF
+
+#endif				/* _MFD_MAX597X_H */
-- 
cgit v1.2.3


From a0b9becad8a7d62a81530035a69f744c0e389737 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 8 Mar 2023 10:12:57 +0100
Subject: mfd: core: Remove .enable() and .disable() callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With commit dd77f5fa97d3 ("mfd: Remove toshiba tmio drivers") the last
mfd driver that implements these callbacks is gone and since commit
652719b1003a ("w1: remove ds1wm driver") the last user is gone. The
corresponding functions mfd_cell_enable() and mfd_cell_disable() are
also unused (since commit 0ca222c81977 ("leds: Remove asic3 driver")).

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230308091257.2404932-1-u.kleine-koenig@pengutronix.de
---
 drivers/mfd/mfd-core.c   | 26 --------------------------
 include/linux/mfd/core.h | 12 ------------
 2 files changed, 38 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index 16d1861e9682..695d50b3bac6 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -33,32 +33,6 @@ static struct device_type mfd_dev_type = {
 	.name	= "mfd_device",
 };
 
-int mfd_cell_enable(struct platform_device *pdev)
-{
-	const struct mfd_cell *cell = mfd_get_cell(pdev);
-
-	if (!cell->enable) {
-		dev_dbg(&pdev->dev, "No .enable() call-back registered\n");
-		return 0;
-	}
-
-	return cell->enable(pdev);
-}
-EXPORT_SYMBOL(mfd_cell_enable);
-
-int mfd_cell_disable(struct platform_device *pdev)
-{
-	const struct mfd_cell *cell = mfd_get_cell(pdev);
-
-	if (!cell->disable) {
-		dev_dbg(&pdev->dev, "No .disable() call-back registered\n");
-		return 0;
-	}
-
-	return cell->disable(pdev);
-}
-EXPORT_SYMBOL(mfd_cell_disable);
-
 #if IS_ENABLED(CONFIG_ACPI)
 struct match_ids_walk_data {
 	struct acpi_device_id *ids;
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index fc4a0e9fb3bb..47e7a3a61ce6 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -68,9 +68,6 @@ struct mfd_cell {
 	int			id;
 	int			level;
 
-	int			(*enable)(struct platform_device *dev);
-	int			(*disable)(struct platform_device *dev);
-
 	int			(*suspend)(struct platform_device *dev);
 	int			(*resume)(struct platform_device *dev);
 
@@ -123,15 +120,6 @@ struct mfd_cell {
 	const char * const	*parent_supplies;
 };
 
-/*
- * Convenience functions for clients using shared cells.  Refcounting
- * happens automatically, with the cell's enable/disable callbacks
- * being called only when a device is first being enabled or no other
- * clients are making use of it.
- */
-extern int mfd_cell_enable(struct platform_device *pdev);
-extern int mfd_cell_disable(struct platform_device *pdev);
-
 /*
  * Given a platform device that's been created by mfd_add_devices(), fetch
  * the mfd_cell that created it.
-- 
cgit v1.2.3


From 67d6c76fc815cddc77de9529221f9ff8dd1fb10e Mon Sep 17 00:00:00 2001
From: Min Li <min.li.xe@renesas.com>
Date: Mon, 27 Mar 2023 14:39:53 -0400
Subject: mfd: rsmu: Support 32-bit address space

We used to assume 0x2010xxxx address. Now that we need to access
0x2011xxxx address, we need to support read/write the whole 32-bit
address space.

Also defined RSMU_MAX_WRITE_COUNT and RSMU_MAX_READ_COUNT for readability

Signed-off-by: Min Li <min.li.xe@renesas.com>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/MW5PR03MB693295AF31ABCAF6AE52EE74A08B9@MW5PR03MB6932.namprd03.prod.outlook.com
---
 drivers/mfd/rsmu.h       |   2 +
 drivers/mfd/rsmu_i2c.c   | 171 +++++++++++++++++++++++++++++++++++++----------
 drivers/mfd/rsmu_spi.c   |  52 ++++++++------
 include/linux/mfd/rsmu.h |   5 +-
 4 files changed, 174 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/rsmu.h b/drivers/mfd/rsmu.h
index bb88597d189f..1bb04cafa45d 100644
--- a/drivers/mfd/rsmu.h
+++ b/drivers/mfd/rsmu.h
@@ -10,6 +10,8 @@
 
 #include <linux/mfd/rsmu.h>
 
+#define RSMU_CM_SCSR_BASE		0x20100000
+
 int rsmu_core_init(struct rsmu_ddata *rsmu);
 void rsmu_core_exit(struct rsmu_ddata *rsmu);
 
diff --git a/drivers/mfd/rsmu_i2c.c b/drivers/mfd/rsmu_i2c.c
index 15d25b081434..221023faaadf 100644
--- a/drivers/mfd/rsmu_i2c.c
+++ b/drivers/mfd/rsmu_i2c.c
@@ -18,11 +18,12 @@
 #include "rsmu.h"
 
 /*
- * 16-bit register address: the lower 8 bits of the register address come
- * from the offset addr byte and the upper 8 bits come from the page register.
+ * 32-bit register address: the lower 8 bits of the register address come
+ * from the offset addr byte and the upper 24 bits come from the page register.
  */
-#define	RSMU_CM_PAGE_ADDR		0xFD
-#define	RSMU_CM_PAGE_WINDOW		256
+#define	RSMU_CM_PAGE_ADDR		0xFC
+#define RSMU_CM_PAGE_MASK		0xFFFFFF00
+#define RSMU_CM_ADDRESS_MASK		0x000000FF
 
 /*
  * 15-bit register address: the lower 7 bits of the register address come
@@ -31,18 +32,6 @@
 #define	RSMU_SABRE_PAGE_ADDR		0x7F
 #define	RSMU_SABRE_PAGE_WINDOW		128
 
-static const struct regmap_range_cfg rsmu_cm_range_cfg[] = {
-	{
-		.range_min = 0,
-		.range_max = 0xD000,
-		.selector_reg = RSMU_CM_PAGE_ADDR,
-		.selector_mask = 0xFF,
-		.selector_shift = 0,
-		.window_start = 0,
-		.window_len = RSMU_CM_PAGE_WINDOW,
-	}
-};
-
 static const struct regmap_range_cfg rsmu_sabre_range_cfg[] = {
 	{
 		.range_min = 0,
@@ -55,35 +44,141 @@ static const struct regmap_range_cfg rsmu_sabre_range_cfg[] = {
 	}
 };
 
-static bool rsmu_cm_volatile_reg(struct device *dev, unsigned int reg)
+static bool rsmu_sabre_volatile_reg(struct device *dev, unsigned int reg)
 {
 	switch (reg) {
-	case RSMU_CM_PAGE_ADDR:
+	case RSMU_SABRE_PAGE_ADDR:
 		return false;
 	default:
 		return true;
 	}
 }
 
-static bool rsmu_sabre_volatile_reg(struct device *dev, unsigned int reg)
+static int rsmu_read_device(struct rsmu_ddata *rsmu, u8 reg, u8 *buf, u16 bytes)
 {
-	switch (reg) {
-	case RSMU_SABRE_PAGE_ADDR:
-		return false;
-	default:
-		return true;
+	struct i2c_client *client = to_i2c_client(rsmu->dev);
+	struct i2c_msg msg[2];
+	int cnt;
+
+	msg[0].addr = client->addr;
+	msg[0].flags = 0;
+	msg[0].len = 1;
+	msg[0].buf = &reg;
+
+	msg[1].addr = client->addr;
+	msg[1].flags = I2C_M_RD;
+	msg[1].len = bytes;
+	msg[1].buf = buf;
+
+	cnt = i2c_transfer(client->adapter, msg, 2);
+
+	if (cnt < 0) {
+		dev_err(rsmu->dev, "i2c_transfer failed at addr: %04x!", reg);
+		return cnt;
+	} else if (cnt != 2) {
+		dev_err(rsmu->dev,
+			"i2c_transfer sent only %d of 2 messages", cnt);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int rsmu_write_device(struct rsmu_ddata *rsmu, u8 reg, u8 *buf, u16 bytes)
+{
+	struct i2c_client *client = to_i2c_client(rsmu->dev);
+	u8 msg[RSMU_MAX_WRITE_COUNT + 1]; /* 1 Byte added for the device register */
+	int cnt;
+
+	if (bytes > RSMU_MAX_WRITE_COUNT)
+		return -EINVAL;
+
+	msg[0] = reg;
+	memcpy(&msg[1], buf, bytes);
+
+	cnt = i2c_master_send(client, msg, bytes + 1);
+
+	if (cnt < 0) {
+		dev_err(&client->dev,
+			"i2c_master_send failed at addr: %04x!", reg);
+		return cnt;
 	}
+
+	return 0;
+}
+
+static int rsmu_write_page_register(struct rsmu_ddata *rsmu, u32 reg)
+{
+	u32 page = reg & RSMU_CM_PAGE_MASK;
+	u8 buf[4];
+	int err;
+
+	/* Do not modify offset register for none-scsr registers */
+	if (reg < RSMU_CM_SCSR_BASE)
+		return 0;
+
+	/* Simply return if we are on the same page */
+	if (rsmu->page == page)
+		return 0;
+
+	buf[0] = 0x0;
+	buf[1] = (u8)((page >> 8) & 0xFF);
+	buf[2] = (u8)((page >> 16) & 0xFF);
+	buf[3] = (u8)((page >> 24) & 0xFF);
+
+	err = rsmu_write_device(rsmu, RSMU_CM_PAGE_ADDR, buf, sizeof(buf));
+	if (err)
+		dev_err(rsmu->dev, "Failed to set page offset 0x%x\n", page);
+	else
+		/* Remember the last page */
+		rsmu->page = page;
+
+	return err;
+}
+
+static int rsmu_reg_read(void *context, unsigned int reg, unsigned int *val)
+{
+	struct rsmu_ddata *rsmu = i2c_get_clientdata((struct i2c_client *)context);
+	u8 addr = (u8)(reg & RSMU_CM_ADDRESS_MASK);
+	int err;
+
+	err = rsmu_write_page_register(rsmu, reg);
+	if (err)
+		return err;
+
+	err = rsmu_read_device(rsmu, addr, (u8 *)val, 1);
+	if (err)
+		dev_err(rsmu->dev, "Failed to read offset address 0x%x\n", addr);
+
+	return err;
+}
+
+static int rsmu_reg_write(void *context, unsigned int reg, unsigned int val)
+{
+	struct rsmu_ddata *rsmu = i2c_get_clientdata((struct i2c_client *)context);
+	u8 addr = (u8)(reg & RSMU_CM_ADDRESS_MASK);
+	u8 data = (u8)val;
+	int err;
+
+	err = rsmu_write_page_register(rsmu, reg);
+	if (err)
+		return err;
+
+	err = rsmu_write_device(rsmu, addr, &data, 1);
+	if (err)
+		dev_err(rsmu->dev,
+			"Failed to write offset address 0x%x\n", addr);
+
+	return err;
 }
 
 static const struct regmap_config rsmu_cm_regmap_config = {
-	.reg_bits = 8,
+	.reg_bits = 32,
 	.val_bits = 8,
-	.max_register = 0xD000,
-	.ranges = rsmu_cm_range_cfg,
-	.num_ranges = ARRAY_SIZE(rsmu_cm_range_cfg),
-	.volatile_reg = rsmu_cm_volatile_reg,
-	.cache_type = REGCACHE_RBTREE,
-	.can_multi_write = true,
+	.max_register = 0x20120000,
+	.reg_read = rsmu_reg_read,
+	.reg_write = rsmu_reg_write,
+	.cache_type = REGCACHE_NONE,
 };
 
 static const struct regmap_config rsmu_sabre_regmap_config = {
@@ -101,14 +196,14 @@ static const struct regmap_config rsmu_sl_regmap_config = {
 	.reg_bits = 16,
 	.val_bits = 8,
 	.reg_format_endian = REGMAP_ENDIAN_BIG,
-	.max_register = 0x339,
+	.max_register = 0x340,
 	.cache_type = REGCACHE_NONE,
 	.can_multi_write = true,
 };
 
-static int rsmu_i2c_probe(struct i2c_client *client)
+static int rsmu_i2c_probe(struct i2c_client *client,
+			  const struct i2c_device_id *id)
 {
-	const struct i2c_device_id *id = i2c_client_get_device_id(client);
 	const struct regmap_config *cfg;
 	struct rsmu_ddata *rsmu;
 	int ret;
@@ -136,7 +231,11 @@ static int rsmu_i2c_probe(struct i2c_client *client)
 		dev_err(rsmu->dev, "Unsupported RSMU device type: %d\n", rsmu->type);
 		return -ENODEV;
 	}
-	rsmu->regmap = devm_regmap_init_i2c(client, cfg);
+
+	if (rsmu->type == RSMU_CM)
+		rsmu->regmap = devm_regmap_init(&client->dev, NULL, client, cfg);
+	else
+		rsmu->regmap = devm_regmap_init_i2c(client, cfg);
 	if (IS_ERR(rsmu->regmap)) {
 		ret = PTR_ERR(rsmu->regmap);
 		dev_err(rsmu->dev, "Failed to allocate register map: %d\n", ret);
@@ -180,7 +279,7 @@ static struct i2c_driver rsmu_i2c_driver = {
 		.name = "rsmu-i2c",
 		.of_match_table = of_match_ptr(rsmu_i2c_of_match),
 	},
-	.probe_new = rsmu_i2c_probe,
+	.probe = rsmu_i2c_probe,
 	.remove	= rsmu_i2c_remove,
 	.id_table = rsmu_i2c_id,
 };
diff --git a/drivers/mfd/rsmu_spi.c b/drivers/mfd/rsmu_spi.c
index 2428aaa9aaed..a4a595bb8d0d 100644
--- a/drivers/mfd/rsmu_spi.c
+++ b/drivers/mfd/rsmu_spi.c
@@ -19,19 +19,21 @@
 
 #define	RSMU_CM_PAGE_ADDR		0x7C
 #define	RSMU_SABRE_PAGE_ADDR		0x7F
-#define	RSMU_HIGHER_ADDR_MASK		0xFF80
-#define	RSMU_HIGHER_ADDR_SHIFT		7
-#define	RSMU_LOWER_ADDR_MASK		0x7F
+#define	RSMU_PAGE_MASK			0xFFFFFF80
+#define	RSMU_ADDR_MASK			0x7F
 
 static int rsmu_read_device(struct rsmu_ddata *rsmu, u8 reg, u8 *buf, u16 bytes)
 {
 	struct spi_device *client = to_spi_device(rsmu->dev);
 	struct spi_transfer xfer = {0};
 	struct spi_message msg;
-	u8 cmd[256] = {0};
-	u8 rsp[256] = {0};
+	u8 cmd[RSMU_MAX_READ_COUNT + 1] = {0};
+	u8 rsp[RSMU_MAX_READ_COUNT + 1] = {0};
 	int ret;
 
+	if (bytes > RSMU_MAX_READ_COUNT)
+		return -EINVAL;
+
 	cmd[0] = reg | 0x80;
 	xfer.rx_buf = rsp;
 	xfer.len = bytes + 1;
@@ -66,7 +68,10 @@ static int rsmu_write_device(struct rsmu_ddata *rsmu, u8 reg, u8 *buf, u16 bytes
 	struct spi_device *client = to_spi_device(rsmu->dev);
 	struct spi_transfer xfer = {0};
 	struct spi_message msg;
-	u8 cmd[256] = {0};
+	u8 cmd[RSMU_MAX_WRITE_COUNT + 1] = {0};
+
+	if (bytes > RSMU_MAX_WRITE_COUNT)
+		return -EINVAL;
 
 	cmd[0] = reg;
 	memcpy(&cmd[1], buf, bytes);
@@ -86,26 +91,35 @@ static int rsmu_write_device(struct rsmu_ddata *rsmu, u8 reg, u8 *buf, u16 bytes
  * 16-bit register address: the lower 7 bits of the register address come
  * from the offset addr byte and the upper 9 bits come from the page register.
  */
-static int rsmu_write_page_register(struct rsmu_ddata *rsmu, u16 reg)
+static int rsmu_write_page_register(struct rsmu_ddata *rsmu, u32 reg)
 {
 	u8 page_reg;
-	u8 buf[2];
+	u8 buf[4];
 	u16 bytes;
-	u16 page;
+	u32 page;
 	int err;
 
 	switch (rsmu->type) {
 	case RSMU_CM:
+		/* Do not modify page register for none-scsr registers */
+		if (reg < RSMU_CM_SCSR_BASE)
+			return 0;
 		page_reg = RSMU_CM_PAGE_ADDR;
-		page = reg & RSMU_HIGHER_ADDR_MASK;
+		page = reg & RSMU_PAGE_MASK;
 		buf[0] = (u8)(page & 0xff);
 		buf[1] = (u8)((page >> 8) & 0xff);
-		bytes = 2;
+		buf[2] = (u8)((page >> 16) & 0xff);
+		buf[3] = (u8)((page >> 24) & 0xff);
+		bytes = 4;
 		break;
 	case RSMU_SABRE:
+		/* Do not modify page register if reg is page register itself */
+		if ((reg & RSMU_ADDR_MASK) == RSMU_ADDR_MASK)
+			return 0;
 		page_reg = RSMU_SABRE_PAGE_ADDR;
-		page = reg >> RSMU_HIGHER_ADDR_SHIFT;
-		buf[0] = (u8)(page & 0xff);
+		page = reg & RSMU_PAGE_MASK;
+		/* The three page bits are located in the single Page Register */
+		buf[0] = (u8)((page >> 7) & 0x7);
 		bytes = 1;
 		break;
 	default:
@@ -129,8 +143,8 @@ static int rsmu_write_page_register(struct rsmu_ddata *rsmu, u16 reg)
 
 static int rsmu_reg_read(void *context, unsigned int reg, unsigned int *val)
 {
-	struct rsmu_ddata *rsmu = spi_get_drvdata(context);
-	u8 addr = (u8)(reg & RSMU_LOWER_ADDR_MASK);
+	struct rsmu_ddata *rsmu = spi_get_drvdata((struct spi_device *)context);
+	u8 addr = (u8)(reg & RSMU_ADDR_MASK);
 	int err;
 
 	err = rsmu_write_page_register(rsmu, reg);
@@ -146,8 +160,8 @@ static int rsmu_reg_read(void *context, unsigned int reg, unsigned int *val)
 
 static int rsmu_reg_write(void *context, unsigned int reg, unsigned int val)
 {
-	struct rsmu_ddata *rsmu = spi_get_drvdata(context);
-	u8 addr = (u8)(reg & RSMU_LOWER_ADDR_MASK);
+	struct rsmu_ddata *rsmu = spi_get_drvdata((struct spi_device *)context);
+	u8 addr = (u8)(reg & RSMU_ADDR_MASK);
 	u8 data = (u8)val;
 	int err;
 
@@ -164,9 +178,9 @@ static int rsmu_reg_write(void *context, unsigned int reg, unsigned int val)
 }
 
 static const struct regmap_config rsmu_cm_regmap_config = {
-	.reg_bits = 16,
+	.reg_bits = 32,
 	.val_bits = 8,
-	.max_register = 0xD000,
+	.max_register = 0x20120000,
 	.reg_read = rsmu_reg_read,
 	.reg_write = rsmu_reg_write,
 	.cache_type = REGCACHE_NONE,
diff --git a/include/linux/mfd/rsmu.h b/include/linux/mfd/rsmu.h
index 6870de608233..0379aa207428 100644
--- a/include/linux/mfd/rsmu.h
+++ b/include/linux/mfd/rsmu.h
@@ -8,6 +8,9 @@
 #ifndef __LINUX_MFD_RSMU_H
 #define __LINUX_MFD_RSMU_H
 
+#define RSMU_MAX_WRITE_COUNT	(255)
+#define RSMU_MAX_READ_COUNT	(255)
+
 /* The supported devices are ClockMatrix, Sabre and SnowLotus */
 enum rsmu_type {
 	RSMU_CM		= 0x34000,
@@ -31,6 +34,6 @@ struct rsmu_ddata {
 	struct regmap *regmap;
 	struct mutex lock;
 	enum rsmu_type type;
-	u16 page;
+	u32 page;
 };
 #endif /*  __LINUX_MFD_RSMU_H */
-- 
cgit v1.2.3


From 654c293e1687b31819f9bf1ac71b5a85a8053210 Mon Sep 17 00:00:00 2001
From: Biju Das <biju.das.jz@bp.renesas.com>
Date: Thu, 30 Mar 2023 12:16:28 +0100
Subject: mfd: Add Renesas RZ/G2L MTU3a core driver

The RZ/G2L multi-function timer pulse unit 3 (MTU3a) is embedded in
the Renesas RZ/G2L family SoCs. It consists of eight 16-bit timer
channels and one 32-bit timer channel. It supports the following
functions
 - Counter
 - Timer
 - PWM

The 8/16/32 bit registers are mixed in each channel.

Add MTU3a core driver for RZ/G2L SoC. The core driver shares the
clk and channel register access for the other child devices like
Counter, PWM and Clock event.

Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230330111632.169434-3-biju.das.jz@bp.renesas.com
---
 drivers/mfd/Kconfig         |  10 ++
 drivers/mfd/Makefile        |   1 +
 drivers/mfd/rz-mtu3.c       | 391 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/mfd/rz-mtu3.h       | 147 +++++++++++++++++
 include/linux/mfd/rz-mtu3.h | 257 +++++++++++++++++++++++++++++
 5 files changed, 806 insertions(+)
 create mode 100644 drivers/mfd/rz-mtu3.c
 create mode 100644 drivers/mfd/rz-mtu3.h
 create mode 100644 include/linux/mfd/rz-mtu3.h

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 51d54a1b8673..e90463c4441c 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1315,6 +1315,16 @@ config MFD_SC27XX_PMIC
 	  This driver provides common support for accessing the SC27xx PMICs,
 	  and it also adds the irq_chip parts for handling the PMIC chip events.
 
+config RZ_MTU3
+	bool "Renesas RZ/G2L MTU3a core driver"
+	depends on (ARCH_RZG2L && OF) || COMPILE_TEST
+	help
+	  Select this option to enable Renesas RZ/G2L MTU3a core driver for
+	  the Multi-Function Timer Pulse Unit 3 (MTU3a) hardware available
+	  on SoCs from Renesas. The core driver shares the clk and channel
+	  register access for the other child devices like Counter, PWM,
+	  Clock Source, and Clock event.
+
 config ABX500_CORE
 	bool "ST-Ericsson ABX500 Mixed Signal Circuit register functions"
 	depends on ARCH_U8500 || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 2f6c89d1e277..1d2392f06f78 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -174,6 +174,7 @@ pcf50633-objs			:= pcf50633-core.o pcf50633-irq.o
 obj-$(CONFIG_MFD_PCF50633)	+= pcf50633.o
 obj-$(CONFIG_PCF50633_ADC)	+= pcf50633-adc.o
 obj-$(CONFIG_PCF50633_GPIO)	+= pcf50633-gpio.o
+obj-$(CONFIG_RZ_MTU3)		+= rz-mtu3.o
 obj-$(CONFIG_ABX500_CORE)	+= abx500-core.o
 obj-$(CONFIG_MFD_DB8500_PRCMU)	+= db8500-prcmu.o
 # ab8500-core need to come after db8500-prcmu (which provides the channel)
diff --git a/drivers/mfd/rz-mtu3.c b/drivers/mfd/rz-mtu3.c
new file mode 100644
index 000000000000..04006f4aa702
--- /dev/null
+++ b/drivers/mfd/rz-mtu3.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Renesas RZ/G2L Multi-Function Timer Pulse Unit 3(MTU3a) Core driver
+ *
+ * Copyright (C) 2023 Renesas Electronics Corporation
+ */
+
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/rz-mtu3.h>
+#include <linux/of_platform.h>
+#include <linux/reset.h>
+#include <linux/spinlock.h>
+
+#include "rz-mtu3.h"
+
+struct rz_mtu3_priv {
+	void __iomem *mmio;
+	struct reset_control *rstc;
+	raw_spinlock_t lock;
+};
+
+/******* MTU3 registers (original offset is +0x1200) *******/
+static const unsigned long rz_mtu3_8bit_ch_reg_offs[][13] = {
+	[RZ_MTU3_CHAN_0] = MTU_8BIT_CH_0(0x104, 0x090, 0x100, 0x128, 0x101, 0x102, 0x103, 0x126),
+	[RZ_MTU3_CHAN_1] = MTU_8BIT_CH_1_2(0x184, 0x091, 0x185, 0x180, 0x194, 0x181, 0x182),
+	[RZ_MTU3_CHAN_2] = MTU_8BIT_CH_1_2(0x204, 0x092, 0x205, 0x200, 0x20c, 0x201, 0x202),
+	[RZ_MTU3_CHAN_3] = MTU_8BIT_CH_3_4_6_7(0x008, 0x093, 0x02c, 0x000, 0x04c, 0x002, 0x004, 0x005, 0x038),
+	[RZ_MTU3_CHAN_4] = MTU_8BIT_CH_3_4_6_7(0x009, 0x094, 0x02d, 0x001, 0x04d, 0x003, 0x006, 0x007, 0x039),
+	[RZ_MTU3_CHAN_5] = MTU_8BIT_CH_5(0xab2, 0x1eb, 0xab4, 0xab6, 0xa84, 0xa85, 0xa86, 0xa94, 0xa95, 0xa96, 0xaa4, 0xaa5, 0xaa6),
+	[RZ_MTU3_CHAN_6] = MTU_8BIT_CH_3_4_6_7(0x808, 0x893, 0x82c, 0x800, 0x84c, 0x802, 0x804, 0x805, 0x838),
+	[RZ_MTU3_CHAN_7] = MTU_8BIT_CH_3_4_6_7(0x809, 0x894, 0x82d, 0x801, 0x84d, 0x803, 0x806, 0x807, 0x839),
+	[RZ_MTU3_CHAN_8] = MTU_8BIT_CH_8(0x404, 0x098, 0x400, 0x406, 0x401, 0x402, 0x403)
+};
+
+static const unsigned long rz_mtu3_16bit_ch_reg_offs[][12] = {
+	[RZ_MTU3_CHAN_0] = MTU_16BIT_CH_0(0x106, 0x108, 0x10a, 0x10c, 0x10e, 0x120, 0x122),
+	[RZ_MTU3_CHAN_1] = MTU_16BIT_CH_1_2(0x186, 0x188, 0x18a),
+	[RZ_MTU3_CHAN_2] = MTU_16BIT_CH_1_2(0x206, 0x208, 0x20a),
+	[RZ_MTU3_CHAN_3] = MTU_16BIT_CH_3_6(0x010, 0x018, 0x01a, 0x024, 0x026, 0x072),
+	[RZ_MTU3_CHAN_4] = MTU_16BIT_CH_4_7(0x012, 0x01c, 0x01e, 0x028, 0x2a, 0x074, 0x076, 0x040, 0x044, 0x046, 0x048, 0x04a),
+	[RZ_MTU3_CHAN_5] = MTU_16BIT_CH_5(0xa80, 0xa82, 0xa90, 0xa92, 0xaa0, 0xaa2),
+	[RZ_MTU3_CHAN_6] = MTU_16BIT_CH_3_6(0x810, 0x818, 0x81a, 0x824, 0x826, 0x872),
+	[RZ_MTU3_CHAN_7] = MTU_16BIT_CH_4_7(0x812, 0x81c, 0x81e, 0x828, 0x82a, 0x874, 0x876, 0x840, 0x844, 0x846, 0x848, 0x84a)
+};
+
+static const unsigned long rz_mtu3_32bit_ch_reg_offs[][5] = {
+	[RZ_MTU3_CHAN_1] = MTU_32BIT_CH_1(0x1a0, 0x1a4, 0x1a8),
+	[RZ_MTU3_CHAN_8] = MTU_32BIT_CH_8(0x408, 0x40c, 0x410, 0x414, 0x418)
+};
+
+static bool rz_mtu3_is_16bit_shared_reg(u16 offset)
+{
+	return (offset == RZ_MTU3_TDDRA || offset == RZ_MTU3_TDDRB ||
+		offset == RZ_MTU3_TCDRA || offset == RZ_MTU3_TCDRB ||
+		offset == RZ_MTU3_TCBRA || offset == RZ_MTU3_TCBRB ||
+		offset == RZ_MTU3_TCNTSA || offset == RZ_MTU3_TCNTSB);
+}
+
+u16 rz_mtu3_shared_reg_read(struct rz_mtu3_channel *ch, u16 offset)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+
+	if (rz_mtu3_is_16bit_shared_reg(offset))
+		return readw(priv->mmio + offset);
+	else
+		return readb(priv->mmio + offset);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_shared_reg_read);
+
+u8 rz_mtu3_8bit_ch_read(struct rz_mtu3_channel *ch, u16 offset)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	ch_offs = rz_mtu3_8bit_ch_reg_offs[ch->channel_number][offset];
+
+	return readb(priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_8bit_ch_read);
+
+u16 rz_mtu3_16bit_ch_read(struct rz_mtu3_channel *ch, u16 offset)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	/* MTU8 doesn't have 16-bit registers */
+	if (ch->channel_number == RZ_MTU3_CHAN_8)
+		return 0;
+
+	ch_offs = rz_mtu3_16bit_ch_reg_offs[ch->channel_number][offset];
+
+	return readw(priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_16bit_ch_read);
+
+u32 rz_mtu3_32bit_ch_read(struct rz_mtu3_channel *ch, u16 offset)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	if (ch->channel_number != RZ_MTU3_CHAN_1 && ch->channel_number != RZ_MTU3_CHAN_8)
+		return 0;
+
+	ch_offs = rz_mtu3_32bit_ch_reg_offs[ch->channel_number][offset];
+
+	return readl(priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_32bit_ch_read);
+
+void rz_mtu3_8bit_ch_write(struct rz_mtu3_channel *ch, u16 offset, u8 val)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	ch_offs = rz_mtu3_8bit_ch_reg_offs[ch->channel_number][offset];
+	writeb(val, priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_8bit_ch_write);
+
+void rz_mtu3_16bit_ch_write(struct rz_mtu3_channel *ch, u16 offset, u16 val)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	/* MTU8 doesn't have 16-bit registers */
+	if (ch->channel_number == RZ_MTU3_CHAN_8)
+		return;
+
+	ch_offs = rz_mtu3_16bit_ch_reg_offs[ch->channel_number][offset];
+	writew(val, priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_16bit_ch_write);
+
+void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 offset, u32 val)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	u16 ch_offs;
+
+	if (ch->channel_number != RZ_MTU3_CHAN_1 && ch->channel_number != RZ_MTU3_CHAN_8)
+		return;
+
+	ch_offs = rz_mtu3_32bit_ch_reg_offs[ch->channel_number][offset];
+	writel(val, priv->mmio + ch_offs);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_32bit_ch_write);
+
+void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 offset, u16 value)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+
+	if (rz_mtu3_is_16bit_shared_reg(offset))
+		writew(value, priv->mmio + offset);
+	else
+		writeb((u8)value, priv->mmio + offset);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_shared_reg_write);
+
+void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch, u16 offset,
+				   u16 pos, u8 val)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	unsigned long tmdr, flags;
+
+	raw_spin_lock_irqsave(&priv->lock, flags);
+	tmdr = rz_mtu3_shared_reg_read(ch, offset);
+	__assign_bit(pos, &tmdr, !!val);
+	rz_mtu3_shared_reg_write(ch, offset, tmdr);
+	raw_spin_unlock_irqrestore(&priv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_shared_reg_update_bit);
+
+static u16 rz_mtu3_get_tstr_offset(struct rz_mtu3_channel *ch)
+{
+	u16 offset;
+
+	switch (ch->channel_number) {
+	case RZ_MTU3_CHAN_0:
+	case RZ_MTU3_CHAN_1:
+	case RZ_MTU3_CHAN_2:
+	case RZ_MTU3_CHAN_3:
+	case RZ_MTU3_CHAN_4:
+	case RZ_MTU3_CHAN_8:
+		offset = RZ_MTU3_TSTRA;
+		break;
+	case RZ_MTU3_CHAN_5:
+		offset = RZ_MTU3_TSTR;
+		break;
+	case RZ_MTU3_CHAN_6:
+	case RZ_MTU3_CHAN_7:
+		offset = RZ_MTU3_TSTRB;
+		break;
+	default:
+		offset = 0;
+		break;
+	}
+
+	return offset;
+}
+
+static u8 rz_mtu3_get_tstr_bit_pos(struct rz_mtu3_channel *ch)
+{
+	u8 bitpos;
+
+	switch (ch->channel_number) {
+	case RZ_MTU3_CHAN_0:
+	case RZ_MTU3_CHAN_1:
+	case RZ_MTU3_CHAN_2:
+	case RZ_MTU3_CHAN_6:
+	case RZ_MTU3_CHAN_7:
+		bitpos = ch->channel_number;
+		break;
+	case RZ_MTU3_CHAN_3:
+		bitpos = 6;
+		break;
+	case RZ_MTU3_CHAN_4:
+		bitpos = 7;
+		break;
+	case RZ_MTU3_CHAN_5:
+		bitpos = 2;
+		break;
+	case RZ_MTU3_CHAN_8:
+		bitpos = 3;
+		break;
+	default:
+		bitpos = 0;
+		break;
+	}
+
+	return bitpos;
+}
+
+static void rz_mtu3_start_stop_ch(struct rz_mtu3_channel *ch, bool start)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	unsigned long flags, tstr;
+	u16 offset;
+	u8 bitpos;
+
+	/* start stop register shared by multiple timer channels */
+	raw_spin_lock_irqsave(&priv->lock, flags);
+
+	offset = rz_mtu3_get_tstr_offset(ch);
+	bitpos = rz_mtu3_get_tstr_bit_pos(ch);
+	tstr = rz_mtu3_shared_reg_read(ch, offset);
+	__assign_bit(bitpos, &tstr, start);
+	rz_mtu3_shared_reg_write(ch, offset, tstr);
+
+	raw_spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+bool rz_mtu3_is_enabled(struct rz_mtu3_channel *ch)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(ch->dev->parent);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+	unsigned long flags, tstr;
+	bool ret = false;
+	u16 offset;
+	u8 bitpos;
+
+	/* start stop register shared by multiple timer channels */
+	raw_spin_lock_irqsave(&priv->lock, flags);
+
+	offset = rz_mtu3_get_tstr_offset(ch);
+	bitpos = rz_mtu3_get_tstr_bit_pos(ch);
+	tstr = rz_mtu3_shared_reg_read(ch, offset);
+	ret = tstr & BIT(bitpos);
+
+	raw_spin_unlock_irqrestore(&priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_is_enabled);
+
+int rz_mtu3_enable(struct rz_mtu3_channel *ch)
+{
+	/* enable channel */
+	rz_mtu3_start_stop_ch(ch, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_enable);
+
+void rz_mtu3_disable(struct rz_mtu3_channel *ch)
+{
+	/* disable channel */
+	rz_mtu3_start_stop_ch(ch, false);
+}
+EXPORT_SYMBOL_GPL(rz_mtu3_disable);
+
+static void rz_mtu3_reset_assert(void *data)
+{
+	struct rz_mtu3 *mtu = dev_get_drvdata(data);
+	struct rz_mtu3_priv *priv = mtu->priv_data;
+
+	mfd_remove_devices(data);
+	reset_control_assert(priv->rstc);
+}
+
+static const struct mfd_cell rz_mtu3_devs[] = {
+	{
+		.name = "rz-mtu3-counter",
+	},
+	{
+		.name = "pwm-rz-mtu3",
+	},
+};
+
+static int rz_mtu3_probe(struct platform_device *pdev)
+{
+	struct rz_mtu3_priv *priv;
+	struct rz_mtu3 *ddata;
+	unsigned int i;
+	int ret;
+
+	ddata = devm_kzalloc(&pdev->dev, sizeof(*ddata), GFP_KERNEL);
+	if (!ddata)
+		return -ENOMEM;
+
+	ddata->priv_data = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!ddata->priv_data)
+		return -ENOMEM;
+
+	priv = ddata->priv_data;
+
+	priv->mmio = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(priv->mmio))
+		return PTR_ERR(priv->mmio);
+
+	priv->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL);
+	if (IS_ERR(priv->rstc))
+		return PTR_ERR(priv->rstc);
+
+	ddata->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(ddata->clk))
+		return PTR_ERR(ddata->clk);
+
+	reset_control_deassert(priv->rstc);
+	raw_spin_lock_init(&priv->lock);
+	platform_set_drvdata(pdev, ddata);
+
+	for (i = 0; i < RZ_MTU_NUM_CHANNELS; i++) {
+		ddata->channels[i].channel_number = i;
+		ddata->channels[i].is_busy = false;
+		mutex_init(&ddata->channels[i].lock);
+	}
+
+	ret = mfd_add_devices(&pdev->dev, 0, rz_mtu3_devs,
+			      ARRAY_SIZE(rz_mtu3_devs), NULL, 0, NULL);
+	if (ret < 0)
+		goto err_assert;
+
+	return devm_add_action_or_reset(&pdev->dev, rz_mtu3_reset_assert,
+					&pdev->dev);
+
+err_assert:
+	reset_control_assert(priv->rstc);
+	return ret;
+}
+
+static const struct of_device_id rz_mtu3_of_match[] = {
+	{ .compatible = "renesas,rz-mtu3", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, rz_mtu3_of_match);
+
+static struct platform_driver rz_mtu3_driver = {
+	.probe = rz_mtu3_probe,
+	.driver	= {
+		.name = "rz-mtu3",
+		.of_match_table = rz_mtu3_of_match,
+	},
+};
+module_platform_driver(rz_mtu3_driver);
+
+MODULE_AUTHOR("Biju Das <biju.das.jz@bp.renesas.com>");
+MODULE_DESCRIPTION("Renesas RZ/G2L MTU3a Core Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/rz-mtu3.h b/drivers/mfd/rz-mtu3.h
new file mode 100644
index 000000000000..51a1298b0613
--- /dev/null
+++ b/drivers/mfd/rz-mtu3.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MFD internals for Renesas RZ/G2L MTU3 Core driver
+ *
+ * Copyright (C) 2023 Renesas Electronics Corporation
+ */
+
+#ifndef RZ_MTU3_MFD_H
+#define RZ_MTU3_MFD_H
+
+#define MTU_8BIT_CH_0(_tier, _nfcr, _tcr, _tcr2, _tmdr1, _tiorh, _tiorl, _tbtm) \
+	{ \
+		[RZ_MTU3_TIER] = _tier, \
+		[RZ_MTU3_NFCR] = _nfcr, \
+		[RZ_MTU3_TCR] = _tcr, \
+		[RZ_MTU3_TCR2] = _tcr2, \
+		[RZ_MTU3_TMDR1] = _tmdr1, \
+		[RZ_MTU3_TIORH] = _tiorh, \
+		[RZ_MTU3_TIORL] = _tiorl, \
+		[RZ_MTU3_TBTM] = _tbtm \
+	}
+
+#define MTU_8BIT_CH_1_2(_tier, _nfcr, _tsr, _tcr, _tcr2, _tmdr1, _tior) \
+	{ \
+		[RZ_MTU3_TIER] = _tier, \
+		[RZ_MTU3_NFCR] = _nfcr, \
+		[RZ_MTU3_TSR] = _tsr, \
+		[RZ_MTU3_TCR] = _tcr, \
+		[RZ_MTU3_TCR2] = _tcr2, \
+		[RZ_MTU3_TMDR1] = _tmdr1, \
+		[RZ_MTU3_TIOR] = _tior \
+	} \
+
+#define MTU_8BIT_CH_3_4_6_7(_tier, _nfcr, _tsr, _tcr, _tcr2, _tmdr1, _tiorh, _tiorl, _tbtm) \
+	{ \
+		[RZ_MTU3_TIER] = _tier, \
+		[RZ_MTU3_NFCR] = _nfcr, \
+		[RZ_MTU3_TSR] = _tsr, \
+		[RZ_MTU3_TCR] = _tcr, \
+		[RZ_MTU3_TCR2] = _tcr2, \
+		[RZ_MTU3_TMDR1] = _tmdr1, \
+		[RZ_MTU3_TIORH] = _tiorh, \
+		[RZ_MTU3_TIORL] = _tiorl, \
+		[RZ_MTU3_TBTM] = _tbtm \
+	} \
+
+#define MTU_8BIT_CH_5(_tier, _nfcr, _tstr, _tcntcmpclr, _tcru, _tcr2u, _tioru, \
+		      _tcrv, _tcr2v, _tiorv, _tcrw, _tcr2w, _tiorw) \
+	{ \
+		[RZ_MTU3_TIER] = _tier, \
+		[RZ_MTU3_NFCR] = _nfcr, \
+		[RZ_MTU3_TSTR] = _tstr, \
+		[RZ_MTU3_TCNTCMPCLR] = _tcntcmpclr, \
+		[RZ_MTU3_TCRU] = _tcru, \
+		[RZ_MTU3_TCR2U] = _tcr2u, \
+		[RZ_MTU3_TIORU] = _tioru, \
+		[RZ_MTU3_TCRV] = _tcrv, \
+		[RZ_MTU3_TCR2V] = _tcr2v, \
+		[RZ_MTU3_TIORV] = _tiorv, \
+		[RZ_MTU3_TCRW] = _tcrw, \
+		[RZ_MTU3_TCR2W] = _tcr2w, \
+		[RZ_MTU3_TIORW] = _tiorw \
+	} \
+
+#define MTU_8BIT_CH_8(_tier, _nfcr, _tcr, _tcr2, _tmdr1, _tiorh, _tiorl) \
+	{ \
+		[RZ_MTU3_TIER] = _tier, \
+		[RZ_MTU3_NFCR] = _nfcr, \
+		[RZ_MTU3_TCR] = _tcr, \
+		[RZ_MTU3_TCR2] = _tcr2, \
+		[RZ_MTU3_TMDR1] = _tmdr1, \
+		[RZ_MTU3_TIORH] = _tiorh, \
+		[RZ_MTU3_TIORL] = _tiorl \
+	} \
+
+#define MTU_16BIT_CH_0(_tcnt, _tgra, _tgrb, _tgrc, _tgrd, _tgre, _tgrf) \
+	{ \
+		[RZ_MTU3_TCNT] = _tcnt, \
+		[RZ_MTU3_TGRA] = _tgra, \
+		[RZ_MTU3_TGRB] = _tgrb, \
+		[RZ_MTU3_TGRC] = _tgrc, \
+		[RZ_MTU3_TGRD] = _tgrd, \
+		[RZ_MTU3_TGRE] = _tgre, \
+		[RZ_MTU3_TGRF] = _tgrf \
+	}
+
+#define MTU_16BIT_CH_1_2(_tcnt, _tgra, _tgrb) \
+	{ \
+		[RZ_MTU3_TCNT] = _tcnt, \
+		[RZ_MTU3_TGRA] = _tgra, \
+		[RZ_MTU3_TGRB] = _tgrb \
+	}
+
+#define MTU_16BIT_CH_3_6(_tcnt, _tgra, _tgrb, _tgrc, _tgrd, _tgre) \
+	{ \
+		[RZ_MTU3_TCNT] = _tcnt, \
+		[RZ_MTU3_TGRA] = _tgra, \
+		[RZ_MTU3_TGRB] = _tgrb, \
+		[RZ_MTU3_TGRC] = _tgrc, \
+		[RZ_MTU3_TGRD] = _tgrd, \
+		[RZ_MTU3_TGRE] = _tgre \
+	}
+
+#define MTU_16BIT_CH_4_7(_tcnt, _tgra, _tgrb, _tgrc, _tgrd, _tgre, _tgrf, \
+			  _tadcr, _tadcora, _tadcorb, _tadcobra, _tadcobrb) \
+	{ \
+		[RZ_MTU3_TCNT] = _tcnt, \
+		[RZ_MTU3_TGRA] = _tgra, \
+		[RZ_MTU3_TGRB] = _tgrb, \
+		[RZ_MTU3_TGRC] = _tgrc, \
+		[RZ_MTU3_TGRD] = _tgrd, \
+		[RZ_MTU3_TGRE] = _tgre, \
+		[RZ_MTU3_TGRF] = _tgrf, \
+		[RZ_MTU3_TADCR] = _tadcr, \
+		[RZ_MTU3_TADCORA] = _tadcora, \
+		[RZ_MTU3_TADCORB] = _tadcorb, \
+		[RZ_MTU3_TADCOBRA] = _tadcobra, \
+		[RZ_MTU3_TADCOBRB] = _tadcobrb \
+	}
+
+#define MTU_16BIT_CH_5(_tcntu, _tgru, _tcntv, _tgrv, _tcntw, _tgrw) \
+	{ \
+		[RZ_MTU3_TCNTU] = _tcntu, \
+		[RZ_MTU3_TGRU] = _tgru, \
+		[RZ_MTU3_TCNTV] = _tcntv, \
+		[RZ_MTU3_TGRV] = _tgrv, \
+		[RZ_MTU3_TCNTW] = _tcntw, \
+		[RZ_MTU3_TGRW] = _tgrw \
+	}
+
+#define MTU_32BIT_CH_1(_tcntlw, _tgralw, _tgrblw) \
+	{ \
+	       [RZ_MTU3_TCNTLW] = _tcntlw, \
+	       [RZ_MTU3_TGRALW] = _tgralw, \
+	       [RZ_MTU3_TGRBLW] = _tgrblw \
+	}
+
+#define MTU_32BIT_CH_8(_tcnt, _tgra, _tgrb, _tgrc, _tgrd) \
+	{ \
+	       [RZ_MTU3_TCNT] = _tcnt, \
+	       [RZ_MTU3_TGRA] = _tgra, \
+	       [RZ_MTU3_TGRB] = _tgrb, \
+	       [RZ_MTU3_TGRC] = _tgrc, \
+	       [RZ_MTU3_TGRD] = _tgrd \
+	}
+
+#endif
diff --git a/include/linux/mfd/rz-mtu3.h b/include/linux/mfd/rz-mtu3.h
new file mode 100644
index 000000000000..c5173bc06270
--- /dev/null
+++ b/include/linux/mfd/rz-mtu3.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Renesas Electronics Corporation
+ */
+#ifndef __MFD_RZ_MTU3_H__
+#define __MFD_RZ_MTU3_H__
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+
+/* 8-bit shared register offsets macros */
+#define RZ_MTU3_TSTRA	0x080 /* Timer start register A */
+#define RZ_MTU3_TSTRB	0x880 /* Timer start register B */
+
+/* 16-bit shared register offset macros */
+#define RZ_MTU3_TDDRA	0x016 /* Timer dead time data register A */
+#define RZ_MTU3_TDDRB	0x816 /* Timer dead time data register B */
+#define RZ_MTU3_TCDRA	0x014 /* Timer cycle data register A */
+#define RZ_MTU3_TCDRB	0x814 /* Timer cycle data register B */
+#define RZ_MTU3_TCBRA	0x022 /* Timer cycle buffer register A */
+#define RZ_MTU3_TCBRB	0x822 /* Timer cycle buffer register B */
+#define RZ_MTU3_TCNTSA	0x020 /* Timer subcounter A */
+#define RZ_MTU3_TCNTSB	0x820 /* Timer subcounter B */
+
+/*
+ * MTU5 contains 3 timer counter registers and is totaly different
+ * from other channels, so we must separate its offset
+ */
+
+/* 8-bit register offset macros of MTU3 channels except MTU5 */
+#define RZ_MTU3_TIER	0 /* Timer interrupt register */
+#define RZ_MTU3_NFCR	1 /* Noise filter control register */
+#define RZ_MTU3_TSR	2 /* Timer status register */
+#define RZ_MTU3_TCR	3 /* Timer control register */
+#define RZ_MTU3_TCR2	4 /* Timer control register 2 */
+
+/* Timer mode register 1 */
+#define RZ_MTU3_TMDR1	5
+#define RZ_MTU3_TMDR1_MD		GENMASK(3, 0)
+#define RZ_MTU3_TMDR1_MD_NORMAL		FIELD_PREP(RZ_MTU3_TMDR1_MD, 0)
+#define RZ_MTU3_TMDR1_MD_PWMMODE1	FIELD_PREP(RZ_MTU3_TMDR1_MD, 2)
+
+#define RZ_MTU3_TIOR	6 /* Timer I/O control register */
+#define RZ_MTU3_TIORH	6 /* Timer I/O control register H */
+#define RZ_MTU3_TIORL	7 /* Timer I/O control register L */
+/* Only MTU3/4/6/7 have TBTM registers */
+#define RZ_MTU3_TBTM	8 /* Timer buffer operation transfer mode register */
+
+/* 8-bit MTU5 register offset macros */
+#define RZ_MTU3_TSTR		2 /* MTU5 Timer start register */
+#define RZ_MTU3_TCNTCMPCLR	3 /* MTU5 Timer compare match clear register */
+#define RZ_MTU3_TCRU		4 /* Timer control register U */
+#define RZ_MTU3_TCR2U		5 /* Timer control register 2U */
+#define RZ_MTU3_TIORU		6 /* Timer I/O control register U */
+#define RZ_MTU3_TCRV		7 /* Timer control register V */
+#define RZ_MTU3_TCR2V		8 /* Timer control register 2V */
+#define RZ_MTU3_TIORV		9 /* Timer I/O control register V */
+#define RZ_MTU3_TCRW		10 /* Timer control register W */
+#define RZ_MTU3_TCR2W		11 /* Timer control register 2W */
+#define RZ_MTU3_TIORW		12 /* Timer I/O control register W */
+
+/* 16-bit register offset macros of MTU3 channels except MTU5 */
+#define RZ_MTU3_TCNT		0 /* Timer counter */
+#define RZ_MTU3_TGRA		1 /* Timer general register A */
+#define RZ_MTU3_TGRB		2 /* Timer general register B */
+#define RZ_MTU3_TGRC		3 /* Timer general register C */
+#define RZ_MTU3_TGRD		4 /* Timer general register D */
+#define RZ_MTU3_TGRE		5 /* Timer general register E */
+#define RZ_MTU3_TGRF		6 /* Timer general register F */
+/* Timer A/D converter start request registers */
+#define RZ_MTU3_TADCR		7 /* control register */
+#define RZ_MTU3_TADCORA		8 /* cycle set register A */
+#define RZ_MTU3_TADCORB		9 /* cycle set register B */
+#define RZ_MTU3_TADCOBRA	10 /* cycle set buffer register A */
+#define RZ_MTU3_TADCOBRB	11 /* cycle set buffer register B */
+
+/* 16-bit MTU5 register offset macros */
+#define RZ_MTU3_TCNTU		0 /* MTU5 Timer counter U */
+#define RZ_MTU3_TGRU		1 /* MTU5 Timer general register U */
+#define RZ_MTU3_TCNTV		2 /* MTU5 Timer counter V */
+#define RZ_MTU3_TGRV		3 /* MTU5 Timer general register V */
+#define RZ_MTU3_TCNTW		4 /* MTU5 Timer counter W */
+#define RZ_MTU3_TGRW		5 /* MTU5 Timer general register W */
+
+/* 32-bit register offset */
+#define RZ_MTU3_TCNTLW		0 /* Timer longword counter */
+#define RZ_MTU3_TGRALW		1 /* Timer longword general register A */
+#define RZ_MTU3_TGRBLW		2 /* Timer longowrd general register B */
+
+#define RZ_MTU3_TMDR3		0x191 /* MTU1 Timer Mode Register 3 */
+
+/* Macros for setting registers */
+#define RZ_MTU3_TCR_CCLR	GENMASK(7, 5)
+#define RZ_MTU3_TCR_CKEG	GENMASK(4, 3)
+#define RZ_MTU3_TCR_TPCS	GENMASK(2, 0)
+#define RZ_MTU3_TCR_CCLR_TGRA	BIT(5)
+#define RZ_MTU3_TCR_CCLR_TGRC	FIELD_PREP(RZ_MTU3_TCR_CCLR, 5)
+#define RZ_MTU3_TCR_CKEG_RISING	FIELD_PREP(RZ_MTU3_TCR_CKEG, 0)
+
+#define RZ_MTU3_TIOR_IOB			GENMASK(7, 4)
+#define RZ_MTU3_TIOR_IOA			GENMASK(3, 0)
+#define RZ_MTU3_TIOR_OC_RETAIN			0
+#define RZ_MTU3_TIOR_OC_INIT_OUT_LO_HI_OUT	2
+#define RZ_MTU3_TIOR_OC_INIT_OUT_HI_TOGGLE_OUT	7
+
+#define RZ_MTU3_TIOR_OC_IOA_H_COMP_MATCH \
+	FIELD_PREP(RZ_MTU3_TIOR_IOA, RZ_MTU3_TIOR_OC_INIT_OUT_LO_HI_OUT)
+#define RZ_MTU3_TIOR_OC_IOB_TOGGLE \
+	FIELD_PREP(RZ_MTU3_TIOR_IOB, RZ_MTU3_TIOR_OC_INIT_OUT_HI_TOGGLE_OUT)
+
+enum rz_mtu3_channels {
+	RZ_MTU3_CHAN_0,
+	RZ_MTU3_CHAN_1,
+	RZ_MTU3_CHAN_2,
+	RZ_MTU3_CHAN_3,
+	RZ_MTU3_CHAN_4,
+	RZ_MTU3_CHAN_5,
+	RZ_MTU3_CHAN_6,
+	RZ_MTU3_CHAN_7,
+	RZ_MTU3_CHAN_8,
+	RZ_MTU_NUM_CHANNELS
+};
+
+/**
+ * struct rz_mtu3_channel - MTU3 channel private data
+ *
+ * @dev: device handle
+ * @channel_number: channel number
+ * @lock: Lock to protect channel state
+ * @is_busy: channel state
+ */
+struct rz_mtu3_channel {
+	struct device *dev;
+	unsigned int channel_number;
+	struct mutex lock;
+	bool is_busy;
+};
+
+/**
+ * struct rz_mtu3 - MTU3 core private data
+ *
+ * @clk: MTU3 module clock
+ * @rz_mtu3_channel: HW channels
+ * @priv_data: MTU3 core driver private data
+ */
+struct rz_mtu3 {
+	struct clk *clk;
+	struct rz_mtu3_channel channels[RZ_MTU_NUM_CHANNELS];
+
+	void *priv_data;
+};
+
+#if IS_ENABLED(CONFIG_RZ_MTU3)
+static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
+{
+	mutex_lock(&ch->lock);
+	if (ch->is_busy) {
+		mutex_unlock(&ch->lock);
+		return false;
+	}
+
+	ch->is_busy = true;
+	mutex_unlock(&ch->lock);
+
+	return true;
+}
+
+static inline void rz_mtu3_release_channel(struct rz_mtu3_channel *ch)
+{
+	mutex_lock(&ch->lock);
+	ch->is_busy = false;
+	mutex_unlock(&ch->lock);
+}
+
+bool rz_mtu3_is_enabled(struct rz_mtu3_channel *ch);
+void rz_mtu3_disable(struct rz_mtu3_channel *ch);
+int rz_mtu3_enable(struct rz_mtu3_channel *ch);
+
+u8 rz_mtu3_8bit_ch_read(struct rz_mtu3_channel *ch, u16 off);
+u16 rz_mtu3_16bit_ch_read(struct rz_mtu3_channel *ch, u16 off);
+u32 rz_mtu3_32bit_ch_read(struct rz_mtu3_channel *ch, u16 off);
+u16 rz_mtu3_shared_reg_read(struct rz_mtu3_channel *ch, u16 off);
+
+void rz_mtu3_8bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u8 val);
+void rz_mtu3_16bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u16 val);
+void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val);
+void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val);
+void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch, u16 off,
+				   u16 pos, u8 val);
+#else
+static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
+{
+	return false;
+}
+
+static inline void rz_mtu3_release_channel(struct rz_mtu3_channel *ch)
+{
+}
+
+static inline bool rz_mtu3_is_enabled(struct rz_mtu3_channel *ch)
+{
+	return false;
+}
+
+static inline void rz_mtu3_disable(struct rz_mtu3_channel *ch)
+{
+}
+
+static inline int rz_mtu3_enable(struct rz_mtu3_channel *ch)
+{
+	return 0;
+}
+
+static inline u8 rz_mtu3_8bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
+{
+	return 0;
+}
+
+static inline u16 rz_mtu3_16bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
+{
+	return 0;
+}
+
+static inline u32 rz_mtu3_32bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
+{
+	return 0;
+}
+
+static inline u16 rz_mtu3_shared_reg_read(struct rz_mtu3_channel *ch, u16 off)
+{
+	return 0;
+}
+
+static inline void rz_mtu3_8bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u8 val)
+{
+}
+
+static inline void rz_mtu3_16bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
+{
+}
+
+static inline void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val)
+{
+}
+
+static inline void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
+{
+}
+
+static inline void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch,
+						 u16 off, u16 pos, u8 val)
+{
+}
+#endif
+
+#endif /* __MFD_RZ_MTU3_H__ */
-- 
cgit v1.2.3


From 378b0e9f247450c9c4e49f02d9d79799a0c7c349 Mon Sep 17 00:00:00 2001
From: Patrick Delaunay <patrick.delaunay@foss.st.com>
Date: Mon, 17 Apr 2023 18:14:06 +0200
Subject: dt-bindings: mfd: stm32: Remove unnecessary blank lines

Remove double blank line.

Signed-off-by: Patrick Delaunay <patrick.delaunay@foss.st.com>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230417181342.v2.1.I483a676579cc7e3ac07e1db649091553743fecc8@changeid
---
 include/dt-bindings/mfd/stm32f4-rcc.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/mfd/stm32f4-rcc.h b/include/dt-bindings/mfd/stm32f4-rcc.h
index 309e8c79f27b..36448a5619a1 100644
--- a/include/dt-bindings/mfd/stm32f4-rcc.h
+++ b/include/dt-bindings/mfd/stm32f4-rcc.h
@@ -34,7 +34,6 @@
 #define STM32F4_AHB1_RESET(bit) (STM32F4_RCC_AHB1_##bit + (0x10 * 8))
 #define STM32F4_AHB1_CLOCK(bit) (STM32F4_RCC_AHB1_##bit)
 
-
 /* AHB2 */
 #define STM32F4_RCC_AHB2_DCMI	0
 #define STM32F4_RCC_AHB2_CRYP	4
-- 
cgit v1.2.3


From dcb779fcd4ed5984ad15991d574943d12a8693d1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 15 Feb 2023 06:53:54 -0500
Subject: nfsd: allow reaping files still under writeback

On most filesystems, there is no reason to delay reaping an nfsd_file
just because its underlying inode is still under writeback. nfsd just
relies on client activity or the local flusher threads to do writeback.

The main exception is NFS, which flushes all of its dirty data on last
close. Add a new EXPORT_OP_FLUSH_ON_CLOSE flag to allow filesystems to
signal that they do this, and only skip closing files under writeback on
such filesystems.

Also, remove a redundant NULL file pointer check in
nfsd_file_check_writeback, and clean up nfs's export op flag
definitions.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/export.c          |  9 ++++++---
 fs/nfsd/filecache.c      | 12 +++++++++++-
 include/linux/exportfs.h |  1 +
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index d6a6d1ebb8fd..be686b8e0c54 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -149,7 +149,10 @@ const struct export_operations nfs_export_ops = {
 	.encode_fh = nfs_encode_fh,
 	.fh_to_dentry = nfs_fh_to_dentry,
 	.get_parent = nfs_get_parent,
-	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
-		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
-		EXPORT_OP_NOATOMIC_ATTR,
+	.flags = EXPORT_OP_NOWCC		|
+		 EXPORT_OP_NOSUBTREECHK		|
+		 EXPORT_OP_CLOSE_BEFORE_UNLINK	|
+		 EXPORT_OP_REMOTE_FS		|
+		 EXPORT_OP_NOATOMIC_ATTR	|
+		 EXPORT_OP_FLUSH_ON_CLOSE,
 };
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9b7082fdd211..a6fa6e980277 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -402,13 +402,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
 	struct file *file = nf->nf_file;
 	struct address_space *mapping;
 
-	if (!file || !(file->f_mode & FMODE_WRITE))
+	/* File not open for write? */
+	if (!(file->f_mode & FMODE_WRITE))
 		return false;
+
+	/*
+	 * Some filesystems (e.g. NFS) flush all dirty data on close.
+	 * On others, there is no need to wait for writeback.
+	 */
+	if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE))
+		return false;
+
 	mapping = file->f_mapping;
 	return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
 		mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
 }
 
+
 static bool nfsd_file_lru_add(struct nfsd_file *nf)
 {
 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 601700fedc91..9edb29101ec8 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -220,6 +220,7 @@ struct export_operations {
 #define EXPORT_OP_NOATOMIC_ATTR		(0x10) /* Filesystem cannot supply
 						  atomic attribute updates
 						*/
+#define EXPORT_OP_FLUSH_ON_CLOSE	(0x20) /* fs flushes file data on close */
 	unsigned long	flags;
 };
 
-- 
cgit v1.2.3


From c88c680c6de5077292667fae4a147fd5a6523479 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 3 Mar 2023 07:15:58 -0500
Subject: lockd: remove 2 unused helper functions

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lockd/lockd.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0168ac9fdda8..26c2aed31a0c 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -99,21 +99,11 @@ struct nsm_handle {
 /*
  * Rigorous type checking on sockaddr type conversions
  */
-static inline struct sockaddr_in *nlm_addr_in(const struct nlm_host *host)
-{
-	return (struct sockaddr_in *)&host->h_addr;
-}
-
 static inline struct sockaddr *nlm_addr(const struct nlm_host *host)
 {
 	return (struct sockaddr *)&host->h_addr;
 }
 
-static inline struct sockaddr_in *nlm_srcaddr_in(const struct nlm_host *host)
-{
-	return (struct sockaddr_in *)&host->h_srcaddr;
-}
-
 static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host)
 {
 	return (struct sockaddr *)&host->h_srcaddr;
-- 
cgit v1.2.3


From f0aa4852e63f9c1cfd4322c770e69d7e6817e906 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 3 Mar 2023 07:15:59 -0500
Subject: lockd: move struct nlm_wait to lockd.h

The next patch needs struct nlm_wait in fs/lockd/clntproc.c, so move
the definition to a shared header file. As an added clean-up, drop
the unused b_reclaim field.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/clntlock.c         | 12 ------------
 include/linux/lockd/lockd.h | 11 ++++++++++-
 2 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 82b19a30e0f0..464cb15c1a06 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -29,18 +29,6 @@ static int			reclaimer(void *ptr);
  * client perspective.
  */
 
-/*
- * This is the representation of a blocked client lock.
- */
-struct nlm_wait {
-	struct list_head	b_list;		/* linked list */
-	wait_queue_head_t	b_wait;		/* where to wait on */
-	struct nlm_host *	b_host;
-	struct file_lock *	b_lock;		/* local file lock */
-	unsigned short		b_reclaim;	/* got to reclaim lock */
-	__be32			b_status;	/* grant callback status */
-};
-
 static LIST_HEAD(nlm_blocked);
 static DEFINE_SPINLOCK(nlm_blocked_lock);
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 26c2aed31a0c..de5ad2c2179a 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -121,7 +121,16 @@ struct nlm_lockowner {
 	uint32_t pid;
 };
 
-struct nlm_wait;
+/*
+ * This is the representation of a blocked client lock.
+ */
+struct nlm_wait {
+	struct list_head	b_list;		/* linked list */
+	wait_queue_head_t	b_wait;		/* where to wait on */
+	struct nlm_host		*b_host;
+	struct file_lock	*b_lock;	/* local file lock */
+	__be32			b_status;	/* grant callback status */
+};
 
 /*
  * Memory chunk for NLM client RPC request.
-- 
cgit v1.2.3


From 2005f5b9c35bd736c81e9f24f5c5051967c022ee Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 3 Mar 2023 07:16:00 -0500
Subject: lockd: fix races in client GRANTED_MSG wait logic

After the wait for a grant is done (for whatever reason), nlmclnt_block
updates the status of the nlm_rqst with the status of the block. At the
point it does this, however, the block is still queued its status could
change at any time.

This is particularly a problem when the waiting task is signaled during
the wait. We can end up giving up on the lock just before the GRANTED_MSG
callback comes in, and accept it even though the lock request gets back
an error, leaving a dangling lock on the server.

Since the nlm_wait never lives beyond the end of nlmclnt_lock, put it on
the stack and add functions to allow us to enqueue and dequeue the
block. Enqueue it just before the lock/wait loop, and dequeue it
just after we exit the loop instead of waiting until the end of
the function. Also, scrape the status at the time that we dequeue it to
ensure that it's final.

Reported-by: Yongcheng Yang <yoyang@redhat.com>
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2063818
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/clntlock.c         | 42 +++++++++++++++++++++---------------------
 fs/lockd/clntproc.c         | 28 ++++++++++++++++++----------
 include/linux/lockd/lockd.h |  8 +++++---
 3 files changed, 44 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 464cb15c1a06..c374ee072db3 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -82,41 +82,42 @@ void nlmclnt_done(struct nlm_host *host)
 }
 EXPORT_SYMBOL_GPL(nlmclnt_done);
 
+void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct file_lock *fl)
+{
+	block->b_host = host;
+	block->b_lock = fl;
+	init_waitqueue_head(&block->b_wait);
+	block->b_status = nlm_lck_blocked;
+}
+
 /*
  * Queue up a lock for blocking so that the GRANTED request can see it
  */
-struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl)
+void nlmclnt_queue_block(struct nlm_wait *block)
 {
-	struct nlm_wait *block;
-
-	block = kmalloc(sizeof(*block), GFP_KERNEL);
-	if (block != NULL) {
-		block->b_host = host;
-		block->b_lock = fl;
-		init_waitqueue_head(&block->b_wait);
-		block->b_status = nlm_lck_blocked;
-
-		spin_lock(&nlm_blocked_lock);
-		list_add(&block->b_list, &nlm_blocked);
-		spin_unlock(&nlm_blocked_lock);
-	}
-	return block;
+	spin_lock(&nlm_blocked_lock);
+	list_add(&block->b_list, &nlm_blocked);
+	spin_unlock(&nlm_blocked_lock);
 }
 
-void nlmclnt_finish_block(struct nlm_wait *block)
+/*
+ * Dequeue the block and return its final status
+ */
+__be32 nlmclnt_dequeue_block(struct nlm_wait *block)
 {
-	if (block == NULL)
-		return;
+	__be32 status;
+
 	spin_lock(&nlm_blocked_lock);
 	list_del(&block->b_list);
+	status = block->b_status;
 	spin_unlock(&nlm_blocked_lock);
-	kfree(block);
+	return status;
 }
 
 /*
  * Block on a lock
  */
-int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
+int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 {
 	long ret;
 
@@ -142,7 +143,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 	/* Reset the lock status after a server reboot so we resend */
 	if (block->b_status == nlm_lck_denied_grace_period)
 		block->b_status = nlm_lck_blocked;
-	req->a_res.status = block->b_status;
 	return 0;
 }
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 16b4de868cd2..a14c9110719c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -516,9 +516,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	const struct cred *cred = nfs_file_cred(fl->fl_file);
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
-	struct nlm_wait *block = NULL;
+	struct nlm_wait block;
 	unsigned char fl_flags = fl->fl_flags;
 	unsigned char fl_type;
+	__be32 b_status;
 	int status = -ENOLCK;
 
 	if (nsm_monitor(host) < 0)
@@ -531,31 +532,41 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	if (status < 0)
 		goto out;
 
-	block = nlmclnt_prepare_block(host, fl);
+	nlmclnt_prepare_block(&block, host, fl);
 again:
 	/*
 	 * Initialise resp->status to a valid non-zero value,
 	 * since 0 == nlm_lck_granted
 	 */
 	resp->status = nlm_lck_blocked;
-	for(;;) {
+
+	/*
+	 * A GRANTED callback can come at any time -- even before the reply
+	 * to the LOCK request arrives, so we queue the wait before
+	 * requesting the lock.
+	 */
+	nlmclnt_queue_block(&block);
+	for (;;) {
 		/* Reboot protection */
 		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(cred, req, NLMPROC_LOCK);
 		if (status < 0)
 			break;
 		/* Did a reclaimer thread notify us of a server reboot? */
-		if (resp->status ==  nlm_lck_denied_grace_period)
+		if (resp->status == nlm_lck_denied_grace_period)
 			continue;
 		if (resp->status != nlm_lck_blocked)
 			break;
 		/* Wait on an NLM blocking lock */
-		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
+		status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT);
 		if (status < 0)
 			break;
-		if (resp->status != nlm_lck_blocked)
+		if (block.b_status != nlm_lck_blocked)
 			break;
 	}
+	b_status = nlmclnt_dequeue_block(&block);
+	if (resp->status == nlm_lck_blocked)
+		resp->status = b_status;
 
 	/* if we were interrupted while blocking, then cancel the lock request
 	 * and exit
@@ -564,7 +575,7 @@ again:
 		if (!req->a_args.block)
 			goto out_unlock;
 		if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
-			goto out_unblock;
+			goto out;
 	}
 
 	if (resp->status == nlm_granted) {
@@ -593,8 +604,6 @@ again:
 		status = -ENOLCK;
 	else
 		status = nlm_stat_to_errno(resp->status);
-out_unblock:
-	nlmclnt_finish_block(block);
 out:
 	nlmclnt_release_call(req);
 	return status;
@@ -602,7 +611,6 @@ out_unlock:
 	/* Fatal error: ensure that we remove the lock altogether */
 	dprintk("lockd: lock attempt ended in fatal error.\n"
 		"       Attempting to unlock.\n");
-	nlmclnt_finish_block(block);
 	fl_type = fl->fl_type;
 	fl->fl_type = F_UNLCK;
 	down_read(&host->h_rwsem);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index de5ad2c2179a..f42594a9efe0 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -211,9 +211,11 @@ struct nlm_rqst * nlm_alloc_call(struct nlm_host *host);
 int		  nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 int		  nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 void		  nlmclnt_release_call(struct nlm_rqst *);
-struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
-void		  nlmclnt_finish_block(struct nlm_wait *block);
-int		  nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
+void		  nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host,
+					struct file_lock *fl);
+void		  nlmclnt_queue_block(struct nlm_wait *block);
+__be32		  nlmclnt_dequeue_block(struct nlm_wait *block);
+int		  nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
 __be32		  nlmclnt_grant(const struct sockaddr *addr,
 				const struct nlm_lock *lock);
 void		  nlmclnt_recovery(struct nlm_host *);
-- 
cgit v1.2.3


From e59fb6749ed833deee5b3cfd7e89925296d41f49 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 3 Mar 2023 07:16:02 -0500
Subject: nfs: move nfs_fhandle_hash to common include file

lockd needs to be able to hash filehandles for tracepoints. Move the
nfs_fhandle_hash() helper to a common nfs include file.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/internal.h   | 15 ---------------
 include/linux/nfs.h | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a65fe2a63ab..10fb5e7573eb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -846,27 +846,12 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
 }
 
 #ifdef CONFIG_CRC32
-/**
- * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
- * @fh - pointer to filehandle
- *
- * returns a crc32 hash for the filehandle that is compatible with
- * the one displayed by "wireshark".
- */
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
-	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
-}
 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
 {
 	return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
 				NFS4_STATEID_OTHER_SIZE);
 }
 #else
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
-	return 0;
-}
 static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
 {
 	return 0;
diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index b06375e88e58..ceb70a926b95 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -10,6 +10,7 @@
 
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/string.h>
+#include <linux/crc32.h>
 #include <uapi/linux/nfs.h>
 
 /*
@@ -44,4 +45,23 @@ enum nfs3_stable_how {
 	/* used by direct.c to mark verf as invalid */
 	NFS_INVALID_STABLE_HOW = -1
 };
+
+#ifdef CONFIG_CRC32
+/**
+ * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
+ * @fh - pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
+}
+#else /* CONFIG_CRC32 */
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+	return 0;
+}
+#endif /* CONFIG_CRC32 */
 #endif /* _LINUX_NFS_H */
-- 
cgit v1.2.3


From cf64b9bce95095b80f4589e4f54572cc5d8c1538 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 8 Mar 2023 17:51:00 +1100
Subject: SUNRPC: return proper error from get_expiry()

The get_expiry() function currently returns a timestamp, and uses the
special return value of 0 to indicate an error.

Unfortunately this causes a problem when 0 is the correct return value.

On a system with no RTC it is possible that the boot time will be seen
to be "3".  When exportfs probes to see if a particular filesystem
supports NFS export it tries to cache information with an expiry time of
"3".  The intention is for this to be "long in the past".  Even with no
RTC it will not be far in the future (at most a second or two) so this
is harmless.
But if the boot time happens to have been calculated to be "3", then
get_expiry will fail incorrectly as it converts the number to "seconds
since bootime" - 0.

To avoid this problem we change get_expiry() to report the error quite
separately from the expiry time.  The error is now the return value.
The expiry time is reported through a by-reference parameter.

Reported-by: Jerry Zhang <jerry@skydio.com>
Tested-by: Jerry Zhang <jerry@skydio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/export.c                  | 13 ++++++-------
 fs/nfsd/nfs4idmap.c               |  8 ++++----
 include/linux/sunrpc/cache.h      | 15 ++++++++-------
 net/sunrpc/auth_gss/svcauth_gss.c | 12 ++++++------
 net/sunrpc/svcauth_unix.c         | 12 ++++++------
 5 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 668c7527b17e..6da74aebe1fb 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -123,11 +123,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* OK, we seem to have a valid key */
 	key.h.flags = 0;
-	key.h.expiry_time = get_expiry(&mesg);
-	if (key.h.expiry_time == 0)
+	err = get_expiry(&mesg, &key.h.expiry_time);
+	if (err)
 		goto out;
 
-	key.ek_client = dom;	
+	key.ek_client = dom;
 	key.ek_fsidtype = fsidtype;
 	memcpy(key.ek_fsid, buf, len);
 
@@ -610,9 +610,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_devid_map = NULL;
 
 	/* expiry */
-	err = -EINVAL;
-	exp.h.expiry_time = get_expiry(&mesg);
-	if (exp.h.expiry_time == 0)
+	err = get_expiry(&mesg, &exp.h.expiry_time);
+	if (err)
 		goto out3;
 
 	/* flags */
@@ -624,7 +623,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (err || an_int < 0)
 			goto out3;
 		exp.ex_flags= an_int;
-	
+
 		/* anon uid */
 		err = get_int(&mesg, &an_int);
 		if (err)
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5e9809aff37e..7a806ac13e31 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -240,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 		goto out;
 
 	/* expiry */
-	ent.h.expiry_time = get_expiry(&buf);
-	if (ent.h.expiry_time == 0)
+	error = get_expiry(&buf, &ent.h.expiry_time);
+	if (error)
 		goto out;
 
 	error = -ENOMEM;
@@ -408,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 	memcpy(ent.name, buf1, sizeof(ent.name));
 
 	/* expiry */
-	ent.h.expiry_time = get_expiry(&buf);
-	if (ent.h.expiry_time == 0)
+	error = get_expiry(&buf, &ent.h.expiry_time);
+	if (error)
 		goto out;
 
 	/* ID */
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index ec5a555df96f..518bd28f5ab8 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -300,17 +300,18 @@ static inline int get_time(char **bpp, time64_t *time)
 	return 0;
 }
 
-static inline time64_t get_expiry(char **bpp)
+static inline int get_expiry(char **bpp, time64_t *rvp)
 {
-	time64_t rv;
+	int error;
 	struct timespec64 boot;
 
-	if (get_time(bpp, &rv))
-		return 0;
-	if (rv < 0)
-		return 0;
+	error = get_time(bpp, rvp);
+	if (error)
+		return error;
+
 	getboottime64(&boot);
-	return rv - boot.tv_sec;
+	(*rvp) -= boot.tv_sec;
+	return 0;
 }
 
 #endif /*  _LINUX_SUNRPC_CACHE_H_ */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 9c843974bb48..c4a566737085 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -257,11 +257,11 @@ static int rsi_parse(struct cache_detail *cd,
 
 	rsii.h.flags = 0;
 	/* expiry */
-	expiry = get_expiry(&mesg);
-	status = -EINVAL;
-	if (expiry == 0)
+	status = get_expiry(&mesg, &expiry);
+	if (status)
 		goto out;
 
+	status = -EINVAL;
 	/* major/minor */
 	len = qword_get(&mesg, buf, mlen);
 	if (len <= 0)
@@ -483,11 +483,11 @@ static int rsc_parse(struct cache_detail *cd,
 
 	rsci.h.flags = 0;
 	/* expiry */
-	expiry = get_expiry(&mesg);
-	status = -EINVAL;
-	if (expiry == 0)
+	status = get_expiry(&mesg, &expiry);
+	if (status)
 		goto out;
 
+	status = -EINVAL;
 	rscp = rsc_lookup(cd, &rsci);
 	if (!rscp)
 		goto out;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 4246363cb095..4485088ce27b 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -225,9 +225,9 @@ static int ip_map_parse(struct cache_detail *cd,
 		return -EINVAL;
 	}
 
-	expiry = get_expiry(&mesg);
-	if (expiry ==0)
-		return -EINVAL;
+	err = get_expiry(&mesg, &expiry);
+	if (err)
+		return err;
 
 	/* domainname, or empty for NEGATIVE */
 	len = qword_get(&mesg, buf, mlen);
@@ -506,9 +506,9 @@ static int unix_gid_parse(struct cache_detail *cd,
 	uid = make_kuid(current_user_ns(), id);
 	ug.uid = uid;
 
-	expiry = get_expiry(&mesg);
-	if (expiry == 0)
-		return -EINVAL;
+	err = get_expiry(&mesg, &expiry);
+	if (err)
+		return err;
 
 	rv = get_int(&mesg, &gids);
 	if (rv || gids < 0 || gids > 8192)
-- 
cgit v1.2.3


From 0f5162480bd25bd97b91c9153db7afbd89698804 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Mar 2023 17:09:20 -0400
Subject: NFSD: Watch for rq_pages bounds checking errors in
 nfsd_splice_actor()

There have been several bugs over the years where the NFSD splice
actor has attempted to write outside the rq_pages array.

This is a "should never happen" condition, but if for some reason
the pipe splice actor should attempt to walk past the end of
rq_pages, it needs to terminate the READ operation to prevent
corruption of the pointer addresses in the fields just beyond the
array.

A server crash is thus prevented. Since the code is not behaving,
the READ operation returns -EIO to the client. None of the READ
payload data can be trusted if the splice actor isn't operating as
expected.

Suggested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/vfs.c                 |  6 +++++-
 include/linux/sunrpc/svc.h    |  2 +-
 include/trace/events/sunrpc.h | 25 +++++++++++++++++++++++++
 net/sunrpc/svc.c              | 15 ++++++++++++++-
 4 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5783209f17fc..10aa68ca82ef 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -930,6 +930,9 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
  * Grab and keep cached pages associated with a file in the svc_rqst
  * so that they can be passed to the network sendmsg/sendpage routines
  * directly. They will be released after the sending has completed.
+ *
+ * Return values: Number of bytes consumed, or -EIO if there are no
+ * remaining pages in rqstp->rq_pages.
  */
 static int
 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
@@ -948,7 +951,8 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		 */
 		if (page == *(rqstp->rq_next_page - 1))
 			continue;
-		svc_rqst_replace_page(rqstp, page);
+		if (unlikely(!svc_rqst_replace_page(rqstp, page)))
+			return -EIO;
 	}
 	if (rqstp->rq_res.page_len == 0)	// first call
 		rqstp->rq_res.page_base = offset % PAGE_SIZE;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 877891536c2f..f5af055280ff 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -422,7 +422,7 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int,
 			    int (*threadfn)(void *data));
 struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
-void		   svc_rqst_replace_page(struct svc_rqst *rqstp,
+bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
 void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 3ca54536f8f7..5a3bb42e1f50 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1790,6 +1790,31 @@ DEFINE_EVENT(svc_rqst_status, svc_send,
 	TP_PROTO(const struct svc_rqst *rqst, int status),
 	TP_ARGS(rqst, status));
 
+TRACE_EVENT(svc_replace_page_err,
+	TP_PROTO(const struct svc_rqst *rqst),
+
+	TP_ARGS(rqst),
+	TP_STRUCT__entry(
+		SVC_RQST_ENDPOINT_FIELDS(rqst)
+
+		__field(const void *, begin)
+		__field(const void *, respages)
+		__field(const void *, nextpage)
+	),
+
+	TP_fast_assign(
+		SVC_RQST_ENDPOINT_ASSIGNMENTS(rqst);
+
+		__entry->begin = rqst->rq_pages;
+		__entry->respages = rqst->rq_respages;
+		__entry->nextpage = rqst->rq_next_page;
+	),
+
+	TP_printk(SVC_RQST_ENDPOINT_FORMAT " begin=%p respages=%p nextpage=%p",
+		SVC_RQST_ENDPOINT_VARARGS,
+		__entry->begin, __entry->respages, __entry->nextpage)
+);
+
 TRACE_EVENT(svc_stats_latency,
 	TP_PROTO(
 		const struct svc_rqst *rqst
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index fea7ce8fba14..633aa1eb476b 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -842,9 +842,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads);
  *
  * When replacing a page in rq_pages, batch the release of the
  * replaced pages to avoid hammering the page allocator.
+ *
+ * Return values:
+ *   %true: page replaced
+ *   %false: array bounds checking failed
  */
-void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
+bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
 {
+	struct page **begin = rqstp->rq_pages;
+	struct page **end = &rqstp->rq_pages[RPCSVC_MAXPAGES];
+
+	if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) {
+		trace_svc_replace_page_err(rqstp);
+		return false;
+	}
+
 	if (*rqstp->rq_next_page) {
 		if (!pagevec_space(&rqstp->rq_pvec))
 			__pagevec_release(&rqstp->rq_pvec);
@@ -853,6 +865,7 @@ void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
 
 	get_page(page);
 	*(rqstp->rq_next_page++) = page;
+	return true;
 }
 EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
 
-- 
cgit v1.2.3


From 55fcc7d9159de886296626e47db2c81f8578c7e1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 3 Apr 2023 13:53:07 -0400
Subject: SUNRPC: Ignore return value of ->xpo_sendto

Clean up: All callers of svc_process() ignore its return value, so
svc_process() can safely be converted to return void. Ditto for
svc_send().

The return value of ->xpo_sendto() is now used only as part of a
trace event.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h     |  2 +-
 include/linux/sunrpc/svcsock.h |  2 +-
 net/sunrpc/svc.c               | 13 +++++++------
 net/sunrpc/svc_xprt.c          | 21 +++++++++------------
 4 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index f5af055280ff..2d31121fc2e6 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -430,7 +430,7 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
-int		   svc_process(struct svc_rqst *);
+void		   svc_process(struct svc_rqst *rqstp);
 int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
 			struct svc_rqst *);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index bcc555c7ae9c..dd73fa174af5 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -56,7 +56,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
  */
 void		svc_close_net(struct svc_serv *, struct net *);
 int		svc_recv(struct svc_rqst *, long);
-int		svc_send(struct svc_rqst *);
+void		svc_send(struct svc_rqst *rqstp);
 void		svc_drop(struct svc_rqst *);
 void		svc_sock_update_bufs(struct svc_serv *serv);
 bool		svc_alien_sock(struct net *net, int fd);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 633aa1eb476b..0aa8892fad63 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1444,11 +1444,12 @@ err_system_err:
 	goto sendit;
 }
 
-/*
- * Process the RPC request.
+/**
+ * svc_process - Execute one RPC transaction
+ * @rqstp: RPC transaction context
+ *
  */
-int
-svc_process(struct svc_rqst *rqstp)
+void svc_process(struct svc_rqst *rqstp)
 {
 	struct kvec		*resv = &rqstp->rq_res.head[0];
 	__be32 *p;
@@ -1484,7 +1485,8 @@ svc_process(struct svc_rqst *rqstp)
 
 	if (!svc_process_common(rqstp))
 		goto out_drop;
-	return svc_send(rqstp);
+	svc_send(rqstp);
+	return;
 
 out_baddir:
 	svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
@@ -1492,7 +1494,6 @@ out_baddir:
 	rqstp->rq_server->sv_stats->rpcbadfmt++;
 out_drop:
 	svc_drop(rqstp);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(svc_process);
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ba629297da4e..36c79b718323 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -909,18 +909,20 @@ void svc_drop(struct svc_rqst *rqstp)
 }
 EXPORT_SYMBOL_GPL(svc_drop);
 
-/*
- * Return reply to client.
+/**
+ * svc_send - Return reply to client
+ * @rqstp: RPC transaction context
+ *
  */
-int svc_send(struct svc_rqst *rqstp)
+void svc_send(struct svc_rqst *rqstp)
 {
 	struct svc_xprt	*xprt;
-	int		len = -EFAULT;
 	struct xdr_buf	*xb;
+	int status;
 
 	xprt = rqstp->rq_xprt;
 	if (!xprt)
-		goto out;
+		return;
 
 	/* calculate over-all length */
 	xb = &rqstp->rq_res;
@@ -930,15 +932,10 @@ int svc_send(struct svc_rqst *rqstp)
 	trace_svc_xdr_sendto(rqstp->rq_xid, xb);
 	trace_svc_stats_latency(rqstp);
 
-	len = xprt->xpt_ops->xpo_sendto(rqstp);
+	status = xprt->xpt_ops->xpo_sendto(rqstp);
 
-	trace_svc_send(rqstp, len);
+	trace_svc_send(rqstp, status);
 	svc_xprt_release(rqstp);
-
-	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
-		len = 0;
-out:
-	return len;
 }
 
 /*
-- 
cgit v1.2.3


From b20cb39def085723868972182fb58fa906839a4f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Apr 2023 20:17:56 -0400
Subject: SUNRPC: Relocate svc_free_res_pages()

Clean-up: There doesn't seem to be a reason why this function is
stuck in a header. One thing it prevents is the convenient addition
of tracing. Moving it to a source file also makes the rq_respages
clean-up logic easier to find.

Reviewed-by: Calum Mackay <calum.mackay@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 12 +-----------
 net/sunrpc/svc.c           | 19 +++++++++++++++++++
 net/sunrpc/svc_xprt.c      |  2 +-
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 2d31121fc2e6..762d7231e574 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -309,17 +309,6 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst)
 	return (struct sockaddr *) &rqst->rq_daddr;
 }
 
-static inline void svc_free_res_pages(struct svc_rqst *rqstp)
-{
-	while (rqstp->rq_next_page != rqstp->rq_respages) {
-		struct page **pp = --rqstp->rq_next_page;
-		if (*pp) {
-			put_page(*pp);
-			*pp = NULL;
-		}
-	}
-}
-
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
@@ -424,6 +413,7 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
 					struct svc_pool *pool, int node);
 bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
+void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
 void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 0aa8892fad63..0fc70cc405b2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -869,6 +869,25 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
 }
 EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
 
+/**
+ * svc_rqst_release_pages - Release Reply buffer pages
+ * @rqstp: RPC transaction context
+ *
+ * Release response pages that might still be in flight after
+ * svc_send, and any spliced filesystem-owned pages.
+ */
+void svc_rqst_release_pages(struct svc_rqst *rqstp)
+{
+	while (rqstp->rq_next_page != rqstp->rq_respages) {
+		struct page **pp = --rqstp->rq_next_page;
+
+		if (*pp) {
+			put_page(*pp);
+			*pp = NULL;
+		}
+	}
+}
+
 /*
  * Called from a server thread as it's exiting. Caller must hold the "service
  * mutex" for the service.
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 36c79b718323..533e08c4f319 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -542,7 +542,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
 	rqstp->rq_deferred = NULL;
 
 	pagevec_release(&rqstp->rq_pvec);
-	svc_free_res_pages(rqstp);
+	svc_rqst_release_pages(rqstp);
 	rqstp->rq_res.page_len = 0;
 	rqstp->rq_res.page_base = 0;
 
-- 
cgit v1.2.3


From e0f8ad2a705367518b5c56bf9d6da89681467c02 Mon Sep 17 00:00:00 2001
From: Shengyu Qu <wiagn233@outlook.com>
Date: Fri, 21 Apr 2023 23:08:15 +0800
Subject: mfd: axp20x: Add support for AXP15060 PMIC

The AXP15060 is a PMIC chip produced by X-Powers, and could be connected
via an I2C bus.

Describe the regmap and the MFD bits, along with the registers exposed
via I2C. Eventually advertise the device using a new compatible string
and add support for power off the system.

The driver would disable PEK function if IRQ is not configured in device
tree, since some boards (For example, Starfive Visionfive 2) didn't
connect IRQ line of PMIC to SOC.

GPIO function isn't enabled in this commit, since its configuration
operation is different from any existing AXP PMICs and needs
logic modification on existing driver. GPIO support might come in later
patches.

Signed-off-by: Shengyu Qu <wiagn233@outlook.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/TY3P286MB261162D57695AC8164ED50E298609@TY3P286MB2611.JPNP286.PROD.OUTLOOK.COM
---
 drivers/mfd/axp20x-i2c.c   |   2 +
 drivers/mfd/axp20x.c       | 107 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h |  85 +++++++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/axp20x-i2c.c b/drivers/mfd/axp20x-i2c.c
index f49fbd307958..b4f5cb457117 100644
--- a/drivers/mfd/axp20x-i2c.c
+++ b/drivers/mfd/axp20x-i2c.c
@@ -65,6 +65,7 @@ static const struct of_device_id axp20x_i2c_of_match[] = {
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
 	{ .compatible = "x-powers,axp803", .data = (void *)AXP803_ID },
 	{ .compatible = "x-powers,axp806", .data = (void *)AXP806_ID },
+	{ .compatible = "x-powers,axp15060", .data = (void *)AXP15060_ID },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, axp20x_i2c_of_match);
@@ -78,6 +79,7 @@ static const struct i2c_device_id axp20x_i2c_id[] = {
 	{ "axp223", 0 },
 	{ "axp803", 0 },
 	{ "axp806", 0 },
+	{ "axp15060", 0 },
 	{ },
 };
 MODULE_DEVICE_TABLE(i2c, axp20x_i2c_id);
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index 7720ac15c7d4..72b87aae60cc 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -43,6 +43,7 @@ static const char * const axp20x_model_names[] = {
 	"AXP806",
 	"AXP809",
 	"AXP813",
+	"AXP15060",
 };
 
 static const struct regmap_range axp152_writeable_ranges[] = {
@@ -169,6 +170,31 @@ static const struct regmap_access_table axp806_volatile_table = {
 	.n_yes_ranges	= ARRAY_SIZE(axp806_volatile_ranges),
 };
 
+static const struct regmap_range axp15060_writeable_ranges[] = {
+	regmap_reg_range(AXP15060_PWR_OUT_CTRL1, AXP15060_DCDC_MODE_CTRL2),
+	regmap_reg_range(AXP15060_OUTPUT_MONITOR_DISCHARGE, AXP15060_CPUSLDO_V_CTRL),
+	regmap_reg_range(AXP15060_PWR_WAKEUP_CTRL, AXP15060_PWR_DISABLE_DOWN_SEQ),
+	regmap_reg_range(AXP15060_PEK_KEY, AXP15060_PEK_KEY),
+	regmap_reg_range(AXP15060_IRQ1_EN, AXP15060_IRQ2_EN),
+	regmap_reg_range(AXP15060_IRQ1_STATE, AXP15060_IRQ2_STATE),
+};
+
+static const struct regmap_range axp15060_volatile_ranges[] = {
+	regmap_reg_range(AXP15060_STARTUP_SRC, AXP15060_STARTUP_SRC),
+	regmap_reg_range(AXP15060_PWR_WAKEUP_CTRL, AXP15060_PWR_DISABLE_DOWN_SEQ),
+	regmap_reg_range(AXP15060_IRQ1_STATE, AXP15060_IRQ2_STATE),
+};
+
+static const struct regmap_access_table axp15060_writeable_table = {
+	.yes_ranges	= axp15060_writeable_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp15060_writeable_ranges),
+};
+
+static const struct regmap_access_table axp15060_volatile_table = {
+	.yes_ranges	= axp15060_volatile_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp15060_volatile_ranges),
+};
+
 static const struct resource axp152_pek_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
 	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
@@ -237,6 +263,11 @@ static const struct resource axp809_pek_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP809_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
 };
 
+static const struct resource axp15060_pek_resources[] = {
+	DEFINE_RES_IRQ_NAMED(AXP15060_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
+	DEFINE_RES_IRQ_NAMED(AXP15060_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
+};
+
 static const struct regmap_config axp152_regmap_config = {
 	.reg_bits	= 8,
 	.val_bits	= 8,
@@ -282,6 +313,15 @@ static const struct regmap_config axp806_regmap_config = {
 	.cache_type	= REGCACHE_RBTREE,
 };
 
+static const struct regmap_config axp15060_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.wr_table	= &axp15060_writeable_table,
+	.volatile_table	= &axp15060_volatile_table,
+	.max_register	= AXP15060_IRQ2_STATE,
+	.cache_type	= REGCACHE_RBTREE,
+};
+
 #define INIT_REGMAP_IRQ(_variant, _irq, _off, _mask)			\
 	[_variant##_IRQ_##_irq] = { .reg_offset = (_off), .mask = BIT(_mask) }
 
@@ -503,6 +543,23 @@ static const struct regmap_irq axp809_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP809, GPIO0_INPUT,		4, 0),
 };
 
+static const struct regmap_irq axp15060_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP15060, DIE_TEMP_HIGH_LV1,	0, 0),
+	INIT_REGMAP_IRQ(AXP15060, DIE_TEMP_HIGH_LV2,	0, 1),
+	INIT_REGMAP_IRQ(AXP15060, DCDC1_V_LOW,		0, 2),
+	INIT_REGMAP_IRQ(AXP15060, DCDC2_V_LOW,		0, 3),
+	INIT_REGMAP_IRQ(AXP15060, DCDC3_V_LOW,		0, 4),
+	INIT_REGMAP_IRQ(AXP15060, DCDC4_V_LOW,		0, 5),
+	INIT_REGMAP_IRQ(AXP15060, DCDC5_V_LOW,		0, 6),
+	INIT_REGMAP_IRQ(AXP15060, DCDC6_V_LOW,		0, 7),
+	INIT_REGMAP_IRQ(AXP15060, PEK_LONG,			1, 0),
+	INIT_REGMAP_IRQ(AXP15060, PEK_SHORT,			1, 1),
+	INIT_REGMAP_IRQ(AXP15060, GPIO1_INPUT,		1, 2),
+	INIT_REGMAP_IRQ(AXP15060, PEK_FAL_EDGE,			1, 3),
+	INIT_REGMAP_IRQ(AXP15060, PEK_RIS_EDGE,			1, 4),
+	INIT_REGMAP_IRQ(AXP15060, GPIO2_INPUT,		1, 5),
+};
+
 static const struct regmap_irq_chip axp152_regmap_irq_chip = {
 	.name			= "axp152_irq_chip",
 	.status_base		= AXP152_IRQ1_STATE,
@@ -582,6 +639,17 @@ static const struct regmap_irq_chip axp809_regmap_irq_chip = {
 	.num_regs		= 5,
 };
 
+static const struct regmap_irq_chip axp15060_regmap_irq_chip = {
+	.name			= "axp15060",
+	.status_base		= AXP15060_IRQ1_STATE,
+	.ack_base		= AXP15060_IRQ1_STATE,
+	.unmask_base		= AXP15060_IRQ1_EN,
+	.init_ack_masked	= true,
+	.irqs			= axp15060_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp15060_regmap_irqs),
+	.num_regs		= 2,
+};
+
 static const struct mfd_cell axp20x_cells[] = {
 	{
 		.name		= "axp20x-gpio",
@@ -826,6 +894,23 @@ static const struct mfd_cell axp813_cells[] = {
 	},
 };
 
+static const struct mfd_cell axp15060_cells[] = {
+	{
+		.name		= "axp221-pek",
+		.num_resources	= ARRAY_SIZE(axp15060_pek_resources),
+		.resources	= axp15060_pek_resources,
+	}, {
+		.name		= "axp20x-regulator",
+	},
+};
+
+/* For boards that don't have IRQ line connected to SOC. */
+static const struct mfd_cell axp_regulator_only_cells[] = {
+	{
+		.name		= "axp20x-regulator",
+	},
+};
+
 static int axp20x_power_off(struct sys_off_data *data)
 {
 	struct axp20x_dev *axp20x = data->cb_data;
@@ -935,6 +1020,28 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		 */
 		axp20x->regmap_irq_chip = &axp803_regmap_irq_chip;
 		break;
+	case AXP15060_ID:
+		/*
+		 * Don't register the power key part if there is no interrupt
+		 * line.
+		 *
+		 * Since most use cases of AXP PMICs are Allwinner SOCs, board
+		 * designers follow Allwinner's reference design and connects
+		 * IRQ line to SOC, there's no need for those variants to deal
+		 * with cases that IRQ isn't connected. However, AXP15660 is
+		 * used by some other vendors' SOCs that didn't connect IRQ
+		 * line, we need to deal with this case.
+		 */
+		if (axp20x->irq > 0) {
+			axp20x->nr_cells = ARRAY_SIZE(axp15060_cells);
+			axp20x->cells = axp15060_cells;
+		} else {
+			axp20x->nr_cells = ARRAY_SIZE(axp_regulator_only_cells);
+			axp20x->cells = axp_regulator_only_cells;
+		}
+		axp20x->regmap_cfg = &axp15060_regmap_config;
+		axp20x->regmap_irq_chip = &axp15060_regmap_irq_chip;
+		break;
 	default:
 		dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant);
 		return -EINVAL;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 2058194807bd..beb3f44f85c5 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -21,6 +21,7 @@ enum axp20x_variants {
 	AXP806_ID,
 	AXP809_ID,
 	AXP813_ID,
+	AXP15060_ID,
 	NR_AXP20X_VARIANTS,
 };
 
@@ -131,6 +132,39 @@ enum axp20x_variants {
 /* Other DCDC regulator control registers are the same as AXP803 */
 #define AXP813_DCDC7_V_OUT		0x26
 
+#define AXP15060_STARTUP_SRC		0x00
+#define AXP15060_PWR_OUT_CTRL1		0x10
+#define AXP15060_PWR_OUT_CTRL2		0x11
+#define AXP15060_PWR_OUT_CTRL3		0x12
+#define AXP15060_DCDC1_V_CTRL		0x13
+#define AXP15060_DCDC2_V_CTRL		0x14
+#define AXP15060_DCDC3_V_CTRL		0x15
+#define AXP15060_DCDC4_V_CTRL		0x16
+#define AXP15060_DCDC5_V_CTRL		0x17
+#define AXP15060_DCDC6_V_CTRL		0x18
+#define AXP15060_ALDO1_V_CTRL		0x19
+#define AXP15060_DCDC_MODE_CTRL1		0x1a
+#define AXP15060_DCDC_MODE_CTRL2		0x1b
+#define AXP15060_OUTPUT_MONITOR_DISCHARGE		0x1e
+#define AXP15060_IRQ_PWROK_VOFF		0x1f
+#define AXP15060_ALDO2_V_CTRL		0x20
+#define AXP15060_ALDO3_V_CTRL		0x21
+#define AXP15060_ALDO4_V_CTRL		0x22
+#define AXP15060_ALDO5_V_CTRL		0x23
+#define AXP15060_BLDO1_V_CTRL		0x24
+#define AXP15060_BLDO2_V_CTRL		0x25
+#define AXP15060_BLDO3_V_CTRL		0x26
+#define AXP15060_BLDO4_V_CTRL		0x27
+#define AXP15060_BLDO5_V_CTRL		0x28
+#define AXP15060_CLDO1_V_CTRL		0x29
+#define AXP15060_CLDO2_V_CTRL		0x2a
+#define AXP15060_CLDO3_V_CTRL		0x2b
+#define AXP15060_CLDO4_V_CTRL		0x2d
+#define AXP15060_CPUSLDO_V_CTRL		0x2e
+#define AXP15060_PWR_WAKEUP_CTRL		0x31
+#define AXP15060_PWR_DISABLE_DOWN_SEQ		0x32
+#define AXP15060_PEK_KEY		0x36
+
 /* Interrupt */
 #define AXP152_IRQ1_EN			0x40
 #define AXP152_IRQ2_EN			0x41
@@ -152,6 +186,11 @@ enum axp20x_variants {
 #define AXP20X_IRQ5_STATE		0x4c
 #define AXP20X_IRQ6_STATE		0x4d
 
+#define AXP15060_IRQ1_EN		0x40
+#define AXP15060_IRQ2_EN		0x41
+#define AXP15060_IRQ1_STATE		0x48
+#define AXP15060_IRQ2_STATE		0x49
+
 /* ADC */
 #define AXP20X_ACIN_V_ADC_H		0x56
 #define AXP20X_ACIN_V_ADC_L		0x57
@@ -222,6 +261,8 @@ enum axp20x_variants {
 #define AXP22X_GPIO_STATE		0x94
 #define AXP22X_GPIO_PULL_DOWN		0x95
 
+#define AXP15060_CLDO4_GPIO2_MODESET		0x2c
+
 /* Battery */
 #define AXP20X_CHRG_CC_31_24		0xb0
 #define AXP20X_CHRG_CC_23_16		0xb1
@@ -419,6 +460,33 @@ enum {
 	AXP813_REG_ID_MAX,
 };
 
+enum {
+	AXP15060_DCDC1 = 0,
+	AXP15060_DCDC2,
+	AXP15060_DCDC3,
+	AXP15060_DCDC4,
+	AXP15060_DCDC5,
+	AXP15060_DCDC6,
+	AXP15060_ALDO1,
+	AXP15060_ALDO2,
+	AXP15060_ALDO3,
+	AXP15060_ALDO4,
+	AXP15060_ALDO5,
+	AXP15060_BLDO1,
+	AXP15060_BLDO2,
+	AXP15060_BLDO3,
+	AXP15060_BLDO4,
+	AXP15060_BLDO5,
+	AXP15060_CLDO1,
+	AXP15060_CLDO2,
+	AXP15060_CLDO3,
+	AXP15060_CLDO4,
+	AXP15060_CPUSLDO,
+	AXP15060_SW,
+	AXP15060_RTC_LDO,
+	AXP15060_REG_ID_MAX,
+};
+
 /* IRQs */
 enum {
 	AXP152_IRQ_LDO0IN_CONNECT = 1,
@@ -637,6 +705,23 @@ enum axp809_irqs {
 	AXP809_IRQ_GPIO0_INPUT,
 };
 
+enum axp15060_irqs {
+	AXP15060_IRQ_DIE_TEMP_HIGH_LV1 = 1,
+	AXP15060_IRQ_DIE_TEMP_HIGH_LV2,
+	AXP15060_IRQ_DCDC1_V_LOW,
+	AXP15060_IRQ_DCDC2_V_LOW,
+	AXP15060_IRQ_DCDC3_V_LOW,
+	AXP15060_IRQ_DCDC4_V_LOW,
+	AXP15060_IRQ_DCDC5_V_LOW,
+	AXP15060_IRQ_DCDC6_V_LOW,
+	AXP15060_IRQ_PEK_LONG,
+	AXP15060_IRQ_PEK_SHORT,
+	AXP15060_IRQ_GPIO1_INPUT,
+	AXP15060_IRQ_PEK_FAL_EDGE,
+	AXP15060_IRQ_PEK_RIS_EDGE,
+	AXP15060_IRQ_GPIO2_INPUT,
+};
+
 struct axp20x_dev {
 	struct device			*dev;
 	int				irq;
-- 
cgit v1.2.3


From ff53cd52d9bdbf4074d2bbe9b591729997780bd3 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 18 Mar 2023 17:36:25 +0000
Subject: blk-integrity: register sysfs attributes on struct device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "integrity" kobject only acted as a holder for static sysfs entries.
It also was embedded into struct gendisk without managing it, violating
assumptions of the driver core.

Instead register the sysfs entries directly onto the struct device.

Also drop the now unused member integrity_kobj from struct gendisk.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20230309-kobj_release-gendisk_integrity-v3-3-ceccb4493c46@weissschuh.net
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-integrity.c  | 55 +++-----------------------------------------------
 block/blk.h            | 10 +--------
 block/genhd.c          | 12 ++++-------
 include/linux/blkdev.h |  3 ---
 4 files changed, 8 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 1cbfdea88c72..d4e9b4556d14 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -212,31 +212,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
 	return true;
 }
 
-static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
-				   char *page)
-{
-	struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
-	struct device *dev = disk_to_dev(disk);
-	struct device_attribute *dev_attr =
-		container_of(attr, struct device_attribute, attr);
-
-	return dev_attr->show(dev, dev_attr, page);
-}
-
-static ssize_t integrity_attr_store(struct kobject *kobj,
-				    struct attribute *attr, const char *page,
-				    size_t count)
-{
-	struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
-	struct device *dev = disk_to_dev(disk);
-	struct device_attribute *dev_attr =
-		container_of(attr, struct device_attribute, attr);
-
-	if (!dev_attr->store)
-		return 0;
-	return dev_attr->store(dev, dev_attr, page, count);
-}
-
 static inline struct blk_integrity *dev_to_bi(struct device *dev)
 {
 	return &dev_to_disk(dev)->queue->integrity;
@@ -345,16 +320,10 @@ static struct attribute *integrity_attrs[] = {
 	&dev_attr_device_is_integrity_capable.attr,
 	NULL
 };
-ATTRIBUTE_GROUPS(integrity);
 
-static const struct sysfs_ops integrity_ops = {
-	.show	= &integrity_attr_show,
-	.store	= &integrity_attr_store,
-};
-
-static const struct kobj_type integrity_ktype = {
-	.default_groups = integrity_groups,
-	.sysfs_ops	= &integrity_ops,
+const struct attribute_group blk_integrity_attr_group = {
+	.name = "integrity",
+	.attrs = integrity_attrs,
 };
 
 static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
@@ -433,21 +402,3 @@ void blk_integrity_unregister(struct gendisk *disk)
 	memset(bi, 0, sizeof(*bi));
 }
 EXPORT_SYMBOL(blk_integrity_unregister);
-
-int blk_integrity_add(struct gendisk *disk)
-{
-	int ret;
-
-	ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
-				   &disk_to_dev(disk)->kobj, "%s", "integrity");
-	if (!ret)
-		kobject_uevent(&disk->integrity_kobj, KOBJ_ADD);
-	return ret;
-}
-
-void blk_integrity_del(struct gendisk *disk)
-{
-	kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE);
-	kobject_del(&disk->integrity_kobj);
-	kobject_put(&disk->integrity_kobj);
-}
diff --git a/block/blk.h b/block/blk.h
index 564119a76bc5..45547bcf1119 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -214,8 +214,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 				bip_next->bip_vec[0].bv_offset);
 }
 
-int blk_integrity_add(struct gendisk *disk);
-void blk_integrity_del(struct gendisk *);
+extern const struct attribute_group blk_integrity_attr_group;
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 static inline bool blk_integrity_merge_rq(struct request_queue *rq,
 		struct request *r1, struct request *r2)
@@ -248,13 +247,6 @@ static inline bool bio_integrity_endio(struct bio *bio)
 static inline void bio_integrity_free(struct bio *bio)
 {
 }
-static inline int blk_integrity_add(struct gendisk *disk)
-{
-	return 0;
-}
-static inline void blk_integrity_del(struct gendisk *disk)
-{
-}
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 unsigned long blk_rq_timeout(unsigned long timeout);
diff --git a/block/genhd.c b/block/genhd.c
index 8d56fb5f08b2..9fa4a7cd978c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -478,15 +478,11 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 	 */
 	pm_runtime_set_memalloc_noio(ddev, true);
 
-	ret = blk_integrity_add(disk);
-	if (ret)
-		goto out_del_block_link;
-
 	disk->part0->bd_holder_dir =
 		kobject_create_and_add("holders", &ddev->kobj);
 	if (!disk->part0->bd_holder_dir) {
 		ret = -ENOMEM;
-		goto out_del_integrity;
+		goto out_del_block_link;
 	}
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 	if (!disk->slave_dir) {
@@ -549,8 +545,6 @@ out_put_slave_dir:
 	disk->slave_dir = NULL;
 out_put_holder_dir:
 	kobject_put(disk->part0->bd_holder_dir);
-out_del_integrity:
-	blk_integrity_del(disk);
 out_del_block_link:
 	if (!sysfs_deprecated)
 		sysfs_remove_link(block_depr, dev_name(ddev));
@@ -613,7 +607,6 @@ void del_gendisk(struct gendisk *disk)
 	if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
 		return;
 
-	blk_integrity_del(disk);
 	disk_del_events(disk);
 
 	mutex_lock(&disk->open_mutex);
@@ -1150,6 +1143,9 @@ static const struct attribute_group *disk_attr_groups[] = {
 	&disk_attr_group,
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	&blk_trace_attr_group,
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	&blk_integrity_attr_group,
 #endif
 	NULL
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6ede578dfbc6..aac5c8d7a9ff 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -157,9 +157,6 @@ struct gendisk {
 	struct timer_rand_state *random;
 	atomic_t sync_io;		/* RAID */
 	struct disk_events *ev;
-#ifdef  CONFIG_BLK_DEV_INTEGRITY
-	struct kobject integrity_kobj;
-#endif	/* CONFIG_BLK_DEV_INTEGRITY */
 
 #ifdef CONFIG_BLK_DEV_ZONED
 	/*
-- 
cgit v1.2.3


From 7cefbaf081eb7c114299f7478ed4fa0d90ec90bb Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 19 Apr 2023 10:33:38 +0200
Subject: thermal: core: Encapsulate tz->device field

There are still some drivers needing to play with the thermal zone
device internals. That is not the best but until we can figure out if
the information is really needed, let's encapsulate the field used in
the thermal zone device structure, so we can move forward relocating
the thermal zone device structure definition in the thermal framework
private headers.

Some drivers are accessing tz->device, that implies they need to have
the knowledge of the thermal_zone_device structure but we want to
self-encapsulate this structure and reduce the scope of the structure
to the thermal core only.

By adding this wrapper, these drivers won't need the thermal zone
device structure definition and are no longer an obstacle to its
relocation to the private thermal core headers.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 6 ++++++
 include/linux/thermal.h        | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index c5025aca22ee..842f678c1c3e 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1398,6 +1398,12 @@ int thermal_zone_device_id(struct thermal_zone_device *tzd)
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_id);
 
+struct device *thermal_zone_device(struct thermal_zone_device *tzd)
+{
+	return &tzd->device;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device);
+
 /**
  * thermal_zone_device_unregister - removes the registered thermal zone device
  * @tz: the thermal zone device to remove
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 82ddb32f9876..87837094d549 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -313,6 +313,7 @@ thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
 const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 int thermal_zone_device_id(struct thermal_zone_device *tzd);
+struct device *thermal_zone_device(struct thermal_zone_device *tzd);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
-- 
cgit v1.2.3


From 4f20b7471c57032860065591a17efd3325216bde Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 21 Apr 2023 16:54:24 +0200
Subject: libgcc: add forward declarations for generic library routines

With W=1 on platforms that use the generic gcc library routines
(csky/loongarch/mips/riscv/sh/xtensa):

    lib/ashldi3.c:9:19: warning: no previous prototype for '__ashldi3' [-Wmissing-prototypes]
	9 | long long notrace __ashldi3(long long u, word_type b)
	  |                   ^~~~~~~~~
      CC      lib/ashrdi3.o
    lib/ashrdi3.c:9:19: warning: no previous prototype for '__ashrdi3' [-Wmissing-prototypes]
	9 | long long notrace __ashrdi3(long long u, word_type b)
	  |                   ^~~~~~~~~
      CC      lib/cmpdi2.o
    lib/cmpdi2.c:9:19: warning: no previous prototype for '__cmpdi2' [-Wmissing-prototypes]
	9 | word_type notrace __cmpdi2(long long a, long long b)
	  |                   ^~~~~~~~
      CC      lib/lshrdi3.o
    lib/lshrdi3.c:9:19: warning: no previous prototype for '__lshrdi3' [-Wmissing-prototypes]
	9 | long long notrace __lshrdi3(long long u, word_type b)
	  |                   ^~~~~~~~~
      CC      lib/muldi3.o
    lib/muldi3.c:49:19: warning: no previous prototype for '__muldi3' [-Wmissing-prototypes]
       49 | long long notrace __muldi3(long long u, long long v)
	  |                   ^~~~~~~~
      CC      lib/ucmpdi2.o
    lib/ucmpdi2.c:8:19: warning: no previous prototype for '__ucmpdi2' [-Wmissing-prototypes]
	8 | word_type notrace __ucmpdi2(unsigned long long a, unsigned long long b)
	  |                   ^~~~~~~~~

Fix this by adding forward declarations to the common libgcc header
file.

Link: https://lkml.kernel.org/r/5cdbe08296693dd53849f199c3933e16e97b33c1.1682088593.git.geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reported-by: kernel test robot <lkp@intel.com>
  Link: https://lore.kernel.org/oe-kbuild-all/202303272214.RxzpA6bP-lkp@intel.com/
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/libgcc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/libgcc.h b/include/linux/libgcc.h
index b8dc75f0c830..fc388da6a027 100644
--- a/include/linux/libgcc.h
+++ b/include/linux/libgcc.h
@@ -27,4 +27,11 @@ typedef union {
 	long long ll;
 } DWunion;
 
+long long notrace __ashldi3(long long u, word_type b);
+long long notrace __ashrdi3(long long u, word_type b);
+word_type notrace __cmpdi2(long long a, long long b);
+long long notrace __lshrdi3(long long u, word_type b);
+long long notrace __muldi3(long long u, long long v);
+word_type notrace __ucmpdi2(unsigned long long a, unsigned long long b);
+
 #endif /* __ASM_LIBGCC_H */
-- 
cgit v1.2.3


From 5e052dda121e2870dd87181783da4a95d7d2927b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 17 Apr 2023 09:42:14 -0400
Subject: SUNRPC: Recognize control messages in server-side TCP socket code

To support kTLS, the server-side TCP socket receive path needs to
watch for CMSGs.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/net/tls.h    |  2 ++
 net/sunrpc/svcsock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 154949c7b0c8..6056ce5a2aa5 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -69,6 +69,8 @@ extern const struct tls_cipher_size_desc tls_cipher_size_desc[];
 
 #define TLS_CRYPTO_INFO_READY(info)	((info)->cipher_type)
 
+#define TLS_RECORD_TYPE_ALERT		0x15
+#define TLS_RECORD_TYPE_HANDSHAKE	0x16
 #define TLS_RECORD_TYPE_DATA		0x17
 
 #define TLS_AAD_SPACE_SIZE		13
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 302a14dd7882..c5b74f523fc4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -43,6 +43,7 @@
 #include <net/udp.h>
 #include <net/tcp.h>
 #include <net/tcp_states.h>
+#include <net/tls.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
 #include <asm/ioctls.h>
@@ -216,6 +217,49 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
 	return len;
 }
 
+static int
+svc_tcp_sock_process_cmsg(struct svc_sock *svsk, struct msghdr *msg,
+			  struct cmsghdr *cmsg, int ret)
+{
+	if (cmsg->cmsg_level == SOL_TLS &&
+	    cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+		u8 content_type = *((u8 *)CMSG_DATA(cmsg));
+
+		switch (content_type) {
+		case TLS_RECORD_TYPE_DATA:
+			/* TLS sets EOR at the end of each application data
+			 * record, even though there might be more frames
+			 * waiting to be decrypted.
+			 */
+			msg->msg_flags &= ~MSG_EOR;
+			break;
+		case TLS_RECORD_TYPE_ALERT:
+			ret = -ENOTCONN;
+			break;
+		default:
+			ret = -EAGAIN;
+		}
+	}
+	return ret;
+}
+
+static int
+svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg)
+{
+	union {
+		struct cmsghdr	cmsg;
+		u8		buf[CMSG_SPACE(sizeof(u8))];
+	} u;
+	int ret;
+
+	msg->msg_control = &u;
+	msg->msg_controllen = sizeof(u);
+	ret = sock_recvmsg(svsk->sk_sock, msg, MSG_DONTWAIT);
+	if (unlikely(msg->msg_controllen != sizeof(u)))
+		ret = svc_tcp_sock_process_cmsg(svsk, msg, &u.cmsg, ret);
+	return ret;
+}
+
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek)
 {
@@ -263,7 +307,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
 		iov_iter_advance(&msg.msg_iter, seek);
 		buflen -= seek;
 	}
-	len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
+	len = svc_tcp_sock_recv_cmsg(svsk, &msg);
 	if (len > 0)
 		svc_flush_bvec(bvec, len, seek);
 
@@ -877,7 +921,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
 		iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
 		iov.iov_len  = want;
 		iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
-		len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
+		len = svc_tcp_sock_recv_cmsg(svsk, &msg);
 		if (len < 0)
 			return len;
 		svsk->sk_tcplen += len;
-- 
cgit v1.2.3


From b3cbf98e2fdf3cb147a95161560cd25987284330 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Apr 2023 13:56:24 -0400
Subject: SUNRPC: Support TLS handshake in the server-side TCP socket code

This patch adds opportunitistic RPC-with-TLS to the Linux in-kernel
NFS server. If the client requests RPC-with-TLS and the user space
handshake agent is running, the server will set up a TLS session.

There are no policy settings yet. For example, the server cannot
yet require the use of RPC-with-TLS to access its data.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_xprt.h |   5 +-
 include/linux/sunrpc/svcsock.h  |   2 +
 include/trace/events/sunrpc.h   |  16 ++++++-
 net/sunrpc/svc_xprt.c           |   5 +-
 net/sunrpc/svcauth_unix.c       |  11 ++++-
 net/sunrpc/svcsock.c            | 101 ++++++++++++++++++++++++++++++++++++++--
 6 files changed, 132 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 775368802762..867479204840 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -27,7 +27,7 @@ struct svc_xprt_ops {
 	void		(*xpo_detach)(struct svc_xprt *);
 	void		(*xpo_free)(struct svc_xprt *);
 	void		(*xpo_kill_temp_xprt)(struct svc_xprt *);
-	void		(*xpo_start_tls)(struct svc_xprt *);
+	void		(*xpo_handshake)(struct svc_xprt *xprt);
 };
 
 struct svc_xprt_class {
@@ -70,6 +70,9 @@ struct svc_xprt {
 #define XPT_LOCAL	12		/* connection from loopback interface */
 #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
 #define XPT_CONG_CTRL	14		/* has congestion control */
+#define XPT_HANDSHAKE	15		/* xprt requests a handshake */
+#define XPT_TLS_SESSION	16		/* transport-layer security established */
+#define XPT_PEER_AUTH	17		/* peer has been authenticated */
 
 	struct svc_serv		*xpt_server;	/* service for transport */
 	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index dd73fa174af5..d16ae621782c 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -38,6 +38,8 @@ struct svc_sock {
 	/* Number of queued send requests */
 	atomic_t		sk_sendqlen;
 
+	struct completion	sk_handshake_done;
+
 	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
 };
 
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 5a3bb42e1f50..31bc7025cb44 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1857,7 +1857,10 @@ TRACE_EVENT(svc_stats_latency,
 		{ BIT(XPT_CACHE_AUTH),		"CACHE_AUTH" },		\
 		{ BIT(XPT_LOCAL),		"LOCAL" },		\
 		{ BIT(XPT_KILL_TEMP),		"KILL_TEMP" },		\
-		{ BIT(XPT_CONG_CTRL),		"CONG_CTRL" })
+		{ BIT(XPT_CONG_CTRL),		"CONG_CTRL" },		\
+		{ BIT(XPT_HANDSHAKE),		"HANDSHAKE" },		\
+		{ BIT(XPT_TLS_SESSION),		"TLS_SESSION" },	\
+		{ BIT(XPT_PEER_AUTH),		"PEER_AUTH" })
 
 TRACE_EVENT(svc_xprt_create_err,
 	TP_PROTO(
@@ -1990,6 +1993,17 @@ DEFINE_SVC_XPRT_EVENT(close);
 DEFINE_SVC_XPRT_EVENT(detach);
 DEFINE_SVC_XPRT_EVENT(free);
 
+#define DEFINE_SVC_TLS_EVENT(name) \
+	DEFINE_EVENT(svc_xprt_event, svc_tls_##name, \
+		TP_PROTO(const struct svc_xprt *xprt), \
+		TP_ARGS(xprt))
+
+DEFINE_SVC_TLS_EVENT(start);
+DEFINE_SVC_TLS_EVENT(upcall);
+DEFINE_SVC_TLS_EVENT(unavailable);
+DEFINE_SVC_TLS_EVENT(not_started);
+DEFINE_SVC_TLS_EVENT(timed_out);
+
 TRACE_EVENT(svc_xprt_accept,
 	TP_PROTO(
 		const struct svc_xprt *xprt,
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3b9708b39e35..84e5d7d31481 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -427,7 +427,7 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
 
 	if (xpt_flags & BIT(XPT_BUSY))
 		return false;
-	if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
+	if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE)))
 		return true;
 	if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) {
 		if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
@@ -828,6 +828,9 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 			module_put(xprt->xpt_class->xcl_owner);
 		}
 		svc_xprt_received(xprt);
+	} else if (test_bit(XPT_HANDSHAKE, &xprt->xpt_flags)) {
+		xprt->xpt_ops->xpo_handshake(xprt);
+		svc_xprt_received(xprt);
 	} else if (svc_xprt_reserve_slot(rqstp, xprt)) {
 		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 4485088ce27b..174783f804fa 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -17,8 +17,9 @@
 #include <net/ipv6.h>
 #include <linux/kernel.h>
 #include <linux/user_namespace.h>
-#define RPCDBG_FACILITY	RPCDBG_AUTH
+#include <trace/events/sunrpc.h>
 
+#define RPCDBG_FACILITY	RPCDBG_AUTH
 
 #include "netns.h"
 
@@ -832,6 +833,7 @@ svcauth_tls_accept(struct svc_rqst *rqstp)
 {
 	struct xdr_stream *xdr = &rqstp->rq_arg_stream;
 	struct svc_cred	*cred = &rqstp->rq_cred;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
 	u32 flavor, len;
 	void *body;
 	__be32 *p;
@@ -865,14 +867,19 @@ svcauth_tls_accept(struct svc_rqst *rqstp)
 	if (cred->cr_group_info == NULL)
 		return SVC_CLOSE;
 
-	if (rqstp->rq_xprt->xpt_ops->xpo_start_tls) {
+	if (xprt->xpt_ops->xpo_handshake) {
 		p = xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2 + 8);
 		if (!p)
 			return SVC_CLOSE;
+		trace_svc_tls_start(xprt);
 		*p++ = rpc_auth_null;
 		*p++ = cpu_to_be32(8);
 		memcpy(p, "STARTTLS", 8);
+
+		set_bit(XPT_HANDSHAKE, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
 	} else {
+		trace_svc_tls_unavailable(xprt);
 		if (xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream,
 						  RPC_AUTH_NULL, NULL, 0) < 0)
 			return SVC_CLOSE;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c5b74f523fc4..a51c9b989d58 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -44,9 +44,11 @@
 #include <net/tcp.h>
 #include <net/tcp_states.h>
 #include <net/tls.h>
+#include <net/handshake.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
 #include <asm/ioctls.h>
+#include <linux/key.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/clnt.h>
@@ -64,6 +66,12 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
+/* To-do: to avoid tying up an nfsd thread while waiting for a
+ * handshake request, the request could instead be deferred.
+ */
+enum {
+	SVC_HANDSHAKE_TO	= 5U * HZ
+};
 
 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
 					 int flags);
@@ -359,6 +367,8 @@ static void svc_data_ready(struct sock *sk)
 		rmb();
 		svsk->sk_odata(sk);
 		trace_svcsock_data_ready(&svsk->sk_xprt, 0);
+		if (test_bit(XPT_HANDSHAKE, &svsk->sk_xprt.xpt_flags))
+			return;
 		if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
 			svc_xprt_enqueue(&svsk->sk_xprt);
 	}
@@ -396,6 +406,88 @@ static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
 	sock_no_linger(svsk->sk_sock->sk);
 }
 
+/**
+ * svc_tcp_handshake_done - Handshake completion handler
+ * @data: address of xprt to wake
+ * @status: status of handshake
+ * @peerid: serial number of key containing the remote peer's identity
+ *
+ * If a security policy is specified as an export option, we don't
+ * have a specific export here to check. So we set a "TLS session
+ * is present" flag on the xprt and let an upper layer enforce local
+ * security policy.
+ */
+static void svc_tcp_handshake_done(void *data, int status, key_serial_t peerid)
+{
+	struct svc_xprt *xprt = data;
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+
+	if (!status) {
+		if (peerid != TLS_NO_PEERID)
+			set_bit(XPT_PEER_AUTH, &xprt->xpt_flags);
+		set_bit(XPT_TLS_SESSION, &xprt->xpt_flags);
+	}
+	clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags);
+	complete_all(&svsk->sk_handshake_done);
+}
+
+/**
+ * svc_tcp_handshake - Perform a transport-layer security handshake
+ * @xprt: connected transport endpoint
+ *
+ */
+static void svc_tcp_handshake(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct sock *sk = svsk->sk_sock->sk;
+	struct tls_handshake_args args = {
+		.ta_sock	= svsk->sk_sock,
+		.ta_done	= svc_tcp_handshake_done,
+		.ta_data	= xprt,
+	};
+	int ret;
+
+	trace_svc_tls_upcall(xprt);
+
+	clear_bit(XPT_TLS_SESSION, &xprt->xpt_flags);
+	init_completion(&svsk->sk_handshake_done);
+
+	ret = tls_server_hello_x509(&args, GFP_KERNEL);
+	if (ret) {
+		trace_svc_tls_not_started(xprt);
+		goto out_failed;
+	}
+
+	ret = wait_for_completion_interruptible_timeout(&svsk->sk_handshake_done,
+							SVC_HANDSHAKE_TO);
+	if (ret <= 0) {
+		if (tls_handshake_cancel(sk)) {
+			trace_svc_tls_timed_out(xprt);
+			goto out_close;
+		}
+	}
+
+	if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) {
+		trace_svc_tls_unavailable(xprt);
+		goto out_close;
+	}
+
+	/* Mark the transport ready in case the remote sent RPC
+	 * traffic before the kernel received the handshake
+	 * completion downcall.
+	 */
+	set_bit(XPT_DATA, &xprt->xpt_flags);
+	svc_xprt_enqueue(xprt);
+	return;
+
+out_close:
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+out_failed:
+	clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags);
+	set_bit(XPT_DATA, &xprt->xpt_flags);
+	svc_xprt_enqueue(xprt);
+}
+
 /*
  * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
  */
@@ -1257,6 +1349,7 @@ static const struct svc_xprt_ops svc_tcp_ops = {
 	.xpo_has_wspace = svc_tcp_has_wspace,
 	.xpo_accept = svc_tcp_accept,
 	.xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt,
+	.xpo_handshake = svc_tcp_handshake,
 };
 
 static struct svc_xprt_class svc_tcp_class = {
@@ -1580,10 +1673,12 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct socket *sock = svsk->sk_sock;
 
-	if (svsk->sk_sock->file)
-		sockfd_put(svsk->sk_sock);
+	tls_handshake_cancel(sock->sk);
+	if (sock->file)
+		sockfd_put(sock);
 	else
-		sock_release(svsk->sk_sock);
+		sock_release(sock);
 	kfree(svsk);
 }
-- 
cgit v1.2.3


From 9280c577431401544e63dfb489a830a42bee25eb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Apr 2023 13:56:31 -0400
Subject: NFSD: Handle new xprtsec= export option

Enable administrators to require clients to use transport layer
security when accessing particular exports.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/export.c                 | 51 +++++++++++++++++++++++++++++++++++++---
 fs/nfsd/export.h                 |  1 +
 include/uapi/linux/nfsd/export.h | 13 ++++++++++
 3 files changed, 62 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 6da74aebe1fb..ae85257b4238 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
 		return -EINVAL;
 	}
 	return 0;
-
 }
 
 #ifdef CONFIG_NFSD_V4
@@ -546,6 +545,29 @@ static inline int
 secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
 #endif
 
+static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	unsigned int i, mode, listsize;
+	int err;
+
+	err = get_uint(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize > NFSEXP_XPRTSEC_NUM)
+		return -EINVAL;
+
+	exp->ex_xprtsec_modes = 0;
+	for (i = 0; i < listsize; i++) {
+		err = get_uint(mesg, &mode);
+		if (err)
+			return err;
+		if (mode > NFSEXP_XPRTSEC_MTLS)
+			return -EINVAL;
+		exp->ex_xprtsec_modes |= mode;
+	}
+	return 0;
+}
+
 static inline int
 nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid)
 {
@@ -608,6 +630,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_client = dom;
 	exp.cd = cd;
 	exp.ex_devid_map = NULL;
+	exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL;
 
 	/* expiry */
 	err = get_expiry(&mesg, &exp.h.expiry_time);
@@ -649,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid);
 			else if (strcmp(buf, "secinfo") == 0)
 				err = secinfo_parse(&mesg, buf, &exp);
+			else if (strcmp(buf, "xprtsec") == 0)
+				err = xprtsec_parse(&mesg, buf, &exp);
 			else
 				/* quietly ignore unknown words and anything
 				 * following. Newer user-space can try to set
@@ -662,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid);
 		if (err)
 			goto out4;
+
 		/*
 		 * No point caching this if it would immediately expire.
 		 * Also, this protects exportfs's dummy export from the
@@ -823,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
 	}
+	new->ex_xprtsec_modes = item->ex_xprtsec_modes;
 }
 
 static struct cache_head *svc_export_alloc(void)
@@ -1034,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd,
 
 __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
 {
-	struct exp_flavor_info *f;
-	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+	struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) {
+		if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags))
+			goto ok;
+	}
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) {
+		if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+		    !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+			goto ok;
+	}
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) {
+		if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+		    test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+			goto ok;
+	}
+	goto denied;
 
+ok:
 	/* legacy gss-only clients are always OK: */
 	if (exp->ex_client == rqstp->rq_gssclient)
 		return 0;
@@ -1061,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
 	if (nfsd4_spo_must_allow(rqstp))
 		return 0;
 
+denied:
 	return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
 }
 
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index d03f7f6a8642..2df8ae25aad3 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -77,6 +77,7 @@ struct svc_export {
 	struct cache_detail	*cd;
 	struct rcu_head		ex_rcu;
 	struct export_stats	ex_stats;
+	unsigned long		ex_xprtsec_modes;
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 2124ba904779..a73ca3703abb 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -62,5 +62,18 @@
 					| NFSEXP_ALLSQUASH \
 					| NFSEXP_INSECURE_PORT)
 
+/*
+ * Transport layer security policies that are permitted to access
+ * an export
+ */
+#define NFSEXP_XPRTSEC_NONE	0x0001
+#define NFSEXP_XPRTSEC_TLS	0x0002
+#define NFSEXP_XPRTSEC_MTLS	0x0004
+
+#define NFSEXP_XPRTSEC_NUM	(3)
+
+#define NFSEXP_XPRTSEC_ALL	(NFSEXP_XPRTSEC_NONE | \
+				 NFSEXP_XPRTSEC_TLS | \
+				 NFSEXP_XPRTSEC_MTLS)
 
 #endif /* _UAPINFSD_EXPORT_H */
-- 
cgit v1.2.3


From daf376a366fd2d469d66ab83dfdc074777462bab Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Mon, 10 Apr 2023 13:06:08 -0500
Subject: uapi nbd: improve doc links to userspace spec

The uapi <linux/nbd.h> header intentionally documents only the NBD
server features that the kernel module will utilize as a client.  But
while it already had one mention of skipped bits due to userspace
extensions, it did not actually direct the reader to the canonical
source to learn about those extensions.

While touching comments, fix an outdated reference that listed only
READ and WRITE as commands.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20230410180611.1051618-2-eblake@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/nbd.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 20d6cc91435d..8797387caaf7 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -11,6 +11,8 @@
  *            Cleanup PARANOIA usage & code.
  * 2004/02/19 Paul Clements
  *            Removed PARANOIA, plus various cleanup and comments
+ * 2023 Copyright Red Hat
+ *            Link to userspace extensions.
  */
 
 #ifndef _UAPILINUX_NBD_H
@@ -30,12 +32,18 @@
 #define NBD_SET_TIMEOUT _IO( 0xab, 9 )
 #define NBD_SET_FLAGS   _IO( 0xab, 10)
 
+/*
+ * See also https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md
+ * for additional userspace extensions not yet utilized in the kernel module.
+ */
+
 enum {
 	NBD_CMD_READ = 0,
 	NBD_CMD_WRITE = 1,
 	NBD_CMD_DISC = 2,
 	NBD_CMD_FLUSH = 3,
 	NBD_CMD_TRIM = 4
+	/* userspace defines additional extension commands */
 };
 
 /* values for flags field, these are server interaction specific. */
@@ -64,14 +72,15 @@ enum {
 #define NBD_REQUEST_MAGIC 0x25609513
 #define NBD_REPLY_MAGIC 0x67446698
 /* Do *not* use magics: 0x12560953 0x96744668. */
+/* magic 0x668e33ef for structured reply not supported by kernel yet */
 
 /*
  * This is the packet used for communication between client and
  * server. All data are in network byte order.
  */
 struct nbd_request {
-	__be32 magic;
-	__be32 type;	/* == READ || == WRITE 	*/
+	__be32 magic;	/* NBD_REQUEST_MAGIC	*/
+	__be32 type;	/* See NBD_CMD_*	*/
 	char handle[8];
 	__be64 from;
 	__be32 len;
@@ -82,7 +91,7 @@ struct nbd_request {
  * it has completed an I/O request (or an error occurs).
  */
 struct nbd_reply {
-	__be32 magic;
+	__be32 magic;		/* NBD_REPLY_MAGIC	*/
 	__be32 error;		/* 0 = ok, else error	*/
 	char handle[8];		/* handle you got from request	*/
 };
-- 
cgit v1.2.3


From 2686eb845da7762ee98b17e578b0c081aafb77b9 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Mon, 10 Apr 2023 13:06:09 -0500
Subject: uapi nbd: add cookie alias to handle

The uapi <linux/nbd.h> header declares a 'char handle[8]' per request;
which is overloaded in English (are you referring to "handle" the
verb, such as handling a signal or writing a callback handler, or
"handle" the noun, the value used in a lookup table to correlate a
response back to the request).  Many user-space NBD implementations
(both servers and clients) have instead used 'uint64_t cookie' or
similar, as it is easier to directly assign an integer than to futz
around with memcpy.  In fact, upstream documentation is now
encouraging this shift in terminology:
https://github.com/NetworkBlockDevice/nbd/commit/ca4392eb2b

Accomplish this by use of an anonymous union to provide the alias for
anyone getting the definition from the uapi; this does not break
existing clients, while exposing the nicer name for those who prefer
it.  Note that block/nbd.c still uses the term handle (in fact, it
actually combines a 32-bit cookie and a 32-bit tag into the 64-bit
handle), but that internal usage is not changed by the public uapi,
since no compliant NBD server has any reason to inspect or alter the
64 bits sent over the socket.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20230410180611.1051618-3-eblake@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/nbd.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 8797387caaf7..80ce0ef43afd 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -12,7 +12,7 @@
  * 2004/02/19 Paul Clements
  *            Removed PARANOIA, plus various cleanup and comments
  * 2023 Copyright Red Hat
- *            Link to userspace extensions.
+ *            Link to userspace extensions, favor cookie over handle.
  */
 
 #ifndef _UAPILINUX_NBD_H
@@ -81,7 +81,10 @@ enum {
 struct nbd_request {
 	__be32 magic;	/* NBD_REQUEST_MAGIC	*/
 	__be32 type;	/* See NBD_CMD_*	*/
-	char handle[8];
+	union {
+		__be64 cookie;	/* Opaque identifier for request	*/
+		char handle[8];	/* older spelling of cookie		*/
+	};
 	__be64 from;
 	__be32 len;
 } __attribute__((packed));
@@ -93,6 +96,9 @@ struct nbd_request {
 struct nbd_reply {
 	__be32 magic;		/* NBD_REPLY_MAGIC	*/
 	__be32 error;		/* 0 = ok, else error	*/
-	char handle[8];		/* handle you got from request	*/
+	union {
+		__be64 cookie;	/* Opaque identifier from request	*/
+		char handle[8];	/* older spelling of cookie		*/
+	};
 };
 #endif /* _UAPILINUX_NBD_H */
-- 
cgit v1.2.3


From 0c8862de05c1a087795ee0a87bf61a6394306cc0 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Wed, 26 Apr 2023 21:49:37 +0300
Subject: tpm: Re-enable TPM chip boostrapping non-tpm_tis TPM drivers

TPM chip bootstrapping was removed from tpm_chip_register(), and it
was relocated to tpm_tis_core. This breaks all drivers which are not
based on tpm_tis because the chip will not get properly initialized.

Take the corrective steps:
1. Rename tpm_chip_startup() as tpm_chip_bootstrap() and make it one-shot.
2. Call tpm_chip_bootstrap() in tpm_chip_register(), which reverts the
   things  as tehy used to be.

Cc: Lino Sanfilippo <l.sanfilippo@kunbus.com>
Fixes: 548eb516ec0f ("tpm, tpm_tis: startup chip before testing for interrupts")
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Link: https://lore.kernel.org/all/ZEjqhwHWBnxcaRV5@xpf.sh.intel.com/
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-chip.c     | 22 +++++++++++++++++++---
 drivers/char/tpm/tpm.h          |  2 +-
 drivers/char/tpm/tpm_tis_core.c |  2 +-
 include/linux/tpm.h             | 13 +++++++------
 4 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 6fdfa65a00c3..9c0a6aad8114 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -606,13 +606,19 @@ static int tpm_get_pcr_allocation(struct tpm_chip *chip)
 }
 
 /*
- * tpm_chip_startup() - performs auto startup and allocates the PCRs
+ * tpm_chip_bootstrap() - Boostrap TPM chip after power on
  * @chip: TPM chip to use.
+ *
+ * Initialize TPM chip after power on. This a one-shot function: subsequent
+ * calls will have no effect.
  */
-int tpm_chip_startup(struct tpm_chip *chip)
+int tpm_chip_bootstrap(struct tpm_chip *chip)
 {
 	int rc;
 
+	if (chip->flags & TPM_CHIP_FLAG_BOOTSTRAPPED)
+		return 0;
+
 	rc = tpm_chip_start(chip);
 	if (rc)
 		return rc;
@@ -625,9 +631,15 @@ int tpm_chip_startup(struct tpm_chip *chip)
 stop:
 	tpm_chip_stop(chip);
 
+	/*
+	 * Unconditionally set, as driver initialization should cease, when the
+	 * boostrapping process fails.
+	 */
+	chip->flags |= TPM_CHIP_FLAG_BOOTSTRAPPED;
+
 	return rc;
 }
-EXPORT_SYMBOL_GPL(tpm_chip_startup);
+EXPORT_SYMBOL_GPL(tpm_chip_bootstrap);
 
 /*
  * tpm_chip_register() - create a character device for the TPM chip
@@ -644,6 +656,10 @@ int tpm_chip_register(struct tpm_chip *chip)
 {
 	int rc;
 
+	rc = tpm_chip_bootstrap(chip);
+	if (rc)
+		return rc;
+
 	tpm_sysfs_add_device(chip);
 
 	tpm_bios_log_setup(chip);
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 88d3bd76e076..f6c99b3f0045 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -263,7 +263,7 @@ static inline void tpm_msleep(unsigned int delay_msec)
 		     delay_msec * 1000);
 };
 
-int tpm_chip_startup(struct tpm_chip *chip);
+int tpm_chip_bootstrap(struct tpm_chip *chip);
 int tpm_chip_start(struct tpm_chip *chip);
 void tpm_chip_stop(struct tpm_chip *chip);
 struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip);
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index c2421162cf34..02945d53fcef 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -1139,7 +1139,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
 	init_waitqueue_head(&priv->read_queue);
 	init_waitqueue_head(&priv->int_queue);
 
-	rc = tpm_chip_startup(chip);
+	rc = tpm_chip_bootstrap(chip);
 	if (rc)
 		goto out_err;
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 4dc97b9f65fb..77693389c3f9 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -274,13 +274,14 @@ enum tpm2_cc_attrs {
 #define TPM_VID_ATML     0x1114
 
 enum tpm_chip_flags {
-	TPM_CHIP_FLAG_TPM2		= BIT(1),
-	TPM_CHIP_FLAG_IRQ		= BIT(2),
-	TPM_CHIP_FLAG_VIRTUAL		= BIT(3),
-	TPM_CHIP_FLAG_HAVE_TIMEOUTS	= BIT(4),
-	TPM_CHIP_FLAG_ALWAYS_POWERED	= BIT(5),
+	TPM_CHIP_FLAG_BOOTSTRAPPED		= BIT(0),
+	TPM_CHIP_FLAG_TPM2			= BIT(1),
+	TPM_CHIP_FLAG_IRQ			= BIT(2),
+	TPM_CHIP_FLAG_VIRTUAL			= BIT(3),
+	TPM_CHIP_FLAG_HAVE_TIMEOUTS		= BIT(4),
+	TPM_CHIP_FLAG_ALWAYS_POWERED		= BIT(5),
 	TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED	= BIT(6),
-	TPM_CHIP_FLAG_FIRMWARE_UPGRADE	= BIT(7),
+	TPM_CHIP_FLAG_FIRMWARE_UPGRADE		= BIT(7),
 };
 
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
-- 
cgit v1.2.3


From fbd2a05f29a95d5b42b294bf47e55a711424965b Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 6 Apr 2023 15:16:52 -0400
Subject: NFSv4.2: Rework scratch handling for READ_PLUS

Instead of using a tiny, static scratch buffer, we should use a kmalloc()-ed
buffer that is allocated when checking for read plus usage. This lets us
use the buffer before decoding any part of the READ_PLUS operation
instead of setting it right before segment decoding, meaning it should
be a little more robust.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs42xdr.c       |  4 ++--
 fs/nfs/nfs4proc.c       | 17 ++++++++++++-----
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index d80ee88ca996..a6df815a140c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1122,7 +1122,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 	uint32_t segments;
 	struct read_plus_segment *segs;
 	int status, i;
-	char scratch_buf[16];
 	__be32 *p;
 
 	status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1143,7 +1142,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 	if (!segs)
 		return -ENOMEM;
 
-	xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
 	status = -EIO;
 	for (i = 0; i < segments; i++) {
 		status = decode_read_plus_segment(xdr, &segs[i]);
@@ -1348,6 +1346,8 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
 	struct compound_hdr hdr;
 	int status;
 
+	xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch));
+
 	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5607b1e2b821..18f25ff4bff7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5439,6 +5439,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
 
 static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
+	if (hdr->res.scratch)
+		kfree(hdr->res.scratch);
 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
 		return -EAGAIN;
 	if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -5452,17 +5454,22 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 }
 
 #if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
 				    struct rpc_message *msg)
 {
 	/* Note: We don't use READ_PLUS with pNFS yet */
-	if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp)
+	if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) {
 		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+		hdr->res.scratch = kmalloc(32, GFP_KERNEL);
+		return hdr->res.scratch != NULL;
+	}
+	return false;
 }
 #else
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
 				    struct rpc_message *msg)
 {
+	return false;
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -5472,8 +5479,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
 	hdr->timestamp   = jiffies;
 	if (!hdr->pgio_done_cb)
 		hdr->pgio_done_cb = nfs4_read_done_cb;
-	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-	nfs42_read_plus_support(hdr, msg);
+	if (!nfs42_read_plus_support(hdr, msg))
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e196ef595908..29a1b39794bf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -670,6 +670,7 @@ struct nfs_pgio_res {
 		struct {
 			unsigned int		replen;		/* used by read */
 			int			eof;		/* used by read */
+			void *			scratch;	/* used by read */
 		};
 		struct {
 			struct nfs_writeverf *	verf;		/* used by write */
-- 
cgit v1.2.3


From e6ce9d741163af0b846637ce6550ae8a671b1588 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Wed, 5 Apr 2023 16:17:06 +0200
Subject: locking/atomic: Add generic try_cmpxchg{,64}_local() support

Add generic support for try_cmpxchg{,64}_local() and their falbacks.

These provides the generic try_cmpxchg_local family of functions
from the arch_ prefixed version, also adding explicit instrumentation.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20230405141710.3551-2-ubizjak@gmail.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/atomic/atomic-arch-fallback.h | 24 +++++++++++++++++++++++-
 include/linux/atomic/atomic-instrumented.h  | 20 +++++++++++++++++++-
 scripts/atomic/gen-atomic-fallback.sh       |  4 ++++
 scripts/atomic/gen-atomic-instrumented.sh   |  2 +-
 4 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 4226379a232d..a6e4437c5f36 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -217,6 +217,28 @@
 
 #endif /* arch_try_cmpxchg64_relaxed */
 
+#ifndef arch_try_cmpxchg_local
+#define arch_try_cmpxchg_local(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg_local((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg_local */
+
+#ifndef arch_try_cmpxchg64_local
+#define arch_try_cmpxchg64_local(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg64_local((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg64_local */
+
 #ifndef arch_atomic_read_acquire
 static __always_inline int
 arch_atomic_read_acquire(const atomic_t *v)
@@ -2646,4 +2668,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v)
 #endif
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 00071fffa021cec66f6290d706d69c91df87bade
+// ad2e2b4d168dbc60a73922616047a9bfa446af36
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 0496816738ca..245ba661c493 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -2132,6 +2132,24 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
 
+#define try_cmpxchg_local(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_local(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
 #define cmpxchg_double(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
@@ -2149,4 +2167,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a
+// 97fe4d79aa058d2164df824632cbc4f716d2a407
diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh
index 3a07695e3c89..6e853f0dad8d 100755
--- a/scripts/atomic/gen-atomic-fallback.sh
+++ b/scripts/atomic/gen-atomic-fallback.sh
@@ -225,6 +225,10 @@ for cmpxchg in "cmpxchg" "cmpxchg64"; do
 	gen_try_cmpxchg_fallbacks "${cmpxchg}"
 done
 
+for cmpxchg in "cmpxchg_local" "cmpxchg64_local"; do
+	gen_try_cmpxchg_fallback "${cmpxchg}" ""
+done
+
 grep '^[a-z]' "$1" | while read name meta args; do
 	gen_proto "${meta}" "${name}" "atomic" "int" ${args}
 done
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 77c06526a574..c8165e9431bf 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -173,7 +173,7 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg" "try_cmpxchg64"; do
 	done
 done
 
-for xchg in "cmpxchg_local" "cmpxchg64_local" "sync_cmpxchg"; do
+for xchg in "cmpxchg_local" "cmpxchg64_local" "sync_cmpxchg" "try_cmpxchg_local" "try_cmpxchg64_local" ; do
 	gen_xchg "${xchg}" "" ""
 	printf "\n"
 done
-- 
cgit v1.2.3


From 8fc4fddaf9a184eea7da21290236a1764e608a01 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Wed, 5 Apr 2023 16:17:07 +0200
Subject: locking/generic: Wire up local{,64}_try_cmpxchg()

Implement generic support for local{,64}_try_cmpxchg().

Redirect to the atomic_ family of functions when the target
does not provide its own local.h definitions.

For 64-bit targets, implement local64_try_cmpxchg and
local64_cmpxchg using typed C wrappers that call local_
family of functions and provide additional checking
of their input arguments.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20230405141710.3551-3-ubizjak@gmail.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/local.h   |  1 +
 include/asm-generic/local64.h | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/local.h b/include/asm-generic/local.h
index fca7f1d84818..7f97018df66f 100644
--- a/include/asm-generic/local.h
+++ b/include/asm-generic/local.h
@@ -42,6 +42,7 @@ typedef struct
 #define local_inc_return(l) atomic_long_inc_return(&(l)->a)
 
 #define local_cmpxchg(l, o, n) atomic_long_cmpxchg((&(l)->a), (o), (n))
+#define local_try_cmpxchg(l, po, n) atomic_long_try_cmpxchg((&(l)->a), (po), (n))
 #define local_xchg(l, n) atomic_long_xchg((&(l)->a), (n))
 #define local_add_unless(l, _a, u) atomic_long_add_unless((&(l)->a), (_a), (u))
 #define local_inc_not_zero(l) atomic_long_inc_not_zero(&(l)->a)
diff --git a/include/asm-generic/local64.h b/include/asm-generic/local64.h
index 765be0b7d883..14963a7a6253 100644
--- a/include/asm-generic/local64.h
+++ b/include/asm-generic/local64.h
@@ -42,7 +42,16 @@ typedef struct {
 #define local64_sub_return(i, l) local_sub_return((i), (&(l)->a))
 #define local64_inc_return(l)	local_inc_return(&(l)->a)
 
-#define local64_cmpxchg(l, o, n) local_cmpxchg((&(l)->a), (o), (n))
+static inline s64 local64_cmpxchg(local64_t *l, s64 old, s64 new)
+{
+	return local_cmpxchg(&l->a, old, new);
+}
+
+static inline bool local64_try_cmpxchg(local64_t *l, s64 *old, s64 new)
+{
+	return local_try_cmpxchg(&l->a, (long *)old, new);
+}
+
 #define local64_xchg(l, n)	local_xchg((&(l)->a), (n))
 #define local64_add_unless(l, _a, u) local_add_unless((&(l)->a), (_a), (u))
 #define local64_inc_not_zero(l)	local_inc_not_zero(&(l)->a)
@@ -81,6 +90,7 @@ typedef struct {
 #define local64_inc_return(l)	atomic64_inc_return(&(l)->a)
 
 #define local64_cmpxchg(l, o, n) atomic64_cmpxchg((&(l)->a), (o), (n))
+#define local64_try_cmpxchg(l, po, n) atomic64_try_cmpxchg((&(l)->a), (po), (n))
 #define local64_xchg(l, n)	atomic64_xchg((&(l)->a), (n))
 #define local64_add_unless(l, _a, u) atomic64_add_unless((&(l)->a), (_a), (u))
 #define local64_inc_not_zero(l)	atomic64_inc_not_zero(&(l)->a)
-- 
cgit v1.2.3


From ec570320b09f76d52819e60abdccf372658216b6 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 13 Apr 2023 17:06:44 +0100
Subject: locking/atomic: Correct (cmp)xchg() instrumentation

All xchg() and cmpxchg() ops are atomic RMWs, but currently we
instrument these with instrument_atomic_write() rather than
instrument_atomic_read_write(), missing the read aspect.

Similarly, all try_cmpxchg() ops are non-atomic RMWs on *oldp, but we
instrument these accesses with instrument_atomic_write() rather than
instrument_read_write(), missing the read aspect and erroneously marking
these as atomic.

Fix the instrumentation for both points.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230413160644.490976-1-mark.rutland@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/atomic/atomic-instrumented.h | 76 +++++++++++++++---------------
 scripts/atomic/gen-atomic-instrumented.sh  |  6 +--
 2 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 245ba661c493..03a232a1fa57 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1948,14 +1948,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg(__ai_ptr, __VA_ARGS__); \
 })
 
 #define xchg_acquire(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg_acquire(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -1963,14 +1963,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_release(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg_release(__ai_ptr, __VA_ARGS__); \
 })
 
 #define xchg_relaxed(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_xchg_relaxed(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -1978,14 +1978,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
 
 #define cmpxchg_acquire(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -1993,14 +1993,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_release(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
 })
 
 #define cmpxchg_relaxed(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2008,14 +2008,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \
 })
 
 #define cmpxchg64_acquire(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2023,14 +2023,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_release(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
 })
 
 #define cmpxchg64_relaxed(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2039,8 +2039,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2048,8 +2048,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2058,8 +2058,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
 	kcsan_release(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2067,8 +2067,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2077,8 +2077,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2086,8 +2086,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2096,8 +2096,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
 	kcsan_release(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2105,22 +2105,22 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
 #define cmpxchg_local(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg_local(__ai_ptr, __VA_ARGS__); \
 })
 
 #define cmpxchg64_local(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2128,7 +2128,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
 	arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2136,8 +2136,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2145,8 +2145,8 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	typeof(oldp) __ai_oldp = (oldp); \
-	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
-	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
 	arch_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
@@ -2154,7 +2154,7 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
 	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
 	arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \
 })
 
@@ -2162,9 +2162,9 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 #define cmpxchg_double_local(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
+	instrument_atomic_read_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
 	arch_cmpxchg_double_local(__ai_ptr, __VA_ARGS__); \
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 97fe4d79aa058d2164df824632cbc4f716d2a407
+// 6b513a42e1a1b5962532a019b7fc91eaa044ad5e
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index c8165e9431bf..d9ffd74f73ca 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -104,8 +104,8 @@ cat <<EOF
 EOF
 [ -n "$kcsan_barrier" ] && printf "\t${kcsan_barrier}; \\\\\n"
 cat <<EOF
-	instrument_atomic_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
-	instrument_atomic_write(__ai_oldp, ${mult}sizeof(*__ai_oldp)); \\
+	instrument_atomic_read_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
+	instrument_read_write(__ai_oldp, ${mult}sizeof(*__ai_oldp)); \\
 	arch_${xchg}${order}(__ai_ptr, __ai_oldp, __VA_ARGS__); \\
 })
 EOF
@@ -119,7 +119,7 @@ cat <<EOF
 EOF
 [ -n "$kcsan_barrier" ] && printf "\t${kcsan_barrier}; \\\\\n"
 cat <<EOF
-	instrument_atomic_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
+	instrument_atomic_read_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
 	arch_${xchg}${order}(__ai_ptr, __VA_ARGS__); \\
 })
 EOF
-- 
cgit v1.2.3


From db099c625b13a74d462521a46d98a8ce5b53af5d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 28 Apr 2023 21:27:56 +0100
Subject: rxrpc: Fix timeout of a call that hasn't yet been granted a channel

afs_make_call() calls rxrpc_kernel_begin_call() to begin a call (which may
get stalled in the background waiting for a connection to become
available); it then calls rxrpc_kernel_set_max_life() to set the timeouts -
but that starts the call timer so the call timer might then expire before
we get a connection assigned - leading to the following oops if the call
stalled:

	BUG: kernel NULL pointer dereference, address: 0000000000000000
	...
	CPU: 1 PID: 5111 Comm: krxrpcio/0 Not tainted 6.3.0-rc7-build3+ #701
	RIP: 0010:rxrpc_alloc_txbuf+0xc0/0x157
	...
	Call Trace:
	 <TASK>
	 rxrpc_send_ACK+0x50/0x13b
	 rxrpc_input_call_event+0x16a/0x67d
	 rxrpc_io_thread+0x1b6/0x45f
	 ? _raw_spin_unlock_irqrestore+0x1f/0x35
	 ? rxrpc_input_packet+0x519/0x519
	 kthread+0xe7/0xef
	 ? kthread_complete_and_exit+0x1b/0x1b
	 ret_from_fork+0x22/0x30

Fix this by noting the timeouts in struct rxrpc_call when the call is
created.  The timer will be started when the first packet is transmitted.

It shouldn't be possible to trigger this directly from userspace through
AF_RXRPC as sendmsg() will return EBUSY if the call is in the
waiting-for-conn state if it dropped out of the wait due to a signal.

Fixes: 9d35d880e0e4 ("rxrpc: Move client call connection to the I/O thread")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/afs/afs.h            |  4 ++--
 fs/afs/internal.h       |  2 +-
 fs/afs/rxrpc.c          |  8 +++-----
 include/net/af_rxrpc.h  | 21 +++++++++++----------
 net/rxrpc/af_rxrpc.c    |  3 +++
 net/rxrpc/ar-internal.h |  1 +
 net/rxrpc/call_object.c |  9 ++++++++-
 net/rxrpc/sendmsg.c     |  1 +
 8 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 432cb4b23961..81815724db6c 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -19,8 +19,8 @@
 #define AFSPATHMAX		1024	/* Maximum length of a pathname plus NUL */
 #define AFSOPAQUEMAX		1024	/* Maximum length of an opaque field */
 
-#define AFS_VL_MAX_LIFESPAN	(120 * HZ)
-#define AFS_PROBE_MAX_LIFESPAN	(30 * HZ)
+#define AFS_VL_MAX_LIFESPAN	120
+#define AFS_PROBE_MAX_LIFESPAN	30
 
 typedef u64			afs_volid_t;
 typedef u64			afs_vnodeid_t;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ad8523d0d038..68ae91d21b57 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -128,7 +128,7 @@ struct afs_call {
 	spinlock_t		state_lock;
 	int			error;		/* error code */
 	u32			abort_code;	/* Remote abort ID or 0 */
-	unsigned int		max_lifespan;	/* Maximum lifespan to set if not 0 */
+	unsigned int		max_lifespan;	/* Maximum lifespan in secs to set if not 0 */
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
 	unsigned		count2;		/* count used in unmarshalling */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e08b850c3e6d..ed1644e7683f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -335,7 +335,9 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
 	/* create a call */
 	rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
 					 (unsigned long)call,
-					 tx_total_len, gfp,
+					 tx_total_len,
+					 call->max_lifespan,
+					 gfp,
 					 (call->async ?
 					  afs_wake_up_async_call :
 					  afs_wake_up_call_waiter),
@@ -350,10 +352,6 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
 	}
 
 	call->rxcall = rxcall;
-
-	if (call->max_lifespan)
-		rxrpc_kernel_set_max_life(call->net->socket, rxcall,
-					  call->max_lifespan);
 	call->issue_time = ktime_get_real();
 
 	/* send the request */
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index 01a35e113ab9..5531dd08061e 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -40,16 +40,17 @@ typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long);
 void rxrpc_kernel_new_call_notification(struct socket *,
 					rxrpc_notify_new_call_t,
 					rxrpc_discard_new_call_t);
-struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *,
-					   struct sockaddr_rxrpc *,
-					   struct key *,
-					   unsigned long,
-					   s64,
-					   gfp_t,
-					   rxrpc_notify_rx_t,
-					   bool,
-					   enum rxrpc_interruptibility,
-					   unsigned int);
+struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+					   struct sockaddr_rxrpc *srx,
+					   struct key *key,
+					   unsigned long user_call_ID,
+					   s64 tx_total_len,
+					   u32 hard_timeout,
+					   gfp_t gfp,
+					   rxrpc_notify_rx_t notify_rx,
+					   bool upgrade,
+					   enum rxrpc_interruptibility interruptibility,
+					   unsigned int debug_id);
 int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *,
 			   struct msghdr *, size_t,
 			   rxrpc_notify_end_tx_t);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c32b164206f9..31f738d65f1c 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -265,6 +265,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
  * @key: The security context to use (defaults to socket setting)
  * @user_call_ID: The ID to use
  * @tx_total_len: Total length of data to transmit during the call (or -1)
+ * @hard_timeout: The maximum lifespan of the call in sec
  * @gfp: The allocation constraints
  * @notify_rx: Where to send notifications instead of socket queue
  * @upgrade: Request service upgrade for call
@@ -283,6 +284,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 					   struct key *key,
 					   unsigned long user_call_ID,
 					   s64 tx_total_len,
+					   u32 hard_timeout,
 					   gfp_t gfp,
 					   rxrpc_notify_rx_t notify_rx,
 					   bool upgrade,
@@ -313,6 +315,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	p.tx_total_len		= tx_total_len;
 	p.interruptibility	= interruptibility;
 	p.kernel		= true;
+	p.timeouts.hard		= hard_timeout;
 
 	memset(&cp, 0, sizeof(cp));
 	cp.local		= rx->local;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 67b0a894162d..5d44dc08f66d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -616,6 +616,7 @@ struct rxrpc_call {
 	unsigned long		expect_term_by;	/* When we expect call termination by */
 	u32			next_rx_timo;	/* Timeout for next Rx packet (jif) */
 	u32			next_req_timo;	/* Timeout for next Rx request packet (jif) */
+	u32			hard_timo;	/* Maximum lifetime or 0 (jif) */
 	struct timer_list	timer;		/* Combined event timer */
 	struct work_struct	destroyer;	/* In-process-context destroyer */
 	rxrpc_notify_rx_t	notify_rx;	/* kernel service Rx notification function */
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index e9f1f49d18c2..fecbc73054bc 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -226,6 +226,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
 	if (cp->exclusive)
 		__set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
 
+	if (p->timeouts.normal)
+		call->next_rx_timo = min(msecs_to_jiffies(p->timeouts.normal), 1UL);
+	if (p->timeouts.idle)
+		call->next_req_timo = min(msecs_to_jiffies(p->timeouts.idle), 1UL);
+	if (p->timeouts.hard)
+		call->hard_timo = p->timeouts.hard * HZ;
+
 	ret = rxrpc_init_client_call_security(call);
 	if (ret < 0) {
 		rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret);
@@ -257,7 +264,7 @@ void rxrpc_start_call_timer(struct rxrpc_call *call)
 	call->keepalive_at = j;
 	call->expect_rx_by = j;
 	call->expect_req_by = j;
-	call->expect_term_by = j;
+	call->expect_term_by = j + call->hard_timo;
 	call->timer.expires = now;
 }
 
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index c1b074c17b33..8e0b94714e84 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -651,6 +651,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		if (IS_ERR(call))
 			return PTR_ERR(call);
 		/* ... and we have the call lock. */
+		p.call.nr_timeouts = 0;
 		ret = 0;
 		if (rxrpc_call_is_complete(call))
 			goto out_put_unlock;
-- 
cgit v1.2.3


From 24139c07f413ef4b555482c758343d71392a19bc Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Sat, 22 Apr 2023 22:54:18 +0200
Subject: mm/ksm: unmerge and clear VM_MERGEABLE when setting
 PR_SET_MEMORY_MERGE=0

Patch series "mm/ksm: improve PR_SET_MEMORY_MERGE=0 handling and cleanup
disabling KSM", v2.

(1) Make PR_SET_MEMORY_MERGE=0 unmerge pages like setting MADV_UNMERGEABLE
does, (2) add a selftest for it and (3) factor out disabling of KSM from
s390/gmap code.


This patch (of 3):

Let's unmerge any KSM pages when setting PR_SET_MEMORY_MERGE=0, and clear
the VM_MERGEABLE flag from all VMAs -- just like KSM would.  Of course,
only do that if we previously set PR_SET_MEMORY_MERGE=1.

Link: https://lkml.kernel.org/r/20230422205420.30372-1-david@redhat.com
Link: https://lkml.kernel.org/r/20230422205420.30372-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Stefan Roesch <shr@devkernel.io>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h |  1 +
 kernel/sys.c        | 12 +++--------
 mm/ksm.c            | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7a9b76fb6c3f..429efa6ff4ae 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -21,6 +21,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 
 void ksm_add_vma(struct vm_area_struct *vma);
 int ksm_enable_merge_any(struct mm_struct *mm);
+int ksm_disable_merge_any(struct mm_struct *mm);
 
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
diff --git a/kernel/sys.c b/kernel/sys.c
index 72cdb16e2636..339fee3eff6a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2695,16 +2695,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (mmap_write_lock_killable(me->mm))
 			return -EINTR;
 
-		if (arg2) {
+		if (arg2)
 			error = ksm_enable_merge_any(me->mm);
-		} else {
-			/*
-			 * TODO: we might want disable KSM on all VMAs and
-			 * trigger unsharing to completely disable KSM.
-			 */
-			clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
-			error = 0;
-		}
+		else
+			error = ksm_disable_merge_any(me->mm);
 		mmap_write_unlock(me->mm);
 		break;
 	case PR_GET_MEMORY_MERGE:
diff --git a/mm/ksm.c b/mm/ksm.c
index 9e48258985d2..823bb3475a68 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2520,6 +2520,22 @@ static void __ksm_add_vma(struct vm_area_struct *vma)
 		vm_flags_set(vma, VM_MERGEABLE);
 }
 
+static int __ksm_del_vma(struct vm_area_struct *vma)
+{
+	int err;
+
+	if (!(vma->vm_flags & VM_MERGEABLE))
+		return 0;
+
+	if (vma->anon_vma) {
+		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+		if (err)
+			return err;
+	}
+
+	vm_flags_clear(vma, VM_MERGEABLE);
+	return 0;
+}
 /**
  * ksm_add_vma - Mark vma as mergeable if compatible
  *
@@ -2542,6 +2558,20 @@ static void ksm_add_vmas(struct mm_struct *mm)
 		__ksm_add_vma(vma);
 }
 
+static int ksm_del_vmas(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int err;
+
+	VMA_ITERATOR(vmi, mm, 0);
+	for_each_vma(vmi, vma) {
+		err = __ksm_del_vma(vma);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 /**
  * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
  *                        compatible VMA's
@@ -2569,6 +2599,35 @@ int ksm_enable_merge_any(struct mm_struct *mm)
 	return 0;
 }
 
+/**
+ * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
+ *			   previously enabled via ksm_enable_merge_any().
+ *
+ * Disabling merging implies unmerging any merged pages, like setting
+ * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
+ * merging on all compatible VMA's remains enabled.
+ *
+ * @mm: Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_disable_merge_any(struct mm_struct *mm)
+{
+	int err;
+
+	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return 0;
+
+	err = ksm_del_vmas(mm);
+	if (err) {
+		ksm_add_vmas(mm);
+		return err;
+	}
+
+	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+	return 0;
+}
+
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
-- 
cgit v1.2.3


From 2c281f54f556e1f3266c8cb104adf9eea7a7b742 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Sat, 22 Apr 2023 23:01:56 +0200
Subject: mm/ksm: move disabling KSM from s390/gmap code to KSM code

Let's factor out actual disabling of KSM.  The existing "mm->def_flags &=
~VM_MERGEABLE;" was essentially a NOP and can be dropped, because
def_flags should never include VM_MERGEABLE.  Note that we don't currently
prevent re-enabling KSM.

This should now be faster in case KSM was never enabled, because we only
conditionally iterate all VMAs.  Further, it certainly looks cleaner.

Link: https://lkml.kernel.org/r/20230422210156.33630-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Janosch Frank <frankja@linux.ibm.com>
Acked-by: Stefan Roesch <shr@devkernel.io>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/gmap.c | 20 +-------------------
 include/linux/ksm.h |  6 ++++++
 mm/ksm.c            | 11 +++++++++++
 3 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 0949811761e6..dfe905c7bd8e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2585,30 +2585,12 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
 
 int gmap_mark_unmergeable(void)
 {
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long vm_flags;
-	int ret;
-	VMA_ITERATOR(vmi, mm, 0);
-
 	/*
 	 * Make sure to disable KSM (if enabled for the whole process or
 	 * individual VMAs). Note that nothing currently hinders user space
 	 * from re-enabling it.
 	 */
-	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
-
-	for_each_vma(vmi, vma) {
-		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
-		vm_flags = vma->vm_flags;
-		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-				  MADV_UNMERGEABLE, &vm_flags);
-		if (ret)
-			return ret;
-		vm_flags_reset(vma, vm_flags);
-	}
-	mm->def_flags &= ~VM_MERGEABLE;
-	return 0;
+	return ksm_disable(current->mm);
 }
 EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
 
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 429efa6ff4ae..899a314bc487 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -22,6 +22,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 void ksm_add_vma(struct vm_area_struct *vma);
 int ksm_enable_merge_any(struct mm_struct *mm);
 int ksm_disable_merge_any(struct mm_struct *mm);
+int ksm_disable(struct mm_struct *mm);
 
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
@@ -80,6 +81,11 @@ static inline void ksm_add_vma(struct vm_area_struct *vma)
 {
 }
 
+static inline int ksm_disable(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 823bb3475a68..0156bded3a66 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2628,6 +2628,17 @@ int ksm_disable_merge_any(struct mm_struct *mm)
 	return 0;
 }
 
+int ksm_disable(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+		return 0;
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return ksm_disable_merge_any(mm);
+	return ksm_del_vmas(mm);
+}
+
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
-- 
cgit v1.2.3


From 245f0922689364b21163af4937a05ea0ba576fae Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 17 Apr 2023 12:53:23 +0800
Subject: mm: hwpoison: coredump: support recovery from dump_user_range()

dump_user_range() is used to copy the user page to a coredump file, but if
a hardware memory error occurred during copy, which called from
__kernel_write_iter() in dump_user_range(), it crashes,

  CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425

  pc : __memcpy+0x110/0x260
  lr : _copy_from_iter+0x3bc/0x4c8
  ...
  Call trace:
   __memcpy+0x110/0x260
   copy_page_from_iter+0xcc/0x130
   pipe_write+0x164/0x6d8
   __kernel_write_iter+0x9c/0x210
   dump_user_range+0xc8/0x1d8
   elf_core_dump+0x308/0x368
   do_coredump+0x2e8/0xa40
   get_signal+0x59c/0x788
   do_signal+0x118/0x1f8
   do_notify_resume+0xf0/0x280
   el0_da+0x130/0x138
   el0t_64_sync_handler+0x68/0xc0
   el0t_64_sync+0x188/0x190

Generally, the '->write_iter' of file ops will use copy_page_from_iter()
and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
in both of them to handle #MC during source read, which stop coredump
processing and kill the task instead of kernel panic, but the source
address may not always a user address, so introduce a new copy_mc flag in
struct iov_iter{} to indicate that the iter could do a safe memory copy,
also introduce the helpers to set/cleck the flag, for now, it's only used
in coredump's dump_user_range(), but it could expand to any other
scenarios to fix the similar issue.

Link: https://lkml.kernel.org/r/20230417045323.11054-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Tong Tiangen <tongtiangen@huawei.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/coredump.c       |  1 +
 include/linux/uio.h | 16 ++++++++++++++++
 lib/iov_iter.c      | 17 +++++++++++++++--
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	pos = file->f_pos;
 	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
 	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+	iov_iter_set_copy_mc(&iter);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
 		return 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3d386849a758..044c1d8c230c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -42,6 +42,7 @@ struct iov_iter_state {
 
 struct iov_iter {
 	u8 iter_type;
+	bool copy_mc;
 	bool nofault;
 	bool data_source;
 	bool user_backed;
@@ -256,8 +257,22 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
 
 #ifdef CONFIG_ARCH_HAS_COPY_MC
 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+static inline void iov_iter_set_copy_mc(struct iov_iter *i)
+{
+	i->copy_mc = true;
+}
+
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+	return i->copy_mc;
+}
 #else
 #define _copy_mc_to_iter _copy_to_iter
+static inline void iov_iter_set_copy_mc(struct iov_iter *i) { }
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+	return false;
+}
 #endif
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
@@ -380,6 +395,7 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
+		.copy_mc = false,
 		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c3dbe994112c..960223ed9199 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -434,6 +434,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_IOVEC,
+		.copy_mc = false,
 		.nofault = false,
 		.user_backed = true,
 		.data_source = direction,
@@ -630,6 +631,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
 #endif /* CONFIG_ARCH_HAS_COPY_MC */
 
+static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
+				 size_t size)
+{
+	if (iov_iter_is_copy_mc(i))
+		return (void *)copy_mc_to_kernel(to, from, size);
+	return memcpy(to, from, size);
+}
+
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(!i->data_source))
@@ -639,7 +648,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(addr + off, base, len),
-		memcpy(addr + off, base, len)
+		memcpy_from_iter(i, addr + off, base, len)
 	)
 
 	return bytes;
@@ -862,7 +871,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
 	}
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(p + off, base, len),
-		memcpy(p + off, base, len)
+		memcpy_from_iter(i, p + off, base, len)
 	)
 	kunmap_atomic(kaddr);
 	return bytes;
@@ -1043,6 +1052,7 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter){
 		.iter_type = ITER_KVEC,
+		.copy_mc = false,
 		.data_source = direction,
 		.kvec = kvec,
 		.nr_segs = nr_segs,
@@ -1059,6 +1069,7 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 	WARN_ON(direction & ~(READ | WRITE));
 	*i = (struct iov_iter){
 		.iter_type = ITER_BVEC,
+		.copy_mc = false,
 		.data_source = direction,
 		.bvec = bvec,
 		.nr_segs = nr_segs,
@@ -1105,6 +1116,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 	BUG_ON(direction & ~1);
 	*i = (struct iov_iter) {
 		.iter_type = ITER_XARRAY,
+		.copy_mc = false,
 		.data_source = direction,
 		.xarray = xarray,
 		.xarray_start = start,
@@ -1128,6 +1140,7 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 	BUG_ON(direction != READ);
 	*i = (struct iov_iter){
 		.iter_type = ITER_DISCARD,
+		.copy_mc = false,
 		.data_source = false,
 		.count = count,
 		.iov_offset = 0
-- 
cgit v1.2.3


From 0199849acd07d07e2a8e42757653ca8b14a122f5 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 2 May 2023 18:30:04 -0700
Subject: sysctl: remove register_sysctl_paths()

The deprecation for register_sysctl_paths() is over. We can rejoice as
we nuke register_sysctl_paths(). The routine register_sysctl_table()
was the only user left of register_sysctl_paths(), so we can now just
open code and move the implementation over to what used to be
to __register_sysctl_paths().

The old dynamic struct ctl_table_set *set is now the point to
sysctl_table_root.default_set.

The old dynamic const struct ctl_path *path was being used in the
routine register_sysctl_paths() with a static:

static const struct ctl_path null_path[] = { {} };

Since this is a null path we can now just simplfy the old routine
and remove its use as its always empty.

This saves us a total of 230 bytes.

$ ./scripts/bloat-o-meter vmlinux.old vmlinux
add/remove: 2/7 grow/shrink: 1/1 up/down: 1015/-1245 (-230)
Function                                     old     new   delta
register_leaf_sysctl_tables.constprop          -     524    +524
register_sysctl_table                         22     497    +475
__pfx_register_leaf_sysctl_tables.constprop       -      16     +16
null_path                                      8       -      -8
__pfx_register_sysctl_paths                   16       -     -16
__pfx_register_leaf_sysctl_tables             16       -     -16
__pfx___register_sysctl_paths                 16       -     -16
__register_sysctl_base                        29      12     -17
register_sysctl_paths                         18       -     -18
register_leaf_sysctl_tables                  534       -    -534
__register_sysctl_paths                      620       -    -620
Total: Before=21259666, After=21259436, chg -0.00%

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/proc/proc_sysctl.c     | 55 ++++-------------------------------------------
 include/linux/sysctl.h    | 12 -----------
 scripts/check-sysctl-docs | 16 --------------
 3 files changed, 4 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 81dbb175017e..8038833ff5b0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1575,25 +1575,18 @@ out:
 }
 
 /**
- * __register_sysctl_paths - register a sysctl table hierarchy
- * @set: Sysctl tree to register on
- * @path: The path to the directory the sysctl table is in.
+ * register_sysctl_table - register a sysctl table hierarchy
  * @table: the top-level table structure
  *
  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
  * array. A completely 0 filled entry terminates the table.
  * We are slowly deprecating this call so avoid its use.
- *
- * See __register_sysctl_table for more details.
  */
-struct ctl_table_header *__register_sysctl_paths(
-	struct ctl_table_set *set,
-	const struct ctl_path *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
 {
 	struct ctl_table *ctl_table_arg = table;
 	int nr_subheaders = count_subheaders(table);
 	struct ctl_table_header *header = NULL, **subheaders, **subheader;
-	const struct ctl_path *component;
 	char *new_path, *pos;
 
 	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -1601,11 +1594,6 @@ struct ctl_table_header *__register_sysctl_paths(
 		return NULL;
 
 	pos[0] = '\0';
-	for (component = path; component->procname; component++) {
-		pos = append_path(new_path, pos, component->procname);
-		if (!pos)
-			goto out;
-	}
 	while (table->procname && table->child && !table[1].procname) {
 		pos = append_path(new_path, pos, table->procname);
 		if (!pos)
@@ -1613,7 +1601,7 @@ struct ctl_table_header *__register_sysctl_paths(
 		table = table->child;
 	}
 	if (nr_subheaders == 1) {
-		header = __register_sysctl_table(set, new_path, table);
+		header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table);
 		if (header)
 			header->ctl_table_arg = ctl_table_arg;
 	} else {
@@ -1627,7 +1615,7 @@ struct ctl_table_header *__register_sysctl_paths(
 		header->ctl_table_arg = ctl_table_arg;
 
 		if (register_leaf_sysctl_tables(new_path, pos, &subheader,
-						set, table))
+						&sysctl_table_root.default_set, table))
 			goto err_register_leaves;
 	}
 
@@ -1646,41 +1634,6 @@ err_register_leaves:
 	header = NULL;
 	goto out;
 }
-
-/**
- * register_sysctl_paths - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- * We are slowly deprecating this caller so avoid future uses of it.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						struct ctl_table *table)
-{
-	return __register_sysctl_paths(&sysctl_table_root.default_set,
-					path, table);
-}
-EXPORT_SYMBOL(register_sysctl_paths);
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
-	static const struct ctl_path null_path[] = { {} };
-
-	return register_sysctl_paths(null_path, table);
-}
 EXPORT_SYMBOL(register_sysctl_table);
 
 int __register_sysctl_base(struct ctl_table *base_table)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 780690dc08cd..3d08277959af 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -221,14 +221,8 @@ extern void retire_sysctl_set(struct ctl_table_set *set);
 struct ctl_table_header *__register_sysctl_table(
 	struct ctl_table_set *set,
 	const char *path, struct ctl_table *table);
-struct ctl_table_header *__register_sysctl_paths(
-	struct ctl_table_set *set,
-	const struct ctl_path *path, struct ctl_table *table);
 struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table);
 struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						struct ctl_table *table);
-
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init_bases(void);
@@ -277,12 +271,6 @@ static inline struct ctl_table_header *register_sysctl_mount_point(const char *p
 	return NULL;
 }
 
-static inline struct ctl_table_header *register_sysctl_paths(
-			const struct ctl_path *path, struct ctl_table *table)
-{
-	return NULL;
-}
-
 static inline struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
 {
 	return NULL;
diff --git a/scripts/check-sysctl-docs b/scripts/check-sysctl-docs
index 8bcb9e26c7bc..edc9a629d79e 100755
--- a/scripts/check-sysctl-docs
+++ b/scripts/check-sysctl-docs
@@ -156,22 +156,6 @@ curtable && /\.procname[\t ]*=[\t ]*".+"/ {
     }
 }
 
-/register_sysctl_paths\(.*\)/ {
-    match($0, /register_sysctl_paths\(([^)]+), ([^)]+)\)/, tables)
-    if (debug) print "Attaching table " tables[2] " to path " tables[1]
-    if (paths[tables[1]] == table) {
-	for (entry in entries[tables[2]]) {
-	    printentry(entry)
-	}
-    }
-    split(paths[tables[1]], components, "/")
-    if (length(components) > 1 && components[1] == table) {
-	# Count the first subdirectory as seen
-	seen[components[2]]++
-    }
-}
-
-
 END {
     for (entry in documented) {
 	if (!seen[entry]) {
-- 
cgit v1.2.3


From c1592a89942e9678f7d9c8030efa777c0d57edab Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 2 May 2023 10:25:24 +0200
Subject: netfilter: nf_tables: deactivate anonymous set from preparation phase

Toggle deleted anonymous sets as inactive in the next generation, so
users cannot perform any update on it. Clear the generation bitmask
in case the transaction is aborted.

The following KASAN splat shows a set element deletion for a bound
anonymous set that has been already removed in the same transaction.

[   64.921510] ==================================================================
[   64.923123] BUG: KASAN: wild-memory-access in nf_tables_commit+0xa24/0x1490 [nf_tables]
[   64.924745] Write of size 8 at addr dead000000000122 by task test/890
[   64.927903] CPU: 3 PID: 890 Comm: test Not tainted 6.3.0+ #253
[   64.931120] Call Trace:
[   64.932699]  <TASK>
[   64.934292]  dump_stack_lvl+0x33/0x50
[   64.935908]  ? nf_tables_commit+0xa24/0x1490 [nf_tables]
[   64.937551]  kasan_report+0xda/0x120
[   64.939186]  ? nf_tables_commit+0xa24/0x1490 [nf_tables]
[   64.940814]  nf_tables_commit+0xa24/0x1490 [nf_tables]
[   64.942452]  ? __kasan_slab_alloc+0x2d/0x60
[   64.944070]  ? nf_tables_setelem_notify+0x190/0x190 [nf_tables]
[   64.945710]  ? kasan_set_track+0x21/0x30
[   64.947323]  nfnetlink_rcv_batch+0x709/0xd90 [nfnetlink]
[   64.948898]  ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  1 +
 net/netfilter/nf_tables_api.c     | 12 ++++++++++++
 net/netfilter/nft_dynset.c        |  2 +-
 net/netfilter/nft_lookup.c        |  2 +-
 net/netfilter/nft_objref.c        |  2 +-
 5 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3ed21d2d5659..2e24ea1d744c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -619,6 +619,7 @@ struct nft_set_binding {
 };
 
 enum nft_trans_phase;
+void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set);
 void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
 			      struct nft_set_binding *binding,
 			      enum nft_trans_phase phase);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8b6c61a2196c..59fb8320ab4d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5127,12 +5127,24 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	}
 }
 
+void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	if (nft_set_is_anonymous(set))
+		nft_clear(ctx->net, set);
+
+	set->use++;
+}
+EXPORT_SYMBOL_GPL(nf_tables_activate_set);
+
 void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
 			      struct nft_set_binding *binding,
 			      enum nft_trans_phase phase)
 {
 	switch (phase) {
 	case NFT_TRANS_PREPARE:
+		if (nft_set_is_anonymous(set))
+			nft_deactivate_next(ctx->net, set);
+
 		set->use--;
 		return;
 	case NFT_TRANS_ABORT:
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 274579b1696e..bd19c7aec92e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -342,7 +342,7 @@ static void nft_dynset_activate(const struct nft_ctx *ctx,
 {
 	struct nft_dynset *priv = nft_expr_priv(expr);
 
-	priv->set->use++;
+	nf_tables_activate_set(ctx, priv->set);
 }
 
 static void nft_dynset_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index cecf8ab90e58..03ef4fdaa460 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -167,7 +167,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx,
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	priv->set->use++;
+	nf_tables_activate_set(ctx, priv->set);
 }
 
 static void nft_lookup_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index cb37169608ba..a48dd5b5d45b 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -185,7 +185,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx,
 {
 	struct nft_objref_map *priv = nft_expr_priv(expr);
 
-	priv->set->use++;
+	nf_tables_activate_set(ctx, priv->set);
 }
 
 static void nft_objref_map_destroy(const struct nft_ctx *ctx,
-- 
cgit v1.2.3


From 5d388143fa6c351d985ffd23ea50c91c8839141b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 3 Apr 2023 09:49:13 +0200
Subject: i2c: gxp: fix build failure without CONFIG_I2C_SLAVE

The gxp_i2c_slave_irq_handler() is hidden in an #ifdef, but the
caller uses an IS_ENABLED() check:

drivers/i2c/busses/i2c-gxp.c: In function 'gxp_i2c_irq_handler':
drivers/i2c/busses/i2c-gxp.c:467:29: error: implicit declaration of function 'gxp_i2c_slave_irq_handler'; did you mean 'gxp_i2c_irq_handler'? [-Werror=implicit-function-declaration]

It has to consistently use one method or the other to avoid warnings,
so move to IS_ENABLED() here for readability and build coverage, and
move the #ifdef in linux/i2c.h to allow building it as dead code.

Fixes: 4a55ed6f89f5 ("i2c: Add GXP SoC I2C Controller")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nick Hawkins <nick.hawkins@hpe.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/busses/i2c-gxp.c | 2 --
 include/linux/i2c.h          | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/busses/i2c-gxp.c b/drivers/i2c/busses/i2c-gxp.c
index d4b55d989a26..8ea3fb5e4c7f 100644
--- a/drivers/i2c/busses/i2c-gxp.c
+++ b/drivers/i2c/busses/i2c-gxp.c
@@ -353,7 +353,6 @@ static void gxp_i2c_chk_data_ack(struct gxp_i2c_drvdata *drvdata)
 	writew(value, drvdata->base + GXP_I2CMCMD);
 }
 
-#if IS_ENABLED(CONFIG_I2C_SLAVE)
 static bool gxp_i2c_slave_irq_handler(struct gxp_i2c_drvdata *drvdata)
 {
 	u8 value;
@@ -437,7 +436,6 @@ static bool gxp_i2c_slave_irq_handler(struct gxp_i2c_drvdata *drvdata)
 
 	return true;
 }
-#endif
 
 static irqreturn_t gxp_i2c_irq_handler(int irq, void *_drvdata)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 5ba89663ea86..13a1ce38cb0c 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -385,7 +385,6 @@ static inline void i2c_set_clientdata(struct i2c_client *client, void *data)
 
 /* I2C slave support */
 
-#if IS_ENABLED(CONFIG_I2C_SLAVE)
 enum i2c_slave_event {
 	I2C_SLAVE_READ_REQUESTED,
 	I2C_SLAVE_WRITE_REQUESTED,
@@ -396,9 +395,10 @@ enum i2c_slave_event {
 
 int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb);
 int i2c_slave_unregister(struct i2c_client *client);
-bool i2c_detect_slave_mode(struct device *dev);
 int i2c_slave_event(struct i2c_client *client,
 		    enum i2c_slave_event event, u8 *val);
+#if IS_ENABLED(CONFIG_I2C_SLAVE)
+bool i2c_detect_slave_mode(struct device *dev);
 #else
 static inline bool i2c_detect_slave_mode(struct device *dev) { return false; }
 #endif
-- 
cgit v1.2.3


From 26e02e6c1012793248096028287ae7296029cd29 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Mon, 27 Mar 2023 10:36:37 +0200
Subject: dt-bindings: mailbox: mediatek,gce-mailbox: Add support for MT6795

Add a compatible string for the MT6795 Helio X10 SoC using MT8173
binding and add a header for the MT6795's GCE mailbox.

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 .../bindings/mailbox/mediatek,gce-mailbox.yaml     |  20 ++--
 include/dt-bindings/gce/mediatek,mt6795-gce.h      | 123 +++++++++++++++++++++
 2 files changed, 135 insertions(+), 8 deletions(-)
 create mode 100644 include/dt-bindings/gce/mediatek,mt6795-gce.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml b/Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml
index d383b2ab3ce8..cef9d7601398 100644
--- a/Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml
+++ b/Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml
@@ -16,14 +16,18 @@ description:
 
 properties:
   compatible:
-    enum:
-      - mediatek,mt6779-gce
-      - mediatek,mt8173-gce
-      - mediatek,mt8183-gce
-      - mediatek,mt8186-gce
-      - mediatek,mt8188-gce
-      - mediatek,mt8192-gce
-      - mediatek,mt8195-gce
+    oneOf:
+      - enum:
+          - mediatek,mt6779-gce
+          - mediatek,mt8173-gce
+          - mediatek,mt8183-gce
+          - mediatek,mt8186-gce
+          - mediatek,mt8188-gce
+          - mediatek,mt8192-gce
+          - mediatek,mt8195-gce
+      - items:
+          - const: mediatek,mt6795-gce
+          - const: mediatek,mt8173-gce
 
   "#mbox-cells":
     const: 2
diff --git a/include/dt-bindings/gce/mediatek,mt6795-gce.h b/include/dt-bindings/gce/mediatek,mt6795-gce.h
new file mode 100644
index 000000000000..97d5ba2d2b44
--- /dev/null
+++ b/include/dt-bindings/gce/mediatek,mt6795-gce.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023 Collabora Ltd.
+ * Author: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+#ifndef _DT_BINDINGS_GCE_MT6795_H
+#define _DT_BINDINGS_GCE_MT6795_H
+
+/* GCE HW thread priority */
+#define CMDQ_THR_PRIO_LOWEST			0
+#define CMDQ_THR_PRIO_NORMAL			1
+#define CMDQ_THR_PRIO_NORMAL_2			2
+#define CMDQ_THR_PRIO_MEDIUM			3
+#define CMDQ_THR_PRIO_MEDIUM_2			4
+#define CMDQ_THR_PRIO_HIGH			5
+#define CMDQ_THR_PRIO_HIGHER			6
+#define CMDQ_THR_PRIO_HIGHEST			7
+
+/* GCE SUBSYS */
+#define SUBSYS_1300XXXX				0
+#define SUBSYS_1400XXXX				1
+#define SUBSYS_1401XXXX				2
+#define SUBSYS_1402XXXX				3
+#define SUBSYS_1500XXXX				4
+#define SUBSYS_1600XXXX				5
+#define SUBSYS_1700XXXX				6
+#define SUBSYS_1800XXXX				7
+#define SUBSYS_1000XXXX				8
+#define SUBSYS_1001XXXX				9
+#define SUBSYS_1002XXXX				10
+#define SUBSYS_1003XXXX				11
+#define SUBSYS_1004XXXX				12
+#define SUBSYS_1005XXXX				13
+#define SUBSYS_1020XXXX				14
+#define SUBSYS_1021XXXX				15
+#define SUBSYS_1120XXXX				16
+#define SUBSYS_1121XXXX				17
+#define SUBSYS_1122XXXX				18
+#define SUBSYS_1123XXXX				19
+#define SUBSYS_1124XXXX				20
+#define SUBSYS_1125XXXX				21
+#define SUBSYS_1126XXXX				22
+
+/* GCE HW EVENT */
+#define CMDQ_EVENT_MDP_RDMA0_SOF		0
+#define CMDQ_EVENT_MDP_RDMA1_SOF		1
+#define CMDQ_EVENT_MDP_DSI0_TE_SOF		2
+#define CMDQ_EVENT_MDP_DSI1_TE_SOF		3
+#define CMDQ_EVENT_MDP_MVW_SOF			4
+#define CMDQ_EVENT_MDP_TDSHP0_SOF		5
+#define CMDQ_EVENT_MDP_TDSHP1_SOF		6
+#define CMDQ_EVENT_MDP_WDMA_SOF			7
+#define CMDQ_EVENT_MDP_WROT0_SOF		8
+#define CMDQ_EVENT_MDP_WROT1_SOF		9
+#define CMDQ_EVENT_MDP_CROP_SOF			10
+#define CMDQ_EVENT_DISP_OVL0_SOF		11
+#define CMDQ_EVENT_DISP_OVL1_SOF		12
+#define CMDQ_EVENT_DISP_RDMA0_SOF		13
+#define CMDQ_EVENT_DISP_RDMA1_SOF		14
+#define CMDQ_EVENT_DISP_RDMA2_SOF		15
+#define CMDQ_EVENT_DISP_WDMA0_SOF		16
+#define CMDQ_EVENT_DISP_WDMA1_SOF		17
+#define CMDQ_EVENT_DISP_COLOR0_SOF		18
+#define CMDQ_EVENT_DISP_COLOR1_SOF		19
+#define CMDQ_EVENT_DISP_AAL_SOF			20
+#define CMDQ_EVENT_DISP_GAMMA_SOF		21
+#define CMDQ_EVENT_DISP_UFOE_SOF		22
+#define CMDQ_EVENT_DISP_PWM0_SOF		23
+#define CMDQ_EVENT_DISP_PWM1_SOF		24
+#define CMDQ_EVENT_DISP_OD_SOF			25
+#define CMDQ_EVENT_MDP_RDMA0_EOF		26
+#define CMDQ_EVENT_MDP_RDMA1_EOF		27
+#define CMDQ_EVENT_MDP_RSZ0_EOF			28
+#define CMDQ_EVENT_MDP_RSZ1_EOF			29
+#define CMDQ_EVENT_MDP_RSZ2_EOF			30
+#define CMDQ_EVENT_MDP_TDSHP0_EOF		31
+#define CMDQ_EVENT_MDP_TDSHP1_EOF		32
+#define CMDQ_EVENT_MDP_WDMA_EOF			33
+#define CMDQ_EVENT_MDP_WROT0_WRITE_EOF		34
+#define CMDQ_EVENT_MDP_WROT0_READ_EOF		35
+#define CMDQ_EVENT_MDP_WROT1_WRITE_EOF		36
+#define CMDQ_EVENT_MDP_WROT1_READ_EOF		37
+#define CMDQ_EVENT_MDP_CROP_EOF			38
+#define CMDQ_EVENT_DISP_OVL0_EOF		39
+#define CMDQ_EVENT_DISP_OVL1_EOF		40
+#define CMDQ_EVENT_DISP_RDMA0_EOF		41
+#define CMDQ_EVENT_DISP_RDMA1_EOF		42
+#define CMDQ_EVENT_DISP_RDMA2_EOF		43
+#define CMDQ_EVENT_DISP_WDMA0_EOF		44
+#define CMDQ_EVENT_DISP_WDMA1_EOF		45
+#define CMDQ_EVENT_DISP_COLOR0_EOF		46
+#define CMDQ_EVENT_DISP_COLOR1_EOF		47
+#define CMDQ_EVENT_DISP_AAL_EOF			48
+#define CMDQ_EVENT_DISP_GAMMA_EOF		49
+#define CMDQ_EVENT_DISP_UFOE_EOF		50
+#define CMDQ_EVENT_DISP_DPI0_EOF		51
+#define CMDQ_EVENT_MUTEX0_STREAM_EOF		52
+#define CMDQ_EVENT_MUTEX1_STREAM_EOF		53
+#define CMDQ_EVENT_MUTEX2_STREAM_EOF		54
+#define CMDQ_EVENT_MUTEX3_STREAM_EOF		55
+#define CMDQ_EVENT_MUTEX4_STREAM_EOF		56
+#define CMDQ_EVENT_MUTEX5_STREAM_EOF		57
+#define CMDQ_EVENT_MUTEX6_STREAM_EOF		58
+#define CMDQ_EVENT_MUTEX7_STREAM_EOF		59
+#define CMDQ_EVENT_MUTEX8_STREAM_EOF		60
+#define CMDQ_EVENT_MUTEX9_STREAM_EOF		61
+#define CMDQ_EVENT_DISP_RDMA0_UNDERRUN		62
+#define CMDQ_EVENT_DISP_RDMA1_UNDERRUN		63
+#define CMDQ_EVENT_DISP_RDMA2_UNDERRUN		64
+#define CMDQ_EVENT_ISP_PASS2_2_EOF		129
+#define CMDQ_EVENT_ISP_PASS2_1_EOF		130
+#define CMDQ_EVENT_ISP_PASS2_0_EOF		131
+#define CMDQ_EVENT_ISP_PASS1_1_EOF		132
+#define CMDQ_EVENT_ISP_PASS1_0_EOF		133
+#define CMDQ_EVENT_CAMSV_2_PASS1_EOF		134
+#define CMDQ_EVENT_CAMSV_1_PASS1_EOF		135
+#define CMDQ_EVENT_SENINF_CAM1_2_3_FIFO_FULL	136
+#define CMDQ_EVENT_SENINF_CAM0_FIFO_FULL	137
+#define CMDQ_EVENT_JPGENC_PASS2_EOF		257
+#define CMDQ_EVENT_JPGENC_PASS1_EOF		258
+#define CMDQ_EVENT_JPGDEC_EOF			259
+
+#endif
-- 
cgit v1.2.3


From fd9b8547bc5c34186dc42ea05fb4380d21695374 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 4 May 2023 05:18:55 -0700
Subject: io_uring: Pass whole sqe to commands

Currently uring CMD operation relies on having large SQEs, but future
operations might want to use normal SQE.

The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
but, for commands that use normal SQE size, it might be necessary to
access the initial SQE fields outside of the payload/cmd block.  So,
saves the whole SQE other than just the pdu.

This changes slightly how the io_uring_cmd works, since the cmd
structures and callbacks are not opaque to io_uring anymore. I.e, the
callbacks can look at the SQE entries, not only, in the cmd structure.

The main advantage is that we don't need to create custom structures for
simple commands.

Creates io_uring_sqe_cmd() that returns the cmd private data as a null
pointer and avoids casting in the callee side.
Also, make most of ublk_drv's sqe->cmd priv structure into const, and use
io_uring_sqe_cmd() to get the private structure, removing the unwanted
cast. (There is one case where the cast is still needed since the
header->{len,addr} is updated in the private structure)

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/20230504121856.904491-3-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c  | 26 +++++++++++++-------------
 drivers/nvme/host/ioctl.c |  2 +-
 include/linux/io_uring.h  |  7 ++++++-
 io_uring/opdef.c          |  2 +-
 io_uring/uring_cmd.c      |  9 +++------
 5 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c73cc57ec547..42f4d7ca962e 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1019,7 +1019,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
 }
 
 static void ublk_commit_completion(struct ublk_device *ub,
-		struct ublksrv_io_cmd *ub_cmd)
+		const struct ublksrv_io_cmd *ub_cmd)
 {
 	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
 	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
@@ -1263,7 +1263,7 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
 
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
-	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
+	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe);
 	struct ublk_device *ub = cmd->file->private_data;
 	struct ublk_queue *ubq;
 	struct ublk_io *io;
@@ -1567,7 +1567,7 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
 
 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	int ublksrv_pid = (int)header->data[0];
 	struct gendisk *disk;
 	int ret = -EINVAL;
@@ -1630,7 +1630,7 @@ out_unlock:
 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	cpumask_var_t cpumask;
 	unsigned long queue;
@@ -1681,7 +1681,7 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
 
 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublksrv_ctrl_dev_info info;
 	struct ublk_device *ub;
@@ -1844,7 +1844,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
 
 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 
 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
 			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
@@ -1863,7 +1863,7 @@ static int ublk_ctrl_stop_dev(struct ublk_device *ub)
 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 
 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
@@ -1894,7 +1894,7 @@ static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
 static int ublk_ctrl_get_params(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublk_params_header ph;
 	int ret;
@@ -1925,7 +1925,7 @@ static int ublk_ctrl_get_params(struct ublk_device *ub,
 static int ublk_ctrl_set_params(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublk_params_header ph;
 	int ret = -EFAULT;
@@ -1983,7 +1983,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	int ret = -EINVAL;
 	int i;
 
@@ -2025,7 +2025,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	int ublksrv_pid = (int)header->data[0];
 	int ret = -EINVAL;
 
@@ -2092,7 +2092,7 @@ exit:
 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	char *dev_path = NULL;
@@ -2171,7 +2171,7 @@ exit:
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	struct ublk_device *ub = NULL;
 	int ret = -EINVAL;
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d24ea2e05156..81c5c9e38477 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -552,7 +552,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	const struct nvme_uring_cmd *cmd = ioucmd->cmd;
+	const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_uring_data d;
 	struct nvme_command c;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 35b9328ca335..3399d979ee1c 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -24,7 +24,7 @@ enum io_uring_cmd_flags {
 
 struct io_uring_cmd {
 	struct file	*file;
-	const void	*cmd;
+	const struct io_uring_sqe *sqe;
 	union {
 		/* callback to defer completions to task context */
 		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
@@ -66,6 +66,11 @@ static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
+
+static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
+{
+	return sqe->cmd;
+}
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index cca7c5b55208..3b9c6489b8b6 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -627,7 +627,7 @@ const struct io_cold_def io_cold_defs[] = {
 	},
 	[IORING_OP_URING_CMD] = {
 		.name			= "URING_CMD",
-		.async_size		= uring_cmd_pdu_size(1),
+		.async_size		= 2 * sizeof(struct io_uring_sqe),
 		.prep_async		= io_uring_cmd_prep_async,
 	},
 	[IORING_OP_SEND_ZC] = {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 5113c9a48583..ed536d7499db 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -69,15 +69,12 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 int io_uring_cmd_prep_async(struct io_kiocb *req)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	size_t cmd_size;
 
 	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
 	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
 
-	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
-
-	memcpy(req->async_data, ioucmd->cmd, cmd_size);
-	ioucmd->cmd = req->async_data;
+	memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx));
+	ioucmd->sqe = req->async_data;
 	return 0;
 }
 
@@ -103,7 +100,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = ctx->user_bufs[index];
 		io_req_set_rsrc_node(req, ctx, 0);
 	}
-	ioucmd->cmd = sqe->cmd;
+	ioucmd->sqe = sqe;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	return 0;
 }
-- 
cgit v1.2.3


From cb9e6e584d58420df182102674e636fb841dae4c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 4 May 2023 11:52:49 +0200
Subject: bonding: add xdp_features support

Introduce xdp_features support for bonding driver according to the slave
devices attached to the master one. xdp_features is required whenever we
want to xdp_redirect traffic into a bond device and then into selected
slaves attached to it.

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jussi Maki <joamaki@gmail.com>
Tested-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c    | 29 +++++++++++++++++++++++++++++
 drivers/net/bonding/bond_options.c |  2 ++
 include/net/bonding.h              |  1 +
 3 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 710548dbd0c1..3fed888629f7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1789,6 +1789,26 @@ static void bond_ether_setup(struct net_device *bond_dev)
 	bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 }
 
+void bond_xdp_set_features(struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	xdp_features_t val = NETDEV_XDP_ACT_MASK;
+	struct list_head *iter;
+	struct slave *slave;
+
+	ASSERT_RTNL();
+
+	if (!bond_xdp_check(bond)) {
+		xdp_clear_features_flag(bond_dev);
+		return;
+	}
+
+	bond_for_each_slave(bond, slave, iter)
+		val &= slave->dev->xdp_features;
+
+	xdp_set_features_flag(bond_dev, val);
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		 struct netlink_ext_ack *extack)
@@ -2236,6 +2256,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 			bpf_prog_inc(bond->xdp_prog);
 	}
 
+	bond_xdp_set_features(bond_dev);
+
 	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
 		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
 		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
@@ -2483,6 +2505,7 @@ static int __bond_release_one(struct net_device *bond_dev,
 	if (!netif_is_bond_master(slave_dev))
 		slave_dev->priv_flags &= ~IFF_BONDING;
 
+	bond_xdp_set_features(bond_dev);
 	kobject_put(&slave->kobj);
 
 	return 0;
@@ -3930,6 +3953,9 @@ static int bond_slave_netdev_event(unsigned long event,
 		/* Propagate to master device */
 		call_netdevice_notifiers(event, slave->bond->dev);
 		break;
+	case NETDEV_XDP_FEAT_CHANGE:
+		bond_xdp_set_features(bond_dev);
+		break;
 	default:
 		break;
 	}
@@ -5874,6 +5900,9 @@ void bond_setup(struct net_device *bond_dev)
 	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
 		bond_dev->features |= BOND_XFRM_FEATURES;
 #endif /* CONFIG_XFRM_OFFLOAD */
+
+	if (bond_xdp_check(bond))
+		bond_dev->xdp_features = NETDEV_XDP_ACT_MASK;
 }
 
 /* Destroy a bonding device.
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index f71d5517f829..0498fc6731f8 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -877,6 +877,8 @@ static int bond_option_mode_set(struct bonding *bond,
 			netdev_update_features(bond->dev);
 	}
 
+	bond_xdp_set_features(bond->dev);
+
 	return 0;
 }
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index c3843239517d..a60a24923b55 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -659,6 +659,7 @@ void bond_destroy_sysfs(struct bond_net *net);
 void bond_prepare_sysfs_group(struct bonding *bond);
 int bond_sysfs_slave_add(struct slave *slave);
 void bond_sysfs_slave_del(struct slave *slave);
+void bond_xdp_set_features(struct net_device *bond_dev);
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		 struct netlink_ext_ack *extack);
 int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
-- 
cgit v1.2.3


From 6ce2c04fcbcaa5eb086e5142ab359be306cbc3e1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 2 May 2023 21:32:33 -0400
Subject: ftrace: Add MODIFIED flag to show if IPMODIFY or direct was attached

If a function had ever had IPMODIFY or DIRECT attached to it, where this
is how live kernel patching and BPF overrides work, mark them and display
an "M" in the enabled_functions and touched_functions files. This can be
used for debugging. If a function had been modified and later there's a bug
in the code related to that function, this can be used to know if the cause
is possibly from a live kernel patch or a BPF program that changed the
behavior of the code.

Also update the documentation on the enabled_functions and
touched_functions output, as it was missing direct callers and CALL_OPS.
And include this new modify attribute.

Link: https://lore.kernel.org/linux-trace-kernel/20230502213233.004e3ae4@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 25 +++++++++++++++++++++++++
 include/linux/ftrace.h         |  4 +++-
 kernel/trace/ftrace.c          | 12 +++++++++---
 3 files changed, 37 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index aaebb821912e..d5766229c71a 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -350,6 +350,19 @@ of ftrace. Here is a list of some of the key files:
 	an 'I' will be displayed on the same line as the function that
 	can be overridden.
 
+	If a non ftrace trampoline is attached (BPF) a 'D' will be displayed.
+	Note, normal ftrace trampolines can also be attached, but only one
+	"direct" trampoline can be attached to a given function at a time.
+
+	Some architectures can not call direct trampolines, but instead have
+	the ftrace ops function located above the function entry point. In
+	such cases an 'O' will be displayed.
+
+	If a function had either the "ip modify" or a "direct" call attached to
+	it in the past, a 'M' will be shown. This flag is never cleared. It is
+	used to know if a function was every modified by the ftrace infrastructure,
+	and can be used for debugging.
+
 	If the architecture supports it, it will also show what callback
 	is being directly called by the function. If the count is greater
 	than 1 it most likely will be ftrace_ops_list_func().
@@ -359,6 +372,18 @@ of ftrace. Here is a list of some of the key files:
 	its address will be printed as well as the function that the
 	trampoline calls.
 
+  touched_functions:
+
+	This file contains all the functions that ever had a function callback
+	to it via the ftrace infrastructure. It has the same format as
+	enabled_functions but shows all functions that have every been
+	traced.
+
+	To see any function that has every been modified by "ip modify" or a
+	direct trampoline, one can perform the following command:
+
+	grep ' M ' /sys/kernel/tracing/touched_functions
+
   function_profile_enabled:
 
 	When set it will enable all functions with either the function
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 327046f1278d..7dffd740e784 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -549,6 +549,7 @@ bool is_ftrace_trampoline(unsigned long addr);
  *  CALL_OPS - the record can use callsite-specific ops
  *  CALL_OPS_EN - the function is set up to use callsite-specific ops
  *  TOUCHED  - A callback was added since boot up
+ *  MODIFIED - The function had IPMODIFY or DIRECT attached to it
  *
  * When a new ftrace_ops is registered and wants a function to save
  * pt_regs, the rec->flags REGS is set. When the function has been
@@ -569,9 +570,10 @@ enum {
 	FTRACE_FL_CALL_OPS	= (1UL << 22),
 	FTRACE_FL_CALL_OPS_EN	= (1UL << 21),
 	FTRACE_FL_TOUCHED	= (1UL << 20),
+	FTRACE_FL_MODIFIED	= (1UL << 19),
 };
 
-#define FTRACE_REF_MAX_SHIFT	20
+#define FTRACE_REF_MAX_SHIFT	19
 #define FTRACE_REF_MAX		((1UL << FTRACE_REF_MAX_SHIFT) - 1)
 
 #define ftrace_rec_count(rec)	((rec)->flags & FTRACE_REF_MAX)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index db8532a4d5c8..885845fc851d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -46,7 +46,8 @@
 #include "trace_stat.h"
 
 /* Flags that do not get reset */
-#define FTRACE_NOCLEAR_FLAGS	(FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+#define FTRACE_NOCLEAR_FLAGS	(FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED | \
+				 FTRACE_FL_MODIFIED)
 
 #define FTRACE_INVALID_FUNCTION		"__ftrace_invalid_address__"
 
@@ -2273,6 +2274,10 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 					rec->flags &= ~FTRACE_FL_TRAMP_EN;
 			}
 
+			/* Keep track of anything that modifies the function */
+			if (rec->flags & (FTRACE_FL_DIRECT | FTRACE_FL_IPMODIFY))
+				rec->flags |= FTRACE_FL_MODIFIED;
+
 			if (flag & FTRACE_FL_DIRECT) {
 				/*
 				 * If there's only one user (direct_ops helper)
@@ -3866,12 +3871,13 @@ static int t_show(struct seq_file *m, void *v)
 	if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
 		struct ftrace_ops *ops;
 
-		seq_printf(m, " (%ld)%s%s%s%s",
+		seq_printf(m, " (%ld)%s%s%s%s%s",
 			   ftrace_rec_count(rec),
 			   rec->flags & FTRACE_FL_REGS ? " R" : "  ",
 			   rec->flags & FTRACE_FL_IPMODIFY ? " I" : "  ",
 			   rec->flags & FTRACE_FL_DIRECT ? " D" : "  ",
-			   rec->flags & FTRACE_FL_CALL_OPS ? " O" : "  ");
+			   rec->flags & FTRACE_FL_CALL_OPS ? " O" : "  ",
+			   rec->flags & FTRACE_FL_MODIFIED ? " M " : "   ");
 		if (rec->flags & FTRACE_FL_TRAMP_EN) {
 			ops = ftrace_find_tramp_ops_any(rec);
 			if (ops) {
-- 
cgit v1.2.3